[feature] support using ray.remote to start distributed training. (#10109)

2026-05-28 10:58:54 +08:00 · 2026-01-28 16:05:29 +08:00
parent 9640f79ae5
commit 762b480131
4 changed files with 221 additions and 80 deletions
--- a/src/llamafactory/train/tuner.py
+++ b/src/llamafactory/train/tuner.py
@@ -23,9 +23,9 @@ from transformers import EarlyStoppingCallback, PreTrainedModel
 from ..data import get_template_and_fix_tokenizer
 from ..extras import logging
 from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
-from ..extras.misc import infer_optim_dtype
+from ..extras.misc import find_available_port, get_device_name, get_torch_device, infer_optim_dtype
 from ..extras.packages import is_mcore_adapter_available, is_ray_available
-from ..hparams import get_infer_args, get_ray_args, get_train_args, read_args
+from ..hparams import RayArguments, get_infer_args, get_ray_args, get_train_args, read_args
 from ..model import load_model, load_tokenizer
 from .callbacks import LogCallback, PissaConvertCallback, ReporterCallback
 from .dpo import run_dpo
@@ -34,12 +34,17 @@ from .ppo import run_ppo
 from .pt import run_pt
 from .rm import run_rm
 from .sft import run_sft
-from .trainer_utils import get_ray_trainer, get_swanlab_callback
+from .trainer_utils import (
+    get_placement_group,
+    get_ray_head_node_ip,
+    get_ray_remote_config_for_worker,
+    get_swanlab_callback,
+    sort_placement_group_by_node_ip,
+)


 if is_ray_available():
    import ray
-    from ray.train.huggingface.transformers import RayTrainReportCallback


 if TYPE_CHECKING:
@@ -115,13 +120,7 @@ def run_exp(args: Optional[dict[str, Any]] = None, callbacks: Optional[list["Tra
    ray_args = get_ray_args(args)
    callbacks = callbacks or []
    if ray_args.use_ray:
-        callbacks.append(RayTrainReportCallback())
-        trainer = get_ray_trainer(
-            training_function=_training_function,
-            train_loop_config={"args": args, "callbacks": callbacks},
-            ray_args=ray_args,
-        )
-        trainer.fit()
+        _ray_training_function(ray_args, config={"args": args, "callbacks": callbacks})
    else:
        _training_function(config={"args": args, "callbacks": callbacks})

@@ -212,3 +211,94 @@ def export_model(args: Optional[dict[str, Any]] = None) -> None:
    with open(ollama_modelfile, "w", encoding="utf-8") as f:
        f.write(template.get_ollama_modelfile(tokenizer))
        logger.info_rank0(f"Ollama modelfile saved in {ollama_modelfile}")
+
+
+class Worker:
+    """Ray actor that sets LOCAL_RANK, calls `set_device` with it, and runs the training entry point."""
+
+    def __init__(self):
+        self._setup_env_visible_devices()
+        rank_str = os.environ.get("LOCAL_RANK", "0")
+        get_torch_device().set_device(int(rank_str))
+
+    def _setup_env_visible_devices(self) -> None:
+        """Set LOCAL_RANK: the first Ray-assigned accelerator id when a NOSET flag is set, otherwise '0'."""
+        noset_flags = (
+            "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES",
+            "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES",
+        )
+        assigned_rank = "0"
+        if any(os.environ.get(flag) for flag in noset_flags):
+            device_key = get_device_name().upper()
+            assigned_rank = ray.get_runtime_context().get_accelerator_ids()[device_key][0]
+        os.environ["LOCAL_RANK"] = assigned_rank
+
+    def _training_function(self, config: dict[str, Any]) -> None:  # remote-callable wrapper
+        _training_function(config)
+
+
+def _ray_training_function(ray_args: "RayArguments", config: dict[str, Any]) -> None:
+    """Launch one Ray actor per worker and run `_training_function` on all of them.
+
+    Args:
+        ray_args: Ray settings; reads ray_num_workers, master_addr, master_port and ray_init_kwargs.
+        config: Passed unchanged to every worker's `_training_function`.
+
+    Raises:
+        ValueError: If the cluster has fewer devices than workers, or `master_addr` is not an alive node.
+    """
+    num_workers = ray_args.ray_num_workers
+    master_addr = ray_args.master_addr
+    master_port = ray_args.master_port
+    logger.info(f"Using ray.remote mode with {num_workers} workers for distributed training.")
+
+    if not ray.is_initialized():
+        ray.init(**(ray_args.ray_init_kwargs or {}))
+
+    try:  # Ray is shut down even when validation or a worker fails
+        # verify resources
+        device_name = get_device_name().upper()
+        total_devices = int(ray.cluster_resources().get(device_name, 0))
+        if num_workers > total_devices:
+            raise ValueError(
+                f"The number of devices in the Ray cluster ({total_devices}) should be greater than "
+                f"or equal to num_workers ({num_workers})."
+            )
+
+        # verify master_addr
+        if master_addr is None:
+            master_addr = get_ray_head_node_ip()
+            logger.info(f"`master_addr` is not specified, using head node ip: {master_addr}.")
+        elif master_addr not in {node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]}:
+            raise ValueError(f"The `master_addr` ({master_addr}) is not in Ray cluster or not alive ")
+
+        # NOTE(review): the group is sized by total_devices, yet the log reports num_workers bundles; confirm.
+        pg, bundle = get_placement_group(total_devices)
+        ray.get(pg.ready())
+        logger.info(f"Create placement group with {num_workers} bundles: {bundle}")
+        sorted_bundle_indices = sort_placement_group_by_node_ip(pg, master_addr)
+
+        if master_port is None:
+            master_port = find_available_port()
+            logger.info(f"`master_port` is not specified, using available port: {master_port}.")
+        master_port = str(master_port)
+
+        # launch workers
+        current_env = dict(os.environ)  # snapshot of the driver environment handed to each worker
+        RayWorker = ray.remote(Worker)
+        workers = []
+        for rank in range(num_workers):
+            remote_config = get_ray_remote_config_for_worker(
+                placement_group=pg,
+                bundle_idx=sorted_bundle_indices[rank],
+                rank=rank,
+                world_size=num_workers,
+                master_addr=master_addr,
+                master_port=master_port,
+                env=current_env,
+            )
+            workers.append(RayWorker.options(**remote_config).remote())
+
+        ray.get([worker._training_function.remote(config=config) for worker in workers])
+    finally:
+        ray.shutdown()