[ray] allow for specifying ray.init kwargs (i.e. runtime_env) (#7647)

* ray init kwargs * Update trainer_utils.py * fix ray args --------- Co-authored-by: hoshi-hiyouga <hiyouga@buaa.edu.cn>
2025-12-29 18:20:35 +08:00 · 2025-04-09 20:31:05 -07:00
parent ee840b4e01
commit 39c1e29ed7
3 changed files with 16 additions and 2 deletions
--- a/examples/train_lora/llama3_lora_sft_ray.yaml
+++ b/examples/train_lora/llama3_lora_sft_ray.yaml
@@ -31,10 +31,16 @@ report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
 ### ray
 ray_run_name: llama3_8b_sft_lora
 ray_storage_path: ./saves
-ray_num_workers: 4  # number of GPUs to use
+ray_num_workers: 4  # Number of GPUs to use.
+placement_strategy: PACK
 resources_per_worker:
  GPU: 1
-placement_strategy: PACK
+# ray_init_kwargs:
+#   runtime_env:
+#     env_vars:
+#       <YOUR-ENV-VAR-HERE>: "<YOUR-ENV-VAR-HERE>"
+#     pip:
+#       - emoji

 ### train
 per_device_train_batch_size: 1
--- a/src/llamafactory/hparams/training_args.py
+++ b/src/llamafactory/hparams/training_args.py
@@ -46,6 +46,10 @@ class RayArguments:
        default="PACK",
        metadata={"help": "The placement strategy for Ray training. Default is PACK."},
    )
+    ray_init_kwargs: Optional[dict] = field(
+        default=None,
+        metadata={"help": "The arguments to pass to ray.init for Ray training. Default is None."},
+    )

    def __post_init__(self):
        self.use_ray = use_ray()
--- a/src/llamafactory/train/trainer_utils.py
+++ b/src/llamafactory/train/trainer_utils.py
@@ -48,6 +48,7 @@ if is_apollo_available():


 if is_ray_available():
+    import ray
    from ray.train import RunConfig, ScalingConfig
    from ray.train.torch import TorchTrainer

@@ -644,6 +645,9 @@ def get_ray_trainer(
    if not ray_args.use_ray:
        raise ValueError("Ray was not enabled. Please set `USE_RAY=1` to enable ray.")

+    if ray_args.ray_init_kwargs is not None:
+        ray.init(**ray_args.ray_init_kwargs)
+
    trainer = TorchTrainer(
        training_function,
        train_loop_config=train_loop_config,