refactor ray integration, support saving checkpoints

This commit is contained in:
hiyouga
2025-01-07 08:54:41 +00:00
parent 1e8e7be0a5
commit d8cac6f546
18 changed files with 215 additions and 161 deletions

View File

@@ -24,8 +24,7 @@ from .chat.chat_model import run_chat
from .eval.evaluator import run_eval
from .extras import logging
from .extras.env import VERSION, print_env
from .extras.misc import get_device_count
from .integrations.ray.ray_utils import should_use_ray
from .extras.misc import get_device_count, use_ray
from .train.tuner import export_model, run_exp
from .webui.interface import run_web_demo, run_web_ui
@@ -88,8 +87,7 @@ def main():
export_model()
elif command == Command.TRAIN:
force_torchrun = os.getenv("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
use_ray = should_use_ray()
if force_torchrun or (get_device_count() > 1 and not use_ray):
if force_torchrun or (get_device_count() > 1 and not use_ray()):
master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
master_port = os.getenv("MASTER_PORT", str(random.randint(20001, 29999)))
logger.info_rank0(f"Initializing distributed tasks at: {master_addr}:{master_port}")