refactor ray integration, support saving checkpoints

This commit is contained in:
hiyouga
2025-01-07 08:54:41 +00:00
parent 1e8e7be0a5
commit d8cac6f546
18 changed files with 215 additions and 161 deletions

View File

@@ -24,8 +24,7 @@ from .chat.chat_model import run_chat
from .eval.evaluator import run_eval
from .extras import logging
from .extras.env import VERSION, print_env
from .extras.misc import get_device_count
from .integrations.ray.ray_utils import should_use_ray
from .extras.misc import get_device_count, use_ray
from .train.tuner import export_model, run_exp
from .webui.interface import run_web_demo, run_web_ui
@@ -88,8 +87,7 @@ def main():
export_model()
elif command == Command.TRAIN:
force_torchrun = os.getenv("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
use_ray = should_use_ray()
if force_torchrun or (get_device_count() > 1 and not use_ray):
if force_torchrun or (get_device_count() > 1 and not use_ray()):
master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
master_port = os.getenv("MASTER_PORT", str(random.randint(20001, 29999)))
logger.info_rank0(f"Initializing distributed tasks at: {master_addr}:{master_port}")