Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-12-18 12:50:38 +08:00)
refactor ray integration, support save ckpt
@@ -24,8 +24,7 @@ from .chat.chat_model import run_chat
 from .eval.evaluator import run_eval
 from .extras import logging
 from .extras.env import VERSION, print_env
-from .extras.misc import get_device_count
-from .integrations.ray.ray_utils import should_use_ray
+from .extras.misc import get_device_count, use_ray
 from .train.tuner import export_model, run_exp
 from .webui.interface import run_web_demo, run_web_ui
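The import change above replaces the standalone should_use_ray helper from .integrations.ray.ray_utils with a use_ray function exposed by .extras.misc. The body of use_ray is not part of this diff, so the snippet below is only a minimal sketch of what such a helper could look like, assuming Ray usage is toggled by a USE_RAY environment variable (the variable name is an assumption, not confirmed by this commit):

# Hypothetical sketch of a use_ray() helper in extras/misc.py; the real body
# is not shown in this diff. Assumption: a USE_RAY environment variable toggles it.
import os


def use_ray() -> bool:
    """Return True when the user asked for a Ray-managed training launch."""
    return os.getenv("USE_RAY", "0").lower() in ["true", "1"]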
@@ -88,8 +87,7 @@ def main():
         export_model()
     elif command == Command.TRAIN:
         force_torchrun = os.getenv("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
-        use_ray = should_use_ray()
-        if force_torchrun or (get_device_count() > 1 and not use_ray):
+        if force_torchrun or (get_device_count() > 1 and not use_ray()):
             master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
             master_port = os.getenv("MASTER_PORT", str(random.randint(20001, 29999)))
             logger.info_rank0(f"Initializing distributed tasks at: {master_addr}:{master_port}")
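After this change, the TRAIN branch calls use_ray() inline instead of binding its result to a local variable first, so torchrun is only used when more than one device is visible and Ray is not managing the run (or when FORCE_TORCHRUN is set). The hunk ends at the log line and the actual torchrun hand-off is outside this diff; the following is a rough, assumption-laden sketch of such a hand-off (NNODES, NPROC_PER_NODE, and launcher.py are hypothetical placeholders, not taken from this commit):

# Illustration only: spawning the distributed run via torchrun with the
# rendezvous address computed above. The flags are standard torchrun flags;
# NNODES, NPROC_PER_NODE, and launcher.py are hypothetical placeholders.
import os
import subprocess
import sys


def launch_with_torchrun(master_addr: str, master_port: str) -> None:
    command = [
        "torchrun",
        "--nnodes", os.getenv("NNODES", "1"),
        "--nproc_per_node", os.getenv("NPROC_PER_NODE", "1"),
        "--master_addr", master_addr,
        "--master_port", master_port,
        "launcher.py",   # placeholder for the real training entry point
        *sys.argv[1:],   # forward CLI arguments to the training script
    ]
    subprocess.run(command, check=True)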