[misc] upgrade cli (#7714)

hoshi-hiyouga 2025-04-14 15:41:22 +08:00 committed by GitHub
parent 1fd4d14fbb
commit 3ef36d0057
6 changed files with 26 additions and 10 deletions

View File

@@ -3,16 +3,14 @@ datasets>=2.16.0,<=3.5.0
 accelerate>=0.34.0,<=1.6.0
 peft>=0.14.0,<=0.15.1
 trl>=0.8.6,<=0.9.6
-tokenizers>=0.19.0,<=0.21.0
-gradio>=4.38.0,<=5.21.0
-pandas>=2.0.0
+tokenizers>=0.19.0,<=0.21.1
+gradio>=4.38.0,<=5.25.0
 scipy
 einops
 sentencepiece
 tiktoken
 protobuf
 uvicorn
-pydantic
 fastapi
 sse-starlette
 matplotlib>=3.7.0
@@ -21,6 +19,7 @@ packaging
 pyyaml
 numpy<2.0.0
 pydantic<=2.10.6
+pandas>=2.0.0
 av
 librosa
 tyro<0.9.0

View File

@@ -15,6 +15,7 @@
 import os
 import subprocess
 import sys
+from copy import deepcopy
 from enum import Enum, unique
 
 from . import launcher
@@ -96,6 +97,13 @@ def main():
         if int(nnodes) > 1:
             print(f"Multi-node training enabled: num nodes: {nnodes}, node rank: {node_rank}")
 
+        env = deepcopy(os.environ)
+        if is_env_enabled("OPTIM_TORCH", "1"):
+            # optimize DDP, see https://zhuanlan.zhihu.com/p/671834539
+            env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+            env["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+        # NOTE: DO NOT USE shell=True to avoid security risk
         process = subprocess.run(
             (
                 "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} "
@@ -110,7 +118,9 @@ def main():
                 file_name=launcher.__file__,
                 args=" ".join(sys.argv[1:]),
             )
-            .split()
+            .split(),
+            env=env,
+            check=True,
         )
         sys.exit(process.returncode)
     else:
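
The pattern this file adds is: copy the parent environment, toggle the DDP-related settings behind an env flag, and hand the torchrun command to subprocess.run as an argument list with an explicit env and check=True. Below is a minimal standalone sketch of that pattern; the is_env_enabled helper body, the placeholder train.py script, and the launch() wrapper are assumptions made for illustration, not the project's exact code.

import os
import subprocess
import sys
from copy import deepcopy


def is_env_enabled(name: str, default: str = "0") -> bool:
    # Assumed helper: treat "1"/"true"/"y" (case-insensitive) as enabled.
    return os.getenv(name, default).lower() in ("1", "true", "y")


def launch(nnodes: str = "1", node_rank: str = "0", nproc_per_node: str = "1") -> None:
    # Copy the parent environment so the tweaks below do not leak into this process.
    env = deepcopy(os.environ)
    if is_env_enabled("OPTIM_TORCH", "1"):
        # Reduce CUDA allocator fragmentation and skip NCCL record-stream tracking.
        env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
        env["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

    # Argument-list form, never shell=True; check=True raises if torchrun fails.
    command = (
        f"torchrun --nnodes {nnodes} --node_rank {node_rank} "
        f"--nproc_per_node {nproc_per_node} train.py"  # train.py is a placeholder script
    ).split() + sys.argv[1:]
    process = subprocess.run(command, env=env, check=True)
    sys.exit(process.returncode)

Setting OPTIM_TORCH=0 in the environment skips the two allocator/NCCL tweaks while keeping the same launch path.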

View File

@@ -727,23 +727,23 @@ register_model_group(
         },
         "GLM-4-9B-Chat-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-9B-Chat-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-9B-Chat-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-9B-Chat-0414",
         },
         "GLM-4-32B-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-32B-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-0414",
         },
         "GLM-4-32B-Chat-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-32B-Chat-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-Chat-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-Chat-0414",
         },
         "GLM-4-Z1-9B-Chat-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-Z1-9B-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-Z1-9B-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-Z1-9B-0414",
         },
         "GLM-4-Z1-32B-Chat-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-Z1-32B-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-Z1-32B-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-Z1-32B-0414",
         },
     },
     template="glm4",
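
The entries above map each registered model name to a repository id per download source. A hedged, self-contained sketch of how such a mapping might be resolved at download time follows; the resolve_repo_id helper, the GLM4_REGISTRY name, and the enum values are illustrative, not the project's API.

from enum import Enum, unique


@unique
class DownloadSource(str, Enum):
    DEFAULT = "hf"      # Hugging Face Hub
    MODELSCOPE = "ms"   # ModelScope


# A single entry mirroring the registry format above.
GLM4_REGISTRY = {
    "GLM-4-9B-Chat-0414": {
        DownloadSource.DEFAULT: "THUDM/GLM-4-9B-Chat-0414",
        DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-9B-Chat-0414",
    },
}


def resolve_repo_id(name: str, use_modelscope: bool = False) -> str:
    sources = GLM4_REGISTRY[name]
    preferred = DownloadSource.MODELSCOPE if use_modelscope else DownloadSource.DEFAULT
    # Fall back to the default hub if the preferred source is not registered.
    return sources.get(preferred, sources[DownloadSource.DEFAULT])


print(resolve_repo_id("GLM-4-9B-Chat-0414", use_modelscope=True))  # -> ZhipuAI/GLM-4-9B-Chat-0414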

View File

@@ -390,8 +390,10 @@ def get_train_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _
 def get_infer_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _INFER_CLS:
     model_args, data_args, finetuning_args, generating_args = _parse_infer_args(args)
 
+    # Setup logging
     _set_transformers_logging()
 
+    # Check arguments
     if model_args.infer_backend == "vllm":
         if finetuning_args.stage != "sft":
             raise ValueError("vLLM engine only supports auto-regressive models.")
@@ -408,6 +410,7 @@ def get_infer_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _
     _verify_model_args(model_args, data_args, finetuning_args)
     _check_extra_dependencies(model_args, finetuning_args)
 
+    # Post-process model arguments
     if model_args.export_dir is not None and model_args.export_device == "cpu":
         model_args.device_map = {"": torch.device("cpu")}
     if data_args.cutoff_len != DataArguments().cutoff_len:  # override cutoff_len if it is not default
@@ -421,8 +424,10 @@ def get_infer_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _
 def get_eval_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _EVAL_CLS:
     model_args, data_args, eval_args, finetuning_args = _parse_eval_args(args)
 
+    # Setup logging
     _set_transformers_logging()
 
+    # Check arguments
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
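
Both helpers follow the same flow the new comments label: parse the raw arguments, set up logging, check the arguments, then post-process. The accepted input type, a dict or a CLI-style list, comes from transformers' HfArgumentParser, which these parsers build on. A hedged sketch with a stand-in dataclass is shown below; DemoInferArguments and parse_demo_args are illustrative names, not the project's real classes.

from dataclasses import dataclass, field
from typing import Any, Optional, Union

from transformers import HfArgumentParser


@dataclass
class DemoInferArguments:
    # Stand-in fields for illustration; the real dataclasses live in llamafactory.hparams.
    model_name_or_path: Optional[str] = field(default=None)
    infer_backend: str = field(default="huggingface")


def parse_demo_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> DemoInferArguments:
    parser = HfArgumentParser(DemoInferArguments)
    if isinstance(args, dict):
        (parsed,) = parser.parse_dict(args)  # dict input, e.g. loaded from a YAML config
    else:
        (parsed,) = parser.parse_args_into_dataclasses(args=args)  # CLI-style list or sys.argv
    return parsed


demo = parse_demo_args({"model_name_or_path": "demo/model", "infer_backend": "vllm"})
if demo.infer_backend == "vllm":
    print("would route inference through the vLLM engine")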

View File

@@ -96,6 +96,7 @@ def patch_config(
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
     if is_torch_npu_available():
+        # avoid JIT compile on NPU devices, see https://zhuanlan.zhihu.com/p/660875458
         torch.npu.set_compile_mode(jit_compile=is_env_enabled("JIT_COMPILE"))
 
     configure_attn_implementation(config, model_args, is_trainable)
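
The new comment documents why JIT compilation stays disabled on Ascend NPUs unless explicitly requested. A hedged sketch of that guard is shown below, using the public is_torch_npu_available check from transformers; the configure_npu wrapper and the inline env parsing are illustrative stand-ins for the project's helper.

import os

import torch
from transformers.utils import is_torch_npu_available


def configure_npu() -> None:
    if not is_torch_npu_available():
        return  # nothing to configure on CUDA/CPU hosts
    # JIT compilation stays off unless JIT_COMPILE is explicitly enabled,
    # avoiding long graph-compile stalls on Ascend devices.
    jit_compile = os.getenv("JIT_COMPILE", "0").lower() in ("1", "true", "y")
    torch.npu.set_compile_mode(jit_compile=jit_compile)  # provided by the torch_npu plugin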

View File

@@ -368,6 +368,7 @@ class Runner:
         if args.get("deepspeed", None) is not None:
             env["FORCE_TORCHRUN"] = "1"
 
+        # NOTE: DO NOT USE shell=True to avoid security risk
         self.trainer = Popen(["llamafactory-cli", "train", save_cmd(args)], env=env)
 
         yield from self.monitor()
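
The Web UI spawns training the same way as the CLI: a child process with an explicitly built environment, passed as an argument list so no shell ever parses the saved config path. A minimal hedged sketch of that launch pattern follows; launch_training and config_path are illustrative names, not the Runner's actual interface.

import os
from subprocess import Popen


def launch_training(config_path: str, use_deepspeed: bool = False) -> Popen:
    env = dict(os.environ)
    if use_deepspeed:
        # DeepSpeed runs require the distributed torchrun entry point.
        env["FORCE_TORCHRUN"] = "1"
    # Argument-list form; DO NOT USE shell=True, per the note above.
    return Popen(["llamafactory-cli", "train", config_path], env=env)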