diff --git a/requirements.txt b/requirements.txt
index 5b9a9684..af317707 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-transformers>=4.41.2,<=4.45.2;python_version<'3.10'
-transformers>=4.41.2,<=4.48.2,!=4.46.*,!=4.47.*,!=4.48.0;python_version>='3.10'
+transformers>=4.41.2,<=4.48.3,!=4.46.*,!=4.47.*,!=4.48.0,!=4.48.1,!=4.48.2;python_version<'3.10'
+transformers>=4.41.2,<=4.48.3,!=4.46.*,!=4.47.*,!=4.48.0;python_version>='3.10'
 datasets>=2.16.0,<=3.2.0
 accelerate>=0.34.0,<=1.2.1
 peft>=0.11.1,<=0.12.0
diff --git a/setup.py b/setup.py
index f4230444..103ec2c9 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@ def get_requires() -> List[str]:
 
 def get_console_scripts() -> List[str]:
     console_scripts = ["llamafactory-cli = llamafactory.cli:main"]
-    if os.environ.get("ENABLE_SHORT_CONSOLE", "1").lower() in ["true", "1"]:
+    if os.getenv("ENABLE_SHORT_CONSOLE", "1").lower() in ["true", "y", "1"]:
         console_scripts.append("lmf = llamafactory.cli:main")
 
     return console_scripts
@@ -54,7 +54,7 @@ extra_require = {
     "gptq": ["optimum>=1.17.0", "auto-gptq>=0.5.0"],
     "awq": ["autoawq"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
-    "vllm": ["vllm>=0.4.3,<=0.6.5"],
+    "vllm": ["vllm>=0.4.3,<=0.7.2"],
     "galore": ["galore-torch"],
     "apollo": ["apollo-torch"],
     "badam": ["badam>=1.2.1"],
diff --git a/src/llamafactory/__init__.py b/src/llamafactory/__init__.py
index 9b807697..966a32d4 100644
--- a/src/llamafactory/__init__.py
+++ b/src/llamafactory/__init__.py
@@ -20,7 +20,7 @@ Level:
 
 Dependency graph:
   main:
-    transformers>=4.41.2,<=4.48.2,!=4.46.*,!=4.47.*,!=4.48.0
+    transformers>=4.41.2,<=4.48.3,!=4.46.*,!=4.47.*,!=4.48.0
     datasets>=2.16.0,<=3.2.0
     accelerate>=0.34.0,<=1.2.1
     peft>=0.11.1,<=0.12.0
@@ -30,7 +30,7 @@ Dependency graph:
   longlora:
     transformers>=4.41.2,<4.48.0
   packing:
-    transformers>=4.43.0,<=4.48.2
+    transformers>=4.43.0
 
 Disable version checking: DISABLE_VERSION_CHECK=1
 Enable VRAM recording: RECORD_VRAM=1
diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py
index 5ce7a964..177618bb 100644
--- a/src/llamafactory/cli.py
+++ b/src/llamafactory/cli.py
@@ -24,7 +24,7 @@ from .chat.chat_model import run_chat
 from .eval.evaluator import run_eval
 from .extras import logging
 from .extras.env import VERSION, print_env
-from .extras.misc import get_device_count, use_ray
+from .extras.misc import get_device_count, is_env_enabled, use_ray
 from .train.tuner import export_model, run_exp
 from .webui.interface import run_web_demo, run_web_ui
 
@@ -86,7 +86,7 @@ def main():
     elif command == Command.EXPORT:
         export_model()
     elif command == Command.TRAIN:
-        force_torchrun = os.getenv("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
+        force_torchrun = is_env_enabled("FORCE_TORCHRUN")
        if force_torchrun or (get_device_count() > 1 and not use_ray()):
             master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
             master_port = os.getenv("MASTER_PORT", str(random.randint(20001, 29999)))
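Note on the cli.py hunk above: FORCE_TORCHRUN is now read through the shared is_env_enabled helper (added to extras/misc.py later in this diff), so "y" becomes an accepted truthy value. A minimal sketch of the assumed effect:

# Sketch, not part of the diff: any of "1", "true", or "y" (case-insensitive)
# now forces the distributed torchrun launch path in llamafactory.cli.main().
import os

os.environ["FORCE_TORCHRUN"] = "y"  # previously only "true" or "1" enabled it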
diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py
index 8fe01b33..504917c0 100644
--- a/src/llamafactory/extras/env.py
+++ b/src/llamafactory/extras/env.py
@@ -45,6 +45,8 @@ def print_env() -> None:
     if is_torch_cuda_available():
         info["PyTorch version"] += " (GPU)"
         info["GPU type"] = torch.cuda.get_device_name()
+        info["GPU number"] = torch.cuda.device_count()
+        info["GPU memory"] = f"{torch.cuda.mem_get_info()[1] / (1024**3):.2f}GB"
 
     if is_torch_npu_available():
         info["PyTorch version"] += " (NPU)"
@@ -59,7 +61,7 @@ def print_env() -> None:
         pass
 
     try:
-        import bitsandbytes
+        import bitsandbytes  # type: ignore
 
         info["Bitsandbytes version"] = bitsandbytes.__version__
     except Exception:
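For reference, torch.cuda.mem_get_info() returns a (free, total) tuple in bytes, so the new "GPU memory" field reports total device memory rather than free memory. A short sketch of the assumed computation:

# Sketch, not part of the diff: how the two new print_env fields are derived.
import torch

if torch.cuda.is_available():
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    gpu_number = torch.cuda.device_count()           # "GPU number"
    gpu_memory = f"{total_bytes / (1024**3):.2f}GB"  # "GPU memory" (total, not free)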
diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py
index 7a198a85..3f15dd04 100644
--- a/src/llamafactory/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
@@ -78,7 +78,7 @@ def check_version(requirement: str, mandatory: bool = False) -> None:
     r"""
     Optionally checks the package version.
     """
-    if os.getenv("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"] and not mandatory:
+    if is_env_enabled("DISABLE_VERSION_CHECK") and not mandatory:
         logger.warning_rank0_once("Version checking has been disabled, may lead to unexpected behaviors.")
         return
 
@@ -94,7 +94,7 @@ def check_dependencies() -> None:
     r"""
     Checks the version of the required packages.
     """
-    check_version("transformers>=4.41.2,<=4.48.2,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0")
+    check_version("transformers>=4.41.2,<=4.48.3,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0")
     check_version("datasets>=2.16.0,<=3.2.0")
     check_version("accelerate>=0.34.0,<=1.2.1")
     check_version("peft>=0.11.1,<=0.12.0")
@@ -226,6 +226,13 @@ def is_gpu_or_npu_available() -> bool:
     return is_torch_npu_available() or is_torch_cuda_available()
 
 
+def is_env_enabled(env_var: str, default: str = "0") -> bool:
+    r"""
+    Checks if the environment variable is enabled.
+    """
+    return os.getenv(env_var, default).lower() in ["true", "y", "1"]
+
+
 def numpify(inputs: Union["NDArray", "torch.Tensor"]) -> "NDArray":
     r"""
     Casts a torch tensor or a numpy array to a numpy array.
@@ -244,7 +251,7 @@ def skip_check_imports() -> None:
     r"""
     Avoids flash attention import error in custom model files.
     """
-    if os.getenv("FORCE_CHECK_IMPORTS", "0").lower() not in ["true", "1"]:
+    if not is_env_enabled("FORCE_CHECK_IMPORTS"):
         transformers.dynamic_module_utils.check_imports = get_relative_imports
 
 
@@ -290,12 +297,12 @@ def try_download_model_from_other_hub(model_args: "ModelArguments") -> str:
 
 
 def use_modelscope() -> bool:
-    return os.getenv("USE_MODELSCOPE_HUB", "0").lower() in ["true", "1"]
+    return is_env_enabled("USE_MODELSCOPE_HUB")
 
 
 def use_openmind() -> bool:
-    return os.getenv("USE_OPENMIND_HUB", "0").lower() in ["true", "1"]
+    return is_env_enabled("USE_OPENMIND_HUB")
 
 
 def use_ray() -> bool:
-    return os.getenv("USE_RAY", "0").lower() in ["true", "1"]
+    return is_env_enabled("USE_RAY")
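The new is_env_enabled helper above centralizes every boolean environment-variable check touched by this patch. A minimal usage sketch (MY_FLAG is a hypothetical variable used only for illustration):

# Sketch, not part of the diff: semantics of is_env_enabled.
from llamafactory.extras.misc import is_env_enabled

enabled = is_env_enabled("RECORD_VRAM")            # True only if set to "true", "y", or "1" (case-insensitive)
fallback = is_env_enabled("MY_FLAG", default="1")  # hypothetical variable; defaults to True when unset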
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index b1c75551..95f5b34a 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -32,7 +32,7 @@ from transformers.utils import is_torch_bf16_gpu_available, is_torch_npu_available
 
 from ..extras import logging
 from ..extras.constants import CHECKPOINT_NAMES
-from ..extras.misc import check_dependencies, check_version, get_current_device
+from ..extras.misc import check_dependencies, check_version, get_current_device, is_env_enabled
 from .data_args import DataArguments
 from .evaluation_args import EvaluationArguments
 from .finetuning_args import FinetuningArguments
@@ -136,7 +136,7 @@ def _check_extra_dependencies(
         check_version("mixture-of-depth>=1.1.6", mandatory=True)
 
     if model_args.infer_backend == "vllm":
-        check_version("vllm>=0.4.3,<=0.6.5")
+        check_version("vllm>=0.4.3,<=0.7.2")
         check_version("vllm", mandatory=True)
 
     if finetuning_args.use_galore:
@@ -162,19 +162,19 @@ def _check_extra_dependencies(
 
 
 def _parse_train_args(args: Optional[Union[Dict[str, Any], List[str]]] = None) -> _TRAIN_CLS:
     parser = HfArgumentParser(_TRAIN_ARGS)
-    allow_extra_keys = os.getenv("ALLOW_EXTRA_ARGS", "0").lower() in ["true", "1"]
+    allow_extra_keys = is_env_enabled("ALLOW_EXTRA_ARGS")
     return _parse_args(parser, args, allow_extra_keys=allow_extra_keys)
 
 
 def _parse_infer_args(args: Optional[Union[Dict[str, Any], List[str]]] = None) -> _INFER_CLS:
     parser = HfArgumentParser(_INFER_ARGS)
-    allow_extra_keys = os.getenv("ALLOW_EXTRA_ARGS", "0").lower() in ["true", "1"]
+    allow_extra_keys = is_env_enabled("ALLOW_EXTRA_ARGS")
     return _parse_args(parser, args, allow_extra_keys=allow_extra_keys)
 
 
 def _parse_eval_args(args: Optional[Union[Dict[str, Any], List[str]]] = None) -> _EVAL_CLS:
     parser = HfArgumentParser(_EVAL_ARGS)
-    allow_extra_keys = os.getenv("ALLOW_EXTRA_ARGS", "0").lower() in ["true", "1"]
+    allow_extra_keys = is_env_enabled("ALLOW_EXTRA_ARGS")
     return _parse_args(parser, args, allow_extra_keys=allow_extra_keys)
 
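The relaxed vLLM pin in _check_extra_dependencies is enforced through check_version at runtime. A brief sketch of the assumed behavior:

# Sketch, not part of the diff: how the relaxed pin surfaces at runtime.
from llamafactory.extras.misc import check_version

check_version("vllm>=0.4.3,<=0.7.2")   # soft check; skipped when DISABLE_VERSION_CHECK is enabled
check_version("vllm", mandatory=True)  # mandatory check; not skipped by DISABLE_VERSION_CHECK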
diff --git a/src/llamafactory/model/model_utils/packing.py b/src/llamafactory/model/model_utils/packing.py
index 089c0274..275d7895 100644
--- a/src/llamafactory/model/model_utils/packing.py
+++ b/src/llamafactory/model/model_utils/packing.py
@@ -118,6 +118,6 @@ def configure_packing(model_args: "ModelArguments", is_trainable: bool) -> None:
     if not is_trainable or not model_args.block_diag_attn:
         return
 
-    check_version("transformers>=4.43.0,<=4.48.2")
+    check_version("transformers>=4.43.0")
     transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data
     logger.info_rank0("Using block diagonal attention for sequence packing without cross-attention.")
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index 414bda07..1a57b00e 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict
 
@@ -23,7 +22,7 @@ from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.modeling_utils import is_fsdp_enabled
 
 from ..extras import logging
-from ..extras.misc import infer_optim_dtype
+from ..extras.misc import infer_optim_dtype, is_env_enabled
 from ..extras.packages import is_transformers_version_greater_than
 from .model_utils.attention import configure_attn_implementation, print_attn_implementation
 from .model_utils.checkpointing import prepare_model_for_training
@@ -102,8 +101,7 @@ def patch_config(
     model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
     if is_torch_npu_available():
-        use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]
-        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
+        torch.npu.set_compile_mode(jit_compile=is_env_enabled("JIT_COMPILE"))
 
     configure_attn_implementation(config, model_args, is_trainable)
     configure_rope(config, model_args, is_trainable)
diff --git a/src/llamafactory/train/callbacks.py b/src/llamafactory/train/callbacks.py
index 41c83819..45191d9e 100644
--- a/src/llamafactory/train/callbacks.py
+++ b/src/llamafactory/train/callbacks.py
@@ -35,7 +35,7 @@ from typing_extensions import override
 
 from ..extras import logging
 from ..extras.constants import TRAINER_LOG, V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
-from ..extras.misc import get_peak_memory, use_ray
+from ..extras.misc import get_peak_memory, is_env_enabled, use_ray
 
 
 if is_safetensors_available():
@@ -193,7 +193,7 @@ class LogCallback(TrainerCallback):
         self.aborted = False
         self.do_train = False
         # Web UI
-        self.webui_mode = os.environ.get("LLAMABOARD_ENABLED", "0").lower() in ["true", "1"]
+        self.webui_mode = is_env_enabled("LLAMABOARD_ENABLED")
         if self.webui_mode and not use_ray():
             signal.signal(signal.SIGABRT, self._set_abort)
             self.logger_handler = logging.LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR"))
@@ -299,7 +299,7 @@ class LogCallback(TrainerCallback):
             logs["throughput"] = round(state.num_input_tokens_seen / (time.time() - self.start_time), 2)
             logs["total_tokens"] = state.num_input_tokens_seen
 
-            if os.environ.get("RECORD_VRAM", "0").lower() in ["true", "1"]:
+            if is_env_enabled("RECORD_VRAM"):
                 vram_allocated, vram_reserved = get_peak_memory()
                 logs["vram_allocated"] = round(vram_allocated / (1024**3), 2)
                 logs["vram_reserved"] = round(vram_reserved / (1024**3), 2)
diff --git a/src/llamafactory/webui/interface.py b/src/llamafactory/webui/interface.py
index 6dff348e..c6722fb8 100644
--- a/src/llamafactory/webui/interface.py
+++ b/src/llamafactory/webui/interface.py
@@ -15,6 +15,7 @@
 import os
 import platform
 
+from ..extras.misc import is_env_enabled
 from ..extras.packages import is_gradio_available
 from .common import save_config
 from .components import (
@@ -87,14 +88,14 @@ def create_web_demo() -> "gr.Blocks":
 
 
 def run_web_ui() -> None:
-    gradio_ipv6 = os.getenv("GRADIO_IPV6", "0").lower() in ["true", "1"]
-    gradio_share = os.getenv("GRADIO_SHARE", "0").lower() in ["true", "1"]
+    gradio_ipv6 = is_env_enabled("GRADIO_IPV6")
+    gradio_share = is_env_enabled("GRADIO_SHARE")
     server_name = os.getenv("GRADIO_SERVER_NAME", "[::]" if gradio_ipv6 else "0.0.0.0")
     create_ui().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True)
 
 
 def run_web_demo() -> None:
-    gradio_ipv6 = os.getenv("GRADIO_IPV6", "0").lower() in ["true", "1"]
-    gradio_share = os.getenv("GRADIO_SHARE", "0").lower() in ["true", "1"]
+    gradio_ipv6 = is_env_enabled("GRADIO_IPV6")
+    gradio_share = is_env_enabled("GRADIO_SHARE")
     server_name = os.getenv("GRADIO_SERVER_NAME", "[::]" if gradio_ipv6 else "0.0.0.0")
     create_web_demo().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True)
diff --git a/src/webui.py b/src/webui.py
index 16b8335b..088f2365 100644
--- a/src/webui.py
+++ b/src/webui.py
@@ -14,12 +14,13 @@
 
 import os
 
+from llamafactory.extras.misc import is_env_enabled
 from llamafactory.webui.interface import create_ui
 
 
 def main():
-    gradio_ipv6 = os.getenv("GRADIO_IPV6", "0").lower() in ["true", "1"]
-    gradio_share = os.getenv("GRADIO_SHARE", "0").lower() in ["true", "1"]
+    gradio_ipv6 = is_env_enabled("GRADIO_IPV6")
+    gradio_share = is_env_enabled("GRADIO_SHARE")
     server_name = os.getenv("GRADIO_SERVER_NAME", "[::]" if gradio_ipv6 else "0.0.0.0")
     create_ui().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True)
 
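Usage sketch for the Gradio toggles refactored above; the variables and their defaults are unchanged, only the accepted truthy values now include "y":

# Sketch, not part of the diff: launching the web UI with the refactored env toggles.
import os

os.environ["GRADIO_SHARE"] = "y"  # now accepted alongside "true" and "1"
os.environ["GRADIO_IPV6"] = "1"   # server binds to "[::]" instead of "0.0.0.0"

from llamafactory.webui.interface import run_web_ui

run_web_ui()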