diff --git a/.env.local b/.env.local
index fb423d75..8f361917 100644
--- a/.env.local
+++ b/.env.local
@@ -17,7 +17,7 @@ FORCE_TORCHRUN=
 MASTER_ADDR=
 MASTER_PORT=
 NNODES=
-RANK=
+NODE_RANK=
 NPROC_PER_NODE=
 # wandb
 WANDB_DISABLED=
diff --git a/Makefile b/Makefile
index c1c45951..030b39b1 100644
--- a/Makefile
+++ b/Makefile
@@ -18,4 +18,4 @@ style:
 	ruff format $(check_dirs)
 
 test:
-	CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest tests/
+	CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest -vv tests/
diff --git a/examples/README.md b/examples/README.md
index 5df1886f..3a98a088 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -89,8 +89,8 @@ llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml
 #### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
 ```
 
 #### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 46d43402..45e96bcf 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -89,8 +89,8 @@ llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml
 #### 多机指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
 ```
 
 #### 使用 DeepSpeed ZeRO-3 平均分配显存
diff --git a/src/api.py b/src/api.py
index 25a7c2e7..ad2e8cbb 100644
--- a/src/api.py
+++ b/src/api.py
@@ -23,8 +23,8 @@ from llamafactory.chat import ChatModel
 def main():
     chat_model = ChatModel()
     app = create_app(chat_model)
-    api_host = os.environ.get("API_HOST", "0.0.0.0")
-    api_port = int(os.environ.get("API_PORT", "8000"))
+    api_host = os.getenv("API_HOST", "0.0.0.0")
+    api_port = int(os.getenv("API_PORT", "8000"))
     print(f"Visit http://localhost:{api_port}/docs for API document.")
     uvicorn.run(app, host=api_host, port=api_port)
 
diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py
index a39c3147..59db566a 100644
--- a/src/llamafactory/cli.py
+++ b/src/llamafactory/cli.py
@@ -86,19 +86,19 @@ def main():
     elif command == Command.EXPORT:
         export_model()
     elif command == Command.TRAIN:
-        force_torchrun = os.environ.get("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
+        force_torchrun = os.getenv("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
         if force_torchrun or get_device_count() > 1:
-            master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
-            master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
+            master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
+            master_port = os.getenv("MASTER_PORT", str(random.randint(20001, 29999)))
             logger.info(f"Initializing distributed tasks at: {master_addr}:{master_port}")
             process = subprocess.run(
                 (
                     "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} "
                     "--master_addr {master_addr} --master_port {master_port} {file_name} {args}"
                 ).format(
-                    nnodes=os.environ.get("NNODES", "1"),
-                    node_rank=os.environ.get("RANK", "0"),
-                    nproc_per_node=os.environ.get("NPROC_PER_NODE", str(get_device_count())),
+                    nnodes=os.getenv("NNODES", "1"),
+                    node_rank=os.getenv("NODE_RANK", "0"),
+                    nproc_per_node=os.getenv("NPROC_PER_NODE", str(get_device_count())),
                     master_addr=master_addr,
                     master_port=master_port,
                     file_name=launcher.__file__,
diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py
index 88769ae9..bd75821c 100644
--- a/src/llamafactory/model/model_utils/checkpointing.py
+++ b/src/llamafactory/model/model_utils/checkpointing.py
@@ -19,7 +19,7 @@
 # limitations under the License.
 
 import inspect
-from functools import partial, wraps
+from functools import WRAPPER_ASSIGNMENTS, partial, wraps
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
 
@@ -81,7 +81,7 @@ def get_custom_gradient_checkpointing_func(gradient_checkpointing_func: Callable
     Only applies gradient checkpointing to trainable layers.
     """
 
-    @wraps(gradient_checkpointing_func)
+    @wraps(gradient_checkpointing_func, assigned=WRAPPER_ASSIGNMENTS + ("__self__",))
     def custom_gradient_checkpointing_func(func: Callable, *args: Union["torch.Tensor", Any], **kwargs):
         module: "torch.nn.Module" = func.__self__
 
@@ -92,9 +92,6 @@ def get_custom_gradient_checkpointing_func(gradient_checkpointing_func: Callable
 
         return gradient_checkpointing_func(func, *args, **kwargs)
 
-    if hasattr(gradient_checkpointing_func, "__self__"):  # fix unsloth gc test case
-        custom_gradient_checkpointing_func.__self__ = gradient_checkpointing_func.__self__
-
     return custom_gradient_checkpointing_func
 
 
diff --git a/src/llamafactory/train/test_utils.py b/src/llamafactory/train/test_utils.py
index 649a4795..55e6c199 100644
--- a/src/llamafactory/train/test_utils.py
+++ b/src/llamafactory/train/test_utils.py
@@ -80,18 +80,17 @@ def load_reference_model(
     is_trainable: bool = False,
     add_valuehead: bool = False,
 ) -> Union["PreTrainedModel", "LoraModel"]:
+    current_device = get_current_device()
     if add_valuehead:
         model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained(
-            model_path, torch_dtype=torch.float16, device_map=get_current_device()
+            model_path, torch_dtype=torch.float16, device_map=current_device
         )
         if not is_trainable:
             model.v_head = model.v_head.to(torch.float16)
 
         return model
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path, torch_dtype=torch.float16, device_map=get_current_device()
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map=current_device)
     if use_lora or use_pissa:
         model = PeftModel.from_pretrained(
             model, lora_path, subfolder="pissa_init" if use_pissa else None, is_trainable=is_trainable
@@ -110,7 +109,7 @@ def load_train_dataset(**kwargs) -> "Dataset":
     return dataset_module["train_dataset"]
 
 
-def patch_valuehead_model():
+def patch_valuehead_model() -> None:
     def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]) -> None:
         state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")}
         self.v_head.load_state_dict(state_dict, strict=False)
diff --git a/tests/data/processors/test_feedback.py b/tests/data/processors/test_feedback.py
index 2f87ccc2..c04e823b 100644
--- a/tests/data/processors/test_feedback.py
+++ b/tests/data/processors/test_feedback.py
@@ -23,9 +23,9 @@ from llamafactory.extras.constants import IGNORE_INDEX
 from llamafactory.train.test_utils import load_train_dataset
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/data/processors/test_pairwise.py b/tests/data/processors/test_pairwise.py
index 4d3f26bd..da50ca24 100644
--- a/tests/data/processors/test_pairwise.py
+++ b/tests/data/processors/test_pairwise.py
@@ -24,9 +24,9 @@ from llamafactory.extras.constants import IGNORE_INDEX
 from llamafactory.train.test_utils import load_train_dataset
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/data/processors/test_supervised.py b/tests/data/processors/test_supervised.py
index 8df9530a..965429a6 100644
--- a/tests/data/processors/test_supervised.py
+++ b/tests/data/processors/test_supervised.py
@@ -23,11 +23,11 @@ from llamafactory.extras.constants import IGNORE_INDEX
 from llamafactory.train.test_utils import load_train_dataset
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_DATA = os.environ.get("TINY_DATA", "llamafactory/tiny-supervised-dataset")
+TINY_DATA = os.getenv("TINY_DATA", "llamafactory/tiny-supervised-dataset")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/data/processors/test_unsupervised.py b/tests/data/processors/test_unsupervised.py
index 1bfab53e..c59fa5b2 100644
--- a/tests/data/processors/test_unsupervised.py
+++ b/tests/data/processors/test_unsupervised.py
@@ -22,11 +22,11 @@ from transformers import AutoTokenizer
 from llamafactory.train.test_utils import load_train_dataset
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_DATA = os.environ.get("TINY_DATA", "llamafactory/tiny-supervised-dataset")
+TINY_DATA = os.getenv("TINY_DATA", "llamafactory/tiny-supervised-dataset")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py
index 66e9b57c..5ab91d2e 100644
--- a/tests/data/test_mm_plugin.py
+++ b/tests/data/test_mm_plugin.py
@@ -31,9 +31,9 @@ if TYPE_CHECKING:
     from llamafactory.data.mm_plugin import BasePlugin
 
 
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
+HF_TOKEN = os.getenv("HF_TOKEN")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 MM_MESSAGES = [
     {"role": "user", "content": "What is in this image?"},
diff --git a/tests/data/test_template.py b/tests/data/test_template.py
index 18d03958..ba3a9953 100644
--- a/tests/data/test_template.py
+++ b/tests/data/test_template.py
@@ -27,9 +27,9 @@ if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer
 
 
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
+HF_TOKEN = os.getenv("HF_TOKEN")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 MESSAGES = [
     {"role": "user", "content": "How are you"},
diff --git a/tests/e2e/test_chat.py b/tests/e2e/test_chat.py
index 539a0ab9..b95646d7 100644
--- a/tests/e2e/test_chat.py
+++ b/tests/e2e/test_chat.py
@@ -17,7 +17,7 @@ import os
 from llamafactory.chat import ChatModel
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 INFER_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/e2e/test_train.py b/tests/e2e/test_train.py
index a8a24e42..71cda495 100644
--- a/tests/e2e/test_train.py
+++ b/tests/e2e/test_train.py
@@ -19,11 +19,11 @@ import pytest
 from llamafactory.train.tuner import export_model, run_exp
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_LLAMA_ADAPTER = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
+TINY_LLAMA_ADAPTER = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
@@ -46,7 +46,7 @@ INFER_ARGS = {
     "infer_dtype": "float16",
 }
 
-OS_NAME = os.environ.get("OS_NAME", "")
+OS_NAME = os.getenv("OS_NAME", "")
 
 
 @pytest.mark.parametrize(
diff --git a/tests/model/model_utils/test_attention.py b/tests/model/model_utils/test_attention.py
index e263e6da..3861f4bb 100644
--- a/tests/model/model_utils/test_attention.py
+++ b/tests/model/model_utils/test_attention.py
@@ -19,7 +19,7 @@ from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_availabl
 from llamafactory.train.test_utils import load_infer_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 INFER_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/model_utils/test_checkpointing.py b/tests/model/model_utils/test_checkpointing.py
index 9367eab2..0b171508 100644
--- a/tests/model/model_utils/test_checkpointing.py
+++ b/tests/model/model_utils/test_checkpointing.py
@@ -20,7 +20,7 @@ from llamafactory.extras.misc import get_current_device
 from llamafactory.train.test_utils import load_train_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
@@ -54,7 +54,7 @@ def test_checkpointing_disable():
 def test_unsloth_gradient_checkpointing():
     model = load_train_model(use_unsloth_gc=True, **TRAIN_ARGS)
     for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()):
-        assert module._gradient_checkpointing_func.__self__.__name__ == "UnslothGradientCheckpointing"  # classmethod
+        assert module._gradient_checkpointing_func.__self__.__name__ == "UnslothGradientCheckpointing"
 
 
 def test_upcast_layernorm():
diff --git a/tests/model/test_base.py b/tests/model/test_base.py
index 20298fa0..6b6aa8b8 100644
--- a/tests/model/test_base.py
+++ b/tests/model/test_base.py
@@ -16,17 +16,12 @@ import os
 
 import pytest
 
-from llamafactory.train.test_utils import (
-    compare_model,
-    load_infer_model,
-    load_reference_model,
-    patch_valuehead_model,
-)
+from llamafactory.train.test_utils import compare_model, load_infer_model, load_reference_model, patch_valuehead_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
+TINY_LLAMA_VALUEHEAD = os.getenv("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
 
 INFER_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py
index 24a1c965..964f52c9 100644
--- a/tests/model/test_freeze.py
+++ b/tests/model/test_freeze.py
@@ -19,7 +19,7 @@ import torch
 from llamafactory.train.test_utils import load_infer_model, load_train_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/test_full.py b/tests/model/test_full.py
index 383f3b89..6990a0e9 100644
--- a/tests/model/test_full.py
+++ b/tests/model/test_full.py
@@ -19,7 +19,7 @@ import torch
 from llamafactory.train.test_utils import load_infer_model, load_train_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py
index 8c014a15..e1d2148e 100644
--- a/tests/model/test_lora.py
+++ b/tests/model/test_lora.py
@@ -27,11 +27,11 @@ from llamafactory.train.test_utils import (
 )
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_LLAMA_ADAPTER = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
+TINY_LLAMA_ADAPTER = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
 
-TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
+TINY_LLAMA_VALUEHEAD = os.getenv("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/test_pissa.py b/tests/model/test_pissa.py
index a0985f05..7bfdac51 100644
--- a/tests/model/test_pissa.py
+++ b/tests/model/test_pissa.py
@@ -19,9 +19,9 @@ import pytest
 from llamafactory.train.test_utils import compare_model, load_infer_model, load_reference_model, load_train_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_LLAMA_PISSA = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-pissa")
+TINY_LLAMA_PISSA = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-pissa")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
@@ -49,7 +49,7 @@ INFER_ARGS = {
     "infer_dtype": "float16",
 }
 
-OS_NAME = os.environ.get("OS_NAME", "")
+OS_NAME = os.getenv("OS_NAME", "")
 
 
 @pytest.mark.xfail(reason="PiSSA initialization is not stable in different platform.")
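
Note on the `functools.wraps` change in `checkpointing.py` above: with `assigned=WRAPPER_ASSIGNMENTS + ("__self__",)`, `wraps` copies `__self__` onto the wrapper only when the wrapped callable actually exposes it (attributes missing on the wrapped object are silently skipped), which is why the explicit `hasattr(...)` copy can be removed while the unsloth test still sees `__self__` on the wrapper. A minimal standalone sketch of that behavior; the class below is a stand-in for illustration, not the real Unsloth implementation:

```python
from functools import WRAPPER_ASSIGNMENTS, wraps


class UnslothGradientCheckpointing:  # stand-in class, not the real unsloth one
    @classmethod
    def apply(cls, func, *args):
        return func(*args)


def make_wrapper(gradient_checkpointing_func):
    # wraps() copies each attribute listed in `assigned`, but silently skips
    # attributes the wrapped callable lacks, so plain functions (no __self__)
    # keep working while bound (class)methods carry __self__ over.
    @wraps(gradient_checkpointing_func, assigned=WRAPPER_ASSIGNMENTS + ("__self__",))
    def custom_func(func, *args, **kwargs):
        return gradient_checkpointing_func(func, *args, **kwargs)

    return custom_func


bound = make_wrapper(UnslothGradientCheckpointing.apply)
print(bound.__self__.__name__)  # -> UnslothGradientCheckpointing

plain = make_wrapper(lambda func, *args: func(*args))
print(hasattr(plain, "__self__"))  # -> False
```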