diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 98bd9455..73d77de5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install .[torch,dev] + python -m pip install ".[torch,dev]" - name: Check quality run: | diff --git a/.gitignore b/.gitignore index 2486e728..82e6e9e6 100644 --- a/.gitignore +++ b/.gitignore @@ -160,8 +160,8 @@ cython_debug/ .idea/ # custom .gitignore -user.config -saves/ cache/ -wandb -ds_badam_exp \ No newline at end of file +config/ +saves/ +output/ +wandb/ diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_full_sft.yaml similarity index 97% rename from examples/extras/badam/llama3_lora_sft.yaml rename to examples/extras/badam/llama3_full_sft.yaml index a78de2fa..31d61c33 100644 --- a/examples/extras/badam/llama3_lora_sft.yaml +++ b/examples/extras/badam/llama3_full_sft.yaml @@ -6,6 +6,7 @@ stage: sft do_train: true finetuning_type: full use_badam: true +badam_mode: layer badam_switch_mode: ascending badam_switch_interval: 50 badam_verbose: 2 @@ -32,7 +33,6 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -pure_bf16: true ### eval val_size: 0.1 diff --git a/examples/extras/badam/llama3_badam_sft.yaml b/examples/extras/badam/llama3_full_sft_ds3.yaml similarity index 89% rename from examples/extras/badam/llama3_badam_sft.yaml rename to examples/extras/badam/llama3_full_sft_ds3.yaml index f5adb220..f2d7309f 100644 --- a/examples/extras/badam/llama3_badam_sft.yaml +++ b/examples/extras/badam/llama3_full_sft_ds3.yaml @@ -6,9 +6,11 @@ stage: sft do_train: true finetuning_type: full use_badam: true +badam_mode: layer badam_switch_mode: ascending badam_switch_interval: 50 badam_verbose: 2 +deepspeed: examples/deepspeed/ds_z3_config.json ### dataset dataset: identity,alpaca_en_demo @@ -28,7 +30,7 @@ 
overwrite_output_dir: true ### train per_device_train_batch_size: 1 gradient_accumulation_steps: 8 -learning_rate: 1.0e-6 +learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 diff --git a/examples/extras/badam/train_single_gpu.sh b/examples/extras/badam/train_single_gpu.sh deleted file mode 100644 index 8af79007..00000000 --- a/examples/extras/badam/train_single_gpu.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -export CUDA_VISIBLE_DEVICES=0 - -cd ../../.. - -llamafactory-cli train \ - --stage sft \ - --do_train True \ - --model_name_or_path meta-llama/Llama-2-13b-hf \ - --preprocessing_num_workers 16 \ - --finetuning_type full \ - --template default \ - --flash_attn auto \ - --dataset_dir data \ - --dataset alpaca_en_demo \ - --cutoff_len 1024 \ - --learning_rate 1e-6 \ - --num_train_epochs 3.0 \ - --max_samples 100000 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 8 \ - --lr_scheduler_type cosine \ - --max_grad_norm 1.0 \ - --logging_steps 5 \ - --save_steps 100 \ - --warmup_steps 0 \ - --optim adamw_torch \ - --packing False \ - --report_to none \ - --use_badam True \ - --output_dir saves/LLaMA2-13B/full/BAdam \ - --plot_loss True \ - --ddp_timeout 180000000 \ - --include_num_input_tokens_seen True \ - --badam_mode layer \ - --badam_switch_mode ascending \ - --badam_switch_interval 50 \ No newline at end of file diff --git a/examples/extras/badam/train_zero3.sh b/examples/extras/badam/train_zero3.sh deleted file mode 100644 index 3b182134..00000000 --- a/examples/extras/badam/train_zero3.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -cd ../../.. 
- -llamafactory-cli train \ - --stage sft \ - --do_train True \ - --model_name_or_path meta-llama/Llama-2-13b-hf \ - --preprocessing_num_workers 16 \ - --finetuning_type full \ - --template default \ - --flash_attn auto \ - --dataset_dir data \ - --dataset alpaca_en_demo \ - --cutoff_len 1024 \ - --learning_rate 1e-6 \ - --num_train_epochs 3.0 \ - --max_samples 100000 \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 2 \ - --lr_scheduler_type cosine \ - --max_grad_norm 1.0 \ - --logging_steps 5 \ - --save_steps 100 \ - --warmup_steps 0 \ - --optim adamw_torch \ - --packing False \ - --report_to none \ - --use_badam True \ - --output_dir saves/LLaMA2-13B/full/BAdam \ - --fp16 True \ - --plot_loss True \ - --ddp_timeout 180000000 \ - --include_num_input_tokens_seen True \ - --badam_mode layer \ - --badam_switch_mode ascending \ - --badam_switch_interval 50 \ - --deepspeed cache/ds_z3_config.json \ No newline at end of file diff --git a/setup.py b/setup.py index 3d2ac921..64f50a87 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ extra_require = { "bitsandbytes": ["bitsandbytes>=0.39.0"], "vllm": ["vllm>=0.4.3"], "galore": ["galore-torch"], - "badam": ["badam"], + "badam": ["badam>=1.2.1"], "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"], "awq": ["autoawq"], "aqlm": ["aqlm[gpu]>=1.1.0"], diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index f2ccd5e6..a4b7f7a5 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -121,7 +121,7 @@ def _check_extra_dependencies( require_version("galore_torch", "To fix: pip install galore_torch") if finetuning_args.use_badam: - require_version("badam", "To fix: pip install badam") + require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1") if finetuning_args.plot_loss: require_version("matplotlib", "To fix: pip install matplotlib") @@ -214,15 +214,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if ( 
finetuning_args.use_badam - and training_args.parallel_mode.value == "distributed" + and training_args.parallel_mode == ParallelMode.DISTRIBUTED ): if finetuning_args.badam_mode == "ratio": - raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer") - if finetuning_args.badam_mode == "layer" and (not is_deepspeed_zero3_enabled()): - raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage.") + raise ValueError("Ratio-based BAdam does not yet support distributed training, use layer-wise BAdam.") + elif not is_deepspeed_zero3_enabled(): + raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.") - if (finetuning_args.use_galore) and training_args.deepspeed is not None: - raise ValueError("GaLore are incompatible with DeepSpeed yet.") + if finetuning_args.use_galore and training_args.deepspeed is not None: + raise ValueError("GaLore is incompatible with DeepSpeed yet.") if model_args.infer_backend == "vllm": raise ValueError("vLLM backend is only available for API, CLI and Web.") diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index a3e0e961..ed4fd5d9 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -96,7 +96,8 @@ class CustomDPOTrainer(DPOTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 0d50987f..c2edf95a 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -91,7 +91,8 @@ class CustomKTOTrainer(KTOTrainer):
self.ref_model.eval() if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 2d5d7ffc..70d01919 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -166,7 +166,8 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index d3516b41..b6fb161d 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -48,7 +48,8 @@ class CustomTrainer(Trainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 433251cf..70c2e9a0 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -72,7 +72,8 @@ class PairwiseTrainer(Trainer): self.processor = processor self.can_return_loss = True # override property to return eval_loss if finetuning_args.use_badam: - from badam 
import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 45799b96..8f18317f 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -56,7 +56,8 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 0206dcb6..21d41c36 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -23,6 +23,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch from peft import PeftModel from transformers import Trainer +from transformers.integrations import is_deepspeed_zero3_enabled from transformers.optimization import get_scheduler from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS from transformers.trainer_pt_utils import get_parameter_names @@ -372,9 +373,6 @@ def _create_badam_optimizer( dict(params=decay_params, weight_decay=training_args.weight_decay), ] - from transformers.integrations import is_deepspeed_zero3_enabled - ds_zero3_enabled = is_deepspeed_zero3_enabled() - if finetuning_args.badam_mode == "layer": from badam import BlockOptimizer @@ -387,7 +385,7 @@ def _create_badam_optimizer( start_block=finetuning_args.badam_start_block, switch_mode=finetuning_args.badam_switch_mode, 
verbose=finetuning_args.badam_verbose, - ds_zero3_enabled=ds_zero3_enabled + ds_zero3_enabled=is_deepspeed_zero3_enabled(), ) logger.info( f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, " @@ -398,7 +396,6 @@ def _create_badam_optimizer( elif finetuning_args.badam_mode == "ratio": from badam import BlockOptimizerRatio - assert not ds_zero3_enabled, "BAdam with ratio-based update does not support Deepspeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer." assert finetuning_args.badam_update_ratio > 1e-6 optimizer = BlockOptimizerRatio( param_groups=param_groups,