Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-11-04 18:02:19 +08:00)

Commit 9fd7a410bb: tiny fix about badam
Parent: 98fb3d015a
Former-commit-id: 03f49267c7406e36aee35639f86e6e0383897090

.github/workflows/tests.yml (2 changes, vendored)
@@ -34,7 +34,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install .[torch,dev]
+          python -m pip install ".[torch,dev]"
 
       - name: Check quality
         run: |

.gitignore (8 changes, vendored)
@@ -160,8 +160,8 @@ cython_debug/
 .idea/
 
 # custom .gitignore
-user.config
-saves/
 cache/
-wandb
-ds_badam_exp
+config/
+saves/
+output/
+wandb/
@@ -6,6 +6,7 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
@@ -32,7 +33,6 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
 pure_bf16: true
 
 ### eval
 val_size: 0.1
@@ -6,9 +6,11 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
+deepspeed: examples/deepspeed/ds_z3_config.json
 
 ### dataset
 dataset: identity,alpaca_en_demo
@@ -28,7 +30,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-6
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
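Both example configs above now set badam_mode: layer explicitly, and the multi-GPU variant additionally points at a ZeRO-3 DeepSpeed config and corrects its learning rate. Below is a minimal sketch (not part of the commit, assumes PyYAML is installed) showing how the BAdam-related keys fit together; the config text is condensed from the hunks above.

import yaml  # pip install pyyaml

config_text = """
stage: sft
do_train: true
finetuning_type: full
use_badam: true
badam_mode: layer             # layer-wise block updates; "ratio" is the other mode
badam_switch_mode: ascending  # order in which trainable blocks are activated
badam_switch_interval: 50     # switch the active block every 50 optimizer steps
badam_verbose: 2
deepspeed: examples/deepspeed/ds_z3_config.json  # only in the multi-GPU (ZeRO-3) example
learning_rate: 1.0e-4
"""

config = yaml.safe_load(config_text)
assert config["use_badam"] and config["badam_mode"] == "layer"
print(config["deepspeed"], config["learning_rate"])

A file with these keys can then be passed to llamafactory-cli train as usual.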
@@ -1,37 +0,0 @@
-#!/bin/bash
-export CUDA_VISIBLE_DEVICES=0
-
-cd ../../..
-
-llamafactory-cli train \
-    --stage sft \
-    --do_train True \
-    --model_name_or_path meta-llama/Llama-2-13b-hf \
-    --preprocessing_num_workers 16 \
-    --finetuning_type full \
-    --template default \
-    --flash_attn auto \
-    --dataset_dir data \
-    --dataset alpaca_en_demo \
-    --cutoff_len 1024 \
-    --learning_rate 1e-6 \
-    --num_train_epochs 3.0 \
-    --max_samples 100000 \
-    --per_device_train_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --max_grad_norm 1.0 \
-    --logging_steps 5 \
-    --save_steps 100 \
-    --warmup_steps 0 \
-    --optim adamw_torch \
-    --packing False \
-    --report_to none \
-    --use_badam True \
-    --output_dir saves/LLaMA2-13B/full/BAdam \
-    --plot_loss True \
-    --ddp_timeout 180000000 \
-    --include_num_input_tokens_seen True \
-    --badam_mode layer \
-    --badam_switch_mode ascending \
-    --badam_switch_interval 50
@@ -1,39 +0,0 @@
-#!/bin/bash
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-
-cd ../../..
-
-llamafactory-cli train \
-    --stage sft \
-    --do_train True \
-    --model_name_or_path meta-llama/Llama-2-13b-hf \
-    --preprocessing_num_workers 16 \
-    --finetuning_type full \
-    --template default \
-    --flash_attn auto \
-    --dataset_dir data \
-    --dataset alpaca_en_demo \
-    --cutoff_len 1024 \
-    --learning_rate 1e-6 \
-    --num_train_epochs 3.0 \
-    --max_samples 100000 \
-    --per_device_train_batch_size 8 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --max_grad_norm 1.0 \
-    --logging_steps 5 \
-    --save_steps 100 \
-    --warmup_steps 0 \
-    --optim adamw_torch \
-    --packing False \
-    --report_to none \
-    --use_badam True \
-    --output_dir saves/LLaMA2-13B/full/BAdam \
-    --fp16 True \
-    --plot_loss True \
-    --ddp_timeout 180000000 \
-    --include_num_input_tokens_seen True \
-    --badam_mode layer \
-    --badam_switch_mode ascending \
-    --badam_switch_interval 50 \
-    --deepspeed cache/ds_z3_config.json

setup.py (2 changes)
@@ -41,7 +41,7 @@ extra_require = {
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "vllm": ["vllm>=0.4.3"],
     "galore": ["galore-torch"],
-    "badam": ["badam"],
+    "badam": ["badam>=1.2.1"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],
     "awq": ["autoawq"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
@@ -121,7 +121,7 @@ def _check_extra_dependencies(
         require_version("galore_torch", "To fix: pip install galore_torch")
 
     if finetuning_args.use_badam:
-        require_version("badam", "To fix: pip install badam")
+        require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1")
 
     if finetuning_args.plot_loss:
         require_version("matplotlib", "To fix: pip install matplotlib")
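Both the setup.py extra and this runtime check now pin BAdam at 1.2.1 or newer. As a small illustration (not part of the diff; check_badam_installed is a hypothetical helper), require_version from transformers raises a clear error when the installed distribution does not satisfy the pip-style specifier:

from transformers.utils.versions import require_version


def check_badam_installed() -> None:
    # Same call as the updated _check_extra_dependencies branch: BAdam >= 1.2.1
    # provides the layer/ratio modes and the DeepSpeed ZeRO-3 support used below.
    require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1")


if __name__ == "__main__":
    check_badam_installed()  # raises if badam is missing or older than 1.2.1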
@@ -214,15 +214,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
 
     if (
         finetuning_args.use_badam
-        and training_args.parallel_mode.value == "distributed"
+        and training_args.parallel_mode == ParallelMode.DISTRIBUTED
     ):
         if finetuning_args.badam_mode == "ratio":
-            raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer")
-        if finetuning_args.badam_mode == "layer" and (not is_deepspeed_zero3_enabled()):
-            raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage.")
+            raise ValueError("Radio-based BAdam does not yet support distributed training, use layer-wise BAdam.")
+        elif not is_deepspeed_zero3_enabled():
+            raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.")
 
-    if (finetuning_args.use_galore) and training_args.deepspeed is not None:
-        raise ValueError("GaLore are incompatible with DeepSpeed yet.")
+    if finetuning_args.use_galore and training_args.deepspeed is not None:
+        raise ValueError("GaLore is incompatible with DeepSpeed yet.")
 
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
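The rewritten block moves the ZeRO-3 requirement out of the optimizer code (see the removed assert in the trainer utilities below) and into argument validation. A standalone sketch of the rule it enforces, with hypothetical names and no transformers dependency:

def validate_badam_setup(badam_mode: str, distributed: bool, zero3_enabled: bool) -> None:
    # Distilled from the check above: in distributed runs, ratio-based BAdam is
    # rejected outright and layer-wise BAdam requires DeepSpeed ZeRO-3.
    if not distributed:
        return
    if badam_mode == "ratio":
        raise ValueError("Ratio-based BAdam does not yet support distributed training, use layer-wise BAdam.")
    if not zero3_enabled:
        raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.")


# Layer-wise BAdam on multiple GPUs without a ZeRO-3 config is refused:
try:
    validate_badam_setup("layer", distributed=True, zero3_enabled=False)
except ValueError as error:
    print(error)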
@@ -96,7 +96,8 @@ class CustomDPOTrainer(DPOTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -91,7 +91,8 @@ class CustomKTOTrainer(KTOTrainer):
                 self.ref_model.eval()
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -166,7 +166,8 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
                 self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True)
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -48,7 +48,8 @@ class CustomTrainer(Trainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -72,7 +72,8 @@ class PairwiseTrainer(Trainer):
         self.processor = processor
         self.can_return_loss = True  # override property to return eval_loss
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -56,7 +56,8 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
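The same two-line change recurs in every trainer above: the badam import is reordered and followed by a blank line, while the existing pattern of binding clip_grad_norm_old_version onto the accelerator is kept. The sketch below (no badam dependency; DummyAccelerator is a stand-in for accelerate.Accelerator) shows what that MethodType binding does:

from types import MethodType


class DummyAccelerator:
    def clip_grad_norm_(self, parameters, max_norm):
        return f"default clipping to {max_norm}"


def clip_grad_norm_old_version(self, parameters, max_norm):
    # Stand-in for badam.clip_grad_norm_old_version, which clips gradients in a way
    # that is compatible with BAdam's block-wise parameter updates.
    return f"BAdam-aware clipping to {max_norm}"


accelerator = DummyAccelerator()
# Rebind the method on this instance only, exactly as the trainers do in __init__.
accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, accelerator)
print(accelerator.clip_grad_norm_([], 1.0))  # BAdam-aware clipping to 1.0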
@@ -23,6 +23,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
 import torch
 from peft import PeftModel
 from transformers import Trainer
+from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.optimization import get_scheduler
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.trainer_pt_utils import get_parameter_names
@@ -372,9 +373,6 @@ def _create_badam_optimizer(
         dict(params=decay_params, weight_decay=training_args.weight_decay),
     ]
 
-    from transformers.integrations import is_deepspeed_zero3_enabled
-    ds_zero3_enabled = is_deepspeed_zero3_enabled()
-
     if finetuning_args.badam_mode == "layer":
         from badam import BlockOptimizer
 
@@ -387,7 +385,7 @@ def _create_badam_optimizer(
             start_block=finetuning_args.badam_start_block,
             switch_mode=finetuning_args.badam_switch_mode,
             verbose=finetuning_args.badam_verbose,
-            ds_zero3_enabled=ds_zero3_enabled
+            ds_zero3_enabled=is_deepspeed_zero3_enabled(),
         )
         logger.info(
             f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
@@ -398,7 +396,6 @@ def _create_badam_optimizer(
     elif finetuning_args.badam_mode == "ratio":
         from badam import BlockOptimizerRatio
 
-        assert not ds_zero3_enabled, "BAdam with ratio-based update does not support Deepspeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer."
         assert finetuning_args.badam_update_ratio > 1e-6
         optimizer = BlockOptimizerRatio(
             param_groups=param_groups,
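With the local import and the cached ds_zero3_enabled flag removed, _create_badam_optimizer now queries is_deepspeed_zero3_enabled() directly when constructing the layer-wise BlockOptimizer, and the ratio-mode ZeRO-3 assert is dropped because that combination is now rejected during argument validation in get_train_args. A tiny sketch (assumes only that transformers is installed) of the helper promoted to a module-level import:

from transformers.integrations import is_deepspeed_zero3_enabled

# Reports whether a DeepSpeed config with ZeRO stage 3 is currently active; this is
# the value passed to BlockOptimizer via ds_zero3_enabled=is_deepspeed_zero3_enabled().
print(is_deepspeed_zero3_enabled())  # False outside a ZeRO-3 DeepSpeed run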