Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-12-14 19:06:26 +08:00)
examples/extras/badam/llama3_lora_sft.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

# method
stage: sft
do_train: true
finetuning_type: full
use_badam: true
badam_switch_mode: descending
badam_switch_interval: 50
badam_verbose: 2

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 35 lines):
@@ -1,35 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type full \
    --use_badam \
    --badam_switch_mode descending \
    --badam_switch_block_every 50 \
    --badam_verbose 2 \
    --output_dir ../../../saves/LLaMA2-7B/badam/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --pure_bf16
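The removed launcher passed every hyperparameter as a CLI flag; with the new YAML the whole recipe lives in the config file, and --badam_switch_block_every reappears there as badam_switch_interval. Assuming the CLI also accepts a config path (the way the FSDP example below feeds one to src/train.py), a single-GPU BAdam run would look roughly like:

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_lora_sft.yaml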
examples/extras/fsdp_qlora/llama3_lora_sft.yaml (new file, 39 lines)
@@ -0,0 +1,39 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
quantization_bit: 4

# method
stage: sft
do_train: true
finetuning_type: lora
lora_target: q_proj,v_proj

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
fp16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 41 lines):
@@ -1,41 +0,0 @@
#!/bin/bash
# DO NOT use GPTQ/AWQ model in FSDP+QLoRA

pip install "transformers>=4.39.1"
pip install "accelerate>=0.28.0"
pip install "bitsandbytes>=0.43.0"

CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
    --config_file ../../accelerate/fsdp_config.yaml \
    ../../../src/train.py \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-70b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir ../../../saves/LLaMA2-70B/lora/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --ddp_timeout 180000000 \
    --quantization_bit 4 \
    --plot_loss \
    --fp16
examples/extras/fsdp_qlora/single_node.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
#!/bin/bash
# DO NOT use GPTQ/AWQ model in FSDP+QLoRA

pip install "transformers>=4.39.1"
pip install "accelerate>=0.28.0"
pip install "bitsandbytes>=0.43.0"

CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
    --config_file examples/accelerate/fsdp_config.yaml \
    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
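single_node.sh pins the run to two GPUs. To try the same QLoRA + FSDP recipe on more devices, presumably only the visible-device list needs to grow (with the process count in examples/accelerate/fsdp_config.yaml kept in sync); a hedged four-GPU variant:

CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file examples/accelerate/fsdp_config.yaml \
    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml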
examples/extras/galore/llama3_full_sft.yaml (new file, 42 lines)
@@ -0,0 +1,42 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

# method
stage: sft
do_train: true
finetuning_type: full
use_galore: true
galore_layerwise: true
galore_target: mlp,self_attn
galore_rank: 128
galore_scale: 2.0

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 36 lines):
@@ -1,36 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type full \
    --use_galore \
    --galore_layerwise \
    --galore_target mlp,self_attn \
    --galore_rank 128 \
    --galore_scale 2.0 \
    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --pure_bf16
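As with the BAdam example, a hedged single-GPU launch of the new GaLore config would be:

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml

Both the old script and the new YAML keep gradient_accumulation_steps at 1; layer-wise GaLore applies its updates during the backward pass, so it generally does not combine with gradient accumulation.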
@@ -1,6 +1,6 @@
 #!/bin/bash

-python ../../../scripts/llama_pro.py \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --output_dir ../../../models/llama2-7b-pro \
+python scripts/llama_pro.py \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --output_dir models/llama3-8b-instruct-pro \
     --num_expand 8
examples/extras/llama_pro/llama3_freeze_sft.yaml (new file, 40 lines)
@@ -0,0 +1,40 @@
# model
model_name_or_path: models/llama3-8b-instruct-pro

# method
stage: sft
do_train: true
finetuning_type: freeze
name_module_trainable: all
num_layer_trainable: 8
use_llama_pro: true

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b-instruct-pro/freeze/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 34 lines):
@@ -1,34 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path ../../../models/llama2-7b-pro \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type freeze \
    --name_module_trainable all \
    --num_layer_trainable 8 \
    --use_llama_pro \
    --output_dir ../../../saves/LLaMA2-7B-Pro/lora/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --fp16
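The freeze-tuning config reads models/llama3-8b-instruct-pro, i.e. the output of the expansion script in the @@ -1,6 +1,6 @@ hunk above, so the two steps chain together; a hedged end-to-end sketch (the first command is taken from that hunk, the second assumes the config-path invocation used for the other examples):

python scripts/llama_pro.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --output_dir models/llama3-8b-instruct-pro \
    --num_expand 8
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml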
examples/extras/loraplus/llama3_lora_sft.yaml (new file, 39 lines)
@@ -0,0 +1,39 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

# method
stage: sft
do_train: true
finetuning_type: lora
lora_target: q_proj,v_proj
loraplus_lr_ratio: 16.0

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 33 lines):
@@ -1,33 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../data \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --loraplus_lr_ratio 16.0 \
    --output_dir ../../saves/LLaMA2-7B/loraplus/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --fp16
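Likewise for LoRA+, where loraplus_lr_ratio: 16.0 now lives in the config rather than being passed as a flag; a hedged launch:

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml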
examples/extras/mod/llama3_full_sft.yaml (new file, 39 lines)
@@ -0,0 +1,39 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

# method
stage: sft
do_train: true
finetuning_type: full
mixture_of_depths: convert

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b-mod/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
optim: paged_adamw_8bit
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 33 lines):
@@ -1,33 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type full \
    --mixture_of_depths convert \
    --output_dir ../../../saves/LLaMA2-7B/mod/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --optim paged_adamw_8bit \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --pure_bf16
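And for the mixture-of-depths example, which keeps the paged_adamw_8bit optimizer from the old script; a hedged launch:

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml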