Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-12-14 19:06:26 +08:00)
examples/extras/badam/llama3_lora_sft.yaml (new file, 41 lines)
@@ -0,0 +1,41 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

# method
stage: sft
do_train: true
finetuning_type: full
use_badam: true
badam_switch_mode: descending
badam_switch_interval: 50
badam_verbose: 2

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 35 lines):
@@ -1,35 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type full \
    --use_badam \
    --badam_switch_mode descending \
    --badam_switch_block_every 50 \
    --badam_verbose 2 \
    --output_dir ../../../saves/LLaMA2-7B/badam/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --pure_bf16
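The removed launcher passed every hyperparameter as a CLI flag; with the new YAML the whole recipe lives in the config file, and --badam_switch_block_every reappears there as badam_switch_interval. Assuming the CLI also accepts a config path (the way the FSDP example below feeds one to src/train.py), a single-GPU BAdam run would look roughly like:

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_lora_sft.yaml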
examples/extras/fsdp_qlora/llama3_lora_sft.yaml (new file, 39 lines)
@@ -0,0 +1,39 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
quantization_bit: 4

# method
stage: sft
do_train: true
finetuning_type: lora
lora_target: q_proj,v_proj

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
fp16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 41 lines):
@@ -1,41 +0,0 @@
#!/bin/bash
# DO NOT use GPTQ/AWQ model in FSDP+QLoRA

pip install "transformers>=4.39.1"
pip install "accelerate>=0.28.0"
pip install "bitsandbytes>=0.43.0"

CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
    --config_file ../../accelerate/fsdp_config.yaml \
    ../../../src/train.py \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-70b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir ../../../saves/LLaMA2-70B/lora/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --ddp_timeout 180000000 \
    --quantization_bit 4 \
    --plot_loss \
    --fp16
examples/extras/fsdp_qlora/single_node.sh (new file, 10 lines)
@@ -0,0 +1,10 @@
#!/bin/bash
# DO NOT use GPTQ/AWQ model in FSDP+QLoRA

pip install "transformers>=4.39.1"
pip install "accelerate>=0.28.0"
pip install "bitsandbytes>=0.43.0"

CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
    --config_file examples/accelerate/fsdp_config.yaml \
    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
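single_node.sh pins the run to two GPUs. To try the same QLoRA + FSDP recipe on more devices, presumably only the visible-device list needs to grow (with the process count in examples/accelerate/fsdp_config.yaml kept in sync); a hedged four-GPU variant:

CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file examples/accelerate/fsdp_config.yaml \
    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml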
examples/extras/galore/llama3_full_sft.yaml (new file, 42 lines)
@@ -0,0 +1,42 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

# method
stage: sft
do_train: true
finetuning_type: full
use_galore: true
galore_layerwise: true
galore_target: mlp,self_attn
galore_rank: 128
galore_scale: 2.0

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 36 lines):
@@ -1,36 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type full \
    --use_galore \
    --galore_layerwise \
    --galore_target mlp,self_attn \
    --galore_rank 128 \
    --galore_scale 2.0 \
    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --pure_bf16
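As with the BAdam example, a hedged single-GPU launch of the new GaLore config would be:

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml

Both the old script and the new YAML keep gradient_accumulation_steps at 1; layer-wise GaLore applies its updates during the backward pass, so it generally does not combine with gradient accumulation.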
@@ -1,6 +1,6 @@
 #!/bin/bash

-python ../../../scripts/llama_pro.py \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --output_dir ../../../models/llama2-7b-pro \
+python scripts/llama_pro.py \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --output_dir models/llama3-8b-instruct-pro \
     --num_expand 8
examples/extras/llama_pro/llama3_freeze_sft.yaml (new file, 40 lines)
@@ -0,0 +1,40 @@
# model
model_name_or_path: models/llama3-8b-instruct-pro

# method
stage: sft
do_train: true
finetuning_type: freeze
name_module_trainable: all
num_layer_trainable: 8
use_llama_pro: true

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b-instruct-pro/freeze/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 34 lines):
@@ -1,34 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path ../../../models/llama2-7b-pro \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type freeze \
    --name_module_trainable all \
    --num_layer_trainable 8 \
    --use_llama_pro \
    --output_dir ../../../saves/LLaMA2-7B-Pro/lora/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --fp16
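The freeze-tuning config reads models/llama3-8b-instruct-pro, i.e. the output of the expansion script in the @@ -1,6 +1,6 @@ hunk above, so the two steps chain together; a hedged end-to-end sketch (the first command is taken from that hunk, the second assumes the config-path invocation used for the other examples):

python scripts/llama_pro.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --output_dir models/llama3-8b-instruct-pro \
    --num_expand 8
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml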
examples/extras/loraplus/llama3_lora_sft.yaml (new file, 39 lines)
@@ -0,0 +1,39 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

# method
stage: sft
do_train: true
finetuning_type: lora
lora_target: q_proj,v_proj
loraplus_lr_ratio: 16.0

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 33 lines):
@@ -1,33 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../data \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --loraplus_lr_ratio 16.0 \
    --output_dir ../../saves/LLaMA2-7B/loraplus/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --fp16
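Likewise for LoRA+, where loraplus_lr_ratio: 16.0 now lives in the config rather than being passed as a flag; a hedged launch:

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml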
examples/extras/mod/llama3_full_sft.yaml (new file, 39 lines)
@@ -0,0 +1,39 @@
# model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

# method
stage: sft
do_train: true
finetuning_type: full
mixture_of_depths: convert

# dataset
dataset: identity,alpaca_gpt4_en
template: llama3
cutoff_len: 1024
max_samples: 1000
val_size: 0.1
overwrite_cache: true
preprocessing_num_workers: 16

# output
output_dir: saves/llama3-8b-mod/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

# train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
optim: paged_adamw_8bit
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
pure_bf16: true

# eval
per_device_eval_batch_size: 1
evaluation_strategy: steps
eval_steps: 500
Removed (old flag-based launcher, 33 lines):
@@ -1,33 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type full \
    --mixture_of_depths convert \
    --output_dir ../../../saves/LLaMA2-7B/mod/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --optim paged_adamw_8bit \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
    --pure_bf16
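And for the mixture-of-depths example, which keeps the paged_adamw_8bit optimizer from the old script; a hedged launch:

CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml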