fix mod stuff

Former-commit-id: f58425ab45
hiyouga
2024-04-21 18:11:10 +08:00
parent 7c63a9b5fd
commit ec81d45d27
16 changed files with 63 additions and 88 deletions

View File

@@ -38,12 +38,11 @@ examples/
 │   └── sft.sh: Fine-tune model with BAdam
 ├── loraplus/
 │   └── sft.sh: Fine-tune model using LoRA+
+├── mod/
+│   └── sft.sh: Fine-tune model using Mixture-of-Depths
 ├── llama_pro/
 │   ├── expand.sh: Expand layers in the model
 │   └── sft.sh: Fine-tune the expanded model
-├── MoD/
-│   ├── freeze_sft.sh: Freeze finetune a model, updating only the MoD router
-│   └── sft.sh: Fine-tune the MoD model
 └── fsdp_qlora/
     └── sft.sh: Fine-tune quantized model with FSDP+QLoRA
 ```

View File

@@ -38,12 +38,11 @@ examples/
 │   └── sft.sh: 使用 BAdam 训练模型
 ├── loraplus/
 │   └── sft.sh: 使用 LoRA+ 训练模型
+├── mod/
+│   └── sft.sh: 使用深度混合训练模型
 ├── llama_pro/
 │   ├── expand.sh: 扩展模型中的层
 │   └── sft.sh: 训练扩展后的模型
-├── MoD/
-│   ├── freeze_sft.sh: 冻结微调模型,仅更新 MoD 路由器
-│   └── sft.sh: 微调 MoD 模型
 └── fsdp_qlora/
     └── sft.sh: 使用 FSDP+QLoRA 微调量化模型
 ```

View File

@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type freeze \
-    --name_module_trainable router \
-    --output_dir ../../../saves/TinyLlama/TinyLlama-1.1B-Chat-v1.0/sft \
-    --mixture_of_depths convert \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16

View File

@@ -3,20 +3,21 @@
 CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
     --stage sft \
     --do_train \
-    --model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
     --dataset alpaca_gpt4_en,glaive_toolcall \
     --dataset_dir ../../../data \
     --template default \
     --finetuning_type full \
-    --output_dir ../../../saves/TinyLlama/TinyLlama-1.1B-Chat-v1.0/sft \
     --mixture_of_depths convert \
+    --output_dir ../../../saves/LLaMA2-7B/mod/sft \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
     --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
+    --gradient_accumulation_steps 8 \
+    --optim paged_adamw_8bit \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
     --warmup_steps 20 \
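
For context on what `--mixture_of_depths convert` turns on here: Mixture-of-Depths adds a small per-layer router that sends only a fixed fraction of tokens through each transformer block, while the remaining tokens skip the block on the residual stream. The snippet below is a minimal conceptual sketch of that routing; the wrapper class, capacity value, and sigmoid weighting are illustrative assumptions, not the repository's or the MoD package's actual code.

```python
# Conceptual sketch of Mixture-of-Depths routing (illustrative, not LLaMA-Factory's code).
# A learned router scores tokens; only the top-capacity fraction is processed by the
# wrapped block, and the rest pass through unchanged on the residual stream.
import torch
import torch.nn as nn

class MoDWrapper(nn.Module):
    def __init__(self, block: nn.Module, hidden_size: int, capacity: float = 0.125):
        super().__init__()
        self.block = block                       # assumed to map (bsz, k, hidden) -> (bsz, k, hidden)
        self.router = nn.Linear(hidden_size, 1)  # per-token routing score
        self.capacity = capacity                 # fraction of tokens routed through the block

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        bsz, seq_len, hidden = hidden_states.shape
        k = max(1, int(seq_len * self.capacity))
        scores = self.router(hidden_states).squeeze(-1)        # (bsz, seq_len)
        idx = scores.topk(k, dim=-1).indices.sort(dim=-1).values  # selected tokens, original order
        gather_idx = idx.unsqueeze(-1).expand(-1, -1, hidden)
        picked = torch.gather(hidden_states, 1, gather_idx)     # (bsz, k, hidden)
        processed = self.block(picked)                          # heavy computation on few tokens
        # weight the block's update by the router score so the router gets a gradient
        weight = torch.sigmoid(torch.gather(scores, 1, idx)).unsqueeze(-1)
        out = hidden_states.clone()
        out.scatter_(1, gather_idx, picked + (processed - picked) * weight)
        return out
```

With `convert`, the example starts from a dense checkpoint and attaches routers of this kind at load time, so the routers begin untrained and are learned during fine-tuning.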

View File

@@ -11,6 +11,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
     --use_galore \
     --galore_layerwise \
     --galore_target mlp,self_attn \
+    --galore_scale 2.0 \
     --galore_rank 128 \
     --output_dir ../../../saves/LLaMA2-7B/galore/sft \
     --overwrite_cache \
@@ -28,8 +29,8 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
     --evaluation_strategy steps \
     --load_best_model_at_end \
     --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
+    --num_train_epochs 30.0 \
+    --max_samples 300 \
     --val_size 0.1 \
     --plot_loss \
     --pure_bf16
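
The GaLore flags map onto a low-rank gradient projection: `--galore_rank` is the rank of the subspace the gradients are projected into, `--galore_scale` scales the update after it is projected back to full size, and `--galore_layerwise` applies updates layer by layer to cut peak memory. Below is a minimal sketch of the core idea; the function name, state dict, and plain SGD-style update are illustrative assumptions, since the real GaLore optimizer keeps its Adam statistics inside the subspace.

```python
# Conceptual sketch of GaLore's low-rank gradient projection (illustrative only).
import torch

def galore_step(weight, grad, state, rank=128, scale=2.0, lr=5e-5, update_proj_gap=200):
    """One update on a 2-D weight matrix using a projected gradient."""
    # refresh the projection matrix every `update_proj_gap` steps via a truncated SVD
    if state.get("step", 0) % update_proj_gap == 0:
        u, _, _ = torch.linalg.svd(grad, full_matrices=False)
        state["proj"] = u[:, :rank]                  # (m, r) projector
    p = state["proj"]
    low_rank_grad = p.T @ grad                        # (r, n) gradient in the subspace
    update = p @ low_rank_grad                        # project the update back to (m, n)
    weight -= lr * scale * update                     # scale corresponds to --galore_scale
    state["step"] = state.get("step", 0) + 1

# toy usage on a plain (m, n) weight matrix
w = torch.randn(512, 512)
g = torch.randn(512, 512)
st = {}
galore_step(w, g, st, rank=128, scale=2.0)
```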

View File

@@ -3,7 +3,7 @@
 CUDA_VISIBLE_DEVICES=0 python ../../src/evaluate.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template vanilla \
+    --template fewshot \
     --finetuning_type lora \
     --task mmlu \
     --split test \