disable valset by default (#6690)

Former-commit-id: 77bbf65905
2026-03-12 23:16:04 +08:00 · 2025-01-17 21:09:30 +08:00
parent 770433fa33
commit bbf334f823
30 changed files with 142 additions and 114 deletions
--- a/examples/extras/adam_mini/qwen2_full_sft.yaml
+++ b/examples/extras/adam_mini/qwen2_full_sft.yaml
@@ -34,7 +34,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/extras/apollo/llama3_full_sft.yaml
+++ b/examples/extras/apollo/llama3_full_sft.yaml
@@ -39,7 +39,7 @@ pure_bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/extras/badam/llama3_full_sft.yaml
+++ b/examples/extras/badam/llama3_full_sft.yaml
@@ -37,7 +37,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -7,6 +7,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -35,7 +36,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -38,7 +38,7 @@ pure_bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -36,7 +36,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 loraplus_lr_ratio: 16.0
@@ -35,7 +36,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -35,7 +35,7 @@ pure_bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/extras/pissa/llama3_lora_sft.yaml
+++ b/examples/extras/pissa/llama3_lora_sft.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 pissa_init: true
 pissa_iter: 16
@@ -37,7 +38,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_full/llama3_full_sft.yaml
+++ b/examples/train_full/llama3_full_sft.yaml
@@ -34,7 +34,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_full/qwen2vl_full_sft.yaml
+++ b/examples/train_full/qwen2vl_full_sft.yaml
@@ -1,5 +1,7 @@
 ### model
 model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
 image_resolution: 262144
 video_resolution: 16384
 trust_remote_code: true
 ### method
@@ -37,7 +39,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_dpo.yaml
+++ b/examples/train_lora/llama3_lora_dpo.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: dpo
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 pref_beta: 0.1
 pref_loss: sigmoid  # choices: [sigmoid (dpo), orpo, simpo]
@@ -36,7 +37,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_kto.yaml
+++ b/examples/train_lora/llama3_lora_kto.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: kto
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 pref_beta: 0.1
@@ -35,7 +36,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_ppo.yaml
+++ b/examples/train_lora/llama3_lora_ppo.yaml
@@ -7,6 +7,7 @@ trust_remote_code: true
 stage: ppo
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
--- a/examples/train_lora/llama3_lora_pretrain.yaml
+++ b/examples/train_lora/llama3_lora_pretrain.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: pt
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -33,7 +34,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_reward.yaml
+++ b/examples/train_lora/llama3_lora_reward.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: rm
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +35,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_sft.yaml
+++ b/examples/train_lora/llama3_lora_sft.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +35,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_sft_ds3.yaml
+++ b/examples/train_lora/llama3_lora_sft_ds3.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
@@ -35,7 +36,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_sft_ray.yaml
+++ b/examples/train_lora/llama3_lora_sft_ray.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -24,6 +25,13 @@ save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 ### ray
 ray_run_name: llama3_8b_sft_lora
 ray_num_workers: 4  # number of GPUs to use
 resources_per_worker:
  GPU: 1
 placement_strategy: PACK
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
@@ -35,14 +43,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
 ### ray
 ray_run_name: llama3_8b_sft_lora
 ray_num_workers: 4  # number of GPUs to use
 resources_per_worker:
  GPU: 1
 placement_strategy: PACK
--- a/examples/train_lora/llama3_preprocess.yaml
+++ b/examples/train_lora/llama3_preprocess.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
--- a/examples/train_lora/llava1_5_lora_sft.yaml
+++ b/examples/train_lora/llava1_5_lora_sft.yaml
@@ -1,11 +1,14 @@
 ### model
 model_name_or_path: llava-hf/llava-1.5-7b-hf
 image_resolution: 262144
 video_resolution: 16384
 trust_remote_code: true
 ### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +37,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/qwen2vl_lora_dpo.yaml
+++ b/examples/train_lora/qwen2vl_lora_dpo.yaml
@@ -1,11 +1,14 @@
 ### model
 model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
 image_resolution: 262144
 video_resolution: 16384
 trust_remote_code: true
 ### method
 stage: dpo
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 pref_beta: 0.1
 pref_loss: sigmoid  # choices: [sigmoid (dpo), orpo, simpo]
@@ -36,7 +39,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/qwen2vl_lora_sft.yaml
+++ b/examples/train_lora/qwen2vl_lora_sft.yaml
@@ -1,11 +1,14 @@
 ### model
 model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
 image_resolution: 262144
 video_resolution: 16384
 trust_remote_code: true
 ### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +37,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_aqlm.yaml
+++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +35,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_awq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_awq.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +35,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+++ b/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
@@ -9,6 +9,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -37,7 +38,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_gptq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_gptq.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +35,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_otfq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_otfq.yaml
@@ -8,6 +8,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_rank: 8
 lora_target: all
 ### dataset
@@ -36,7 +37,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ accelerate>=0.34.0,<=1.0.1
 peft>=0.11.1,<=0.12.0
 trl>=0.8.6,<=0.9.6
 tokenizers>=0.19.0,<0.20.4
-gradio>=4.0.0,<6.0.0
+gradio>=4.38.0,<=5.12.0
 pandas>=2.0.0
 scipy
 einops
--- a/src/llamafactory/hparams/finetuning_args.py
+++ b/src/llamafactory/hparams/finetuning_args.py
@@ -238,7 +238,7 @@ class GaloreArguments:
        metadata={"help": "Number of steps to update the GaLore projection."},
    )
    galore_scale: float = field(
-        default=0.25,
+        default=2.0,
        metadata={"help": "GaLore scaling coefficient."},
    )
    galore_proj_type: Literal["std", "reverse_std", "right", "left", "full"] = field(
@@ -279,7 +279,7 @@ class ApolloArguments:
        metadata={"help": "Number of steps to update the APOLLO projection."},
    )
    apollo_scale: float = field(
-        default=1.0,
+        default=32.0,
        metadata={"help": "APOLLO scaling coefficient."},
    )
    apollo_proj: Literal["svd", "random"] = field(