mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-08-02 03:32:50 +08:00
disable valset by default (#6690)
Former-commit-id: 77bbf659053e1b205974eb6df69998fee0305d26
This commit is contained in:
parent
770433fa33
commit
bbf334f823
@ -34,7 +34,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -39,7 +39,7 @@ pure_bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -37,7 +37,7 @@ lr_scheduler_type: cosine
|
|||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -7,6 +7,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -35,7 +36,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -38,7 +38,7 @@ pure_bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -36,7 +36,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
loraplus_lr_ratio: 16.0
|
loraplus_lr_ratio: 16.0
|
||||||
|
|
||||||
@ -35,7 +36,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -35,7 +35,7 @@ pure_bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
pissa_init: true
|
pissa_init: true
|
||||||
pissa_iter: 16
|
pissa_iter: 16
|
||||||
@ -37,7 +38,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -34,7 +34,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
### model
|
### model
|
||||||
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
||||||
|
image_resolution: 262144
|
||||||
|
video_resolution: 16384
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
### method
|
### method
|
||||||
@ -37,7 +39,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: dpo
|
stage: dpo
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
pref_beta: 0.1
|
pref_beta: 0.1
|
||||||
pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo]
|
pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo]
|
||||||
@ -36,7 +37,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: kto
|
stage: kto
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
pref_beta: 0.1
|
pref_beta: 0.1
|
||||||
|
|
||||||
@ -35,7 +36,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -7,6 +7,7 @@ trust_remote_code: true
|
|||||||
stage: ppo
|
stage: ppo
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: pt
|
stage: pt
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -33,7 +34,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: rm
|
stage: rm
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -34,7 +35,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -34,7 +35,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
|
deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
|
||||||
|
|
||||||
@ -35,7 +36,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -24,6 +25,13 @@ save_steps: 500
|
|||||||
plot_loss: true
|
plot_loss: true
|
||||||
overwrite_output_dir: true
|
overwrite_output_dir: true
|
||||||
|
|
||||||
|
### ray
|
||||||
|
ray_run_name: llama3_8b_sft_lora
|
||||||
|
ray_num_workers: 4 # number of GPUs to use
|
||||||
|
resources_per_worker:
|
||||||
|
GPU: 1
|
||||||
|
placement_strategy: PACK
|
||||||
|
|
||||||
### train
|
### train
|
||||||
per_device_train_batch_size: 1
|
per_device_train_batch_size: 1
|
||||||
gradient_accumulation_steps: 8
|
gradient_accumulation_steps: 8
|
||||||
@ -35,14 +43,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
|
||||||
### ray
|
|
||||||
ray_run_name: llama3_8b_sft_lora
|
|
||||||
ray_num_workers: 4 # number of GPUs to use
|
|
||||||
resources_per_worker:
|
|
||||||
GPU: 1
|
|
||||||
placement_strategy: PACK
|
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
|
@ -1,11 +1,14 @@
|
|||||||
### model
|
### model
|
||||||
model_name_or_path: llava-hf/llava-1.5-7b-hf
|
model_name_or_path: llava-hf/llava-1.5-7b-hf
|
||||||
|
image_resolution: 262144
|
||||||
|
video_resolution: 16384
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
### method
|
### method
|
||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -34,7 +37,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -1,11 +1,14 @@
|
|||||||
### model
|
### model
|
||||||
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
||||||
|
image_resolution: 262144
|
||||||
|
video_resolution: 16384
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
### method
|
### method
|
||||||
stage: dpo
|
stage: dpo
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
pref_beta: 0.1
|
pref_beta: 0.1
|
||||||
pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo]
|
pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo]
|
||||||
@ -36,7 +39,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -1,11 +1,14 @@
|
|||||||
### model
|
### model
|
||||||
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
|
||||||
|
image_resolution: 262144
|
||||||
|
video_resolution: 16384
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
### method
|
### method
|
||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -34,7 +37,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -34,7 +35,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -34,7 +35,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -9,6 +9,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -37,7 +38,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -34,7 +35,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -8,6 +8,7 @@ trust_remote_code: true
|
|||||||
stage: sft
|
stage: sft
|
||||||
do_train: true
|
do_train: true
|
||||||
finetuning_type: lora
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
lora_target: all
|
lora_target: all
|
||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
@ -36,7 +37,7 @@ bf16: true
|
|||||||
ddp_timeout: 180000000
|
ddp_timeout: 180000000
|
||||||
|
|
||||||
### eval
|
### eval
|
||||||
val_size: 0.1
|
# val_size: 0.1
|
||||||
per_device_eval_batch_size: 1
|
# per_device_eval_batch_size: 1
|
||||||
eval_strategy: steps
|
# eval_strategy: steps
|
||||||
eval_steps: 500
|
# eval_steps: 500
|
||||||
|
@ -4,7 +4,7 @@ accelerate>=0.34.0,<=1.0.1
|
|||||||
peft>=0.11.1,<=0.12.0
|
peft>=0.11.1,<=0.12.0
|
||||||
trl>=0.8.6,<=0.9.6
|
trl>=0.8.6,<=0.9.6
|
||||||
tokenizers>=0.19.0,<0.20.4
|
tokenizers>=0.19.0,<0.20.4
|
||||||
gradio>=4.0.0,<6.0.0
|
gradio>=4.38.0,<=5.12.0
|
||||||
pandas>=2.0.0
|
pandas>=2.0.0
|
||||||
scipy
|
scipy
|
||||||
einops
|
einops
|
||||||
|
@ -238,7 +238,7 @@ class GaloreArguments:
|
|||||||
metadata={"help": "Number of steps to update the GaLore projection."},
|
metadata={"help": "Number of steps to update the GaLore projection."},
|
||||||
)
|
)
|
||||||
galore_scale: float = field(
|
galore_scale: float = field(
|
||||||
default=0.25,
|
default=2.0,
|
||||||
metadata={"help": "GaLore scaling coefficient."},
|
metadata={"help": "GaLore scaling coefficient."},
|
||||||
)
|
)
|
||||||
galore_proj_type: Literal["std", "reverse_std", "right", "left", "full"] = field(
|
galore_proj_type: Literal["std", "reverse_std", "right", "left", "full"] = field(
|
||||||
@ -279,7 +279,7 @@ class ApolloArguments:
|
|||||||
metadata={"help": "Number of steps to update the APOLLO projection."},
|
metadata={"help": "Number of steps to update the APOLLO projection."},
|
||||||
)
|
)
|
||||||
apollo_scale: float = field(
|
apollo_scale: float = field(
|
||||||
default=1.0,
|
default=32.0,
|
||||||
metadata={"help": "APOLLO scaling coefficient."},
|
metadata={"help": "APOLLO scaling coefficient."},
|
||||||
)
|
)
|
||||||
apollo_proj: Literal["svd", "random"] = field(
|
apollo_proj: Literal["svd", "random"] = field(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user