diff --git a/examples/extras/galore/adamw.sh b/examples/extras/galore/adamw.sh index cad03879..1fd2aaf0 100644 --- a/examples/extras/galore/adamw.sh +++ b/examples/extras/galore/adamw.sh @@ -15,7 +15,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \ --preprocessing_num_workers 16 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 8 \ + --gradient_accumulation_steps 2 \ --lr_scheduler_type cosine \ --logging_steps 10 \ --warmup_steps 20 \ diff --git a/examples/extras/galore/adamw_8bit_bf16.sh b/examples/extras/galore/adamw_8bit_bf16.sh index 9599bf00..01f4e8de 100644 --- a/examples/extras/galore/adamw_8bit_bf16.sh +++ b/examples/extras/galore/adamw_8bit_bf16.sh @@ -16,7 +16,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \ --preprocessing_num_workers 16 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 8 \ + --gradient_accumulation_steps 2 \ --lr_scheduler_type cosine \ --logging_steps 10 \ --warmup_steps 20 \ diff --git a/examples/extras/galore/galore_adamw.sh b/examples/extras/galore/galore_adamw.sh index 28ce72bb..83be6a51 100644 --- a/examples/extras/galore/galore_adamw.sh +++ b/examples/extras/galore/galore_adamw.sh @@ -18,7 +18,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \ --preprocessing_num_workers 16 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 8 \ + --gradient_accumulation_steps 2 \ --lr_scheduler_type cosine \ --logging_steps 10 \ --warmup_steps 20 \ diff --git a/examples/extras/galore/galore_adamw_8bit_bf16.sh b/examples/extras/galore/galore_adamw_8bit_bf16.sh index 0578856c..ddddcb33 100644 --- a/examples/extras/galore/galore_adamw_8bit_bf16.sh +++ b/examples/extras/galore/galore_adamw_8bit_bf16.sh @@ -10,7 +10,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \ --finetuning_type full \ --use_galore \ --galore_target mlp,self_attn \ - --galore_rank 32 \ + --galore_rank 16 \ --optim adamw_8bit \ --output_dir ../../../saves/LLaMA2-7B/galore/sft \ --overwrite_cache \ @@ -19,7 +19,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \ --preprocessing_num_workers 16 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 8 \ + --gradient_accumulation_steps 2 \ --lr_scheduler_type cosine \ --logging_steps 10 \ --warmup_steps 20 \