From 8d386775f27ddec1475e882c6763dd0393235861 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Wed, 6 Mar 2024 13:14:57 +0800
Subject: [PATCH] update examples

Former-commit-id: d1587c80de2e3191952a952116039b719d8613d4
---
 examples/full_multi_gpu/ds_z2_config.json     |  7 ++--
 .../full_multi_gpu/ds_z2_offload_config.json  | 32 ++++++++++++++++
 examples/full_multi_gpu/ds_z3_config.json     | 11 ++----
 .../full_multi_gpu/ds_z3_offload_config.json  | 38 +++++++++++++++++++
 examples/full_multi_gpu/multi_node.sh         | 37 ++++++++++++++++++
 .../full_multi_gpu/{sft.sh => single_node.sh} |  2 +
 examples/lora_multi_gpu/master_config.yaml    | 18 +++++++++
 .../lora_multi_gpu/{sft.sh => multi_node.sh}  |  6 ++-
 .../{config.yaml => single_config.yaml}       |  0
 examples/lora_multi_gpu/single_node.sh        | 34 +++++++++++++++++
 examples/lora_multi_gpu/slave_config.yaml     | 18 +++++++++
 examples/lora_single_gpu/dpo.sh               |  2 +
 examples/lora_single_gpu/ppo.sh               |  1 +
 examples/lora_single_gpu/predict.sh           |  1 +
 examples/lora_single_gpu/pretrain.sh          |  2 +
 examples/lora_single_gpu/reward.sh            |  2 +
 examples/lora_single_gpu/sft.sh               |  2 +
 17 files changed, 202 insertions(+), 11 deletions(-)
 create mode 100644 examples/full_multi_gpu/ds_z2_offload_config.json
 create mode 100644 examples/full_multi_gpu/ds_z3_offload_config.json
 create mode 100644 examples/full_multi_gpu/multi_node.sh
 rename examples/full_multi_gpu/{sft.sh => single_node.sh} (92%)
 create mode 100644 examples/lora_multi_gpu/master_config.yaml
 rename examples/lora_multi_gpu/{sft.sh => multi_node.sh} (81%)
 rename examples/lora_multi_gpu/{config.yaml => single_config.yaml} (100%)
 create mode 100644 examples/lora_multi_gpu/single_node.sh
 create mode 100644 examples/lora_multi_gpu/slave_config.yaml

diff --git a/examples/full_multi_gpu/ds_z2_config.json b/examples/full_multi_gpu/ds_z2_config.json
index 3d42aa15..0a1bd1d8 100644
--- a/examples/full_multi_gpu/ds_z2_config.json
+++ b/examples/full_multi_gpu/ds_z2_config.json
@@ -7,8 +7,8 @@
   "fp16": {
     "enabled": "auto",
     "loss_scale": 0,
-    "initial_scale_power": 16,
     "loss_scale_window": 1000,
+    "initial_scale_power": 16,
     "hysteresis": 2,
     "min_loss_scale": 1
   },
@@ -19,9 +19,10 @@
     "stage": 2,
     "allgather_partitions": true,
     "allgather_bucket_size": 5e8,
+    "overlap_comm": true,
     "reduce_scatter": true,
     "reduce_bucket_size": 5e8,
-    "overlap_comm": true,
-    "contiguous_gradients": true
+    "contiguous_gradients": true,
+    "round_robin_gradients": true
   }
 }
\ No newline at end of file
diff --git a/examples/full_multi_gpu/ds_z2_offload_config.json b/examples/full_multi_gpu/ds_z2_offload_config.json
new file mode 100644
index 00000000..7a398364
--- /dev/null
+++ b/examples/full_multi_gpu/ds_z2_offload_config.json
@@ -0,0 +1,32 @@
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients": true,
+    "round_robin_gradients": true
+  }
+}
\ No newline at end of file
diff --git a/examples/full_multi_gpu/ds_z3_config.json b/examples/full_multi_gpu/ds_z3_config.json
index 9c5f55da..ccf9560e 100644
--- a/examples/full_multi_gpu/ds_z3_config.json
+++ b/examples/full_multi_gpu/ds_z3_config.json
@@ -7,8 +7,8 @@
   "fp16": {
     "enabled": "auto",
     "loss_scale": 0,
-    "initial_scale_power": 16,
     "loss_scale_window": 1000,
+    "initial_scale_power": 16,
     "hysteresis": 2,
     "min_loss_scale": 1
   },
@@ -17,15 +17,12 @@
   },
   "zero_optimization": {
     "stage": 3,
-    "offload_optimizer": {
-      "device": "cpu"
-    },
-    "offload_param": {
-      "device": "cpu"
-    },
     "overlap_comm": true,
     "contiguous_gradients": true,
     "sub_group_size": 1e9,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
     "stage3_max_live_parameters": 1e9,
     "stage3_max_reuse_distance": 1e9,
     "stage3_gather_16bit_weights_on_model_save": true
diff --git a/examples/full_multi_gpu/ds_z3_offload_config.json b/examples/full_multi_gpu/ds_z3_offload_config.json
new file mode 100644
index 00000000..026aabbc
--- /dev/null
+++ b/examples/full_multi_gpu/ds_z3_offload_config.json
@@ -0,0 +1,38 @@
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 1e9,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "stage3_gather_16bit_weights_on_model_save": true
+  }
+}
\ No newline at end of file
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
new file mode 100644
index 00000000..392d717b
--- /dev/null
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    ../../src/train_bash.py \
+    --deepspeed ds_z3_config.json \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en \
+    --dataset_dir ../../data \
+    --template default \
+    --finetuning_type full \
+    --output_dir ../../saves/LLaMA2-7B/full/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 2 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --fp16
diff --git a/examples/full_multi_gpu/sft.sh b/examples/full_multi_gpu/single_node.sh
similarity index 92%
rename from examples/full_multi_gpu/sft.sh
rename to examples/full_multi_gpu/single_node.sh
index e3ced5f0..c748420f 100644
--- a/examples/full_multi_gpu/sft.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -13,11 +13,13 @@ deepspeed --num_gpus 4 ../../src/train_bash.py \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 2 \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
+    --warmup_steps 20 \
     --save_steps 100 \
     --eval_steps 100 \
     --evaluation_strategy steps \
diff --git a/examples/lora_multi_gpu/master_config.yaml b/examples/lora_multi_gpu/master_config.yaml
new file mode 100644
index 00000000..aa41f7e1
--- /dev/null
+++ b/examples/lora_multi_gpu/master_config.yaml
@@ -0,0 +1,18 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0
+main_process_ip: 192.168.0.1
+main_process_port: 29555
+main_training_function: main
+mixed_precision: fp16
+num_machines: 2
+num_processes: 16
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/examples/lora_multi_gpu/sft.sh b/examples/lora_multi_gpu/multi_node.sh
similarity index 81%
rename from examples/lora_multi_gpu/sft.sh
rename to examples/lora_multi_gpu/multi_node.sh
index 525e4f67..1ac61590 100644
--- a/examples/lora_multi_gpu/sft.sh
+++ b/examples/lora_multi_gpu/multi_node.sh
@@ -1,6 +1,8 @@
 #!/bin/bash

-CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --config_file config.yaml ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
+    --config_file master_config.yaml \
+    ../../src/train_bash.py \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
@@ -13,11 +15,13 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --config_file config.yaml ../../s
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 2 \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
+    --warmup_steps 20 \
     --save_steps 100 \
     --eval_steps 100 \
     --evaluation_strategy steps \
diff --git a/examples/lora_multi_gpu/config.yaml b/examples/lora_multi_gpu/single_config.yaml
similarity index 100%
rename from examples/lora_multi_gpu/config.yaml
rename to examples/lora_multi_gpu/single_config.yaml
diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh
new file mode 100644
index 00000000..104535d0
--- /dev/null
+++ b/examples/lora_multi_gpu/single_node.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch \
+    --config_file single_config.yaml \
+    ../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../data \
+    --template default \
+    --finetuning_type lora \
+    --lora_target q_proj,v_proj \
+    --output_dir ../../saves/LLaMA2-7B/lora/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 2 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --fp16
diff --git a/examples/lora_multi_gpu/slave_config.yaml b/examples/lora_multi_gpu/slave_config.yaml
new file mode 100644
index 00000000..fcb4bb93
--- /dev/null
+++ b/examples/lora_multi_gpu/slave_config.yaml
@@ -0,0 +1,18 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 1
+main_process_ip: 192.168.0.1
+main_process_port: 29555
+main_training_function: main
+mixed_precision: fp16
+num_machines: 2
+num_processes: 16
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/examples/lora_single_gpu/dpo.sh b/examples/lora_single_gpu/dpo.sh
index 8c2f68c9..daa8ac85 100644
--- a/examples/lora_single_gpu/dpo.sh
+++ b/examples/lora_single_gpu/dpo.sh
@@ -15,11 +15,13 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 8 \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
+    --warmup_steps 20 \
     --save_steps 100 \
     --eval_steps 100 \
     --evaluation_strategy steps \
diff --git a/examples/lora_single_gpu/ppo.sh b/examples/lora_single_gpu/ppo.sh
index 4ec0cbfb..6a5b770e 100644
--- a/examples/lora_single_gpu/ppo.sh
+++ b/examples/lora_single_gpu/ppo.sh
@@ -16,6 +16,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 512 \
+    --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --gradient_accumulation_steps 8 \
     --lr_scheduler_type cosine \
diff --git a/examples/lora_single_gpu/predict.sh b/examples/lora_single_gpu/predict.sh
index 1fb45396..eb9a18c0 100644
--- a/examples/lora_single_gpu/predict.sh
+++ b/examples/lora_single_gpu/predict.sh
@@ -13,6 +13,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
     --per_device_eval_batch_size 1 \
     --max_samples 20 \
     --predict_with_generate
diff --git a/examples/lora_single_gpu/pretrain.sh b/examples/lora_single_gpu/pretrain.sh
index 37adf51f..59bdfe62 100644
--- a/examples/lora_single_gpu/pretrain.sh
+++ b/examples/lora_single_gpu/pretrain.sh
@@ -12,11 +12,13 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 8 \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
+    --warmup_steps 20 \
     --save_steps 100 \
     --eval_steps 100 \
     --evaluation_strategy steps \
diff --git a/examples/lora_single_gpu/reward.sh b/examples/lora_single_gpu/reward.sh
index 7c19e9aa..0f775926 100644
--- a/examples/lora_single_gpu/reward.sh
+++ b/examples/lora_single_gpu/reward.sh
@@ -15,11 +15,13 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 8 \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
+    --warmup_steps 20 \
     --save_steps 100 \
     --eval_steps 100 \
     --evaluation_strategy steps \
diff --git a/examples/lora_single_gpu/sft.sh b/examples/lora_single_gpu/sft.sh
index 41d7851a..3bfbc9b8 100644
--- a/examples/lora_single_gpu/sft.sh
+++ b/examples/lora_single_gpu/sft.sh
@@ -13,11 +13,13 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 8 \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
+    --warmup_steps 20 \
     --save_steps 100 \
     --eval_steps 100 \
     --evaluation_strategy steps \
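
Usage sketch (not part of the patch): the new examples/full_multi_gpu/multi_node.sh expands NPROC_PER_NODE, NNODES, RANK, MASTER_ADDR, and MASTER_PORT from the environment. A minimal two-node launch, assuming 8 GPUs per node (consistent with num_machines: 2 / num_processes: 16 in the accelerate configs), the master address and port from master_config.yaml, and that each command is run from examples/full_multi_gpu so the relative paths resolve:

    # on the master node (rank 0)
    NPROC_PER_NODE=8 NNODES=2 RANK=0 \
        MASTER_ADDR=192.168.0.1 MASTER_PORT=29555 \
        bash multi_node.sh

    # on the second node, identical except for the rank
    NPROC_PER_NODE=8 NNODES=2 RANK=1 \
        MASTER_ADDR=192.168.0.1 MASTER_PORT=29555 \
        bash multi_node.sh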