diff --git a/examples/deepspeed/ds_z2_autotp_config.json b/examples/deepspeed/ds_z2_autotp_config.json new file mode 100644 index 000000000..7090d47b3 --- /dev/null +++ b/examples/deepspeed/ds_z2_autotp_config.json @@ -0,0 +1,32 @@ +{ + "_comment": "supported model list: https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/#supported-models", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "contiguous_gradients": true, + "round_robin_gradients": true + }, + "tensor_parallel": { + "autotp_size": 2 + } +} diff --git a/examples/train_full/qwen3_full_sft_autotp.yaml b/examples/train_full/qwen3_full_sft_autotp.yaml new file mode 100644 index 000000000..2726203f7 --- /dev/null +++ b/examples/train_full/qwen3_full_sft_autotp.yaml @@ -0,0 +1,46 @@ +### model +model_name_or_path: Qwen/Qwen3-32B +trust_remote_code: true +use_v1_kernels: true + +### method +stage: sft +do_train: true +finetuning_type: full +deepspeed: examples/deepspeed/ds_z2_autotp_config.json + +### dataset +dataset: identity,alpaca_en_demo +template: qwen3 +cutoff_len: 2048 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/qwen3-32b/full/sft_autotp +logging_steps: 1 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 1 
+learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# eval_dataset: alpaca_en_demo +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500