[v1] fix padding free with sp (#10513)

2026-07-28 11:46:09 +08:00 · 2026-05-26 23:49:21 +08:00
parent 8e68764b65
commit 01398eb18d
6 changed files with 80 additions and 21 deletions
--- a/examples/v1/train_batching_strategy/train_full_fsdp2_dynamic_padding_free.yaml
+++ b/examples/v1/train_batching_strategy/train_full_fsdp2_dynamic_padding_free.yaml
@@ -0,0 +1,30 @@
+model: Qwen/Qwen3-0.6B
+model_class: llm
+
+template: qwen3_nothink
+
+kernel_config:
+  name: auto
+  include_kernels: auto # choices: null / true / false / auto / comma-separated kernel ids (kernel_id1,kernel_id2,kernel_id3); default: null
+
+quant_config: null
+
+dist_config:
+  name: fsdp2
+  dcp_path: null # example value: /mnt/f/pretrain_models/Qwen3-0.6B-dcp
+
+### data
+train_dataset: data/v1_sft_demo.yaml
+
+### training
+output_dir: outputs/test_fsdp2 # NOTE(review): same output_dir as train_full_fsdp2_padding_free.yaml - confirm sharing it is intended
+micro_batch_size: 4
+batching_strategy: dynamic_padding_free
+flash_attn: flash_attention2
+cutoff_len: 2048
+learning_rate: 1.0e-4
+max_steps: 10
+
+### sample
+sample_backend: hf
+max_new_tokens: 128
--- a/examples/v1/train_batching_strategy/train_full_fsdp2_padding_free.yaml
+++ b/examples/v1/train_batching_strategy/train_full_fsdp2_padding_free.yaml
@@ -20,7 +20,7 @@ train_dataset: data/v1_sft_demo.yaml
 output_dir: outputs/test_fsdp2
 micro_batch_size: 4
 batching_strategy: padding_free
-flash_attn: fa2
+flash_attn: flash_attention2
 cutoff_len: 2048
 learning_rate: 1.0e-4
 max_steps: 10