[v1] Support fused moe kernel for qwen3vlmoe model. (#9532)

2026-07-28 11:46:09 +08:00 · 2025-11-27 02:13:33 +08:00
parent 2b6f16f261
commit 2c4fb3c97e
2 changed files with 142 additions and 0 deletions
--- a/examples/ascend/qwen3vlmoe_lora_sft_fsdp.yaml
+++ b/examples/ascend/qwen3vlmoe_lora_sft_fsdp.yaml
@@ -0,0 +1,42 @@
+### model (LoRA SFT example config for Qwen3-VL MoE, from examples/ascend)
+model_name_or_path: Qwen/Qwen3-VL-30B-A3B-Instruct
+image_max_pixels: 262144  # = 512 * 512
+video_max_pixels: 16384  # = 128 * 128
+trust_remote_code: true
+use_kernels: true  # kernels swapped in when enabled: [NpuRMSNormKernel, NpuRoPEKernel, NpuQwen3VLMoEFusedMoEKernel]
+
+### method (training stage and adapter settings)
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all  # presumably every LoRA-capable module - confirm against the trainer's docs
+disable_gradient_checkpointing: false  # double negative: gradient checkpointing is NOT disabled
+flash_attn: disabled
+
+### dataset (data sources, chat template, and preprocessing)
+dataset: alpaca_zh_demo, alpaca_en_demo  # comma-separated names; NOTE(review): both look text-only for a VL model - confirm
+template: qwen3_vl
+cutoff_len: 1024
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output (save location, logging, and reporting)
+output_dir: saves/qwen3vlmoe/lora/sft
+logging_steps: 1
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: true
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train (batching, optimizer schedule, precision, and reproducibility)
+per_device_train_batch_size: 8
+gradient_accumulation_steps: 1
+learning_rate: 1.0e-4  # keep the decimal point and signed exponent: some YAML 1.1 loaders read "1e-4" as a string
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000  # presumably seconds - TODO confirm against the trainer's argument docs
+resume_from_checkpoint: null  # explicit null: no checkpoint path is set here
+seed: 1234