model_name_or_path: Qwen/Qwen3-30B-A3B-Instruct-2507  # GPU memory: 8 * 78GB
do_train: true
stage: sft
finetuning_type: full  # only full fine-tuning is supported for now
dataset: alpaca_en_demo
preprocessing_num_workers: 8
cutoff_len: 4096
template: qwen3_nothink
output_dir: saves/mca/qwen3_moe_full

# global batch size = (8 GPUs // EP 2 // PP 4) * 8 grad accum steps = 8
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 2
learning_rate: 3e-6
logging_steps: 1
save_steps: 100
lr_scheduler_type: constant
bf16: true

# mcore speed-up options
tensor_model_parallel_size: 1
sequence_parallel: false
pipeline_model_parallel_size: 4
bias_activation_fusion: true
apply_rope_fusion: true
use_distributed_optimizer: true
overlap_param_gather: true
overlap_grad_reduce: true
moe_grouped_gemm: true
moe_token_dispatcher_type: alltoall
expert_model_parallel_size: 2
recompute_granularity: full
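
For reference, a minimal Python sketch of the global-batch-size arithmetic in the comment above; the 8-GPU world size is an assumption taken from the "8 * 78GB" memory note, and the variable names simply mirror the config keys.

```python
# Sketch only: reproduces the global batch size arithmetic from the config comment,
# assuming a world size of 8 GPUs (per the "8 * 78GB" memory note).
world_size = 8
expert_model_parallel_size = 2      # matches expert_model_parallel_size above
pipeline_model_parallel_size = 4    # matches pipeline_model_parallel_size above
per_device_train_batch_size = 1
gradient_accumulation_steps = 8

# Data-parallel replicas left after splitting GPUs across expert and pipeline
# groups, following the comment's convention: 8 // 2 // 4 = 1.
data_parallel_size = (world_size
                      // expert_model_parallel_size
                      // pipeline_model_parallel_size)

global_batch_size = (data_parallel_size
                     * per_device_train_batch_size
                     * gradient_accumulation_steps)
print(global_batch_size)  # -> 8, matching the comment in the config
```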