diff --git a/examples/inference/qwen3moe_lora_sft_kt.yaml b/examples/inference/qwen3moe_lora_sft_kt.yaml
new file mode 100644
index 00000000..a8495906
--- /dev/null
+++ b/examples/inference/qwen3moe_lora_sft_kt.yaml
@@ -0,0 +1,10 @@
+model_name_or_path: Qwen/Qwen3-235B-A22B-Instruct-2507
+adapter_name_or_path: saves/Kllama_Qwen3MoE_235bA22b
+template: qwen3_nothink
+infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
+trust_remote_code: true
+
+use_kt: true  # run inference through the KTransformers LoRA SFT backend
+kt_optimize_rule: examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
+cpu_infer: 32
+chunk_size: 8192
diff --git a/examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml b/examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
new file mode 100644
index 00000000..b8eceb27
--- /dev/null
+++ b/examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
@@ -0,0 +1,81 @@
+- match:
+    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.RotaryEmbedding
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    name: "^lm_head$"  # regular expression
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearTorch"
+      prefill_op: "KLinearTorch"
+
+# Disabled broader variant, kept for reference: unlike the active rule below it does not exclude mlp.shared_expert_gate.
+# - match:
+#     name: "^model\\.layers\\..*$"  # regular expression
+#     class: torch.nn.Linear  # only match modules matching name and class simultaneously
+#   replace:
+#     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+#     kwargs:
+#       generate_device: "cuda"
+#       prefill_device: "cuda"
+#       generate_op: "KLinearTorch"
+#       prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression; the negative lookahead skips mlp.shared_expert_gate
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearTorch"
+      prefill_op: "KLinearTorch"
+- match:
+    name: "^model\\.layers\\..*\\.mlp$"
+  replace:
+    class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlock  # mlp module with custom forward function
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+
+- match:
+    name: "^model\\.layers\\..*\\.mlp\\.experts$"
+  replace:
+    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert parallelism
+    kwargs:
+      prefill_device: "cuda"
+      prefill_op: "KExpertsTorch"
+      generate_device: "cpu"
+      generate_op: "KSFTExpertsCPU"
+      out_device: "cuda"
+      backend: "AMXInt8"  # choices: "AMXInt8" or "AMXBF16"
+  recursive: false  # don't recursively inject submodules of this module
+- match:
+    name: "^model\\.layers\\..*\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KQwen3MoeAttention  # optimized attention implementation
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model\\.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
+
+- match:
+    name: "^model$"
+  replace:
+    class: "ktransformers.operators.models.KQwen3MoeModel"
+    kwargs:
+      per_layer_prefill_intput_threshold: 0  # NOTE(review): "intput" spelling kept as-is; presumably must match the operator's kwarg name, confirm before renaming
diff --git a/examples/train_lora/qwen3moe_lora_sft_kt.yaml b/examples/train_lora/qwen3moe_lora_sft_kt.yaml
new file mode 100644
index 00000000..8567a35f
--- /dev/null
+++ b/examples/train_lora/qwen3moe_lora_sft_kt.yaml
@@ -0,0 +1,52 @@
+### model
+model_name_or_path: Qwen/Qwen3-235B-A22B-Instruct-2507
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+
+### dataset
+dataset: identity, alpaca_en_demo
+template: qwen3_nothink
+cutoff_len: 2048
+max_samples: 100000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/Kllama_Qwen3MoE_235bA22b
+logging_steps: 10
+save_steps: 200
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+
+### ktransformers
+use_kt: true  # use KTransformers as LoRA sft backend
+kt_optimize_rule: examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
+cpu_infer: 32
+chunk_size: 8192
+
+### eval
+# eval_dataset: alpaca_en_demo
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500