mirror of https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-11-04 18:02:19 +08:00

[train] KTransformers SFT as backend engine for LLaMA-Factory (#9400)

Co-authored-by: jimmy128 <jimmy128@noreply.gitcode.com>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>

parent 3ae15da9c0
commit 934b3084ee

@@ -15,6 +15,7 @@ LLAMAFACTORY_VERBOSITY=
 USE_MODELSCOPE_HUB=
 USE_OPENMIND_HUB=
 USE_RAY=
+USE_KT=
 RECORD_VRAM=
 OPTIM_TORCH=
 NPU_JIT_COMPILE=

examples/inference/deepseek2_lora_sft_kt.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
model_name_or_path: deepseek-ai/DeepSeek-V2-Lite
adapter_name_or_path: saves/Kllama_deepseekV2
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true  # use KTransformers as the LoRA SFT backend for inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
cpu_infer: 32
chunk_size: 8192
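
With a config like this in place, inference through the KTransformers backend is launched the same way as with the other backends. A minimal usage sketch, assuming the standard LLaMA-Factory CLI entry point (the exact invocation may differ in your setup):

llamafactory-cli chat examples/inference/deepseek2_lora_sft_kt.yaml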
							
								
								
									

examples/inference/deepseek3_kt.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true  # use KTransformers as the LoRA SFT backend for inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192

examples/inference/deepseek3_lora_sft_kt.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
adapter_name_or_path: saves/Kllama_deepseekV3
template: deepseek
infer_backend: ktransformers  # choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code: true

use_kt: true  # use KTransformers as the LoRA SFT backend for inference
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192

@@ -1,4 +1,4 @@
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 template: llama3
-infer_backend: huggingface  # choices: [huggingface, vllm, sglang]
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
 trust_remote_code: true

@@ -1,4 +1,4 @@
 model_name_or_path: saves/llama3-8b/full/sft
 template: llama3
-infer_backend: huggingface  # choices: [huggingface, vllm, sglang]
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
 trust_remote_code: true

@@ -1,5 +1,5 @@
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 template: llama3
-infer_backend: huggingface  # choices: [huggingface, vllm, sglang]
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
 trust_remote_code: true

@@ -1,4 +1,4 @@
 model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
 template: qwen2_vl
-infer_backend: huggingface  # choices: [huggingface, vllm, sglang]
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
 trust_remote_code: true

examples/kt_optimize_rules/DeepSeek-V2-Chat-sft-amx.yaml (new file, 69 lines)
@@ -0,0 +1,69 @@
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
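
Each entry in the optimize-rule files above pairs a match condition (a module-name regex and/or a class) with a replace target that is injected in its place. The snippet below is only a rough illustration of that selection step, written against a toy PyTorch module; find_matches and TinyModel are hypothetical names and this is not KTransformers' actual injection code.

import re
import torch.nn as nn

def find_matches(model: nn.Module, name_pattern=None, cls=None):
    # Hypothetical helper: yield submodules whose qualified name matches the
    # regex and whose type matches the class, mirroring a rule's `match` block.
    pattern = re.compile(name_pattern) if name_pattern else None
    for name, module in model.named_modules():
        if pattern is not None and pattern.match(name) is None:
            continue
        if cls is not None and not isinstance(module, cls):
            continue
        yield name, module

class TinyModel(nn.Module):  # stand-in for a real decoder stack
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(2)])
        self.lm_head = nn.Linear(8, 8)

# Analogous to:  match: { name: "^layers\\..*", class: torch.nn.Linear }
for name, _ in find_matches(TinyModel(), r"^layers\..*", nn.Linear):
    print(name)  # prints "layers.0" and "layers.1"; "lm_head" fails the name regex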
							
								
								
									

examples/kt_optimize_rules/DeepSeek-V2-Chat.yaml (new file, 68 lines)
@@ -0,0 +1,68 @@
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

@@ -0,0 +1,139 @@ (new file)
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.([12][0-9])\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:0"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:1"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
      transfer_map:
        10: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
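
In the multi-GPU rules above, layer placement is expressed twice: the per-layer name regexes pin each group of layers to a device, and the model-level transfer_map (here 10: "cuda:1"; the larger DeepSeek-V3 file further below uses 15/30/45 with comments like "Layers 15+ on GPU 1") tells the wrapped model from which layer index onward hidden states live on the next device. A hypothetical sketch of that lookup, not taken from KTransformers, with the 0: "cuda:0" entry added only for illustration:

transfer_map = {0: "cuda:0", 10: "cuda:1"}  # hypothetical: device keyed by starting layer index

def device_for_layer(layer_idx: int) -> str:
    # Pick the mapping whose starting layer is the largest one not above layer_idx.
    start = max(s for s in transfer_map if s <= layer_idx)
    return transfer_map[start]

assert device_for_layer(9) == "cuda:0"
assert device_for_layer(10) == "cuda:1"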

@@ -0,0 +1,69 @@ (new file)
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft.yaml (new file, 68 lines)
@@ -0,0 +1,68 @@
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cpu"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat.yaml (new file, 68 lines)
@@ -0,0 +1,68 @@
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

examples/kt_optimize_rules/DeepSeek-V3-Chat-amx.yaml (new file, 77 lines)
@@ -0,0 +1,77 @@
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # change this to True to enable long context (prefill may be slower)
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

@@ -0,0 +1,392 @@ (new file)
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

# === Rotary Embedding Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\."
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# === MLP (MoE) Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Gate Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"

# === MLP Experts Replacement ===
# Replace with Marlin experts. Uncomment and adjust the layer numbers as needed.
# Each layer of Marlin experts takes about 6 GB of GPU memory.
# !!! Remember to disable CUDA graph if you are using Marlin experts. !!!
# !!! KExpertsTorch is untested; we don't have enough VRAM. !!!

# GPU 0: layers 3–4
# - match:
#     name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:0"
#       generate_op: "KExpertsMarlin"
#   recursive: False

# # GPU 1: layers 15–17
# - match:
#     name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:1"
#       generate_op: "KExpertsMarlin"
#   recursive: False

# # GPU 2: layers 30–32
# - match:
#     name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:2"
#       generate_op: "KExpertsMarlin"
#   recursive: False

# # GPU 3: layers 45–46
# - match:
#     name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$"
#   replace:
#     class: ktransformers.operators.experts.KTransformersExperts
#     kwargs:
#       generate_device: "cuda:3"
#       generate_op: "KExpertsMarlin"
#   recursive: False


# === MLP Experts Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:0"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:1"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:2"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:2"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts
    kwargs:
      prefill_device: "cuda:3"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda:3"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False

# === Self-Attention Replacement ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      absorb_for_prefill: False

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      absorb_for_prefill: False

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      absorb_for_prefill: False

# GPU 3: layers 45–60
- match:
    name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      absorb_for_prefill: False

# === Overall Model Replacement with Transfer Map ===

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
      transfer_map:
        15: "cuda:1" # Layers 15+ on GPU 1
        30: "cuda:2" # Layers 30+ on GPU 2
        45: "cuda:3" # Layers 45+ on GPU 3

# === Default Catch-All for Other Modules ===

# GPU 0: layers 0–14
- match:
    name: "^model\\.layers\\.([0-9]|1[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

# GPU 1: layers 15–29
- match:
    name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

# GPU 2: layers 30–44
- match:
    name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"

- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

# For final modules (model.norm), ensure they are on GPU 3 (as in your original config)
- match:
    name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
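
The per-GPU layer ranges in the file above (layers 0–14 on cuda:0, 15–29 on cuda:1, 30–44 on cuda:2, 45–60 on cuda:3) are encoded entirely in the name regexes. A quick standalone check of the first boundary, as a hypothetical snippet unrelated to the commit itself:

import re

pat = re.compile(r"^model\.layers\.([0-9]|1[0-4])\.")
assert pat.match("model.layers.14.self_attn")           # layer 14 falls under the cuda:0 rules
assert pat.match("model.layers.15.self_attn") is None   # layer 15 falls through to the cuda:1 rules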
@ -0,0 +1,156 @@
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model.embed_tokens"
 | 
			
		||||
  replace:
 | 
			
		||||
    class: "default"
 | 
			
		||||
    kwargs:
 | 
			
		||||
        generate_device: "cpu"
 | 
			
		||||
        prefill_device: "cpu"
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
 | 
			
		||||
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:0"
 | 
			
		||||
      prefill_device: "cuda:0"
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.([3456][0-9])\\."
 | 
			
		||||
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:1"
 | 
			
		||||
      prefill_device: "cuda:1"
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
 | 
			
		||||
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:0"
 | 
			
		||||
      prefill_device: "cuda:0"
 | 
			
		||||
      generate_op: "KLinearTorch"
 | 
			
		||||
      prefill_op: "KLinearTorch"
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression 
 | 
			
		||||
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:1"
 | 
			
		||||
      prefill_device: "cuda:1"
 | 
			
		||||
      generate_op: "KLinearTorch"
 | 
			
		||||
      prefill_op: "KLinearTorch"
 | 
			
		||||
  
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
 | 
			
		||||
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:0"
 | 
			
		||||
      prefill_device: "cuda:0"
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
 | 
			
		||||
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:1"
 | 
			
		||||
      prefill_device: "cuda:1"
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
 | 
			
		||||
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.gate.KMoEGate
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:0"
 | 
			
		||||
      prefill_device: "cuda:0"
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
 | 
			
		||||
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.gate.KMoEGate     # mlp module with custom forward function
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:1"
 | 
			
		||||
      prefill_device: "cuda:1"
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
 | 
			
		||||
    kwargs:
 | 
			
		||||
      prefill_device: "cuda:0"
 | 
			
		||||
      prefill_op: "KExpertsTorch"
 | 
			
		||||
      generate_device: "cpu"
 | 
			
		||||
      generate_op:  "KSFTExpertsCPU"
 | 
			
		||||
      out_device: "cuda:0"
 | 
			
		||||
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
 | 
			
		||||
  recursive: False # don't recursively inject submodules of this module
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert paralleism
 | 
			
		||||
    kwargs:
 | 
			
		||||
      prefill_device: "cuda:1"
 | 
			
		||||
      prefill_op: "KExpertsTorch"
 | 
			
		||||
      generate_device: "cpu"
 | 
			
		||||
      generate_op:  "KSFTExpertsCPU"
 | 
			
		||||
      out_device: "cuda:1"
 | 
			
		||||
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
 | 
			
		||||
  recursive: False # don't recursively inject submodules of this module
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:0"
 | 
			
		||||
      prefill_device: "cuda:0"
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:1"
 | 
			
		||||
      prefill_device: "cuda:1"
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model$"
 | 
			
		||||
  replace:
 | 
			
		||||
    class: "ktransformers.operators.models.KDeepseekV2Model"
 | 
			
		||||
    kwargs:
 | 
			
		||||
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
 | 
			
		||||
      transfer_map: 
 | 
			
		||||
        30: "cuda:1"
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
 | 
			
		||||
  replace:
 | 
			
		||||
    class: "default"
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:0"
 | 
			
		||||
      prefill_device: "cuda:0"
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "^lm_head"
 | 
			
		||||
    class: torch.nn.Linear
 | 
			
		||||
  replace:
 | 
			
		||||
    class: ktransformers.operators.linear.KTransformersLinear
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:1"
 | 
			
		||||
      prefill_device: "cuda:1"
 | 
			
		||||
      generate_op: "KLinearTorch"
 | 
			
		||||
      prefill_op: "KLinearTorch"
 | 
			
		||||
 | 
			
		||||
- match:
 | 
			
		||||
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
 | 
			
		||||
  replace:
 | 
			
		||||
    class: "default"
 | 
			
		||||
    kwargs:
 | 
			
		||||
      generate_device: "cuda:1"
 | 
			
		||||
      prefill_device: "cuda:1"
 | 
			
		||||
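(For orientation: in this multi-GPU rule file, layers 0-29 are pinned to cuda:0 and the remaining layers, together with lm_head and model.norm, to cuda:1; the MoE experts stay on CPU for generation, and transfer_map tells the wrapped model to hand activations over to cuda:1 starting at layer 30.)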
							
								
								
									
examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx.yaml (new file, 77 lines)
@@ -0,0 +1,77 @@
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"

- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearTorch"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KSFTExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False # set True to enable long-context prefill (prefill may be slower)
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
							
								
								
									
examples/train_lora/deepseek2_lora_sft_kt.yaml (new file, 52 lines)
@@ -0,0 +1,52 @@
### model
model_name_or_path: deepseek-ai/DeepSeek-V2-Lite
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset: identity
template: deepseek
cutoff_len: 2048
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_deepseekV2
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true  # use KTransformers as the LoRA SFT backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
cpu_infer: 32
chunk_size: 8192

### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
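(Illustrative launch, not part of this diff: a config like the one above is run with `llamafactory-cli train examples/train_lora/deepseek2_lora_sft_kt.yaml`, optionally with `USE_KT=1` in the environment so the CLI skips the torchrun launcher on multi-GPU hosts and lets KTransformers manage device placement itself; see the `use_kt()` check added to the launcher further below.)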
							
								
								
									
examples/train_lora/deepseek3_lora_sft_kt.yaml (new file, 52 lines)
@@ -0,0 +1,52 @@
### model
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 8
lora_target: all

### dataset
dataset: identity
template: deepseek
cutoff_len: 2048
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/Kllama_deepseekV3
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null

### ktransformers
use_kt: true  # use KTransformers as the LoRA SFT backend
kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
cpu_infer: 32
chunk_size: 8192

### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
@@ -71,6 +71,16 @@ class ChatModel:
                    "SGLang not install, you may need to run `pip install sglang[all]`\n"
                    "or try to use HuggingFace backend: --infer_backend huggingface"
                ) from e
        elif model_args.infer_backend == EngineName.KT:
            try:
                from .kt_engine import KTransformersEngine

                self.engine: BaseEngine = KTransformersEngine(model_args, data_args, finetuning_args, generating_args)
            except ImportError as e:
                raise ImportError(
                    "KTransformers is not installed, you may need to run `pip install ktransformers`\n"
                    "or try to use HuggingFace backend: --infer_backend huggingface"
                ) from e
        else:
            raise NotImplementedError(f"Unknown backend: {model_args.infer_backend}")

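A minimal usage sketch (not part of this diff; the dict-based `ChatModel` constructor and import path are assumptions), wiring the new backend to the example configs above:

from llamafactory.chat import ChatModel  # assumed import path for the class patched above

chat_model = ChatModel({
    "model_name_or_path": "deepseek-ai/DeepSeek-V2-Lite",
    "adapter_name_or_path": "saves/Kllama_deepseekV2",
    "template": "deepseek",
    "infer_backend": "ktransformers",  # maps to EngineName.KT -> KTransformersEngine
    "trust_remote_code": True,
    "kt_optimize_rule": "examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml",
    "cpu_infer": 32,
    "chunk_size": 8192,
})
print(chat_model.chat([{"role": "user", "content": "hi"}])[0].response_text)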
							
								
								
									
src/llamafactory/chat/kt_engine.py (new file, 270 lines)
@@ -0,0 +1,270 @@
# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import os
import platform
from collections.abc import AsyncGenerator
from threading import Thread
from typing import TYPE_CHECKING, Any, Optional

import torch
from typing_extensions import override

from ..data import get_template_and_fix_tokenizer
from ..extras import logging
from ..extras.constants import EngineName
from ..model import load_model, load_tokenizer
from .base_engine import BaseEngine, Response


if TYPE_CHECKING:
    from transformers import PreTrainedTokenizer
    from trl import PreTrainedModelWrapper

    from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
    from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments

from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
from ktransformers.server.config.config import Config
from ktransformers.util.utils import (
    get_compute_capability,
    prefill_and_generate_capture,
)
from ktransformers.util.vendors import GPUVendor, device_manager


logger = logging.get_logger(__name__)


class KTransformersEngine(BaseEngine):
    def __init__(
        self,
        model_args: "ModelArguments",
        data_args: "DataArguments",
        finetuning_args: "FinetuningArguments",
        generating_args: "GeneratingArguments",
    ) -> None:
        self.name = EngineName.KT
        self.can_generate = finetuning_args.stage == "sft"

        tok_mod = load_tokenizer(model_args)
        self.tokenizer = tok_mod["tokenizer"]
        self.tokenizer.padding_side = "left" if self.can_generate else "right"
        self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)

        self.model = load_model(
            self.tokenizer, model_args, finetuning_args,
            is_trainable=False, add_valuehead=(not self.can_generate)
        )

        self.generating_args = generating_args.to_dict()
        self.max_new_tokens = model_args.kt_maxlen
        self.use_cuda_graph = model_args.kt_use_cuda_graph
        self.mode = model_args.kt_mode
        self.force_think = model_args.kt_force_think
        self.chunk_size = model_args.chunk_size

        try:
            asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        self.semaphore = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT", "1")))

    @staticmethod
    @torch.inference_mode()
    def _get_scores(
        model: "PreTrainedModelWrapper",
        tokenizer: "PreTrainedTokenizer",
        batch_input: list[str],
        input_kwargs: Optional[dict[str, Any]] = {},
    ) -> list[float]:
        max_length: Optional[int] = input_kwargs.pop("max_length", None)
        device = getattr(model.pretrained_model, "device", "cuda")
        inputs = tokenizer(
            batch_input,
            padding=True,
            truncation=True,
            max_length=max_length or getattr(model.config, "max_position_embeddings", 1024),
            return_tensors="pt",
            add_special_tokens=False,
        ).to(device)
        values: torch.Tensor = model(**inputs, return_dict=True, use_cache=False)[-1]
        scores = values.gather(dim=-1, index=(inputs["attention_mask"].sum(dim=-1, keepdim=True) - 1))
        return scores

    async def _generate(
        self,
        messages: list[dict[str, str]],
        system: Optional[str] = None,
        tools: Optional[str] = None,
        **input_kwargs,
    ) -> AsyncGenerator[str, None]:
        paired = messages + [{"role": "assistant", "content": ""}]
        prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired, system, tools)
        prompt_len = len(prompt_ids)

        max_length: Optional[int] = input_kwargs.pop("max_length", None)
        max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)

        if "max_new_tokens" in self.generating_args:
            max_tokens = int(self.generating_args["max_new_tokens"])
        elif "max_length" in self.generating_args:
            gl = int(self.generating_args["max_length"])
            max_tokens = gl - prompt_len if gl > prompt_len else 1
        else:
            max_tokens = self.max_new_tokens or 256

        if max_length is not None:
            max_tokens = max(max_length - prompt_len, 1)
        if max_new_tokens is not None:
            max_tokens = int(max_new_tokens)
        max_tokens = max(1, int(max_tokens))

        if self.mode == "long_context":
            max_len_cfg = Config().long_context_config["max_seq_len"]
            need = prompt_len + max_tokens
            assert max_len_cfg > need, f"please set max_seq_len > {need} in ~/.ktransformers/config.yaml"

        device = next(self.model.parameters()).device
        input_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device)
        if self.force_think:
            think = torch.tensor(
                [self.tokenizer.encode("<think>\n", add_special_tokens=False)],
                dtype=torch.long, device=device
            )
            input_tensor = torch.cat([input_tensor, think], dim=1)

        use_flashinfer = (
            platform.system() != "Windows"
            and getattr(self.model.config, "architectures", [""])[0] in {"DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"}
            and flashinfer_enabled
            and get_compute_capability() >= 8
            and device_manager.gpu_vendor == GPUVendor.NVIDIA
        )

        def make_gen():
            if use_flashinfer:
                return prefill_and_generate_capture(
                    self.model, self.tokenizer, input_tensor, max_tokens, self.use_cuda_graph,
                    mode=self.mode, force_think=self.force_think, chunk_size=self.chunk_size,
                    use_flashinfer_mla=True,
                    num_heads=self.model.config.num_attention_heads,
                    head_dim_ckv=getattr(self.model.config, "kv_lora_rank", 0),
                    head_dim_kpe=getattr(self.model.config, "qk_rope_head_dim", 0),
                    q_head_dim=getattr(self.model.config, "qk_rope_head_dim", 0) + getattr(self.model.config, "qk_nope_head_dim", 0),
                    echo_stream=False,
                )
            else:
                return prefill_and_generate_capture(
                    self.model, self.tokenizer, input_tensor, max_tokens, self.use_cuda_graph,
                    mode=self.mode, force_think=self.force_think, chunk_size=self.chunk_size,
                    echo_stream=False,
                )

        loop = asyncio.get_running_loop()
        q: asyncio.Queue[Optional[str]] = asyncio.Queue()

        def producer():
            try:
                gen = make_gen()
                if hasattr(gen, "__aiter__"):
                    async def drain_async():
                        async for t in gen:
                            loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t))
                    asyncio.run(drain_async())
                elif hasattr(gen, "__iter__"):
                    for t in gen:
                        loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t))
                else:
                    loop.call_soon_threadsafe(q.put_nowait, gen if isinstance(gen, str) else str(gen))
            finally:
                loop.call_soon_threadsafe(q.put_nowait, None)

        Thread(target=producer, daemon=True).start()

        while True:
            item = await q.get()
            if item is None:
                break
            yield item

    @override
    async def chat(
        self,
        messages: list[dict[str, str]],
        system: Optional[str] = None,
        tools: Optional[str] = None,
        images: Optional[list["ImageInput"]] = None,
        videos: Optional[list["VideoInput"]] = None,
        audios: Optional[list["AudioInput"]] = None,
        **input_kwargs,
    ) -> list["Response"]:
        if not self.can_generate:
            raise ValueError("The current model does not support `chat`.")
        async with self.semaphore:
            final_text = ""
            async for delta in self._generate(messages, system, tools, **input_kwargs):
                final_text += delta

            prompt_ids, _ = self.template.encode_oneturn(
                self.tokenizer, messages + [{"role": "assistant", "content": ""}], system, tools
            )
            return [
                Response(
                    response_text=final_text,
                    response_length=len(self.tokenizer.encode(final_text, add_special_tokens=False)),
                    prompt_length=len(prompt_ids),
                    finish_reason="stop",
                )
            ]

    @override
    async def stream_chat(
        self,
        messages: list[dict[str, str]],
        system: Optional[str] = None,
        tools: Optional[str] = None,
        images: Optional[list["ImageInput"]] = None,
        videos: Optional[list["VideoInput"]] = None,
        audios: Optional[list["AudioInput"]] = None,
        **input_kwargs,
    ) -> AsyncGenerator[str, None]:
        if not self.can_generate:
            raise ValueError("The current model does not support `stream_chat`.")
        async with self.semaphore:
            produced = ""
            async for t in self._generate(messages, system, tools, **input_kwargs):
                delta = t[len(produced):] if t.startswith(produced) else t
                produced = t
                if delta:
                    yield delta

    @override
    async def get_scores(
        self,
        batch_input: list[str],
        **input_kwargs,
    ) -> list[float]:
        if self.can_generate:
            raise ValueError("Cannot get scores using an auto-regressive model.")
        args = (self.model, self.tokenizer, batch_input, input_kwargs)
        async with self.semaphore:
            return await asyncio.to_thread(self._get_scores, *args)
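A hedged sketch (not part of this diff) of driving the streaming interface defined above, assuming an engine has already been constructed from the standard hparams dataclasses:

import asyncio

async def demo(engine: "KTransformersEngine") -> None:
    # stream_chat yields incremental text deltas produced by the KT generator thread
    async for delta in engine.stream_chat([{"role": "user", "content": "Who are you?"}]):
        print(delta, end="", flush=True)

# asyncio.run(demo(engine))  # with engine = KTransformersEngine(model_args, data_args, finetuning_args, generating_args)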
@@ -120,6 +120,7 @@ class EngineName(str, Enum):
    HF = "huggingface"
    VLLM = "vllm"
    SGLANG = "sglang"
    KT = "ktransformers"


class DownloadSource(str, Enum):

@@ -312,6 +312,8 @@ def use_openmind() -> bool:
def use_ray() -> bool:
    return is_env_enabled("USE_RAY")

def use_kt() -> bool:
    return is_env_enabled("USE_KT")

def find_available_port() -> int:
    r"""Find an available port on the local machine."""

@@ -82,6 +82,10 @@ def is_ray_available():
    return _is_package_available("ray")


def is_kt_available():
    return _is_package_available("ktransformers")


def is_requests_available():
    return _is_package_available("requests")

@@ -439,7 +439,6 @@ class SwanLabArguments:
        metadata={"help": "The Lark(飞书) secret for SwanLab."},
    )


@dataclass
class FinetuningArguments(
    SwanLabArguments,
@@ -1,4 +1,4 @@
# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
# Copyright 2025 HuggingFace Inc., the KVCache.AI team, Approaching AI, and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
@@ -475,9 +475,51 @@ class SGLangArguments:
            self.sglang_config = _convert_str_dict(json.loads(self.sglang_config))


@dataclass
class KTransformersArguments:
    r"""Arguments pertaining to the KTransformers (KT) backend."""

    use_kt: bool = field(
        default=False,
        metadata={"help": "Whether to use KTransformers optimizations for LoRA training."},
    )
    kt_optimize_rule: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the KTransformers optimize rule file; see https://github.com/kvcache-ai/ktransformers/."},
    )
    cpu_infer: Optional[int] = field(
        default=32,
        metadata={"help": "Number of CPU cores used for computation."},
    )
    chunk_size: Optional[int] = field(
        default=8192,
        metadata={"help": "Chunk size used for CPU compute in KTransformers."},
    )
    mode: Optional[str] = field(
        default="normal",
        metadata={"help": "Either `normal` or `long_context` (Llama models only)."},
    )

    kt_maxlen: int = field(
        default=4096,
        metadata={"help": "Maximum sequence (prompt + response) length of the KT engine."},
    )
    kt_use_cuda_graph: bool = field(
        default=True,
        metadata={"help": "Whether to use CUDA graphs for the KT engine."},
    )
    kt_mode: str = field(
        default="normal",
        metadata={"help": "`normal` or `long_context` mode for the KT engine."},
    )
    kt_force_think: bool = field(
        default=False,
        metadata={"help": "Whether to force the response to start with `<think>` in the KT engine."},
    )

@dataclass
class ModelArguments(
    SGLangArguments, VllmArguments, ExportArguments, ProcessorArguments, QuantizationArguments, BaseModelArguments
    SGLangArguments, VllmArguments, KTransformersArguments, ExportArguments, ProcessorArguments, QuantizationArguments, BaseModelArguments
):
    r"""Arguments pertaining to which model/config/tokenizer we are going to fine-tune or infer.

@@ -156,6 +156,9 @@ def _check_extra_dependencies(
    finetuning_args: "FinetuningArguments",
    training_args: Optional["TrainingArguments"] = None,
) -> None:
    if model_args.use_kt:
        check_version("ktransformers", mandatory=True)

    if model_args.use_unsloth:
        check_version("unsloth", mandatory=True)

@@ -282,13 +285,16 @@ def get_train_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _
        if model_args.shift_attn:
            raise ValueError("PPO training is incompatible with S^2-Attn.")

        if finetuning_args.reward_model_type == "lora" and model_args.use_kt:
            raise ValueError("KTransformers does not support LoRA reward models.")

        if finetuning_args.reward_model_type == "lora" and model_args.use_unsloth:
            raise ValueError("Unsloth does not support lora reward model.")

        if training_args.report_to and training_args.report_to[0] not in ["wandb", "tensorboard"]:
            raise ValueError("PPO only accepts wandb or tensorboard logger.")

    if training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED:
    if not model_args.use_kt and training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED:
        raise ValueError("Please launch distributed training with `llamafactory-cli` or `torchrun`.")

    if training_args.deepspeed and training_args.parallel_mode != ParallelMode.DISTRIBUTED:
@@ -350,6 +356,9 @@ def get_train_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _
    if model_args.use_unsloth and is_deepspeed_zero3_enabled():
        raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.")

    if model_args.use_kt and is_deepspeed_zero3_enabled():
        raise ValueError("KTransformers is incompatible with DeepSpeed ZeRO-3.")

    if data_args.neat_packing and is_transformers_version_greater_than("4.53.0"):
        raise ValueError("Neat packing is incompatible with transformers>=4.53.0.")

@@ -90,7 +90,6 @@ class RayArguments:
            elif self.ray_storage_filesystem == "gs" or self.ray_storage_filesystem == "gcs":
                self.ray_storage_filesystem = fs.GcsFileSystem()


@dataclass
class TrainingArguments(RayArguments, BaseTrainingArguments):
    r"""Arguments pertaining to the trainer."""
@@ -38,7 +38,7 @@ USAGE = (
def launch():
    from .extras import logging
    from .extras.env import VERSION, print_env
    from .extras.misc import find_available_port, get_device_count, is_env_enabled, use_ray
    from .extras.misc import find_available_port, get_device_count, is_env_enabled, use_kt, use_ray

    logger = logging.get_logger(__name__)
    WELCOME = (
@@ -57,7 +57,7 @@ def launch():
    if is_env_enabled("USE_MCA"):  # force use torchrun
        os.environ["FORCE_TORCHRUN"] = "1"

    if command == "train" and (is_env_enabled("FORCE_TORCHRUN") or (get_device_count() > 1 and not use_ray())):
    if command == "train" and (is_env_enabled("FORCE_TORCHRUN") or (get_device_count() > 1 and not use_ray() and not use_kt())):
        # launch distributed training
        nnodes = os.getenv("NNODES", "1")
        node_rank = os.getenv("NODE_RANK", "0")

@@ -20,6 +20,8 @@ from peft import LoraConfig, LoraModel, OFTConfig, PeftModel, TaskType, get_peft
from transformers.integrations import is_deepspeed_zero3_enabled

from ..extras import logging
from ..extras.constants import EngineName
from .model_utils.ktransformers import get_kt_peft_model, load_kt_peft_model
from .model_utils.misc import find_all_linear_modules, find_expanded_modules
from .model_utils.quantization import QuantizationMethod
from .model_utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model
@@ -164,6 +166,10 @@ def _setup_lora_tuning(
            assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3."
            is_mergeable = False

        if model_args.use_kt:
            assert len(model_args.adapter_name_or_path) == 1, "KTransformers currently accepts only a single adapter."
            is_mergeable = False

        if model_args.use_unsloth:
            assert len(model_args.adapter_name_or_path) == 1, "Unsloth model only accepts a single adapter."
            is_mergeable = False
@@ -182,6 +188,10 @@ def _setup_lora_tuning(
            "token": model_args.hf_hub_token,
        }

        if model_args.use_kt:
            if model_args.infer_backend != EngineName.KT:
                raise ValueError("Adapters fine-tuned with KTransformers must be served with the KTransformers backend (`infer_backend: ktransformers`).")

        for adapter in adapter_to_merge:
            model: LoraModel = PeftModel.from_pretrained(model, adapter, **init_kwargs)
            model = model.merge_and_unload()
@@ -190,7 +200,9 @@ def _setup_lora_tuning(
            logger.info_rank0(f"Merged {len(adapter_to_merge)} adapter(s).")

        if adapter_to_resume is not None:  # resume lora training
            if model_args.use_unsloth:
            if model_args.use_kt:
                model = load_kt_peft_model(model_args, model)
            elif model_args.use_unsloth:
                model = load_unsloth_peft_model(config, model_args, finetuning_args, is_trainable=is_trainable)
            else:
                model = PeftModel.from_pretrained(model, adapter_to_resume, is_trainable=is_trainable, **init_kwargs)
@@ -203,6 +215,16 @@ def _setup_lora_tuning(
        else:
            target_modules = finetuning_args.lora_target

        if model_args.use_kt:
            new_list = []
            for m in target_modules:
                if m in ('down_proj', 'up_proj', 'gate_proj'):
                    new_list.extend([f"mlp.{m}", f"shared_experts.{m}"])
                elif m not in ('generate_linear', 'orig_module', 'prefill_linear'):
                    new_list.append(m)

            target_modules[:] = new_list

        if finetuning_args.use_llama_pro:
            target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers)

@@ -245,7 +267,21 @@ def _setup_lora_tuning(
                "modules_to_save": finetuning_args.additional_target,
            }

        if model_args.use_unsloth:
        if model_args.use_kt:
            if finetuning_args.finetuning_type == "oft":
                raise ValueError("KTransformers is currently not supported for OFT.")
            if finetuning_args.finetuning_type == "lora":
                peft_config = LoraConfig(
                    task_type=TaskType.CAUSAL_LM,
                    inference_mode=False,
                    **peft_kwargs,
                )
            else:
                raise ValueError("KTransformers currently supports LoRA only.")

            model = get_kt_peft_model(model, peft_config)
            print(f"KT_model:{model}")
        elif model_args.use_unsloth:
            if finetuning_args.finetuning_type == "oft":
                raise ValueError("Unsloth is currently not supported for OFT.")

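To illustrate the use_kt target-module remapping a few hunks above (hypothetical input, not part of the diff):

target_modules = ["q_proj", "down_proj", "generate_linear"]
# MoE MLP projections are split into their dense-MLP and shared-experts variants,
# and KTransformers wrapper internals (generate_linear, orig_module, prefill_linear) are dropped:
# result -> ["q_proj", "mlp.down_proj", "shared_experts.down_proj"]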
@@ -31,6 +31,7 @@ from trl import AutoModelForCausalLMWithValueHead
from ..extras import logging
from ..extras.misc import count_parameters, skip_check_imports, try_download_model_from_other_hub
from .adapter import init_adapter
from .model_utils.ktransformers import load_kt_pretrained_model
from .model_utils.liger_kernel import apply_liger_kernel
from .model_utils.misc import register_autoclass
from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
@@ -143,7 +144,11 @@ def load_model(

    model = None
    lazy_load = False
    if model_args.use_unsloth:
    if model_args.use_kt:
        from ktransformers.sft.monkey_patch_torch_module import install_patch
        install_patch()
        model = load_kt_pretrained_model(config, model_args)
    elif model_args.use_unsloth:
        if model_args.adapter_name_or_path is not None:
            lazy_load = True
        elif is_trainable:

							
								
								
									
src/llamafactory/model/model_utils/ktransformers.py (new file, 159 lines)
@@ -0,0 +1,159 @@
# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib.util as _u
from typing import TYPE_CHECKING, Any, Optional

import torch

from ...extras import logging
from ...extras.misc import get_current_device


if TYPE_CHECKING:
    from ...hparams import FinetuningArguments, ModelArguments

from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel


KT_AVAILABLE = _u.find_spec("ktransformers") is not None
if KT_AVAILABLE:
    from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
    from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
    from ktransformers.models.modeling_llama import LlamaForCausalLM
    from ktransformers.models.modeling_mixtral import MixtralForCausalLM
    from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
    from ktransformers.optimize.optimize import optimize_and_load_gguf
    from ktransformers.server.config.config import Config
    from ktransformers.sft.lora import inject_lora_layer
    from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
    from ktransformers.util.globals import GLOBAL_CONFIG
    from ktransformers.util.utils import load_weights

logger = logging.get_logger(__name__)

def _get_kt_kwargs(
    config: "PretrainedConfig",
    model_name_or_path: str,
    model_args: "ModelArguments",
    finetuning_args: "FinetuningArguments",
) -> dict[str, Any]:
    return {
        "model_name": model_name_or_path,
        "max_seq_length": model_args.model_max_length or 4096,
        "dtype": model_args.compute_dtype,
        "load_in_4bit": model_args.quantization_bit == 4,
        "token": model_args.hf_hub_token,
        "full_finetuning": finetuning_args.finetuning_type == "full",
        "device_map": {"": get_current_device()},
        "rope_scaling": getattr(config, "rope_scaling", None),
        "fix_tokenizer": False,
        "trust_remote_code": model_args.trust_remote_code,
        "use_gradient_checkpointing": "ktransformers",
    }


def load_kt_pretrained_model(
    config: "PretrainedConfig", model_args: "ModelArguments"
) -> Optional["PreTrainedModel"]:
    r"""Optionally load pretrained model with KTransformers. Used in training."""
    custom_models = {
        "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
        "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
        "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
        "LlamaForCausalLM": LlamaForCausalLM,
        "MixtralForCausalLM": MixtralForCausalLM,
    }
    Config().cpu_infer = model_args.cpu_infer
    Config().chunk_size = model_args.chunk_size
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code)

    if model_args.mode == 'long_context':
        assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM supports long_context mode"
        torch.set_default_dtype(torch.float16)
    else:
        torch.set_default_dtype(config.torch_dtype)

    with torch.device("meta"):
        if config.architectures[0] in custom_models:
            print("using custom modeling_xxx.py.")
            if (
                "Qwen2Moe" in config.architectures[0]
            ):  # Qwen2Moe must use flash_attention_2 to avoid overflow.
                config._attn_implementation = "flash_attention_2"
            if "Llama" in config.architectures[0]:
                config._attn_implementation = "eager"
            if "Mixtral" in config.architectures[0]:
                config._attn_implementation = "flash_attention_2"
            model = custom_models[config.architectures[0]](config)
        else:
            attn_implementation = "flash_attention_2"
            model = AutoModelForCausalLM.from_config(
                config, trust_remote_code=True, attn_implementation=attn_implementation
            )

    optimize_config_path = model_args.kt_optimize_rule
    gguf_path = model_args.model_name_or_path

    assert optimize_config_path is not None, "optimize_config_path must be provided (path to YAML rules file)."
    assert gguf_path is not None, "gguf_path must be provided (path to a folder or .gguf file)."

    GLOBAL_CONFIG._config["mod"] = "infer"
    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)

    return model


def get_kt_peft_model(
    model: "PreTrainedModel", peft_kwargs: dict[str, Any]
) -> "PreTrainedModel":
    r"""Get the peft model for the pretrained model with KTransformers. Used in training."""
    from ktransformers.sft.peft_utils.mapping import get_peft_model

    return get_peft_model(model, peft_kwargs)


def load_kt_peft_model(
    model_args: "ModelArguments", model: "PreTrainedModel",
) -> "PreTrainedModel":
    r"""Load peft model with KTransformers. Used in both training and inference."""
    load_adapter_name_or_path = model_args.adapter_name_or_path[0]
    if load_adapter_name_or_path.endswith('.gguf'):
        inject_lora_layer(model, load_adapter_name_or_path)
        adapter_gguf_loader = GGUFLoader(load_adapter_name_or_path)
        load_weights(model, adapter_gguf_loader, adapter_gguf=True)
        model.train()
    else:
        inject_lora_layer(model, load_adapter_name_or_path)

        adapter_loader = SafeTensorLoader(load_adapter_name_or_path)
        device = next(model.parameters()).device
        for key in adapter_loader.tensor_file_map.keys():
            try:
                tensor = adapter_loader.load_tensor(key, device=device)

                model_key = key.replace("base_model.model.", "")
                model_key = model_key.replace(".weight", ".default.weight")
                model_key = model_key.replace(".default.default.weight", ".default.weight")

                param = model.get_parameter(model_key)
                param.data.copy_(tensor.data)

                print(f"Loaded adapter weight: {key} -> {model_key}")
            except AttributeError:
                print(f"Skipping {key}: not a model parameter")
            except KeyError:
                print(f"Key not found in model: {model_key} (original: {key})")

    return model
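For reference, the key rewriting in load_kt_peft_model above maps PEFT safetensors names onto the injected LoRA modules; a hypothetical example:

# "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
#   -> "model.layers.0.self_attn.q_proj.lora_A.default.weight"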
							
								
								
									
src/llamafactory/train/ksft/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .workflow import run_sft


__all__ = ["run_sft"]
							
								
								
									
src/llamafactory/train/ksft/workflow.py (new file, 110 lines)
@@ -0,0 +1,110 @@
 | 
			
		||||
# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
 | 
			
		||||
#
 | 
			
		||||
# Licensed under the Apache License, Version 2.0 (the "License");
 | 
			
		||||
# you may not use this file except in compliance with the License.
 | 
			
		||||
# You may obtain a copy of the License at
 | 
			
		||||
#
 | 
			
		||||
#     http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
#
 | 
			
		||||
# Unless required by applicable law or agreed to in writing, software
 | 
			
		||||
# distributed under the License is distributed on an "AS IS" BASIS,
 | 
			
		||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
			
		||||
# See the License for the specific language governing permissions and
 | 
			
		||||
# limitations under the License.
 | 
			
		||||
 | 
			
		||||
from typing import TYPE_CHECKING, Optional
 | 
			
		||||
 | 
			
		||||
from ...data import SFTDataCollatorWith4DAttentionMask, get_dataset, get_template_and_fix_tokenizer
 | 
			
		||||
from ...extras.constants import IGNORE_INDEX
 | 
			
		||||
from ...extras.logging import get_logger
 | 
			
		||||
from ...extras.misc import calculate_tps
 | 
			
		||||
from ...extras.ploting import plot_loss
 | 
			
		||||
from ...model import load_model, load_tokenizer
 | 
			
		||||
from ..trainer_utils import create_modelcard_and_push
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if TYPE_CHECKING:
 | 
			
		||||
    from transformers import Seq2SeqTrainingArguments, TrainerCallback
 | 
			
		||||
 | 
			
		||||
    from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
logger = get_logger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_sft(
 | 
			
		||||
    model_args: "ModelArguments",
 | 
			
		||||
    data_args: "DataArguments",
 | 
			
		||||
    training_args: "Seq2SeqTrainingArguments",
 | 
			
		||||
    finetuning_args: "FinetuningArguments",
 | 
			
		||||
    generating_args: "GeneratingArguments",
 | 
			
		||||
    callbacks: Optional[list["TrainerCallback"]] = None,
 | 
			
		||||
):
 | 
			
		||||
    tokenizer_module = load_tokenizer(model_args)
 | 
			
		||||
    tokenizer = tokenizer_module["tokenizer"]
 | 
			
		||||
    template = get_template_and_fix_tokenizer(tokenizer, data_args)
 | 
			
		||||
    dataset_module = get_dataset(template, model_args, data_args, training_args, stage="sft", **tokenizer_module)
 | 
			
		||||
    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 | 
			
		||||
 | 
			
		||||
    from ktransformers.util.globals import GLOBAL_CONFIG
 | 
			
		||||
    GLOBAL_CONFIG._config["mod"] = "sft"
 | 
			
		||||
 | 
			
		||||
    if getattr(model, "is_quantized", False) and not training_args.do_train:
        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction

    data_collator = SFTDataCollatorWith4DAttentionMask(
        template=template,
        model=model if not training_args.predict_with_generate else None,
        pad_to_multiple_of=8 if training_args.do_train else None,  # for shift short attention
        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
        block_diag_attn=model_args.block_diag_attn,
        attn_implementation=getattr(model.config, "_attn_implementation", None),
        compute_dtype=model_args.compute_dtype,
        **tokenizer_module,
    )

    # Metric utils
    metric_module = {}
    if training_args.predict_with_generate:
        raise NotImplementedError(
            "`predict_with_generate` is not supported in KTransformers SFT yet. If you need it, please open an issue."
        )
    elif finetuning_args.compute_accuracy:
        raise NotImplementedError(
            "`compute_accuracy` is not supported in KTransformers SFT yet. If you need it, please open an issue."
        )

    # Initialize our Trainer
    from ktransformers.sft.lora import KTrainer
    trainer = KTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer_module,
        data_collator=data_collator,
        callbacks=callbacks,
        **dataset_module,
        **metric_module,
    )

    # Training
    if training_args.do_train:
        model.config.use_cache = False
        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
        trainer.save_model()
        if finetuning_args.include_effective_tokens_per_second:
            train_result.metrics["effective_tokens_per_sec"] = calculate_tps(
                dataset_module["train_dataset"], train_result.metrics, stage="sft"
            )

        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()
        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
            keys = ["loss"]
            if isinstance(dataset_module.get("eval_dataset"), dict):
                keys += sum(
                    [[f"eval_{key}_loss", f"eval_{key}_accuracy"] for key in dataset_module["eval_dataset"].keys()], []
                )
            else:
                keys += ["eval_loss", "eval_accuracy"]

            plot_loss(training_args.output_dir, keys=keys)

    # Create model card
    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
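Taken together with the dispatch change further below, this workflow is reached through LLaMA-Factory's regular training entry point once `use_kt` is enabled. A minimal sketch of a programmatic launch follows; `run_exp` is assumed to be the function behind `llamafactory-cli train`, and every model path, dataset, and output directory in the sketch is a placeholder rather than a value taken from this commit.

from llamafactory.train.tuner import run_exp  # assumed entry point behind `llamafactory-cli train`

# Hedged sketch: all values below are placeholders.
run_exp(dict(
    model_name_or_path="path/or/hub-id/of-a-supported-model",
    stage="sft",                   # with use_kt=True this dispatches to train/ksft/workflow.py
    do_train=True,
    finetuning_type="lora",
    dataset="your_dataset",
    template="deepseek",
    output_dir="saves/kt_sft_run",
    use_kt=True,                   # select the KTransformers SFT backend
))

Backend-specific options (the optimize-rule file, CPU thread count, chunk size) go into the same argument dict; their exact names follow the example configs shipped with this change.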
@ -100,6 +100,9 @@ def create_modelcard_and_push(
    if model_args.use_unsloth:
        kwargs["tags"] = kwargs["tags"] + ["unsloth"]

    if model_args.use_kt:
        kwargs["tags"] = kwargs["tags"] + ["ktransformers"]

    if not training_args.do_train:
        pass
    elif training_args.push_to_hub:
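This hunk only extends the tag list; the tags are consumed further down in create_modelcard_and_push, outside the visible context. A hedged sketch of what typically follows is shown below: `push_to_hub` and `create_model_card` are standard `transformers.Trainer` methods that accept a `tags` keyword, but the exact calls and the `license` value are assumptions, not lines from this commit.

    elif training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)  # the "ktransformers" tag ends up in the Hub model card
    else:
        trainer.create_model_card(license="other", **kwargs)  # license value is an assumption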
@ -1,4 +1,4 @@
# Copyright 2025 the LlamaFactory team.
# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -24,7 +24,7 @@ from ..data import get_template_and_fix_tokenizer
from ..extras import logging
from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
from ..extras.misc import infer_optim_dtype
from ..extras.packages import is_mcore_adapter_available, is_ray_available
from ..extras.packages import is_kt_available, is_mcore_adapter_available, is_ray_available
from ..hparams import get_infer_args, get_ray_args, get_train_args, read_args
from ..model import load_model, load_tokenizer
from .callbacks import LogCallback, PissaConvertCallback, ReporterCallback
@ -85,6 +85,12 @@ def _training_function(config: dict[str, Any]) -> None:
    elif finetuning_args.stage == "pt":
        run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
    elif finetuning_args.stage == "sft":
        if model_args.use_kt:
            if not is_kt_available():
                raise ImportError("KTransformers is not installed. Please install it with `pip install ktransformers`.")
            from .ksft.workflow import run_sft as run_sft_kt
            run_sft_kt(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
        else:
            run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
    elif finetuning_args.stage == "rm":
        run_rm(model_args, data_args, training_args, finetuning_args, callbacks)
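The dispatch above guards on `is_kt_available` from `..extras.packages` (added in the import hunk earlier); its body is not part of this diff. A plausible stand-in that only checks whether the `ktransformers` package can be imported is sketched below; the repository's actual helper may be implemented differently, for example through a shared package-availability utility.

import importlib.util

def is_kt_available() -> bool:
    # True when the `ktransformers` package is importable in the current environment.
    return importlib.util.find_spec("ktransformers") is not None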