mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2026-05-05 07:38:55 +08:00
[refactor] Add KTransformers AMX MoE SFT support via Accelerate (#10430)
Co-authored-by: mrhaoxx <mr.haoxx@gmail.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
25
examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml
Normal file
25
examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
compute_environment: LOCAL_MACHINE
|
||||||
|
distributed_type: FSDP
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_offload_params: false
|
||||||
|
fsdp_reshard_after_forward: true
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_version: 2
|
||||||
|
mixed_precision: bf16
|
||||||
|
num_machines: 1
|
||||||
|
num_processes: 4 # Adjust based on your GPU count; 4 is suitable for 4 GPUs
|
||||||
|
rdzv_backend: static
|
||||||
|
same_network: true
|
||||||
|
use_cpu: false
|
||||||
|
|
||||||
|
kt_config:
|
||||||
|
enabled: true
|
||||||
|
kt_backend: AMXBF16 # Use with original BF16 expert weights.
|
||||||
|
kt_num_threads: 96
|
||||||
|
kt_tp_enabled: true
|
||||||
|
kt_threadpool_count: 2
|
||||||
|
kt_max_cache_depth: 2
|
||||||
|
kt_share_backward_bb: true
|
||||||
|
lora_rank: 8
|
||||||
25
examples/ktransformers/accelerate/fsdp2_kt_int4.yaml
Normal file
25
examples/ktransformers/accelerate/fsdp2_kt_int4.yaml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
compute_environment: LOCAL_MACHINE
|
||||||
|
distributed_type: FSDP
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_offload_params: false
|
||||||
|
fsdp_reshard_after_forward: true
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_version: 2
|
||||||
|
mixed_precision: bf16
|
||||||
|
num_machines: 1
|
||||||
|
num_processes: 4 # Adjust based on your GPU count; 4 is suitable for 4 GPUs
|
||||||
|
rdzv_backend: static
|
||||||
|
same_network: true
|
||||||
|
use_cpu: false
|
||||||
|
|
||||||
|
kt_config:
|
||||||
|
enabled: true
|
||||||
|
kt_backend: AMXINT4 # Use with online-converted INT4 expert weights
|
||||||
|
kt_num_threads: 96
|
||||||
|
kt_tp_enabled: true
|
||||||
|
kt_threadpool_count: 2
|
||||||
|
kt_max_cache_depth: 2
|
||||||
|
kt_share_backward_bb: true
|
||||||
|
lora_rank: 8
|
||||||
25
examples/ktransformers/accelerate/fsdp2_kt_int8.yaml
Normal file
25
examples/ktransformers/accelerate/fsdp2_kt_int8.yaml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
compute_environment: LOCAL_MACHINE
|
||||||
|
distributed_type: FSDP
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_offload_params: false
|
||||||
|
fsdp_reshard_after_forward: true
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_version: 2
|
||||||
|
mixed_precision: bf16
|
||||||
|
num_machines: 1
|
||||||
|
num_processes: 4 # Adjust based on your GPU count; 4 is suitable for 4 GPUs
|
||||||
|
rdzv_backend: static
|
||||||
|
same_network: true
|
||||||
|
use_cpu: false
|
||||||
|
|
||||||
|
kt_config:
|
||||||
|
enabled: true
|
||||||
|
kt_backend: AMXINT8 # Use with online-converted INT8 expert weights
|
||||||
|
kt_num_threads: 96
|
||||||
|
kt_tp_enabled: true
|
||||||
|
kt_threadpool_count: 2
|
||||||
|
kt_max_cache_depth: 2
|
||||||
|
kt_share_backward_bb: true
|
||||||
|
lora_rank: 8
|
||||||
25
examples/ktransformers/accelerate/fsdp2_kt_int8_1gpu.yaml
Normal file
25
examples/ktransformers/accelerate/fsdp2_kt_int8_1gpu.yaml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
compute_environment: LOCAL_MACHINE
|
||||||
|
distributed_type: FSDP
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_offload_params: false
|
||||||
|
fsdp_reshard_after_forward: true
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_version: 2
|
||||||
|
mixed_precision: bf16
|
||||||
|
num_machines: 1
|
||||||
|
num_processes: 1 # Adjust based on your GPU count; 1 is suitable for 1 GPU
|
||||||
|
rdzv_backend: static
|
||||||
|
same_network: true
|
||||||
|
use_cpu: false
|
||||||
|
|
||||||
|
kt_config:
|
||||||
|
enabled: true
|
||||||
|
kt_backend: AMXINT8 # Use with online-converted INT8 expert weights
|
||||||
|
kt_num_threads: 96
|
||||||
|
kt_tp_enabled: true
|
||||||
|
kt_threadpool_count: 2
|
||||||
|
kt_max_cache_depth: 2
|
||||||
|
kt_share_backward_bb: true
|
||||||
|
lora_rank: 8
|
||||||
25
examples/ktransformers/accelerate/fsdp2_kt_int8_8gpu.yaml
Normal file
25
examples/ktransformers/accelerate/fsdp2_kt_int8_8gpu.yaml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
compute_environment: LOCAL_MACHINE
|
||||||
|
distributed_type: FSDP
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_offload_params: false
|
||||||
|
fsdp_reshard_after_forward: true
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_version: 2
|
||||||
|
mixed_precision: bf16
|
||||||
|
num_machines: 1
|
||||||
|
num_processes: 8 # Adjust based on your GPU count; 8 is suitable for 8 GPUs
|
||||||
|
rdzv_backend: static
|
||||||
|
same_network: true
|
||||||
|
use_cpu: false
|
||||||
|
|
||||||
|
kt_config:
|
||||||
|
enabled: true
|
||||||
|
kt_backend: AMXINT8 # Use with online-converted INT8 expert weights
|
||||||
|
kt_num_threads: 96
|
||||||
|
kt_tp_enabled: true
|
||||||
|
kt_threadpool_count: 2
|
||||||
|
kt_max_cache_depth: 2
|
||||||
|
kt_share_backward_bb: true
|
||||||
|
lora_rank: 8
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
model_name_or_path: deepseek-ai/DeepSeek-V2-Lite
|
|
||||||
adapter_name_or_path: saves/Kllama_deepseekV2
|
|
||||||
template: deepseek
|
|
||||||
infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
|
|
||||||
trust_remote_code: true
|
|
||||||
|
|
||||||
use_kt: true # use KTransformers as LoRA sft backend to inference
|
|
||||||
kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
|
|
||||||
cpu_infer: 32
|
|
||||||
chunk_size: 8192
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
|
|
||||||
template: deepseek3
|
|
||||||
infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
|
|
||||||
trust_remote_code: true
|
|
||||||
|
|
||||||
use_kt: true # use KTransformers as LoRA sft backend to inference
|
|
||||||
kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
|
|
||||||
cpu_infer: 32
|
|
||||||
chunk_size: 8192
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
|
|
||||||
adapter_name_or_path: saves/Kllama_deepseekV3
|
|
||||||
template: deepseek3
|
|
||||||
infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
|
|
||||||
trust_remote_code: true
|
|
||||||
|
|
||||||
use_kt: true # use KTransformers as LoRA sft backend to inference
|
|
||||||
kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
|
|
||||||
cpu_infer: 32
|
|
||||||
chunk_size: 8192
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
model_name_or_path: Qwen/Qwen3-235B-A22B-Instruct-2507
|
|
||||||
adapter_name_or_path: saves/Kllama_Qwen3MoE_235bA22b
|
|
||||||
template: qwen3_nothink
|
|
||||||
infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
|
|
||||||
trust_remote_code: true
|
|
||||||
|
|
||||||
use_kt: true # use KTransformers as LoRA sft backend to inference
|
|
||||||
kt_optimize_rule: examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
|
|
||||||
cpu_infer: 32
|
|
||||||
chunk_size: 8192
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
|
||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearMarlin"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearMarlin"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KExpertsCPU"
|
|
||||||
out_device: "cuda"
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
|
||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
@@ -1,139 +0,0 @@
|
|||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9])\\."
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([12][0-9])\\."
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9])\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([12][0-9])\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([12][0-9])\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda:0"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda:1"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
|
||||||
transfer_map:
|
|
||||||
10: "cuda:1"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9])\\."
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cpu"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
|
||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cpu"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda"
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
|
||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearMarlin"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearMarlin"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV2MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KExpertsCPU"
|
|
||||||
out_device: "cuda"
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
|
||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
@@ -1,77 +0,0 @@
|
|||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearMarlin"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearMarlin"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.gate.KMoEGate
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KExpertsCPU"
|
|
||||||
out_device: "cuda"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
absorb_for_prefill: False # change this to True to enable long context(prefill may slower).
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
|
|
||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
@@ -1,392 +0,0 @@
|
|||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
|
|
||||||
# === Rotary Embedding Replacement ===
|
|
||||||
|
|
||||||
# GPU 0: layers 0–14
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([0-9]|1[0-4])\\."
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
|
|
||||||
# GPU 1: layers 15–29
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
|
|
||||||
# GPU 2: layers 30–44
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:2"
|
|
||||||
prefill_device: "cuda:2"
|
|
||||||
|
|
||||||
# GPU 3: layers 45–60
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\."
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:3"
|
|
||||||
prefill_device: "cuda:3"
|
|
||||||
|
|
||||||
# === Linear Layers Replacement (excluding self_attn.kv_b_proj) ===
|
|
||||||
|
|
||||||
# GPU 0: layers 0–14
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([0-9]|1[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
# GPU 1: layers 15–29
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.(?!self_attn\\.kv_b_proj).*$"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
# GPU 2: layers 30–44
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.(?!self_attn\\.kv_b_proj).*$"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:2"
|
|
||||||
prefill_device: "cuda:2"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
# GPU 3: layers 45–60
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.(?!self_attn\\.kv_b_proj).*$"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:3"
|
|
||||||
prefill_device: "cuda:3"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
# === MLP (MoE) Replacement ===
|
|
||||||
|
|
||||||
# GPU 0: layers 0–14
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV3MoE
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
|
|
||||||
# GPU 1: layers 15–29
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV3MoE
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
|
|
||||||
# GPU 2: layers 30–44
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV3MoE
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:2"
|
|
||||||
prefill_device: "cuda:2"
|
|
||||||
|
|
||||||
# GPU 3: layers 45–60
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV3MoE
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:3"
|
|
||||||
prefill_device: "cuda:3"
|
|
||||||
|
|
||||||
# === MLP Gate Replacement ===
|
|
||||||
|
|
||||||
# GPU 0: layers 0–14
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.gate$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.gate.KMoEGate
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
|
|
||||||
# GPU 1: layers 15–29
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.gate$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.gate.KMoEGate
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
|
|
||||||
# GPU 2: layers 30–44
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.gate$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.gate.KMoEGate
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:2"
|
|
||||||
prefill_device: "cuda:2"
|
|
||||||
|
|
||||||
# GPU 3: layers 45–60
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.gate$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.gate.KMoEGate
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:3"
|
|
||||||
prefill_device: "cuda:3"
|
|
||||||
|
|
||||||
# === MLP Experts Replacement ===
|
|
||||||
# replace with marlin expert. Open and modify layer-num as needed.
|
|
||||||
# Each layer of marlin experts takes about 6GB of GPU memory.
|
|
||||||
# !!!Do remember 'close' cuda graph if you are using marlin expert.!!!
|
|
||||||
# !!!KExpertsTorch is untested, we don't have enough VRAM.!!!
|
|
||||||
|
|
||||||
# GPU 0: layers 3–4
|
|
||||||
# - match:
|
|
||||||
# name: "^model\\.layers\\.([3-4])\\.mlp\\.experts$"
|
|
||||||
# replace:
|
|
||||||
# class: ktransformers.operators.experts.KTransformersExperts
|
|
||||||
# kwargs:
|
|
||||||
# generate_device: "cuda:0"
|
|
||||||
# generate_op: "KExpertsMarlin"
|
|
||||||
# recursive: False
|
|
||||||
|
|
||||||
# # GPU 1: layers 15–17
|
|
||||||
# - match:
|
|
||||||
# name: "^model\\.layers\\.(1[5-7])\\.mlp\\.experts$"
|
|
||||||
# replace:
|
|
||||||
# class: ktransformers.operators.experts.KTransformersExperts
|
|
||||||
# kwargs:
|
|
||||||
# generate_device: "cuda:1"
|
|
||||||
# generate_op: "KExpertsMarlin"
|
|
||||||
# recursive: False
|
|
||||||
|
|
||||||
# # GPU 2: layers 30–32
|
|
||||||
# - match:
|
|
||||||
# name: "^model\\.layers\\.(3[0-2])\\.mlp\\.experts$"
|
|
||||||
# replace:
|
|
||||||
# class: ktransformers.operators.experts.KTransformersExperts
|
|
||||||
# kwargs:
|
|
||||||
# generate_device: "cuda:2"
|
|
||||||
# generate_op: "KExpertsMarlin"
|
|
||||||
# recursive: False
|
|
||||||
|
|
||||||
# # GPU 3: layers 45–46
|
|
||||||
# - match:
|
|
||||||
# name: "^model\\.layers\\.(4[5-6])\\.mlp\\.experts$"
|
|
||||||
# replace:
|
|
||||||
# class: ktransformers.operators.experts.KTransformersExperts
|
|
||||||
# kwargs:
|
|
||||||
# generate_device: "cuda:3"
|
|
||||||
# generate_op: "KExpertsMarlin"
|
|
||||||
# recursive: False
|
|
||||||
|
|
||||||
|
|
||||||
# === MLP Experts Replacement ===
|
|
||||||
|
|
||||||
# GPU 0: layers 0–14
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([0-9]|1[0-4])\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda:0"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False
|
|
||||||
|
|
||||||
# GPU 1: layers 15–29
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda:1"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False
|
|
||||||
|
|
||||||
# GPU 2: layers 30–44
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda:2"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda:2"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False
|
|
||||||
|
|
||||||
# GPU 3: layers 45–60
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda:3"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda:3"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False
|
|
||||||
|
|
||||||
# === Self-Attention Replacement ===
|
|
||||||
|
|
||||||
# GPU 0: layers 0–14
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([0-9]|1[0-4])\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
absorb_for_prefill: False
|
|
||||||
|
|
||||||
# GPU 1: layers 15–29
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
absorb_for_prefill: False
|
|
||||||
|
|
||||||
# GPU 2: layers 30–44
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:2"
|
|
||||||
prefill_device: "cuda:2"
|
|
||||||
absorb_for_prefill: False
|
|
||||||
|
|
||||||
# GPU 3: layers 45–60
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:3"
|
|
||||||
prefill_device: "cuda:3"
|
|
||||||
absorb_for_prefill: False
|
|
||||||
|
|
||||||
# === Overall Model Replacement with Transfer Map ===
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
|
|
||||||
transfer_map:
|
|
||||||
15: "cuda:1" # Layers 15+ on GPU 1
|
|
||||||
30: "cuda:2" # Layers 30+ on GPU 2
|
|
||||||
45: "cuda:3" # Layers 45+ on GPU 3
|
|
||||||
|
|
||||||
# === Default Catch-All for Other Modules ===
|
|
||||||
|
|
||||||
# GPU 0: layers 0–14
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([0-9]|1[0-4])\\."
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
|
|
||||||
# GPU 1: layers 15–29
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(1[5-9]|2[0-9])\\."
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
|
|
||||||
# GPU 2: layers 30–44
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(3[0-9]|4[0-4])\\."
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:2"
|
|
||||||
prefill_device: "cuda:2"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:3"
|
|
||||||
prefill_device: "cuda:3"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
# For final modules (model.norm), ensure they are on GPU 3 (as in your original config)
|
|
||||||
- match:
|
|
||||||
name: "(^model\\.layers\\.(4[5-9]|5[0-9]|60)\\.)|(^model\\.norm)"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:3"
|
|
||||||
prefill_device: "cuda:3"
|
|
||||||
@@ -1,156 +0,0 @@
|
|||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([3456][0-9])\\."
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.gate.KMoEGate
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.gate.KMoEGate # gate module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda:0"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda:1"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
|
|
||||||
transfer_map:
|
|
||||||
30: "cuda:1"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head"
|
|
||||||
class: torch.nn.Linear
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:1"
|
|
||||||
prefill_device: "cuda:1"
|
|
||||||
@@ -1,77 +0,0 @@
|
|||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp$"
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KDeepseekV3MoE # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_deepseek_v3.MoEGate
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.gate.KMoEGate
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda:0"
|
|
||||||
prefill_device: "cuda:0"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "llamafile" (default)
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
absorb_for_prefill: False # change this to True to enable long context (prefill may be slower).
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KDeepseekV2Model"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
|
|
||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
- match:
|
|
||||||
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.RoPE.RotaryEmbedding
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^lm_head$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
|
|
||||||
# - match:
|
|
||||||
# name: "^model\\.layers\\..*$" # regular expression
|
|
||||||
# class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
# replace:
|
|
||||||
# class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
# kwargs:
|
|
||||||
# generate_device: "cuda"
|
|
||||||
# prefill_device: "cuda"
|
|
||||||
# generate_op: "KLinearTorch"
|
|
||||||
# prefill_op: "KLinearTorch"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$" # regular expression
|
|
||||||
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
generate_op: "KLinearTorch"
|
|
||||||
prefill_op: "KLinearTorch"
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlock # mlp module with custom forward function
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert parallelism
|
|
||||||
kwargs:
|
|
||||||
prefill_device: "cuda"
|
|
||||||
prefill_op: "KExpertsTorch"
|
|
||||||
generate_device: "cpu"
|
|
||||||
generate_op: "KSFTExpertsCPU"
|
|
||||||
out_device: "cuda"
|
|
||||||
backend: "AMXInt8" # or "AMXBF16" or "AMXInt8"
|
|
||||||
recursive: False # don't recursively inject submodules of this module
|
|
||||||
- match:
|
|
||||||
name: "^model\\.layers\\..*\\.self_attn$"
|
|
||||||
replace:
|
|
||||||
class: ktransformers.operators.attention.KQwen3MoeAttention # optimized MLA implementation
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cuda"
|
|
||||||
prefill_device: "cuda"
|
|
||||||
- match:
|
|
||||||
name: "^model.embed_tokens"
|
|
||||||
replace:
|
|
||||||
class: "default"
|
|
||||||
kwargs:
|
|
||||||
generate_device: "cpu"
|
|
||||||
prefill_device: "cpu"
|
|
||||||
|
|
||||||
- match:
|
|
||||||
name: "^model$"
|
|
||||||
replace:
|
|
||||||
class: "ktransformers.operators.models.KQwen3MoeModel"
|
|
||||||
kwargs:
|
|
||||||
per_layer_prefill_intput_threshold: 0
|
|
||||||
@@ -19,7 +19,7 @@ preprocessing_num_workers: 16
|
|||||||
dataloader_num_workers: 4
|
dataloader_num_workers: 4
|
||||||
|
|
||||||
### output
|
### output
|
||||||
output_dir: saves/Kllama_deepseekV2
|
output_dir: saves/KT_FT_deepseekV2
|
||||||
logging_steps: 10
|
logging_steps: 10
|
||||||
save_steps: 500
|
save_steps: 500
|
||||||
plot_loss: true
|
plot_loss: true
|
||||||
@@ -39,14 +39,7 @@ ddp_timeout: 180000000
|
|||||||
resume_from_checkpoint: null
|
resume_from_checkpoint: null
|
||||||
|
|
||||||
### ktransformers
|
### ktransformers
|
||||||
use_kt: true # use KTransformers as LoRA sft backend
|
use_kt: true
|
||||||
kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
|
# Pair with fsdp2_kt_bf16.yaml for original BF16 checkpoints.
|
||||||
cpu_infer: 32
|
# For pre-converted expert weights, uncomment kt_weight_path and use fsdp2_kt_int8.yaml or fsdp2_kt_int4.yaml.
|
||||||
chunk_size: 8192
|
# kt_weight_path: /path/to/DeepSeek-V2-Lite-AMXINT8
|
||||||
|
|
||||||
### eval
|
|
||||||
# eval_dataset: alpaca_en_demo
|
|
||||||
# val_size: 0.1
|
|
||||||
# per_device_eval_batch_size: 1
|
|
||||||
# eval_strategy: steps
|
|
||||||
# eval_steps: 500
|
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
### model
|
### model
|
||||||
model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
|
model_name_or_path: deepseek-ai/DeepSeek-V3-0324-BF16 # need to convert to BF16 checkpoint first
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
### method
|
### method
|
||||||
@@ -19,7 +19,7 @@ preprocessing_num_workers: 16
|
|||||||
dataloader_num_workers: 4
|
dataloader_num_workers: 4
|
||||||
|
|
||||||
### output
|
### output
|
||||||
output_dir: saves/Kllama_deepseekV3
|
output_dir: saves/KT_FT_deepseekV3
|
||||||
logging_steps: 10
|
logging_steps: 10
|
||||||
save_steps: 500
|
save_steps: 500
|
||||||
plot_loss: true
|
plot_loss: true
|
||||||
@@ -39,14 +39,7 @@ ddp_timeout: 180000000
|
|||||||
resume_from_checkpoint: null
|
resume_from_checkpoint: null
|
||||||
|
|
||||||
### ktransformers
|
### ktransformers
|
||||||
use_kt: true # use KTransformers as LoRA sft backend
|
use_kt: true
|
||||||
kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
|
# Pair with fsdp2_kt_bf16.yaml for original BF16 checkpoints.
|
||||||
cpu_infer: 32
|
# For pre-converted expert weights, uncomment kt_weight_path and use fsdp2_kt_int8.yaml or fsdp2_kt_int4.yaml.
|
||||||
chunk_size: 8192
|
# kt_weight_path: /path/to/DeepSeek-V3-AMXINT8
|
||||||
|
|
||||||
### eval
|
|
||||||
# eval_dataset: alpaca_en_demo
|
|
||||||
# val_size: 0.1
|
|
||||||
# per_device_eval_batch_size: 1
|
|
||||||
# eval_strategy: steps
|
|
||||||
# eval_steps: 500
|
|
||||||
@@ -0,0 +1,46 @@
|
|||||||
|
### model
|
||||||
|
model_name_or_path: Qwen/Qwen3.5-397B-A17B
|
||||||
|
trust_remote_code: true
|
||||||
|
|
||||||
|
### method
|
||||||
|
stage: sft
|
||||||
|
do_train: true
|
||||||
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
|
lora_target: all
|
||||||
|
|
||||||
|
### dataset
|
||||||
|
dataset: identity, alpaca_en_demo
|
||||||
|
template: qwen3_5
|
||||||
|
cutoff_len: 2048
|
||||||
|
max_samples: 100000
|
||||||
|
overwrite_cache: true
|
||||||
|
preprocessing_num_workers: 16
|
||||||
|
dataloader_num_workers: 4
|
||||||
|
|
||||||
|
### output
|
||||||
|
output_dir: saves/KT_FT_qwen35Moe
|
||||||
|
logging_steps: 10
|
||||||
|
save_steps: 500
|
||||||
|
plot_loss: true
|
||||||
|
overwrite_output_dir: true
|
||||||
|
save_only_model: false
|
||||||
|
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||||
|
|
||||||
|
### train
|
||||||
|
per_device_train_batch_size: 1
|
||||||
|
gradient_accumulation_steps: 8
|
||||||
|
learning_rate: 1.0e-4
|
||||||
|
num_train_epochs: 3.0
|
||||||
|
lr_scheduler_type: cosine
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
bf16: true
|
||||||
|
ddp_timeout: 180000000
|
||||||
|
resume_from_checkpoint: null
|
||||||
|
|
||||||
|
### ktransformers
|
||||||
|
use_kt: true
|
||||||
|
# For original BF16 checkpoints, start with examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml.
|
||||||
|
# For pre-converted expert weights, uncomment kt_weight_path and use fsdp2_kt_int8.yaml or fsdp2_kt_int4.yaml.
|
||||||
|
# Pair the 397B path with fsdp2_kt_int8.yaml, tune cutoff_len to prepared weights and GPU memory.
|
||||||
|
# kt_weight_path: /path/to/Qwen3.5-MoE-AMXINT8
|
||||||
@@ -11,7 +11,7 @@ lora_target: all
|
|||||||
|
|
||||||
### dataset
|
### dataset
|
||||||
dataset: identity, alpaca_en_demo
|
dataset: identity, alpaca_en_demo
|
||||||
template: qwen3_nothink
|
template: qwen3
|
||||||
cutoff_len: 2048
|
cutoff_len: 2048
|
||||||
max_samples: 100000
|
max_samples: 100000
|
||||||
overwrite_cache: true
|
overwrite_cache: true
|
||||||
@@ -19,9 +19,9 @@ preprocessing_num_workers: 16
|
|||||||
dataloader_num_workers: 4
|
dataloader_num_workers: 4
|
||||||
|
|
||||||
### output
|
### output
|
||||||
output_dir: saves/Kllama_Qwen3MoE_235bA22b
|
output_dir: saves/KT_FT_qwen3Moe
|
||||||
logging_steps: 10
|
logging_steps: 10
|
||||||
save_steps: 200
|
save_steps: 500
|
||||||
plot_loss: true
|
plot_loss: true
|
||||||
overwrite_output_dir: true
|
overwrite_output_dir: true
|
||||||
save_only_model: false
|
save_only_model: false
|
||||||
@@ -31,7 +31,7 @@ report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
|||||||
per_device_train_batch_size: 1
|
per_device_train_batch_size: 1
|
||||||
gradient_accumulation_steps: 8
|
gradient_accumulation_steps: 8
|
||||||
learning_rate: 1.0e-4
|
learning_rate: 1.0e-4
|
||||||
num_train_epochs: 3
|
num_train_epochs: 3.0
|
||||||
lr_scheduler_type: cosine
|
lr_scheduler_type: cosine
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
bf16: true
|
bf16: true
|
||||||
@@ -39,14 +39,7 @@ ddp_timeout: 180000000
|
|||||||
resume_from_checkpoint: null
|
resume_from_checkpoint: null
|
||||||
|
|
||||||
### ktransformers
|
### ktransformers
|
||||||
use_kt: true # use KTransformers as LoRA sft backend
|
use_kt: true
|
||||||
kt_optimize_rule: examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
|
# Pair with examples/ktransformers/accelerate/fsdp2_kt_bf16.yaml for original BF16 checkpoints.
|
||||||
cpu_infer: 32
|
# For pre-converted expert weights, uncomment kt_weight_path and use fsdp2_kt_int8.yaml or fsdp2_kt_int4.yaml.
|
||||||
chunk_size: 8192
|
# kt_weight_path: /path/to/Qwen3-235B-A22B-Instruct-2507-AMXINT8
|
||||||
|
|
||||||
### eval
|
|
||||||
# eval_dataset: alpaca_en_demo
|
|
||||||
# val_size: 0.1
|
|
||||||
# per_device_eval_batch_size: 1
|
|
||||||
# eval_strategy: steps
|
|
||||||
# eval_steps: 500
|
|
||||||
|
|||||||
1
requirements/ktransformers.txt
Normal file
1
requirements/ktransformers.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
ktransformers[sft]
|
||||||
@@ -71,16 +71,6 @@ class ChatModel:
|
|||||||
"SGLang not install, you may need to run `pip install sglang[all]`\n"
|
"SGLang not install, you may need to run `pip install sglang[all]`\n"
|
||||||
"or try to use HuggingFace backend: --infer_backend huggingface"
|
"or try to use HuggingFace backend: --infer_backend huggingface"
|
||||||
) from e
|
) from e
|
||||||
elif model_args.infer_backend == EngineName.KT:
|
|
||||||
try:
|
|
||||||
from .kt_engine import KTransformersEngine
|
|
||||||
|
|
||||||
self.engine: BaseEngine = KTransformersEngine(model_args, data_args, finetuning_args, generating_args)
|
|
||||||
except ImportError as e:
|
|
||||||
raise ImportError(
|
|
||||||
"KTransformers not install, you may need to run `pip install ktransformers`\n"
|
|
||||||
"or try to use HuggingFace backend: --infer_backend huggingface"
|
|
||||||
) from e
|
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(f"Unknown backend: {model_args.infer_backend}")
|
raise NotImplementedError(f"Unknown backend: {model_args.infer_backend}")
|
||||||
|
|
||||||
|
|||||||
@@ -1,284 +0,0 @@
|
|||||||
# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import platform
|
|
||||||
from collections.abc import AsyncGenerator
|
|
||||||
from threading import Thread
|
|
||||||
from typing import TYPE_CHECKING, Any, Optional
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from typing_extensions import override
|
|
||||||
|
|
||||||
from ..data import get_template_and_fix_tokenizer
|
|
||||||
from ..extras import logging
|
|
||||||
from ..extras.constants import EngineName
|
|
||||||
from ..model import load_model, load_tokenizer
|
|
||||||
from .base_engine import BaseEngine, Response
|
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from transformers import PreTrainedTokenizer
|
|
||||||
from trl import PreTrainedModelWrapper
|
|
||||||
|
|
||||||
from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
|
|
||||||
from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
|
|
||||||
|
|
||||||
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
|
|
||||||
from ktransformers.server.config.config import Config
|
|
||||||
from ktransformers.util.utils import (
|
|
||||||
get_compute_capability,
|
|
||||||
prefill_and_generate_capture,
|
|
||||||
)
|
|
||||||
from ktransformers.util.vendors import GPUVendor, device_manager
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class KTransformersEngine(BaseEngine):
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
model_args: "ModelArguments",
|
|
||||||
data_args: "DataArguments",
|
|
||||||
finetuning_args: "FinetuningArguments",
|
|
||||||
generating_args: "GeneratingArguments",
|
|
||||||
) -> None:
|
|
||||||
self.name = EngineName.KT
|
|
||||||
self.can_generate = finetuning_args.stage == "sft"
|
|
||||||
|
|
||||||
tok_mod = load_tokenizer(model_args)
|
|
||||||
self.tokenizer = tok_mod["tokenizer"]
|
|
||||||
self.tokenizer.padding_side = "left" if self.can_generate else "right"
|
|
||||||
self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)
|
|
||||||
|
|
||||||
self.model = load_model(
|
|
||||||
self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
|
|
||||||
)
|
|
||||||
|
|
||||||
self.generating_args = generating_args.to_dict()
|
|
||||||
self.max_new_tokens = model_args.kt_maxlen
|
|
||||||
self.use_cuda_graph = model_args.kt_use_cuda_graph
|
|
||||||
self.mode = model_args.kt_mode
|
|
||||||
self.force_think = model_args.kt_force_think
|
|
||||||
self.chunk_size = model_args.chunk_size
|
|
||||||
|
|
||||||
try:
|
|
||||||
asyncio.get_event_loop()
|
|
||||||
except RuntimeError:
|
|
||||||
loop = asyncio.new_event_loop()
|
|
||||||
asyncio.set_event_loop(loop)
|
|
||||||
|
|
||||||
self.semaphore = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT", "1")))
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
@torch.inference_mode()
|
|
||||||
def _get_scores(
|
|
||||||
model: "PreTrainedModelWrapper",
|
|
||||||
tokenizer: "PreTrainedTokenizer",
|
|
||||||
batch_input: list[str],
|
|
||||||
input_kwargs: Optional[dict[str, Any]] = {},
|
|
||||||
) -> list[float]:
|
|
||||||
max_length: Optional[int] = input_kwargs.pop("max_length", None)
|
|
||||||
device = getattr(model.pretrained_model, "device", "cuda")
|
|
||||||
inputs = tokenizer(
|
|
||||||
batch_input,
|
|
||||||
padding=True,
|
|
||||||
truncation=True,
|
|
||||||
max_length=max_length or getattr(model.config, "max_position_embeddings", 1024),
|
|
||||||
return_tensors="pt",
|
|
||||||
add_special_tokens=False,
|
|
||||||
).to(device)
|
|
||||||
values: torch.Tensor = model(**inputs, return_dict=True, use_cache=False)[-1]
|
|
||||||
scores = values.gather(dim=-1, index=(inputs["attention_mask"].sum(dim=-1, keepdim=True) - 1))
|
|
||||||
return scores
|
|
||||||
|
|
||||||
async def _generate(
|
|
||||||
self,
|
|
||||||
messages: list[dict[str, str]],
|
|
||||||
system: Optional[str] = None,
|
|
||||||
tools: Optional[str] = None,
|
|
||||||
**input_kwargs,
|
|
||||||
) -> AsyncGenerator[str, None]:
|
|
||||||
paired = messages + [{"role": "assistant", "content": ""}]
|
|
||||||
prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired, system, tools)
|
|
||||||
prompt_len = len(prompt_ids)
|
|
||||||
|
|
||||||
max_length: Optional[int] = input_kwargs.pop("max_length", None)
|
|
||||||
max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
|
|
||||||
|
|
||||||
if "max_new_tokens" in self.generating_args:
|
|
||||||
max_tokens = int(self.generating_args["max_new_tokens"])
|
|
||||||
elif "max_length" in self.generating_args:
|
|
||||||
gl = int(self.generating_args["max_length"])
|
|
||||||
max_tokens = gl - prompt_len if gl > prompt_len else 1
|
|
||||||
else:
|
|
||||||
max_tokens = self.max_new_tokens or 256
|
|
||||||
|
|
||||||
if max_length is not None:
|
|
||||||
max_tokens = max(max_length - prompt_len, 1)
|
|
||||||
if max_new_tokens is not None:
|
|
||||||
max_tokens = int(max_new_tokens)
|
|
||||||
max_tokens = max(1, int(max_tokens))
|
|
||||||
|
|
||||||
if self.mode == "long_context":
|
|
||||||
max_len_cfg = Config().long_context_config["max_seq_len"]
|
|
||||||
need = prompt_len + max_tokens
|
|
||||||
assert max_len_cfg > need, f"please set max_seq_len > {need} in ~/.ktransformers/config.yaml"
|
|
||||||
|
|
||||||
device = next(self.model.parameters()).device
|
|
||||||
input_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device)
|
|
||||||
if self.force_think:
|
|
||||||
think = torch.tensor(
|
|
||||||
[self.tokenizer.encode("<think>\n", add_special_tokens=False)], dtype=torch.long, device=device
|
|
||||||
)
|
|
||||||
input_tensor = torch.cat([input_tensor, think], dim=1)
|
|
||||||
|
|
||||||
use_flashinfer = (
|
|
||||||
platform.system() != "Windows"
|
|
||||||
and getattr(self.model.config, "architectures", [""])[0]
|
|
||||||
in {"DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"}
|
|
||||||
and flashinfer_enabled
|
|
||||||
and get_compute_capability() >= 8
|
|
||||||
and device_manager.gpu_vendor == GPUVendor.NVIDIA
|
|
||||||
)
|
|
||||||
|
|
||||||
def make_gen():
|
|
||||||
if use_flashinfer:
|
|
||||||
return prefill_and_generate_capture(
|
|
||||||
self.model,
|
|
||||||
self.tokenizer,
|
|
||||||
input_tensor,
|
|
||||||
max_tokens,
|
|
||||||
self.use_cuda_graph,
|
|
||||||
mode=self.mode,
|
|
||||||
force_think=self.force_think,
|
|
||||||
chunk_size=self.chunk_size,
|
|
||||||
use_flashinfer_mla=True,
|
|
||||||
num_heads=self.model.config.num_attention_heads,
|
|
||||||
head_dim_ckv=getattr(self.model.config, "kv_lora_rank", 0),
|
|
||||||
head_dim_kpe=getattr(self.model.config, "qk_rope_head_dim", 0),
|
|
||||||
q_head_dim=getattr(self.model.config, "qk_rope_head_dim", 0)
|
|
||||||
+ getattr(self.model.config, "qk_nope_head_dim", 0),
|
|
||||||
echo_stream=False,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
return prefill_and_generate_capture(
|
|
||||||
self.model,
|
|
||||||
self.tokenizer,
|
|
||||||
input_tensor,
|
|
||||||
max_tokens,
|
|
||||||
self.use_cuda_graph,
|
|
||||||
mode=self.mode,
|
|
||||||
force_think=self.force_think,
|
|
||||||
chunk_size=self.chunk_size,
|
|
||||||
echo_stream=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
loop = asyncio.get_running_loop()
|
|
||||||
q: asyncio.Queue[Optional[str]] = asyncio.Queue()
|
|
||||||
|
|
||||||
def producer():
|
|
||||||
try:
|
|
||||||
gen = make_gen()
|
|
||||||
if hasattr(gen, "__aiter__"):
|
|
||||||
|
|
||||||
async def drain_async():
|
|
||||||
async for t in gen:
|
|
||||||
loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t))
|
|
||||||
|
|
||||||
asyncio.run(drain_async())
|
|
||||||
elif hasattr(gen, "__iter__"):
|
|
||||||
for t in gen:
|
|
||||||
loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t))
|
|
||||||
else:
|
|
||||||
loop.call_soon_threadsafe(q.put_nowait, gen if isinstance(gen, str) else str(gen))
|
|
||||||
finally:
|
|
||||||
loop.call_soon_threadsafe(q.put_nowait, None)
|
|
||||||
|
|
||||||
Thread(target=producer, daemon=True).start()
|
|
||||||
|
|
||||||
while True:
|
|
||||||
item = await q.get()
|
|
||||||
if item is None:
|
|
||||||
break
|
|
||||||
yield item
|
|
||||||
|
|
||||||
@override
|
|
||||||
async def chat(
|
|
||||||
self,
|
|
||||||
messages: list[dict[str, str]],
|
|
||||||
system: Optional[str] = None,
|
|
||||||
tools: Optional[str] = None,
|
|
||||||
images: Optional[list["ImageInput"]] = None,
|
|
||||||
videos: Optional[list["VideoInput"]] = None,
|
|
||||||
audios: Optional[list["AudioInput"]] = None,
|
|
||||||
**input_kwargs,
|
|
||||||
) -> list["Response"]:
|
|
||||||
if not self.can_generate:
|
|
||||||
raise ValueError("The current model does not support `chat`.")
|
|
||||||
async with self.semaphore:
|
|
||||||
produced = ""
|
|
||||||
final_text = ""
|
|
||||||
async for t in self._generate(messages, system, tools, **input_kwargs):
|
|
||||||
delta = t
|
|
||||||
produced = produced + delta
|
|
||||||
if delta:
|
|
||||||
final_text += delta
|
|
||||||
|
|
||||||
prompt_ids, _ = self.template.encode_oneturn(
|
|
||||||
self.tokenizer, messages + [{"role": "assistant", "content": ""}], system, tools
|
|
||||||
)
|
|
||||||
return [
|
|
||||||
Response(
|
|
||||||
response_text=final_text,
|
|
||||||
response_length=len(self.tokenizer.encode(final_text, add_special_tokens=False)),
|
|
||||||
prompt_length=len(prompt_ids),
|
|
||||||
finish_reason="stop",
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
@override
|
|
||||||
async def stream_chat(
|
|
||||||
self,
|
|
||||||
messages: list[dict[str, str]],
|
|
||||||
system: Optional[str] = None,
|
|
||||||
tools: Optional[str] = None,
|
|
||||||
images: Optional[list["ImageInput"]] = None,
|
|
||||||
videos: Optional[list["VideoInput"]] = None,
|
|
||||||
audios: Optional[list["AudioInput"]] = None,
|
|
||||||
**input_kwargs,
|
|
||||||
) -> AsyncGenerator[str, None]:
|
|
||||||
if not self.can_generate:
|
|
||||||
raise ValueError("The current model does not support `stream_chat`.")
|
|
||||||
async with self.semaphore:
|
|
||||||
produced = ""
|
|
||||||
async for t in self._generate(messages, system, tools, **input_kwargs):
|
|
||||||
delta = t[len(produced) :] if t.startswith(produced) else t
|
|
||||||
produced = t
|
|
||||||
if delta:
|
|
||||||
yield delta
|
|
||||||
|
|
||||||
@override
|
|
||||||
async def get_scores(
|
|
||||||
self,
|
|
||||||
batch_input: list[str],
|
|
||||||
**input_kwargs,
|
|
||||||
) -> list[float]:
|
|
||||||
if self.can_generate:
|
|
||||||
raise ValueError("Cannot get scores using an auto-regressive model.")
|
|
||||||
args = (self.model, self.tokenizer, batch_input, input_kwargs)
|
|
||||||
async with self.semaphore:
|
|
||||||
return await asyncio.to_thread(self._get_scores, *args)
|
|
||||||
@@ -139,7 +139,6 @@ class EngineName(StrEnum):
|
|||||||
HF = "huggingface"
|
HF = "huggingface"
|
||||||
VLLM = "vllm"
|
VLLM = "vllm"
|
||||||
SGLANG = "sglang"
|
SGLANG = "sglang"
|
||||||
KT = "ktransformers"
|
|
||||||
|
|
||||||
|
|
||||||
class DownloadSource(StrEnum):
|
class DownloadSource(StrEnum):
|
||||||
|
|||||||
@@ -96,8 +96,8 @@ def check_dependencies() -> None:
|
|||||||
r"""Check the version of the required packages."""
|
r"""Check the version of the required packages."""
|
||||||
check_version("transformers>=4.55.0,<=5.6.0")
|
check_version("transformers>=4.55.0,<=5.6.0")
|
||||||
check_version("datasets>=2.16.0,<=4.0.0")
|
check_version("datasets>=2.16.0,<=4.0.0")
|
||||||
check_version("accelerate>=1.3.0,<=1.11.0")
|
check_version("accelerate>=1.3.0,<=1.15.0")
|
||||||
check_version("peft>=0.18.0,<=0.18.1")
|
check_version("peft>=0.18.0,<=0.20.0")
|
||||||
check_version("trl>=0.18.0,<=0.24.0")
|
check_version("trl>=0.18.0,<=0.24.0")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -88,7 +88,7 @@ def is_ray_available():
|
|||||||
|
|
||||||
|
|
||||||
def is_kt_available():
|
def is_kt_available():
|
||||||
return _is_package_available("ktransformers")
|
return _is_package_available("kt_kernel")
|
||||||
|
|
||||||
|
|
||||||
def is_requests_available():
|
def is_requests_available():
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
from dataclasses import asdict, dataclass, field, fields
|
from dataclasses import asdict, dataclass, field, fields
|
||||||
from typing import Any, Literal, Self
|
from typing import Any, Literal, Self
|
||||||
|
|
||||||
@@ -460,47 +461,81 @@ class SGLangArguments:
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class KTransformersArguments:
|
class KTransformersArguments:
|
||||||
r"""Arguments pertaining to the KT training."""
|
r"""Arguments pertaining to KTransformers AMX MoE SFT training.
|
||||||
|
|
||||||
|
These fields are normalized into the transformers/accelerate KT config before training starts.
|
||||||
|
"""
|
||||||
|
|
||||||
use_kt: bool = field(
|
use_kt: bool = field(
|
||||||
default=False,
|
default=False,
|
||||||
metadata={"help": "Whether To Use KTransformers Optimizations For LoRA Training."},
|
metadata={"help": "Whether to use KTransformers AMX MoE backend for SFT training."},
|
||||||
)
|
)
|
||||||
kt_optimize_rule: str | None = field(
|
kt_weight_path: str | None = field(
|
||||||
default=None,
|
default=None,
|
||||||
metadata={
|
metadata={"help": "Path to pre-quantized INT8 expert weights (.kt files)."},
|
||||||
"help": "Path To The KTransformers Optimize Rule; See https://github.com/kvcache-ai/ktransformers/."
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
cpu_infer: int | None = field(
|
kt_expert_checkpoint_path: str | None = field(
|
||||||
default=32,
|
default=None,
|
||||||
metadata={"help": "Number Of CPU Cores Used For Computation."},
|
metadata={"help": "Path to expert checkpoint (safetensors) for online conversion."},
|
||||||
)
|
)
|
||||||
chunk_size: int | None = field(
|
kt_use_lora_experts: bool | None = field(
|
||||||
default=8192,
|
default=None,
|
||||||
metadata={"help": "Chunk Size Used For CPU Compute In KTransformers."},
|
metadata={"help": "Whether to use GPU-side LoRA Experts."},
|
||||||
)
|
)
|
||||||
mode: str | None = field(
|
kt_lora_expert_num: int | None = field(
|
||||||
default="normal",
|
default=None,
|
||||||
metadata={"help": "Normal Or Long_Context For Llama Models."},
|
metadata={"help": "Number of GPU-side LoRA Experts."},
|
||||||
|
)
|
||||||
|
kt_lora_expert_intermediate_size: int | None = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "Intermediate size for GPU-side LoRA Experts."},
|
||||||
)
|
)
|
||||||
|
|
||||||
kt_maxlen: int = field(
|
def get_kt_config_dict(self, finetuning_args: Any, model_max_length: int | None) -> dict[str, Any]:
|
||||||
default=4096,
|
r"""Build KT config values from LLaMA-Factory model and LoRA arguments."""
|
||||||
metadata={"help": "Maximum Sequence (Prompt + Response) Length Of The KT Engine."},
|
kt_config = {
|
||||||
)
|
"kt_lora_rank": getattr(finetuning_args, "lora_rank", None),
|
||||||
kt_use_cuda_graph: bool = field(
|
"kt_lora_alpha": getattr(finetuning_args, "lora_alpha", None),
|
||||||
default=True,
|
"kt_weight_path": self.kt_weight_path,
|
||||||
metadata={"help": "Whether To Use CUDA Graphs For The KT Engine."},
|
"kt_expert_checkpoint_path": self.kt_expert_checkpoint_path,
|
||||||
)
|
"kt_model_max_length": model_max_length,
|
||||||
kt_mode: str = field(
|
"kt_use_lora_experts": self.kt_use_lora_experts,
|
||||||
default="normal",
|
"kt_lora_expert_num": self.kt_lora_expert_num,
|
||||||
metadata={"help": "Normal Or Long_Context Mode For The KT Engine."},
|
"kt_lora_expert_intermediate_size": self.kt_lora_expert_intermediate_size,
|
||||||
)
|
}
|
||||||
kt_force_think: bool = field(
|
return {key: value for key, value in kt_config.items() if value is not None}
|
||||||
default=False,
|
|
||||||
metadata={"help": "Force-Think Toggle For The KT Engine."},
|
def apply_kt_config(self, finetuning_args: Any, training_args: Any, model_max_length: int | None) -> None:
|
||||||
|
r"""Apply LLaMA-Factory KT args to transformers/accelerate KT integration points."""
|
||||||
|
if not self.use_kt:
|
||||||
|
return
|
||||||
|
|
||||||
|
kt_config = self.get_kt_config_dict(finetuning_args, model_max_length)
|
||||||
|
env_mapping = {
|
||||||
|
"kt_weight_path": "ACCELERATE_KT_WEIGHT_PATH",
|
||||||
|
"kt_expert_checkpoint_path": "ACCELERATE_KT_EXPERT_CHECKPOINT_PATH",
|
||||||
|
"kt_model_max_length": "ACCELERATE_KT_MODEL_MAX_LENGTH",
|
||||||
|
"kt_lora_rank": "ACCELERATE_KT_LORA_RANK",
|
||||||
|
"kt_lora_alpha": "ACCELERATE_KT_LORA_ALPHA",
|
||||||
|
"kt_use_lora_experts": "ACCELERATE_KT_USE_LORA_EXPERTS",
|
||||||
|
"kt_lora_expert_num": "ACCELERATE_KT_LORA_EXPERT_NUM",
|
||||||
|
"kt_lora_expert_intermediate_size": "ACCELERATE_KT_LORA_EXPERT_INTERMEDIATE_SIZE",
|
||||||
|
}
|
||||||
|
for key, env_key in env_mapping.items():
|
||||||
|
value = kt_config.get(key)
|
||||||
|
if value is not None:
|
||||||
|
os.environ[env_key] = str(value)
|
||||||
|
|
||||||
|
hf_kt = getattr(training_args, "hf_kt_config", None)
|
||||||
|
if hf_kt is None or not hasattr(hf_kt, "_kt_config") or not isinstance(hf_kt._kt_config, dict):
|
||||||
|
return
|
||||||
|
|
||||||
|
hf_kt._kt_config.update(kt_config)
|
||||||
|
gc_enabled = getattr(training_args, "gradient_checkpointing", False) or not getattr(
|
||||||
|
self, "disable_gradient_checkpointing", True
|
||||||
)
|
)
|
||||||
|
if gc_enabled:
|
||||||
|
hf_kt._kt_config.setdefault("kt_share_cache_pool", True)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -186,13 +186,16 @@ def _verify_model_args(
|
|||||||
raise ValueError("Quantized model only accepts a single adapter. Merge them first.")
|
raise ValueError("Quantized model only accepts a single adapter. Merge them first.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _check_extra_dependencies(
|
def _check_extra_dependencies(
|
||||||
model_args: "ModelArguments",
|
model_args: "ModelArguments",
|
||||||
finetuning_args: "FinetuningArguments",
|
finetuning_args: "FinetuningArguments",
|
||||||
training_args: Optional["TrainingArguments"] = None,
|
training_args: Optional["TrainingArguments"] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
if model_args.use_kt:
|
if model_args.use_kt:
|
||||||
check_version("ktransformers", mandatory=True)
|
check_version("kt-kernel", mandatory=True)
|
||||||
|
check_version("transformers-kt", mandatory=True)
|
||||||
|
check_version("accelerate-kt", mandatory=True)
|
||||||
|
|
||||||
if model_args.use_unsloth:
|
if model_args.use_unsloth:
|
||||||
check_version("unsloth", mandatory=True)
|
check_version("unsloth", mandatory=True)
|
||||||
@@ -510,6 +513,9 @@ def get_train_args(args: dict[str, Any] | list[str] | None = None) -> _TRAIN_CLS
|
|||||||
)
|
)
|
||||||
transformers.set_seed(training_args.seed)
|
transformers.set_seed(training_args.seed)
|
||||||
|
|
||||||
|
if model_args.use_kt:
|
||||||
|
model_args.apply_kt_config(finetuning_args, training_args, model_args.model_max_length)
|
||||||
|
|
||||||
return model_args, data_args, training_args, finetuning_args, generating_args
|
return model_args, data_args, training_args, finetuning_args, generating_args
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ from transformers.integrations import is_deepspeed_zero3_enabled
|
|||||||
|
|
||||||
from ..extras import logging
|
from ..extras import logging
|
||||||
from ..extras.constants import EngineName
|
from ..extras.constants import EngineName
|
||||||
from .model_utils.ktransformers import get_kt_peft_model, load_kt_peft_model
|
|
||||||
from .model_utils.misc import find_all_linear_modules, find_expanded_modules
|
from .model_utils.misc import find_all_linear_modules, find_expanded_modules
|
||||||
from .model_utils.quantization import QuantizationMethod
|
from .model_utils.quantization import QuantizationMethod
|
||||||
from .model_utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model
|
from .model_utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model
|
||||||
@@ -188,12 +187,6 @@ def _setup_lora_tuning(
|
|||||||
"token": model_args.hf_hub_token,
|
"token": model_args.hf_hub_token,
|
||||||
}
|
}
|
||||||
|
|
||||||
if model_args.use_kt:
|
|
||||||
if model_args.infer_backend != EngineName.KT:
|
|
||||||
raise ValueError(
|
|
||||||
"We should use ktransformers as backend to infer the adapter fine-tuned by ktransformers."
|
|
||||||
)
|
|
||||||
|
|
||||||
for adapter in adapter_to_merge:
|
for adapter in adapter_to_merge:
|
||||||
model: LoraModel = PeftModel.from_pretrained(model, adapter, **init_kwargs)
|
model: LoraModel = PeftModel.from_pretrained(model, adapter, **init_kwargs)
|
||||||
model = model.merge_and_unload()
|
model = model.merge_and_unload()
|
||||||
@@ -202,9 +195,7 @@ def _setup_lora_tuning(
|
|||||||
logger.info_rank0(f"Merged {len(adapter_to_merge)} adapter(s).")
|
logger.info_rank0(f"Merged {len(adapter_to_merge)} adapter(s).")
|
||||||
|
|
||||||
if adapter_to_resume is not None: # resume lora training
|
if adapter_to_resume is not None: # resume lora training
|
||||||
if model_args.use_kt:
|
if model_args.use_unsloth:
|
||||||
model = load_kt_peft_model(model_args, model)
|
|
||||||
elif model_args.use_unsloth:
|
|
||||||
model = load_unsloth_peft_model(config, model_args, finetuning_args, is_trainable=is_trainable)
|
model = load_unsloth_peft_model(config, model_args, finetuning_args, is_trainable=is_trainable)
|
||||||
else:
|
else:
|
||||||
model = PeftModel.from_pretrained(model, adapter_to_resume, is_trainable=is_trainable, **init_kwargs)
|
model = PeftModel.from_pretrained(model, adapter_to_resume, is_trainable=is_trainable, **init_kwargs)
|
||||||
@@ -217,16 +208,6 @@ def _setup_lora_tuning(
|
|||||||
else:
|
else:
|
||||||
target_modules = finetuning_args.lora_target
|
target_modules = finetuning_args.lora_target
|
||||||
|
|
||||||
if model_args.use_kt:
|
|
||||||
new_list = []
|
|
||||||
for m in target_modules:
|
|
||||||
if m in ("down_proj", "up_proj", "gate_proj"):
|
|
||||||
new_list.extend([f"mlp.{m}", f"shared_experts.{m}"])
|
|
||||||
elif m not in ("generate_linear", "orig_module", "prefill_linear"):
|
|
||||||
new_list.append(m)
|
|
||||||
|
|
||||||
target_modules[:] = new_list
|
|
||||||
|
|
||||||
if finetuning_args.use_llama_pro:
|
if finetuning_args.use_llama_pro:
|
||||||
target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers)
|
target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers)
|
||||||
|
|
||||||
@@ -270,19 +251,11 @@ def _setup_lora_tuning(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if model_args.use_kt:
|
if model_args.use_kt:
|
||||||
if finetuning_args.finetuning_type == "oft":
|
if finetuning_args.finetuning_type != "lora":
|
||||||
raise ValueError("KTransformers is currently not supported for OFT.")
|
raise ValueError("KTransformers only supports LoRA finetuning.")
|
||||||
if finetuning_args.finetuning_type == "lora":
|
|
||||||
peft_config = LoraConfig(
|
|
||||||
task_type=TaskType.CAUSAL_LM,
|
|
||||||
inference_mode=False,
|
|
||||||
**peft_kwargs,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise ValueError("KTransformers is currently only supported for LoRA.")
|
|
||||||
|
|
||||||
model = get_kt_peft_model(model, peft_config)
|
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, **peft_kwargs)
|
||||||
print(f"KT_model:{model}")
|
model = get_peft_model(model, peft_config)
|
||||||
elif model_args.use_unsloth:
|
elif model_args.use_unsloth:
|
||||||
if finetuning_args.finetuning_type == "oft":
|
if finetuning_args.finetuning_type == "oft":
|
||||||
raise ValueError("Unsloth is currently not supported for OFT.")
|
raise ValueError("Unsloth is currently not supported for OFT.")
|
||||||
|
|||||||
@@ -31,7 +31,6 @@ from ..extras import logging
|
|||||||
from ..extras.misc import count_parameters, skip_check_imports, try_download_model_from_other_hub
|
from ..extras.misc import count_parameters, skip_check_imports, try_download_model_from_other_hub
|
||||||
from ..extras.packages import is_torch_version_greater_than
|
from ..extras.packages import is_torch_version_greater_than
|
||||||
from .adapter import init_adapter
|
from .adapter import init_adapter
|
||||||
from .model_utils.ktransformers import load_kt_pretrained_model
|
|
||||||
from .model_utils.liger_kernel import apply_liger_kernel
|
from .model_utils.liger_kernel import apply_liger_kernel
|
||||||
from .model_utils.misc import register_autoclass
|
from .model_utils.misc import register_autoclass
|
||||||
from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
|
from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
|
||||||
@@ -144,12 +143,7 @@ def load_model(
|
|||||||
|
|
||||||
model = None
|
model = None
|
||||||
lazy_load = False
|
lazy_load = False
|
||||||
if model_args.use_kt:
|
if model_args.use_unsloth:
|
||||||
from ktransformers.sft.monkey_patch_torch_module import install_patch
|
|
||||||
|
|
||||||
install_patch()
|
|
||||||
model = load_kt_pretrained_model(config, model_args)
|
|
||||||
elif model_args.use_unsloth:
|
|
||||||
if model_args.adapter_name_or_path is not None:
|
if model_args.adapter_name_or_path is not None:
|
||||||
lazy_load = True
|
lazy_load = True
|
||||||
elif is_trainable:
|
elif is_trainable:
|
||||||
|
|||||||
@@ -1,154 +0,0 @@
|
|||||||
# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import importlib.util as _u
|
|
||||||
from typing import TYPE_CHECKING, Any
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from ...extras import logging
|
|
||||||
from ...extras.misc import get_current_device
|
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from ...hparams import FinetuningArguments, ModelArguments
|
|
||||||
|
|
||||||
from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
|
|
||||||
|
|
||||||
|
|
||||||
KT_AVAILABLE = _u.find_spec("ktransformers") is not None
|
|
||||||
if KT_AVAILABLE:
|
|
||||||
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
|
|
||||||
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
|
|
||||||
from ktransformers.models.modeling_llama import LlamaForCausalLM
|
|
||||||
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
|
|
||||||
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
|
|
||||||
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeForCausalLM
|
|
||||||
from ktransformers.optimize.optimize import optimize_and_load_gguf
|
|
||||||
from ktransformers.server.config.config import Config
|
|
||||||
from ktransformers.sft.lora import inject_lora_layer
|
|
||||||
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
|
|
||||||
from ktransformers.util.globals import GLOBAL_CONFIG
|
|
||||||
from ktransformers.util.utils import load_weights
|
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_kt_kwargs(
    config: "PretrainedConfig",
    model_name_or_path: str,
    model_args: "ModelArguments",
    finetuning_args: "FinetuningArguments",
) -> dict[str, Any]:
    r"""Assemble the keyword arguments forwarded to the KTransformers model loader.

    Args:
        config: The HF model config; only its optional ``rope_scaling`` attribute is read.
        model_name_or_path: Model identifier or local path passed through as ``model_name``.
        model_args: Source of sequence length, dtype, quantization and hub-token settings.
        finetuning_args: Used to decide whether full finetuning is requested.

    Returns:
        A plain dict of loader options understood by KTransformers.
    """
    kt_kwargs: dict[str, Any] = {
        "model_name": model_name_or_path,
        # Fall back to a 4096-token context when no explicit max length is configured.
        "max_seq_length": model_args.model_max_length or 4096,
        "dtype": model_args.compute_dtype,
        "load_in_4bit": model_args.quantization_bit == 4,
        "token": model_args.hf_hub_token,
        "full_finetuning": finetuning_args.finetuning_type == "full",
    }
    # Pin every module onto the current accelerator device.
    kt_kwargs["device_map"] = {"": get_current_device()}
    # ``rope_scaling`` is optional on the config; pass None when absent.
    kt_kwargs["rope_scaling"] = getattr(config, "rope_scaling", None)
    kt_kwargs["fix_tokenizer"] = False
    kt_kwargs["trust_remote_code"] = model_args.trust_remote_code
    # Select KTransformers' own gradient-checkpointing implementation.
    kt_kwargs["use_gradient_checkpointing"] = "ktransformers"
    return kt_kwargs
|
|
||||||
|
|
||||||
|
|
||||||
def load_kt_pretrained_model(config: "PretrainedConfig", model_args: "ModelArguments") -> "PreTrainedModel":
    r"""Optionally load pretrained model with KTransformers. Used in training.

    The model skeleton is first instantiated on the meta device (no weight
    allocation), then KTransformers materializes and optimizes the weights
    from the GGUF checkpoint according to the YAML rule file given by
    ``model_args.kt_optimize_rule``.

    Args:
        config: Incoming HF config; note it is re-loaded below from
            ``model_args.model_name_or_path`` and the parameter value is discarded.
        model_args: Provides the model path, CPU-infer/chunk-size settings,
            the long-context mode flag and the optimize-rule path.

    Returns:
        The KTransformers-optimized ``PreTrainedModel``.
    """
    # Architectures that must use KTransformers' own modeling classes.
    custom_models = {
        "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
        "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
        "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
        "Qwen3MoeForCausalLM": Qwen3MoeForCausalLM,
        "LlamaForCausalLM": LlamaForCausalLM,
        "MixtralForCausalLM": MixtralForCausalLM,
    }
    # KTransformers reads these from its global Config singleton.
    Config().cpu_infer = model_args.cpu_infer
    Config().chunk_size = model_args.chunk_size
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code)

    if model_args.mode == "long_context":
        assert config.architectures[0] == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode"
        torch.set_default_dtype(torch.float16)
    else:
        torch.set_default_dtype(config.torch_dtype)

    # Build the module tree on the meta device so no real weight memory is allocated;
    # optimize_and_load_gguf materializes the weights afterwards.
    with torch.device("meta"):
        if config.architectures[0] in custom_models:
            logger.info("using custom modeling_xxx.py.")
            if "Qwen2Moe" in config.architectures[0]:  # Qwen2Moe must use flash_attention_2 to avoid overflow.
                config._attn_implementation = "flash_attention_2"
            if "Llama" in config.architectures[0]:
                config._attn_implementation = "eager"
            if "Mixtral" in config.architectures[0]:
                config._attn_implementation = "flash_attention_2"
            model = custom_models[config.architectures[0]](config)
        else:
            attn_implementation = "flash_attention_2"
            model = AutoModelForCausalLM.from_config(
                config, trust_remote_code=True, attn_implementation=attn_implementation
            )

    optimize_config_path = model_args.kt_optimize_rule
    gguf_path = model_args.model_name_or_path

    assert optimize_config_path is not None, "optimize_config_path must be provided (path to YAML rules file)."
    assert gguf_path is not None, "gguf_path must be provided (path to a folder or .gguf file)."

    # Switch KTransformers into inference mode while loading/optimizing weights.
    GLOBAL_CONFIG._config["mod"] = "infer"
    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)

    return model
|
|
||||||
|
|
||||||
|
|
||||||
def get_kt_peft_model(model: "PreTrainedModel", peft_kwargs: dict[str, Any]) -> "PreTrainedModel":
    r"""Wrap the pretrained model with KTransformers' PEFT adapters. Used in training.

    Args:
        model: The base model produced by the KTransformers loader.
        peft_kwargs: Keyword arguments forwarded verbatim to KTransformers' PEFT mapping.

    Returns:
        The PEFT-wrapped model.
    """
    # Imported lazily so this module stays importable when ktransformers is absent.
    from ktransformers.sft.peft_utils.mapping import get_peft_model as _kt_get_peft_model

    return _kt_get_peft_model(model, peft_kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def load_kt_peft_model(model_args: "ModelArguments", model: "PreTrainedModel") -> "PreTrainedModel":
    r"""Load peft model with KTransformers. Used in both training and inference.

    Supports two adapter formats: a ``.gguf`` adapter loaded through KTransformers'
    GGUF loader, or a safetensors directory whose PEFT-style keys are remapped onto
    the injected LoRA modules one tensor at a time.

    Args:
        model_args: Only ``adapter_name_or_path[0]`` is used as the adapter location.
        model: The base model to inject LoRA layers into.

    Returns:
        The model with adapter weights loaded in place.
    """
    load_adapter_name_or_path = model_args.adapter_name_or_path[0]
    if load_adapter_name_or_path.endswith(".gguf"):
        inject_lora_layer(model, load_adapter_name_or_path)
        adapter_gguf_loader = GGUFLoader(load_adapter_name_or_path)
        load_weights(model, adapter_gguf_loader, adapter_gguf=True)
        model.train()
    else:
        inject_lora_layer(model, load_adapter_name_or_path)

        adapter_loader = SafeTensorLoader(load_adapter_name_or_path)
        device = next(model.parameters()).device
        for key in adapter_loader.tensor_file_map.keys():
            # Map PEFT checkpoint names onto the injected module names BEFORE the
            # try block: the original code computed this inside it, so a KeyError
            # from load_tensor() hit an unbound `model_key` in the except handler.
            model_key = key.replace("base_model.model.", "")
            model_key = model_key.replace(".weight", ".default.weight")
            model_key = model_key.replace(".default.default.weight", ".default.weight")
            try:
                tensor = adapter_loader.load_tensor(key, device=device)
                param = model.get_parameter(model_key)
                param.data.copy_(tensor.data)  # in-place copy keeps parameter identity
                logger.info(f"Loaded adapter weight: {key} -> {model_key}")
            except AttributeError:
                logger.warning(f"Skipping {key}: not a model parameter")
            except KeyError:
                logger.warning(f"Key not found in model: {model_key} (original: {key})")

    return model
|
|
||||||
@@ -1,62 +0,0 @@
|
|||||||
# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
|
|
||||||
#
|
|
||||||
# This code is inspired by the HuggingFace's TRL library.
|
|
||||||
# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
from typing import TYPE_CHECKING
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from ktransformers.sft.lora import KTrainer # type: ignore
|
|
||||||
from typing_extensions import override
|
|
||||||
|
|
||||||
from ..trainer_utils import get_batch_logps, nested_detach
|
|
||||||
from .trainer import CustomDPOTrainer
|
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from transformers import PreTrainedModel
|
|
||||||
|
|
||||||
|
|
||||||
class KDPOTrainer(KTrainer, CustomDPOTrainer):
    """DPO trainer that combines KTransformers' KTrainer with the project's CustomDPOTrainer.

    Overrides only the concatenated forward pass so logits are moved to CPU before
    the log-prob computation (presumably to limit GPU memory when experts live on
    CPU — confirm against KTransformers' offloading design).
    """

    @override
    def concatenated_forward(
        self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"], is_ref_model: bool = False
    ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
        r"""Compute the sum log probabilities of the labels under given logits if loss_type is not IPO, ORPO or SimPO.

        Otherwise the average log probabilities.
        """
        if self.finetuning_args.use_ref_model:
            batch = nested_detach(batch, clone=True)  # avoid error

        labels = batch.pop("labels")  # dpo do not need compute loss in forward
        # Full-precision logits for numerically stable log-prob computation.
        all_logits: torch.Tensor = model(**batch, return_dict=True, use_cache=False).logits.to(torch.float32)
        # Move logits (and labels, below) to CPU before computing per-token logps.
        all_logits = all_logits.to("cpu")
        labels = labels.to(all_logits.device)
        # ld_alpha (length-desensitization) is applied only to the policy model, not the reference model.
        all_logps, valid_length = get_batch_logps(
            logits=all_logits, labels=labels, ld_alpha=(self.ld_alpha if not is_ref_model else None)
        )
        if self.loss_type in ["ipo", "orpo", "simpo"]:
            # These loss types use length-averaged log probabilities.
            all_logps = all_logps / valid_length

        # The batch is the chosen examples followed by the rejected ones; split in half.
        batch_size = batch["input_ids"].size(0) // 2
        chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0)
        chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0)
        chosen_length, _ = valid_length.split(batch_size, dim=0)

        if self.loss_type in ["ipo", "orpo", "simpo"]:
            # Already length-averaged above, so the fifth element is the same tensor.
            return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps
        else:
            # Fifth element: length-averaged chosen log-probs for metrics/auxiliary losses.
            return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps / chosen_length
|
|
||||||
@@ -62,14 +62,6 @@ def run_dpo(
|
|||||||
else:
|
else:
|
||||||
ref_model = None
|
ref_model = None
|
||||||
|
|
||||||
if model_args.use_kt:
|
|
||||||
from ktransformers.util.globals import GLOBAL_CONFIG # type: ignore
|
|
||||||
|
|
||||||
from .ktrainer import KDPOTrainer as CustomDPOTrainer
|
|
||||||
|
|
||||||
GLOBAL_CONFIG._config["mod"] = "sft"
|
|
||||||
|
|
||||||
else:
|
|
||||||
from .trainer import CustomDPOTrainer
|
from .trainer import CustomDPOTrainer
|
||||||
|
|
||||||
# Initialize our Trainer
|
# Initialize our Trainer
|
||||||
|
|||||||
@@ -103,25 +103,6 @@ def run_sft(
|
|||||||
gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
|
gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
|
||||||
|
|
||||||
# Initialize our Trainer
|
# Initialize our Trainer
|
||||||
if model_args.use_kt:
|
|
||||||
from ktransformers.sft.lora import KTrainer # type: ignore
|
|
||||||
from ktransformers.util.globals import GLOBAL_CONFIG # type: ignore
|
|
||||||
|
|
||||||
GLOBAL_CONFIG._config["mod"] = "sft"
|
|
||||||
|
|
||||||
trainer = KTrainer(
|
|
||||||
model=model,
|
|
||||||
args=training_args,
|
|
||||||
tokenizer=tokenizer_module,
|
|
||||||
data_collator=data_collator,
|
|
||||||
callbacks=callbacks,
|
|
||||||
**dataset_module,
|
|
||||||
**metric_module,
|
|
||||||
)
|
|
||||||
trainer.model_accepts_loss_kwargs = False
|
|
||||||
model.config.use_cache = False
|
|
||||||
|
|
||||||
else:
|
|
||||||
trainer = CustomSeq2SeqTrainer(
|
trainer = CustomSeq2SeqTrainer(
|
||||||
model=model,
|
model=model,
|
||||||
args=training_args,
|
args=training_args,
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ def create_modelcard_and_push(
|
|||||||
kwargs["tags"] = kwargs["tags"] + ["unsloth"]
|
kwargs["tags"] = kwargs["tags"] + ["unsloth"]
|
||||||
|
|
||||||
if model_args.use_kt:
|
if model_args.use_kt:
|
||||||
kwargs["tags"] = kwargs["tags"] + ["ktransformers"]
|
kwargs["tags"] = kwargs["tags"] + ["kt-kernel"]
|
||||||
|
|
||||||
if not training_args.do_train:
|
if not training_args.do_train:
|
||||||
pass
|
pass
|
||||||
|
|||||||
Reference in New Issue
Block a user