[model] temporarily support npu fused options on v0, powered by v1 kernels (#9520)

Co-authored-by: frozenleaves <frozen@Mac.local>
This commit is contained in:
浮梦
2025-11-27 02:08:36 +08:00
committed by GitHub
parent f17efde693
commit 2b6f16f261
3 changed files with 19 additions and 4 deletions

View File

@@ -7,7 +7,7 @@
prefill_device: "cuda"
- match:
name: "^lm_head$" # regular expression
name: "^lm_head$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@@ -18,7 +18,7 @@
prefill_op: "KLinearTorch"
# - match:
# name: "^model\\.layers\\..*$" # regular expression
# name: "^model\\.layers\\..*$" # regular expression
# class: torch.nn.Linear # only match modules matching name and class simultaneously
# replace:
# class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@@ -28,7 +28,7 @@
# generate_op: "KLinearTorch"
# prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$" # regular expression
name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
@@ -77,4 +77,4 @@
replace:
class: "ktransformers.operators.models.KQwen3MoeModel"
kwargs:
per_layer_prefill_intput_threshold: 0
per_layer_prefill_intput_threshold: 0