mirror of https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-11-28 19:24:20 +08:00
[model] temporarily support npu fused options on v0, powered by v1 kernels (#9520)
Co-authored-by: frozenleaves <frozen@Mac.local>
This commit is contained in:
parent f17efde693
commit 2b6f16f261
@@ -7,7 +7,7 @@
       prefill_device: "cuda"
 
 - match:
-    name: "^lm_head$"  # regular expression
+    name: "^lm_head$"  # regular expression
     class: torch.nn.Linear  # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
@@ -18,7 +18,7 @@
       prefill_op: "KLinearTorch"
 
 # - match:
-#     name: "^model\\.layers\\..*$"  # regular expression
+#     name: "^model\\.layers\\..*$"  # regular expression
 #     class: torch.nn.Linear  # only match modules matching name and class simultaneously
 #   replace:
 #     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
@@ -28,7 +28,7 @@
 #     generate_op: "KLinearTorch"
 #     prefill_op: "KLinearTorch"
 - match:
-    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression
+    name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$"  # regular expression
     class: torch.nn.Linear  # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
@@ -77,4 +77,4 @@
   replace:
     class: "ktransformers.operators.models.KQwen3MoeModel"
     kwargs:
-      per_layer_prefill_intput_threshold: 0
+      per_layer_prefill_intput_threshold: 0
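
A note on how the rules above select modules: each rule's name field is an ordinary regular expression matched against a module's dotted path, and class narrows the match to that module type, so the negative lookahead in the last active rule keeps the MoE shared_expert_gate projections on the stock implementation. A minimal sketch of the name matching (the module paths below are hypothetical examples, not taken from this diff):

import re

# Match pattern from the last active rule above: everything under model.layers
# except modules whose path contains mlp.shared_expert_gate (negative lookahead).
pattern = re.compile(r"^model\.layers\.(?!.*mlp\.shared_expert_gate).*$")

# Hypothetical module paths, purely to show what the lookahead includes and excludes.
candidates = [
    "model.layers.0.self_attn.q_proj",
    "model.layers.0.mlp.gate_proj",
    "model.layers.0.mlp.shared_expert_gate",
    "lm_head",
]

for name in candidates:
    print(f"{name}: {'matched' if pattern.match(name) else 'skipped'}")
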
@@ -174,6 +174,10 @@ class BaseModelArguments:
         default=True,
         metadata={"help": "Whether or not to use KV cache in generation."},
     )
+    use_v1_kernels: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use high-performance kernels in training."},
+    )
     infer_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field(
         default="auto",
         metadata={"help": "Data type for model weights and activations at inference."},
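
Since BaseModelArguments is a dataclass consumed by an HfArgumentParser-style parser, the new field above is all that is needed to expose use_v1_kernels as a regular model argument. A minimal sketch of that mechanism with a toy stand-in dataclass (not LLaMA-Factory's actual argument plumbing):

from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class ToyModelArguments:
    # Toy stand-in for BaseModelArguments, reduced to the newly added flag.
    use_v1_kernels: bool = field(
        default=False,
        metadata={"help": "Whether or not to use high-performance kernels in training."},
    )


# Parse a dict the way a YAML training config would be consumed.
parser = HfArgumentParser(ToyModelArguments)
(model_args,) = parser.parse_dict({"use_v1_kernels": True})
print(model_args.use_v1_kernels)  # True
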
@@ -213,6 +213,17 @@ def load_model(
     else:
         model.train()
 
+    # Borrow the v1 kernel-plugin mechanism to temporarily apply the NPU fusion operators to v0.
+    # It is disabled by default and can be removed after the transition period ends.
+    if model_args.use_v1_kernels and is_trainable:
+        logger.warning_rank0(
+            "You are trying to use an experimental kernel feature. Note that it is not supported "
+            "for all models. If you get any error, please disable this feature or report the issue."
+        )
+        from ..v1.plugins.model_plugins.kernels.registry import apply_available_kernels
+
+        model = apply_available_kernels(model)
+
     trainable_params, all_param = count_parameters(model)
     if is_trainable:
         param_stats = (
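
The diff only shows the call site; the implementation of apply_available_kernels lives in the v1 plugin registry and is not part of this commit. Purely as an illustration of the kind of best-effort module swapping such a registry could perform, here is a self-contained sketch in which every name is hypothetical:

import torch
from torch import nn


class FusedSiLUSketch(nn.Module):
    # Stand-in "fused" activation; a real kernel plugin would dispatch to an NPU/CUDA fused op.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return nn.functional.silu(x)


def apply_sketch_kernels(model: nn.Module) -> nn.Module:
    # Hypothetical registry mapping stock modules to "fused" replacements.
    registry = {nn.SiLU: FusedSiLUSketch}
    # Materialize the module list first so in-place replacement is safe.
    for module in list(model.modules()):
        for child_name, child in list(module.named_children()):
            fused_cls = registry.get(type(child))
            if fused_cls is not None:
                setattr(module, child_name, fused_cls())
    return model


# Usage mirroring the guarded call site in load_model: apply, then continue training setup.
model = nn.Sequential(nn.Linear(8, 8), nn.SiLU(), nn.Linear(8, 8))
model = apply_sketch_kernels(model)
print(model)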