Mirror of https://github.com/hiyouga/LLaMA-Factory.git, synced 2025-11-28 19:24:20 +08:00
[model] temporarily support npu fused options on v0, powered by v1 kernels (#9520)
Co-authored-by: frozenleaves <frozen@Mac.local>
commit 2b6f16f261 (parent f17efde693)
@@ -174,6 +174,10 @@ class BaseModelArguments:
         default=True,
         metadata={"help": "Whether or not to use KV cache in generation."},
     )
+    use_v1_kernels: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use high-performance kernels in training."},
+    )
     infer_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field(
         default="auto",
         metadata={"help": "Data type for model weights and activations at inference."},
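For context, here is a minimal sketch of how the new field surfaces to users. LLaMA-Factory parses its argument dataclasses with transformers' HfArgumentParser, so the field becomes a --use_v1_kernels option; the DemoModelArguments class below is a hypothetical stand-in for the real BaseModelArguments, not the actual class.

# Sketch only: a hypothetical stand-in dataclass showing how the flag is parsed.
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class DemoModelArguments:  # stand-in for BaseModelArguments (assumption)
    use_v1_kernels: bool = field(
        default=False,
        metadata={"help": "Whether or not to use high-performance kernels in training."},
    )


parser = HfArgumentParser(DemoModelArguments)
(args,) = parser.parse_args_into_dataclasses(["--use_v1_kernels", "true"])
print(args.use_v1_kernels)  # True

Because the default is False, the fused-kernel path is opt-in and existing configs are unaffected.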
@@ -213,6 +213,17 @@ def load_model(
     else:
         model.train()

+    # Borrow the v1 kernel plugin mechanism to temporarily apply NPU fusion operators on v0.
+    # This is off by default and can be removed once the transition period ends.
+    if model_args.use_v1_kernels and is_trainable:
+        logger.warning_rank0(
+            "You are enabling an experimental kernels feature. Note that it is not supported "
+            "for all models. If you encounter any error, please disable it or report the issue."
+        )
+        from ..v1.plugins.model_plugins.kernels.registry import apply_available_kernels
+
+        model = apply_available_kernels(model)
+
     trainable_params, all_param = count_parameters(model)
     if is_trainable:
         param_stats = (
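The registry import suggests a plugin-style dispatch: kernels register themselves, and apply_available_kernels walks the registry, letting each kernel decide whether it can patch the model. Below is a minimal sketch of that pattern; all names here (_KERNELS, register_kernel, the RMSNorm patch) are hypothetical illustrations, not the actual v1 plugin API.

# Hypothetical sketch of a kernel registry; not the actual v1 plugin API.
from typing import Callable, Dict

import torch
from torch import nn

_KERNELS: Dict[str, Callable[[nn.Module], nn.Module]] = {}


def register_kernel(name: str):
    # Register a model patch under a name; each patch guards its own availability.
    def decorator(fn: Callable[[nn.Module], nn.Module]) -> Callable[[nn.Module], nn.Module]:
        _KERNELS[name] = fn
        return fn

    return decorator


@register_kernel("npu_fused_rms_norm")
def patch_rms_norm(model: nn.Module) -> nn.Module:
    # Only patch when an NPU backend is actually present (torch_npu exposes torch.npu).
    if not (hasattr(torch, "npu") and torch.npu.is_available()):
        return model
    # ... swap eligible RMSNorm modules for the fused NPU implementation here ...
    return model


def apply_available_kernels(model: nn.Module) -> nn.Module:
    # Run every registered patch; kernels whose checks fail are no-ops.
    for patch in _KERNELS.values():
        model = patch(model)
    return model

A design like this keeps the v0 call site to a single apply_available_kernels(model) line, which matches the commit's stated goal of a temporary bridge that can be deleted after the transition period.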