[npu] Redirect SDPA to torch_npu.npu_fusion_attention (opt-in, ZeRO-3 safe, no impact off NPU) (#8972)

h7878778h
2025-09-30 18:11:31 +08:00
committed by GitHub
parent a04d777d7f
commit 09dedf144f
2 changed files with 148 additions and 0 deletions


@@ -188,6 +188,23 @@ def patch_model(
    if not model_args.use_unsloth:
        print_attn_implementation(model.config)

    # ======== NPU fused attention redirect: SDPA -> torch_npu.npu_fusion_attention ========
    # Placed after all structural modifications and before DeepSpeed/Trainer initialization;
    # it does not modify any Module/_parameters, so it is safe for ZeRO-3 + offload.
    try:
        import os
        import torch

        if hasattr(torch, "npu") and torch.npu.is_available() and os.environ.get("NPU_FA_DISABLE", "0") != "1":
            from .model_utils.sdpa_npu_redirect import apply_sdpa_npu_redirect

            apply_sdpa_npu_redirect(verbose=not model_args.use_unsloth)
            logger.info_rank0("[sdpa_npu_redirect] Enabled: SDPA will use Ascend npu_fusion_attention when available.")
    except Exception as e:
        logger.warning_rank0(f"[sdpa_npu_redirect] Failed to enable redirect, will keep native SDPA. Reason: {e}")
    # =====================================================================================

    try:
        model.add_model_tags(["llama-factory"])
    except Exception:
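
The redirected module itself (`model_utils/sdpa_npu_redirect.py`) is not shown in this hunk. For context, below is a minimal sketch of what `apply_sdpa_npu_redirect` could look like, assuming a `torch_npu` build that exposes `npu_fusion_attention(query, key, value, head_num, input_layout, ...)` and returns the attention output as the first element of a tuple; the helper names and mask translation are illustrative, not the committed implementation, and exact keywords vary by torch_npu release.

    # sketch only; assumed API: torch_npu.npu_fusion_attention, names are illustrative
    import torch
    import torch.nn.functional as F

    _native_sdpa = F.scaled_dot_product_attention  # fallback handle, captured before patching


    def _sdpa_via_npu_fusion_attention(query, key, value, attn_mask=None,
                                       dropout_p=0.0, is_causal=False, scale=None):
        # Fall back to native SDPA for anything the fused kernel cannot take.
        if query.device.type != "npu":
            return _native_sdpa(query, key, value, attn_mask=attn_mask,
                                dropout_p=dropout_p, is_causal=is_causal, scale=scale)
        import torch_npu

        head_num = query.shape[1]  # SDPA convention (B, N, S, D), i.e. "BNSD" layout
        if scale is None:
            scale = query.shape[-1] ** -0.5

        atten_mask = None
        if is_causal:
            # npu_fusion_attention drops positions where the mask is True,
            # so a causal mask is the strict upper triangle.
            s_q, s_kv = query.shape[-2], key.shape[-2]
            atten_mask = torch.ones(s_q, s_kv, dtype=torch.bool, device=query.device).triu(1)
        elif attn_mask is not None:
            if attn_mask.dtype != torch.bool:
                # Additive float masks are not translated in this sketch.
                return _native_sdpa(query, key, value, attn_mask=attn_mask,
                                    dropout_p=dropout_p, scale=scale)
            atten_mask = ~attn_mask  # SDPA keeps True positions; the NPU kernel masks them

        out = torch_npu.npu_fusion_attention(
            query, key, value, head_num, "BNSD",
            atten_mask=atten_mask, scale=scale, keep_prob=1.0 - dropout_p,
        )
        return out[0]  # first tuple element is the attention output


    def apply_sdpa_npu_redirect(verbose: bool = False) -> None:
        """Monkey-patch F.scaled_dot_product_attention; no Module state is touched."""
        F.scaled_dot_product_attention = _sdpa_via_npu_fusion_attention
        if verbose:
            print("[sdpa_npu_redirect] scaled_dot_product_attention now routes to NPU.")

Because only the module-level function is swapped, ZeRO-3 parameter partitioning never sees a changed module tree, which is what the commit title means by "ZeRO-3 safe". The patch in `patch_model` above also makes the feature easy to bypass: setting `NPU_FA_DISABLE=1` in the environment skips the redirect, and the surrounding `try/except` keeps native SDPA on any failure, so runs off NPU are unaffected.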