From 1857fbdd6be96844fb452693448fa8b6a865e754 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B5=AE=E6=A2=A6?=
 <46097299+frozenleaves@users.noreply.github.com>
Date: Mon, 29 Dec 2025 20:03:00 +0800
Subject: [PATCH] [ci] add cuda workflow (#9682)

Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
---
 .github/workflows/tests_cuda.yml | 88 ++++++++++++++++++++++++++++++++
 .github/workflows/tests_npu.yml  |  7 ++-
 tests/conftest.py                | 17 +++++-
 tests_v1/conftest.py             | 14 +++++
 4 files changed, 123 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/tests_cuda.yml

diff --git a/.github/workflows/tests_cuda.yml b/.github/workflows/tests_cuda.yml
new file mode 100644
index 000000000..544936082
--- /dev/null
+++ b/.github/workflows/tests_cuda.yml
@@ -0,0 +1,88 @@
+name: tests_cuda
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - "main"
+    paths:
+      - "**/*.py"
+      - "pyproject.toml"
+      - "Makefile"
+      - ".github/workflows/*.yml"
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - "**/*.py"
+      - "pyproject.toml"
+      - "Makefile"
+      - ".github/workflows/*.yml"
+
+jobs:
+  tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        python:
+          - "3.11"
+        os:
+          - "linux-x86_64-gpu-2"
+
+    runs-on: ${{ matrix.os }}
+
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: ${{ matrix.python }}
+          github-token: ${{ github.token }}
+          enable-cache: false
+
+      - name: Check GPU Status
+        run: nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          uv venv
+          uv pip install -e ".[dev]"
+
+      - name: Cache HuggingFace models
+        id: hf-hub-cache
+        uses: actions/cache@v4
+        with:
+          path: ${{ runner.temp }}/huggingface
+          key: hf-cache-${{ runner.os }}-${{ hashFiles('tests/version.txt') }}
+
+      - name: Check quality
+        run: |
+          make style && make quality
+        env:
+          UV_NO_SYNC: 1
+
+      - name: Check license
+        run: |
+          make license
+        env:
+          UV_NO_SYNC: 1
+
+      - name: Check build
+        run: |
+          make build
+        env:
+          UV_NO_SYNC: 1
+
+      - name: Test with pytest
+        run: |
+          make test
+        env:
+          UV_NO_SYNC: 1
+          HF_HOME: ${{ runner.temp }}/huggingface
+          HF_HUB_OFFLINE: "${{ steps.hf-hub-cache.outputs.cache-hit == 'true' && '1' || '0' }}"
diff --git a/.github/workflows/tests_npu.yml b/.github/workflows/tests_npu.yml
index 1f178ec7d..8a4674f5c 100644
--- a/.github/workflows/tests_npu.yml
+++ b/.github/workflows/tests_npu.yml
@@ -49,8 +49,11 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Install uv
-        run: |
-          curl -LsSf https://astral.sh/uv/install.sh | sh
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: ${{ matrix.python }}
+          github-token: ${{ github.token }}
+          enable-cache: false
 
       - name: Install dependencies
         run: |
diff --git a/tests/conftest.py b/tests/conftest.py
index 7220298fe..65c779fc2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,8 +18,11 @@ Contains shared fixtures, pytest configuration, and custom markers.
 """
 
 import os
+from typing import Optional
 
 import pytest
+import torch
+import torch.distributed as dist
 from pytest import Config, FixtureRequest, Item, MonkeyPatch
 
 from llamafactory.extras.misc import get_current_device, get_device_count, is_env_enabled
@@ -70,7 +73,7 @@ def _handle_slow_tests(items: list[Item]):
                 item.add_marker(skip_slow)
 
 
-def _get_visible_devices_env() -> str | None:
+def _get_visible_devices_env() -> Optional[str]:
     """Return device visibility env var name."""
     if CURRENT_DEVICE == "cuda":
         return "CUDA_VISIBLE_DEVICES"
@@ -118,6 +121,14 @@ def pytest_collection_modifyitems(config: Config, items: list[Item]):
     _handle_device_visibility(items)
 
 
+@pytest.fixture(autouse=True)
+def _cleanup_distributed_state():
+    """Destroy any torch.distributed process group still initialized after each test."""
+    yield
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
 @pytest.fixture(autouse=True)
 def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -> None:
     """Set environment variables for distributed tests if specific devices are requested."""
@@ -145,6 +156,10 @@ def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -
             monkeypatch.setenv(env_key, visible_devices[0] if visible_devices else "0")
         else:
             monkeypatch.setenv(env_key, "0")
+        if CURRENT_DEVICE == "cuda":
+            monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+        elif CURRENT_DEVICE == "npu":
+            monkeypatch.setattr(torch.npu, "device_count", lambda: 1)
 
 
 @pytest.fixture
diff --git a/tests_v1/conftest.py b/tests_v1/conftest.py
index 453a85e78..018d723a8 100644
--- a/tests_v1/conftest.py
+++ b/tests_v1/conftest.py
@@ -18,8 +18,10 @@ Contains shared fixtures, pytest configuration, and custom markers.
 """
 
 import os
+import sys
 
 import pytest
+import torch
 from pytest import Config, FixtureRequest, Item, MonkeyPatch
 
 from llamafactory.v1.accelerator.helper import get_current_accelerator, get_device_count
@@ -139,9 +141,21 @@ def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -
             devices_str = ",".join(str(i) for i in range(required))
 
         monkeypatch.setenv(env_key, devices_str)
+
+        # Add the project root to sys.path and PYTHONPATH for multi-process (mp) runs.
+        project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+        if project_root not in sys.path:
+            sys.path.insert(0, project_root)
+
+        os.environ["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")
+
     else:  # non-distributed test
         if old_value:
             visible_devices = [v for v in old_value.split(",") if v != ""]
             monkeypatch.setenv(env_key, visible_devices[0] if visible_devices else "0")
         else:
             monkeypatch.setenv(env_key, "0")
+        if CURRENT_DEVICE == "cuda":
+            monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+        elif CURRENT_DEVICE == "npu":
+            monkeypatch.setattr(torch.npu, "device_count", lambda: 1)