[ci] add cuda workflow (#9682)

Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2026-02-26 15:56:00 +08:00 · 2025-12-29 20:03:00 +08:00
parent bb1ba31005
commit 1857fbdd6b
4 changed files with 123 additions and 3 deletions
--- a/.github/workflows/tests_cuda.yml
+++ b/.github/workflows/tests_cuda.yml
@@ -0,0 +1,88 @@
+name: tests_cuda
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - "main"
+    paths:
+      - "**/*.py"
+      - "pyproject.toml"
+      - "Makefile"
+      - ".github/workflows/*.yml"
+  pull_request:
+    branches:
+      - "main"
+    paths:
+      - "**/*.py"
+      - "pyproject.toml"
+      - "Makefile"
+      - ".github/workflows/*.yml"
+
+permissions:
+  contents: read  # least privilege: every step only reads the repository
+
+jobs:
+  tests:
+    timeout-minutes: 60  # release the GPU runner if a run hangs (GitHub default: 360)
+    strategy:
+      fail-fast: false
+      matrix:
+        python:
+          - "3.11"
+        os:
+          - "linux-x86_64-gpu-2"
+
+    runs-on: ${{ matrix.os }}
+
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}  # never cancel runs on main
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: ${{ matrix.python }}
+          github-token: ${{ github.token }}
+          enable-cache: false
+
+      - name: Check GPU Status
+        run: nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          uv venv
+          uv pip install -e ".[dev]"
+
+      - name: Cache HuggingFace models
+        id: hf-hub-cache
+        uses: actions/cache@v4
+        with:
+          path: ${{ runner.temp }}/huggingface  # must match HF_HOME in the pytest step
+          key: hf-cache-${{ runner.os }}-${{ hashFiles('tests/version.txt') }}
+
+      - name: Check quality
+        run: make style && make quality
+        env:
+          UV_NO_SYNC: "1"
+
+      - name: Check license
+        run: make license
+        env:
+          UV_NO_SYNC: "1"
+
+      - name: Check build
+        run: make build
+        env:
+          UV_NO_SYNC: "1"
+
+      - name: Test with pytest
+        run: make test
+        env:
+          UV_NO_SYNC: "1"
+          HF_HOME: ${{ runner.temp }}/huggingface
+          HF_HUB_OFFLINE: "${{ steps.hf-hub-cache.outputs.cache-hit == 'true' && '1' || '0' }}"
--- a/.github/workflows/tests_npu.yml
+++ b/.github/workflows/tests_npu.yml
@@ -49,8 +49,11 @@ jobs:
        uses: actions/checkout@v4

      - name: Install uv
-        run: |
-          curl -LsSf https://astral.sh/uv/install.sh | sh
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: ${{ matrix.python }}
+          github-token: ${{ github.token }}
+          enable-cache: false

      - name: Install dependencies
        run: |
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,8 +18,11 @@ Contains shared fixtures, pytest configuration, and custom markers.
 """

 import os
+from typing import Optional

 import pytest
+import torch
+import torch.distributed as dist
 from pytest import Config, FixtureRequest, Item, MonkeyPatch

 from llamafactory.extras.misc import get_current_device, get_device_count, is_env_enabled
@@ -70,7 +73,7 @@ def _handle_slow_tests(items: list[Item]):
                item.add_marker(skip_slow)


-def _get_visible_devices_env() -> str | None:
+def _get_visible_devices_env() -> Optional[str]:
    """Return device visibility env var name."""
    if CURRENT_DEVICE == "cuda":
        return "CUDA_VISIBLE_DEVICES"
@@ -118,6 +121,14 @@ def pytest_collection_modifyitems(config: Config, items: list[Item]):
    _handle_device_visibility(items)


+@pytest.fixture(autouse=True)
+def _cleanup_distributed_state():
+    """Cleanup distributed state after each test."""
+    yield
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
@pytest.fixture(autouse=True)
 def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -> None:
    """Set environment variables for distributed tests if specific devices are requested."""
@@ -145,6 +156,10 @@ def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -
            monkeypatch.setenv(env_key, visible_devices[0] if visible_devices else "0")
        else:
            monkeypatch.setenv(env_key, "0")
+        if CURRENT_DEVICE == "cuda":
+            monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+        elif CURRENT_DEVICE == "npu":
+            monkeypatch.setattr(torch.npu, "device_count", lambda: 1)


@pytest.fixture
--- a/tests_v1/conftest.py
+++ b/tests_v1/conftest.py
@@ -18,8 +18,10 @@ Contains shared fixtures, pytest configuration, and custom markers.
 """

 import os
+import sys

 import pytest
+import torch
 from pytest import Config, FixtureRequest, Item, MonkeyPatch

 from llamafactory.v1.accelerator.helper import get_current_accelerator, get_device_count
@@ -139,9 +141,21 @@ def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -
            devices_str = ",".join(str(i) for i in range(required))

        monkeypatch.setenv(env_key, devices_str)
+
+        # add project root dir to path for mp run
+        project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+        if project_root not in sys.path:
+            sys.path.insert(0, project_root)
+
+        os.environ["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")
+
    else:  # non-distributed test
        if old_value:
            visible_devices = [v for v in old_value.split(",") if v != ""]
            monkeypatch.setenv(env_key, visible_devices[0] if visible_devices else "0")
        else:
            monkeypatch.setenv(env_key, "0")
+        if CURRENT_DEVICE == "cuda":
+            monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+        elif CURRENT_DEVICE == "npu":
+            monkeypatch.setattr(torch.npu, "device_count", lambda: 1)