[ci] add cuda workflow (#9682)

Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
This commit is contained in:
浮梦
2025-12-29 20:03:00 +08:00
committed by GitHub
parent bb1ba31005
commit 1857fbdd6b
4 changed files with 123 additions and 3 deletions

88
.github/workflows/tests_cuda.yml vendored Normal file
View File

@@ -0,0 +1,88 @@
---
# CI workflow: run lint/license/build checks and the test suite on a
# self-hosted CUDA GPU runner. Triggered manually, or on pushes / PRs
# to main that touch Python sources, packaging, the Makefile, or any
# workflow file.
name: tests_cuda

on:
  workflow_dispatch:
  push:
    branches:
      - "main"
    paths:
      - "**/*.py"
      - "pyproject.toml"
      - "Makefile"
      - ".github/workflows/*.yml"
  pull_request:
    branches:
      - "main"
    paths:
      - "**/*.py"
      - "pyproject.toml"
      - "Makefile"
      - ".github/workflows/*.yml"

jobs:
  tests:
    strategy:
      fail-fast: false
      matrix:
        python:
          - "3.11"
        os:
          - "linux-x86_64-gpu-2"
    runs-on: ${{ matrix.os }}
    # Job-level concurrency: cancel superseded runs for the same ref,
    # but never cancel in-progress runs on main.
    concurrency:
      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          python-version: ${{ matrix.python }}
          github-token: ${{ github.token }}
          enable-cache: false

      - name: Check GPU Status
        run: nvidia-smi

      - name: Install dependencies
        run: |
          uv venv
          uv pip install -e ".[dev]"

      - name: Cache HuggingFace models
        id: hf-hub-cache
        uses: actions/cache@v4
        with:
          path: ${{ runner.temp }}/huggingface
          key: hf-cache-${{ runner.os }}-${{ hashFiles('tests/version.txt') }}
          # When tests/version.txt changes, restore the newest previous
          # cache for this OS so model downloads start warm, not cold.
          # cache-hit stays 'false' on such partial hits, so the offline
          # gate below still allows the missing downloads.
          restore-keys: |
            hf-cache-${{ runner.os }}-

      - name: Check quality
        run: |
          make style && make quality
        env:
          UV_NO_SYNC: 1

      - name: Check license
        run: |
          make license
        env:
          UV_NO_SYNC: 1

      - name: Check build
        run: |
          make build
        env:
          UV_NO_SYNC: 1

      - name: Test with pytest
        run: |
          make test
        env:
          UV_NO_SYNC: 1
          HF_HOME: ${{ runner.temp }}/huggingface
          # Offline only on an exact primary-key cache hit.
          HF_HUB_OFFLINE: "${{ steps.hf-hub-cache.outputs.cache-hit == 'true' && '1' || '0' }}"

View File

@@ -49,8 +49,11 @@ jobs:
         uses: actions/checkout@v4
       - name: Install uv
-        run: |
-          curl -LsSf https://astral.sh/uv/install.sh | sh
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: ${{ matrix.python }}
+          github-token: ${{ github.token }}
+          enable-cache: false
       - name: Install dependencies
         run: |

View File

@@ -18,8 +18,11 @@ Contains shared fixtures, pytest configuration, and custom markers.
 """

 import os
+from typing import Optional

 import pytest
+import torch
+import torch.distributed as dist
 from pytest import Config, FixtureRequest, Item, MonkeyPatch

 from llamafactory.extras.misc import get_current_device, get_device_count, is_env_enabled
@@ -70,7 +73,7 @@ def _handle_slow_tests(items: list[Item]):
             item.add_marker(skip_slow)


-def _get_visible_devices_env() -> str | None:
+def _get_visible_devices_env() -> Optional[str]:
     """Return device visibility env var name."""
     if CURRENT_DEVICE == "cuda":
         return "CUDA_VISIBLE_DEVICES"
@@ -118,6 +121,14 @@ def pytest_collection_modifyitems(config: Config, items: list[Item]):
     _handle_device_visibility(items)


+@pytest.fixture(autouse=True)
+def _cleanup_distributed_state():
+    """Cleanup distributed state after each test."""
+    yield
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
 @pytest.fixture(autouse=True)
 def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -> None:
     """Set environment variables for distributed tests if specific devices are requested."""
@@ -145,6 +156,10 @@ def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -
             monkeypatch.setenv(env_key, visible_devices[0] if visible_devices else "0")
         else:
             monkeypatch.setenv(env_key, "0")
+    if CURRENT_DEVICE == "cuda":
+        monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+    elif CURRENT_DEVICE == "npu":
+        monkeypatch.setattr(torch.npu, "device_count", lambda: 1)


 @pytest.fixture

View File

@@ -18,8 +18,10 @@ Contains shared fixtures, pytest configuration, and custom markers.
 """

 import os
+import sys

 import pytest
+import torch
 from pytest import Config, FixtureRequest, Item, MonkeyPatch

 from llamafactory.v1.accelerator.helper import get_current_accelerator, get_device_count
@@ -139,9 +141,21 @@ def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -
             devices_str = ",".join(str(i) for i in range(required))
             monkeypatch.setenv(env_key, devices_str)
+
+        # add project root dir to path for mp run
+        project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+        if project_root not in sys.path:
+            sys.path.insert(0, project_root)
+        os.environ["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")
     else:  # non-distributed test
         if old_value:
             visible_devices = [v for v in old_value.split(",") if v != ""]
             monkeypatch.setenv(env_key, visible_devices[0] if visible_devices else "0")
         else:
             monkeypatch.setenv(env_key, "0")
+
+    if CURRENT_DEVICE == "cuda":
+        monkeypatch.setattr(torch.cuda, "device_count", lambda: 1)
+    elif CURRENT_DEVICE == "npu":
+        monkeypatch.setattr(torch.npu, "device_count", lambda: 1)