From 1857fbdd6be96844fb452693448fa8b6a865e754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=AE=E6=A2=A6?= <46097299+frozenleaves@users.noreply.github.com> Date: Mon, 29 Dec 2025 20:03:00 +0800 Subject: [PATCH] [ci] add cuda workflow (#9682) Co-authored-by: frozenleaves Co-authored-by: Yaowei Zheng --- .github/workflows/tests_cuda.yml | 88 ++++++++++++++++++++++++++++++++ .github/workflows/tests_npu.yml | 7 ++- tests/conftest.py | 17 +++++- tests_v1/conftest.py | 14 +++++ 4 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/tests_cuda.yml diff --git a/.github/workflows/tests_cuda.yml b/.github/workflows/tests_cuda.yml new file mode 100644 index 000000000..544936082 --- /dev/null +++ b/.github/workflows/tests_cuda.yml @@ -0,0 +1,88 @@ +name: tests_cuda + +on: + workflow_dispatch: + push: + branches: + - "main" + paths: + - "**/*.py" + - "pyproject.toml" + - "Makefile" + - ".github/workflows/*.yml" + pull_request: + branches: + - "main" + paths: + - "**/*.py" + - "pyproject.toml" + - "Makefile" + - ".github/workflows/*.yml" + +jobs: + tests: + strategy: + fail-fast: false + matrix: + python: + - "3.11" + os: + - "linux-x86_64-gpu-2" + + runs-on: ${{ matrix.os }} + + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.python }} + github-token: ${{ github.token }} + enable-cache: false + + - name: Check GPU Status + run: nvidia-smi + + - name: Install dependencies + run: | + uv venv + uv pip install -e ".[dev]" + + - name: Cache HuggingFace models + id: hf-hub-cache + uses: actions/cache@v4 + with: + path: ${{ runner.temp }}/huggingface + key: hf-cache-${{ runner.os }}-${{ hashFiles('tests/version.txt') }} + + - name: Check quality + run: | + make style && make quality + env: + 
UV_NO_SYNC: 1 + + - name: Check license + run: | + make license + env: + UV_NO_SYNC: 1 + + - name: Check build + run: | + make build + env: + UV_NO_SYNC: 1 + + - name: Test with pytest + run: | + make test + env: + UV_NO_SYNC: 1 + HF_HOME: ${{ runner.temp }}/huggingface + HF_HUB_OFFLINE: "${{ steps.hf-hub-cache.outputs.cache-hit == 'true' && '1' || '0' }}" diff --git a/.github/workflows/tests_npu.yml b/.github/workflows/tests_npu.yml index 1f178ec7d..8a4674f5c 100644 --- a/.github/workflows/tests_npu.yml +++ b/.github/workflows/tests_npu.yml @@ -49,8 +49,11 @@ jobs: uses: actions/checkout@v4 - name: Install uv - run: | - curl -LsSf https://astral.sh/uv/install.sh | sh + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.python }} + github-token: ${{ github.token }} + enable-cache: false - name: Install dependencies run: | diff --git a/tests/conftest.py b/tests/conftest.py index 7220298fe..65c779fc2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,8 +18,11 @@ Contains shared fixtures, pytest configuration, and custom markers. 
""" import os +from typing import Optional import pytest +import torch +import torch.distributed as dist from pytest import Config, FixtureRequest, Item, MonkeyPatch from llamafactory.extras.misc import get_current_device, get_device_count, is_env_enabled @@ -70,7 +73,7 @@ def _handle_slow_tests(items: list[Item]): item.add_marker(skip_slow) -def _get_visible_devices_env() -> str | None: +def _get_visible_devices_env() -> Optional[str]: """Return device visibility env var name.""" if CURRENT_DEVICE == "cuda": return "CUDA_VISIBLE_DEVICES" @@ -118,6 +121,14 @@ def pytest_collection_modifyitems(config: Config, items: list[Item]): _handle_device_visibility(items) +@pytest.fixture(autouse=True) +def _cleanup_distributed_state(): + """Destroy the torch.distributed process group after each test if one is still initialized.""" + yield + if dist.is_initialized(): + dist.destroy_process_group() + + @pytest.fixture(autouse=True) def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -> None: """Set environment variables for distributed tests if specific devices are requested.""" @@ -145,6 +156,10 @@ def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) - monkeypatch.setenv(env_key, visible_devices[0] if visible_devices else "0") else: monkeypatch.setenv(env_key, "0") + if CURRENT_DEVICE == "cuda": + monkeypatch.setattr(torch.cuda, "device_count", lambda: 1) + elif CURRENT_DEVICE == "npu": + monkeypatch.setattr(torch.npu, "device_count", lambda: 1) @pytest.fixture diff --git a/tests_v1/conftest.py b/tests_v1/conftest.py index 453a85e78..018d723a8 100644 --- a/tests_v1/conftest.py +++ b/tests_v1/conftest.py @@ -18,8 +18,10 @@ Contains shared fixtures, pytest configuration, and custom markers. 
""" import os +import sys import pytest +import torch from pytest import Config, FixtureRequest, Item, MonkeyPatch from llamafactory.v1.accelerator.helper import get_current_accelerator, get_device_count @@ -139,9 +141,21 @@ def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) - devices_str = ",".join(str(i) for i in range(required)) monkeypatch.setenv(env_key, devices_str) + + # expose the project root to spawned mp workers; monkeypatch restores PYTHONPATH after the test + project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + if project_root not in sys.path: + sys.path.insert(0, project_root) + + monkeypatch.setenv("PYTHONPATH", project_root, prepend=os.pathsep) + else: # non-distributed test if old_value: visible_devices = [v for v in old_value.split(",") if v != ""] monkeypatch.setenv(env_key, visible_devices[0] if visible_devices else "0") else: monkeypatch.setenv(env_key, "0") + if CURRENT_DEVICE == "cuda": + monkeypatch.setattr(torch.cuda, "device_count", lambda: 1) + elif CURRENT_DEVICE == "npu": + monkeypatch.setattr(torch.npu, "device_count", lambda: 1)