diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 90e9ef87..f5e099fa 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -21,10 +21,17 @@ on:
 
 jobs:
   build:
+    strategy:  # run the build job once per target device
+      fail-fast: false  # let the other device's build finish if one fails
+      matrix:
+        device:
+          - "cuda"  # selects the "Build and push Docker image (CUDA)" step
+          - "npu"  # selects the Quay login and the NPU build step
+
     runs-on: ubuntu-latest
 
     concurrency:
-      group: ${{ github.workflow }}-${{ github.ref }}
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.device }}  # one group per device so the matrix jobs do not cancel each other
       cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 
     environment:
@@ -33,12 +40,10 @@ jobs:
 
     steps:
       - name: Free up disk space
-        run: |
-          df -h
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /opt/hostedtoolcache
-          df -h
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+        with:
+          tool-cache: true  # also clear the hosted tool cache, as the removed script did for /opt/hostedtoolcache
+          docker-images: false  # NOTE(review): presumably keeps the runner's preinstalled Docker images - confirm against the action's README
 
       - name: Checkout
         uses: actions/checkout@v4
@@ -57,13 +62,22 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Login to Docker Hub
-        if: github.event_name != 'pull_request'
+        if: ${{ github.event_name != 'pull_request' }}  # no registry login on pull requests
         uses: docker/login-action@v3
         with:
           username: ${{ vars.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
-      - name: Build and push Docker image
+      - name: Login to Quay
+        if: ${{ github.event_name != 'pull_request' && matrix.device == 'npu' }}  # the NPU build step is the one that pushes quay.io tags
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ vars.QUAY_ASCEND_USERNAME }}
+          password: ${{ secrets.QUAY_ASCEND_TOKEN }}
+
+      - name: Build and push Docker image (CUDA)
+        if: ${{ matrix.device == 'cuda' }}
         uses: docker/build-push-action@v6
         with:
           context: .
@@ -76,3 +90,19 @@ jobs:
             docker.io/hiyouga/llamafactory:${{ steps.version.outputs.tag }}
           cache-from: type=gha
           cache-to: type=gha,mode=max
+
+      - name: Build and push Docker image (NPU)
+        if: ${{ matrix.device == 'npu' }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          platforms: linux/amd64,linux/arm64
+          file: ./docker/docker-npu/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}  # pull requests build only, nothing is published
+          tags: |
+            docker.io/hiyouga/llamafactory:latest-npu-a2
+            docker.io/hiyouga/llamafactory:${{ steps.version.outputs.tag }}-npu-a2
+            quay.io/ascend/llamafactory:latest-npu-a2
+            quay.io/ascend/llamafactory:${{ steps.version.outputs.tag }}-npu-a2
+          cache-from: type=gha,scope=npu  # dedicated scope: the default one is shared with the CUDA job and would be overwritten
+          cache-to: type=gha,mode=max,scope=npu
diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile
index 40880dbc..7cd5e8f4 100644
--- a/docker/docker-npu/Dockerfile
+++ b/docker/docker-npu/Dockerfile
@@ -1,11 +1,12 @@
 # https://hub.docker.com/r/ascendai/cann/tags
-ARG BASE_IMAGE=ascendai/cann:8.0.0-910b-ubuntu22.04-py3.11
+ARG BASE_IMAGE=ascendai/cann:8.1.rc1-910b-ubuntu22.04-py3.11
 FROM ${BASE_IMAGE}
 
 # Installation arguments
 ARG PIP_INDEX=https://pypi.org/simple
 ARG EXTRAS=torch-npu,metrics
 ARG HTTP_PROXY=""
+ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cpu
 
 # Define environments
 ENV MAX_JOBS=16
@@ -28,6 +29,10 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \
     pip config set global.extra-index-url "${PIP_INDEX}" && \
     pip install --no-cache-dir --upgrade pip packaging wheel setuptools
 
+# Install torch-npu after removing any existing torch/torchvision/torchaudio; --index-url points pip at PYTORCH_INDEX (CPU wheel index by default). NOTE(review): torch-npu itself is presumably resolved via the extra index configured above - confirm
+RUN pip uninstall -y torch torchvision torchaudio && \
+    pip install --no-cache-dir "torch-npu==2.5.1" "torchvision==0.20.1" --index-url "${PYTORCH_INDEX}"
+
 # Install the requirements
 COPY requirements.txt /app
 RUN pip install --no-cache-dir -r requirements.txt
diff --git a/setup.py b/setup.py
index 3c5b445b..73f9424e 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,7 @@ def get_console_scripts() -> list[str]:
 
 extra_require = {
     "torch": ["torch>=2.0.0", "torchvision>=0.15.0"],
-    "torch-npu": ["torch==2.5.1", "torchvision==0.20.1", "torch-npu==2.5.1", "decorator"],
+    "torch-npu": ["torch-npu==2.5.1", "torchvision==0.20.1", "decorator"],
     "metrics": ["nltk", "jieba", "rouge-chinese"],
     "deepspeed": ["deepspeed>=0.10.0,<=0.16.9"],
     "liger-kernel": ["liger-kernel>=0.5.5"],