53 Commits

Author SHA1 Message Date
jiaqiw09
f80e15dbb4 [ci] fix ut huggingface hub 429 error when transformers>=5.0.0 (#10155) 2026-02-12 22:14:10 +08:00
sunyi0505
991267fd3b [v1] support quantization (#10161) 2026-02-12 20:37:41 +08:00
浮梦
5c52afa30d [v1] support deepspeed (#10181) 2026-02-12 17:24:30 +08:00
Junyou Su
675ce8cc7f [algo] add ASFT (#10174) 2026-02-12 13:12:14 +08:00
jiaqiw09
ab073f4c13 [v1] add LoRA/Freeze support and merge workflow (#10157) 2026-02-12 13:02:09 +08:00
Shanay Mehta
184304b5b4 [model] add liger kernel support for Qwen3-Next (#10176)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 21:47:48 +08:00
Xue Yadong
d3ebd5678d [model] support GLM-OCR SFT (#10183) 2026-02-10 21:41:01 +08:00
浮梦
1d5e8ebcd0 [v1] init commit for v1 docs (#10145)
Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: jiaqiw09 <jiaqiw960714@gmail.com>
Co-authored-by: jiaqiw09 <60021713+jiaqiw09@users.noreply.github.com>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2026-02-09 19:43:55 +08:00
Shanay Mehta
ea644d04ec [model] support GLM-4.7-Flash SFT (#10173)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 10:40:44 +08:00
Username_Full
92fa3df4c4 [trainer] add dpo/kto fsdp fsdp2 support (#10127) 2026-02-04 23:27:12 +08:00
Hertz
8bedfafa4e [model] support MiniCPM-o-4.5 (#10163)
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2026-02-04 23:21:27 +08:00
Yaowei Zheng
1a02717fa8 [assets] update readme (#10159) 2026-02-03 19:11:15 +08:00
ゆり
e7cb145f5d [logging] Fix race condition in LoggerHandler during multi-GPU training (#10156)
Co-authored-by: yurekami <yurekami@users.noreply.github.com>
2026-02-03 11:14:07 +08:00
Hertz
b53d7037c2 [model] support youtu-vl model (#10152) 2026-02-02 21:42:43 +08:00
浮梦
bf04ca6af8 [deps] adapt to transformers v5 (#10147)
Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: hiyouga <hiyouga@buaa.edu.cn>
2026-02-02 12:07:19 +08:00
xvxuopop
762b480131 [feature] support using ray.remote to start distributed training. (#10109) 2026-01-28 16:05:29 +08:00
Jewon Lee
9640f79ae5 [fix] add visual.pos_embed to Qwen3-VL visual model keys (#10139) 2026-01-27 16:33:01 +08:00
jiaqiw09
7ef19eea00 [v0] Fix reward model training safetensors saving (#10137) 2026-01-27 16:27:14 +08:00
浮梦
f9f11dcb97 [v1] support training with fsdp2 (#9773)
Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2026-01-25 19:41:58 +08:00
Pádraic Slattery
641bfdd482 chore: Update outdated GitHub Actions versions (#10123) 2026-01-25 19:12:39 +08:00
Meng WANG
e70651ac58 [feat] support all_exhausted_without_replacement in datasets.interleave_datasets (#10112) 2026-01-20 15:54:07 +08:00
Kingsley
db2f794f7b [misc] update mcore related docker and mca supported models (#10114) 2026-01-19 14:55:16 +08:00
jiaqiw09
44eadbda1c [v1] fix kernel moe patch (#9867) 2026-01-17 09:24:54 +08:00
浮梦
9829ae0a77 [ci] using mp to run kernel test (#9754)
Co-authored-by: frozenleaves <frozen@Mac.local>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2026-01-13 19:43:59 +08:00
Yaowei Zheng
958b9c3468 [v1] add sft (#9752) 2026-01-12 03:15:01 +08:00
Hertz
4d3621e3d3 [model] fixed&added Hunyuan models (#9750) 2026-01-12 01:15:00 +08:00
Yaowei Zheng
a296723697 [v1] upgrade batching (#9751)
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-12 00:21:36 +08:00
Hertz
15b87f3125 [model] support HY-MT model (#9746)
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2026-01-11 16:25:56 +08:00
Yaowei Zheng
9f73a6eb23 [deps] fix package (#9745) 2026-01-10 04:27:53 +08:00
Yaowei Zheng
b2effbd77c [v1] add batch generator (#9744) 2026-01-10 04:24:09 +08:00
Yaowei Zheng
d7d734d54c [misc] fix fp8 (#9742) 2026-01-09 16:17:26 +08:00
Yaowei Zheng
8abb8fb533 [v1] use async streamer (#9741) 2026-01-09 16:12:07 +08:00
Yaowei Zheng
766d5ae6ad [ci] fix workflow (#9738) 2026-01-09 16:12:07 +08:00
Yaowei Zheng
5cccaeec82 [model] clean obsolete models (#9736) 2026-01-09 16:12:07 +08:00
Jackey
5fb5d7ebd3 [model] support for microsoft's Phi-4-mini (#9734) 2026-01-09 12:24:45 +08:00
Peilin Li
03a70ba8dd [fix] correct ktransformers example config paths and templates (#9732) 2026-01-08 10:52:50 +08:00
Vo Van Phuc
5cfd804b59 [refactor] rename lfm template to lfm2 and add LFM 2.5 to README (#9731) 2026-01-07 19:25:04 +08:00
Yaowei Zheng
4c1eb922e2 [misc] fix parser (#9730) 2026-01-07 17:36:08 +08:00
Vo Van Phuc
958fb523a2 [model] support LiquidAI's LFM2.5-VL vision-language model (#9729) 2026-01-07 17:20:29 +08:00
Vo Van Phuc
b4e051bea4 [model] support for LiquidAI's LFM2.5 (Liquid Foundation Models) (#9726) 2026-01-07 14:14:47 +08:00
浮梦
d43e1007e8 [ci] improve cuda ci cache (#9725)
Co-authored-by: frozenleaves <frozen@Mac.local>
2026-01-07 12:34:40 +08:00
Xunpeng Xiao
f89d9367e5 [assets] update README.md (#9724) 2026-01-07 12:11:50 +08:00
Yaowei Zheng
d22de0d4bf [v1] add renderer ut (#9722) 2026-01-07 02:06:07 +08:00
Yaowei Zheng
ea0b4e2466 [v1] add cli sampler (#9721) 2026-01-06 23:31:27 +08:00
yanglele
e944dc442c [feature] add support for EAFT loss (#9720)
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-06 23:07:12 +08:00
Xunpeng Xiao
68119e5522 [misc] Add a PyTorch version warning for Conv3D. (#9715) 2026-01-05 13:26:29 +08:00
Yaowei Zheng
f60a6e3d01 [v1] add init plugin (#9716) 2026-01-04 20:51:46 +08:00
jiaqiw09
81b8a50aa5 [deps] Update pyproject.toml and requirements (#9714)
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2026-01-04 19:52:16 +08:00
Yaowei Zheng
8600530002 [misc] lint (#9710) 2026-01-04 13:47:56 +08:00
Hertz
9ae62c6fc0 [model] support Youtu-LLM-2B (#9707) 2026-01-04 13:17:57 +08:00
Xunpeng Xiao
0087bc253b [misc] Compatible with an empty architectures field in config.json (#9709) 2026-01-04 12:11:35 +08:00
Santosh Bhavani
355d5c5e5a [fix] fp8: add Transformer Engine backend support (#9705)
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2026-01-01 10:18:02 +08:00
Yaowei Zheng
6fe6bd290b [misc] set dev version (#9703) 2025-12-31 23:41:40 +08:00
234 changed files with 8362 additions and 2804 deletions


@@ -50,7 +50,7 @@ jobs:
           docker-images: false
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
       - name: Get llamafactory version
         id: version

.github/workflows/docs.yml (new file, 77 lines)

@@ -0,0 +1,77 @@
name: Build and Deploy Sphinx Docs

on:
  push:
    branches: ["main"]
    paths:
      - "docs/**"
  pull_request:
    branches: ["main"]
    paths:
      - "docs/**"
  workflow_dispatch:

permissions:
  contents: read
  pages: write
  id-token: write

concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: |
          pip install -r docs/requirements.txt
      - name: Build Sphinx
        run: |
          sphinx-build -b html docs/zh docs/_build/html/zh
          sphinx-build -b html docs/en docs/_build/html/en
          printf '%s\n' \
            '<!DOCTYPE html>' \
            '<html>' \
            ' <head>' \
            ' <meta charset="utf-8" />' \
            ' <meta http-equiv="refresh" content="0; url=zh/index.html" />' \
            ' <script>window.location.href="zh/index.html"+window.location.search+window.location.hash;</script>' \
            ' <title>Redirecting...</title>' \
            ' </head>' \
            ' <body>' \
            ' <a href="zh/index.html">Redirecting...</a>' \
            ' </body>' \
            '</html>' \
            > docs/_build/html/index.html
          touch docs/_build/html/.nojekyll
      - name: Setup Pages
        uses: actions/configure-pages@v5
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: docs/_build/html
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    needs: build
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
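
The build job above can be replayed locally to check a docs change before pushing; a minimal sketch, assuming a repository checkout and that `docs/requirements.txt` pins the Sphinx toolchain the workflow installs:

```bash
# Mirror the workflow's "Install dependencies" and "Build Sphinx" steps.
pip install -r docs/requirements.txt
sphinx-build -b html docs/zh docs/_build/html/zh
sphinx-build -b html docs/en docs/_build/html/en
# Local preview (not part of the workflow): serve the built pages.
python3 -m http.server 8000 --directory docs/_build/html
```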


@@ -21,7 +21,7 @@ jobs:
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
       - name: Install uv
         uses: astral-sh/setup-uv@v7


@@ -54,10 +54,11 @@
     env:
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
       OS_NAME: ${{ matrix.os }}
+      UV_NO_SYNC: 1
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
       - name: Install uv
         uses: astral-sh/setup-uv@v7
@@ -70,7 +71,8 @@
         run: |
           uv venv
           uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          uv pip install -e ".[dev]"
+          uv pip install -e .
+          uv pip install -r requirements/dev.txt
       - name: Install transformers
         if: ${{ matrix.transformers }}
@@ -79,7 +81,7 @@
       - name: Cache files
         id: hf-hub-cache
-        uses: actions/cache@v4
+        uses: actions/cache@v5
         with:
           path: ${{ runner.temp }}/huggingface
           key: huggingface-${{ matrix.os }}-${{ matrix.python }}-${{ matrix.transformers }}-${{ hashFiles('tests/version.txt') }}
@@ -87,25 +89,18 @@
       - name: Check quality
         run: |
           make style && make quality
-        env:
-          UV_NO_SYNC: 1
       - name: Check license
         run: |
           make license
-        env:
-          UV_NO_SYNC: 1
       - name: Check build
         run: |
           make build
-        env:
-          UV_NO_SYNC: 1
       - name: Test with pytest
         run: |
           make test
         env:
-          UV_NO_SYNC: 1
           HF_HOME: ${{ runner.temp }}/huggingface
           HF_HUB_OFFLINE: "${{ steps.hf-hub-cache.outputs.cache-hit == 'true' && '1' || '0' }}"


@@ -35,9 +35,16 @@
       group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python }}
       cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    env:
+      HF_HOME: "${{ github.workspace }}/../.runner_cache/huggingface"
+      UV_CACHE_DIR: "${{ github.workspace }}/../.runner_cache/uv"
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      OS_NAME: ${{ matrix.os }}
+      UV_NO_SYNC: 1
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
       - name: Install uv
         uses: astral-sh/setup-uv@v7
@@ -52,37 +59,22 @@
       - name: Install dependencies
         run: |
           uv venv
-          uv pip install -e ".[dev]"
+          uv pip install -e .
+          uv pip install -r requirements/dev.txt
+          uv pip install -r requirements/bitsandbytes.txt
-      - name: Cache HuggingFace models
-        id: hf-hub-cache
-        uses: actions/cache@v4
-        with:
-          path: ${{ runner.temp }}/huggingface
-          key: hf-cache-${{ runner.os }}-${{ hashFiles('tests/version.txt') }}
       - name: Check quality
         run: |
           make style && make quality
-        env:
-          UV_NO_SYNC: 1
       - name: Check license
         run: |
           make license
-        env:
-          UV_NO_SYNC: 1
       - name: Check build
         run: |
           make build
-        env:
-          UV_NO_SYNC: 1
       - name: Test with pytest
         run: |
           make test
-        env:
-          UV_NO_SYNC: 1
-          HF_HOME: ${{ runner.temp }}/huggingface
-          HF_HUB_OFFLINE: "${{ steps.hf-hub-cache.outputs.cache-hit == 'true' && '1' || '0' }}"


@@ -43,10 +43,11 @@
       HF_ENDPOINT: https://hf-mirror.com
       HF_TOKEN: ${{ secrets.HF_TOKEN }}
       OS_NAME: ${{ matrix.os }}
+      UV_NO_SYNC: 1
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
       - name: Install uv
         uses: astral-sh/setup-uv@v7
@@ -58,8 +59,9 @@
       - name: Install dependencies
         run: |
           uv venv
-          uv pip install torch-npu==${{matrix.pytorch_npu}}
-          uv pip install -e ".[dev]"
+          uv pip install -r requirements/npu.txt
+          uv pip install -e .
+          uv pip install -r requirements/dev.txt
       - name: Install node
         run: |
@@ -68,35 +70,18 @@
           curl -fsSL https://deb.nodesource.com/setup_20.x | bash -
           apt-get install -y nodejs
-      - name: Cache files
-        id: hf-hub-cache
-        uses: actions/cache@v4
-        with:
-          path: ${{ runner.temp }}/huggingface
-          key: huggingface-${{ matrix.os }}-${{ matrix.python }}-${{ hashFiles('tests/version.txt') }}
       - name: Check quality
         run: |
           make style && make quality
-        env:
-          UV_NO_SYNC: 1
       - name: Check license
         run: |
           make license
-        env:
-          UV_NO_SYNC: 1
       - name: Check build
         run: |
           make build
-        env:
-          UV_NO_SYNC: 1
       - name: Test with pytest
         run: |
           make test
-        env:
-          UV_NO_SYNC: 1
-          HF_HOME: /root/.cache/huggingface
-          HF_HUB_OFFLINE: "${{ steps.hf-hub-cache.outputs.cache-hit == 'true' && '1' || '0' }}"

.gitignore (1 line changed)

@@ -176,6 +176,7 @@ llamaboard_cache/
 llamaboard_config/
 saves/
 output/
+outputs/
 wandb/
 swanlog/
 generated_predictions.jsonl


@@ -92,7 +92,7 @@ Read technical notes:

 ## Features

-- **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Qwen2-VL, DeepSeek, Yi, Gemma, ChatGLM, Phi, etc.
+- **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen3, Qwen3-VL, DeepSeek, Gemma, GLM, Phi, etc.
 - **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, etc.
 - **Scalable resources**: 16-bit full-tuning, freeze-tuning, LoRA and 2/3/4/5/6/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ.
 - **Advanced algorithms**: [GaLore](https://github.com/jiaweizzhao/GaLore), [BAdam](https://github.com/Ledzy/BAdam), [APOLLO](https://github.com/zhuhanqing/APOLLO), [Adam-mini](https://github.com/zyushun/Adam-mini), [Muon](https://github.com/KellerJordan/Muon), [OFT](https://github.com/huggingface/peft/tree/main/src/peft/tuners/oft), DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and PiSSA.
@@ -279,11 +279,10 @@ Read technical notes:
 | Model | Model size | Template |
 | ----------------------------------------------------------------- | -------------------------------- | -------------------- |
 | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
-| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
 | [DeepSeek (LLM/Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
 | [DeepSeek 3-3.2](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
 | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink |
 | [Falcon/Falcon H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/11B/34B/40B/180B | falcon/falcon_h1 |
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
@@ -292,12 +291,13 @@ Read technical notes:
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt_oss |
 | [Granite 3-4](https://huggingface.co/ibm-granite) | 1B/2B/3B/7B/8B | granite3/granite4 |
-| [Hunyuan (MT)](https://huggingface.co/tencent/) | 7B | hunyuan |
+| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/) | 0.5B/1.8B/4B/7B/13B | hunyuan/hunyuan_small |
 | [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
 | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
-| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
+| [Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
 | [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
 | [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| [LFM 2.5 (VL)](https://huggingface.co/LiquidAI) | 1.2B/1.6B | lfm2/lfm2_vl |
 | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
 | [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
@@ -307,18 +307,17 @@ Read technical notes:
 | [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
 | [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B/309B | mimo/mimo_v2 |
-| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
-| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
+| [MiniCPM 4](https://huggingface.co/openbmb) | 0.5B/8B | cpm4 |
+| [MiniCPM-o/MiniCPM-V 4.5](https://huggingface.co/openbmb) | 8B/9B | minicpm_o/minicpm_v |
 | [MiniMax-M1/MiniMax-M2](https://huggingface.co/MiniMaxAI/models) | 229B/456B | minimax1/minimax2 |
 | [Ministral 3](https://huggingface.co/mistralai) | 3B/8B/14B | ministral3 |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
-| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
 | [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
 | [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
 | [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
-| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
+| [Phi-4-mini/Phi-4](https://huggingface.co/microsoft) | 3.8B/14B | phi4_mini/phi4 |
 | [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
-| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
+| [Qwen2 (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
 | [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
 | [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
@@ -327,8 +326,7 @@ Read technical notes:
 | [Qwen3-VL](https://huggingface.co/Qwen) | 2B/4B/8B/30B/32B/235B | qwen3_vl |
 | [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
-| [VibeThinker-1.5B](https://huggingface.co/WeiboAI) | 1.5B | qwen3 |
-| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
+| [TeleChat 2-2.5](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
 | [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |

 > [!NOTE]
@@ -514,12 +512,13 @@ huggingface-cli login

 #### Install from Source

 ```bash
-git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
-cd LLaMA-Factory
-pip install -e ".[metrics]"
+git clone --depth 1 https://github.com/hiyouga/LlamaFactory.git
+cd LlamaFactory
+pip install -e .
+pip install -r requirements/metrics.txt
 ```

-Optional dependencies available: `metrics`, `deepspeed`. Install with: `pip install -e ".[metrics,deepspeed]"`
+Optional dependencies available: `metrics`, `deepspeed`. Install with: `pip install -e . && pip install -r requirements/metrics.txt -r requirements/deepspeed.txt`

 Additional dependencies for specific features are available in `examples/requirements/`.
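
Taken together, the hunk above replaces the pip extras with a plain install plus per-feature requirements files; a sketch of the full from-source flow under the new layout (the trailing `llamafactory-cli version` sanity check is an assumption about the installed entry point, not part of the diff):

```bash
git clone --depth 1 https://github.com/hiyouga/LlamaFactory.git
cd LlamaFactory
pip install -e .
# Optional extras now live under requirements/ instead of pip extras.
pip install -r requirements/metrics.txt -r requirements/deepspeed.txt
# Assumed sanity check that the CLI entry point was installed.
llamafactory-cli version
```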
@@ -577,36 +576,21 @@ To enable FlashAttention-2 on the Windows platform, please use the script from [

 <details><summary>For Ascend NPU users</summary>

-To install LLaMA Factory on Ascend NPU devices, please upgrade Python to version 3.10 or higher: `pip install -e . torch-npu==2.7.1`. Additionally, you need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**. Please follow the [installation tutorial](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html) or use the following commands:
+To install LLaMA Factory on Ascend NPU devices, please upgrade Python to version 3.10 or higher: `pip install -r requirements/npu.txt`. Additionally, you need to install the **Ascend CANN Toolkit and Kernels**. Please follow the [installation tutorial](https://llamafactory.readthedocs.io/en/latest/advanced/npu_installation.html).
+
+You can also download the pre-built Docker images:

 ```bash
-# replace the url according to your CANN version and devices
-# install CANN Toolkit
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C20SPC702/Ascend-cann-toolkit_8.0.0.alpha002_linux-"$(uname -i)".run
-bash Ascend-cann-toolkit_8.0.0.alpha002_linux-"$(uname -i)".run --install
-
-# install CANN Kernels
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C20SPC702/Ascend-cann-kernels-910b_8.0.0.alpha002_linux-"$(uname -i)".run
-bash Ascend-cann-kernels-910b_8.0.0.alpha002_linux-"$(uname -i)".run --install
-
-# set env variables
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
+# Docker Hub
+docker pull hiyouga/llamafactory:latest-npu-a2
+docker pull hiyouga/llamafactory:latest-npu-a3
+
+# quay.io
+docker pull quay.io/ascend/llamafactory:latest-npu-a2
+docker pull quay.io/ascend/llamafactory:latest-npu-a3
 ```

-| Requirement | Minimum | Recommend |
-| ------------ | ------- | -------------- |
-| CANN | 8.0.RC1 | 8.0.0.alpha002 |
-| torch | 2.1.0 | 2.7.1 |
-| torch-npu | 2.1.0 | 2.7.1 |
-| deepspeed | 0.13.2 | 0.13.2 |
-| vllm-ascend | - | 0.7.3 |
-
-Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use.
-
-If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations.
-
-Download the pre-built Docker images: [32GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) | [64GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html)

 #### Install BitsAndBytes

 To use QLoRA based on bitsandbytes on Ascend NPU, please follow these 3 steps:
@@ -946,7 +930,7 @@ If you have a project that should be incorporated, please contact via email or c

 This repository is licensed under the [Apache-2.0 License](LICENSE).

-Please follow the model licenses to use the corresponding model weights: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [GPT-2](https://github.com/openai/gpt-2/blob/master/LICENSE) / [Granite](LICENSE) / [Index](https://huggingface.co/IndexTeam/Index-1.9B/blob/main/LICENSE) / [InternLM](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3/Phi-4](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [Skywork](https://huggingface.co/Skywork/Skywork-13B-base/blob/main/Skywork%20Community%20License.pdf) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [TeleChat2](https://huggingface.co/Tele-AI/telechat-7B/blob/main/TeleChat%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+Please follow the model licenses to use the corresponding model weights: [BLOOM](https://huggingface.co/spaces/bigscience/license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [GPT-2](https://github.com/openai/gpt-2/blob/master/LICENSE) / [Granite](LICENSE) / [InternLM](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [Phi-3/Phi-4](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [TeleChat2](https://huggingface.co/Tele-AI/telechat-7B/blob/main/TeleChat%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)

 ## Citation


@@ -94,7 +94,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc

 ## Features

-- **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Qwen2-VL, DeepSeek, Yi, Gemma, ChatGLM, Phi, and more.
+- **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen3, Qwen3-VL, DeepSeek, Gemma, GLM, Phi, and more.
 - **Integrated methods**: Continual pre-training, multimodal supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, and more.
 - **Multiple precisions**: 16-bit full-parameter fine-tuning, freeze-tuning, LoRA fine-tuning, and 2/3/4/5/6/8-bit QLoRA fine-tuning via AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ.
 - **Advanced algorithms**: [GaLore](https://github.com/jiaweizzhao/GaLore), [BAdam](https://github.com/Ledzy/BAdam), [APOLLO](https://github.com/zhuhanqing/APOLLO), [Adam-mini](https://github.com/zyushun/Adam-mini), [Muon](https://github.com/KellerJordan/Muon), [OFT](https://github.com/huggingface/peft/tree/main/src/peft/tuners/oft), DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ, and PiSSA.
@@ -281,11 +281,10 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | Model | Model size | Template |
 | ----------------------------------------------------------------- | -------------------------------- | -------------------- |
 | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
-| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
 | [DeepSeek (LLM/Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
 | [DeepSeek 3-3.2](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
 | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink |
 | [Falcon/Falcon H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/11B/34B/40B/180B | falcon/falcon_h1 |
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
@@ -294,12 +293,13 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt_oss |
 | [Granite 3-4](https://huggingface.co/ibm-granite) | 1B/2B/3B/7B/8B | granite3/granite4 |
-| [Hunyuan (MT)](https://huggingface.co/tencent/) | 7B | hunyuan |
+| [Hunyuan/Hunyuan1.5 (MT)](https://huggingface.co/tencent/) | 0.5B/1.8B/4B/7B/13B | hunyuan/hunyuan_small |
 | [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
 | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
-| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
+| [Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
 | [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
 | [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| [LFM 2.5 (VL)](https://huggingface.co/LiquidAI) | 1.2B/1.6B | lfm2/lfm2_vl |
 | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
 | [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
@@ -309,18 +309,17 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
 | [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B/309B | mimo/mimo_v2 |
-| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
-| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
+| [MiniCPM 4](https://huggingface.co/openbmb) | 0.5B/8B | cpm4 |
+| [MiniCPM-o/MiniCPM-V 4.5](https://huggingface.co/openbmb) | 8B/9B | minicpm_o/minicpm_v |
 | [MiniMax-M1/MiniMax-M2](https://huggingface.co/MiniMaxAI/models) | 229B/456B | minimax1/minimax2 |
 | [Ministral 3](https://huggingface.co/mistralai) | 3B/8B/14B | ministral3 |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
-| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
 | [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
 | [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
 | [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
-| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
+| [Phi-4-mini/Phi-4](https://huggingface.co/microsoft) | 3.8B/14B | phi4_mini/phi4 |
 | [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
-| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
+| [Qwen2 (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
 | [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
 | [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
@@ -329,8 +328,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 | [Qwen3-VL](https://huggingface.co/Qwen) | 2B/4B/8B/30B/32B/235B | qwen3_vl |
 | [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
-| [VibeThinker-1.5B](https://huggingface.co/WeiboAI) | 1.5B | qwen3 |
-| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
+| [TeleChat 2-2.5](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
 | [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |

 > [!NOTE]
@@ -516,12 +514,13 @@ huggingface-cli login

 #### Install from Source

 ```bash
-git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
-cd LLaMA-Factory
-pip install -e ".[metrics]"
+git clone --depth 1 https://github.com/hiyouga/LlamaFactory.git
+cd LlamaFactory
+pip install -e .
+pip install -r requirements/metrics.txt
 ```

-Optional extra dependencies: `metrics`, `deepspeed`. Install them with `pip install -e ".[metrics,deepspeed]"`.
+Optional extra dependencies: `metrics`, `deepspeed`. Install them with `pip install -e . && pip install -r requirements/metrics.txt -r requirements/deepspeed.txt`.

 See the files under `examples/requirements/` for other optional dependencies.
@@ -579,36 +578,20 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl

 <details><summary>For Ascend NPU users</summary>

-To install LLaMA Factory on Ascend NPU devices, upgrade Python to 3.10 or higher and install the extra dependencies with `pip install -e . torch-npu==2.7.1`. You also need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**; follow the [installation tutorial](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html) or use the following commands:
+To install LLaMA Factory on Ascend NPU devices, upgrade Python to 3.10 or higher and install the extra dependencies with `pip install -r requirements/npu.txt`. You also need to install the **Ascend CANN Toolkit and Kernels**; follow the [installation tutorial](https://llamafactory.readthedocs.io/zh-cn/latest/advanced/npu_installation.html).
+
+You can also pull the latest pre-built Docker images directly:

 ```bash
-# replace the URL according to your CANN version and device model
-# install CANN Toolkit
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C17SPC701/Ascend-cann-toolkit_8.0.RC1.alpha001_linux-"$(uname -i)".run
-bash Ascend-cann-toolkit_8.0.RC1.alpha001_linux-"$(uname -i)".run --install
-
-# install CANN Kernels
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C17SPC701/Ascend-cann-kernels-910b_8.0.RC1.alpha001_linux.run
-bash Ascend-cann-kernels-910b_8.0.RC1.alpha001_linux.run --install
-
-# set env variables
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
+# Docker Hub
+docker pull hiyouga/llamafactory:latest-npu-a2
+docker pull hiyouga/llamafactory:latest-npu-a3
+
+# quay.io
+docker pull quay.io/ascend/llamafactory:latest-npu-a2
+docker pull quay.io/ascend/llamafactory:latest-npu-a3
 ```

-| Requirement | Minimum | Recommended |
-| ------------ | ------- | -------------- |
-| CANN | 8.0.RC1 | 8.0.0.alpha002 |
-| torch | 2.1.0 | 2.7.1 |
-| torch-npu | 2.1.0 | 2.7.1 |
-| deepspeed | 0.13.2 | 0.13.2 |
-| vllm-ascend | - | 0.7.3 |
-
-Use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the compute device.
-
-If inference fails on NPU devices, try setting `do_sample: false` in the configurations.
-
-Download the pre-built Docker images: [32GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) | [64GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html)

 #### Install BitsAndBytes

 To run bitsandbytes-based QLoRA fine-tuning on Ascend NPU, follow these steps:
@@ -950,7 +933,7 @@ swanlab_run_name: test_run # optional

 The code in this repository is open-sourced under the [Apache-2.0](LICENSE) license.

-When using the model weights, please follow the corresponding model licenses: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [GPT-2](https://github.com/openai/gpt-2/blob/master/LICENSE) / [Granite](LICENSE) / [Index](https://huggingface.co/IndexTeam/Index-1.9B/blob/main/LICENSE) / [InternLM](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3/Phi-4](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [Skywork](https://huggingface.co/Skywork/Skywork-13B-base/blob/main/Skywork%20Community%20License.pdf) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [TeleChat2](https://huggingface.co/Tele-AI/telechat-7B/blob/main/TeleChat%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+When using the model weights, please follow the corresponding model licenses: [BLOOM](https://huggingface.co/spaces/bigscience/license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [GPT-2](https://github.com/openai/gpt-2/blob/master/LICENSE) / [Granite](LICENSE) / [InternLM](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [Phi-3/Phi-4](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [TeleChat2](https://huggingface.co/Tele-AI/telechat-7B/blob/main/TeleChat%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)

 ## Citation

File diff suppressed because one or more lines are too long


@@ -32,7 +32,8 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \

 COPY . /app

 # Install LLaMA Factory
-RUN pip install --no-cache-dir --no-build-isolation -e ".[metrics,deepspeed]"
+RUN pip install --no-cache-dir --no-build-isolation -e . && \
+    pip install --no-cache-dir --no-build-isolation -r requirements/metrics.txt -r requirements/deepspeed.txt

 # Rebuild flash attention
 RUN if [ "${INSTALL_FLASHATTN}" == "true" ]; then \


@@ -1,12 +1,13 @@
-# NVIDIA official image (ubuntu-22.04 + cuda-12.4 + python-3.10)
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
-FROM nvcr.io/nvidia/pytorch:24.05-py3
+# NVIDIA official image (ubuntu-24.04 + cuda-12.9.1 + python-3.12)
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-06.html
+FROM nvcr.io/nvidia/pytorch:25.06-py3

 ENV DEBIAN_FRONTEND=noninteractive
 ENV PIP_ROOT_USER_ACTION=ignore
 ENV PYPI_MIRROR=https://mirrors.aliyun.com/pypi/simple/
 ENV PYPI_TRUSTED_HOST=mirrors.aliyun.com
 ENV APT_MIRROR=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
+ENV PIP_CONSTRAINT=""

 RUN pip install --upgrade pip setuptools wheel "hatchling>=1.18.0" editables --trusted-host ${PYPI_TRUSTED_HOST} --index-url ${PYPI_MIRROR}
@@ -14,20 +15,14 @@ RUN pip uninstall -y torch torchvision torch-tensorrt \
     flash_attn transformer-engine \
     cudf dask-cuda cugraph cugraph-service-server cuml raft-dask cugraph-dgl cugraph-pyg dask-cudf

-RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
+RUN pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu129

 RUN pip uninstall -y opencv opencv-python opencv-python-headless && \
-    rm -rf /usr/local/lib/python3.10/dist-packages/cv2/ && \
+    rm -rf /usr/local/lib/python3.12/dist-packages/cv2/ && \
     pip install opencv-python-headless==4.11.0.86 --trusted-host ${PYPI_TRUSTED_HOST} --index-url ${PYPI_MIRROR}

-RUN pip install "numpy==1.26.4" "optree>=0.13.0" "spacy==3.7.5" "weasel==0.4.1" \
-    transformer-engine[pytorch]==2.2.0 megatron-core==0.13.0 deepspeed==0.16.4 \
-    --trusted-host ${PYPI_TRUSTED_HOST} --index-url ${PYPI_MIRROR}
-
-RUN pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
-
-# RUN pip install vllm==0.8.4 \
-#     --trusted-host ${PYPI_TRUSTED_HOST} --index-url ${PYPI_MIRROR}
+RUN pip install --trusted-host mirrors.aliyun.com --index-url ${PYPI_MIRROR} \
+    "megatron-core>=0.13.0,<0.14.0" "deepspeed==0.16.4"

 WORKDIR /build
@@ -37,6 +32,8 @@ RUN pip uninstall -y apex && \
     pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
     --config-settings "--build-option=--cpp_ext --cuda_ext --parallel 32" ${apex_url}

+RUN pip install --no-build-isolation transformer_engine[pytorch]
+
 RUN rm -rf /build

 WORKDIR /workspace
@@ -53,14 +50,17 @@ RUN apt-get update && apt-get install -y zip
 RUN apt-get install -y openjdk-21-jdk
 ENV JAVA_HOME /usr/lib/jvm/java-21-openjdk-amd64

-# pip install LLaMA-Factory
+ARG REPO_URL=https://github.com/hiyouga/LlamaFactory.git
+ARG BRANCH=main
+
 WORKDIR /app

-# Copy the application into the image
-COPY . /app
+# Clone the repository
+RUN git clone --depth 1 --branch ${BRANCH} ${REPO_URL} /app || \
+    git clone --depth 1 ${REPO_URL} /app

 # Install LLaMA Factory
-RUN pip install --no-cache-dir -e ".[metrics]" --no-build-isolation
+RUN pip install --no-cache-dir -e . --no-build-isolation && \
+    pip install --no-cache-dir -r requirements/metrics.txt --no-build-isolation

 RUN pip install "git+https://github.com/alibaba/roll.git#subdirectory=mcore_adapter"


@@ -35,7 +35,8 @@ COPY . /app

 # Install torch-npu
 RUN pip uninstall -y torch torchvision torchaudio && \
     pip install --no-cache-dir "torch==2.7.1" "torch-npu==2.7.1" "torchvision==0.22.1" "torchaudio==2.7.1" --index-url "${PYTORCH_INDEX}" && \
-    pip install --no-cache-dir -e ".[metrics]" --no-build-isolation
+    pip install --no-cache-dir -e . --no-build-isolation && \
+    pip install --no-cache-dir -r requirements/metrics.txt --no-build-isolation

 # Set up volumes
 # VOLUME [ "/root/.cache/huggingface", "/app/shared_data", "/app/output" ]


@@ -34,7 +34,8 @@ COPY . /app

 # Reinstall pytorch rocm and install LLaMA Factory
 RUN pip uninstall -y torch torchvision torchaudio && \
-    pip install --no-cache-dir --no-build-isolation -e --pre ".[metrics,deepspeed]" --index-url "${PYTORCH_INDEX}"
+    pip install --no-cache-dir --no-build-isolation -e --pre . --index-url "${PYTORCH_INDEX}" && \
+    pip install --no-cache-dir --no-build-isolation -r requirements/metrics.txt -r requirements/deepspeed.txt --index-url "${PYTORCH_INDEX}"

 # Rebuild flash attention
 RUN if [ "${INSTALL_FLASHATTN}" == "true" ]; then \

docs/Makefile (new file, 20 lines)

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

docs/_static/css/lang-switcher.css (new file, 50 lines)

@@ -0,0 +1,50 @@
.lang-switcher {
  display: flex;
  align-items: center;
  justify-content: center;
}

.lang-switcher__select {
  appearance: none;
  -webkit-appearance: none;
  -moz-appearance: none;
  padding: 6px 28px 6px 10px;
  border-radius: 999px;
  border: 1px solid rgba(0, 0, 0, 0.18);
  background-color: #ffffff;
  color: #333333;
  font-size: 13px;
  line-height: 18px;
  box-shadow: 0 1px 2px rgba(0, 0, 0, 0.08);
  cursor: pointer;
  background-image: linear-gradient(45deg, transparent 50%, #667085 50%),
    linear-gradient(135deg, #667085 50%, transparent 50%);
  background-position: calc(100% - 16px) 50%, calc(100% - 11px) 50%;
  background-size: 5px 5px, 5px 5px;
  background-repeat: no-repeat;
}

.lang-switcher__select:focus {
  outline: none;
  border-color: rgba(41, 128, 185, 0.65);
  box-shadow: 0 0 0 3px rgba(41, 128, 185, 0.18);
}

.wy-side-nav-search .lang-switcher {
  margin-top: 10px;
}

.wy-side-nav-search .lang-switcher__select {
  border-color: rgba(255, 255, 255, 0.18);
  background-color: rgba(255, 255, 255, 0.08);
  color: #ffffff;
  box-shadow: none;
  background-image: linear-gradient(45deg, transparent 50%, rgba(255, 255, 255, 0.75) 50%),
    linear-gradient(135deg, rgba(255, 255, 255, 0.75) 50%, transparent 50%);
}

.wy-side-nav-search .lang-switcher__select:focus {
  border-color: rgba(255, 255, 255, 0.45);
  box-shadow: 0 0 0 3px rgba(255, 255, 255, 0.12);
}

docs/_static/js/switcher.js (new file, 93 lines)

@@ -0,0 +1,93 @@
document.addEventListener('DOMContentLoaded', function () {
var path = window.location.pathname || '';
var isZh = path.indexOf('/zh/') !== -1;
var isEn = path.indexOf('/en/') !== -1;
if (!isZh && !isEn) return;
var currentLang = isZh ? 'zh' : 'en';
function buildSwitcher() {
var container = document.createElement('div');
container.className = 'lang-switcher';
var select = document.createElement('select');
select.setAttribute('aria-label', 'Language');
select.className = 'lang-switcher__select';
var optionZh = document.createElement('option');
optionZh.value = 'zh';
optionZh.textContent = 'Simplified Chinese';
optionZh.selected = isZh;
var optionEn = document.createElement('option');
optionEn.value = 'en';
optionEn.textContent = 'English';
optionEn.selected = isEn;
select.appendChild(optionZh);
select.appendChild(optionEn);
select.addEventListener('change', function () {
var nextLang = select.value;
if (nextLang === currentLang) return;
var targetUrl = path.replace('/' + currentLang + '/', '/' + nextLang + '/');
window.location.href = targetUrl + window.location.search + window.location.hash;
});
container.appendChild(select);
return container;
}
function hideOtherLanguageToc() {
var captions = document.querySelectorAll('p.caption');
for (var i = 0; i < captions.length; i++) {
var caption = captions[i];
var textEl = caption.querySelector('.caption-text');
if (!textEl) continue;
var label = (textEl.textContent || '').trim().toLowerCase();
var isCaptionZh = label === '中文' || label === 'chinese' || label === 'zh';
var isCaptionEn = label === 'english' || label === 'en';
if (!isCaptionZh && !isCaptionEn) continue;
var shouldHide = (currentLang === 'zh' && isCaptionEn) || (currentLang === 'en' && isCaptionZh);
var next = caption.nextElementSibling;
// Hide the other language's toc list entirely; for the current language,
// keep the list but hide its redundant caption.
if (shouldHide && next && next.tagName && next.tagName.toLowerCase() === 'ul') {
next.style.display = 'none';
}
caption.style.display = 'none';
}
}
var side = document.querySelector('.wy-side-nav-search');
if (side) {
var sideSwitcher = buildSwitcher();
sideSwitcher.style.marginTop = '8px';
sideSwitcher.style.display = 'flex';
sideSwitcher.style.justifyContent = 'center';
side.appendChild(sideSwitcher);
} else {
var topRight = buildSwitcher();
topRight.style.position = 'fixed';
topRight.style.top = '12px';
topRight.style.right = '12px';
topRight.style.zIndex = '9999';
document.body.appendChild(topRight);
}
hideOtherLanguageToc();
window.addEventListener('load', hideOtherLanguageToc);
setTimeout(hideOtherLanguageToc, 50);
setTimeout(hideOtherLanguageToc, 300);
});

37
docs/conf.py Normal file

@@ -0,0 +1,37 @@
# Configuration file for the Sphinx documentation builder.
import os
import sys
# Define common settings here
project = 'LlamaFactory'
copyright = '2024, LlamaFactory Team'
author = 'LlamaFactory Team'
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx.ext.napoleon',
'myst_parser',
]
templates_path = ['_templates']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
html_theme = 'sphinx_rtd_theme'
html_static_path = ['_static']
html_js_files = [
'js/switcher.js',
]
html_css_files = [
'css/lang-switcher.css',
]
myst_enable_extensions = [
"colon_fence",
"deflist",
]
myst_heading_anchors = 3


@@ -0,0 +1,3 @@
# Custom Kernels
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Fused Operators
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Triton
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# DeepSpeed
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# FSDP
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Parallel (DP, TP, EP, SP, CP)
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# LoRA
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Quantization
This page is not yet available in English. Use the language switcher to view Simplified Chinese.

20
docs/en/conf.py Normal file

@@ -0,0 +1,20 @@
import os
import sys
# Add parent dir to path to allow importing conf.py
sys.path.insert(0, os.path.abspath('..'))
from conf import *
# Language settings
language = 'en'
html_search_language = 'en'
# Static files
# Point to the root _static directory
html_static_path = ['../_static']
# Add custom JS for language switcher
html_js_files = [
'js/switcher.js',
]


@@ -0,0 +1,3 @@
# Data Processing
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# DataEngine
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# ModelEngine
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Trainer
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Data Plugins
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Initialization
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Kernels
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Rendering
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Getting Started
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Data Argument
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Model Argument
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Sample Argument
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# Training Argument
This page is not yet available in English. Use the language switcher to view Simplified Chinese.

62
docs/en/index.rst Normal file

@@ -0,0 +1,62 @@
LlamaFactory Docs
=================
.. toctree::
:maxdepth: 1
:caption: Getting Started
getting-started
installation
llamaboard-web-ui
.. toctree::
:maxdepth: 1
:caption: Data Preparation
data-preparation/data-processing
.. toctree::
:maxdepth: 1
:caption: Training
training/sft
training/dpo
.. toctree::
:maxdepth: 1
:caption: Inference
inference/deploy
.. toctree::
:maxdepth: 1
:caption: Advanced
advanced/lora-and-quantization/lora
advanced/lora-and-quantization/quantization
advanced/distributed/fsdp
advanced/distributed/deepspeed
advanced/distributed/parallel-dp-tp-ep-sp-cp
advanced/custom-kernels/triton
advanced/custom-kernels/fused-operators
.. toctree::
:maxdepth: 1
:caption: Hyperparameters
hyperparameters/data-argument
hyperparameters/model-argument
hyperparameters/sample-argument
hyperparameters/training-argument
.. toctree::
:maxdepth: 1
:caption: Dev Guide
dev-guide/core/data-engine
dev-guide/core/model-engine
dev-guide/core/trainer
dev-guide/plugins/data-plugins
dev-guide/plugins/model-plugins/initialization
dev-guide/plugins/model-plugins/kernels
dev-guide/plugins/model-plugins/rendering


@@ -0,0 +1,3 @@
# Deploy
This page is not yet available in English. Use the language switcher to view Simplified Chinese.

3
docs/en/installation.md Normal file

@@ -0,0 +1,3 @@
# Installation
This page is not yet available in English. Use the language switcher to view Simplified Chinese.


@@ -0,0 +1,3 @@
# LlamaBoard Web UI
This page is not yet available in English. Use the language switcher to view Simplified Chinese.

3
docs/en/training/dpo.md Normal file

@@ -0,0 +1,3 @@
# DPO
This page is not yet available in English. Use the language switcher to view Simplified Chinese.

3
docs/en/training/sft.md Normal file

@@ -0,0 +1,3 @@
# SFT
This page is not yet available in English. Use the language switcher to view Simplified Chinese.

35
docs/make.bat Normal file

@@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to your PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

3
docs/requirements.txt Normal file

@@ -0,0 +1,3 @@
sphinx>=6.0.0
sphinx-rtd-theme>=1.2.0
myst-parser>=2.0.0


@@ -0,0 +1,93 @@
# LLaMA-Factory Kernels System
## Overview
The LLaMA-Factory Kernels system manages high-performance compute kernel implementations provided by different hardware devices. It replaces key modules in a model (such as RMSNorm, SwiGLU, RoPE, and MoE) with hardware-optimized versions, significantly improving training and inference performance.
The Kernels system uses a registry-based auto-discovery mechanism: it detects the hardware devices available in the current runtime environment (NPU, CUDA, etc.) and enables the corresponding high-performance kernels. With this design, users do not need to care about low-level implementation details; a simple API call is enough to get the performance gain.
## Core Features
- **Automatic registration**: an automatic registration system built on the `@register_kernel` decorator. At startup, the system scans the kernel implementations under the `ops` directory and registers them in a global registry.
- **Device awareness**: automatically detects the current hardware device (NPU, CUDA, etc.) and applies the matching optimizations. Unsupported devices are skipped, so the system works across environments.
- **Modular design**: each kernel is implemented independently without interfering with the others. A single kernel can be applied on its own, or all default kernels can be applied in one batch.
- **Backward compatibility**: kernel replacement does not modify model weights and preserves numerical consistency; the optimized implementations match the originals in precision (within floating-point error).
- **Easy extension**: by inheriting from the `BaseKernel` base class and using the decorator, new kernel implementations can be added to support new hardware devices or optimization algorithms.
## Usage
### 1. Via the training YAML config file
To enable kernels during training, add the following to the config file; all default available kernels are then enabled automatically:
```yaml
...
kernel_config:
  name: auto
  include_kernels: auto  # choice: null/true/false/auto/kernel_id1,kernel_id2,kernel_id3, default is null
...
```
### 2. Via the API
#### 2.1 apply_default_kernels: enable all default kernels
The `apply_default_kernels` API automatically applies all default kernels registered for the current device:
```python
from transformers import AutoModelForCausalLM
from llamafactory.v1.plugins.model_plugins.kernels import apply_default_kernels

# Load the model
model = AutoModelForCausalLM.from_pretrained("qwen/qwen2.5-0.5B")
# Automatically apply all default kernels
model = apply_default_kernels(model, include_kernels="auto")
```
#### 2.2 apply_kernel: enable a specific kernel
For finer-grained control, e.g. applying a single kernel in specific scenarios, call `apply_kernel` manually with the kernel ID:
```python
from transformers import AutoModelForCausalLM
from llamafactory.v1.plugins.model_plugins.kernels import apply_kernel

# Load the model
model = AutoModelForCausalLM.from_pretrained("qwen/qwen2.5-0.5B")
# Apply individual kernels manually
# Note: the kernel ID must match the `_kernel_id` set at definition time
model = apply_kernel("npu_fused_rope", model=model)
model = apply_kernel("npu_fused_rmsnorm", model=model)
model = apply_kernel("npu_fused_swiglu", model=model)
model = apply_kernel("npu_fused_moe", model=model)
```
### 3. Query the registered available kernels
Use `get_default_kernels` to list the IDs of all default kernels that are registered and available in the current environment:
```python
from llamafactory.v1.plugins.model_plugins.kernels import get_default_kernels

# Get the list of default kernels
available_kernels = get_default_kernels()
print(f"Available kernels: {available_kernels}")
# Example output: ['npu_fused_rmsnorm', 'npu_fused_swiglu', 'npu_fused_rope', 'npu_fused_moe']
```
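Together with `apply_kernel`, the query API makes selective enabling straightforward. A minimal sketch (the kernel IDs are the NPU defaults listed below; `model` is assumed to be loaded already):
```python
from llamafactory.v1.plugins.model_plugins.kernels import apply_kernel, get_default_kernels

# Enable only the normalization and activation kernels, if they are available.
wanted = {"npu_fused_rmsnorm", "npu_fused_swiglu"}
for kernel_id in get_default_kernels():
    if kernel_id in wanted:
        model = apply_kernel(kernel_id, model=model)
```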
### Currently implemented kernels
| Kernel ID | Function | Supported devices | Notes |
|-----------|----------|-------------------|-------|
| [npu_fused_rmsnorm](./fused-operators.md/#npufusedrmsnorm) | Fused RMSNorm operator | NPU | High-performance RMSNorm implementation for NPU devices |
| [npu_fused_swiglu](./fused-operators.md/#npufusedswiglu) | Fused SwiGLU operator | NPU | High-performance SwiGLU implementation for NPU devices |
| [npu_fused_rope](./fused-operators.md/#npufusedrope) | Fused RoPE operator | NPU | High-performance RoPE implementation for NPU devices |
| [npu_fused_moe](./fused-operators.md/#npufusedmoe) | Fused MoE operator | NPU | Fused MoE operator, adapted for models such as Qwen3-MoE |

We will keep adapting more kernels. If you want to develop new kernels yourself, please refer to our [kernel development guide](../../dev-guide/plugins/model-plugins/kernels.md); contributions to LLaMA-Factory are welcome.


@@ -0,0 +1,104 @@
# Fused Operators
LLaMA-Factory provides a set of fused operators optimized for specific hardware. These operators live under the `src/llamafactory/v1/plugins/model_plugins/kernels/ops` directory.
At startup, the `scan_all_kernels` function scans this directory automatically and registers all available operators. You can enable them all at once via `apply_default_kernels(model, include_kernels="auto")`, or enable them individually via `apply_kernel`.
The currently supported fused operators are described below.
## NpuFusedRMSNorm
RMSNorm (Root Mean Square Layer Normalization) is a normalization method commonly used in large models. During training or inference, the fused RMSNorm operator fuses operations such as the bias and residual, reducing the number of memory accesses and speeding up computation.
Ascend NPU exposes the fused RMSNorm operator through the `torch_npu.npu_rms_norm` interface, supporting data formats such as float16, bfloat16, and float. The RMSNorm operator is common in LLMs such as Qwen; since PyTorch does not provide an RMSNorm operator interface, it usually appears in models as a custom class, and the kernel is enabled by replacing the `forward` method of that RMSNorm class.
```python
def _npu_rms_forward(self, hidden_states):
    """NPU forward implementation for RMSNorm.

    Args:
        self: RMSNorm module instance with `weight` and `variance_epsilon`.
        hidden_states: Input hidden states tensor, same shape as the baseline.

    Returns:
        Normalized tensor consistent with the baseline RMSNorm behavior.
    """
    return torch_npu.npu_rms_norm(hidden_states, self.weight, epsilon=self.variance_epsilon)[0]
```
In LLaMA-Factory, `NpuRMSNormKernel` provides the entry point for enabling this fused operator: simply call `apply_kernel("npu_fused_rmsnorm", model=model)` to enable the NPU fused RMSNorm operator for supported models.
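For illustration, the replacement itself can be sketched as follows (a simplified sketch: matching modules by class-name suffix is an assumption here, and the actual kernel's matching logic may differ):
```python
import types

def _apply_npu_rms_norm(model):
    # Bind the fused forward onto every RMSNorm-like module in place.
    for module in model.modules():
        if module.__class__.__name__.endswith("RMSNorm"):
            module.forward = types.MethodType(_npu_rms_forward, module)
    return model
```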
## NpuFusedSwiGlu
SwiGLU (Swish-Gated Linear Unit) is a hybrid activation function that combines the Swish activation with a gated linear unit (GLU). Its main job is to apply a gated linear transformation to the input tensor, and it has been widely adopted in the MLP layers of recent LLMs. The fused SwiGLU operator fuses several operations (splitting, activation, and matrix multiplication) into a single hardware instruction, avoiding the overhead of multiple kernel launches.
Ascend NPU exposes the fused SwiGLU operator through the `torch_npu.npu_swiglu` interface, supporting float16, bfloat16, and float. The SwiGLU operator is common in LLMs such as Qwen; since PyTorch does not provide a SwiGLU operator interface, it usually appears in models as a custom class, and the kernel is enabled by replacing the `forward` method of the SwiGLU class. The replacement can follow this example:
```python
# Original MLP forward method:
def forward(self, x):
    down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
    return down_proj

# Replaced forward method:
def _npu_swiglu_forward(self, hidden_state):
    return self.down_proj(
        torch_npu.npu_swiglu(torch.cat((self.gate_proj(hidden_state), self.up_proj(hidden_state)), dim=-1), dim=-1)
    )
```
In LLaMA-Factory, `NpuSwiGluKernel` provides the entry point: call `apply_kernel("npu_fused_swiglu", model=model)` to enable the NPU fused SwiGLU operator for supported models. For models that are not yet adapted, you can adapt them yourself following the example and the [developer guide](../../dev-guide/plugins/model-plugins/kernels.md).
## NpuFusedRoPE
RoPE (Rotary Positional Embedding) is a positional-encoding technique widely used in LLMs such as Qwen to encode the positional information of token sequences effectively. It combines the stability of absolute positional encoding with the flexibility of relative positional encoding, and generalizes well to longer sequences. The conventional RoPE operator is usually implemented as a custom function inside LLM architectures; the fused RoPE operator merges the original computation flow into a single hardware-optimized operator, improving performance.
Ascend NPU exposes the fused RoPE operator through `torch_npu.npu_rotary_mul`, supporting data formats such as float16, bfloat16, and float32. Taking the Qwen3 family as an example, the fused RoPE operator is enabled by replacing its `apply_rotary_pos_emb` function:
```python
# Original apply_rotary_pos_emb:
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

# After replacing with the fused RoPE operator:
def _apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = torch_npu.npu_rotary_mul(q, cos, sin)
    k_embed = torch_npu.npu_rotary_mul(k, cos, sin)
    return q_embed, k_embed
```
In LLaMA-Factory, `NpuRoPEKernel` provides the entry point: call `apply_kernel("npu_fused_rope", model=model)` to enable the NPU fused RoPE operator for supported models. For models that are not yet adapted, you can adapt them yourself following the example and the [developer guide](../../dev-guide/plugins/model-plugins/kernels.md).
## NpuFusedMoE
MoE (Mixture of Experts) models scale capacity through sparse activation. The native Transformers implementation iterates over the experts in a serial loop, which incurs heavy kernel-launch overhead and low hardware utilization.
The **fused MoE operator** uses **GMM (Grouped Matrix Multiplication)** to process multiple groups of matrix multiplications with different shapes (varying row counts) in parallel within a single hardware instruction, removing the loop overhead without extra memory copies and significantly improving training performance.
Ascend NPU provides the underlying support through interfaces such as `torch_npu.npu_grouped_matmul`; replacing the forward method of the model's MoE block lets it use the NPU's grouped-matmul capability.
The core replacement logic looks like this (simplified):
```python
def _npu_moe_forward(self, hidden_states, routing_weights, router_indices):
    # 1. Permute: group the shuffled tokens by their assigned expert and build an index map
    permuted_states, row_map = torch_npu.npu_moe_token_permute(hidden_states, router_indices)
    # 2. Count: compute how many tokens each expert has to process
    tokens_per_expert = torch.histc(router_indices, bins=self.num_experts, min=0, max=self.num_experts)
    # 3. Compute (GMM): run all experts' matmuls in parallel in one shot, handling the different input lengths per expert
    inter_states = torch_npu.npu_grouped_matmul(permuted_states, self.gate_up_proj_weights, split_sizes=tokens_per_expert, ...)
    inter_states = torch_npu.npu_swiglu(inter_states)
    output = torch_npu.npu_grouped_matmul(inter_states, self.down_proj_weights, split_sizes=tokens_per_expert, ...)
    # 4. Restore: recover the original token order and apply the routing weights
    return torch_npu.npu_moe_token_unpermute(output, row_map, routing_weights)
```
In LLaMA-Factory, `NpuFusedMoEKernel` provides the entry point. Call `apply_kernel("npu_fused_moe", model=model)` to enable the NPU fused MoE operator for supported models. For models that are not yet adapted, you can also adapt them yourself following the example code above and the [developer guide](../../dev-guide/plugins/model-plugins/kernels.md).
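Because fused kernels are expected to match the baseline within floating-point error, a quick sanity check can compare logits before and after enabling a kernel (a sketch; `tokenizer` and the tolerances are illustrative assumptions):
```python
import torch

inputs = tokenizer("hello", return_tensors="pt").to(model.device)
with torch.no_grad():
    ref_logits = model(**inputs).logits
model = apply_kernel("npu_fused_moe", model=model)
with torch.no_grad():
    fused_logits = model(**inputs).logits
# The fused path should agree with the baseline within floating-point error.
assert torch.allclose(ref_logits, fused_logits, rtol=1e-3, atol=1e-3)
```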


@@ -0,0 +1 @@
# Triton


@@ -0,0 +1 @@
# DeepSpeed


@@ -0,0 +1 @@
# FSDP


@@ -0,0 +1 @@
# Parallel (DP, TP, EP, SP, CP)


@@ -0,0 +1,3 @@
# LoRA
Parameter management (two-level nested parameter format)


@@ -0,0 +1 @@
# Quantization

20
docs/zh/conf.py Normal file

@@ -0,0 +1,20 @@
import os
import sys
# Add parent dir to path to allow importing conf.py
sys.path.insert(0, os.path.abspath('..'))
from conf import *
# Language settings
language = 'zh_CN'
html_search_language = 'zh'
# Static files
# Point to the root _static directory
html_static_path = ['../_static']
# Add custom JS for language switcher
html_js_files = [
'js/switcher.js',
]


@@ -0,0 +1,479 @@
# LLaMA-Factory v1 Data Preprocessing
## Overview
LLaMA-Factory `v1` adopts a brand-new data-processing architecture built around the following core components:
- **DataEngine**: the data engine, responsible for attaching and invoking the plugins that load, index, and convert datasets, and for exposing the data-access interface
- **DataConverterPlugin**: the data converter, which turns non-standard formats into the unified standard format
- **DataLoaderPlugin**: the data-loading plugin, supporting multiple file formats
- **DataIndexPlugin**: the data-index plugin, supporting dataset sampling and weight adjustment
- **DataSelectorPlugin**: the data-selection plugin, supporting flexible data access
Compared with LLaMA-Factory `v0`, `v1` uses a unified data format (the Messages format): all data is converted into a standard list of conversation messages. In addition, `v1` exposes interfaces for custom data-processing flows through the DataEngine and plugin mechanism, giving better extensibility and consistency.
---
## Contents
- [Basic Usage](#basic-usage)
- [Standard Data Format](#standard-data-format)
- [Dataset Configuration File](#dataset-configuration-file)
- [Complete Examples](#complete-examples)
---
## Basic Usage
### Datasets can be configured in the training config file in the following ways:
<details open>
<summary>Option 1: use a HF Hub repo ID</summary>
Specify the repo ID of a dataset on the HF Hub directly; the DataEngine downloads and loads the dataset from the HF Hub automatically.
Note: datasets loaded directly via a repo ID must already be in the standard format.
**Training config example:**
```yaml
# example_sft.yaml
...
dataset: llamafactory/v1-sft-demo # HF Hub repo ID
...
```
</details>
<details>
<summary>Option 2: use a YAML config file on the HF Hub</summary>
Point the `dataset` field at the URI of a `dataset_info.yaml` on the HF Hub; the DataEngine downloads the config file automatically and loads the datasets it describes.
**Training config example:**
```yaml
# example_sft.yaml
...
dataset: llamafactory/v1-sft-demo/dataset_info.yaml # remote dataset_info.yaml path
...
```
</details>
<details>
<summary>Option 3: use a local HF dataset file path</summary>
Point the `dataset` field at a local dataset file (`.json`, `.jsonl`, etc.).
Note: when a dataset file path is given directly, the file must already be in the standard format.
**Training config example:**
```yaml
# example_sft.yaml
...
dataset: ~/data/v1_sft_demo.jsonl # absolute path to a local dataset file
...
```
</details>
<details>
<summary>Option 4: use a local YAML config file path</summary>
Point the `dataset` field at a local `dataset_info.yaml` config file; the DataEngine loads the datasets according to that configuration.
**Training config example:**
```yaml
# example_sft.yaml
...
dataset: ~/data/dataset_info.yaml # local dataset_info.yaml path
...
```
</details>
---
## Standard Data Format
v1 uses the unified **Messages format** as its standard data format. Each sample is a JSON object containing a `messages` field.
Data in the alpaca, sharegpt, and dpo formats can be converted to the standard format automatically by the built-in `DataConverterPlugin`s. For data in other custom formats, users can implement their own `DataConverterPlugin` to standardize it; see [`DataConverterPlugin`](../dev-guide/plugins/data-plugins.md/#data-converter-plugin).
### 1. SFT (supervised fine-tuning) sample format
```json
{
  "messages": [
    {
      "role": "system",
      "content": [{"type": "text", "value": "You are a helpful assistant."}],
      "loss_weight": 0.0
    },
    {
      "role": "user",
      "content": [{"type": "text", "value": "Hello, who are you?"}],
      "loss_weight": 0.0
    },
    {
      "role": "assistant",
      "content": [{"type": "text", "value": "I am an AI assistant."}],
      "loss_weight": 1.0
    }
  ]
}
```
#### Field descriptions:
- **messages**: the message list, containing one or more turns of conversation
- **role**: the message role; one of:
  - `"system"`: the system prompt
  - `"user"`: user input
  - `"assistant"`: the model's reply
- **content**: the content list; each element contains:
  - **type**: the content type; one of:
    - `"text"`: text content
    - `"image_url"`: an image URL (multimodal)
    - `"audio_url"`: an audio URL (multimodal)
    - `"video_url"`: a video URL (multimodal)
    - `"tools"`: a tool description
    - `"tool_calls"`: a tool call
    - `"reasoning"`: a reasoning trace
  - **value**: the actual content (a string)
- **loss_weight**: the loss weight (a float; see the example after this list)
  - `0.0`: no loss is computed (used for the prompt part)
  - `1.0`: the loss is fully computed (used for the response part)
  - other values can be set to tune how strongly different parts are learned
- **_dataset_name** (optional): the dataset name, added automatically by the DataEngine
- **extra_info** (optional): an extra-information field
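For example, to down-weight an intermediate assistant turn in a multi-turn sample (the value `0.5` is illustrative):
```json
{"role": "assistant", "content": [{"type": "text", "value": "A draft answer."}], "loss_weight": 0.5}
```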
### 2. DPO (preference alignment) sample format
```json
{
  "chosen_messages": [
    {
      "role": "user",
      "content": [{"type": "text", "value": "The user's question"}],
      "loss_weight": 0.0
    },
    {
      "role": "assistant",
      "content": [{"type": "text", "value": "The better answer"}],
      "loss_weight": 1.0
    }
  ],
  "rejected_messages": [
    {
      "role": "user",
      "content": [{"type": "text", "value": "The user's question"}],
      "loss_weight": 0.0
    },
    {
      "role": "assistant",
      "content": [{"type": "text", "value": "The worse answer"}],
      "loss_weight": 1.0
    }
  ]
}
```
### 3. Multimodal support
For multimodal data, add non-text content elements to the `content` list:
```json
{
  "messages": [
    {
      "role": "user",
      "content": [
        {"type": "text", "value": "What is in this image?"},
        {"type": "image_url", "value": "path/to/image.jpg"}
      ],
      "loss_weight": 0.0
    },
    {
      "role": "assistant",
      "content": [{"type": "text", "value": "There is a cat in the image."}],
      "loss_weight": 1.0
    }
  ]
}
```
**Note**: the paths in `image_url`, `audio_url`, and `video_url` may be relative or absolute; how they are loaded is decided by the `DataLoaderPlugin`.
---
## Dataset Configuration File
### 1. dataset_info.yaml format
`dataset_info.yaml` can configure several datasets at once, fetched from the HF Hub and/or locally; by default the datasets are mixed and shuffled.
**Example config file: `data/dataset_info.yaml`**
```yaml
# Dataset 1: local file + the Alpaca converter
identity:
  file_name: ~/data/identity.json  # absolute path to a local dataset file
  converter: alpaca  # use the alpaca converter

# Dataset 2: a custom dataset directory
alpaca_en_demo:
  file_name: ~/data/alpaca_en_demo.json  # dataset file name
  converter: alpaca  # converter plugin
  size: 500  # use only 500 samples
  weight: 0.5  # dataset weight, controls how often this dataset is sampled
  split: train  # dataset split, defaults to train
  streaming: false  # whether to stream, defaults to false

# Dataset 3: load from the Hugging Face Hub
hf_dataset:
  hf_hub_url: llamafactory/v1-sft-demo  # HF repo ID
  streaming: false

# Dataset 4: already in the standard format, no converter needed
standard:
  file_name: ~/data/v1_sft_demo.jsonl  # local path to a standard-format dataset file

# Dataset 5: custom dataset with a custom converter plugin
custom_dataset:
  file_name: custom_data.json
  converter: custom_converter
  weight: 1.0
```
### 2. Configuration fields
#### Data source (exactly one of the two is required):
- **hf_hub_url** (str): the Hugging Face Hub dataset repo ID
  - example: `"llamafactory/v1-sft-demo"`
  - if specified, the dataset is loaded from the HF Hub
- **file_name** (str): a local file path
  - supported formats: `.json`, `.jsonl`, `.csv`, `.parquet`, `.arrow`, `.txt`
#### Optional fields:
- **split** (str): the dataset split, defaults to `"train"`
- **converter** (str): the data-converter name
  - available values: `"alpaca"` (more converters are being added; custom converters can also be added as data plugins)
  - if omitted, the data is assumed to already be in the standard format
- **size** (int): the number of samples to use; defaults to all
- **weight** (float): the dataset weight, used to set the sampling frequency when mixing datasets; defaults to 1.0
- **streaming** (bool): whether to stream the dataset, defaults to `False`
---
## Complete Examples
### 1. Basic usage
```python
from llamafactory.v1.config.data_args import DataArguments
from llamafactory.v1.core.data_engine import DataEngine

# Use a local dataset file
data_args = DataArguments(
    dataset="~/data/v1_sft_demo.jsonl",
    cutoff_len=2048
)

# Initialize the DataEngine
engine = DataEngine(data_args=data_args)

# Inspect the dataset
print(f"Total samples: {len(engine)}")
print(f"Datasets: {list(engine.datasets.keys())}")

# Access a single sample
sample = engine[0]
print(f"Sample keys: {sample.keys()}")
print(f"Messages: {sample['messages']}")

# Batched access
batch = engine[0:10]
print(f"Batch size: {len(batch)}")
```
### 2. Example output
**Dataset info output:**
```
Total samples: 500
Datasets: ['default']
Sample keys: dict_keys(['_dataset_name', 'messages'])
Messages: [{'role': 'user', 'content': [{'type': 'text', 'value': 'hi'}], 'loss_weight': 0.0}, {'role': 'assistant', 'content': [{'type': 'text', 'value': 'Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?'}], 'loss_weight': 1.0}]
Batch size: 10
```
**Single-sample access output:**
```python
{
    '_dataset_name': 'alpaca_en_demo',
    'messages': [
        {
            'role': 'user',
            'content': [{'type': 'text', 'value': 'What is the capital of France?'}],
            'loss_weight': 0.0
        },
        {
            'role': 'assistant',
            'content': [{'type': 'text', 'value': 'The capital of France is Paris.'}],
            'loss_weight': 1.0
        }
    ]
}
```
### 3. Mixed multi-dataset config example
**Config file: `data/mixed_datasets.yaml`**
```yaml
dataset_1:
  file_name: alpaca_en_demo.json
  converter: alpaca
  weight: 1.0
dataset_2:
  file_name: identity.json
  converter: alpaca
  weight: 2.0
dataset_3:
  hf_hub_url: llamafactory/v1-sft-demo
  weight: 1.5
```
### 4. Multimodal data example
**Data file: `data/multimodal_demo.jsonl`**
Example data after standardization:
```json
[
  {
    "messages": [
      {
        "role": "user",
        "content": [
          {"type": "text", "value": "Who are they?"},
          {"type": "image_url", "value": "mllm_demo_data/1.jpg"}
        ],
        "loss_weight": 0.0
      },
      {
        "role": "assistant",
        "content": [
          {"type": "text", "value": "They're Kane and Gretzka from Bayern Munich."}
        ],
        "loss_weight": 1.0
      },
      {
        "role": "user",
        "content": [
          {"type": "text", "value": "What are they doing?"},
          {"type": "image_url", "value": "mllm_demo_data/1.jpg"}
        ],
        "loss_weight": 0.0
      },
      {
        "role": "assistant",
        "content": [
          {"type": "text", "value": "They are celebrating on the soccer field."}
        ],
        "loss_weight": 1.0
      }
    ]
  },
  {
    "messages": [
      {
        "role": "user",
        "content": [
          {"type": "text", "value": "Who is he?"},
          {"type": "image_url", "value": "mllm_demo_data/2.jpg"}
        ],
        "loss_weight": 0.0
      },
      {
        "role": "assistant",
        "content": [
          {"type": "text", "value": "He's Thomas Muller from Bayern Munich."}
        ],
        "loss_weight": 1.0
      },
      {
        "role": "user",
        "content": [
          {"type": "text", "value": "Why is he on the ground?"}
        ],
        "loss_weight": 0.0
      },
      {
        "role": "assistant",
        "content": [
          {"type": "text", "value": "Because he's sliding on his knees to celebrate."}
        ],
        "loss_weight": 1.0
      }
    ]
  }
]
```
```python
from llamafactory.v1.config.data_args import DataArguments
from llamafactory.v1.core.data_engine import DataEngine

data_args = DataArguments(dataset="data/multimodal_demo.jsonl")
engine = DataEngine(data_args=data_args)

# Access a multimodal sample
sample = engine[0]
print("User message content:")
for content_item in sample['messages'][0]['content']:
    print(f"  type: {content_item['type']}, value: {content_item['value']}")
```
---
**Notes**:
1. All data is ultimately converted into the standard Messages format
2. The `converter` plugins add support for a variety of data formats
3. The `weight` and `size` parameters give flexible control over the data distribution
4. Local datasets and HuggingFace Hub datasets can be used together
5. Multimodal data is supported by adding elements of different types to `content`
6. For more details, see our [API REFERENCE](../dev-guide/core/data-engine.md/#data-engine)


@@ -0,0 +1,253 @@
# DataEngine
## 1. Introduction to DataEngine
`DataEngine` is the core data-processing class of LLaMA-Factory v1. It subclasses PyTorch's `Dataset` and is responsible for attaching the various plugins; all other functionality (data format conversion, data loading, and so on) is implemented as plugins and plugged into the `DataEngine`.
`DataEngine` takes a single argument, a `DataArguments` instance; all dataset metadata is passed in through that configuration.
## 2. DataEngine and DataArguments Interfaces
```python
@dataclass
class DataArguments:
    """Initialization arguments for `DataEngine`.

    args:
        dataset (str): dataset path; a remote dataset repo id / dataset_info.yaml path, or a local dataset file / dataset_info.yaml path
        cutoff_len (int): dataset cutoff length, i.e. the maximum number of samples drawn from the dataset
    """

    ...

class DataEngine(Dataset):
    """The data engine (DataEngine).

    `DataEngine` handles dataset loading and unified management, supporting:
    - loading data from local paths or the Hugging Face Hub
    - loading custom data through the plugin mechanism
    - building a unified data index
    - both streaming and non-streaming data access

    attr:
        args (DataArguments): the data-argument configuration
        datasets (dict[str, HFDataset]): mapping from dataset name to data object
        dataset_infos (dict[str, DatasetInfo]): mapping from dataset name to metadata
        data_index (list[tuple[str, int]]): the data-index list; each item is (dataset_name, sample_index)
        streaming (bool): whether this is a streaming dataset
    """

    def __init__(self, data_args: DataArguments) -> None:
        """Initialize the `DataEngine`.

        The following steps run automatically at initialization:
        1. call `get_dataset_info` to read and parse the dataset metadata from `data_args`
        2. call `load_dataset` to load the datasets according to the configuration
        3. call `build_data_index` to build the unified index list

        args:
            data_args (DataArguments): the data-argument configuration object
        """
        ...

    def get_dataset_info(self) -> None:
        """Load the dataset metadata from a config file or a remote repository.

        The data source is determined by `self.args.dataset` and supports the following options:
        - a local YAML config file path
        - a YAML config file path on the Hugging Face Hub
        - a local dataset file path
        - a Hugging Face Hub dataset repo id
        """
        ...

    def load_dataset(self) -> None:
        """Load all datasets according to the dataset metadata.

        Each dataset entry may contain the following fields:
        - `hf_hub_url`: load via `datasets.load_dataset`
        - a local data file: load via the `DataLoaderPlugin`
        - `streaming`: whether to enable streaming mode

        updates:
            self.datasets (dict): mapping from dataset name to the loaded data object
            self.streaming (bool): set to True if any dataset is in streaming mode
        """
        ...

    def build_data_index(self) -> None:
        """Build the unified data index.

        Creates a global index list `(dataset_name, sample_index)` for all datasets.
        When streaming is enabled, a fixed-length placeholder index (e.g. 1000 entries) is generated;
        otherwise every sample gets an index entry.
        The `DataIndexPlugin` can adjust the index distribution based on dataset size or weight.
        """
        ...

    def _convert_data_sample(self, raw_sample: dict[str, Any], dataset_name: str) -> Sample:
        """Convert a raw sample into the unified format.

        Based on the `converter` field in `dataset_info`, calls the matching conversion plugin
        to standardize the raw sample into the unified data structure.

        args:
            raw_sample (dict[str, Any]): the raw data sample
            dataset_name (str): the name of the dataset the sample belongs to
        return:
            Sample: the converted, standardized sample
        """
        ...

    def __len__(self) -> int:
        """Return the total number of samples in the dataset.

        return:
            int: the dataset length
            Returns `-1` for streaming datasets.
        """
        ...

    def __getitem__(self, index: Union[int, Any]) -> Union[Sample, list[Sample]]:
        """Fetch samples by index or selector.

        args:
            index (Union[int, Any]): the data index, an int or list[int]
        return:
            Union[Sample, list[Sample]]: a single sample or a list of samples
        """
        ...

    def __iter__(self) -> Iterable:
        """Return a dataset iterator.

        Used for sequential or random access over non-streaming datasets.
        Streaming mode requires asynchronous loading logic.

        return:
            Iterable: the dataset iterator.
        """
        ...

    async def __aiter__(self) -> AsyncIterable:
        """Return an asynchronous dataset iterator.

        Used for streaming datasets or asynchronous data-loading scenarios;
        allows samples to be read as a stream in an async environment.

        return:
            AsyncIterable: an async iterator that yields samples in order.
        """
        ...
```
`DataArguments` fields:
`dataset`: the dataset path, local or remote. When a local dataset file path is passed, the dataset must already be in the standard format; otherwise pass a `dataset_info.yaml` that configures the dataset's `converter` and other metadata, so the `DataEngine` knows how to process the data.
`cutoff_len`: the dataset cutoff length, i.e. the maximum number of samples for the dataset.
---
## 3. DataEngine Core Methods
### 3.1 `get_dataset_info`: load the dataset metadata
Loads the dataset configuration according to the `dataset` argument, collecting all dataset metadata such as data location, data format, and plugin configuration. This method is called automatically when `DataEngine` is instantiated.
### 3.2 `load_dataset`: load the datasets
Iterates over all data sources and loads each one accordingly. This method is called automatically when `DataEngine` is instantiated.
```python
for key, value in self.dataset_infos.items():
    split = value.get("split", "train")
    streaming = value.get("streaming", False)
    if "hf_hub_url" in value:
        # load from the HF Hub
        dataset = load_dataset(value["hf_hub_url"], split=split, streaming=streaming)
    else:
        # load local files via the DataLoaderPlugin
        dataset = DataLoaderPlugin(args=self.args).auto_load_data(value)
    self.datasets[key] = dataset
```
### 3.3 `build_data_index`: build the data index
Creates an index list `[(dataset_name, sample_index), ...]` for each dataset. The `DataIndexPlugin` is invoked here; it can control each dataset's sampling frequency and strategy. This method is called automatically when `DataEngine` is instantiated.
```python
for dataset_name, dataset in self.datasets.items():
    # create the base index
    data_index = [(dataset_name, idx) for idx in range(len(dataset))]
    # adjust the index according to size and weight
    size = self.dataset_infos[dataset_name].get("size")
    weight = self.dataset_infos[dataset_name].get("weight")
    if size or weight:
        data_index = DataIndexPlugin().adjust_data_index(data_index, size, weight)
    self.data_index.extend(data_index)
```
### 3.4 `_convert_data_sample`: standardize the data format
Converts raw data into the standard format. The `DataConverterPlugin` is invoked here; which plugin runs is determined by the `converter` metadata collected by `get_dataset_info`. If `converter` is empty, the dataset is assumed to already be in the standard format. This method is called by `DataEngine`'s `__getitem__`.
```python
def _convert_data_sample(self, raw_sample: dict, dataset_name: str) -> Sample:
    converter = self.dataset_infos[dataset_name].get("converter")
    if converter is not None:
        # use the specified converter
        from ..plugins.data_plugins.converter import get_converter
        return {"_dataset_name": dataset_name, **get_converter(converter)(raw_sample)}
    else:
        # already in the standard format
        return {"_dataset_name": dataset_name, **raw_sample}
```
---
## 4. Initialization
Initializing a `DataEngine` only requires a constructed `DataArguments`; the resulting `DataEngine` can then be used to access the data in the datasets.
```python
from llamafactory.v1.config.data_args import DataArguments
from llamafactory.v1.core.data_engine import DataEngine

# 1. create the data arguments
data_args = DataArguments(
    dataset="~/data/v1_sft_demo.jsonl",
    cutoff_len=2048
)

# 2. initialize the DataEngine
data_engine = DataEngine(data_args=data_args)

# 3. access the data
sample = data_engine[0]  # fetch the first sample
```
## 5. Data Access
An instantiated `DataEngine` supports integer indexing, list indexing, and slicing; reading data from it works just like reading from a Python list.
```python
sample = data_engine[0]  # the first sample
sample = data_engine[0:10]  # the first 10 samples
sample = data_engine[[0, 5, 10]]  # the samples at the given indices
```
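Because `DataEngine` subclasses the PyTorch `Dataset`, it can also be wrapped in a standard `DataLoader`. A minimal sketch (the identity `collate_fn` is an assumption here, since samples are message dicts rather than tensors):
```python
from torch.utils.data import DataLoader

loader = DataLoader(data_engine, batch_size=4, shuffle=True, collate_fn=lambda batch: batch)
for batch in loader:
    print(len(batch), batch[0]["messages"][0]["role"])
    break
```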


@@ -0,0 +1 @@
# ModelEngine


@@ -0,0 +1 @@
# Trainer


@@ -0,0 +1,467 @@
# Data Plugins
## 1. Data Plugins Overview
## DataConverterPlugin
### 1. Introduction to DataConverterPlugin
A DataConverter converts datasets in non-standard formats into v1's standard Messages format, so users can keep using existing datasets (such as Alpaca-format ones) without converting them by hand. For datasets in custom formats, users can build a matching custom DataConverter plugin to standardize the format.
Currently, LLaMA-Factory ships with a built-in `Alpaca Converter` and `Pair Converter`; these two kinds of datasets can be standardized directly with the corresponding converter, with no custom converter required.
### 2. The Alpaca Converter in Detail
#### 2.1 Alpaca format
The Alpaca format is a common instruction-tuning data format:
```json
{
  "system": "You are a helpful assistant.",
  "instruction": "Describe a process of making crepes.",
  "input": "",
  "output": "Making crepes is an easy and delicious process..."
}
```
#### 2.2 Alpaca Converter interface
```python
class AlpacaSample(TypedDict, total=False):
    """Structure of an Alpaca-format data sample.

    attr:
        system (str, optional): the system prompt, used to set the conversation background or model behavior.
        instruction (str, optional): the user instruction, usually a task description.
        input (str, optional): additional input text, which can be concatenated with the instruction.
        output (str, optional): the expected model response.
    """
    ...

def alpaca_converter(raw_sample: AlpacaSample) -> SFTSample:
    """Convert an Alpaca sample into the standard SFT (Supervised Fine-Tuning) sample format.

    `alpaca_converter` turns one sample from an Alpaca dataset into the generic `SFTSample` format,
    which is used for supervised fine-tuning (SFT) or multi-turn conversation modeling.

    Conversion logic:
    - if a `system` field exists, emit a system message (loss_weight = 0.0)
    - if an `instruction` or `input` field exists, merge them into one user message (loss_weight = 0.0)
    - if an `output` field exists, emit an assistant reply message (loss_weight = 1.0)

    args:
        raw_sample (AlpacaSample): the raw Alpaca data sample
    return:
        SFTSample: the converted, standardized sample, shaped as follows:
            {
                "messages": [
                    {"role": "system", "content": [{"type": "text", "value": "..."}], "loss_weight": 0.0},
                    {"role": "user", "content": [{"type": "text", "value": "..."}], "loss_weight": 0.0},
                    {"role": "assistant", "content": [{"type": "text", "value": "..."}], "loss_weight": 1.0},
                ]
            }
    example:
        >>> raw = {"instruction": "请将以下句子翻译成英文:", "input": "你好", "output": "Hello"}
        >>> alpaca_converter(raw)
        {
            "messages": [
                {"role": "user", "content": [{"type": "text", "value": "请将以下句子翻译成英文:你好"}], "loss_weight": 0.0},
                {"role": "assistant", "content": [{"type": "text", "value": "Hello"}], "loss_weight": 1.0}
            ]
        }
    """
```
#### 2.3 Conversion process
The `alpaca_converter` function converts the Alpaca format into the standard format with the following logic:
```python
def alpaca_converter(raw_sample: AlpacaSample) -> SFTSample:
    messages = []
    # 1. Add the system prompt (if present)
    if "system" in raw_sample:
        messages.append({
            "role": "system",
            "content": [{"type": "text", "value": raw_sample["system"]}],
            "loss_weight": 0.0
        })
    # 2. Add the user input (instruction + input)
    if "instruction" in raw_sample or "input" in raw_sample:
        user_content = raw_sample.get("instruction", "") + raw_sample.get("input", "")
        messages.append({
            "role": "user",
            "content": [{"type": "text", "value": user_content}],
            "loss_weight": 0.0
        })
    # 3. Add the model response
    if "output" in raw_sample:
        messages.append({
            "role": "assistant",
            "content": [{"type": "text", "value": raw_sample["output"]}],
            "loss_weight": 1.0
        })
    return {"messages": messages}
```
#### 2.4 Conversion example
**Input (Alpaca format):**
```json
{
  "instruction": "What is the capital of France?",
  "input": "",
  "output": "The capital of France is Paris."
}
```
**Output (standard format):**
```json
{
  "messages": [
    {
      "role": "user",
      "content": [{"type": "text", "value": "What is the capital of France?"}],
      "loss_weight": 0.0
    },
    {
      "role": "assistant",
      "content": [{"type": "text", "value": "The capital of France is Paris."}],
      "loss_weight": 1.0
    }
  ]
}
```
### 3. Custom converters
#### 3.1 Creating a custom converter
If you have your own data format, you can easily add a custom converter to standardize it; the implementation can follow this example:
```python
# src/llamafactory/v1/plugins/data_plugins/converter.py
from typing import TypedDict, NotRequired
from ...extras.types import SFTSample

# 1. Define the type of the input format
class MyCustomSample(TypedDict, total=False):
    question: str
    answer: str
    context: NotRequired[str]

# 2. Implement the conversion logic
def custom_converter(raw_sample: MyCustomSample) -> SFTSample:
    messages = []
    # Build the user message
    user_text = raw_sample["question"]
    if "context" in raw_sample:
        user_text = f"Context: {raw_sample['context']}\n\nQuestion: {user_text}"
    messages.append({
        "role": "user",
        "content": [{"type": "text", "value": user_text}],
        "loss_weight": 0.0
    })
    # Build the assistant message
    messages.append({
        "role": "assistant",
        "content": [{"type": "text", "value": raw_sample["answer"]}],
        "loss_weight": 1.0
    })
    return {"messages": messages}

# 3. Register custom_converter
# src/llamafactory/v1/plugins/data_plugins/converter.py: CONVERTERS
CONVERTERS = {
    "alpaca": alpaca_converter,
    "custom": custom_converter,  # add the custom converter
}
```
#### 3.2 Using the custom converter
Specify the converter name in the YAML config:
```yaml
my_dataset:
  file_name: custom_data.json
  converter: custom
```
---
## DataLoaderPlugin
### 1. Introduction to DataLoaderPlugin
The `DataLoaderPlugin` loads datasets from local files and currently supports the following file formats:
- **JSON**: `.json`
- **JSONL**: `.jsonl`
- **CSV**: `.csv`
- **Parquet**: `.parquet`
- **Arrow**: `.arrow`
- **Text**: `.txt`
### 2. DataLoaderPlugin interface
```python
@dataclass
class DataLoaderPlugin:
    """The data-loading plugin (DataLoaderPlugin).

    Automatically loads local or remote datasets based on the dataset metadata (`DatasetInfo`).
    Supports several file formats (CSV, JSON, Parquet, Text, Arrow) and optional streaming loading.
    Usually invoked by the `DataEngine` to encapsulate the data-loading logic in one place.
    """

    args: DataArguments
    """The data-arguments object, containing the data directory, cache path, sharding, and other settings."""

    def _get_builder_name(self, path: str) -> Literal["arrow", "csv", "json", "parquet", "text"]:
        """Determine the dataset file format.

        Infers which HuggingFace `load_dataset` builder to use from the input file path,
        based on the file extension, e.g. `.csv`, `.jsonl`, `.parquet`, `.txt`.

        args:
            path (str): the dataset file path, used to identify the file type.
        return:
            Literal["arrow", "csv", "json", "parquet", "text"]:
                the builder name to pass to `datasets.load_dataset()`.
        example:
            >>> _get_builder_name("data/train.jsonl")
            "json"
        """
        ...

    def auto_load_data(self, dataset_info: DatasetInfo) -> HFDataset:
        """Automatically pick the right loading strategy for the given `dataset_info`.

        args:
            dataset_info (DatasetInfo): the dataset metadata, usually containing:
                - `file_name`: the data file path
                - `split`: the data split (e.g. "train", "test")
                - `streaming`: whether to enable streaming loading
        return:
            HFDataset: the loaded Hugging Face dataset object.
        example:
            >>> plugin = DataLoaderPlugin(args)
            >>> ds = plugin.auto_load_data({"file_name": "~/data.json", "split": "train"})
        """
        ...

    def load_data_from_file(self, filepath: str, split: str, streaming: bool) -> HFDataset:
        """Load a dataset from a file or directory.

        Automatically identifies the file type (CSV, JSON, Parquet, Text, etc.) from the input path
        and loads the dataset via `datasets.load_dataset()`.
        If `streaming=True`, the result is converted into an iterable dataset.

        args:
            filepath (str): a file or directory path.
            split (str): the split name (e.g. "train", "validation").
            streaming (bool): whether to enable streaming mode.
        return:
            HFDataset: the loaded dataset object.
        example:
            >>> plugin.load_data_from_file("data/train.json", "train", False)
        """
        ...
```
---
## DataIndexPlugin
### 1. Introduction to DataIndexPlugin
The `DataIndexPlugin` adjusts the data index; the `size` and `weight` settings control the number of samples and the sampling frequency of each dataset.
- Use the `size` parameter to cap the number of samples used:
```yaml
my_dataset:
  file_name: large_dataset.json
  size: 1000  # use only the first 1000 samples
```
- Use the `weight` parameter to adjust how often a dataset is sampled within a mixture:
```yaml
dataset_a:
  file_name: data_a.json
  weight: 1.0
dataset_b:
  file_name: data_b.json
  weight: 2.0  # samples from dataset_b appear twice as often as those from dataset_a
```
**Note**: the `weight` parameter is meant for adjusting the sampling frequency of different datasets when training on a mixture:
- with `weight=1.0`, the dataset is sampled at its original proportion
- with `weight=2.0`, the dataset's index entries are duplicated 2x, doubling how often its samples appear
For instance (sizes are illustrative): if `dataset_a` has 100 samples at `weight: 1.0` and `dataset_b` has 50 samples at `weight: 2.0`, the merged index holds about 100 entries for each, so both contribute roughly equally per epoch.
### 2. DataIndexPlugin interface
```python
@dataclass
class DataIndexPlugin:
    """The data-index plugin (DataIndexPlugin).

    Adjusts the data-index list according to `size` and `weight`, controlling sample counts and sampling frequency.
    Typically used when training on a mixture of datasets, to control each dataset's share of the total samples.
    Invoked automatically inside `DataEngine.build_data_index` to implement resampling or weighted distribution.
    """

    def adjust_data_index(
        self, data_index: list[tuple[str, int]], size: Optional[int], weight: Optional[float]
    ) -> list[tuple[str, int]]:
        """Adjust the data-index list.

        Samples, extends, or shrinks the input data index according to `size` and/or `weight`.
        If both parameters are given, the size-based and weight-based adjustments run in turn.

        args:
            data_index (list[tuple[str, int]]):
                the data-index list; each element is `(dataset_name, sample_index)`.
            size (Optional[int]):
                the target sample count; if given, samples are truncated or repeated to match it.
            weight (Optional[float]):
                the dataset weight, controlling the dataset's sampling share in a mixture.
        return:
            list[tuple[str, int]]:
                the adjusted data-index list.
        example:
            >>> plugin = DataIndexPlugin()
            >>> adjusted = plugin.adjust_data_index([("ds1", i) for i in range(100)], size=50, weight=None)
            >>> len(adjusted)
            50
        """
        ...

    def adjust_by_size(self, data_index: list[tuple[str, int]], size: int) -> list[tuple[str, int]]:
        """Adjust the data index to a target size.

        Truncates or repeats samples so that the total index count equals `size`.
        Commonly used to equalize the sample counts of different datasets.

        args:
            data_index (list[tuple[str, int]]):
                the original data-index list.
            size (int):
                the target sample count.
        return:
            list[tuple[str, int]]:
                an adjusted data-index list of length `size`.
        example:
            >>> plugin.adjust_by_size([("ds1", i) for i in range(10)], 20)
        """
        ...

    def adjust_by_weight(self, data_index: list[tuple[str, int]], weight: float) -> list[tuple[str, int]]:
        """Adjust the data index by weight.

        Uses weighted sampling or sample repetition so that the dataset's samples appear at the given weight.
        Commonly used to balance samples proportionally when training on multiple data sources.

        args:
            data_index (list[tuple[str, int]]):
                the original data-index list.
            weight (float):
                the dataset weight (a relative share; normalized jointly with the other datasets).
        return:
            list[tuple[str, int]]:
                the adjusted, weighted data-index list.
        example:
            >>> plugin.adjust_by_weight([("ds1", i) for i in range(10)], 0.5)
        """
        ...
```
---
## DataSelectorPlugin
### 1. Introduction to DataSelectorPlugin
The `DataSelectorPlugin` provides index-based data access for the `DataEngine`; it is invoked automatically by `DataEngine`'s `__getitem__` method.
### 2. DataSelectorPlugin interface
```python
@dataclass
class DataSelectorPlugin:
    """Select dataset samples by index.

    Used together with `DataEngine`: the unified `data_index` structure (dataset name plus sample index)
    enables flexible data selection.
    """

    data_index: list[tuple[str, int]]
    """The data-index list; each element is (dataset_name, sample_index)."""

    def select(self, index: Union[slice, list[int], Any]) -> Union[tuple[str, int], list[tuple[str, int]]]:
        """Select dataset samples.

        Picks the matching sample indices from `data_index` based on the input type.
        Three kinds of indices are supported:
        - a slice: returns the samples in the given range
        - an index list (list[int]): returns the samples at the given positions
        - any other input type raises an exception.

        args:
            index (Union[slice, list[int], Any]): the sample index,
                either a slice (`slice`) or an index list
        return:
            Union[tuple[str, int], list[tuple[str, int]]]:
                - for a single index: one `(dataset_name, sample_index)` tuple
                - for multiple indices or a slice: a list of samples
        raises:
            ValueError: raised when the input index type is unsupported.
        """
        ...
```
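A minimal standalone usage sketch (the index contents are illustrative; inside `DataEngine` the plugin receives the engine's own `data_index`):
```python
plugin = DataSelectorPlugin(data_index=[("ds1", i) for i in range(100)])
print(plugin.select(slice(0, 3)))  # [("ds1", 0), ("ds1", 1), ("ds1", 2)]
print(plugin.select([0, 5, 10]))   # [("ds1", 0), ("ds1", 5), ("ds1", 10)]
```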


@@ -0,0 +1,197 @@
# Kernels plugins
## Overview
LLaMA-Factory provides high-performance compute kernel implementations for different hardware devices through the Kernels plugin system. The system manages all kernels through a registry: the `@register_kernel` decorator registers a kernel automatically once it is defined, the `apply_kernel` function enables a specific kernel, and `apply_default_kernels` enables every default kernel in the registry that is available in the current environment.
## Architecture
### Core Components
#### 1. Registry
`Registry` is a static class that manages all kernel implementations. It maintains a dictionary of the form `{kernel_id: KernelClass}`:
```python
# Example registry structure
{
    "npu_fused_rmsnorm": NpuRMSNormKernel,
    "npu_fused_swiglu": NpuSwiGluKernel,
    ...
}
```
#### 2. register_kernel (decorator)
`@register_kernel` is an alias of `Registry.register`. Every kernel class should be registered with this decorator.
**Registration mechanism**:
- the decorator checks that the class inherits from `BaseKernel`.
- it checks that the class defines the `_kernel_id` and `_device` attributes.
- it checks whether `_device` matches the accelerator type of the current runtime environment; if not, registration is skipped.
- if everything checks out, the kernel class is registered in the global registry.
#### 3. BaseKernel (base class)
Every kernel implementation must inherit from the abstract `BaseKernel` base class, which defines a kernel's basic attributes and interface.
#### 4. Identification
**Kernel ID** (`_kernel_id`)
Every kernel must have a unique string identifier, e.g. `"npu_fused_rmsnorm"`.
**Device Type** (`_device`)
A kernel must declare the device type it supports, e.g. `DeviceType.NPU` or `DeviceType.CUDA`.
## Kernel System API
### **Registry**: the global kernel registry
The `Registry` class provides the interfaces for registering and retrieving kernels:
```python
class Registry:
    @classmethod
    def register(cls, kernel_cls: type[BaseKernel]) -> type[BaseKernel] | None:
        """Register a kernel class."""
        ...

    @classmethod
    def get(cls, kernel_id: str) -> type[BaseKernel] | None:
        """Look up a kernel class by its ID."""
        ...
```
### **BaseKernel**
`BaseKernel` defines the protocol every kernel must implement:
- `_kernel_id`: class attribute; the kernel's unique identifier.
- `_device`: class attribute; the device type the kernel supports.
- `check_deps()`: class method; checks whether the kernel's dependencies are satisfied (e.g. whether `torch_npu` is installed).
- `apply(**kwargs)`: abstract class method; implements the kernel's actual application logic.
```python
class BaseKernel(ABC):
    _kernel_id: Any = ""
    _device: DeviceType = DeviceType.CPU

    @classmethod
    def check_deps(cls) -> bool:
        """Check the dependencies."""
        ...

    @classmethod
    @abstractmethod
    def apply(cls, **kwargs) -> HFModel:
        """Apply the kernel to the model."""
        ...
```
### **scan_all_kernels**
The `scan_all_kernels` function automatically scans every `.py` file under the `ops` directory and imports it, triggering the `@register_kernel` decorator to complete automatic registration.
### **apply_kernel**
Enables the specified kernel on a model.
```python
def apply_kernel(kernel_id: str, **kwargs) -> HFModel:
    """Apply the specified kernel to the model.

    Args:
        kernel_id: the ID of the target kernel
        **kwargs: arguments forwarded to kernel.apply, usually including model
    """
```
**Usage example**:
```python
from llamafactory.v1.plugins.model_plugins.kernels import apply_kernel
model = apply_kernel("npu_fused_rmsnorm", model=model)
```
### **apply_default_kernels**
Enables all default registered kernels on the model. This is a high-level API, usually invoked automatically during the model-loading flow.
```python
def apply_default_kernels(model: HFModel, include_kernels: str = None) -> HFModel:
    """Apply all default kernels.

    Args:
        model: an HFModel instance
        include_kernels: the kernel IDs to include (a comma-separated string), or "auto"/True for all
    """
```
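**Usage example** (a sketch mirroring `apply_kernel`; the specific kernel IDs are assumptions based on the NPU defaults):
```python
from llamafactory.v1.plugins.model_plugins.kernels import apply_default_kernels

# Enable every default kernel available in the current environment ...
model = apply_default_kernels(model, include_kernels="auto")
# ... or restrict enabling to an explicit comma-separated subset.
model = apply_default_kernels(model, include_kernels="npu_fused_rmsnorm,npu_fused_rope")
```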
## Extending Kernels
If you have a kernel for a specific model or device, you can implement it and plug it into LLaMA-Factory with the following steps.
### Steps to create a new kernel
#### 1. Create the kernel implementation file
Create a new kernel implementation file in the matching subdirectory under `src/llamafactory/v1/plugins/model_plugins/kernels/ops`, e.g. `mlp/cuda_swiglu.py`:
```python
import torch

from ......accelerator.helper import DeviceType
from ......utils.types import HFModel
from ...base import BaseKernel
from ...registry import register_kernel

# Implement the actual kernel function
def _cuda_swiglu_forward(self, hidden_state):
    # ... CUDA-optimized implementation ...
    pass

@register_kernel
class CudaSwiGluKernel(BaseKernel):
    _kernel_id = "cuda_fused_swiglu"
    _device = DeviceType.CUDA

    @classmethod
    def apply(cls, **kwargs) -> HFModel:
        model = kwargs.get("model")
        if model is None:
            raise ValueError("model is required")
        if not cls.check_deps():
            raise RuntimeError("Dependencies not met")
        # Walk the model and replace the forward methods
        for name, module in model.named_modules():
            # ... matching and replacement logic ...
            pass
        return model
```
#### 2. Automatic discovery
Because `scan_all_kernels` scans the `ops` directory automatically, any file placed there that is free of syntax errors is imported and registered at startup; no manual changes to the registry code are needed.
#### 3. Test the kernel
Create a test case to verify the kernel's correctness:
```python
from llamafactory.v1.plugins.model_plugins.kernels import apply_kernel

# ... load the model ...
model = apply_kernel("cuda_fused_swiglu", model=model)
# ... verify that the forward has been replaced ...
```
## Error Handling
### Unavailable dependencies
By default, `BaseKernel.check_deps()` checks whether the current device type matches. Subclasses can override it to add extra dependency checks (e.g. whether a specific library is installed). If `check_deps()` returns `False`, the `apply()` method should raise an exception or handle the situation accordingly.
### Kernel ID not found
Calling `apply_kernel` with a `kernel_id` that does not exist raises a `ValueError`.


@@ -0,0 +1,71 @@
# Getting Started
## Training Methods
| Method | Full-Parameter | Partial-Parameter (Freeze) | LoRA | QLoRA |
|:---------------------:| ------------------ | ------------------ | ------------------ | ------------------ |
| Supervised fine-tuning | :white_check_mark: | | | |
| Reward model training | | | | |
| DPO training | | | | |
## Dependencies
| Required | Minimum | Recommended |
|:---------------------:|--------|--------|
| python | 3.11 | 3.12 |
| torch | 2.7.1 | 2.7.1 |
| torch-npu (Ascend NPU) | 2.7.1 | 2.7.1 |
| torchvision | 0.22.1 | 0.22.1 |
| transformers | 5.0.0 | 5.0.0 |
| datasets | 3.2.0 | 4.0.0 |
| peft | 0.18.1 | 0.18.1 |

| Optional | Minimum | Recommended |
|:----------------:|--------|--------|
| CUDA (NVIDIA GPU) | 11.6 | 12.2 |
| deepspeed | 0.18.4 | 0.18.4 |
| flash-attn (NVIDIA GPU) | 2.5.6 | 2.7.2 |
## How to Use
### Install LLaMA Factory
> [!IMPORTANT]
> This step is required.
#### Install from source
```bash
git clone --depth 1 https://github.com/hiyouga/LlamaFactory.git
cd LlamaFactory
pip install -e .
```
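To check that the installation succeeded (assuming the CLI was installed into the current environment):
```bash
llamafactory-cli version
```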
### Data Preparation
For the dataset file format, see [data-preparation/README.md](data-preparation/README.md). You can use datasets from HuggingFace / ModelScope or load local datasets.
> [!NOTE]
> When using a custom dataset or a custom dataset format, configure it as described in [data-preparation/README.md](data-preparation/README.md); if necessary, re-implement the data-processing logic for the custom dataset, including the corresponding `converter`.
You can also use **[Easy Dataset](https://github.com/ConardLi/easy-dataset)**, **[DataFlow](https://github.com/OpenDCAI/DataFlow)**, and **[GraphGen](https://github.com/open-sciencelab/GraphGen)** to build synthetic data for fine-tuning.
### Quick Start
The commands below run full-parameter **fine-tuning** of the Qwen3-0.6B model with FSDP2; the two commands are equivalent.
```bash
export USE_V1=1
llamafactory-cli sft examples/v1/train_full/train_full_fsdp2.yaml
llamafactory-cli train examples/v1/train_full/train_full_fsdp2.yaml
```
For advanced usage, including multi-GPU and multi-node fine-tuning, distributed training, LoRA, quantization, and various acceleration features, see [advanced](./advanced/README.md).


@@ -0,0 +1 @@
# Data Argument

62
docs/zh/index.rst Normal file

@@ -0,0 +1,62 @@
LlamaFactory Docs
=================
.. toctree::
:maxdepth: 1
:caption: Getting Started
getting-started
installation
llamaboard-web-ui
.. toctree::
:maxdepth: 1
:caption: Data Preparation
data-preparation/data-processing
.. toctree::
:maxdepth: 1
:caption: Training
training/sft
training/dpo
.. toctree::
:maxdepth: 1
:caption: Inference
inference/deploy
.. toctree::
:maxdepth: 1
:caption: Advanced
advanced/lora-and-quantization/lora
advanced/lora-and-quantization/quantization
advanced/distributed/fsdp
advanced/distributed/deepspeed
advanced/distributed/parallel-dp-tp-ep-sp-cp
advanced/custom-kernels/triton
advanced/custom-kernels/fused-operators
.. toctree::
:maxdepth: 1
:caption: Hyperparameters
hyperparameters/data-argument
hyperparameters/model-argument
hyperparameters/sample-argument
hyperparameters/training-argument
.. toctree::
:maxdepth: 1
:caption: Dev Guide
dev-guide/core/data-engine
dev-guide/core/model-engine
dev-guide/core/trainer
dev-guide/plugins/data-plugins
dev-guide/plugins/model-plugins/initialization
dev-guide/plugins/model-plugins/kernels
dev-guide/plugins/model-plugins/rendering


@@ -0,0 +1 @@
# Deploy

1
docs/zh/installation.md Normal file

@@ -0,0 +1 @@
# Installation


@@ -0,0 +1 @@
# LlamaBoard Web UI

1
docs/zh/training/dpo.md Normal file

@@ -0,0 +1 @@
# DPO

1
docs/zh/training/sft.md Normal file

@@ -0,0 +1 @@
# SFT


@@ -0,0 +1,45 @@
### model
model_name_or_path: models/Llama-2-7b
trust_remote_code: true
### method
stage: sft
do_train: true
finetuning_type: full
deepspeed: examples/deepspeed/ds_z0_config.json
use_asft_loss: true
asft_alpha: 0.1
### dataset
dataset: med
template: llama2
cutoff_len: 2048
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4
### output
output_dir: saves/llama2-7b/full/asft2
logging_steps: 1
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size: 4
gradient_accumulation_steps: 8
learning_rate: 2.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500


@@ -0,0 +1,45 @@
### model
model_name_or_path: models/Qwen2.5-7B
trust_remote_code: true
### method
stage: sft
do_train: true
finetuning_type: full
deepspeed: examples/deepspeed/ds_z0_config.json
use_asft_loss: true
asft_alpha: 0.05
### dataset
dataset: math
template: qwen
cutoff_len: 2048
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4
### output
output_dir: saves/qwen2-7b/full/asft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size: 4
gradient_accumulation_steps: 8
learning_rate: 5.0e-5
num_train_epochs: 1.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500


@@ -0,0 +1,38 @@
### model
model_name_or_path: Qwen/Qwen2.5-0.5B-Instruct
trust_remote_code: true
### method
stage: sft
do_train: true
finetuning_type: full
use_eaft_loss: true
### dataset
dataset: identity,alpaca_en_demo
template: qwen
cutoff_len: 2048
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4
### output
output_dir: qwen2.5-0_5b/full/sft_eaft
logging_steps: 1
save_steps: 500
plot_loss: true
overwrite_output_dir: true
save_only_model: false
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000


@@ -5,6 +5,6 @@ infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransforme
 trust_remote_code: true
 use_kt: true # use KTransformers as LoRA sft backend to inference
-kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
+kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
 cpu_infer: 32
 chunk_size: 8192


@@ -1,9 +1,9 @@
 model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
-template: deepseek
+template: deepseek3
 infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
 trust_remote_code: true
 use_kt: true # use KTransformers as LoRA sft backend to inference
-kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
+kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
 cpu_infer: 32
 chunk_size: 8192


@@ -1,10 +1,10 @@
 model_name_or_path: opensourcerelease/DeepSeek-V3-bf16
 adapter_name_or_path: saves/Kllama_deepseekV3
-template: deepseek
+template: deepseek3
 infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
 trust_remote_code: true
 use_kt: true # use KTransformers as LoRA sft backend to inference
-kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
+kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
 cpu_infer: 32
 chunk_size: 8192


@@ -5,6 +5,6 @@ infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransforme
 trust_remote_code: true
 use_kt: true # use KTransformers as LoRA sft backend to inference
-kt_optimize_rule: examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
+kt_optimize_rule: examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
 cpu_infer: 32
 chunk_size: 8192


@@ -10,7 +10,7 @@ lora_rank: 8
 lora_target: all
 ### dataset
-dataset: identity
+dataset: identity, alpaca_en_demo
 template: deepseek
 cutoff_len: 2048
 max_samples: 100000
@@ -40,7 +40,7 @@ resume_from_checkpoint: null
 ### ktransformers
 use_kt: true # use KTransformers as LoRA sft backend
-kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
+kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V2-Lite-Chat-sft-amx.yaml
 cpu_infer: 32
 chunk_size: 8192


@@ -10,8 +10,8 @@ lora_rank: 8
 lora_target: all
 ### dataset
-dataset: identity
-template: deepseek
+dataset: identity, alpaca_en_demo
+template: deepseek3
 cutoff_len: 2048
 max_samples: 100000
 overwrite_cache: true
@@ -40,7 +40,7 @@ resume_from_checkpoint: null
 ### ktransformers
 use_kt: true # use KTransformers as LoRA sft backend
-kt_optimize_rule: examples/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
+kt_optimize_rule: examples/ktransformers/kt_optimize_rules/DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
 cpu_infer: 32
 chunk_size: 8192


@@ -40,7 +40,7 @@ resume_from_checkpoint: null
 ### ktransformers
 use_kt: true # use KTransformers as LoRA sft backend
-kt_optimize_rule: examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
+kt_optimize_rule: examples/ktransformers/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
 cpu_infer: 32
 chunk_size: 8192


@@ -1 +0,0 @@
liger-kernel>=0.5.5


@@ -0,0 +1,38 @@
model: Qwen/Qwen3-4B
trust_remote_code: true
model_class: llm
template: qwen3_nothink

# Freeze Configuration
peft_config:
  name: freeze
  freeze_trainable_layers: 2  # Train the last 2 layers
  freeze_trainable_modules: all  # In these layers, train specific modules
  freeze_extra_modules: null  # Extra modules to train (e.g. embed_tokens, lm_head)

# Kernel Config
kernel_config:
  name: auto
  include_kernels: auto

# FSDP Config
dist_config:
  name: fsdp2
  dcp_path: null

### data
train_dataset: data/v1_sft_demo.yaml

### training
output_dir: ./outputs/test_freeze
micro_batch_size: 1
global_batch_size: 4
cutoff_len: 2048
learning_rate: 2.0e-5
bf16: false
max_steps: 10

### sample
sample_backend: hf
max_new_tokens: 128


@@ -0,0 +1,25 @@
model: Qwen/Qwen3-0.6B
model_class: llm
template: qwen3_nothink

kernel_config:
  name: auto
  include_kernels: auto
dist_config:
  name: deepspeed
  config_file: examples/deepspeed/ds_z3_config.json

### data
train_dataset: data/v1_sft_demo.yaml

### training
output_dir: outputs/Qwen3-0.6B-deepspeed
micro_batch_size: 1
cutoff_len: 2048
learning_rate: 1.0e-4
bf16: true
max_steps: 10


@@ -0,0 +1,34 @@
model: Qwen/Qwen3-0.6B
trust_remote_code: true
model_class: llm
template: qwen3_nothink

kernel_config:
  name: auto
  include_kernels: auto  # choice: null/true/false/auto/kernel_id1,kernel_id2,kernel_id3, default is null
quant_config: null
dist_config:
  name: fsdp2
  dcp_path: null  # /mnt/f/pretrain_models/Qwen3-0.6B-dcp
init_config:
  name: init_on_meta

### data
train_dataset: data/v1_sft_demo.yaml

### training
output_dir: outputs/test_fsdp2
micro_batch_size: 1
global_batch_size: 1
cutoff_len: 2048
learning_rate: 1.0e-4
bf16: false
max_steps: 10

### sample
sample_backend: hf
max_new_tokens: 128


@@ -0,0 +1,7 @@
model: Qwen/Qwen3-4B
peft_config:
  name: lora
  adapter_name_or_path: ./outputs/test_lora
export_dir: ./merge_lora_model
export_size: 5
infer_dtype: auto


@@ -0,0 +1,39 @@
model: Qwen/Qwen3-4B
trust_remote_code: true
model_class: llm
template: qwen3_nothink

# PEFT Configuration
peft_config:
  name: lora
  r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  target_modules: all

# Kernel Config
kernel_config:
  name: auto
  include_kernels: auto

# FSDP Config
dist_config:
  name: fsdp2
  dcp_path: null

### data
train_dataset: data/v1_sft_demo.yaml

### training
output_dir: ./outputs/test_lora
micro_batch_size: 1
global_batch_size: 4
cutoff_len: 2048
learning_rate: 1.0e-4
bf16: true
max_steps: 10

### sample
sample_backend: hf
max_new_tokens: 128


@@ -0,0 +1,43 @@
model: Qwen/Qwen3-0.6B
trust_remote_code: true
model_class: llm
template: qwen3_nothink

# PEFT Configuration
peft_config:
  name: lora
  r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  target_modules: all

# Kernel Config
kernel_config:
  name: auto
  include_kernels: auto

# FSDP Config
dist_config:
  name: fsdp2
  dcp_path: null

# Quantization Config
quant_config:
  name: bnb  # choice: auto/bnb; if auto is selected, the quantization method is picked automatically based on the model and environment.
  quantization_bit: 4  # choice: 8/4 (bnb)

### data
train_dataset: data/v1_sft_demo.yaml

### training
output_dir: outputs/test_quantization
micro_batch_size: 1
cutoff_len: 2048
learning_rate: 1.0e-4
bf16: false
max_steps: 10

### sample
sample_backend: hf
max_new_tokens: 128


@@ -30,7 +30,6 @@ classifiers = [
"License :: OSI Approved :: Apache Software License", "License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent", "Operating System :: OS Independent",
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.13",
@@ -41,10 +40,10 @@ dependencies = [
"torch>=2.4.0", "torch>=2.4.0",
"torchvision>=0.19.0", "torchvision>=0.19.0",
"torchaudio>=2.4.0", "torchaudio>=2.4.0",
"transformers>=4.51.0,<=4.57.1,!=4.52.0,!=4.57.0", "transformers>=4.51.0,<=5.0.0,!=4.52.0,!=4.57.0",
"datasets>=2.16.0,<=4.0.0", "datasets>=2.16.0,<=4.0.0",
"accelerate>=1.3.0,<=1.11.0", "accelerate>=1.3.0,<=1.11.0",
"peft>=0.14.0,<=0.17.1", "peft>=0.18.0,<=0.18.1",
"trl>=0.18.0,<=0.24.0", "trl>=0.18.0,<=0.24.0",
"torchdata>=0.10.0,<=0.11.0", "torchdata>=0.10.0,<=0.11.0",
# gui # gui
@@ -63,7 +62,7 @@ dependencies = [
"hf-transfer", "hf-transfer",
"safetensors", "safetensors",
# python # python
"av", "av>=10.0.0,<=16.0.0",
"fire", "fire",
"omegaconf", "omegaconf",
"packaging", "packaging",
@@ -73,14 +72,9 @@ dependencies = [
# api # api
"uvicorn", "uvicorn",
"fastapi", "fastapi",
"sse-starlette" "sse-starlette",
] ]
[project.optional-dependencies]
dev = ["pre-commit", "ruff", "pytest", "build"]
metrics = ["nltk", "jieba", "rouge-chinese"]
deepspeed = ["deepspeed>=0.10.0,<=0.16.9"]
[project.scripts] [project.scripts]
llamafactory-cli = "llamafactory.cli:main" llamafactory-cli = "llamafactory.cli:main"
lmf = "llamafactory.cli:main" lmf = "llamafactory.cli:main"

Some files were not shown because too many files have changed in this diff.