diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cbe361ee..f2df50e8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
       - id: check-ast
       - id: check-added-large-files
@@ -15,13 +15,13 @@ repos:
         args: ['--branch', 'main']

   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.17.0
+    rev: v3.20.0
    hooks:
      - id: pyupgrade
-        args: [--py38-plus]
+        args: [--py39-plus]

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.9
+    rev: v0.13.2
     hooks:
       - id: ruff
         args: [--fix]
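Raising pyupgrade from `--py38-plus` to `--py39-plus` lets the hook rewrite Python 3.8-era typing imports into PEP 585 builtin generics. A minimal before/after sketch of what the hook now does (illustrative code, not from this repo):

```python
# Illustrative input/output for `pyupgrade --py39-plus`.

# Before: typing aliases, required on Python 3.8.
from typing import Dict, List


def shard_sizes(shards: List[str]) -> Dict[str, int]:
    return {name: len(name) for name in shards}


# After: PEP 585 builtin generics, valid since Python 3.9.
def shard_sizes_39(shards: list[str]) -> dict[str, int]:
    return {name: len(name) for name in shards}
```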
diff --git a/README.md b/README.md
index 542b27dc..92f24fe7 100644
--- a/README.md
+++ b/README.md
@@ -262,68 +262,70 @@ Choose your path:

 ## Supported Models

-| Model | Model size | Template |
-| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
-| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
-| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
-| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
-| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
-| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
-| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
-| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink/ernie |
-| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
-| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
-| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
-| [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
-| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
-| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
-| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
-| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
-| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
-| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
-| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
-| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
-| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
-| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
-| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
-| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
-| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
-| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
-| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
-| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
-| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
-| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
-| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
-| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
-| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
-| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
-| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
-| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
-| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
-| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
-| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
-| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
-| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
-| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
-| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
-| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
-| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
-| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
-| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
-| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
-| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
-| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
-| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
-| [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
-| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
-| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
-| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
-| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
-| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
-| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
-| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
-| [LING-V2 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| Model | Model size | Template |
+| ----------------------------------------------------------------- | -------------------------------- | -------------------- |
+| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
+| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
+| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
+| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
+| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
+| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
+| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
+| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
+| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
+| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
+| [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
+| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
+| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
+| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
+| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
+| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
+| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
+| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
+| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
+| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
+| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
+| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
+| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
+| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
+| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
+| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
+| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
+| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
+| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
+| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
+| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
+| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
+| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
+| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
+| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
+| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
+| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
+| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
+| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
+| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
+| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
+| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
+| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
+| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
+| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
+| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
+| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
+| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
+| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
+| [Qwen3-Omni](https://huggingface.co/Qwen)* | 30B | qwen3_omni |
+| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Qwen3-VL](https://huggingface.co/Qwen)* | 235B | qwen3_vl |
+| [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
+| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
+| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
+| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
+| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
+| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
+| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
+| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |

 > [!NOTE]
 > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.
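The NOTE above is the practical takeaway from this table: instruct/chat checkpoints only behave well when prompts are rendered with the chat markup they were trained on, which is what the `Template` column pins down. A rough illustration of how differently two templates serialize the same turn (token strings shown for illustration; the model cards are authoritative):

```python
# Two chat templates rendering the same user turn (illustrative strings).
user_message = "Hi!"

# llama3-style markup:
llama3_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{user_message}<|eot_id|>"

# qwen-style (ChatML) markup:
qwen_turn = f"<|im_start|>user\n{user_message}<|im_end|>\n"

# Training or serving with the wrong markup silently degrades quality,
# which is why each row above pins an exact `template` name.
```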
diff --git a/README_zh.md b/README_zh.md
index 356ce4b2..204cd2c2 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -264,68 +264,70 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc

 ## 模型

-| 模型名 | 参数量 | Template |
-| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
-| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
-| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
-| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
-| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
-| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
-| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
-| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink/ernie |
-| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
-| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
-| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
-| [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
-| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
-| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
-| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
-| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
-| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
-| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
-| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
-| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
-| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
-| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
-| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
-| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
-| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
-| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
-| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
-| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
-| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
-| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
-| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
-| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
-| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
-| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
-| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
-| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
-| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
-| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
-| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
-| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
-| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
-| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
-| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
-| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
-| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
-| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
-| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
-| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
-| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
-| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
-| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
-| [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
-| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
-| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
-| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
-| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
-| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
-| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
-| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
-| [LING-V2 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| 模型名 | 参数量 | Template |
+| ----------------------------------------------------------------- | -------------------------------- | -------------------- |
+| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
+| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
+| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
+| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
+| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
+| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
+| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
+| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
+| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
+| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
+| [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
+| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
+| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
+| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
+| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
+| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
+| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
+| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
+| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
+| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
+| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
+| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
+| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
+| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
+| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
+| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
+| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
+| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
+| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
+| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
+| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
+| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
+| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
+| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
+| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
+| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
+| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
+| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
+| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
+| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
+| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
+| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
+| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
+| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
+| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
+| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
+| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
+| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
+| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
+| [Qwen3-Omni](https://huggingface.co/Qwen)* | 30B | qwen3_omni |
+| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Qwen3-VL](https://huggingface.co/Qwen)* | 235B | qwen3_vl |
+| [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
+| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
+| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
+| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
+| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
+| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
+| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
+| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |

 > [!NOTE]
 > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。
diff --git a/requirements.txt b/requirements.txt
index dcdaa9b4..424cc643 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # core deps
-transformers>=4.49.0,<=4.56.1,!=4.52.0
+transformers>=4.49.0,<=4.56.2,!=4.52.0
 datasets>=2.16.0,<=4.0.0
 accelerate>=1.3.0,<=1.10.1
 peft>=0.14.0,<=0.17.1
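The `transformers` pin is a compound specifier: a floor, a ceiling, and one excluded release. To check a local environment against it the same way the project's `check_version` helper does, the `packaging` library evaluates such strings directly — a minimal sketch, assuming `packaging` is available:

```python
from importlib.metadata import version

from packaging.specifiers import SpecifierSet

# The same compound specifier as in requirements.txt.
spec = SpecifierSet(">=4.49.0,<=4.56.2,!=4.52.0")
installed = version("transformers")
if not spec.contains(installed, prereleases=True):
    raise RuntimeError(f"transformers=={installed} does not satisfy {spec}")
print(f"transformers=={installed} is within the supported range")
```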
diff --git a/scripts/qwen_omni_merge.py b/scripts/qwen_omni_merge.py
index e7722e38..7236d23c 100644
--- a/scripts/qwen_omni_merge.py
+++ b/scripts/qwen_omni_merge.py
@@ -29,33 +29,30 @@
 import shutil

 import fire
 from peft import PeftModel
-from transformers import (
-    AutoProcessor,
-    Qwen2_5OmniForConditionalGeneration,  # type: ignore
-    Qwen2_5OmniThinkerForConditionalGeneration,
-)
+from transformers import AutoConfig, AutoModelForTextToWaveform, AutoProcessor
+from transformers.utils import cached_file


 def merge_lora(
-    base_model_path: str,
-    lora_checkpoint_path: str,
+    model_path: str,
+    lora_path: str,
+    save_path: str = "./merged_model_checkpoint",
     extra_file: str = "spk_dict.pt",
     submodule_name: str = "thinker",
-    save_path: str = "./merged_model_checkpoint",
 ):
     """Load the original model, merge the LoRA weights.

     For a specified submodule, and save the final merged model along with its configurations.

     Args:
-        base_model_path (str): Path to the original model directory.
-        lora_checkpoint_path (str): Path to the directory containing LoRA weights.
+        model_path (str): Path to the original model directory.
+        lora_path (str): Path to the directory containing LoRA weights.
+        save_path (str): Directory where the merged model and configurations will be saved.
         extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
         submodule_name (str): Name of the submodule to merge (default: "thinker").
-        save_path (str): Directory where the merged model and configurations will be saved.
     """
     # 1. Load the original model
-    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(base_model_path, torch_dtype="auto", device_map="cpu")
+    model = AutoModelForTextToWaveform.from_pretrained(model_path, torch_dtype="auto", device_map="cpu")
     print("Successfully loaded the original model.")

     # 2. Extract the submodule to be merged (e.g., model.thinker)
@@ -66,13 +63,13 @@ def merge_lora(
     print(f"Successfully extracted submodule: {submodule_name}.")

     # 3. Load the LoRA weights onto the extracted submodule
-    lora_model = PeftModel.from_pretrained(base_submodule, lora_checkpoint_path)
-    processor = AutoProcessor.from_pretrained(lora_checkpoint_path)
-    print("LoRA weights and processor loaded successfully.")
+    lora_model = PeftModel.from_pretrained(base_submodule, lora_path)
+    processor = AutoProcessor.from_pretrained(lora_path)
+    print("Successfully loaded LoRA weights and processor.")

     # 4. Merge the LoRA weights into the submodule and unload the LoRA modules
     merged_submodule = lora_model.merge_and_unload()
-    print("LoRA weights merged successfully.")
+    print("Successfully merged LoRA weights.")

     # 5. Replace the original submodule with the merged submodule in the model
     setattr(model, submodule_name, merged_submodule)
@@ -80,20 +77,19 @@
     # 6. Save the final merged model along with the tokenizer and processor configuration
     model.save_pretrained(save_path)
     processor.save_pretrained(save_path)
-    print(f"Merged model and tokenizer saved to {save_path}.")
+    print(f"Merged model and processor saved to {save_path}.")

-    source_file = os.path.join(base_model_path, extra_file)
-    target_file = os.path.join(save_path, extra_file)
-    if os.path.exists(source_file):
-        shutil.copy(source_file, target_file)
-        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
-    else:
-        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    try:
+        source_file = cached_file(path_or_repo_id=model_path, filename=extra_file)
+        shutil.copy(source_file, os.path.join(save_path, extra_file))
+        print(f"File '{extra_file}' copied from {model_path} to {save_path}.")
+    except Exception:
+        print(f"File '{extra_file}' not found in {model_path}, skipping copy.")


 def save_full_model(
-    saved_thinker_path: str,
-    base_model_path: str,
+    model_path: str,
+    thinker_path: str,
     save_path: str = "./merged_model_checkpoint",
     extra_file: str = "spk_dict.pt",
 ):
@@ -102,34 +98,42 @@
     Then save the complete model along with its tokenizer and processor configuration.

     Args:
-        saved_thinker_path (str): Path to the saved thinker weights.
-        base_model_path (str): Directory path of the original model.
+        model_path (str): Directory path of the original model.
+        thinker_path (str): Path to the saved thinker weights.
         save_path (str): Directory where the merged model and configurations will be saved.
         extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
     """
     # 1. Load the saved thinker module and the original model
-    thinker = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
-        saved_thinker_path, torch_dtype="auto", device_map="cpu"
-    )
-    base_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
-        base_model_path, torch_dtype="auto", device_map="cpu"
-    )
+    config = AutoConfig.from_pretrained(model_path)
+    if getattr(config, "model_type") == "qwen2_5_omni":
+        from transformers.models.qwen2_5_omni import Qwen2_5OmniThinkerForConditionalGeneration  # type: ignore
+
+        ThinkerClass = Qwen2_5OmniThinkerForConditionalGeneration
+    elif getattr(config, "model_type") == "qwen3_omni_moe":
+        from transformers.models.qwen3_omni_moe import Qwen3OmniMoeThinkerForConditionalGeneration  # type: ignore
+
+        ThinkerClass = Qwen3OmniMoeThinkerForConditionalGeneration
+    else:
+        raise ValueError(f"Unsupported model type: {getattr(config, 'model_type')}.")
+
+    thinker = ThinkerClass.from_pretrained(thinker_path, torch_dtype="auto", device_map="cpu")
+    base_model = AutoModelForTextToWaveform.from_pretrained(model_path, torch_dtype="auto", device_map="cpu")
     base_model.thinker = thinker
+    processor = AutoProcessor.from_pretrained(thinker_path)
+    print("Successfully loaded model weights and processor.")

     # 2. Save the complete model along with its tokenizer and processor configuration
-    processor = AutoProcessor.from_pretrained(saved_thinker_path)
     base_model.save_pretrained(save_path)
     processor.save_pretrained(save_path)
     print(f"Merged model and processor saved to {save_path}.")

     # 3. Copy the extra file from the base model directory to the save_path
-    source_file = os.path.join(base_model_path, extra_file)
-    target_file = os.path.join(save_path, extra_file)
-    if os.path.exists(source_file):
-        shutil.copy(source_file, target_file)
-        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
-    else:
-        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    try:
+        source_file = cached_file(path_or_repo_id=model_path, filename=extra_file)
+        shutil.copy(source_file, os.path.join(save_path, extra_file))
+        print(f"File '{extra_file}' copied from {model_path} to {save_path}.")
+    except Exception:
+        print(f"File '{extra_file}' not found in {model_path}, skipping copy.")


 if __name__ == "__main__":
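Since the script is driven by `fire`, the renamed parameters are also the new CLI flags. A hypothetical end-to-end flow with the new names (paths are placeholders; assumes the script's directory is importable):

```python
# Hypothetical usage of the renamed entry points (paths are placeholders).
# Equivalent CLI: python scripts/qwen_omni_merge.py merge_lora --model_path ... --lora_path ...
from qwen_omni_merge import merge_lora, save_full_model

# Merge LoRA adapters trained on the thinker back into the full Omni model:
merge_lora(
    model_path="Qwen/Qwen2.5-Omni-7B",
    lora_path="saves/qwen_omni_lora",
    save_path="output/merged_model",
)

# Or, after full-parameter training of the thinker, reassemble the complete model:
save_full_model(
    model_path="Qwen/Qwen2.5-Omni-7B",
    thinker_path="saves/qwen_omni_thinker",
    save_path="output/full_model",
)
```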
diff --git a/setup.py b/setup.py
index 6a079ac8..08ba557e 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@ extra_require = {
     "eetq": ["eetq"],
     "gptq": ["optimum>=1.24.0", "gptqmodel>=2.0.0"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
-    "vllm": ["vllm>=0.4.3,<=0.10.0"],
+    "vllm": ["vllm>=0.4.3,<=0.10.2"],
     "sglang": ["sglang[srt]>=0.4.5", "transformers==4.51.1"],
     "galore": ["galore-torch"],
     "apollo": ["apollo-torch"],
diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py
index cfeecd86..162f432c 100644
--- a/src/llamafactory/data/collator.py
+++ b/src/llamafactory/data/collator.py
@@ -194,7 +194,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
             elif "video_second_per_grid" in mm_inputs:  # for qwen2.5 omni
                 rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid")

-            if getattr(self.model.config, "model_type", None) == "qwen2_5_omni_thinker":  # for qwen2.5 omni
+            if getattr(self.model.config, "model_type", None) in ["qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]:
                 rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False)
                 feature_attention_mask = mm_inputs.get("feature_attention_mask", None)
                 if feature_attention_mask is not None:  # FIXME: need to get video image lengths
@@ -205,13 +205,22 @@
                     features["rope_deltas"] = rope_deltas - (1 - rope_index_kwargs["attention_mask"]).sum(
                         dim=-1
                     ).unsqueeze(-1)
-            else:  # for qwen2vl
+            else:  # for qwen vl
                 features["position_ids"], features["rope_deltas"] = self.get_rope_func(**rope_index_kwargs)

         if (
             self.model is not None
             and getattr(self.model.config, "model_type", None)
-            in ["glm4v", "Keye", "qwen2_vl", "qwen2_5_vl", "qwen2_5_omni_thinker"]
+            in [
+                "glm4v",
+                "Keye",
+                "qwen2_vl",
+                "qwen2_5_vl",
+                "qwen2_5_omni_thinker",
+                "qwen3_omni_moe_thinker",
+                "qwen3_vl",
+                "qwen3_vl_moe",
+            ]
             and ("position_ids" not in features or features["position_ids"].dim() != 3)
         ):
             raise ValueError(f"{self.model.config.model_type} requires 3D position ids for mrope.")
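The widened allow-list and the `ValueError` both come down to mrope: these model types compute rotary positions along three axes (temporal, height, width), so `position_ids` must be a 3-D tensor shaped `(3, batch, seq_len)` instead of the usual `(batch, seq_len)`. A shape-only sketch of what the `.dim() != 3` check enforces:

```python
import torch

batch_size, seq_len = 2, 16

# Standard RoPE: one position index per token.
position_ids = torch.arange(seq_len).expand(batch_size, seq_len)
assert position_ids.dim() == 2

# mrope: separate temporal/height/width position streams per token.
mrope_position_ids = torch.arange(seq_len).expand(3, batch_size, seq_len)
assert mrope_position_ids.dim() == 3  # what the collator check requires
```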
diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index b0069212..6916d962 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -1397,8 +1397,8 @@ class Qwen2AudioPlugin(BasePlugin):

 @dataclass
 class Qwen2VLPlugin(BasePlugin):
-    start_token: str = "<|vision_start|>"
-    end_token: str = "<|vision_end|>"
+    vision_bos_token: str = "<|vision_start|>"
+    vision_eos_token: str = "<|vision_end|>"

     @override
     def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
@@ -1515,14 +1515,18 @@
         while IMAGE_PLACEHOLDER in content:
             image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
             content = content.replace(
-                IMAGE_PLACEHOLDER, f"{self.start_token}{self.image_token * image_seqlen}{self.end_token}", 1
+                IMAGE_PLACEHOLDER,
+                f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                1,
             )
             num_image_tokens += 1

         while VIDEO_PLACEHOLDER in content:
             video_seqlen = video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
             content = content.replace(
-                VIDEO_PLACEHOLDER, f"{self.start_token}{self.video_token * video_seqlen}{self.end_token}", 1
+                VIDEO_PLACEHOLDER,
+                f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                1,
             )
             num_video_tokens += 1

@@ -1611,7 +1615,9 @@ class Qwen3VLPlugin(Qwen2VLPlugin):
                 image_grid_thw[num_image_tokens].prod() // image_merge_length if self.expand_mm_tokens else 1
             )
             content = content.replace(
-                IMAGE_PLACEHOLDER, f"{self.start_token}{self.image_token * image_seqlen}{self.end_token}", 1
+                IMAGE_PLACEHOLDER,
+                f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                1,
             )
             num_image_tokens += 1

@@ -1630,11 +1636,14 @@ class Qwen3VLPlugin(Qwen2VLPlugin):
                 else 1
             )
             timestamp_sec = timestamps[frame_index]
-            frame_structure = f"<{timestamp_sec:.1f} seconds>{self.start_token}{self.video_token * video_seqlen}{self.end_token}"
+            frame_structure = (
+                f"<{timestamp_sec:.1f} seconds>"
+                f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}"
+            )
             video_structure += frame_structure

         if not self.expand_mm_tokens:
-            video_structure = f"{self.start_token}{self.video_token}{self.end_token}"
+            video_structure = f"{self.vision_bos_token}{self.video_token}{self.vision_eos_token}"

         content = content.replace(VIDEO_PLACEHOLDER, video_structure, 1)
         num_video_tokens += 1
@@ -1774,7 +1783,11 @@ class GLM4VPlugin(Qwen2VLPlugin):
         return mm_inputs


+@dataclass
 class Qwen2OmniPlugin(Qwen2VLPlugin):
+    audio_bos_token: str = "<|audio_start|>"
+    audio_eos_token: str = "<|audio_end|>"
+
     @override
     def _get_mm_inputs(
         self,
@@ -1861,7 +1874,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
         while IMAGE_PLACEHOLDER in content:
             image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
             content = content.replace(
-                IMAGE_PLACEHOLDER, f"<|vision_bos|>{self.image_token * image_seqlen}<|vision_eos|>", 1
+                IMAGE_PLACEHOLDER,
+                f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                1,
             )
             num_image_tokens += 1

@@ -1898,7 +1913,7 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                 video_chunk_indices = processor.get_chunked_index(video_t_index, t_ntoken_per_chunk)
                 audio_chunk_indices = processor.get_chunked_index(audio_t_index, t_ntoken_per_chunk)
                 placeholder_string = ""
-                placeholder_string += "<|vision_bos|>" + "<|audio_bos|>"
+                placeholder_string += self.vision_bos_token + self.audio_bos_token
                 for j in range(max(len(video_chunk_indices), len(audio_chunk_indices))):
                     video_chunk_index = video_chunk_indices[j] if j < len(video_chunk_indices) else None
                     audio_chunk_index = audio_chunk_indices[j] if j < len(audio_chunk_indices) else None
@@ -1908,7 +1923,7 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                     if audio_chunk_index is not None:
                         placeholder_string += self.audio_token * (audio_chunk_index[1] - audio_chunk_index[0])

-                placeholder_string += "<|audio_eos|>" + "<|vision_eos|>"
+                placeholder_string += self.audio_eos_token + self.vision_eos_token
                 content = content.replace(VIDEO_PLACEHOLDER, placeholder_string, 1)
                 content = content.replace(AUDIO_PLACEHOLDER, "", 1)
                 num_audio_tokens += 1
@@ -1917,7 +1932,9 @@
             while AUDIO_PLACEHOLDER in content:
                 audio_seqlen = audio_lengths[num_audio_tokens] if self.expand_mm_tokens else 1
                 content = content.replace(
-                    AUDIO_PLACEHOLDER, f"<|audio_bos|>{self.audio_token * audio_seqlen}<|audio_eos|>", 1
+                    AUDIO_PLACEHOLDER,
+                    f"{self.audio_bos_token}{self.audio_token * audio_seqlen}{self.audio_eos_token}",
+                    1,
                 )
                 num_audio_tokens += 1

             while VIDEO_PLACEHOLDER in content:
                 video_seqlen = (
                     video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                 )
                 content = content.replace(
-                    VIDEO_PLACEHOLDER, f"<|vision_bos|>{self.video_token * video_seqlen}<|vision_eos|>", 1
+                    VIDEO_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                    1,
                 )
                 num_video_tokens += 1
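Promoting the hard-coded `<|vision_bos|>`/`<|audio_bos|>` literals to dataclass fields lets every Qwen plugin share one expansion routine while templates inject model-specific tokens. A standalone sketch of the step the image branch performs (simplified, not the plugin class itself):

```python
# Simplified placeholder expansion, mirroring the plugin's image branch.
IMAGE_PLACEHOLDER = "<image>"


def expand_images(
    content: str,
    seqlens: list[int],
    image_token: str,
    vision_bos_token: str,
    vision_eos_token: str,
) -> str:
    num_image_tokens = 0
    while IMAGE_PLACEHOLDER in content:
        seqlen = seqlens[num_image_tokens]
        content = content.replace(
            IMAGE_PLACEHOLDER,
            f"{vision_bos_token}{image_token * seqlen}{vision_eos_token}",
            1,
        )
        num_image_tokens += 1
    return content


print(expand_images("Describe <image>.", [4], "<|IMAGE|>", "<|vision_bos|>", "<|vision_eos|>"))
# Describe <|vision_bos|><|IMAGE|><|IMAGE|><|IMAGE|><|IMAGE|><|vision_eos|>.
```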
diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py
index 330ff50c..792b0cfe 100644
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -922,8 +922,8 @@ register_template(
         name="qwen2_vl",
         image_token="<|imgpad|>",
         video_token="<|vidpad|>",
-        start_token="<|img|>",
-        end_token="<|endofimg|>",
+        vision_bos_token="<|img|>",
+        vision_eos_token="<|endofimg|>",
     ),
 )

@@ -1862,7 +1862,14 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
     ),
 )

@@ -1880,7 +1887,7 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
     ),
     template_class=ReasoningTemplate,
 )

@@ -1899,7 +1906,7 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
     ),
 )

diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index e3f5d708..199e4cdf 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -3060,13 +3060,14 @@ register_model_group(
     multimodal=True,
 )

+
 register_model_group(
     models={
-        "Qwen/Qwen3-Omni-30B-A3B-Captioner": {
+        "Qwen3-Omni-30B-A3B-Captioner": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
         },
-        "Qwen/Qwen3-Omni-30B-A3B-Instruct": {
+        "Qwen3-Omni-30B-A3B-Instruct": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
         },
@@ -3075,9 +3076,10 @@ register_model_group(
     multimodal=True,
 )

+
 register_model_group(
     models={
-        "Qwen/Qwen3-Omni-30B-A3B-Thinking": {
+        "Qwen3-Omni-30B-A3B-Thinking": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
         },
@@ -3086,6 +3088,7 @@ register_model_group(
     multimodal=True,
 )

+
 register_model_group(
     models={
         "Qwen2-VL-2B": {
@@ -3190,24 +3193,24 @@ register_model_group(

 register_model_group(
     models={
-        "Qwen/Qwen3-VL-235B-A22B-Thinking": {
-            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Thinking",
-            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+        "Qwen3-VL-235B-A22B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Instruct",
         },
     },
-    template="qwen3_vl",
+    template="qwen3_vl_nothink",
     multimodal=True,
 )


 register_model_group(
     models={
-        "Qwen/Qwen3-VL-235B-A22B-Instruct": {
-            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Instruct",
-            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+        "Qwen3-VL-235B-A22B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Thinking",
         },
     },
-    template="qwen3_vl_nothink",
+    template="qwen3_vl",
     multimodal=True,
 )

diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py
index e1fb2e62..4f1778ba 100644
--- a/src/llamafactory/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
@@ -94,7 +94,7 @@ def check_version(requirement: str, mandatory: bool = False) -> None:

 def check_dependencies() -> None:
     r"""Check the version of the required packages."""
-    check_version("transformers>=4.49.0,<=4.56.1")
+    check_version("transformers>=4.49.0,<=4.56.2")
     check_version("datasets>=2.16.0,<=4.0.0")
     check_version("accelerate>=1.3.0,<=1.10.1")
     check_version("peft>=0.14.0,<=0.17.1")
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index 2a2fc2ee..cd3ad9aa 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -147,7 +147,7 @@ def _check_extra_dependencies(
         check_version("mixture-of-depth>=1.1.6", mandatory=True)

     if model_args.infer_backend == EngineName.VLLM:
-        check_version("vllm>=0.4.3,<=0.10.0")
+        check_version("vllm>=0.4.3,<=0.10.2")
         check_version("vllm", mandatory=True)
     elif model_args.infer_backend == EngineName.SGLANG:
         check_version("sglang>=0.4.5")
@@ -174,7 +174,8 @@
     if training_args is not None:
         if training_args.deepspeed:
             # pin deepspeed version < 0.17 because of https://github.com/deepspeedai/DeepSpeed/issues/7347
-            check_version("deepspeed>=0.10.0,<=0.16.9", mandatory=True)
+            check_version("deepspeed", mandatory=True)
+            check_version("deepspeed>=0.10.0,<=0.16.9")

         if training_args.predict_with_generate:
             check_version("jieba", mandatory=True)
diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py
index 8793135f..37dffcbe 100644
--- a/src/llamafactory/model/loader.py
+++ b/src/llamafactory/model/loader.py
@@ -162,7 +162,7 @@
                 load_class = AutoModelForVision2Seq
             elif type(config) in AutoModelForSeq2SeqLM._model_mapping.keys():  # audio-text
                 load_class = AutoModelForSeq2SeqLM
-            elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio hack for qwen2_5_omni
+            elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio hack for qwen omni
                 load_class = AutoModelForTextToWaveform
             else:
                 load_class = AutoModelForCausalLM
@@ -171,8 +171,8 @@
             model = load_class.from_config(config, trust_remote_code=model_args.trust_remote_code)
         else:
             model = load_class.from_pretrained(**init_kwargs)
-        if getattr(model.config, "model_type", None) == "qwen2_5_omni":
-            model = model.thinker  # use part of Omni model
+        if getattr(model.config, "model_type", None) in ["qwen2_5_omni", "qwen3_omni_moe"]:
+            model = getattr(model, "thinker")

     if model_args.mixture_of_depths == "convert":
         model = convert_pretrained_model_to_mod(model, config, model_args)
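Both Omni generations load through `AutoModelForTextToWaveform` as a wrapper that bundles the text "thinker" with the speech "talker"; fine-tuning only touches the thinker, which is why the loader swaps in that submodule. A sketch of the dispatch (downloads weights if given a Hub id):

```python
# Sketch of the Omni loading path (downloads weights if given a Hub id).
from transformers import AutoConfig, AutoModelForTextToWaveform

config = AutoConfig.from_pretrained("Qwen/Qwen2.5-Omni-7B")
print(config.model_type)  # "qwen2_5_omni" (Qwen3-Omni reports "qwen3_omni_moe")

model = AutoModelForTextToWaveform.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="cpu"
)
# Training proceeds on the text submodule alone; the talker is reattached
# later by scripts/qwen_omni_merge.py when exporting a complete model.
thinker = model.thinker
```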
diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py
index e5c39280..cfbe6a22 100644
--- a/src/llamafactory/model/model_utils/visual.py
+++ b/src/llamafactory/model/model_utils/visual.py
@@ -298,6 +298,7 @@ _register_composite_model(
     lora_conflict_keys=["audio_projection_layer"],
 )

+
 _register_composite_model(
     model_type="mistral3",
 )
@@ -351,6 +352,33 @@ _register_composite_model(
 )


+_register_composite_model(
+    model_type="qwen3_vl",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+_register_composite_model(
+    model_type="qwen3_vl_moe",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+_register_composite_model(
+    model_type="qwen3_omni_moe_thinker",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks", "audio_tower"],
+    language_model_keys=["model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
 _register_composite_model(
     model_type="video_llava",
 )
diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py
index 406307d0..87281f7e 100644
--- a/tests/data/test_mm_plugin.py
+++ b/tests/data/test_mm_plugin.py
@@ -332,7 +332,14 @@ def test_qwen2_omni_plugin():
     image_seqlen, audio_seqlen = 4, 2
     tokenizer_module = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2.5-Omni-7B")
     qwen2_omni_plugin = get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
     )
     check_inputs = {"plugin": qwen2_omni_plugin, **tokenizer_module}
     check_inputs["expected_mm_messages"] = [