[data] fix qwen omni plugin (#9204)

Co-authored-by: kingsley <kingsleydodonow@gmail.com>
Authored by Yaowei Zheng on 2025-09-28 01:02:29 +08:00; committed by GitHub
parent 0761a4448f
commit 6ffebe5ff7
15 changed files with 292 additions and 210 deletions

View File

@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
       - id: check-ast
       - id: check-added-large-files
@@ -15,13 +15,13 @@ repos:
         args: ['--branch', 'main']
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.17.0
+    rev: v3.20.0
     hooks:
       - id: pyupgrade
-        args: [--py38-plus]
+        args: [--py39-plus]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.9
+    rev: v0.13.2
     hooks:
       - id: ruff
         args: [--fix]

README.md
View File

@@ -262,68 +262,70 @@ Choose your path:

 ## Supported Models

 | Model | Model size | Template |
-| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
+| ----------------------------------------------------------------- | -------------------------------- | -------------------- |
 | [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
 | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
 | [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
 | [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
 | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
 | [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
 | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink/ernie |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
 | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
 | [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
 | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
 | [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
 | [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
 | [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
 | [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
 | [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
 | [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
 | [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
 | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
 | [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
 | [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
+| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
 | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
 | [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
 | [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
 | [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
 | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
 | [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
 | [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
 | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
 | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
 | [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
 | [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
 | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
 | [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
 | [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
 | [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
 | [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
 | [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
 | [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
 | [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
+| [Qwen3-Omni](https://huggingface.co/Qwen)* | 30B | qwen3_omni |
 | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Qwen3-VL](https://huggingface.co/Qwen)* | 235B | qwen3_vl |
 | [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
 | [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
 | [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
 | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
 | [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
 | [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
 | [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
-| [LING-V2 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |

 > [!NOTE]
 > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.

View File

@@ -264,68 +264,70 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc

 ## 模型

 | 模型名 | 参数量 | Template |
-| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
+| ----------------------------------------------------------------- | -------------------------------- | -------------------- |
 | [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
 | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
 | [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
 | [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
 | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
 | [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
 | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink/ernie |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
 | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
 | [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
 | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
 | [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
 | [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
 | [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
 | [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
 | [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
 | [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
 | [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
 | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
 | [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
 | [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
+| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
 | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
 | [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
 | [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
 | [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
 | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
 | [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
 | [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
 | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
 | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
 | [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
 | [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
 | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
 | [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
 | [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
 | [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
 | [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
 | [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
 | [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
 | [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
+| [Qwen3-Omni](https://huggingface.co/Qwen)* | 30B | qwen3_omni |
 | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Qwen3-VL](https://huggingface.co/Qwen)* | 235B | qwen3_vl |
 | [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
 | [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
 | [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
 | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
 | [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
 | [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
 | [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
-| [LING-V2 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |

 > [!NOTE]
 > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。

View File

@@ -1,5 +1,5 @@
 # core deps
-transformers>=4.49.0,<=4.56.1,!=4.52.0
+transformers>=4.49.0,<=4.56.2,!=4.52.0
 datasets>=2.16.0,<=4.0.0
 accelerate>=1.3.0,<=1.10.1
 peft>=0.14.0,<=0.17.1

View File

@@ -29,33 +29,30 @@ import shutil

 import fire
 from peft import PeftModel
-from transformers import (
-    AutoProcessor,
-    Qwen2_5OmniForConditionalGeneration,  # type: ignore
-    Qwen2_5OmniThinkerForConditionalGeneration,
-)
+from transformers import AutoConfig, AutoModelForTextToWaveform, AutoProcessor
+from transformers.utils import cached_file


 def merge_lora(
-    base_model_path: str,
-    lora_checkpoint_path: str,
-    save_path: str = "./merged_model_checkpoint",
+    model_path: str,
+    lora_path: str,
     extra_file: str = "spk_dict.pt",
     submodule_name: str = "thinker",
+    save_path: str = "./merged_model_checkpoint",
 ):
     """Load the original model, merge the LoRA weights.

     For a specified submodule, and save the final merged model along with its configurations.

     Args:
-        base_model_path (str): Path to the original model directory.
-        lora_checkpoint_path (str): Path to the directory containing LoRA weights.
-        save_path (str): Directory where the merged model and configurations will be saved.
+        model_path (str): Path to the original model directory.
+        lora_path (str): Path to the directory containing LoRA weights.
         extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
         submodule_name (str): Name of the submodule to merge (default: "thinker").
+        save_path (str): Directory where the merged model and configurations will be saved.
     """
     # 1. Load the original model
-    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(base_model_path, torch_dtype="auto", device_map="cpu")
+    model = AutoModelForTextToWaveform.from_pretrained(model_path, torch_dtype="auto", device_map="cpu")
     print("Successfully loaded the original model.")

     # 2. Extract the submodule to be merged (e.g., model.thinker)
@@ -66,13 +63,13 @@ def merge_lora(
     print(f"Successfully extracted submodule: {submodule_name}.")

     # 3. Load the LoRA weights onto the extracted submodule
-    lora_model = PeftModel.from_pretrained(base_submodule, lora_checkpoint_path)
-    processor = AutoProcessor.from_pretrained(lora_checkpoint_path)
-    print("LoRA weights and processor loaded successfully.")
+    lora_model = PeftModel.from_pretrained(base_submodule, lora_path)
+    processor = AutoProcessor.from_pretrained(lora_path)
+    print("Successfully loaded LoRA weights and processor.")

     # 4. Merge the LoRA weights into the submodule and unload the LoRA modules
     merged_submodule = lora_model.merge_and_unload()
-    print("LoRA weights merged successfully.")
+    print("Successfully merged LoRA weights.")

     # 5. Replace the original submodule with the merged submodule in the model
     setattr(model, submodule_name, merged_submodule)
@@ -80,20 +77,19 @@ def merge_lora(
     # 6. Save the final merged model along with the tokenizer and processor configuration
     model.save_pretrained(save_path)
     processor.save_pretrained(save_path)
-    print(f"Merged model and tokenizer saved to {save_path}.")
+    print(f"Merged model and processor saved to {save_path}.")

-    source_file = os.path.join(base_model_path, extra_file)
-    target_file = os.path.join(save_path, extra_file)
-    if os.path.exists(source_file):
-        shutil.copy(source_file, target_file)
-        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
-    else:
-        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    try:
+        source_file = cached_file(path_or_repo_id=model_path, filename=extra_file)
+        shutil.copy(source_file, os.path.join(save_path, extra_file))
+        print(f"File '{extra_file}' copied from {model_path} to {save_path}.")
+    except Exception:
+        print(f"File '{extra_file}' not found in {model_path}, skipping copy.")


 def save_full_model(
-    saved_thinker_path: str,
-    base_model_path: str,
+    model_path: str,
+    thinker_path: str,
     save_path: str = "./merged_model_checkpoint",
     extra_file: str = "spk_dict.pt",
 ):
@@ -102,34 +98,42 @@ def save_full_model(
     Then save the complete model along with its tokenizer and processor configuration.

     Args:
-        saved_thinker_path (str): Path to the saved thinker weights.
-        base_model_path (str): Directory path of the original model.
+        model_path (str): Directory path of the original model.
+        thinker_path (str): Path to the saved thinker weights.
         save_path (str): Directory where the merged model and configurations will be saved.
         extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
     """
     # 1. Load the saved thinker module and the original model
-    thinker = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
-        saved_thinker_path, torch_dtype="auto", device_map="cpu"
-    )
-    base_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
-        base_model_path, torch_dtype="auto", device_map="cpu"
-    )
+    config = AutoConfig.from_pretrained(model_path)
+    if getattr(config, "model_type") == "qwen2_5_omni":
+        from transformers.models.qwen2_5_omni import Qwen2_5OmniThinkerForConditionalGeneration  # type: ignore
+        ThinkerClass = Qwen2_5OmniThinkerForConditionalGeneration
+    elif getattr(config, "model_type") == "qwen3_omni_moe":
+        from transformers.models.qwen3_omni_moe import Qwen3OmniMoeThinkerForConditionalGeneration  # type: ignore
+        ThinkerClass = Qwen3OmniMoeThinkerForConditionalGeneration
+    else:
+        raise ValueError(f"Unsupported model type: {getattr(config, 'model_type')}.")
+
+    thinker = ThinkerClass.from_pretrained(thinker_path, torch_dtype="auto", device_map="cpu")
+    base_model = AutoModelForTextToWaveform.from_pretrained(model_path, torch_dtype="auto", device_map="cpu")
     base_model.thinker = thinker
+    processor = AutoProcessor.from_pretrained(thinker_path)
+    print("Successfully loaded model weights and processor.")

     # 2. Save the complete model along with its tokenizer and processor configuration
-    processor = AutoProcessor.from_pretrained(saved_thinker_path)
     base_model.save_pretrained(save_path)
     processor.save_pretrained(save_path)
     print(f"Merged model and processor saved to {save_path}.")

     # 3. Copy the extra file from the base model directory to the save_path
-    source_file = os.path.join(base_model_path, extra_file)
-    target_file = os.path.join(save_path, extra_file)
-    if os.path.exists(source_file):
-        shutil.copy(source_file, target_file)
-        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
-    else:
-        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    try:
+        source_file = cached_file(path_or_repo_id=model_path, filename=extra_file)
+        shutil.copy(source_file, os.path.join(save_path, extra_file))
+        print(f"File '{extra_file}' copied from {model_path} to {save_path}.")
+    except Exception:
+        print(f"File '{extra_file}' not found in {model_path}, skipping copy.")


 if __name__ == "__main__":
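For reference, a sketch of how the renamed helpers might be driven after this change; the module import and every path below are hypothetical, and the script presumably also exposes both functions on the command line via `fire`:

```python
# Hypothetical paths and module name; the function signatures match the diff above.
from qwen_omni_merge import merge_lora, save_full_model

# Merge a LoRA adapter into the thinker submodule of an Omni checkpoint.
merge_lora(
    model_path="Qwen/Qwen2.5-Omni-7B",           # original Omni model (hub id or local dir)
    lora_path="saves/qwen2_5_omni-7b/lora/sft",  # directory containing the LoRA adapter
    save_path="./merged_model_checkpoint",
)

# Re-attach a fully fine-tuned thinker to the original Omni model; the AutoConfig
# dispatch above handles both qwen2_5_omni and qwen3_omni_moe checkpoints.
save_full_model(
    model_path="Qwen/Qwen3-Omni-30B-A3B-Instruct",
    thinker_path="saves/qwen3_omni-30b/full/sft",
    save_path="./merged_model_checkpoint",
)
```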

View File

@@ -52,7 +52,7 @@ extra_require = {
     "eetq": ["eetq"],
     "gptq": ["optimum>=1.24.0", "gptqmodel>=2.0.0"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
-    "vllm": ["vllm>=0.4.3,<=0.10.0"],
+    "vllm": ["vllm>=0.4.3,<=0.10.2"],
     "sglang": ["sglang[srt]>=0.4.5", "transformers==4.51.1"],
     "galore": ["galore-torch"],
     "apollo": ["apollo-torch"],

View File

@@ -194,7 +194,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
             elif "video_second_per_grid" in mm_inputs:  # for qwen2.5 omni
                 rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid")

-            if getattr(self.model.config, "model_type", None) == "qwen2_5_omni_thinker":  # for qwen2.5 omni
+            if getattr(self.model.config, "model_type", None) in ["qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]:
                 rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False)
                 feature_attention_mask = mm_inputs.get("feature_attention_mask", None)
                 if feature_attention_mask is not None:  # FIXME: need to get video image lengths
@@ -205,13 +205,22 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
                     features["rope_deltas"] = rope_deltas - (1 - rope_index_kwargs["attention_mask"]).sum(
                         dim=-1
                     ).unsqueeze(-1)
-            else:  # for qwen2vl
+            else:  # for qwen vl
                 features["position_ids"], features["rope_deltas"] = self.get_rope_func(**rope_index_kwargs)

         if (
             self.model is not None
             and getattr(self.model.config, "model_type", None)
-            in ["glm4v", "Keye", "qwen2_vl", "qwen2_5_vl", "qwen2_5_omni_thinker"]
+            in [
+                "glm4v",
+                "Keye",
+                "qwen2_vl",
+                "qwen2_5_vl",
+                "qwen2_5_omni_thinker",
+                "qwen3_omni_moe_thinker",
+                "qwen3_vl",
+                "qwen3_vl_moe",
+            ]
             and ("position_ids" not in features or features["position_ids"].dim() != 3)
         ):
             raise ValueError(f"{self.model.config.model_type} requires 3D position ids for mrope.")
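The check above only asserts a shape property. As a rough illustration (shapes assumed from the Qwen2-VL style mrope, not taken from this diff), the position ids returned by the rope helper carry one slice per rope section and must therefore be 3-D:

```python
import torch

# Rough illustration: mrope position ids have shape (3, batch, seq_len),
# one slice each for the temporal/height/width sections.
batch_size, seq_len = 2, 16
position_ids = torch.arange(seq_len).repeat(3, batch_size, 1)
assert position_ids.dim() == 3  # this is the condition the collator enforces
```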

View File

@@ -1397,8 +1397,8 @@ class Qwen2AudioPlugin(BasePlugin):

 @dataclass
 class Qwen2VLPlugin(BasePlugin):
-    start_token: str = "<|vision_start|>"
-    end_token: str = "<|vision_end|>"
+    vision_bos_token: str = "<|vision_start|>"
+    vision_eos_token: str = "<|vision_end|>"

     @override
     def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
@@ -1515,14 +1515,18 @@ class Qwen2VLPlugin(BasePlugin):
             while IMAGE_PLACEHOLDER in content:
                 image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                 content = content.replace(
-                    IMAGE_PLACEHOLDER, f"{self.start_token}{self.image_token * image_seqlen}{self.end_token}", 1
+                    IMAGE_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                    1,
                 )
                 num_image_tokens += 1

             while VIDEO_PLACEHOLDER in content:
                 video_seqlen = video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                 content = content.replace(
-                    VIDEO_PLACEHOLDER, f"{self.start_token}{self.video_token * video_seqlen}{self.end_token}", 1
+                    VIDEO_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                    1,
                 )
                 num_video_tokens += 1
@@ -1611,7 +1615,9 @@ class Qwen3VLPlugin(Qwen2VLPlugin):
                     image_grid_thw[num_image_tokens].prod() // image_merge_length if self.expand_mm_tokens else 1
                 )
                 content = content.replace(
-                    IMAGE_PLACEHOLDER, f"{self.start_token}{self.image_token * image_seqlen}{self.end_token}", 1
+                    IMAGE_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                    1,
                 )
                 num_image_tokens += 1
@@ -1630,11 +1636,14 @@ class Qwen3VLPlugin(Qwen2VLPlugin):
                         else 1
                     )
                     timestamp_sec = timestamps[frame_index]
-                    frame_structure = f"<{timestamp_sec:.1f} seconds>{self.start_token}{self.video_token * video_seqlen}{self.end_token}"
+                    frame_structure = (
+                        f"<{timestamp_sec:.1f} seconds>"
+                        f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}"
+                    )
                     video_structure += frame_structure

                 if not self.expand_mm_tokens:
-                    video_structure = f"{self.start_token}{self.video_token}{self.end_token}"
+                    video_structure = f"{self.vision_bos_token}{self.video_token}{self.vision_eos_token}"

                 content = content.replace(VIDEO_PLACEHOLDER, video_structure, 1)
                 num_video_tokens += 1
@@ -1774,7 +1783,11 @@ class GLM4VPlugin(Qwen2VLPlugin):
         return mm_inputs


+@dataclass
 class Qwen2OmniPlugin(Qwen2VLPlugin):
+    audio_bos_token: str = "<|audio_start|>"
+    audio_eos_token: str = "<|audio_end|>"
+
     @override
     def _get_mm_inputs(
         self,
@@ -1861,7 +1874,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
             while IMAGE_PLACEHOLDER in content:
                 image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                 content = content.replace(
-                    IMAGE_PLACEHOLDER, f"<|vision_bos|>{self.image_token * image_seqlen}<|vision_eos|>", 1
+                    IMAGE_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                    1,
                 )
                 num_image_tokens += 1
@@ -1898,7 +1913,7 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                     video_chunk_indices = processor.get_chunked_index(video_t_index, t_ntoken_per_chunk)
                     audio_chunk_indices = processor.get_chunked_index(audio_t_index, t_ntoken_per_chunk)
                     placeholder_string = ""
-                    placeholder_string += "<|vision_bos|>" + "<|audio_bos|>"
+                    placeholder_string += self.vision_bos_token + self.audio_bos_token
                     for j in range(max(len(video_chunk_indices), len(audio_chunk_indices))):
                         video_chunk_index = video_chunk_indices[j] if j < len(video_chunk_indices) else None
                         audio_chunk_index = audio_chunk_indices[j] if j < len(audio_chunk_indices) else None
@@ -1908,7 +1923,7 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                         if audio_chunk_index is not None:
                             placeholder_string += self.audio_token * (audio_chunk_index[1] - audio_chunk_index[0])

-                    placeholder_string += "<|audio_eos|>" + "<|vision_eos|>"
+                    placeholder_string += self.audio_eos_token + self.vision_eos_token
                     content = content.replace(VIDEO_PLACEHOLDER, placeholder_string, 1)
                     content = content.replace(AUDIO_PLACEHOLDER, "", 1)
                     num_audio_tokens += 1
@@ -1917,7 +1932,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
             while AUDIO_PLACEHOLDER in content:
                 audio_seqlen = audio_lengths[num_audio_tokens] if self.expand_mm_tokens else 1
                 content = content.replace(
-                    AUDIO_PLACEHOLDER, f"<|audio_bos|>{self.audio_token * audio_seqlen}<|audio_eos|>", 1
+                    AUDIO_PLACEHOLDER,
+                    f"{self.audio_bos_token}{self.audio_token * audio_seqlen}{self.audio_eos_token}",
+                    1,
                 )
                 num_audio_tokens += 1
@@ -1926,7 +1943,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                     video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                 )
                 content = content.replace(
-                    VIDEO_PLACEHOLDER, f"<|vision_bos|>{self.video_token * video_seqlen}<|vision_eos|>", 1
+                    VIDEO_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                    1,
                 )
                 num_video_tokens += 1
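The recurring pattern in these hunks is the same placeholder expansion, now parameterized by the plugin's bos/eos attributes instead of hard-coded strings. A standalone sketch of the idea (the helper name and example values are made up; the token strings follow the Qwen2.5-Omni template registered below):

```python
IMAGE_PLACEHOLDER = "<image>"


def expand_image_placeholders(
    content: str,
    image_seqlens: list[int],
    image_token: str = "<|IMAGE|>",
    vision_bos_token: str = "<|vision_bos|>",
    vision_eos_token: str = "<|vision_eos|>",
) -> str:
    """Replace each image placeholder with bos + repeated image tokens + eos."""
    num_image_tokens = 0
    while IMAGE_PLACEHOLDER in content:
        content = content.replace(
            IMAGE_PLACEHOLDER,
            f"{vision_bos_token}{image_token * image_seqlens[num_image_tokens]}{vision_eos_token}",
            1,
        )
        num_image_tokens += 1

    return content


print(expand_image_placeholders("Describe <image> briefly.", [4]))
# Describe <|vision_bos|><|IMAGE|><|IMAGE|><|IMAGE|><|IMAGE|><|vision_eos|> briefly.
```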

View File

@@ -922,8 +922,8 @@ register_template(
         name="qwen2_vl",
         image_token="<|imgpad|>",
         video_token="<|vidpad|>",
-        start_token="<|img|>",
-        end_token="<|endofimg|>",
+        vision_bos_token="<|img|>",
+        vision_eos_token="<|endofimg|>",
     ),
 )
@@ -1862,7 +1862,14 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
     ),
 )
@@ -1880,7 +1887,7 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
     ),
     template_class=ReasoningTemplate,
 )
@@ -1899,7 +1906,7 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
     ),
 )

View File

@@ -3060,13 +3060,14 @@ register_model_group(
     multimodal=True,
 )

 register_model_group(
     models={
-        "Qwen/Qwen3-Omni-30B-A3B-Captioner": {
+        "Qwen3-Omni-30B-A3B-Captioner": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
         },
-        "Qwen/Qwen3-Omni-30B-A3B-Instruct": {
+        "Qwen3-Omni-30B-A3B-Instruct": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
         },
@@ -3075,9 +3076,10 @@ register_model_group(
     multimodal=True,
 )

 register_model_group(
     models={
-        "Qwen/Qwen3-Omni-30B-A3B-Thinking": {
+        "Qwen3-Omni-30B-A3B-Thinking": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
         },
@@ -3086,6 +3088,7 @@ register_model_group(
     multimodal=True,
 )

 register_model_group(
     models={
         "Qwen2-VL-2B": {
@@ -3190,24 +3193,24 @@ register_model_group(

 register_model_group(
     models={
-        "Qwen/Qwen3-VL-235B-A22B-Thinking": {
-            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Thinking",
-            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+        "Qwen3-VL-235B-A22B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Instruct",
         },
     },
-    template="qwen3_vl",
+    template="qwen3_vl_nothink",
     multimodal=True,
 )


 register_model_group(
     models={
-        "Qwen/Qwen3-VL-235B-A22B-Instruct": {
-            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Instruct",
-            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+        "Qwen3-VL-235B-A22B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Thinking",
         },
     },
-    template="qwen3_vl_nothink",
+    template="qwen3_vl",
     multimodal=True,
 )

View File

@@ -94,7 +94,7 @@ def check_version(requirement: str, mandatory: bool = False) -> None:

 def check_dependencies() -> None:
     r"""Check the version of the required packages."""
-    check_version("transformers>=4.49.0,<=4.56.1")
+    check_version("transformers>=4.49.0,<=4.56.2")
     check_version("datasets>=2.16.0,<=4.0.0")
     check_version("accelerate>=1.3.0,<=1.10.1")
     check_version("peft>=0.14.0,<=0.17.1")
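This is not LLaMA-Factory's actual implementation, but a rough sketch of what a requirement gate such as the one above has to do (the helper name is made up):

```python
from importlib.metadata import version

from packaging.requirements import Requirement


def naive_check_version(requirement: str) -> None:
    """Raise if the installed distribution falls outside the pinned range."""
    req = Requirement(requirement)
    installed = version(req.name)
    if installed not in req.specifier:
        raise RuntimeError(f"{req.name}=={installed} does not satisfy '{requirement}'.")


naive_check_version("transformers>=4.49.0,<=4.56.2")
```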

View File

@@ -147,7 +147,7 @@ def _check_extra_dependencies(
         check_version("mixture-of-depth>=1.1.6", mandatory=True)

     if model_args.infer_backend == EngineName.VLLM:
-        check_version("vllm>=0.4.3,<=0.10.0")
+        check_version("vllm>=0.4.3,<=0.10.2")
         check_version("vllm", mandatory=True)
     elif model_args.infer_backend == EngineName.SGLANG:
         check_version("sglang>=0.4.5")
@@ -174,7 +174,8 @@ def _check_extra_dependencies(
     if training_args is not None:
         if training_args.deepspeed:
             # pin deepspeed version < 0.17 because of https://github.com/deepspeedai/DeepSpeed/issues/7347
-            check_version("deepspeed>=0.10.0,<=0.16.9", mandatory=True)
+            check_version("deepspeed", mandatory=True)
+            check_version("deepspeed>=0.10.0,<=0.16.9")

         if training_args.predict_with_generate:
             check_version("jieba", mandatory=True)

View File

@@ -162,7 +162,7 @@ def load_model(
             load_class = AutoModelForVision2Seq
         elif type(config) in AutoModelForSeq2SeqLM._model_mapping.keys():  # audio-text
             load_class = AutoModelForSeq2SeqLM
-        elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio hack for qwen2_5_omni
+        elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio hack for qwen omni
             load_class = AutoModelForTextToWaveform
         else:
             load_class = AutoModelForCausalLM
@@ -171,8 +171,8 @@ def load_model(
             model = load_class.from_config(config, trust_remote_code=model_args.trust_remote_code)
         else:
            model = load_class.from_pretrained(**init_kwargs)

-        if getattr(model.config, "model_type", None) == "qwen2_5_omni":
-            model = model.thinker  # use part of Omni model
+        if getattr(model.config, "model_type", None) in ["qwen2_5_omni", "qwen3_omni_moe"]:
+            model = getattr(model, "thinker")

     if model_args.mixture_of_depths == "convert":
         model = convert_pretrained_model_to_mod(model, config, model_args)
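In short, Omni checkpoints are resolved through the text-to-waveform auto class and only the `thinker` submodule is kept for fine-tuning. A condensed sketch of that path (the model id here is only an example):

```python
from transformers import AutoModelForTextToWaveform

# Qwen2.5-Omni / Qwen3-Omni resolve to the text-to-waveform mapping ("audio hack").
model = AutoModelForTextToWaveform.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="cpu"
)
if getattr(model.config, "model_type", None) in ["qwen2_5_omni", "qwen3_omni_moe"]:
    model = model.thinker  # keep only the thinker for fine-tuning
```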

View File

@@ -298,6 +298,7 @@ _register_composite_model(
     lora_conflict_keys=["audio_projection_layer"],
 )

 _register_composite_model(
     model_type="mistral3",
 )
@@ -351,6 +352,33 @@ _register_composite_model(
 )


+_register_composite_model(
+    model_type="qwen3_vl",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+_register_composite_model(
+    model_type="qwen3_vl_moe",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+_register_composite_model(
+    model_type="qwen3_omni_moe_thinker",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks", "audio_tower"],
+    language_model_keys=["model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
 _register_composite_model(
     model_type="video_llava",
 )

View File

@@ -332,7 +332,14 @@ def test_qwen2_omni_plugin():
     image_seqlen, audio_seqlen = 4, 2
     tokenizer_module = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2.5-Omni-7B")
     qwen2_omni_plugin = get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
     )
     check_inputs = {"plugin": qwen2_omni_plugin, **tokenizer_module}
     check_inputs["expected_mm_messages"] = [