mirror of https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-10-14 07:42:49 +08:00

[data] fix qwen omni plugin (#9204)
Co-authored-by: kingsley <kingsleydodonow@gmail.com>
parent 0761a4448f, commit 6ffebe5ff7
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
       - id: check-ast
       - id: check-added-large-files

@@ -15,13 +15,13 @@ repos:
     args: ['--branch', 'main']

   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.17.0
+    rev: v3.20.0
     hooks:
       - id: pyupgrade
-        args: [--py38-plus]
+        args: [--py39-plus]

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.9
+    rev: v0.13.2
     hooks:
       - id: ruff
         args: [--fix]

README.md (126 lines changed)
@@ -262,68 +262,70 @@ Choose your path:

 ## Supported Models

 | Model | Model size | Template |
-| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
+| ----------------------------------------------------------------- | -------------------------------- | -------------------- |
 | [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
 | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
 | [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
 | [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
 | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
 | [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
 | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink/ernie |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
 | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
 | [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
 | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
 | [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
 | [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
 | [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
 | [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
 | [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
 | [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
 | [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
 | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
 | [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
 | [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
+| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
 | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
 | [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
 | [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
 | [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
 | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
 | [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
 | [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
 | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
 | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
 | [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
 | [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
 | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
 | [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
 | [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
 | [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
 | [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
 | [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
 | [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
 | [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
+| [Qwen3-Omni](https://huggingface.co/Qwen)* | 30B | qwen3_omni |
 | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Qwen3-VL](https://huggingface.co/Qwen)* | 235B | qwen3_vl |
 | [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
 | [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
 | [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
 | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
 | [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
 | [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
 | [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
-| [LING-V2 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |

 > [!NOTE]
 > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.

README_zh.md (126 lines changed)
@@ -264,68 +264,70 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc

 ## Models

 | Model | Model size | Template |
-| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
+| ----------------------------------------------------------------- | -------------------------------- | -------------------- |
 | [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
 | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
 | [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
 | [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
 | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
 | [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
 | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink/ernie |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
 | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
 | [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
 | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
 | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
 | [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
 | [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
 | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
 | [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
 | [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
 | [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
 | [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
 | [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
 | [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
 | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
 | [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
 | [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
+| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
 | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
 | [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
 | [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
 | [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
 | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
 | [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
 | [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
 | [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
 | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
 | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
 | [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
 | [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
 | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
 | [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
 | [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
 | [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
 | [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
 | [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
 | [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
 | [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
 | [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
+| [Qwen3-Omni](https://huggingface.co/Qwen)* | 30B | qwen3_omni |
 | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Qwen3-VL](https://huggingface.co/Qwen)* | 235B | qwen3_vl |
 | [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
 | [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
 | [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
 | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
 | [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
 | [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
 | [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
-| [LING-V2 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |

 > [!NOTE]
 > For all "base" models, the `template` argument can be any of `default`, `alpaca`, `vicuna`, etc. But please make sure to use the **corresponding template** for "instruct/chat" models.

@@ -1,5 +1,5 @@
 # core deps
-transformers>=4.49.0,<=4.56.1,!=4.52.0
+transformers>=4.49.0,<=4.56.2,!=4.52.0
 datasets>=2.16.0,<=4.0.0
 accelerate>=1.3.0,<=1.10.1
 peft>=0.14.0,<=0.17.1

@@ -29,33 +29,30 @@ import shutil

 import fire
 from peft import PeftModel
-from transformers import (
-    AutoProcessor,
-    Qwen2_5OmniForConditionalGeneration,  # type: ignore
-    Qwen2_5OmniThinkerForConditionalGeneration,
-)
+from transformers import AutoConfig, AutoModelForTextToWaveform, AutoProcessor
+from transformers.utils import cached_file


 def merge_lora(
-    base_model_path: str,
-    lora_checkpoint_path: str,
+    model_path: str,
+    lora_path: str,
+    save_path: str = "./merged_model_checkpoint",
     extra_file: str = "spk_dict.pt",
     submodule_name: str = "thinker",
-    save_path: str = "./merged_model_checkpoint",
 ):
     """Load the original model, merge the LoRA weights.

     For a specified submodule, and save the final merged model along with its configurations.

     Args:
-        base_model_path (str): Path to the original model directory.
-        lora_checkpoint_path (str): Path to the directory containing LoRA weights.
+        model_path (str): Path to the original model directory.
+        lora_path (str): Path to the directory containing LoRA weights.
+        save_path (str): Directory where the merged model and configurations will be saved.
         extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
         submodule_name (str): Name of the submodule to merge (default: "thinker").
-        save_path (str): Directory where the merged model and configurations will be saved.
     """
     # 1. Load the original model
-    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(base_model_path, torch_dtype="auto", device_map="cpu")
+    model = AutoModelForTextToWaveform.from_pretrained(model_path, torch_dtype="auto", device_map="cpu")
     print("Successfully loaded the original model.")

     # 2. Extract the submodule to be merged (e.g., model.thinker)
@@ -66,13 +63,13 @@ def merge_lora(
     print(f"Successfully extracted submodule: {submodule_name}.")

     # 3. Load the LoRA weights onto the extracted submodule
-    lora_model = PeftModel.from_pretrained(base_submodule, lora_checkpoint_path)
-    processor = AutoProcessor.from_pretrained(lora_checkpoint_path)
-    print("LoRA weights and processor loaded successfully.")
+    lora_model = PeftModel.from_pretrained(base_submodule, lora_path)
+    processor = AutoProcessor.from_pretrained(lora_path)
+    print("Successfully loaded LoRA weights and processor.")

     # 4. Merge the LoRA weights into the submodule and unload the LoRA modules
     merged_submodule = lora_model.merge_and_unload()
-    print("LoRA weights merged successfully.")
+    print("Successfully merged LoRA weights.")

     # 5. Replace the original submodule with the merged submodule in the model
     setattr(model, submodule_name, merged_submodule)
@@ -80,20 +77,19 @@ def merge_lora(
     # 6. Save the final merged model along with the tokenizer and processor configuration
     model.save_pretrained(save_path)
     processor.save_pretrained(save_path)
-    print(f"Merged model and tokenizer saved to {save_path}.")
+    print(f"Merged model and processor saved to {save_path}.")

-    source_file = os.path.join(base_model_path, extra_file)
-    target_file = os.path.join(save_path, extra_file)
-    if os.path.exists(source_file):
-        shutil.copy(source_file, target_file)
-        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
-    else:
-        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    try:
+        source_file = cached_file(path_or_repo_id=model_path, filename=extra_file)
+        shutil.copy(source_file, os.path.join(save_path, extra_file))
+        print(f"File '{extra_file}' copied from {model_path} to {save_path}.")
+    except Exception:
+        print(f"File '{extra_file}' not found in {model_path}, skipping copy.")


 def save_full_model(
-    saved_thinker_path: str,
-    base_model_path: str,
+    model_path: str,
+    thinker_path: str,
     save_path: str = "./merged_model_checkpoint",
     extra_file: str = "spk_dict.pt",
 ):
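The hunk above swaps `os.path.join` for `transformers.utils.cached_file`, so the optional speaker dictionary resolves whether `model_path` is a local directory or a Hub repo id. A minimal standalone sketch of the new lookup; the helper name `copy_extra_file` is ours, not the patch's:

import os
import shutil

from transformers.utils import cached_file


def copy_extra_file(model_path: str, save_path: str, extra_file: str = "spk_dict.pt") -> None:
    """Resolve an auxiliary checkpoint file locally or from the Hub cache, then copy it."""
    try:
        # cached_file accepts a local directory or a hub repo id and downloads on demand;
        # the old os.path.join approach only worked for local checkouts.
        source_file = cached_file(path_or_repo_id=model_path, filename=extra_file)
        shutil.copy(source_file, os.path.join(save_path, extra_file))
    except Exception:  # the file is optional, e.g. thinker-only checkpoints ship no speaker dict
        print(f"File '{extra_file}' not found in {model_path}, skipping copy.")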
@@ -102,34 +98,42 @@ def save_full_model(
     Then save the complete model along with its tokenizer and processor configuration.

     Args:
-        saved_thinker_path (str): Path to the saved thinker weights.
-        base_model_path (str): Directory path of the original model.
+        model_path (str): Directory path of the original model.
+        thinker_path (str): Path to the saved thinker weights.
         save_path (str): Directory where the merged model and configurations will be saved.
         extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
     """
     # 1. Load the saved thinker module and the original model
-    thinker = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
-        saved_thinker_path, torch_dtype="auto", device_map="cpu"
-    )
-    base_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
-        base_model_path, torch_dtype="auto", device_map="cpu"
-    )
+    config = AutoConfig.from_pretrained(model_path)
+    if getattr(config, "model_type") == "qwen2_5_omni":
+        from transformers.models.qwen2_5_omni import Qwen2_5OmniThinkerForConditionalGeneration  # type: ignore
+
+        ThinkerClass = Qwen2_5OmniThinkerForConditionalGeneration
+    elif getattr(config, "model_type") == "qwen3_omni_moe":
+        from transformers.models.qwen3_omni_moe import Qwen3OmniMoeThinkerForConditionalGeneration  # type: ignore
+
+        ThinkerClass = Qwen3OmniMoeThinkerForConditionalGeneration
+    else:
+        raise ValueError(f"Unsupported model type: {getattr(config, 'model_type')}.")
+
+    thinker = ThinkerClass.from_pretrained(thinker_path, torch_dtype="auto", device_map="cpu")
+    base_model = AutoModelForTextToWaveform.from_pretrained(model_path, torch_dtype="auto", device_map="cpu")
     base_model.thinker = thinker
+    processor = AutoProcessor.from_pretrained(thinker_path)
+    print("Successfully loaded model weights and processor.")

     # 2. Save the complete model along with its tokenizer and processor configuration
-    processor = AutoProcessor.from_pretrained(saved_thinker_path)
     base_model.save_pretrained(save_path)
     processor.save_pretrained(save_path)
     print(f"Merged model and processor saved to {save_path}.")

     # 3. Copy the extra file from the base model directory to the save_path
-    source_file = os.path.join(base_model_path, extra_file)
-    target_file = os.path.join(save_path, extra_file)
-    if os.path.exists(source_file):
-        shutil.copy(source_file, os.path.join(save_path, extra_file))
-        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
-    else:
-        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    try:
+        source_file = cached_file(path_or_repo_id=model_path, filename=extra_file)
+        shutil.copy(source_file, os.path.join(save_path, extra_file))
+        print(f"File '{extra_file}' copied from {model_path} to {save_path}.")
+    except Exception:
+        print(f"File '{extra_file}' not found in {model_path}, skipping copy.")


 if __name__ == "__main__":
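Rather than hardcoding Qwen2.5-Omni classes, the merge script now dispatches on `config.model_type`. A self-contained sketch of that dispatch, assuming a transformers build that ships both omni families; the commented call at the end uses an example repo id:

from transformers import AutoConfig


def get_thinker_class(model_path: str):
    """Return the thinker class matching the checkpoint's model_type (mirrors the diff's dispatch)."""
    model_type = AutoConfig.from_pretrained(model_path).model_type
    if model_type == "qwen2_5_omni":
        from transformers.models.qwen2_5_omni import Qwen2_5OmniThinkerForConditionalGeneration

        return Qwen2_5OmniThinkerForConditionalGeneration
    elif model_type == "qwen3_omni_moe":
        from transformers.models.qwen3_omni_moe import Qwen3OmniMoeThinkerForConditionalGeneration

        return Qwen3OmniMoeThinkerForConditionalGeneration
    raise ValueError(f"Unsupported model type: {model_type}.")


# thinker_cls = get_thinker_class("Qwen/Qwen3-Omni-30B-A3B-Instruct")  # illustrative repo id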

setup.py (2 lines changed)
@@ -52,7 +52,7 @@ extra_require = {
     "eetq": ["eetq"],
     "gptq": ["optimum>=1.24.0", "gptqmodel>=2.0.0"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
-    "vllm": ["vllm>=0.4.3,<=0.10.0"],
+    "vllm": ["vllm>=0.4.3,<=0.10.2"],
     "sglang": ["sglang[srt]>=0.4.5", "transformers==4.51.1"],
     "galore": ["galore-torch"],
     "apollo": ["apollo-torch"],

@@ -194,7 +194,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
         elif "video_second_per_grid" in mm_inputs:  # for qwen2.5 omni
             rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid")

-        if getattr(self.model.config, "model_type", None) == "qwen2_5_omni_thinker":  # for qwen2.5 omni
+        if getattr(self.model.config, "model_type", None) in ["qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]:
             rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False)
             feature_attention_mask = mm_inputs.get("feature_attention_mask", None)
             if feature_attention_mask is not None:  # FIXME: need to get video image lengths
@@ -205,13 +205,22 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
                 features["rope_deltas"] = rope_deltas - (1 - rope_index_kwargs["attention_mask"]).sum(
                     dim=-1
                 ).unsqueeze(-1)
-            else:  # for qwen2vl
+            else:  # for qwen vl
                 features["position_ids"], features["rope_deltas"] = self.get_rope_func(**rope_index_kwargs)

         if (
             self.model is not None
             and getattr(self.model.config, "model_type", None)
-            in ["glm4v", "Keye", "qwen2_vl", "qwen2_5_vl", "qwen2_5_omni_thinker"]
+            in [
+                "glm4v",
+                "Keye",
+                "qwen2_vl",
+                "qwen2_5_vl",
+                "qwen2_5_omni_thinker",
+                "qwen3_omni_moe_thinker",
+                "qwen3_vl",
+                "qwen3_vl_moe",
+            ]
             and ("position_ids" not in features or features["position_ids"].dim() != 3)
         ):
             raise ValueError(f"{self.model.config.model_type} requires 3D position ids for mrope.")
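The widened list feeds the multimodal-rope (mrope) sanity check: every listed model type must ship 3D position ids. A toy, self-contained illustration of the contract (model list copied from the diff, tensor sizes invented):

import torch

MROPE_MODEL_TYPES = [
    "glm4v", "Keye", "qwen2_vl", "qwen2_5_vl",
    "qwen2_5_omni_thinker", "qwen3_omni_moe_thinker", "qwen3_vl", "qwen3_vl_moe",
]


def validate_position_ids(model_type: str, features: dict) -> None:
    # mrope expects position ids shaped (3, batch, seq_len): one channel each for the
    # temporal, height and width rotary components.
    if model_type in MROPE_MODEL_TYPES and (
        "position_ids" not in features or features["position_ids"].dim() != 3
    ):
        raise ValueError(f"{model_type} requires 3D position ids for mrope.")


validate_position_ids("qwen3_vl", {"position_ids": torch.zeros(3, 2, 16, dtype=torch.long)})  # passes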
@@ -1397,8 +1397,8 @@ class Qwen2AudioPlugin(BasePlugin):

 @dataclass
 class Qwen2VLPlugin(BasePlugin):
-    start_token: str = "<|vision_start|>"
-    end_token: str = "<|vision_end|>"
+    vision_bos_token: str = "<|vision_start|>"
+    vision_eos_token: str = "<|vision_end|>"

     @override
     def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
@@ -1515,14 +1515,18 @@ class Qwen2VLPlugin(BasePlugin):
                 while IMAGE_PLACEHOLDER in content:
                     image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                     content = content.replace(
-                        IMAGE_PLACEHOLDER, f"{self.start_token}{self.image_token * image_seqlen}{self.end_token}", 1
+                        IMAGE_PLACEHOLDER,
+                        f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                        1,
                     )
                     num_image_tokens += 1

                 while VIDEO_PLACEHOLDER in content:
                     video_seqlen = video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                     content = content.replace(
-                        VIDEO_PLACEHOLDER, f"{self.start_token}{self.video_token * video_seqlen}{self.end_token}", 1
+                        VIDEO_PLACEHOLDER,
+                        f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                        1,
                     )
                     num_video_tokens += 1
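The rename from `start_token`/`end_token` to `vision_bos_token`/`vision_eos_token` is mechanical, but the expansion it feeds is the heart of the plugin: each placeholder becomes BOS + N pad tokens + EOS, with N derived from the vision grid. A standalone sketch under those assumptions; the helper and sample values are illustrative, the token strings match the defaults in the diff:

IMAGE_PLACEHOLDER = "<image>"


def expand_image_placeholder(
    content: str,
    image_seqlen: int,  # grid_t * grid_h * grid_w // merge_length when mm tokens are expanded
    image_token: str = "<|image_pad|>",
    vision_bos_token: str = "<|vision_start|>",
    vision_eos_token: str = "<|vision_end|>",
) -> str:
    # Replace one placeholder occurrence with the bracketed run of pad tokens.
    return content.replace(
        IMAGE_PLACEHOLDER,
        f"{vision_bos_token}{image_token * image_seqlen}{vision_eos_token}",
        1,
    )


print(expand_image_placeholder("Describe <image> please.", image_seqlen=4))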
@@ -1611,7 +1615,9 @@ class Qwen3VLPlugin(Qwen2VLPlugin):
                     image_grid_thw[num_image_tokens].prod() // image_merge_length if self.expand_mm_tokens else 1
                 )
                 content = content.replace(
-                    IMAGE_PLACEHOLDER, f"{self.start_token}{self.image_token * image_seqlen}{self.end_token}", 1
+                    IMAGE_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                    1,
                 )
                 num_image_tokens += 1
@@ -1630,11 +1636,14 @@ class Qwen3VLPlugin(Qwen2VLPlugin):
                         else 1
                     )
                     timestamp_sec = timestamps[frame_index]
-                    frame_structure = f"<{timestamp_sec:.1f} seconds>{self.start_token}{self.video_token * video_seqlen}{self.end_token}"
+                    frame_structure = (
+                        f"<{timestamp_sec:.1f} seconds>"
+                        f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}"
+                    )
                     video_structure += frame_structure

                 if not self.expand_mm_tokens:
-                    video_structure = f"{self.start_token}{self.video_token}{self.end_token}"
+                    video_structure = f"{self.vision_bos_token}{self.video_token}{self.vision_eos_token}"

                 content = content.replace(VIDEO_PLACEHOLDER, video_structure, 1)
                 num_video_tokens += 1
@@ -1774,7 +1783,11 @@ class GLM4VPlugin(Qwen2VLPlugin):
         return mm_inputs


+@dataclass
 class Qwen2OmniPlugin(Qwen2VLPlugin):
+    audio_bos_token: str = "<|audio_start|>"
+    audio_eos_token: str = "<|audio_end|>"
+
     @override
     def _get_mm_inputs(
         self,
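Turning `Qwen2OmniPlugin` into a `@dataclass` with overridable audio markers is what lets one plugin serve both omni generations: the field defaults are the Qwen3-Omni tokens, while the Qwen2.5-Omni template passes its own `<|audio_bos|>`/`<|audio_eos|>` at registration (see the template hunks below). A toy showing the pattern; the class is a stand-in, not the real plugin:

from dataclasses import dataclass


@dataclass
class OmniSpecialTokens:
    audio_bos_token: str = "<|audio_start|>"  # Qwen3-Omni-style defaults
    audio_eos_token: str = "<|audio_end|>"


qwen3_omni = OmniSpecialTokens()  # falls back to the dataclass defaults
qwen2_5_omni = OmniSpecialTokens(audio_bos_token="<|audio_bos|>", audio_eos_token="<|audio_eos|>")
print(qwen3_omni.audio_bos_token, qwen2_5_omni.audio_bos_token)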
@@ -1861,7 +1874,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                 while IMAGE_PLACEHOLDER in content:
                     image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                     content = content.replace(
-                        IMAGE_PLACEHOLDER, f"<|vision_bos|>{self.image_token * image_seqlen}<|vision_eos|>", 1
+                        IMAGE_PLACEHOLDER,
+                        f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                        1,
                     )
                     num_image_tokens += 1
@@ -1898,7 +1913,7 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                     video_chunk_indices = processor.get_chunked_index(video_t_index, t_ntoken_per_chunk)
                     audio_chunk_indices = processor.get_chunked_index(audio_t_index, t_ntoken_per_chunk)
                     placeholder_string = ""
-                    placeholder_string += "<|vision_bos|>" + "<|audio_bos|>"
+                    placeholder_string += self.vision_bos_token + self.audio_bos_token
                     for j in range(max(len(video_chunk_indices), len(audio_chunk_indices))):
                         video_chunk_index = video_chunk_indices[j] if j < len(video_chunk_indices) else None
                         audio_chunk_index = audio_chunk_indices[j] if j < len(audio_chunk_indices) else None
@@ -1908,7 +1923,7 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                         if audio_chunk_index is not None:
                             placeholder_string += self.audio_token * (audio_chunk_index[1] - audio_chunk_index[0])

-                    placeholder_string += "<|audio_eos|>" + "<|vision_eos|>"
+                    placeholder_string += self.audio_eos_token + self.vision_eos_token
                     content = content.replace(VIDEO_PLACEHOLDER, placeholder_string, 1)
                     content = content.replace(AUDIO_PLACEHOLDER, "", 1)
                     num_audio_tokens += 1
@@ -1917,7 +1932,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                 while AUDIO_PLACEHOLDER in content:
                     audio_seqlen = audio_lengths[num_audio_tokens] if self.expand_mm_tokens else 1
                     content = content.replace(
-                        AUDIO_PLACEHOLDER, f"<|audio_bos|>{self.audio_token * audio_seqlen}<|audio_eos|>", 1
+                        AUDIO_PLACEHOLDER,
+                        f"{self.audio_bos_token}{self.audio_token * audio_seqlen}{self.audio_eos_token}",
+                        1,
                     )
                     num_audio_tokens += 1
@@ -1926,7 +1943,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                     video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                 )
                 content = content.replace(
-                    VIDEO_PLACEHOLDER, f"<|vision_bos|>{self.video_token * video_seqlen}<|vision_eos|>", 1
+                    VIDEO_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                    1,
                 )
                 num_video_tokens += 1
@@ -922,8 +922,8 @@ register_template(
         name="qwen2_vl",
         image_token="<|imgpad|>",
         video_token="<|vidpad|>",
-        start_token="<|img|>",
-        end_token="<|endofimg|>",
+        vision_bos_token="<|img|>",
+        vision_eos_token="<|endofimg|>",
     ),
 )
@@ -1862,7 +1862,14 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
     ),
 )
@@ -1880,7 +1887,7 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
     ),
     template_class=ReasoningTemplate,
 )
@@ -1899,7 +1906,7 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
     ),
 )
@@ -3060,13 +3060,14 @@ register_model_group(
     multimodal=True,
 )

+
 register_model_group(
     models={
-        "Qwen/Qwen3-Omni-30B-A3B-Captioner": {
+        "Qwen3-Omni-30B-A3B-Captioner": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
         },
-        "Qwen/Qwen3-Omni-30B-A3B-Instruct": {
+        "Qwen3-Omni-30B-A3B-Instruct": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
         },
@@ -3075,9 +3076,10 @@ register_model_group(
     multimodal=True,
 )

+
 register_model_group(
     models={
-        "Qwen/Qwen3-Omni-30B-A3B-Thinking": {
+        "Qwen3-Omni-30B-A3B-Thinking": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
         },
@@ -3086,6 +3088,7 @@ register_model_group(
     multimodal=True,
 )

+
 register_model_group(
     models={
         "Qwen2-VL-2B": {
@@ -3190,24 +3193,24 @@ register_model_group(

 register_model_group(
     models={
-        "Qwen/Qwen3-VL-235B-A22B-Thinking": {
-            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Thinking",
-            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+        "Qwen3-VL-235B-A22B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Instruct",
         },
     },
-    template="qwen3_vl",
+    template="qwen3_vl_nothink",
     multimodal=True,
 )


 register_model_group(
     models={
-        "Qwen/Qwen3-VL-235B-A22B-Instruct": {
-            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Instruct",
-            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+        "Qwen3-VL-235B-A22B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Thinking",
         },
     },
-    template="qwen3_vl_nothink",
+    template="qwen3_vl",
     multimodal=True,
 )
@@ -94,7 +94,7 @@ def check_version(requirement: str, mandatory: bool = False) -> None:

 def check_dependencies() -> None:
     r"""Check the version of the required packages."""
-    check_version("transformers>=4.49.0,<=4.56.1")
+    check_version("transformers>=4.49.0,<=4.56.2")
     check_version("datasets>=2.16.0,<=4.0.0")
     check_version("accelerate>=1.3.0,<=1.10.1")
     check_version("peft>=0.14.0,<=0.17.1")
@@ -147,7 +147,7 @@ def _check_extra_dependencies(
         check_version("mixture-of-depth>=1.1.6", mandatory=True)

     if model_args.infer_backend == EngineName.VLLM:
-        check_version("vllm>=0.4.3,<=0.10.0")
+        check_version("vllm>=0.4.3,<=0.10.2")
         check_version("vllm", mandatory=True)
     elif model_args.infer_backend == EngineName.SGLANG:
         check_version("sglang>=0.4.5")
@@ -174,7 +174,8 @@ def _check_extra_dependencies(
     if training_args is not None:
         if training_args.deepspeed:
             # pin deepspeed version < 0.17 because of https://github.com/deepspeedai/DeepSpeed/issues/7347
-            check_version("deepspeed>=0.10.0,<=0.16.9", mandatory=True)
+            check_version("deepspeed", mandatory=True)
+            check_version("deepspeed>=0.10.0,<=0.16.9")

         if training_args.predict_with_generate:
             check_version("jieba", mandatory=True)
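Splitting the deepspeed check changes the failure mode: installation is enforced unconditionally, while the version pin becomes a soft check. A toy re-implementation, assuming the real helper lets non-mandatory checks be skipped via a `DISABLE_VERSION_CHECK` environment variable and wraps transformers' `require_version`:

import os

from transformers.utils.versions import require_version


def check_version(requirement: str, mandatory: bool = False) -> None:
    # Non-mandatory checks can be opted out of (assumed behavior of the real helper).
    if not mandatory and os.getenv("DISABLE_VERSION_CHECK", "0").lower() in ("true", "1"):
        return
    require_version(requirement, f"To fix: run `pip install {requirement}`.")


check_version("deepspeed", mandatory=True)   # presence is enforced unconditionally
check_version("deepspeed>=0.10.0,<=0.16.9")  # the version pin is now soft and skippable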
@@ -162,7 +162,7 @@ def load_model(
             load_class = AutoModelForVision2Seq
         elif type(config) in AutoModelForSeq2SeqLM._model_mapping.keys():  # audio-text
             load_class = AutoModelForSeq2SeqLM
-        elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio hack for qwen2_5_omni
+        elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio hack for qwen omni
             load_class = AutoModelForTextToWaveform
         else:
             load_class = AutoModelForCausalLM
@@ -171,8 +171,8 @@ def load_model(
             model = load_class.from_config(config, trust_remote_code=model_args.trust_remote_code)
         else:
             model = load_class.from_pretrained(**init_kwargs)
-        if getattr(model.config, "model_type", None) == "qwen2_5_omni":
-            model = model.thinker  # use part of Omni model
+        if getattr(model.config, "model_type", None) in ["qwen2_5_omni", "qwen3_omni_moe"]:
+            model = getattr(model, "thinker")

     if model_args.mixture_of_depths == "convert":
         model = convert_pretrained_model_to_mod(model, config, model_args)
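Both omni families now load through `AutoModelForTextToWaveform` (the full checkpoint includes the audio "talker"), after which training keeps only the `thinker` submodule. A minimal sketch of that extraction; the checkpoint id is illustrative and the weights are large:

from transformers import AutoModelForTextToWaveform

# Illustrative checkpoint id; the real loader resolves this from user arguments.
model = AutoModelForTextToWaveform.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="cpu")
if getattr(model.config, "model_type", None) in ["qwen2_5_omni", "qwen3_omni_moe"]:
    model = model.thinker  # train only the multimodal "thinker"; the talker/vocoder stays untouched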
@@ -298,6 +298,7 @@ _register_composite_model(
     lora_conflict_keys=["audio_projection_layer"],
 )

+
 _register_composite_model(
     model_type="mistral3",
 )
@@ -351,6 +352,33 @@ _register_composite_model(
 )


+_register_composite_model(
+    model_type="qwen3_vl",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+_register_composite_model(
+    model_type="qwen3_vl_moe",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+_register_composite_model(
+    model_type="qwen3_omni_moe_thinker",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks", "audio_tower"],
+    language_model_keys=["model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
 _register_composite_model(
     model_type="video_llava",
 )
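The three new registrations tell the trainer which submodules make up the vision tower, the projector, and the language model, so freeze-tuning and LoRA logic can target them; note the Qwen3-Omni thinker folds `audio_tower` in with the vision keys. A hedged sketch of what such a registry entry might look like (the stand-in below only mirrors the fields used in the diff, not the repo's real class):

from dataclasses import dataclass, field


@dataclass
class CompositeModel:  # simplified stand-in for the entry behind _register_composite_model
    model_type: str
    projector_key: str
    vision_model_keys: list[str]
    language_model_keys: list[str]
    lora_conflict_keys: list[str] = field(default_factory=list)


qwen3_omni_entry = CompositeModel(
    model_type="qwen3_omni_moe_thinker",
    projector_key="visual.merger",  # module bridging vision features into the LM
    vision_model_keys=["visual.patch_embed", "visual.blocks", "audio_tower"],  # frozen in freeze-tuning
    language_model_keys=["model", "lm_head"],  # the part usually tuned
    lora_conflict_keys=["patch_embed"],  # modules LoRA adapters must skip
)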
@@ -332,7 +332,14 @@ def test_qwen2_omni_plugin():
     image_seqlen, audio_seqlen = 4, 2
     tokenizer_module = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2.5-Omni-7B")
     qwen2_omni_plugin = get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
     )
     check_inputs = {"plugin": qwen2_omni_plugin, **tokenizer_module}
     check_inputs["expected_mm_messages"] = [