diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cbe361ee..f2df50e8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
       - id: check-ast
       - id: check-added-large-files
@@ -15,13 +15,13 @@ repos:
         args: ['--branch', 'main']

   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.17.0
+    rev: v3.20.0
    hooks:
      - id: pyupgrade
-        args: [--py38-plus]
+        args: [--py39-plus]

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.9
+    rev: v0.13.2
     hooks:
       - id: ruff
         args: [--fix]
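Raising pyupgrade from `--py38-plus` to `--py39-plus` lets the hook rewrite Python 3.8-era typing imports into PEP 585 builtin generics. A minimal before/after sketch of what the hook now does (illustrative code, not from this repo):

```python
# Illustrative input/output for `pyupgrade --py39-plus`.

# Before: typing aliases, required on Python 3.8.
from typing import Dict, List


def shard_sizes(shards: List[str]) -> Dict[str, int]:
    return {name: len(name) for name in shards}


# After: PEP 585 builtin generics, valid since Python 3.9.
def shard_sizes_39(shards: list[str]) -> dict[str, int]:
    return {name: len(name) for name in shards}
```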
diff --git a/README.md b/README.md
index 542b27dc..92f24fe7 100644
--- a/README.md
+++ b/README.md
@@ -262,68 +262,70 @@ Choose your path:

 ## Supported Models

-| Model | Model size | Template |
-| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
-| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
-| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
-| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
-| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
-| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
-| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
-| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink/ernie |
-| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
-| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
-| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
-| [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
-| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
-| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
-| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
-| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
-| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
-| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
-| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
-| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
-| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
-| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
-| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
-| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
-| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
-| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
-| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
-| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
-| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
-| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
-| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
-| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
-| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
-| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
-| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
-| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
-| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
-| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
-| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
-| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
-| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
-| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
-| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
-| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
-| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
-| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
-| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
-| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
-| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
-| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
-| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
-| [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
-| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
-| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
-| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
-| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
-| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
-| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
-| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
-| [LING-V2 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| Model | Model size | Template |
+| ----------------------------------------------------------------- | -------------------------------- | -------------------- |
+| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
+| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
+| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
+| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
+| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
+| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
+| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
+| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
+| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
+| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
+| [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
+| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
+| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
+| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
+| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
+| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
+| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
+| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
+| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
+| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
+| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
+| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
+| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
+| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
+| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
+| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
+| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
+| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
+| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
+| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
+| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
+| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
+| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
+| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
+| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
+| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
+| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
+| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
+| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
+| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
+| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
+| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
+| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
+| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
+| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
+| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
+| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
+| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
+| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
+| [Qwen3-Omni](https://huggingface.co/Qwen)* | 30B | qwen3_omni |
+| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Qwen3-VL](https://huggingface.co/Qwen)* | 235B | qwen3_vl |
+| [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
+| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
+| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
+| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
+| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
+| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
+| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
+| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |

 > [!NOTE]
 > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.
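The NOTE above is the practical takeaway from this table: instruct/chat checkpoints only behave well when prompts are rendered with the chat markup they were trained on, which is what the `Template` column pins down. A rough illustration of how differently two templates serialize the same turn (token strings shown for illustration; the model cards are authoritative):

```python
# Two chat templates rendering the same user turn (illustrative strings).
user_message = "Hi!"

# llama3-style markup:
llama3_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{user_message}<|eot_id|>"

# qwen-style (ChatML) markup:
qwen_turn = f"<|im_start|>user\n{user_message}<|im_end|>\n"

# Training or serving with the wrong markup silently degrades quality,
# which is why each row above pins an exact `template` name.
```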
diff --git a/README_zh.md b/README_zh.md
index 356ce4b2..204cd2c2 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -264,68 +264,70 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc

 ## 模型

-| 模型名 | 参数量 | Template |
-| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
-| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
-| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
-| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
-| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
-| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
-| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
-| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
-| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie_nothink/ernie |
-| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
-| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
-| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
-| [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
-| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
-| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
-| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
-| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
-| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
-| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
-| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
-| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
-| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
-| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
-| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
-| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
-| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
-| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
-| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
-| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
-| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
-| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
-| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
-| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
-| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
-| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
-| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
-| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
-| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
-| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
-| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
-| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
-| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
-| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
-| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
-| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
-| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
-| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
-| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
-| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
-| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
-| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
-| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
-| [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
-| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
-| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
-| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
-| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
-| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
-| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
-| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
-| [LING-V2 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| 模型名 | 参数量 | Template |
+| ----------------------------------------------------------------- | -------------------------------- | -------------------- |
+| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
+| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
+| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
+| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
+| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
+| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
+| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
+| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
+| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
+| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
+| [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
+| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
+| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
+| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
+| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
+| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
+| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
+| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
+| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
+| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
+| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
+| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
+| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
+| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
+| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
+| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
+| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
+| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
+| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
+| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
+| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
+| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
+| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
+| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
+| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
+| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
+| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
+| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
+| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
+| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
+| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
+| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
+| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
+| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
+| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
+| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
+| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
+| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
+| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
+| [Qwen3-Omni](https://huggingface.co/Qwen)* | 30B | qwen3_omni |
+| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Qwen3-VL](https://huggingface.co/Qwen)* | 235B | qwen3_vl |
+| [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
+| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
+| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
+| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
+| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
+| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
+| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
+| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |

 > [!NOTE]
 > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。
diff --git a/requirements.txt b/requirements.txt
index dcdaa9b4..424cc643 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # core deps
-transformers>=4.49.0,<=4.56.1,!=4.52.0
+transformers>=4.49.0,<=4.56.2,!=4.52.0
 datasets>=2.16.0,<=4.0.0
 accelerate>=1.3.0,<=1.10.1
 peft>=0.14.0,<=0.17.1
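The `transformers` pin is a compound specifier: a floor, a ceiling, and one excluded release. To check a local environment against it the same way the project's `check_version` helper does, the `packaging` library evaluates such strings directly — a minimal sketch, assuming `packaging` is available:

```python
from importlib.metadata import version

from packaging.specifiers import SpecifierSet

# The same compound specifier as in requirements.txt.
spec = SpecifierSet(">=4.49.0,<=4.56.2,!=4.52.0")
installed = version("transformers")
if not spec.contains(installed, prereleases=True):
    raise RuntimeError(f"transformers=={installed} does not satisfy {spec}")
print(f"transformers=={installed} is within the supported range")
```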
diff --git a/scripts/qwen_omni_merge.py b/scripts/qwen_omni_merge.py
index e7722e38..7236d23c 100644
--- a/scripts/qwen_omni_merge.py
+++ b/scripts/qwen_omni_merge.py
@@ -29,33 +29,30 @@
 import shutil

 import fire
 from peft import PeftModel
-from transformers import (
-    AutoProcessor,
-    Qwen2_5OmniForConditionalGeneration,  # type: ignore
-    Qwen2_5OmniThinkerForConditionalGeneration,
-)
+from transformers import AutoConfig, AutoModelForTextToWaveform, AutoProcessor
+from transformers.utils import cached_file


 def merge_lora(
-    base_model_path: str,
-    lora_checkpoint_path: str,
+    model_path: str,
+    lora_path: str,
+    save_path: str = "./merged_model_checkpoint",
     extra_file: str = "spk_dict.pt",
     submodule_name: str = "thinker",
-    save_path: str = "./merged_model_checkpoint",
 ):
     """Load the original model, merge the LoRA weights.

     For a specified submodule, and save the final merged model along with its configurations.

     Args:
-        base_model_path (str): Path to the original model directory.
-        lora_checkpoint_path (str): Path to the directory containing LoRA weights.
+        model_path (str): Path to the original model directory.
+        lora_path (str): Path to the directory containing LoRA weights.
+        save_path (str): Directory where the merged model and configurations will be saved.
         extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
         submodule_name (str): Name of the submodule to merge (default: "thinker").
-        save_path (str): Directory where the merged model and configurations will be saved.
     """
     # 1. Load the original model
-    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(base_model_path, torch_dtype="auto", device_map="cpu")
+    model = AutoModelForTextToWaveform.from_pretrained(model_path, torch_dtype="auto", device_map="cpu")
     print("Successfully loaded the original model.")

     # 2. Extract the submodule to be merged (e.g., model.thinker)
@@ -66,13 +63,13 @@ def merge_lora(
     print(f"Successfully extracted submodule: {submodule_name}.")

     # 3. Load the LoRA weights onto the extracted submodule
-    lora_model = PeftModel.from_pretrained(base_submodule, lora_checkpoint_path)
-    processor = AutoProcessor.from_pretrained(lora_checkpoint_path)
-    print("LoRA weights and processor loaded successfully.")
+    lora_model = PeftModel.from_pretrained(base_submodule, lora_path)
+    processor = AutoProcessor.from_pretrained(lora_path)
+    print("Successfully loaded LoRA weights and processor.")

     # 4. Merge the LoRA weights into the submodule and unload the LoRA modules
     merged_submodule = lora_model.merge_and_unload()
-    print("LoRA weights merged successfully.")
+    print("Successfully merged LoRA weights.")

     # 5. Replace the original submodule with the merged submodule in the model
     setattr(model, submodule_name, merged_submodule)
@@ -80,20 +77,19 @@
     # 6. Save the final merged model along with the tokenizer and processor configuration
     model.save_pretrained(save_path)
     processor.save_pretrained(save_path)
-    print(f"Merged model and tokenizer saved to {save_path}.")
+    print(f"Merged model and processor saved to {save_path}.")

-    source_file = os.path.join(base_model_path, extra_file)
-    target_file = os.path.join(save_path, extra_file)
-    if os.path.exists(source_file):
-        shutil.copy(source_file, target_file)
-        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
-    else:
-        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    try:
+        source_file = cached_file(path_or_repo_id=model_path, filename=extra_file)
+        shutil.copy(source_file, os.path.join(save_path, extra_file))
+        print(f"File '{extra_file}' copied from {model_path} to {save_path}.")
+    except Exception:
+        print(f"File '{extra_file}' not found in {model_path}, skipping copy.")


 def save_full_model(
-    saved_thinker_path: str,
-    base_model_path: str,
+    model_path: str,
+    thinker_path: str,
     save_path: str = "./merged_model_checkpoint",
     extra_file: str = "spk_dict.pt",
 ):
@@ -102,34 +98,42 @@
     Then save the complete model along with its tokenizer and processor configuration.

     Args:
-        saved_thinker_path (str): Path to the saved thinker weights.
-        base_model_path (str): Directory path of the original model.
+        model_path (str): Directory path of the original model.
+        thinker_path (str): Path to the saved thinker weights.
         save_path (str): Directory where the merged model and configurations will be saved.
         extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
     """
     # 1. Load the saved thinker module and the original model
-    thinker = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
-        saved_thinker_path, torch_dtype="auto", device_map="cpu"
-    )
-    base_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
-        base_model_path, torch_dtype="auto", device_map="cpu"
-    )
+    config = AutoConfig.from_pretrained(model_path)
+    if getattr(config, "model_type") == "qwen2_5_omni":
+        from transformers.models.qwen2_5_omni import Qwen2_5OmniThinkerForConditionalGeneration  # type: ignore
+
+        ThinkerClass = Qwen2_5OmniThinkerForConditionalGeneration
+    elif getattr(config, "model_type") == "qwen3_omni_moe":
+        from transformers.models.qwen3_omni_moe import Qwen3OmniMoeThinkerForConditionalGeneration  # type: ignore
+
+        ThinkerClass = Qwen3OmniMoeThinkerForConditionalGeneration
+    else:
+        raise ValueError(f"Unsupported model type: {getattr(config, 'model_type')}.")
+
+    thinker = ThinkerClass.from_pretrained(thinker_path, torch_dtype="auto", device_map="cpu")
+    base_model = AutoModelForTextToWaveform.from_pretrained(model_path, torch_dtype="auto", device_map="cpu")
     base_model.thinker = thinker
+    processor = AutoProcessor.from_pretrained(thinker_path)
+    print("Successfully loaded model weights and processor.")

     # 2. Save the complete model along with its tokenizer and processor configuration
-    processor = AutoProcessor.from_pretrained(saved_thinker_path)
     base_model.save_pretrained(save_path)
     processor.save_pretrained(save_path)
     print(f"Merged model and processor saved to {save_path}.")

     # 3. Copy the extra file from the base model directory to the save_path
-    source_file = os.path.join(base_model_path, extra_file)
-    target_file = os.path.join(save_path, extra_file)
-    if os.path.exists(source_file):
-        shutil.copy(source_file, target_file)
-        print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
-    else:
-        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
+    try:
+        source_file = cached_file(path_or_repo_id=model_path, filename=extra_file)
+        shutil.copy(source_file, os.path.join(save_path, extra_file))
+        print(f"File '{extra_file}' copied from {model_path} to {save_path}.")
+    except Exception:
+        print(f"File '{extra_file}' not found in {model_path}, skipping copy.")


 if __name__ == "__main__":
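Since the script is driven by `fire`, the renamed parameters are also the new CLI flags. A hypothetical end-to-end flow with the new names (paths are placeholders; assumes the script's directory is importable):

```python
# Hypothetical usage of the renamed entry points (paths are placeholders).
# Equivalent CLI: python scripts/qwen_omni_merge.py merge_lora --model_path ... --lora_path ...
from qwen_omni_merge import merge_lora, save_full_model

# Merge LoRA adapters trained on the thinker back into the full Omni model:
merge_lora(
    model_path="Qwen/Qwen2.5-Omni-7B",
    lora_path="saves/qwen_omni_lora",
    save_path="output/merged_model",
)

# Or, after full-parameter training of the thinker, reassemble the complete model:
save_full_model(
    model_path="Qwen/Qwen2.5-Omni-7B",
    thinker_path="saves/qwen_omni_thinker",
    save_path="output/full_model",
)
```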
diff --git a/setup.py b/setup.py
index 6a079ac8..08ba557e 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@ extra_require = {
     "eetq": ["eetq"],
     "gptq": ["optimum>=1.24.0", "gptqmodel>=2.0.0"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
-    "vllm": ["vllm>=0.4.3,<=0.10.0"],
+    "vllm": ["vllm>=0.4.3,<=0.10.2"],
     "sglang": ["sglang[srt]>=0.4.5", "transformers==4.51.1"],
     "galore": ["galore-torch"],
     "apollo": ["apollo-torch"],
diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py
index cfeecd86..162f432c 100644
--- a/src/llamafactory/data/collator.py
+++ b/src/llamafactory/data/collator.py
@@ -194,7 +194,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
             elif "video_second_per_grid" in mm_inputs:  # for qwen2.5 omni
                 rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid")

-            if getattr(self.model.config, "model_type", None) == "qwen2_5_omni_thinker":  # for qwen2.5 omni
+            if getattr(self.model.config, "model_type", None) in ["qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]:
                 rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False)
                 feature_attention_mask = mm_inputs.get("feature_attention_mask", None)
                 if feature_attention_mask is not None:  # FIXME: need to get video image lengths
@@ -205,13 +205,22 @@
                     features["rope_deltas"] = rope_deltas - (1 - rope_index_kwargs["attention_mask"]).sum(
                         dim=-1
                     ).unsqueeze(-1)
-            else:  # for qwen2vl
+            else:  # for qwen vl
                 features["position_ids"], features["rope_deltas"] = self.get_rope_func(**rope_index_kwargs)

         if (
             self.model is not None
             and getattr(self.model.config, "model_type", None)
-            in ["glm4v", "Keye", "qwen2_vl", "qwen2_5_vl", "qwen2_5_omni_thinker"]
+            in [
+                "glm4v",
+                "Keye",
+                "qwen2_vl",
+                "qwen2_5_vl",
+                "qwen2_5_omni_thinker",
+                "qwen3_omni_moe_thinker",
+                "qwen3_vl",
+                "qwen3_vl_moe",
+            ]
             and ("position_ids" not in features or features["position_ids"].dim() != 3)
         ):
             raise ValueError(f"{self.model.config.model_type} requires 3D position ids for mrope.")
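The widened allow-list and the `ValueError` both come down to mrope: these model types compute rotary positions along three axes (temporal, height, width), so `position_ids` must be a 3-D tensor shaped `(3, batch, seq_len)` instead of the usual `(batch, seq_len)`. A shape-only sketch of what the `.dim() != 3` check enforces:

```python
import torch

batch_size, seq_len = 2, 16

# Standard RoPE: one position index per token.
position_ids = torch.arange(seq_len).expand(batch_size, seq_len)
assert position_ids.dim() == 2

# mrope: separate temporal/height/width position streams per token.
mrope_position_ids = torch.arange(seq_len).expand(3, batch_size, seq_len)
assert mrope_position_ids.dim() == 3  # what the collator check requires
```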
diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index b0069212..6916d962 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -1397,8 +1397,8 @@ class Qwen2AudioPlugin(BasePlugin):

 @dataclass
 class Qwen2VLPlugin(BasePlugin):
-    start_token: str = "<|vision_start|>"
-    end_token: str = "<|vision_end|>"
+    vision_bos_token: str = "<|vision_start|>"
+    vision_eos_token: str = "<|vision_end|>"

     @override
     def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
@@ -1515,14 +1515,18 @@
         while IMAGE_PLACEHOLDER in content:
             image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
             content = content.replace(
-                IMAGE_PLACEHOLDER, f"{self.start_token}{self.image_token * image_seqlen}{self.end_token}", 1
+                IMAGE_PLACEHOLDER,
+                f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                1,
             )
             num_image_tokens += 1

         while VIDEO_PLACEHOLDER in content:
             video_seqlen = video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
             content = content.replace(
-                VIDEO_PLACEHOLDER, f"{self.start_token}{self.video_token * video_seqlen}{self.end_token}", 1
+                VIDEO_PLACEHOLDER,
+                f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                1,
             )
             num_video_tokens += 1

@@ -1611,7 +1615,9 @@ class Qwen3VLPlugin(Qwen2VLPlugin):
                 image_grid_thw[num_image_tokens].prod() // image_merge_length if self.expand_mm_tokens else 1
             )
             content = content.replace(
-                IMAGE_PLACEHOLDER, f"{self.start_token}{self.image_token * image_seqlen}{self.end_token}", 1
+                IMAGE_PLACEHOLDER,
+                f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                1,
             )
             num_image_tokens += 1

@@ -1630,11 +1636,14 @@ class Qwen3VLPlugin(Qwen2VLPlugin):
                 else 1
             )
             timestamp_sec = timestamps[frame_index]
-            frame_structure = f"<{timestamp_sec:.1f} seconds>{self.start_token}{self.video_token * video_seqlen}{self.end_token}"
+            frame_structure = (
+                f"<{timestamp_sec:.1f} seconds>"
+                f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}"
+            )
             video_structure += frame_structure

         if not self.expand_mm_tokens:
-            video_structure = f"{self.start_token}{self.video_token}{self.end_token}"
+            video_structure = f"{self.vision_bos_token}{self.video_token}{self.vision_eos_token}"

         content = content.replace(VIDEO_PLACEHOLDER, video_structure, 1)
         num_video_tokens += 1
@@ -1774,7 +1783,11 @@ class GLM4VPlugin(Qwen2VLPlugin):
         return mm_inputs


+@dataclass
 class Qwen2OmniPlugin(Qwen2VLPlugin):
+    audio_bos_token: str = "<|audio_start|>"
+    audio_eos_token: str = "<|audio_end|>"
+
     @override
     def _get_mm_inputs(
         self,
@@ -1861,7 +1874,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
         while IMAGE_PLACEHOLDER in content:
             image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
             content = content.replace(
-                IMAGE_PLACEHOLDER, f"<|vision_bos|>{self.image_token * image_seqlen}<|vision_eos|>", 1
+                IMAGE_PLACEHOLDER,
+                f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+                1,
             )
             num_image_tokens += 1

@@ -1898,7 +1913,7 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                 video_chunk_indices = processor.get_chunked_index(video_t_index, t_ntoken_per_chunk)
                 audio_chunk_indices = processor.get_chunked_index(audio_t_index, t_ntoken_per_chunk)
                 placeholder_string = ""
-                placeholder_string += "<|vision_bos|>" + "<|audio_bos|>"
+                placeholder_string += self.vision_bos_token + self.audio_bos_token
                 for j in range(max(len(video_chunk_indices), len(audio_chunk_indices))):
                     video_chunk_index = video_chunk_indices[j] if j < len(video_chunk_indices) else None
                     audio_chunk_index = audio_chunk_indices[j] if j < len(audio_chunk_indices) else None
@@ -1908,7 +1923,7 @@ class Qwen2OmniPlugin(Qwen2VLPlugin):
                     if audio_chunk_index is not None:
                         placeholder_string += self.audio_token * (audio_chunk_index[1] - audio_chunk_index[0])

-                placeholder_string += "<|audio_eos|>" + "<|vision_eos|>"
+                placeholder_string += self.audio_eos_token + self.vision_eos_token
                 content = content.replace(VIDEO_PLACEHOLDER, placeholder_string, 1)
                 content = content.replace(AUDIO_PLACEHOLDER, "", 1)
                 num_audio_tokens += 1
@@ -1917,7 +1932,9 @@
             while AUDIO_PLACEHOLDER in content:
                 audio_seqlen = audio_lengths[num_audio_tokens] if self.expand_mm_tokens else 1
                 content = content.replace(
-                    AUDIO_PLACEHOLDER, f"<|audio_bos|>{self.audio_token * audio_seqlen}<|audio_eos|>", 1
+                    AUDIO_PLACEHOLDER,
+                    f"{self.audio_bos_token}{self.audio_token * audio_seqlen}{self.audio_eos_token}",
+                    1,
                 )
                 num_audio_tokens += 1

             while VIDEO_PLACEHOLDER in content:
                 video_seqlen = (
                     video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
                 )
                 content = content.replace(
-                    VIDEO_PLACEHOLDER, f"<|vision_bos|>{self.video_token * video_seqlen}<|vision_eos|>", 1
+                    VIDEO_PLACEHOLDER,
+                    f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+                    1,
                 )
                 num_video_tokens += 1
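Promoting the hard-coded `<|vision_bos|>`/`<|audio_bos|>` literals to dataclass fields lets every Qwen plugin share one expansion routine while templates inject model-specific tokens. A standalone sketch of the step the image branch performs (simplified, not the plugin class itself):

```python
# Simplified placeholder expansion, mirroring the plugin's image branch.
IMAGE_PLACEHOLDER = "<image>"


def expand_images(
    content: str,
    seqlens: list[int],
    image_token: str,
    vision_bos_token: str,
    vision_eos_token: str,
) -> str:
    num_image_tokens = 0
    while IMAGE_PLACEHOLDER in content:
        seqlen = seqlens[num_image_tokens]
        content = content.replace(
            IMAGE_PLACEHOLDER,
            f"{vision_bos_token}{image_token * seqlen}{vision_eos_token}",
            1,
        )
        num_image_tokens += 1
    return content


print(expand_images("Describe <image>.", [4], "<|IMAGE|>", "<|vision_bos|>", "<|vision_eos|>"))
# Describe <|vision_bos|><|IMAGE|><|IMAGE|><|IMAGE|><|IMAGE|><|vision_eos|>.
```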
diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py
index 330ff50c..792b0cfe 100644
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -922,8 +922,8 @@ register_template(
         name="qwen2_vl",
         image_token="<|imgpad|>",
         video_token="<|vidpad|>",
-        start_token="<|img|>",
-        end_token="<|endofimg|>",
+        vision_bos_token="<|img|>",
+        vision_eos_token="<|endofimg|>",
     ),
 )

@@ -1862,7 +1862,14 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
     ),
 )

@@ -1880,7 +1887,7 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
     ),
     template_class=ReasoningTemplate,
 )

@@ -1899,7 +1906,7 @@ register_template(
     stop_words=["<|im_end|>"],
     replace_eos=True,
     mm_plugin=get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni", image_token="<|image_pad|>", video_token="<|video_pad|>", audio_token="<|audio_pad|>"
     ),
 )

diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index e3f5d708..199e4cdf 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -3060,13 +3060,14 @@ register_model_group(
     multimodal=True,
 )

+
 register_model_group(
     models={
-        "Qwen/Qwen3-Omni-30B-A3B-Captioner": {
+        "Qwen3-Omni-30B-A3B-Captioner": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Captioner",
         },
-        "Qwen/Qwen3-Omni-30B-A3B-Instruct": {
+        "Qwen3-Omni-30B-A3B-Instruct": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Instruct",
         },
@@ -3075,9 +3076,10 @@ register_model_group(
     multimodal=True,
 )

+
 register_model_group(
     models={
-        "Qwen/Qwen3-Omni-30B-A3B-Thinking": {
+        "Qwen3-Omni-30B-A3B-Thinking": {
             DownloadSource.DEFAULT: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
             DownloadSource.MODELSCOPE: "Qwen/Qwen3-Omni-30B-A3B-Thinking",
         },
@@ -3086,6 +3088,7 @@ register_model_group(
     multimodal=True,
 )

+
 register_model_group(
     models={
         "Qwen2-VL-2B": {
@@ -3190,24 +3193,24 @@ register_model_group(

 register_model_group(
     models={
-        "Qwen/Qwen3-VL-235B-A22B-Thinking": {
-            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Thinking",
-            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+        "Qwen3-VL-235B-A22B-Instruct": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Instruct",
         },
     },
-    template="qwen3_vl",
+    template="qwen3_vl_nothink",
     multimodal=True,
 )


 register_model_group(
     models={
-        "Qwen/Qwen3-VL-235B-A22B-Instruct": {
-            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Instruct",
-            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Instruct",
+        "Qwen3-VL-235B-A22B-Thinking": {
+            DownloadSource.DEFAULT: "Qwen/Qwen3-VL-235B-A22B-Thinking",
+            DownloadSource.MODELSCOPE: "Qwen/Qwen3-VL-235B-A22B-Thinking",
         },
     },
-    template="qwen3_vl_nothink",
+    template="qwen3_vl",
     multimodal=True,
 )

diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py
index e1fb2e62..4f1778ba 100644
--- a/src/llamafactory/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
@@ -94,7 +94,7 @@ def check_version(requirement: str, mandatory: bool = False) -> None:

 def check_dependencies() -> None:
     r"""Check the version of the required packages."""
-    check_version("transformers>=4.49.0,<=4.56.1")
+    check_version("transformers>=4.49.0,<=4.56.2")
     check_version("datasets>=2.16.0,<=4.0.0")
     check_version("accelerate>=1.3.0,<=1.10.1")
     check_version("peft>=0.14.0,<=0.17.1")
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index 2a2fc2ee..cd3ad9aa 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -147,7 +147,7 @@ def _check_extra_dependencies(
         check_version("mixture-of-depth>=1.1.6", mandatory=True)

     if model_args.infer_backend == EngineName.VLLM:
-        check_version("vllm>=0.4.3,<=0.10.0")
+        check_version("vllm>=0.4.3,<=0.10.2")
         check_version("vllm", mandatory=True)
     elif model_args.infer_backend == EngineName.SGLANG:
         check_version("sglang>=0.4.5")
@@ -174,7 +174,8 @@
     if training_args is not None:
         if training_args.deepspeed:
             # pin deepspeed version < 0.17 because of https://github.com/deepspeedai/DeepSpeed/issues/7347
-            check_version("deepspeed>=0.10.0,<=0.16.9", mandatory=True)
+            check_version("deepspeed", mandatory=True)
+            check_version("deepspeed>=0.10.0,<=0.16.9")

         if training_args.predict_with_generate:
             check_version("jieba", mandatory=True)
diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py
index 8793135f..37dffcbe 100644
--- a/src/llamafactory/model/loader.py
+++ b/src/llamafactory/model/loader.py
@@ -162,7 +162,7 @@
                 load_class = AutoModelForVision2Seq
             elif type(config) in AutoModelForSeq2SeqLM._model_mapping.keys():  # audio-text
                 load_class = AutoModelForSeq2SeqLM
-            elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio hack for qwen2_5_omni
+            elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio hack for qwen omni
                 load_class = AutoModelForTextToWaveform
             else:
                 load_class = AutoModelForCausalLM
@@ -171,8 +171,8 @@
             model = load_class.from_config(config, trust_remote_code=model_args.trust_remote_code)
         else:
             model = load_class.from_pretrained(**init_kwargs)
-        if getattr(model.config, "model_type", None) == "qwen2_5_omni":
-            model = model.thinker  # use part of Omni model
+        if getattr(model.config, "model_type", None) in ["qwen2_5_omni", "qwen3_omni_moe"]:
+            model = getattr(model, "thinker")

     if model_args.mixture_of_depths == "convert":
         model = convert_pretrained_model_to_mod(model, config, model_args)
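Both Omni generations load through `AutoModelForTextToWaveform` as a wrapper that bundles the text "thinker" with the speech "talker"; fine-tuning only touches the thinker, which is why the loader swaps in that submodule. A sketch of the dispatch (downloads weights if given a Hub id):

```python
# Sketch of the Omni loading path (downloads weights if given a Hub id).
from transformers import AutoConfig, AutoModelForTextToWaveform

config = AutoConfig.from_pretrained("Qwen/Qwen2.5-Omni-7B")
print(config.model_type)  # "qwen2_5_omni" (Qwen3-Omni reports "qwen3_omni_moe")

model = AutoModelForTextToWaveform.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="cpu"
)
# Training proceeds on the text submodule alone; the talker is reattached
# later by scripts/qwen_omni_merge.py when exporting a complete model.
thinker = model.thinker
```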
diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py
index e5c39280..cfbe6a22 100644
--- a/src/llamafactory/model/model_utils/visual.py
+++ b/src/llamafactory/model/model_utils/visual.py
@@ -298,6 +298,7 @@ _register_composite_model(
     lora_conflict_keys=["audio_projection_layer"],
 )

+
 _register_composite_model(
     model_type="mistral3",
 )
@@ -351,6 +352,33 @@ _register_composite_model(
 )


+_register_composite_model(
+    model_type="qwen3_vl",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+_register_composite_model(
+    model_type="qwen3_vl_moe",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks"],
+    language_model_keys=["language_model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
+_register_composite_model(
+    model_type="qwen3_omni_moe_thinker",
+    projector_key="visual.merger",
+    vision_model_keys=["visual.patch_embed", "visual.blocks", "audio_tower"],
+    language_model_keys=["model", "lm_head"],
+    lora_conflict_keys=["patch_embed"],
+)
+
+
 _register_composite_model(
     model_type="video_llava",
 )
diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py
index 406307d0..87281f7e 100644
--- a/tests/data/test_mm_plugin.py
+++ b/tests/data/test_mm_plugin.py
@@ -332,7 +332,14 @@ def test_qwen2_omni_plugin():
     image_seqlen, audio_seqlen = 4, 2
     tokenizer_module = _load_tokenizer_module(model_name_or_path="Qwen/Qwen2.5-Omni-7B")
     qwen2_omni_plugin = get_mm_plugin(
-        name="qwen2_omni", audio_token="<|AUDIO|>", image_token="<|IMAGE|>", video_token="<|VIDEO|>"
+        name="qwen2_omni",
+        image_token="<|IMAGE|>",
+        video_token="<|VIDEO|>",
+        audio_token="<|AUDIO|>",
+        vision_bos_token="<|vision_bos|>",
+        vision_eos_token="<|vision_eos|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
     )
     check_inputs = {"plugin": qwen2_omni_plugin, **tokenizer_module}
     check_inputs["expected_mm_messages"] = [