diff --git a/README.md b/README.md index 2c059e7d..73238452 100644 --- a/README.md +++ b/README.md @@ -214,51 +214,51 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models -| Model | Model size | Template | -| ----------------------------------------------------------------- | -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | -| [DeepSeek R1](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseek3 | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - | -| [Granite 3.0-3.1](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 | -| [Index](https://huggingface.co/IndexTeam) | 1.9B | index | -| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_v | -| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi | -| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small | -| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 | -| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | -| [Qwen/QwQ (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio | -| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/72B | qwen2_vl | -| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| Model | Model size | Template | +| ----------------------------------------------------------------- | -------------------------------- | ------------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | +| [DeepSeek R1](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseek3 | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - | +| [Granite 3.0-3.1](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 | +| [Index](https://huggingface.co/IndexTeam) | 1.9B | index | +| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v | +| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi | +| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small | +| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 | +| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | +| [Qwen/QwQ (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio | +| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/72B | qwen2_vl | +| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. diff --git a/README_zh.md b/README_zh.md index 294f8ab5..2773db49 100644 --- a/README_zh.md +++ b/README_zh.md @@ -216,51 +216,51 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 ## 模型 -| 模型名 | 模型大小 | Template | -| ----------------------------------------------------------------- | -------------------------------- | ---------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | -| [DeepSeek R1](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseek3 | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - | -| [Granite 3.0-3.1](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 | -| [Index](https://huggingface.co/IndexTeam) | 1.9B | index | -| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 | -| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | -| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | -| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | -| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | -| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_v | -| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi | -| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small | -| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 | -| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | -| [Qwen/QwQ (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | -| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio | -| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/72B | qwen2_vl | -| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 | -| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| 模型名 | 模型大小 | Template | +| ----------------------------------------------------------------- | -------------------------------- | ------------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | +| [DeepSeek R1](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseek3 | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - | +| [Granite 3.0-3.1](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 | +| [Index](https://huggingface.co/IndexTeam) | 1.9B | index | +| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 | +| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v | +| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi | +| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small | +| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 | +| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | +| [Qwen/QwQ (1-2.5) (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio | +| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/72B | qwen2_vl | +| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index 9435bed4..93cd1515 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -106,7 +106,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): batch_audlens.append(len(audios)) batch_input_ids.append(feature["input_ids"]) - fake_input_ids = None + fake_input_ids = [] if ( self.template.mm_plugin.image_token is not None and sum(batch_imglens) == 0 and sum(batch_vidlens) == 0 ): # avoid process hanging in zero3/fsdp case @@ -115,10 +115,11 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): fake_messages = self.template.mm_plugin.process_messages( fake_messages, fake_images, [], [], self.processor ) - fake_input_ids = self.tokenizer.encode(fake_messages[0]["content"], add_special_tokens=False) - fake_input_ids, _ = self.template.mm_plugin.process_token_ids( - fake_input_ids, None, fake_images, [], [], self.tokenizer, self.processor + _fake_input_ids = self.tokenizer.encode(fake_messages[0]["content"], add_special_tokens=False) + _fake_input_ids, _ = self.template.mm_plugin.process_token_ids( + _fake_input_ids, None, fake_images, [], [], self.tokenizer, self.processor ) + fake_input_ids.extend(_fake_input_ids) batch_images = fake_images batch_imglens[0] = 1 @@ -130,14 +131,15 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): fake_messages = self.template.mm_plugin.process_messages( fake_messages, [], [], fake_audios, self.processor ) - fake_input_ids = self.tokenizer.encode(fake_messages[0]["content"], add_special_tokens=False) - fake_input_ids, _ = self.template.mm_plugin.process_token_ids( - fake_input_ids, None, [], [], fake_audios, self.tokenizer, self.processor + _fake_input_ids = self.tokenizer.encode(fake_messages[0]["content"], add_special_tokens=False) + _fake_input_ids, _ = self.template.mm_plugin.process_token_ids( + _fake_input_ids, None, [], [], fake_audios, self.tokenizer, self.processor ) + fake_input_ids.extend(_fake_input_ids) batch_audios = fake_audios batch_audlens[0] = 1 - if fake_input_ids is not None: + if len(fake_input_ids) != 0: if self.tokenizer.padding_side == "right": features[0]["input_ids"] = features[0]["input_ids"] + fake_input_ids features[0]["attention_mask"] = features[0]["attention_mask"] + [0] * len(fake_input_ids) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index d6e72159..26216c4a 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -645,6 +645,12 @@ class MiniCPMVPlugin(BasePlugin): chunk_input=True, sampling_rate=16000, ) + audio_feature_lens = [ + torch.tensor(audio_feature_len) + if not isinstance(audio_feature_len, torch.Tensor) + else audio_feature_len + for audio_feature_len in audio_feature_lens + ] mm_inputs.update({"audio_features": audio_features, "audio_feature_lens": audio_feature_lens}) if kwargs.get("ret_phs", False): mm_inputs.update({"audio_phs": audio_phs}) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 2a1d1cfd..3106c734 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -982,6 +982,17 @@ _register_template( format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]), format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), stop_words=["<|im_end|>"], + mm_plugin=get_mm_plugin(name="minicpm_v", image_token="", video_token="