diff --git a/README.md b/README.md
index d613fcd1..11700e93 100644
--- a/README.md
+++ b/README.md
@@ -166,7 +166,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | q_proj,v_proj | - |
 | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | q_proj,v_proj | xverse |
 | [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi |
-| [Yi-VL](https://huggingface.co/01-ai) | 6B | q_proj,v_proj | yi_vl |
+| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | q_proj,v_proj | yi_vl |
 | [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | q_proj,v_proj | yuan |
 
 > [!NOTE]
diff --git a/README_zh.md b/README_zh.md
index 0d91b2f4..146d046d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -166,7 +166,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | q_proj,v_proj | - |
 | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | q_proj,v_proj | xverse |
 | [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi |
-| [Yi-VL](https://huggingface.co/01-ai) | 6B | q_proj,v_proj | yi_vl |
+| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | q_proj,v_proj | yi_vl |
 | [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | q_proj,v_proj | yuan |
 
 > [!NOTE]
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index c427411a..f1ee55a0 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -1215,6 +1215,9 @@ register_model_group(
         "YiVL-6B-Chat": {
             DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-6B-hf",
         },
+        "YiVL-34B-Chat": {
+            DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-34B-hf",
+        },
     },
     template="yi_vl",
     vision=True,
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index ea55de27..08cdf17f 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -78,8 +78,15 @@ def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule":
     patch_tokenizer(tokenizer)
 
     if model_args.visual_inputs:
-        processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs)
-        setattr(processor, "tokenizer", tokenizer)
+        try:
+            processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs)
+            setattr(processor, "tokenizer", tokenizer)
+        except Exception:
+            raise ValueError(
+                "This multimodal LLM is not supported.\n"
+                "Download LLaVA-1.5 models from: https://huggingface.co/llava-hf\n"
+                "Download Yi-VL models from: https://huggingface.co/BUAADreamer"
+            )
     else:
         processor = None
 
diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index 33fb394d..50f92d22 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -58,7 +58,7 @@ class LlavaMultiModalProjectorForYiVLForVLLM(LlavaMultiModalProjectorForYiVL):
         self.linear_2 = torch.nn.LayerNorm(text_hidden_size, bias=True)
         self.linear_3 = torch.nn.Linear(text_hidden_size, text_hidden_size, bias=True)
         self.linear_4 = torch.nn.LayerNorm(text_hidden_size, bias=True)
-        self.act = torch.nn.GELU()
+        self.act = ACT2FN[projector_hidden_act]
 
 
 def autocast_projector_dtype(
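
A minimal, illustrative sketch (not part of the patch; TinyProjector is a hypothetical stand-in for the Yi-VL projector) of why the last hunk swaps the hard-coded torch.nn.GELU() for transformers' ACT2FN lookup: the activation now follows the projector_hidden_act value from the model config instead of always being GELU.

import torch
from transformers.activations import ACT2FN  # maps config strings ("gelu", "silu", ...) to activation modules


class TinyProjector(torch.nn.Module):
    """Hypothetical one-layer stand-in for the Yi-VL multimodal projector."""

    def __init__(self, hidden_size: int, projector_hidden_act: str = "gelu") -> None:
        super().__init__()
        self.linear = torch.nn.Linear(hidden_size, hidden_size, bias=True)
        self.norm = torch.nn.LayerNorm(hidden_size)
        # Resolved from the config string, so a model that declares e.g. "silu"
        # is no longer silently given GELU.
        self.act = ACT2FN[projector_hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.act(self.norm(self.linear(hidden_states)))


projector = TinyProjector(hidden_size=8, projector_hidden_act="gelu")
print(projector(torch.randn(2, 8)).shape)  # torch.Size([2, 8])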