Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-10-15 08:08:09 +08:00)

support llava-next(video)

Former-commit-id: 27e94593ac467e56e3a7f5c64f4ff6cee81f4b47
Parent: dfff411e1a
Commit: 484128b641
README.md (58 changes)

@@ -160,34 +160,36 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/

 ## Supported Models

 | Model | Model size | Template |
-| ----------------------------------------------------------------- | -------------------------------- | --------- |
+|-------------------------------------------------------------------| -------------------------------- |------------------|
 | [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
 | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
 | [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
 | [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
 | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
 | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma |
 | [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 |
 | [InternLM2/InternLM2.5](https://huggingface.co/internlm) | 7B/20B | intern2 |
 | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
 | [Llama 3/Llama 3.1](https://huggingface.co/meta-llama) | 8B/70B | llama3 |
 | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
+| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/13B | llava_next |
+| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/13B | llava_next_video |
 | [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
 | [PaliGemma](https://huggingface.co/google) | 3B | paligemma |
 | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
 | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi |
 | [Qwen/Qwen1.5/Qwen2 (Code/Math/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen |
 | [Qwen2-VL](https://huggingface.co/Qwen) | 2B/7B | qwen2_vl |
 | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
 | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
 | [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
 | [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
 | [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |

 > [!NOTE]
 > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.
@@ -176,6 +176,8 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
 | [Llama 3/Llama 3.1](https://huggingface.co/meta-llama) | 8B/70B | llama3 |
 | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
+| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/13B | llava_next |
+| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/13B | llava_next_video |
 | [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 |
 | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
 | [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
@@ -19,3 +19,4 @@ fire
 packaging
 pyyaml
 numpy<2.0.0
+av
setup.py (1 change)

@@ -61,6 +61,7 @@ extra_require = {
     "qwen": ["transformers_stream_generator"],
     "modelscope": ["modelscope"],
     "dev": ["ruff", "pytest"],
+    "av": ["av>=13.0.0"],
 }

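The `av` requirement added above is PyAV, presumably pulled in so that video inputs can be decoded into frames for the new video plugins. A minimal sketch of that kind of usage (the file name is illustrative and the exact decode path is not shown in this commit):

```python
# Sketch only: decode a clip into PIL frames with PyAV (the "av" package added above).
import av

container = av.open("clip.mp4")  # illustrative path, not from the commit
frames = [frame.to_image() for frame in container.decode(video=0)]
container.close()
print(f"decoded {len(frames)} frames")
```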
@@ -209,6 +209,50 @@ class BasePlugin:
         return {}


+class Idefics2Plugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: Sequence[Dict[str, str]],
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        processor: Optional["ProcessorMixin"],
+    ) -> List[Dict[str, str]]:
+        self._validate_input(images, videos)
+        num_image_tokens = 0
+        messages = deepcopy(messages)
+        fake_image_token = processor.fake_image_token.content
+        image_str = f"{fake_image_token}{self.image_token * processor.image_seq_len}{fake_image_token}"
+        image_str = image_str * 5
+
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                num_image_tokens += 1
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+            content = content.replace("{{image}}", image_str)
+            content = content.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}")
+            message["content"] = content
+
+        if len(images) != num_image_tokens:
+            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        imglens: Sequence[int],
+        vidlens: Sequence[int],
+        seqlens: Sequence[int],
+        processor: Optional["ProcessorMixin"],
+    ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
+        self._validate_input(images, videos)
+        return _get_mm_inputs(images, videos, processor)
+
+
 class LlavaPlugin(BasePlugin):
     @override
     def process_messages(

@@ -249,6 +293,92 @@ class LlavaPlugin:
         return _get_mm_inputs(images, videos, processor)


+class LlavaNextPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: Sequence[Dict[str, str]],
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        processor: Optional["ProcessorMixin"],
+    ) -> List[Dict[str, str]]:
+        self._validate_input(images, videos)
+        num_image_tokens = 0
+        messages = deepcopy(messages)
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                num_image_tokens += 1
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+
+        if len(images) != num_image_tokens:
+            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        imglens: Sequence[int],
+        vidlens: Sequence[int],
+        seqlens: Sequence[int],
+        processor: Optional["ProcessorMixin"],
+    ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
+        self._validate_input(images, videos)
+        return _get_mm_inputs(images, videos, processor)
+
+
+class LlavaNextVideoPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: Sequence[Dict[str, str]],
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        processor: Optional["ProcessorMixin"],
+    ) -> List[Dict[str, str]]:
+        self._validate_input(images, videos)
+        num_image_tokens = 0
+        num_video_tokens = 0
+        messages = deepcopy(messages)
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                num_image_tokens += 1
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+            while VIDEO_PLACEHOLDER in content:
+                num_video_tokens += 1
+                content = content.replace(VIDEO_PLACEHOLDER, "{{video}}", 1)
+
+        if len(images) != num_image_tokens:
+            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        if len(videos) != num_video_tokens:
+            raise ValueError("The number of videos does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        imglens: Sequence[int],
+        vidlens: Sequence[int],
+        seqlens: Sequence[int],
+        processor: Optional["ProcessorMixin"],
+    ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
+        self._validate_input(images, videos)
+        video_processor = getattr(processor, "video_processor")
+        res = _get_mm_inputs(images, [], processor)
+        if len(videos) != 0:
+            videos = _regularize_videos(videos, processor)
+            video_res = video_processor(videos, return_tensors="pt")
+            res.update(video_res)
+        return res
+
+
 class PaliGemmaPlugin(BasePlugin):
     @override
     def process_messages(

@@ -380,11 +510,59 @@ class Qwen2vlPlugin(BasePlugin):
         return _get_mm_inputs(images, videos, processor)


+class VideoLlavaPlugin(BasePlugin):
+    @override
+    def process_messages(
+        self,
+        messages: Sequence[Dict[str, str]],
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        processor: Optional["ProcessorMixin"],
+    ) -> List[Dict[str, str]]:
+        self._validate_input(images, videos)
+        num_image_tokens = 0
+        num_video_tokens = 0
+        messages = deepcopy(messages)
+        for message in messages:
+            content = message["content"]
+            while IMAGE_PLACEHOLDER in content:
+                num_image_tokens += 1
+                content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+            while VIDEO_PLACEHOLDER in content:
+                num_video_tokens += 1
+                content = content.replace(VIDEO_PLACEHOLDER, "{{video}}", 1)
+
+        if len(images) != num_image_tokens:
+            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        if len(videos) != num_video_tokens:
+            raise ValueError("The number of videos does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+
+        return messages
+
+    @override
+    def get_mm_inputs(
+        self,
+        images: Sequence["ImageInput"],
+        videos: Sequence["VideoInput"],
+        imglens: Sequence[int],
+        vidlens: Sequence[int],
+        seqlens: Sequence[int],
+        processor: Optional["ProcessorMixin"],
+    ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
+        self._validate_input(images, videos)
+        return _get_mm_inputs(images, videos, processor)
+
+
 PLUGINS = {
     "base": BasePlugin,
+    "idefics2": Idefics2Plugin,
     "llava": LlavaPlugin,
+    "llava_next": LlavaNextPlugin,
+    "llava_next_video": LlavaNextVideoPlugin,
     "paligemma": PaliGemmaPlugin,
     "qwen2_vl": Qwen2vlPlugin,
+    "video_llava": VideoLlavaPlugin,
 }
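For orientation, a minimal usage sketch (not part of this commit) of the new registry entries. It assumes the module path `llamafactory.data.mm_plugin`, that the image/video placeholders are the literal `<image>`/`<video>` strings, and that `process_messages` tolerates `processor=None` for this plugin, since its body above only counts and rewrites placeholders:

```python
# Hedged usage sketch for the newly registered llava_next_video plugin.
from llamafactory.data.mm_plugin import get_mm_plugin  # assumed module path

plugin = get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>")

messages = [{"role": "user", "content": "<video>Describe what happens in the clip."}]

# One video must be supplied per <video> placeholder, otherwise process_messages raises ValueError.
processed = plugin.process_messages(messages, images=[], videos=["clip.mp4"], processor=None)
print(processed)
```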
@@ -680,6 +680,16 @@ _register_template(
 )


+_register_template(
+    name="idefics2",
+    format_user=StringFormatter(slots=["User:{{content}}<end_of_utterance>\nAssistant:"]),
+    format_separator=EmptyFormatter(slots=["\n"]),
+    stop_words=["<end_of_utterance>"],
+    replace_eos=True,
+    mm_plugin=get_mm_plugin(name="idefics2", image_token="<image>"),
+)
+
+
 _register_template(
     name="intern",
     format_user=StringFormatter(slots=["<|User|>:{{content}}\n<|Bot|>:"]),

@@ -753,6 +763,28 @@ _register_template(
 )


+_register_template(
+    name="llava_next",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next", image_token="<image>"),
+)
+
+
+_register_template(
+    name="llava_next_video",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>"),
+)
+
+
 _register_template(
     name="mistral",
     format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]),

@@ -897,6 +929,17 @@ _register_template(
 )


+_register_template(
+    name="video_llava",
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+    mm_plugin=get_mm_plugin(name="video_llava", image_token="<image>", video_token="<video>"),
+)
+
+
 _register_template(
     name="xuanyuan",
     format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
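As a rough illustration of the chat format these new templates encode (a sketch based only on the `format_user` and `default_system` slots above; the exact separators and system-prompt joining come from the base template machinery, which is not shown in this diff):

```python
# Approximate single-turn prompt for the "llava_next_video" template registered above.
system = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions."
)
user_turn = "USER: <video>What is shown in this video? ASSISTANT:"
prompt = system + " " + user_turn  # the joining of system and user turns is assumed here
print(prompt)
```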
@@ -583,6 +583,23 @@ register_model_group(
 )


+register_model_group(
+    models={
+        "Idefics2-Base": {
+            DownloadSource.DEFAULT: "HuggingFaceM4/idefics2-8b-base",
+        },
+        "Idefics2-Chat": {
+            DownloadSource.DEFAULT: "HuggingFaceM4/idefics2-8b",
+        },
+        "Idefics2-Chatty": {
+            DownloadSource.DEFAULT: "HuggingFaceM4/idefics2-8b-chatty",
+        },
+    },
+    template="idefics2",
+    vision=True,
+)
+
+
 register_model_group(
     models={
         "InternLM-7B": {

@@ -812,6 +829,49 @@ register_model_group(
 )


+register_model_group(
+    models={
+        "LLaVA-NeXT-7B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-vicuna-7b-hf",
+        },
+        "LLaVA-NeXT-13B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-vicuna-13b-hf",
+        },
+        "LLaVA-NeXT-34B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-34b-hf",
+        },
+        "LLaVA-NeXT-Mistral-7B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-v1.6-mistral-7b-hf",
+        },
+    },
+    template="llava_next",
+    vision=True,
+)
+
+
+register_model_group(
+    models={
+        "LLaVA-NeXT-Video-7B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-7B-hf",
+        },
+        "LLaVA-NeXT-Video-34B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-34B-hf",
+        },
+        "LLaVA-NeXT-Video-7B-32k-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-7B-32K-hf",
+        },
+        "LLaVA-NeXT-Video-7B-DPO": {
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-7B-DPO-hf",
+        },
+        "LLaVA-NeXT-Video-34B-DPO": {
+            DownloadSource.DEFAULT: "llava-hf/LLaVA-NeXT-Video-34B-DPO-hf",
+        },
+    },
+    template="llava_next_video",
+    vision=True,
+)
+
+
 register_model_group(
     models={
         "MiniCPM-2B-SFT-Chat": {

@@ -1475,6 +1535,17 @@ register_model_group(
 )


+register_model_group(
+    models={
+        "Video-LLaVA-7B-Chat": {
+            DownloadSource.DEFAULT: "LanguageBind/Video-LLaVA-7B-hf",
+        },
+    },
+    template="video_llava",
+    vision=True,
+)
+
+
 register_model_group(
     models={
         "XuanYuan-6B": {
|
|||||||
setattr(processor, "video_factor", 2)
|
setattr(processor, "video_factor", 2)
|
||||||
else:
|
else:
|
||||||
setattr(processor, "video_factor", 1)
|
setattr(processor, "video_factor", 1)
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
processor = None
|
processor = None
|
||||||
|
|
||||||
# Avoid load tokenizer, see:
|
# Avoid load tokenizer, see:
|
||||||
@ -123,6 +124,12 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
|
|||||||
Loads model config.
|
Loads model config.
|
||||||
"""
|
"""
|
||||||
init_kwargs = _get_init_kwargs(model_args)
|
init_kwargs = _get_init_kwargs(model_args)
|
||||||
|
if "LLaVA-NeXT-Video" in model_args.model_name_or_path:
|
||||||
|
from transformers import PretrainedConfig, LlavaNextVideoConfig, CLIPVisionConfig, LlamaConfig
|
||||||
|
official_config = PretrainedConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
|
||||||
|
config = LlavaNextVideoConfig(CLIPVisionConfig(**official_config.vision_config), LlamaConfig(**official_config.text_config))
|
||||||
|
setattr(config, "visual_inputs", True)
|
||||||
|
return config
|
||||||
return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
|
return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
|
||||||
|
|
||||||
|
|
||||||
@ -159,6 +166,9 @@ def load_model(
|
|||||||
load_class = AutoModelForVision2Seq
|
load_class = AutoModelForVision2Seq
|
||||||
else:
|
else:
|
||||||
load_class = AutoModelForCausalLM
|
load_class = AutoModelForCausalLM
|
||||||
|
if "llava_next_video" == getattr(config, "model_type"):
|
||||||
|
from transformers import LlavaNextVideoForConditionalGeneration
|
||||||
|
load_class = LlavaNextVideoForConditionalGeneration
|
||||||
|
|
||||||
if model_args.train_from_scratch:
|
if model_args.train_from_scratch:
|
||||||
model = load_class.from_config(config)
|
model = load_class.from_config(config)
|
||||||
|
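The class that the loader now switches to is the standard transformers entry point for this architecture. A stand-alone sketch of loading it directly (checkpoint name taken from the model registry added above; dtype handling is illustrative, not from this commit):

```python
# Sketch only: load a LLaVA-NeXT-Video checkpoint with transformers, mirroring the
# load_class override above. Requires a transformers release that ships this class.
import torch
from transformers import AutoProcessor, LlavaNextVideoForConditionalGeneration

model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaNextVideoForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
model.eval()
```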
@@ -34,7 +34,7 @@ def find_all_linear_modules(model: "PreTrainedModel", freeze_vision_tower: bool)
         forbidden_modules.add("output_layer")
     elif model_type == "internlm2":
         forbidden_modules.add("output")
-    elif model_type in ["llava", "paligemma"]:
+    elif model_type in ["idefics2", "llava", "llava_next", "llava_next_video", "paligemma", "video_llava"]:
         forbidden_modules.add("multi_modal_projector")
     elif model_type == "qwen2_vl":
         forbidden_modules.add("merger")
@@ -108,7 +108,7 @@ def configure_visual_model(config: "PretrainedConfig") -> None:
     Patches VLMs before loading them.
     """
     model_type = getattr(config, "model_type", None)
-    if model_type == "llava":  # required for ds zero3 and valuehead models
+    if model_type in ["llava", "llava_next", "video_llava", "idefics2", "llava_next_video"]:  # required for ds zero3 and valuehead models
         setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))

     if getattr(config, "is_yi_vl_derived_model", None):

@@ -150,7 +150,7 @@ def get_image_seqlen(config: "PretrainedConfig") -> int:
         image_seqlen += 1
     elif model_type == "paligemma":
         image_seqlen = config.vision_config.num_image_tokens
-    elif model_type == "qwen2_vl":  # variable length
+    else:
         image_seqlen = -1

     return image_seqlen
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import os
 from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple

@@ -136,6 +136,47 @@ def test_llava_plugin():
     _check_plugin(**check_inputs)


+def test_idefics2_plugin():
+    tokenizer, processor = _load_tokenizer_module(model_name_or_path="HuggingFaceM4/idefics2-8b")
+    idefics2_plugin = get_mm_plugin(name="idefics2", image_token="<image>")
+    check_inputs = {"plugin": idefics2_plugin, "tokenizer": tokenizer, "processor": processor}
+    mm_messages = copy.deepcopy(MM_MESSAGES)
+    fake_image_token = processor.fake_image_token.content
+    image_str = f"{fake_image_token}{'<image>' * processor.image_seq_len}{fake_image_token}"
+    image_str = image_str * 5
+    for message in mm_messages:
+        content = message["content"]
+        content = content.replace("<image>", image_str)
+        content = content.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}")
+        message["content"] = content
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor)
+    _check_plugin(**check_inputs)
+
+
+def test_llava_next_plugin():
+    tokenizer, processor = _load_tokenizer_module(model_name_or_path="llava-hf/llava-v1.6-vicuna-7b-hf")
+    llava_next_plugin = get_mm_plugin(name="llava_next", image_token="<image>")
+    check_inputs = {"plugin": llava_next_plugin, "tokenizer": tokenizer, "processor": processor}
+    check_inputs["expected_mm_messages"] = [
+        {key: value for key, value in message.items()}
+        for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor)
+    _check_plugin(**check_inputs)
+
+
+def test_llava_next_video_plugin():
+    tokenizer, processor = _load_tokenizer_module(model_name_or_path="llava-hf/LLaVA-NeXT-Video-7B-hf")
+    llava_next_video_plugin = get_mm_plugin(name="llava_next_video", image_token="<image>", video_token="<video>")
+    check_inputs = {"plugin": llava_next_video_plugin, "tokenizer": tokenizer, "processor": processor}
+    check_inputs["expected_mm_messages"] = [
+        {key: value for key, value in message.items()}
+        for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor)
+    _check_plugin(**check_inputs)
+
+
 @pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
 def test_paligemma_plugin():
     tokenizer, processor = _load_tokenizer_module(model_name_or_path="google/paligemma-3b-pt-224")

@@ -167,3 +208,15 @@ def test_qwen2_vl_plugin():
     ]
     check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor)
     _check_plugin(**check_inputs)
+
+
+def test_video_llava_plugin():
+    tokenizer, processor = _load_tokenizer_module(model_name_or_path="LanguageBind/Video-LLaVA-7B-hf")
+    video_llava_plugin = get_mm_plugin(name="video_llava", image_token="<image>", video_token="<video>")
+    check_inputs = {"plugin": video_llava_plugin, "tokenizer": tokenizer, "processor": processor}
+    check_inputs["expected_mm_messages"] = [
+        {key: value for key, value in message.items()}
+        for message in MM_MESSAGES
+    ]
+    check_inputs["expected_mm_inputs"] = _get_mm_inputs(processor)
+    _check_plugin(**check_inputs)