mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-12-29 10:10:35 +08:00
[model] Update ernie_vl to adapt new version (#9665)
This commit is contained in:
@@ -480,21 +480,35 @@ class ErnieVLPlugin(BasePlugin):
|
||||
self._validate_input(processor, images, videos, audios)
|
||||
self._validate_messages(messages, images, videos, audios)
|
||||
messages = deepcopy(messages)
|
||||
|
||||
image_processor: BaseImageProcessor = getattr(processor, "image_processor")
|
||||
|
||||
merge_length: int = getattr(image_processor, "merge_size") ** 2
|
||||
if self.expand_mm_tokens:
|
||||
mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
|
||||
image_grid_thw = mm_inputs.get("image_grid_thw", [])
|
||||
video_grid_thw = mm_inputs.get("video_grid_thw", [])
|
||||
else:
|
||||
image_grid_thw = [None] * len(images)
|
||||
video_grid_thw = [None] * len(videos)
|
||||
|
||||
image_idx, video_idx = 0, 0
|
||||
for message in messages:
|
||||
content = message["content"]
|
||||
image_token = self.image_token or "<|image@placeholder|>"
|
||||
video_token = self.video_token or "<|video@placeholder|>"
|
||||
image_token = self.image_token or "<|IMAGE_PLACEHOLDER|>"
|
||||
video_token = self.video_token or "<|VIDEO_PLACEHOLDER|>"
|
||||
while IMAGE_PLACEHOLDER in content:
|
||||
image_seqlen = image_grid_thw[image_idx].prod() // merge_length if self.expand_mm_tokens else 1
|
||||
content = content.replace(
|
||||
IMAGE_PLACEHOLDER, f"Picture {image_idx + 1}:<|IMAGE_START|>{image_token * image_seqlen}<|IMAGE_END|>", 1
|
||||
)
|
||||
image_idx += 1
|
||||
content = content.replace(
|
||||
IMAGE_PLACEHOLDER, f"Picture {image_idx}:<|IMAGE_START|>{image_token}<|IMAGE_END|>", 1
|
||||
)
|
||||
while VIDEO_PLACEHOLDER in content:
|
||||
video_idx += 1
|
||||
video_seqlen = video_grid_thw[video_idx].prod() // merge_length if self.expand_mm_tokens else 1
|
||||
content = content.replace(
|
||||
VIDEO_PLACEHOLDER, f"Video {video_idx}:<|VIDEO_START|>{video_token}<|VIDEO_END|>", 1
|
||||
VIDEO_PLACEHOLDER, f"Video {video_idx + 1}:<|VIDEO_START|>{video_token * video_seqlen}<|VIDEO_END|>", 1
|
||||
)
|
||||
video_idx += 1
|
||||
message["content"] = content
|
||||
return messages
|
||||
|
||||
|
||||
Reference in New Issue
Block a user