[model] Update ernie_vl to adapt new version (#9665)

2026-03-03 18:25:59 +08:00 · 2025-12-26 19:57:49 +08:00
parent a882e2d5fc
commit 3c17f2722c
5 changed files with 24 additions and 16 deletions
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -480,21 +480,35 @@ class ErnieVLPlugin(BasePlugin):
        self._validate_input(processor, images, videos, audios)
        self._validate_messages(messages, images, videos, audios)
        messages = deepcopy(messages)
+
+        image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+
+        merge_length: int = getattr(image_processor, "merge_size") ** 2
+        if self.expand_mm_tokens:
+            mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+            image_grid_thw = mm_inputs.get("image_grid_thw", [])
+            video_grid_thw = mm_inputs.get("video_grid_thw", [])
+        else:
+            image_grid_thw = [None] * len(images)
+            video_grid_thw = [None] * len(videos)
+
        image_idx, video_idx = 0, 0
        for message in messages:
            content = message["content"]
-            image_token = self.image_token or "<|image@placeholder|>"
-            video_token = self.video_token or "<|video@placeholder|>"
+            image_token = self.image_token or "<|IMAGE_PLACEHOLDER|>"
+            video_token = self.video_token or "<|VIDEO_PLACEHOLDER|>"
            while IMAGE_PLACEHOLDER in content:
+                image_seqlen = image_grid_thw[image_idx].prod() // merge_length if self.expand_mm_tokens else 1
+                content = content.replace(
+                    IMAGE_PLACEHOLDER, f"Picture {image_idx + 1}:<|IMAGE_START|>{image_token * image_seqlen}<|IMAGE_END|>", 1
+                )
                image_idx += 1
-                content = content.replace(
-                    IMAGE_PLACEHOLDER, f"Picture {image_idx}:<|IMAGE_START|>{image_token}<|IMAGE_END|>", 1
-                )
            while VIDEO_PLACEHOLDER in content:
-                video_idx += 1
+                video_seqlen = video_grid_thw[video_idx].prod() // merge_length if self.expand_mm_tokens else 1
                content = content.replace(
-                    VIDEO_PLACEHOLDER, f"Video {video_idx}:<|VIDEO_START|>{video_token}<|VIDEO_END|>", 1
+                    VIDEO_PLACEHOLDER, f"Video {video_idx + 1}:<|VIDEO_START|>{video_token * video_seqlen}<|VIDEO_END|>", 1
                )
+                video_idx += 1
            message["content"] = content
        return messages