[data] support glm4.1v video training (#8571)

This commit is contained in:
Kingsley 2025-07-08 16:29:44 +08:00 committed by GitHub
parent 6a8d88826e
commit 766884fa5c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1558,11 +1558,7 @@ class GLM4VPlugin(Qwen2VLPlugin):
video_metadata = [
{"fps": 2, "duration": len(video), "total_frames": len(video)} for video in video_data["videos"]
]
mm_inputs.update(
video_processor(
images=None, videos=video_data["videos"], video_metadata=video_metadata, return_tensors="pt"
)
)
mm_inputs.update(video_processor(images=None, videos=video_data["videos"], video_metadata=video_metadata))
return mm_inputs
@@ -1586,8 +1582,9 @@ class GLM4VPlugin(Qwen2VLPlugin):
mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
image_grid_thw = mm_inputs.get("image_grid_thw", [])
video_grid_thw = mm_inputs.get("video_grid_thw", [])
num_frames = len(video_grid_thw)
num_frames = video_grid_thw[0][0] if len(video_grid_thw) > 0 else 0 # hard code for now
timestamps = mm_inputs.get("timestamps", [])
if hasattr(timestamps, "tolist"):
timestamps = timestamps.tolist()
@@ -1618,19 +1615,20 @@ class GLM4VPlugin(Qwen2VLPlugin):
)
num_image_tokens += 1
# TODO: DO NOT SUPPORT VIDEO UNTIL NEXT PR
while VIDEO_PLACEHOLDER in content:
video_structure = ""
for frame_index in range(num_frames):
video_seqlen = video_grid_thw[frame_index].prod() // merge_length if self.expand_mm_tokens else 1
video_seqlen = (
video_grid_thw[num_video_tokens][1:].prod() // merge_length if self.expand_mm_tokens else 1
)
timestamp_sec = selected_timestamps[frame_index]
frame_structure = (
f"<|begin_of_image|>{self.image_token * video_seqlen}<|end_of_image|>{timestamp_sec}"
)
video_structure += frame_structure
content = content.replace(VIDEO_PLACEHOLDER, video_structure, 1)
num_video_tokens += 1 # FIXME: num_video_tokens is not used
content = content.replace(VIDEO_PLACEHOLDER, f"<|begin_of_video|>{video_structure}<|end_of_video|>", 1)
num_video_tokens += 1
message["content"] = content