From 66213043acbff3c68257798dce00154b911b776d Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Sun, 29 Sep 2024 20:38:46 +0800 Subject: [PATCH] tiny fix Former-commit-id: 7397827aec55eecad1e70878453387fda0db62b5 --- src/llamafactory/data/mm_plugin.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 29c81f56..f38031ca 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -344,9 +344,9 @@ class LlavaNextVideoPlugin(BasePlugin): message["content"] = content.replace("{{image}}", self.image_token) if "pixel_values_videos" in mm_inputs: - one_video = to_numpy_array(mm_inputs.get("pixel_values_videos")[0]) - height, width = get_image_size(one_video[0]) - num_frames = one_video.shape[0] # frame dim is always after batch dim + pixel_values_video = to_numpy_array(mm_inputs.get("pixel_values_videos")[0]) + height, width = get_image_size(pixel_values_video[0]) + num_frames = pixel_values_video.shape[0] # frame dim is always after batch dim image_seqlen = (height // processor.patch_size) * (width // processor.patch_size) video_seqlen = image_seqlen // 4 * num_frames # divide by 4 needed for avg pooling layer @@ -378,9 +378,9 @@ class LlavaNextVideoPlugin(BasePlugin): if len(videos) != 0: videos = self._regularize_videos( videos, - image_resolution=getattr(processor, "image_resolution", 168), - video_fps=getattr(processor, "video_fps", 1.0), - video_maxlen=getattr(processor, "video_maxlen", 16), + image_resolution=getattr(processor, "image_resolution"), + video_fps=getattr(processor, "video_fps"), + video_maxlen=getattr(processor, "video_maxlen"), ) video_res = video_processor(videos, return_tensors="pt") res.update(video_res) @@ -576,9 +576,9 @@ class VideoLlavaPlugin(BasePlugin): height, width = get_image_size(to_numpy_array(mm_inputs.get("pixel_values_images")[0])) num_frames = 1 if exist_videos: - one_video = to_numpy_array(mm_inputs.get("pixel_values_videos")[0]) - height, width = get_image_size(one_video[0]) - num_frames = one_video.shape[0] # frame dim is always after batch dim + pixel_values_video = to_numpy_array(mm_inputs.get("pixel_values_videos")[0]) + height, width = get_image_size(pixel_values_video[0]) + num_frames = pixel_values_video.shape[0] # frame dim is always after batch dim image_seqlen = (height // processor.patch_size) * (width // processor.patch_size) + 1 video_seqlen = image_seqlen * num_frames if processor.vision_feature_select_strategy == "default":