diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py
index 99542d23..cf567d5f 100644
--- a/src/llamafactory/chat/vllm_engine.py
+++ b/src/llamafactory/chat/vllm_engine.py
@@ -113,9 +113,9 @@ class VllmEngine(BaseEngine):
                 messages[0]["content"] = IMAGE_PLACEHOLDER * len(images) + messages[0]["content"]
 
             if self.template.mm_plugin.__class__.__name__ == "Qwen2vlPlugin":  # temporary solution
-                image_str = "<|vision_start|>" + self.template.mm_plugin.image_token + "<|vision_end|>"
+                image_str = f"<|vision_start|>{self.template.mm_plugin.image_token}<|vision_end|>"
             else:
-                image_str = self.template.mm_plugin.image_token
+                image_str = self.template.mm_plugin.image_token or ""
 
             paired_messages = [
                 {"role": message["role"], "content": message["content"].replace(IMAGE_PLACEHOLDER, image_str)}
diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index b04c2673..248cbd38 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -82,7 +82,7 @@ class BasePlugin:
         Pre-processes a single image.
         """
         image_resolution: int = kwargs.get("image_resolution")
-        if image.width * image.height > image_resolution:
+        if (image.width * image.height) > image_resolution:
             resize_factor = math.sqrt(image_resolution / (image.width * image.height))
             width, height = int(image.width * resize_factor), int(image.height * resize_factor)
             image = image.resize((width, height), resample=Image.NEAREST)
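
For context, here is a minimal standalone sketch of the area-capping resize that the `mm_plugin.py` hunk touches; the function name `cap_image_area` and the `512 * 512` default are illustrative, not part of the project's API.

```python
# A sketch of the area-capping resize used in BasePlugin's image preprocessing.
# Assumes Pillow is installed; cap_image_area and its default cap are hypothetical names.
import math

from PIL import Image


def cap_image_area(image: Image.Image, image_resolution: int = 512 * 512) -> Image.Image:
    """Downscale the image so that width * height stays at or below image_resolution."""
    if (image.width * image.height) > image_resolution:
        # sqrt of the area ratio preserves the aspect ratio while bringing
        # the total pixel count down to (at most) the cap
        resize_factor = math.sqrt(image_resolution / (image.width * image.height))
        width = int(image.width * resize_factor)
        height = int(image.height * resize_factor)
        image = image.resize((width, height), resample=Image.NEAREST)

    return image


if __name__ == "__main__":
    img = Image.new("RGB", (2048, 1024))                    # 2,097,152 pixels
    img = cap_image_area(img, image_resolution=512 * 512)   # cap at 262,144 pixels
    print(img.size)                                         # (724, 362) -> 262,088 pixels
```

The other hunk's `or ""` fallback guards against `image_token` being `None`, so the subsequent `message["content"].replace(IMAGE_PLACEHOLDER, image_str)` call never receives a non-string replacement.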