From 87d6d7dc61f40ce56be1bbfb3169fd70214a6a8c Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 4 Nov 2024 08:18:12 +0000
Subject: [PATCH 1/2] fix inverted media placeholder check in chat engines

Former-commit-id: 3a220b7992d265c77d9a1a406ef86eefbc699cfe
---
 src/llamafactory/chat/hf_engine.py   | 4 ++--
 src/llamafactory/chat/vllm_engine.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py
index 3ac04982..eeed9a29 100644
--- a/src/llamafactory/chat/hf_engine.py
+++ b/src/llamafactory/chat/hf_engine.py
@@ -86,12 +86,12 @@ class HuggingfaceEngine(BaseEngine):
         mm_input_dict = {"images": [], "videos": [], "imglens": [0], "vidlens": [0]}
         if images is not None:
             mm_input_dict.update({"images": images, "imglens": [len(images)]})
-            if not any(IMAGE_PLACEHOLDER not in message["content"] for message in messages):
+            if not any(IMAGE_PLACEHOLDER in message["content"] for message in messages):
                 messages[0]["content"] = IMAGE_PLACEHOLDER * len(images) + messages[0]["content"]
 
         if videos is not None:
             mm_input_dict.update({"videos": videos, "vidlens": [len(videos)]})
-            if not any(VIDEO_PLACEHOLDER not in message["content"] for message in messages):
+            if not any(VIDEO_PLACEHOLDER in message["content"] for message in messages):
                 messages[0]["content"] = VIDEO_PLACEHOLDER * len(videos) + messages[0]["content"]
 
         messages = template.mm_plugin.process_messages(
diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py
index 37feccc2..5f6612be 100644
--- a/src/llamafactory/chat/vllm_engine.py
+++ b/src/llamafactory/chat/vllm_engine.py
@@ -107,7 +107,7 @@ class VllmEngine(BaseEngine):
     ) -> AsyncIterator["RequestOutput"]:
         request_id = f"chatcmpl-{uuid.uuid4().hex}"
         if images is not None:
-            if not any(IMAGE_PLACEHOLDER not in message["content"] for message in messages):
+            if not any(IMAGE_PLACEHOLDER in message["content"] for message in messages):
                 messages[0]["content"] = IMAGE_PLACEHOLDER * len(images) + messages[0]["content"]
 
         paired_messages = messages + [{"role": "assistant", "content": ""}]

From e2fa9613020bcb1b43f82f827d0e9fd60d8008fa Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 4 Nov 2024 08:27:20 +0000
Subject: [PATCH 2/2] add raw bytes as an image input type

Former-commit-id: 6fe260e35ff12662b72f26ec9df44e87b9693551
---
 src/llamafactory/data/mm_plugin.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py
index f6748883..6a174838 100644
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -30,7 +30,7 @@ if TYPE_CHECKING:
         path: Optional[str]
         bytes: Optional[bytes]
 
-    ImageInput = Union[str, EncodedImage, ImageObject]
+    ImageInput = Union[str, bytes, EncodedImage, ImageObject]
     VideoInput = str
 
 
@@ -104,6 +104,8 @@ class BasePlugin:
         for image in images:
             if isinstance(image, str):
                 image = Image.open(image)
+            elif isinstance(image, bytes):
+                image = Image.open(BytesIO(image))
             elif isinstance(image, dict):
                 if image["bytes"] is not None:
                     image = Image.open(BytesIO(image["bytes"]))