[model] support audio (#6701)

* support qwen2_audio
* improve code
* lint
* fix
* fix
* fix

---------

Co-authored-by: hiyouga <hiyouga@buaa.edu.cn>
Former-commit-id: 5eacb5629e4d7733cd992a63747a1335f2c6a929
2026-05-31 04:18:56 +08:00 · 2025-02-05 04:59:09 +08:00
parent 9feb78e7b4
commit 8f401e37f8
35 changed files with 675 additions and 213 deletions
--- a/src/llamafactory/webui/components/chatbot.py
+++ b/src/llamafactory/webui/components/chatbot.py
@@ -64,10 +64,13 @@ def create_chat_box(

                    with gr.Column() as mm_box:
                        with gr.Tab("Image"):
-                            image = gr.Image(sources=["upload"], type="pil")
+                            image = gr.Image(type="pil")

                        with gr.Tab("Video"):
-                            video = gr.Video(sources=["upload"])
+                            video = gr.Video()
+
+                        with gr.Tab("Audio"):
+                            audio = gr.Audio(type="filepath")

                query = gr.Textbox(show_label=False, lines=8)
                submit_btn = gr.Button(variant="primary")
@@ -86,7 +89,7 @@ def create_chat_box(
        [chatbot, messages, query],
    ).then(
        engine.chatter.stream,
-        [chatbot, messages, lang, system, tools, image, video, max_new_tokens, top_p, temperature],
+        [chatbot, messages, lang, system, tools, image, video, audio, max_new_tokens, top_p, temperature],
        [chatbot, messages],
    )
    clear_btn.click(lambda: ([], []), outputs=[chatbot, messages])
@@ -102,6 +105,7 @@ def create_chat_box(
            mm_box=mm_box,
            image=image,
            video=video,
+            audio=audio,
            query=query,
            submit_btn=submit_btn,
            max_new_tokens=max_new_tokens,
--- a/src/llamafactory/webui/components/infer.py
+++ b/src/llamafactory/webui/components/infer.py
@@ -15,7 +15,7 @@
 from typing import TYPE_CHECKING, Dict

 from ...extras.packages import is_gradio_available
-from ..common import get_visual
+from ..common import is_multimodal
 from .chatbot import create_chat_box


@@ -66,7 +66,7 @@ def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]:
    ).then(lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_elems["chat_box"]])

    engine.manager.get_elem_by_id("top.model_name").change(
-        lambda model_name: gr.Column(visible=get_visual(model_name)),
+        lambda model_name: gr.Column(visible=is_multimodal(model_name)),
        [engine.manager.get_elem_by_id("top.model_name")],
        [chat_elems["mm_box"]],
    )