[model] support audio (#6701)

* support qwen2_audio

* improve code

* lint

* fix

* fix

* fix

---------

Co-authored-by: hiyouga <hiyouga@buaa.edu.cn>
Former-commit-id: 5eacb5629e4d7733cd992a63747a1335f2c6a929
This commit is contained in:
Zhangchi Feng
2025-02-05 04:59:09 +08:00
committed by GitHub
parent 9feb78e7b4
commit 8f401e37f8
35 changed files with 675 additions and 213 deletions

View File

@@ -64,10 +64,13 @@ def create_chat_box(
with gr.Column() as mm_box:
with gr.Tab("Image"):
-image = gr.Image(sources=["upload"], type="pil")
+image = gr.Image(type="pil")
with gr.Tab("Video"):
-video = gr.Video(sources=["upload"])
+video = gr.Video()
+with gr.Tab("Audio"):
+audio = gr.Audio(type="filepath")
query = gr.Textbox(show_label=False, lines=8)
submit_btn = gr.Button(variant="primary")
@@ -86,7 +89,7 @@ def create_chat_box(
[chatbot, messages, query],
).then(
engine.chatter.stream,
-[chatbot, messages, lang, system, tools, image, video, max_new_tokens, top_p, temperature],
+[chatbot, messages, lang, system, tools, image, video, audio, max_new_tokens, top_p, temperature],
[chatbot, messages],
)
clear_btn.click(lambda: ([], []), outputs=[chatbot, messages])
@@ -102,6 +105,7 @@ def create_chat_box(
mm_box=mm_box,
image=image,
video=video,
+audio=audio,
query=query,
submit_btn=submit_btn,
max_new_tokens=max_new_tokens,

View File

@@ -15,7 +15,7 @@
from typing import TYPE_CHECKING, Dict
from ...extras.packages import is_gradio_available
-from ..common import get_visual
+from ..common import is_multimodal
from .chatbot import create_chat_box
@@ -66,7 +66,7 @@ def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]:
).then(lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_elems["chat_box"]])
engine.manager.get_elem_by_id("top.model_name").change(
-lambda model_name: gr.Column(visible=get_visual(model_name)),
+lambda model_name: gr.Column(visible=is_multimodal(model_name)),
[engine.manager.get_elem_by_id("top.model_name")],
[chat_elems["mm_box"]],
)