[infer] vllm video/audio inference (#7566)

This commit is contained in:
hoshi-hiyouga
2025-04-02 02:27:04 +08:00
committed by GitHub
parent 2bfcad2394
commit 5e22597ff1
10 changed files with 329 additions and 285 deletions

View File

@@ -70,14 +70,17 @@ class FunctionCall(BaseModel):
     function: Function
-class ImageURL(BaseModel):
+class URL(BaseModel):
     url: str
     detail: Literal["auto", "low", "high"] = "auto"
 class MultimodalInputItem(BaseModel):
-    type: Literal["text", "image_url"]
+    type: Literal["text", "image_url", "video_url", "audio_url"]
     text: Optional[str] = None
-    image_url: Optional[ImageURL] = None
+    image_url: Optional[URL] = None
+    video_url: Optional[URL] = None
+    audio_url: Optional[URL] = None
 class ChatMessage(BaseModel):