mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-12-27 09:10:35 +08:00
fix
This commit is contained in:
@@ -40,6 +40,7 @@ dependencies = [
|
|||||||
# core deps
|
# core deps
|
||||||
"torch>=2.4.0",
|
"torch>=2.4.0",
|
||||||
"torchvision>=0.19.0",
|
"torchvision>=0.19.0",
|
||||||
|
"torchaudio>=2.4.0",
|
||||||
"transformers>=4.49.0,<=4.56.2,!=4.52.0; python_version < '3.10'",
|
"transformers>=4.49.0,<=4.56.2,!=4.52.0; python_version < '3.10'",
|
||||||
"transformers>=4.49.0,<=4.57.1,!=4.52.0,!=4.57.0; python_version >= '3.10'",
|
"transformers>=4.49.0,<=4.57.1,!=4.52.0,!=4.57.0; python_version >= '3.10'",
|
||||||
"datasets>=2.16.0,<=4.0.0",
|
"datasets>=2.16.0,<=4.0.0",
|
||||||
@@ -63,6 +64,7 @@ dependencies = [
|
|||||||
"hf-transfer",
|
"hf-transfer",
|
||||||
"safetensors",
|
"safetensors",
|
||||||
# python
|
# python
|
||||||
|
"av",
|
||||||
"fire",
|
"fire",
|
||||||
"omegaconf",
|
"omegaconf",
|
||||||
"packaging",
|
"packaging",
|
||||||
@@ -72,10 +74,7 @@ dependencies = [
|
|||||||
# api
|
# api
|
||||||
"uvicorn",
|
"uvicorn",
|
||||||
"fastapi",
|
"fastapi",
|
||||||
"sse-starlette",
|
"sse-starlette"
|
||||||
# media
|
|
||||||
"av",
|
|
||||||
"librosa"
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ from typing import TYPE_CHECKING, BinaryIO, Literal, Optional, TypedDict, Union
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
import torchaudio
|
||||||
from transformers.image_utils import get_image_size, is_valid_image, to_numpy_array
|
from transformers.image_utils import get_image_size, is_valid_image, to_numpy_array
|
||||||
from transformers.models.mllama.processing_mllama import (
|
from transformers.models.mllama.processing_mllama import (
|
||||||
convert_sparse_cross_attention_mask_to_dense,
|
convert_sparse_cross_attention_mask_to_dense,
|
||||||
@@ -34,16 +35,7 @@ from transformers.models.mllama.processing_mllama import (
|
|||||||
from typing_extensions import NotRequired, override
|
from typing_extensions import NotRequired, override
|
||||||
|
|
||||||
from ..extras.constants import AUDIO_PLACEHOLDER, IGNORE_INDEX, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER
|
from ..extras.constants import AUDIO_PLACEHOLDER, IGNORE_INDEX, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER
|
||||||
from ..extras.packages import (
|
from ..extras.packages import is_pillow_available, is_pyav_available, is_transformers_version_greater_than
|
||||||
is_librosa_available,
|
|
||||||
is_pillow_available,
|
|
||||||
is_pyav_available,
|
|
||||||
is_transformers_version_greater_than,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if is_librosa_available():
|
|
||||||
import librosa
|
|
||||||
|
|
||||||
|
|
||||||
if is_pillow_available():
|
if is_pillow_available():
|
||||||
@@ -316,7 +308,14 @@ class MMPluginMixin:
|
|||||||
results, sampling_rates = [], []
|
results, sampling_rates = [], []
|
||||||
for audio in audios:
|
for audio in audios:
|
||||||
if not isinstance(audio, np.ndarray):
|
if not isinstance(audio, np.ndarray):
|
||||||
audio, sampling_rate = librosa.load(audio, sr=sampling_rate)
|
audio, sr = torchaudio.load(audio)
|
||||||
|
if audio.shape[0] > 1:
|
||||||
|
audio = audio.mean(dim=0, keepdim=True)
|
||||||
|
|
||||||
|
if sr != sampling_rate:
|
||||||
|
audio = torchaudio.functional.resample(audio, sr, sampling_rate)
|
||||||
|
|
||||||
|
audio = audio.squeeze(0).numpy()
|
||||||
|
|
||||||
results.append(audio)
|
results.append(audio)
|
||||||
sampling_rates.append(sampling_rate)
|
sampling_rates.append(sampling_rate)
|
||||||
|
|||||||
Reference in New Issue
Block a user