From 42b436a4fc81e2b7c806d3184db391bfa1095d64 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Sat, 27 Dec 2025 08:05:25 +0800 Subject: [PATCH] fix --- pyproject.toml | 7 +++---- src/llamafactory/data/mm_plugin.py | 21 ++++++++++----------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6cdd1b289..732a812c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ # core deps "torch>=2.4.0", "torchvision>=0.19.0", + "torchaudio>=2.4.0", "transformers>=4.49.0,<=4.56.2,!=4.52.0; python_version < '3.10'", "transformers>=4.49.0,<=4.57.1,!=4.52.0,!=4.57.0; python_version >= '3.10'", "datasets>=2.16.0,<=4.0.0", @@ -63,6 +64,7 @@ dependencies = [ "hf-transfer", "safetensors", # python + "av", "fire", "omegaconf", "packaging", @@ -72,10 +74,7 @@ dependencies = [ # api "uvicorn", "fastapi", - "sse-starlette", - # media - "av", - "librosa" + "sse-starlette" ] [project.optional-dependencies] diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 44df263ab..05acded2c 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -26,6 +26,7 @@ from typing import TYPE_CHECKING, BinaryIO, Literal, Optional, TypedDict, Union import numpy as np import torch +import torchaudio from transformers.image_utils import get_image_size, is_valid_image, to_numpy_array from transformers.models.mllama.processing_mllama import ( convert_sparse_cross_attention_mask_to_dense, @@ -34,16 +35,7 @@ from transformers.models.mllama.processing_mllama import ( from typing_extensions import NotRequired, override from ..extras.constants import AUDIO_PLACEHOLDER, IGNORE_INDEX, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER -from ..extras.packages import ( - is_librosa_available, - is_pillow_available, - is_pyav_available, - is_transformers_version_greater_than, -) - - -if is_librosa_available(): - import librosa +from ..extras.packages import is_pillow_available, is_pyav_available, is_transformers_version_greater_than if is_pillow_available(): @@ -316,7 +308,14 @@ class MMPluginMixin: results, sampling_rates = [], [] for audio in audios: if not isinstance(audio, np.ndarray): - audio, sampling_rate = librosa.load(audio, sr=sampling_rate) + audio, sr = torchaudio.load(audio) + if audio.shape[0] > 1: + audio = audio.mean(dim=0, keepdim=True) + + if sr != sampling_rate: + audio = torchaudio.functional.resample(audio, sr, sampling_rate) + + audio = audio.squeeze(0).numpy() results.append(audio) sampling_rates.append(sampling_rate)