diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index fca092cb8..430fa885b 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -161,7 +161,9 @@ class MMPluginMixin: video_processor: BaseImageProcessor = getattr( processor, "video_processor", getattr(processor, "image_processor", None) ) - feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None) + feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None) or getattr( + processor, "audio_processor", None + ) if len(images) != 0 and self.image_token is None: raise ValueError( "This model does not support image input. Please check whether the correct `template` is used." @@ -390,7 +392,9 @@ class MMPluginMixin: mm_inputs.update(video_processor(videos, return_tensors="pt")) if len(audios) != 0: - feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None) + feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None) or getattr( + processor, "audio_processor", None + ) audios = self._regularize_audios( audios, sampling_rate=getattr(processor, "audio_sampling_rate", 16000), @@ -1876,7 +1880,9 @@ class Qwen2OmniPlugin(Qwen2VLPlugin): ) -> dict[str, "torch.Tensor"]: image_processor: BaseImageProcessor = getattr(processor, "image_processor", None) video_processor: BaseVideoProcessor = getattr(processor, "video_processor", None) - feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None) + feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None) or getattr( + processor, "audio_processor", None + ) mm_inputs = {} if len(images) != 0: images = self._regularize_images(