diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py
index 53fb666a..87d9c451 100644
--- a/src/llamafactory/chat/hf_engine.py
+++ b/src/llamafactory/chat/hf_engine.py
@@ -165,17 +165,12 @@ class HuggingfaceEngine(BaseEngine):
         )
 
         mm_inputs = template.mm_plugin.get_mm_inputs(**mm_input_dict, seqlens=[prompt_length], processor=processor)
-
         for key, value in mm_inputs.items():
-            value = (
-                value
-                if isinstance(value, torch.Tensor)
-                else (
-                    torch.stack(value)
-                    if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value)
-                    else torch.tensor(value)
-                )
-            )
+            if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value):  # for pixtral inputs
+                value = torch.stack(value)  # assume they have same sizes
+            elif not isinstance(value, torch.Tensor):
+                value = torch.tensor(value)
+
             gen_kwargs[key] = value.to(model.device)
 
         return gen_kwargs, prompt_length