diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index be9c78df9..87e0f5791 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -162,8 +162,14 @@ def patch_qwen3_5_forward(model: "PreTrainedModel") -> None: if position_ids is not None and position_ids.ndim == 3: position_ids = position_ids[0] - # `prepare_fa_kwargs_from_position_ids` would crash on None; guard for safety. - cu_seqlens = prepare_fa_kwargs_from_position_ids(position_ids)[0][0] if position_ids is not None else None + # cu_seqlens for the FLA varlen path is only needed when batch_size == 1: + # packing / neat-packing: always folded into a single sequence (bsz == 1) -> varlen + # non-packing, bsz == 1: single segment, equivalent to a standard single sequence + # non-packing, bsz > 1: not packed, use cu_seqlens=None and standard batched kernels + if position_ids is not None and batch_size == 1: + cu_seqlens = prepare_fa_kwargs_from_position_ids(position_ids)[0][0] + else: + cu_seqlens = None # FLA varlen kernels expect [B, T, D] layout, not [B, D, T] like the # standard causal-conv1d path that the upstream forward uses.