From 468723c5d9439805aa30bc3cfa9dd2034453b30f Mon Sep 17 00:00:00 2001
From: Kingsley
Date: Fri, 1 May 2026 12:10:13 +0800
Subject: [PATCH] [packing] fix GDN crash when meeting dummy image (#10453)

---
 src/llamafactory/data/collator.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py
index ade718972..af234d99b 100644
--- a/src/llamafactory/data/collator.py
+++ b/src/llamafactory/data/collator.py
@@ -165,8 +165,11 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
                 "video_grid_thw": mm_inputs.get("video_grid_thw"),
                 "attention_mask": (features["attention_mask"] >= 1).float(),
             }
-            if features["attention_mask"].sum() == 0:
-                features["position_ids"] = torch.zeros((3, *features["input_ids"].shape))
+            if features["attention_mask"].sum() == 0:  # for pad tokens
+                seq_len = features["input_ids"].shape[-1]
+                features["position_ids"] = (
+                    torch.arange(seq_len).view(1, 1, seq_len).expand(3, *features["input_ids"].shape).contiguous()
+                )
             features["rope_deltas"] = torch.zeros(features["input_ids"].shape[0])
             return
 
@@ -220,7 +223,13 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
             unpadded_length = int(features["attention_mask"][0].bool().sum().item())
             right_padding_length = int((packing_params_list[0] or {}).get("right_padding_length") or 0)
             fake_input_padding_length = max(0, seq_len - unpadded_length - right_padding_length)
-            dummy_image_right_padding_mrope = torch.zeros((3, bsz, fake_input_padding_length))
+            # avoid consecutive cu_seqlens breaking varlen attention @kuangdd
+            # https://github.com/hiyouga/LlamaFactory/issues/10452
+            dummy_image_right_padding_mrope = (
+                torch.arange(fake_input_padding_length)
+                .view(1, 1, fake_input_padding_length)
+                .expand(3, bsz, fake_input_padding_length)
+            )
             dummy_image_right_padding_attention_mask = torch.zeros((bsz, fake_input_padding_length))
             assert self.tokenizer.padding_side == "right", "padding_side should be right when fake image is injected"
             dummy_mm_inputs = copy.deepcopy(mm_inputs)
@@ -280,6 +289,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
                 )
                 self._compute_rope_position_ids(subseq_features, mm_inputs_for_subseq)
                 sample_position_ids.append(subseq_features["position_ids"])
+            all_position_ids.append(torch.cat(sample_position_ids, dim=-1))
 
         batch_dim_for_position_ids = 1 if all_position_ids[0].dim() == 3 else 0
 
@@ -418,14 +428,14 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
         boundaries_list = [p.get("sequence_boundaries") if p is not None else None for p in packing_params_list]
         has_packing = any(b is not None and len(b) > 2 for b in boundaries_list)
         if has_dummy_image and has_packing:
-            # FIXME: too tricky, need to be refactored
+            # FIXME: too tricky, needs to be refactored @kuangdd
             features["has_dummy_image"] = True
 
         # When fake image/audio was injected, sequence_boundaries no longer match the tensor; use non-packing path.
         if not has_packing:
             self._compute_rope_position_ids(features, mm_inputs)
         else:
-            if is_omni:
+            if is_omni:  # TODO: support omni models for packed sequences @kuangdd
                 raise RuntimeError("Omni models are not supported for packed sequences for now.")
             self._compute_rope_position_ids_with_packing(
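
Why `arange` instead of `zeros`: varlen attention kernels like GDN's recover packed-sequence boundaries (cu_seqlens) from position-id resets, so zero-filled position ids over the dummy-image padding make every pad token look like a fresh one-token sequence, producing the consecutive cu_seqlens that the in-diff comment and issue #10452 describe. Below is a minimal sketch of that failure mode, assuming boundaries are derived from position-id resets; `cu_seqlens_from_position_ids` is a hypothetical stand-in for the kernel-side derivation, not code from LLaMA-Factory or the GDN kernel.

```python
import torch


def cu_seqlens_from_position_ids(position_ids: torch.Tensor) -> torch.Tensor:
    """Hypothetical helper: a new packed sequence is assumed to start wherever
    position_ids does not increase by exactly 1 from its predecessor."""
    starts = torch.ones_like(position_ids, dtype=torch.bool)
    starts[1:] = position_ids[1:] != position_ids[:-1] + 1
    start_idx = torch.nonzero(starts).flatten()
    return torch.cat([start_idx, torch.tensor([position_ids.numel()])]).to(torch.int32)


# packed row: two real sequences of length 4 and 3, then 5 pad tokens
real = torch.cat([torch.arange(4), torch.arange(3)])

# old behavior: zero position ids for padding -> every pad token becomes a
# spurious one-token "sequence", yielding consecutive cu_seqlens entries
bad = torch.cat([real, torch.zeros(5, dtype=torch.long)])
print(cu_seqlens_from_position_ids(bad))   # tensor([ 0,  4,  7,  8,  9, 10, 11, 12], dtype=torch.int32)

# patched behavior: arange position ids keep the padding as one contiguous
# pseudo-sequence, so the boundaries stay well formed
good = torch.cat([real, torch.arange(5)])
print(cu_seqlens_from_position_ids(good))  # tensor([ 0,  4,  7, 12], dtype=torch.int32)
```

The padding still contributes one trailing pseudo-sequence, but it is a single well-formed one that the all-zero attention mask already marks as inactive, which is why the patch pairs the `arange`-based m-RoPE tensor with `dummy_image_right_padding_attention_mask = torch.zeros(...)`.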