refactor mm training

2025-12-16 20:00:36 +08:00 · 2024-08-30 02:14:31 +08:00
parent 727e184840
commit 3382317e32
32 changed files with 505 additions and 472 deletions
--- a/src/llamafactory/data/processors/unsupervised.py
+++ b/src/llamafactory/data/processors/unsupervised.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from collections import defaultdict
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple

 from ...extras.logging import get_logger
 from ..data_utils import Role
-from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, infer_seqlen
+from .processor_utils import infer_seqlen


 if TYPE_CHECKING:
@@ -39,9 +40,6 @@ def _encode_unsupervised_example(
    processor: Optional["ProcessorMixin"],
    cutoff_len: int,
 ) -> Tuple[List[int], List[int]]:
-    if processor is not None and not hasattr(processor, "image_seq_length"):  # llava-like models
-        prompt[0]["content"] = template.image_token + prompt[0]["content"]
-
    if len(response) == 1:
        messages = prompt + response
    else:
@@ -51,10 +49,7 @@ def _encode_unsupervised_example(
    if template.efficient_eos:
        labels += [tokenizer.eos_token_id]

-    if processor is not None and hasattr(processor, "image_seq_length"):  # paligemma models
-        image_token_id = tokenizer.convert_tokens_to_ids(template.image_token)
-        input_ids = [image_token_id] * getattr(processor, "image_seq_length") + input_ids
-
+    input_ids, _ = template.mm_plugin.process_token_ids(input_ids, None, tokenizer, processor)
    source_len, target_len = infer_seqlen(len(input_ids), len(labels), cutoff_len)
    input_ids = input_ids[:source_len]
    labels = labels[:target_len]
@@ -69,19 +64,15 @@ def preprocess_unsupervised_dataset(
    data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
    # build inputs with format `<bos> X` and labels with format `Y <eos>`
-    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
-    if processor is not None:
-        model_inputs["pixel_values"] = []
-        if hasattr(processor, "image_seq_length"):  # paligemma models
-            model_inputs["token_type_ids"] = []
-
+    model_inputs = defaultdict(list)
    for i in range(len(examples["prompt"])):
        if len(examples["prompt"][i]) % 2 != 1:
            logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
            continue

+        prompt = template.mm_plugin.process_messages(examples["prompt"][i], examples["images"][i], processor)
        input_ids, labels = _encode_unsupervised_example(
-            prompt=examples["prompt"][i],
+            prompt=prompt,
            response=examples["response"][i],
            system=examples["system"][i],
            tools=examples["tools"][i],
@@ -93,10 +84,12 @@ def preprocess_unsupervised_dataset(
        model_inputs["input_ids"].append(input_ids)
        model_inputs["attention_mask"].append([1] * len(input_ids))
        model_inputs["labels"].append(labels)
-        if processor is not None:
-            model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor))
-            if hasattr(processor, "image_seq_length"):  # paligemma models
-                model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor))
+        template.mm_plugin.process_model_inputs(
+            model_inputs=model_inputs,
+            images=examples["images"][i],
+            feature_seqlens={"token_type_ids": len(input_ids)},
+            processor=processor,
+        )

    return model_inputs