merge data part to the text stream

This commit is contained in:
BUAADreamer
2024-04-25 19:19:59 +08:00
parent 838eb87a96
commit c6dd89918f
15 changed files with 828 additions and 293 deletions

View File

@@ -19,7 +19,9 @@ class DataCollatorForVis2Seq:
texts.append(text)
images.append(example["images"][0])
batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)
batch = self.processor(
text=texts, images=images, return_tensors="pt", padding=True
)
labels = batch["input_ids"].clone()
if self.processor.tokenizer.pad_token_id is not None:
@@ -27,3 +29,14 @@ class DataCollatorForVis2Seq:
batch["labels"] = labels
return batch
@dataclass
class DataCollatorForMLLM:
    """Placeholder collator: inspects the first example and returns an empty batch.

    NOTE(review): this looks like an unfinished stub -- the only work done is
    printing debug information; no tensors are built and ``processor`` is not
    used by ``__call__`` yet.
    """

    # Processor handle stored on the instance; not read by ``__call__`` as written.
    processor: AutoProcessor

    def __call__(self, examples):
        """Print debug info about the first example and return an empty dict.

        Args:
            examples: Indexable collection; its first item must provide
                ``.keys()`` and support lookup of the ``"input_ids"`` key.

        Returns:
            A new, empty ``dict``.
        """
        first_example = examples[0]
        # Debug output only: show the available fields and the token ids.
        print(first_example.keys())
        print(first_example["input_ids"])
        return {}