Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-12-16 11:50:35 +08:00)
Commit: lazy image load
@@ -14,7 +14,7 @@
 import os
 import sys
-from typing import TYPE_CHECKING, Dict, Literal, Optional, Sequence, Union
+from typing import TYPE_CHECKING, Dict, Literal, Optional, Sequence, Tuple, Union

 import numpy as np
 from datasets import DatasetDict, load_dataset, load_from_disk
@@ -180,7 +180,13 @@ def _get_preprocessed_dataset(
             desc="Running tokenizer on dataset",
         )

-    dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs)
+    dataset = dataset.map(
+        preprocess_func,
+        batched=True,
+        batch_size=data_args.preprocessing_batch_size,
+        remove_columns=column_names,
+        **kwargs,
+    )

     if training_args.should_log:
         try:
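For context, a minimal self-contained sketch (not taken from this commit) of what an explicit batch_size does in datasets.Dataset.map: with batched=True, the preprocessing function receives up to batch_size examples per call instead of the library default of 1000. The concrete value below is a hypothetical stand-in for data_args.preprocessing_batch_size.

    from datasets import Dataset

    dataset = Dataset.from_dict({"text": [f"example {i}" for i in range(10)]})

    def preprocess_func(batch):
        # With batched=True, `batch` is a dict mapping each column name to a list of values.
        return {"length": [len(t) for t in batch["text"]]}

    preprocessing_batch_size = 4  # hypothetical value; the real one comes from DataArguments
    dataset = dataset.map(
        preprocess_func,
        batched=True,
        batch_size=preprocessing_batch_size,  # caps how many examples each call to preprocess_func sees
        remove_columns=["text"],
    )
    print(dataset.column_names)  # ['length']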
@@ -202,7 +208,7 @@ def get_dataset(
     stage: Literal["pt", "sft", "rm", "ppo", "kto"],
     tokenizer: "PreTrainedTokenizer",
     processor: Optional["ProcessorMixin"] = None,
-) -> "DatasetModule":
+) -> Tuple["DatasetModule", "Template"]:
     template = get_template_and_fix_tokenizer(tokenizer, data_args.template, data_args.tool_format)
     if data_args.train_on_prompt and template.efficient_eos:
         raise ValueError("Current template does not support `train_on_prompt`.")
@@ -273,4 +279,4 @@ def get_dataset(
     if "validation" in dataset_dict:
         dataset_module["eval_dataset"] = dataset_dict["validation"]

-    return dataset_module
+    return dataset_module, template
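Downstream of this change, call sites that previously received only the DatasetModule now unpack a (dataset_module, template) pair and no longer need to rebuild the template themselves via get_template_and_fix_tokenizer. An illustrative caller sketch (not part of this commit; the leading arguments are elided because they do not appear in this diff):

    # Illustrative only: unpacking the new return value of get_dataset.
    dataset_module, template = get_dataset(..., stage="sft", tokenizer=tokenizer, processor=processor)
    train_dataset = dataset_module.get("train_dataset")  # DatasetModule is used as a dict in this diff
    eval_dataset = dataset_module.get("eval_dataset")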