Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-08-03 04:02:49 +08:00)
feat: add a batch size option for the map function in dataset preprocessing
Former-commit-id: 209313eeeab8d1a7c320bd9aa90a5f4656082b7c
This commit is contained in:
parent 5af92971bc, commit 46695e42cc
@@ -179,6 +179,9 @@ def _get_preprocessed_dataset(
             load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0),
             desc="Running tokenizer on dataset",
         )
+    if data_args.dataset_map_batch_size:
+        # Override the map function's default batch size when one is set explicitly
+        kwargs.update(batch_size=data_args.dataset_map_batch_size)
 
     dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs)
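For context, a minimal self-contained sketch of the pattern this hunk introduces, using the Hugging Face `datasets` API. The toy data, the trivial `preprocess_func`, and the batch size value 2000 are hypothetical stand-ins; in the real code, `kwargs` also carries `num_proc`, cache, and `desc` settings as shown above:

```python
from datasets import Dataset

# Toy stand-in for the loaded dataset (hypothetical data).
dataset = Dataset.from_dict({"text": [f"example {i}" for i in range(5000)]})

def preprocess_func(examples):
    # Batched map: `examples` is a dict of columns, each holding a list of values.
    return {"length": [len(t) for t in examples["text"]]}

kwargs = dict(desc="Running tokenizer on dataset")

dataset_map_batch_size = 2000  # stands in for data_args.dataset_map_batch_size
if dataset_map_batch_size:
    # Same pattern as the patch: only override the `datasets` default (1000)
    # when a batch size was set explicitly; otherwise leave kwargs untouched.
    kwargs.update(batch_size=dataset_map_batch_size)

dataset = dataset.map(preprocess_func, batched=True, remove_columns=["text"], **kwargs)
print(dataset)  # Dataset({features: ['length'], num_rows: 5000})
```

Updating `kwargs` only when the option is truthy keeps the library default batch size of 1000 in effect for users who never set the new flag.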
@@ -109,6 +109,10 @@ class DataArguments:
         default=None,
         metadata={"help": "Path to save or load the tokenized datasets."},
     )
+    dataset_map_batch_size: Optional[int] = field(
+        default=None,
+        metadata={"help": "Batch size for dataset mapping."},
+    )
 
     def __post_init__(self):
         def split_arg(arg):
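A sketch of how such a dataclass field surfaces as a CLI flag, assuming the usual `transformers.HfArgumentParser` flow; the `DataArguments` here is trimmed to the new field for illustration:

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser

@dataclass
class DataArguments:
    # Trimmed to the new field for illustration; the real class has many more fields.
    dataset_map_batch_size: Optional[int] = field(
        default=None,
        metadata={"help": "Batch size for dataset mapping."},
    )

parser = HfArgumentParser(DataArguments)
# Equivalent to launching with `--dataset_map_batch_size 2000` on the command line.
(data_args,) = parser.parse_args_into_dataclasses(["--dataset_map_batch_size", "2000"])
print(data_args.dataset_map_batch_size)  # 2000; stays None when the flag is omitted
```

Because the default is `None`, the override in the first hunk only fires when a user passes the flag explicitly.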