mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-08-02 03:32:50 +08:00
Merge pull request #5323 from naem1023/feat/add-dataset-map-batch-size-argument
Add a batch size argument for the map function used when preprocessing the dataset
Former-commit-id: 8f441c2b3a5bb84dec2c037a541084c0201726c6
This commit is contained in:
commit
1dfd1aaf82
@ -179,6 +179,9 @@ def _get_preprocessed_dataset(
|
||||
load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0),
|
||||
desc="Running tokenizer on dataset",
|
||||
)
|
||||
if data_args.dataset_map_batch_size:
|
||||
# Only override batch_size when the user set one; otherwise the map function keeps its own default batch size
|
||||
kwargs.update(batch_size=data_args.dataset_map_batch_size)
|
||||
|
||||
dataset = dataset.map(
|
||||
preprocess_func,
|
||||
|
@ -113,6 +113,10 @@ class DataArguments:
|
||||
default=None,
|
||||
metadata={"help": "Path to save or load the tokenized datasets."},
|
||||
)
|
||||
dataset_map_batch_size: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={"help": "Batch size for dataset mapping."},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
def split_arg(arg):
|
||||
|
Loading…
x
Reference in New Issue
Block a user