Mirror of https://github.com/hiyouga/LLaMA-Factory.git
Synced 2025-08-03 04:02:49 +08:00
Merge pull request #5323 from naem1023/feat/add-dataset-map-batch-size-argument
Add a batch size argument for the map function used in dataset preprocessing

Former-commit-id: 8f441c2b3a5bb84dec2c037a541084c0201726c6
Commit 1dfd1aaf82
@@ -179,6 +179,9 @@ def _get_preprocessed_dataset(
             load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0),
             desc="Running tokenizer on dataset",
         )
+    if data_args.dataset_map_batch_size:
+        # Use the user-supplied batch size; when unset, Dataset.map keeps its default.
+        kwargs.update(batch_size=data_args.dataset_map_batch_size)
 
     dataset = dataset.map(
         preprocess_func,
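The added block updates kwargs only when the argument is set, so datasets.Dataset.map otherwise keeps its own default batch size (1000 for batched mapping). Below is a minimal runnable sketch of the same pattern; preprocess_func and the literal batch size here are illustrative stand-ins, not the repository's exact code.

from datasets import Dataset

def preprocess_func(examples):
    # Toy batched transform; the real code runs the tokenizer here.
    return {"text": [t.upper() for t in examples["text"]]}

dataset = Dataset.from_dict({"text": ["a", "b", "c", "d"]})

kwargs = dict(desc="Running tokenizer on dataset")
dataset_map_batch_size = 2  # stands in for data_args.dataset_map_batch_size

if dataset_map_batch_size:
    # Forward the user-supplied batch size; leave Dataset.map's default alone when unset.
    kwargs.update(batch_size=dataset_map_batch_size)

dataset = dataset.map(preprocess_func, batched=True, **kwargs)
print(dataset[0]["text"])  # "A"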
@@ -113,6 +113,10 @@ class DataArguments:
         default=None,
         metadata={"help": "Path to save or load the tokenized datasets."},
     )
+    dataset_map_batch_size: Optional[int] = field(
+        default=None,
+        metadata={"help": "Batch size for dataset mapping."},
+    )
 
     def __post_init__(self):
         def split_arg(arg):
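Since LLaMA-Factory builds its CLI from these dataclasses with transformers.HfArgumentParser, the new field should surface as a --dataset_map_batch_size flag. A trimmed sketch, assuming only this one field (the real DataArguments class defines many more):

from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser

@dataclass
class DataArguments:
    dataset_map_batch_size: Optional[int] = field(
        default=None,
        metadata={"help": "Batch size for dataset mapping."},
    )

parser = HfArgumentParser(DataArguments)
# Equivalent to passing --dataset_map_batch_size 2000 on the command line:
(data_args,) = parser.parse_args_into_dataclasses(["--dataset_map_batch_size", "2000"])
print(data_args.dataset_map_batch_size)  # 2000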