[data] shard the dataset to allow multiprocessing when streaming is enabled (#7530)

* Shard the dataset when streaming to allow multiprocessing

* Allow user to not set dataset_shards to ensure backward compatibility
This commit is contained in:
Billy Cao
2025-04-01 15:36:23 +08:00
committed by GitHub
parent 538e6c70c3
commit 51e741ec85
4 changed files with 12 additions and 4 deletions

View File

@@ -83,6 +83,10 @@ class DataArguments:
default=None,
metadata={"help": "The number of processes to use for the pre-processing."},
)
# Optional shard count for the dataset; defaults to None (left unset).
# Per the help text below, it only applies in streaming mode and is meant
# to match dataloader_num_workers.
dataset_shards: Optional[int] = field(
    default=None,
    metadata={
        "help": (
            "The number of shards to split the dataset into. "
            "Only used in streaming mode. "
            "This should be set to the same as dataloader_num_workers. "
            "Not setting this while streaming data will cause the dataset to be non-sharded "
            "and thus only can be processed using one worker."
        )
    },
)
max_samples: Optional[int] = field(
default=None,
metadata={"help": "For debugging purposes, truncate the number of examples for each dataset."},