[data] fix shared file system (#8179)

This commit is contained in:
hoshi-hiyouga 2025-05-27 18:36:03 +08:00 committed by GitHub
parent d4a413eb37
commit 2bf8e993ab
2 changed files with 6 additions and 2 deletions

View File

@ -300,7 +300,7 @@ def get_dataset(
raise ValueError("Turn off `streaming` when saving dataset to disk.")
# Load and preprocess dataset
with training_args.main_process_first(desc="load dataset", local=(not data_args.data_shared_file_system)):
    dataset = _get_merged_dataset(data_args.dataset, model_args, data_args, training_args, stage)
    eval_dataset = _get_merged_dataset(
        data_args.eval_dataset,
@ -311,7 +311,7 @@ def get_dataset(
        return_dict=data_args.eval_on_each_dataset,
    )
with training_args.main_process_first(desc="pre-process dataset", local=(not data_args.data_shared_file_system)):
    dataset = _get_preprocessed_dataset(
        dataset, data_args, training_args, stage, template, tokenizer, processor, is_eval=False
    )

View File

@ -133,6 +133,10 @@ class DataArguments:
        )
    },
)
data_shared_file_system: bool = field(
default=False,
metadata={"help": "Whether or not to use a shared file system for the datasets."},
)
def __post_init__(self):
    def split_arg(arg):