fix #1909

Former-commit-id: c6abbbfe90dcb0e832f73f0c611fc32eaa7ea78d
2025-08-04 04:32:50 +08:00 · 2023-12-20 16:11:07 +08:00 · 2023-12-20 16:11:07 +08:00 · 633624dc3c
commit 633624dc3c
parent a862ce636f
1 changed files with 7 additions and 4 deletions
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -44,6 +44,13 @@ def preprocess_dataset(
 ) -> Union["Dataset", "IterableDataset"]:
    template = get_template_and_fix_tokenizer(data_args.template, tokenizer)
    if data_args.cache_path is not None and os.path.exists(data_args.cache_path):
        logger.warning("Loading dataset from disk will ignore other data arguments.")
        dataset = load_from_disk(data_args.cache_path)
        if data_args.streaming:
            dataset = dataset.to_iterable_dataset()
        return dataset
    if data_args.train_on_prompt and template.efficient_eos:
        raise ValueError("Current template does not support `train_on_prompt`.")
@@ -240,10 +247,6 @@ def preprocess_dataset(
        preprocess_func = preprocess_unsupervised_dataset
        print_function = print_unsupervised_dataset_example
    if data_args.cache_path is not None and os.path.exists(data_args.cache_path):
        logger.warning("Loading dataset from disk will ignore other data arguments.")
        return load_from_disk(data_args.cache_path)
    with training_args.main_process_first(desc="dataset map pre-processing"):
        column_names = list(next(iter(dataset)).keys())
        kwargs = {}