Merge pull request #7244 from hiyouga/hiyouga/token

[data] avoid exit after saving preprocessed data

Former-commit-id: dcbf01b0035062fa14187e5bdbb925080d349501
2026-06-17 20:58:54 +08:00 · 2025-03-11 15:17:15 +08:00
parent 317d0855d2 37b844d929
commit 1c634d9c53
1 changed file with 2 additions and 5 deletions
--- a/src/llamafactory/data/loader.py
+++ b/src/llamafactory/data/loader.py
@@ -13,7 +13,6 @@
 # limitations under the License.

 import os
-import sys
 from typing import TYPE_CHECKING, Dict, Literal, Optional, Sequence, Union

 import numpy as np
@@ -325,12 +324,10 @@ def get_dataset(
            )

        dataset_dict = split_dataset(dataset, eval_dataset, data_args, seed=training_args.seed)
-        if data_args.tokenized_path is not None:  # save tokenized dataset to disk and exit
+        if data_args.tokenized_path is not None:  # save tokenized dataset to disk
            if training_args.should_save:
                dataset_dict.save_to_disk(data_args.tokenized_path)
                logger.info_rank0(f"Tokenized dataset is saved at {data_args.tokenized_path}.")
-                logger.info_rank0(f"Please restart the training with `tokenized_path: {data_args.tokenized_path}`.")
-
-            sys.exit(0)
+                logger.info_rank0(f"Please launch the training with `tokenized_path: {data_args.tokenized_path}`.")

        return get_dataset_module(dataset_dict)