mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-08-23 06:12:50 +08:00
Merge pull request #7244 from hiyouga/hiyouga/token
[data] avoid exit after saving preprocessed data

Former-commit-id: dcbf01b0035062fa14187e5bdbb925080d349501
This commit is contained in:
commit
1c634d9c53
@ -13,7 +13,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
from typing import TYPE_CHECKING, Dict, Literal, Optional, Sequence, Union
|
from typing import TYPE_CHECKING, Dict, Literal, Optional, Sequence, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -325,12 +324,10 @@ def get_dataset(
|
|||||||
)
|
)
|
||||||
|
|
||||||
dataset_dict = split_dataset(dataset, eval_dataset, data_args, seed=training_args.seed)
|
dataset_dict = split_dataset(dataset, eval_dataset, data_args, seed=training_args.seed)
|
||||||
if data_args.tokenized_path is not None: # save tokenized dataset to disk and exit
|
if data_args.tokenized_path is not None: # save tokenized dataset to disk
|
||||||
if training_args.should_save:
|
if training_args.should_save:
|
||||||
dataset_dict.save_to_disk(data_args.tokenized_path)
|
dataset_dict.save_to_disk(data_args.tokenized_path)
|
||||||
logger.info_rank0(f"Tokenized dataset is saved at {data_args.tokenized_path}.")
|
logger.info_rank0(f"Tokenized dataset is saved at {data_args.tokenized_path}.")
|
||||||
logger.info_rank0(f"Please restart the training with `tokenized_path: {data_args.tokenized_path}`.")
|
logger.info_rank0(f"Please launch the training with `tokenized_path: {data_args.tokenized_path}`.")
|
||||||
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
return get_dataset_module(dataset_dict)
|
return get_dataset_module(dataset_dict)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user