support streaming data, fix #284 #274 #268

This commit is contained in:
hiyouga
2023-07-31 23:33:00 +08:00
parent 513e1f1ec9
commit 0411a4b3e1
28 changed files with 478 additions and 344 deletions

View File

@@ -1,13 +1,12 @@
from typing import Dict
from datasets import Dataset
from typing import TYPE_CHECKING, Dict
if TYPE_CHECKING:
from datasets import Dataset
def split_dataset(
dataset: Dataset, dev_ratio: float, do_train: bool
) -> Dict[str, Dataset]:
# Split the dataset
def split_dataset(dataset: "Dataset", dev_ratio: float, do_train: bool) -> Dict[str, "Dataset"]:
if do_train:
if dev_ratio > 1e-6:
if dev_ratio > 1e-6: # Split the dataset
dataset = dataset.train_test_split(test_size=dev_ratio)
return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]}
else: