1. Add support for custom eval datasets

2. Merge the dataset-loading and dataset-splitting functions


Former-commit-id: 76f3bbcfc0
This commit is contained in:
codingma
2024-07-05 15:52:10 +08:00
parent 8379a39776
commit 74f0d02eb8
16 changed files with 104 additions and 43 deletions

View File

@@ -33,6 +33,11 @@ class DataArguments:
default=None,
metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."},
)
# Optional name(s) of the dataset(s) used for evaluation during training;
# per the help text, multiple names are separated by commas. Defaults to None.
eval_dataset: Optional[str] = field(
    default=None,
    metadata={
        "help": (
            "The name of provided dataset(s) to use for eval during training. "
            "Use commas to separate multiple datasets."
        ),
    },
)
dataset_dir: str = field(
default="data",
metadata={"help": "Path to the folder containing the datasets."},
@@ -105,6 +110,10 @@ class DataArguments:
default=None,
metadata={"help": "Path to save or load the tokenized datasets."},
)
# Optional path used to save or load the tokenized eval datasets
# (per the help text). Defaults to None.
eval_tokenized_path: Optional[str] = field(
    metadata={
        "help": "Path to save or load the tokenized eval datasets.",
    },
    default=None,
)
def __post_init__(self):
if self.streaming and self.val_size > 1e-6 and self.val_size < 1: