1. add custom eval dataset support

2. merge the load dataset and split dataset functions


Former-commit-id: 76f3bbcfc0
This commit is contained in:
codingma
2024-07-05 15:52:10 +08:00
parent 8379a39776
commit 74f0d02eb8
16 changed files with 104 additions and 43 deletions

View File

@@ -12,7 +12,8 @@ Currently we support datasets in **alpaca** and **sharegpt** format.
"ranking": "whether the dataset is a preference dataset or not. (default: False)",
"subset": "the name of the subset. (optional, default: None)",
"folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
"num_samples": "the number of samples in the dataset used for training. (optional, default: None)",
"num_samples": "the number of samples in the dataset used for training. (optional, default: None)",
"split": "which dataset split to use for training and evaluation (optional, default: train)",
"columns (optional)": {
"prompt": "the column name in the dataset containing the prompts. (default: instruction)",
"query": "the column name in the dataset containing the queries. (default: input)",

View File

@@ -13,6 +13,7 @@
"subset": "数据集子集的名称可选默认None",
"folder": "Hugging Face 仓库的文件夹名称可选默认None",
"num_samples": "该数据集中用于训练的样本数量。可选默认None",
"split": "数据集中的要使用的训练测试集切分可选默认train",
"columns可选": {
"prompt": "数据集代表提示词的表头名称默认instruction",
"query": "数据集代表请求的表头名称默认input",

View File

@@ -172,9 +172,19 @@
"deepctrl": {
"ms_hub_url": "deepctrl/deepctrl-sft-data"
},
"adgen": {
"adgen_train": {
"hf_hub_url": "HasturOfficial/adgen",
"ms_hub_url": "AI-ModelScope/adgen",
"split": "train",
"columns": {
"prompt": "content",
"response": "summary"
}
},
"adgen_val": {
"hf_hub_url": "HasturOfficial/adgen",
"ms_hub_url": "AI-ModelScope/adgen",
"split": "validation",
"columns": {
"prompt": "content",
"response": "summary"