1. add custom eval dataset support

2. merge load dataset and split dataset function Former-commit-id: 76f3bbcfc0
2026-03-06 19:56:01 +08:00 · 2024-07-05 15:52:10 +08:00
parent 8379a39776
commit 74f0d02eb8
16 changed files with 104 additions and 43 deletions
--- a/data/README.md
+++ b/data/README.md
@@ -12,7 +12,8 @@ Currently we support datasets in **alpaca** and **sharegpt** format.
  "ranking": "whether the dataset is a preference dataset or not. (default: False)",
  "subset": "the name of the subset. (optional, default: None)",
  "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
-  "num_samples": "the number of samples in the dataset used for training. (optional, default: None)",
+  "num_samples": "the number of samples in the dataset used for training. (optional, default: None)", 
+  "split": "the name of the dataset split to use, e.g. train or validation. (optional, default: train)",
  "columns (optional)": {
    "prompt": "the column name in the dataset containing the prompts. (default: instruction)",
    "query": "the column name in the dataset containing the queries. (default: input)",
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -13,6 +13,7 @@
  "subset": "数据集子集的名称（可选，默认：None）",
  "folder": "Hugging Face 仓库的文件夹名称（可选，默认：None）",
  "num_samples": "该数据集中用于训练的样本数量。（可选，默认：None）",
+  "split": "所使用的数据集切分，例如 train 或 validation（可选，默认：train）",
  "columns（可选）": {
    "prompt": "数据集代表提示词的表头名称（默认：instruction）",
    "query": "数据集代表请求的表头名称（默认：input）",
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -172,9 +172,19 @@
  "deepctrl": {
    "ms_hub_url": "deepctrl/deepctrl-sft-data"
  },
-  "adgen": {
+  "adgen_train": {
    "hf_hub_url": "HasturOfficial/adgen",
    "ms_hub_url": "AI-ModelScope/adgen",
+    "split": "train",
+    "columns": {
+      "prompt": "content",
+      "response": "summary"
+    }
+  },
+  "adgen_val": {
+    "hf_hub_url": "HasturOfficial/adgen",
+    "ms_hub_url": "AI-ModelScope/adgen",
+    "split": "validation",
    "columns": {
      "prompt": "content",
      "response": "summary"