1. add custom eval dataset support

2. merge load dataset and split dataset function
2026-03-07 20:26:00 +08:00 · 2024-07-05 15:52:10 +08:00
parent 9f33f1edf5
commit 76f3bbcfc0
16 changed files with 104 additions and 43 deletions
--- a/data/README.md
+++ b/data/README.md
@@ -12,7 +12,8 @@ Currently we support datasets in **alpaca** and **sharegpt** format.
  "ranking": "whether the dataset is a preference dataset or not. (default: False)",
  "subset": "the name of the subset. (optional, default: None)",
  "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
-  "num_samples": "the number of samples in the dataset used for training. (optional, default: None)",
+  "num_samples": "the number of samples in the dataset used for training. (optional, default: None)",
+  "split": "which dataset split to use for training and evaluation. (optional, default: train)",
  "columns (optional)": {
    "prompt": "the column name in the dataset containing the prompts. (default: instruction)",
    "query": "the column name in the dataset containing the queries. (default: input)",