Merge pull request #3829 from seanzhang-zhichen/add_dataset_sample_num

Add dataset sample num Former-commit-id: 483eb47e5d
2026-03-07 04:05:58 +08:00 · 2024-05-30 00:25:45 +08:00
parent 820404946e 7b83c550ab
commit 9b6bdf9449
4 changed files with 23 additions and 6 deletions
--- a/data/README.md
+++ b/data/README.md
@@ -12,6 +12,7 @@ Currently we support datasets in **alpaca** and **sharegpt** format.
  "ranking": "whether the dataset is a preference dataset or not. (default: False)",
  "subset": "the name of the subset. (optional, default: None)",
  "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
+  "num_samples": "the number of samples in the dataset used for training. (optional, default: None)",
  "columns (optional)": {
    "prompt": "the column name in the dataset containing the prompts. (default: instruction)",
    "query": "the column name in the dataset containing the queries. (default: input)",
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -12,6 +12,7 @@
  "ranking": "是否为偏好数据集（可选，默认：False）",
  "subset": "数据集子集的名称（可选，默认：None）",
  "folder": "Hugging Face 仓库的文件夹名称（可选，默认：None）",
+  "num_samples": "该数据集中用于训练的样本数量。（可选，默认：None）",
  "columns（可选）": {
    "prompt": "数据集代表提示词的表头名称（默认：instruction）",
    "query": "数据集代表请求的表头名称（默认：input）",