[data] add coig-p dataset (#7657)

2026-03-06 19:56:01 +08:00 · 2025-04-09 21:18:25 +08:00
parent 7dd35cff8a
commit cca359fb6d
11 changed files with 325 additions and 915 deletions
--- a/data/README.md
+++ b/data/README.md
@@ -85,7 +85,7 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh

 ### Pre-training Dataset

- [Example dataset](c4_demo.json)
+- [Example dataset](c4_demo.jsonl)

 In pre-training, only the `text` column will be used for model learning.

--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -85,7 +85,7 @@

 ### 预训练数据集

- [样例数据集](c4_demo.json)
+- [样例数据集](c4_demo.jsonl)

 在预训练时，只有 `text` 列中的内容会用于模型学习。

--- a/data/c4_demo.json
+++ b/data/c4_demo.json
--- a/data/c4_demo.jsonl
+++ b/data/c4_demo.jsonl
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -527,6 +527,16 @@
      "rejected": "rejected"
    }
  },
+  "coig_p": {
+    "hf_hub_url": "m-a-p/COIG-P",
+    "ranking": true,
+    "formatting": "sharegpt",
+    "columns": {
+      "messages": "conversations",
+      "chosen": "chosen",
+      "rejected": "rejected"
+    }
+  },
  "rlhf_v": {
    "hf_hub_url": "llamafactory/RLHF-V",
    "ranking": true,
@@ -622,7 +632,7 @@
    }
  },
  "c4_demo": {
-    "file_name": "c4_demo.json",
+    "file_name": "c4_demo.jsonl",
    "columns": {
      "prompt": "text"
    }