[data] add coig-p dataset (#7657)

This commit is contained in:
hoshi-hiyouga
2025-04-09 21:18:25 +08:00
committed by GitHub
parent 7dd35cff8a
commit cca359fb6d
11 changed files with 325 additions and 915 deletions

View File

@@ -85,7 +85,7 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
### Pre-training Dataset
- [Example dataset](c4_demo.json)
- [Example dataset](c4_demo.jsonl)
In pre-training, only the `text` column will be used for model learning.

View File

@@ -85,7 +85,7 @@
### 预训练数据集
- [样例数据集](c4_demo.json)
- [样例数据集](c4_demo.jsonl)
在预训练时，只有 `text` 列中的内容会用于模型学习。

File diff suppressed because one or more lines are too long

300
data/c4_demo.jsonl Normal file

File diff suppressed because one or more lines are too long

View File

@@ -527,6 +527,16 @@
"rejected": "rejected"
}
},
"coig_p": {
"hf_hub_url": "m-a-p/COIG-P",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"rlhf_v": {
"hf_hub_url": "llamafactory/RLHF-V",
"ranking": true,
@@ -622,7 +632,7 @@
}
},
"c4_demo": {
"file_name": "c4_demo.json",
"file_name": "c4_demo.jsonl",
"columns": {
"prompt": "text"
}