mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-12-15 19:30:36 +08:00
[data] add coig-p dataset (#7657)
This commit is contained in:
@@ -85,7 +85,7 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
|
||||
|
||||
### Pre-training Dataset
|
||||
|
||||
- [Example dataset](c4_demo.json)
|
||||
- [Example dataset](c4_demo.jsonl)
|
||||
|
||||
In pre-training, only the `text` column will be used for model learning.
|
||||
|
||||
|
||||
@@ -85,7 +85,7 @@
|
||||
|
||||
### 预训练数据集
|
||||
|
||||
- [样例数据集](c4_demo.json)
|
||||
- [样例数据集](c4_demo.jsonl)
|
||||
|
||||
在预训练时,只有 `text` 列中的内容会用于模型学习。
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
300
data/c4_demo.jsonl
Normal file
300
data/c4_demo.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -527,6 +527,16 @@
|
||||
"rejected": "rejected"
|
||||
}
|
||||
},
|
||||
"coig_p": {
|
||||
"hf_hub_url": "m-a-p/COIG-P",
|
||||
"ranking": true,
|
||||
"formatting": "sharegpt",
|
||||
"columns": {
|
||||
"messages": "conversations",
|
||||
"chosen": "chosen",
|
||||
"rejected": "rejected"
|
||||
}
|
||||
},
|
||||
"rlhf_v": {
|
||||
"hf_hub_url": "llamafactory/RLHF-V",
|
||||
"ranking": true,
|
||||
@@ -622,7 +632,7 @@
|
||||
}
|
||||
},
|
||||
"c4_demo": {
|
||||
"file_name": "c4_demo.json",
|
||||
"file_name": "c4_demo.jsonl",
|
||||
"columns": {
|
||||
"prompt": "text"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user