[data] add coig-p dataset (#7657)

This commit is contained in:
hoshi-hiyouga 2025-04-09 21:18:25 +08:00 committed by GitHub
parent 24cb890432
commit 34fdabe005
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 325 additions and 915 deletions

View File

@ -384,6 +384,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
- [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)

View File

@ -387,6 +387,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
- [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)

View File

@ -85,7 +85,7 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
### Pre-training Dataset
- [Example dataset](c4_demo.json)
- [Example dataset](c4_demo.jsonl)
In pre-training, only the `text` column will be used for model learning.

View File

@ -85,7 +85,7 @@
### 预训练数据集
- [样例数据集](c4_demo.json)
- [样例数据集](c4_demo.jsonl)
在预训练时,只有 `text` 列中的内容会用于模型学习。

File diff suppressed because one or more lines are too long

300
data/c4_demo.jsonl Normal file

File diff suppressed because one or more lines are too long

View File

@ -527,6 +527,16 @@
"rejected": "rejected"
}
},
"coig_p": {
"hf_hub_url": "m-a-p/COIG-P",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"rlhf_v": {
"hf_hub_url": "llamafactory/RLHF-V",
"ranking": true,
@ -622,7 +632,7 @@
}
},
"c4_demo": {
"file_name": "c4_demo.json",
"file_name": "c4_demo.jsonl",
"columns": {
"prompt": "text"
}

View File

@ -1,7 +1,7 @@
transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
datasets>=2.16.0,<=3.4.1
accelerate>=0.34.0,<=1.5.2
peft>=0.14.0,<=0.15.0
datasets>=2.16.0,<=3.5.0
accelerate>=0.34.0,<=1.6.0
peft>=0.14.0,<=0.15.1
trl>=0.8.6,<=0.9.6
tokenizers>=0.19.0,<=0.21.0
gradio>=4.38.0,<=5.21.0

View File

@ -20,9 +20,9 @@ Level:
Dependency graph:
main:
transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
datasets>=2.16.0,<=3.4.1
accelerate>=0.34.0,<=1.5.2
peft>=0.14.0,<=0.15.0
datasets>=2.16.0,<=3.5.0
accelerate>=0.34.0,<=1.6.0
peft>=0.14.0,<=0.15.1
trl>=0.8.6,<=0.9.6
attention:
transformers>=4.42.4 (gemma+fa2)

View File

@ -90,9 +90,9 @@ def check_version(requirement: str, mandatory: bool = False) -> None:
def check_dependencies() -> None:
r"""Check the version of the required packages."""
check_version("transformers>=4.41.2,<=4.51.1,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0")
check_version("datasets>=2.16.0,<=3.4.1")
check_version("accelerate>=0.34.0,<=1.5.2")
check_version("peft>=0.14.0,<=0.15.0")
check_version("datasets>=2.16.0,<=3.5.0")
check_version("accelerate>=0.34.0,<=1.6.0")
check_version("peft>=0.14.0,<=0.15.1")
check_version("trl>=0.8.6,<=0.9.6")
if is_transformers_version_greater_than("4.46.0") and not is_transformers_version_greater_than("4.48.1"):
logger.warning_rank0_once("There are known bugs in transformers v4.46.0-v4.48.0, please use other versions.")

View File

@ -111,7 +111,7 @@ def create_export_tab(engine: "Engine") -> dict[str, "Component"]:
with gr.Row():
export_size = gr.Slider(minimum=1, maximum=100, value=5, step=1)
export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none")
export_quantization_dataset = gr.Textbox(value="data/c4_demo.json")
export_quantization_dataset = gr.Textbox(value="data/c4_demo.jsonl")
export_device = gr.Radio(choices=["cpu", "auto"], value="cpu")
export_legacy_format = gr.Checkbox()