[data] add coig-p dataset (#7657)

2025-12-15 19:30:36 +08:00 · 2025-04-09 21:18:25 +08:00
parent 24cb890432
commit 34fdabe005
11 changed files with 325 additions and 915 deletions
--- a/README.md
+++ b/README.md
@@ -384,6 +384,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t

 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
+- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
--- a/README_zh.md
+++ b/README_zh.md
@@ -387,6 +387,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc

 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
+- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
--- a/data/README.md
+++ b/data/README.md
@@ -85,7 +85,7 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh

 ### Pre-training Dataset

- [Example dataset](c4_demo.json)
+- [Example dataset](c4_demo.jsonl)

 In pre-training, only the `text` column will be used for model learning.

--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -85,7 +85,7 @@

 ### 预训练数据集

- [样例数据集](c4_demo.json)
+- [样例数据集](c4_demo.jsonl)

 在预训练时，只有 `text` 列中的内容会用于模型学习。

--- a/data/c4_demo.json
+++ b/data/c4_demo.json
--- a/data/c4_demo.jsonl
+++ b/data/c4_demo.jsonl
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -527,6 +527,16 @@
      "rejected": "rejected"
    }
  },
+  "coig_p": {
+    "hf_hub_url": "m-a-p/COIG-P",
+    "ranking": true,
+    "formatting": "sharegpt",
+    "columns": {
+      "messages": "conversations",
+      "chosen": "chosen",
+      "rejected": "rejected"
+    }
+  },
  "rlhf_v": {
    "hf_hub_url": "llamafactory/RLHF-V",
    "ranking": true,
@@ -622,7 +632,7 @@
    }
  },
  "c4_demo": {
-    "file_name": "c4_demo.json",
+    "file_name": "c4_demo.jsonl",
    "columns": {
      "prompt": "text"
    }
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
-datasets>=2.16.0,<=3.4.1
-accelerate>=0.34.0,<=1.5.2
-peft>=0.14.0,<=0.15.0
+datasets>=2.16.0,<=3.5.0
+accelerate>=0.34.0,<=1.6.0
+peft>=0.14.0,<=0.15.1
 trl>=0.8.6,<=0.9.6
 tokenizers>=0.19.0,<=0.21.0
 gradio>=4.38.0,<=5.21.0
--- a/src/llamafactory/__init__.py
+++ b/src/llamafactory/__init__.py
@@ -20,9 +20,9 @@ Level:
 Dependency graph:
  main:
    transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
-    datasets>=2.16.0,<=3.4.1
-    accelerate>=0.34.0,<=1.5.2
-    peft>=0.14.0,<=0.15.0
+    datasets>=2.16.0,<=3.5.0
+    accelerate>=0.34.0,<=1.6.0
+    peft>=0.14.0,<=0.15.1
    trl>=0.8.6,<=0.9.6
  attention:
    transformers>=4.42.4 (gemma+fa2)
--- a/src/llamafactory/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
@@ -90,9 +90,9 @@ def check_version(requirement: str, mandatory: bool = False) -> None:
 def check_dependencies() -> None:
    r"""Check the version of the required packages."""
    check_version("transformers>=4.41.2,<=4.51.1,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0")
-    check_version("datasets>=2.16.0,<=3.4.1")
-    check_version("accelerate>=0.34.0,<=1.5.2")
-    check_version("peft>=0.14.0,<=0.15.0")
+    check_version("datasets>=2.16.0,<=3.5.0")
+    check_version("accelerate>=0.34.0,<=1.6.0")
+    check_version("peft>=0.14.0,<=0.15.1")
    check_version("trl>=0.8.6,<=0.9.6")
    if is_transformers_version_greater_than("4.46.0") and not is_transformers_version_greater_than("4.48.1"):
        logger.warning_rank0_once("There are known bugs in transformers v4.46.0-v4.48.0, please use other versions.")
--- a/src/llamafactory/webui/components/export.py
+++ b/src/llamafactory/webui/components/export.py
@@ -111,7 +111,7 @@ def create_export_tab(engine: "Engine") -> dict[str, "Component"]:
    with gr.Row():
        export_size = gr.Slider(minimum=1, maximum=100, value=5, step=1)
        export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none")
-        export_quantization_dataset = gr.Textbox(value="data/c4_demo.json")
+        export_quantization_dataset = gr.Textbox(value="data/c4_demo.jsonl")
        export_device = gr.Radio(choices=["cpu", "auto"], value="cpu")
        export_legacy_format = gr.Checkbox()