Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-07-31 10:42:50 +08:00)
[data] add coig-p dataset (#7657)
commit 34fdabe005 (parent 24cb890432)
README.md
@@ -384,6 +384,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t

 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
+- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
README_zh.md
@@ -387,6 +387,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc

 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
+- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
data/README.md
@@ -85,7 +85,7 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh

 ### Pre-training Dataset

-- [Example dataset](c4_demo.json)
+- [Example dataset](c4_demo.jsonl)

 In pre-training, only the `text` column will be used for model learning.
data/README_zh.md
@@ -85,7 +85,7 @@

 ### 预训练数据集

-- [样例数据集](c4_demo.json)
+- [样例数据集](c4_demo.jsonl)

 在预训练时,只有 `text` 列中的内容会用于模型学习。
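As both README hunks above state, pre-training data now ships as JSON Lines, and only the `text` column is consumed during pre-training. A minimal sketch of writing and reading such a file follows; the sample strings are placeholders, not records from the real C4 demo data:

```python
import json

# Write a minimal pre-training corpus in JSON Lines format: one JSON
# object per line, with the raw document under the "text" key, which
# is the only column used during pre-training.
samples = [
    {"text": "Illustrative document one."},  # placeholder, not real C4 data
    {"text": "Illustrative document two."},
]
with open("c4_demo.jsonl", "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

# Read it back line by line, the same way a JSONL loader consumes it.
with open("c4_demo.jsonl", encoding="utf-8") as f:
    for line in f:
        print(json.loads(line)["text"])
```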
data/c4_demo.jsonl (new file, 300 lines)
File diff suppressed because one or more lines are too long
data/dataset_info.json
@@ -527,6 +527,16 @@
       "rejected": "rejected"
     }
   },
+  "coig_p": {
+    "hf_hub_url": "m-a-p/COIG-P",
+    "ranking": true,
+    "formatting": "sharegpt",
+    "columns": {
+      "messages": "conversations",
+      "chosen": "chosen",
+      "rejected": "rejected"
+    }
+  },
   "rlhf_v": {
     "hf_hub_url": "llamafactory/RLHF-V",
     "ranking": true,
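The new `coig_p` entry registers a ranked (preference) dataset in sharegpt formatting, mapping the Hub's `conversations`, `chosen`, and `rejected` fields. A quick sketch of inspecting those fields directly with the `datasets` library; the `train` split name is an assumption, not something stated in the diff:

```python
from datasets import load_dataset

# Pull the COIG-P preference dataset from the Hugging Face Hub. The
# column mapping registered above expects "conversations", "chosen",
# and "rejected" fields on each record.
dataset = load_dataset("m-a-p/COIG-P", split="train")  # split name assumed

example = dataset[0]
print(example["conversations"])  # sharegpt-style message list (prompt side)
print(example["chosen"])         # preferred response
print(example["rejected"])       # dispreferred response
```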
@@ -622,7 +632,7 @@
     }
   },
   "c4_demo": {
-    "file_name": "c4_demo.json",
+    "file_name": "c4_demo.jsonl",
     "columns": {
       "prompt": "text"
     }
requirements.txt
@@ -1,7 +1,7 @@
 transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
-datasets>=2.16.0,<=3.4.1
-accelerate>=0.34.0,<=1.5.2
-peft>=0.14.0,<=0.15.0
+datasets>=2.16.0,<=3.5.0
+accelerate>=0.34.0,<=1.6.0
+peft>=0.14.0,<=0.15.1
 trl>=0.8.6,<=0.9.6
 tokenizers>=0.19.0,<=0.21.0
 gradio>=4.38.0,<=5.21.0
src/llamafactory/__init__.py
@@ -20,9 +20,9 @@ Level:
 Dependency graph:
   main:
     transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
-    datasets>=2.16.0,<=3.4.1
-    accelerate>=0.34.0,<=1.5.2
-    peft>=0.14.0,<=0.15.0
+    datasets>=2.16.0,<=3.5.0
+    accelerate>=0.34.0,<=1.6.0
+    peft>=0.14.0,<=0.15.1
     trl>=0.8.6,<=0.9.6
   attention:
     transformers>=4.42.4 (gemma+fa2)
src/llamafactory/extras/misc.py
@@ -90,9 +90,9 @@ def check_version(requirement: str, mandatory: bool = False) -> None:

 def check_dependencies() -> None:
     r"""Check the version of the required packages."""
     check_version("transformers>=4.41.2,<=4.51.1,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0")
-    check_version("datasets>=2.16.0,<=3.4.1")
-    check_version("accelerate>=0.34.0,<=1.5.2")
-    check_version("peft>=0.14.0,<=0.15.0")
+    check_version("datasets>=2.16.0,<=3.5.0")
+    check_version("accelerate>=0.34.0,<=1.6.0")
+    check_version("peft>=0.14.0,<=0.15.1")
     check_version("trl>=0.8.6,<=0.9.6")
     if is_transformers_version_greater_than("4.46.0") and not is_transformers_version_greater_than("4.48.1"):
         logger.warning_rank0_once("There are known bugs in transformers v4.46.0-v4.48.0, please use other versions.")
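For readers unfamiliar with how such range checks work, here is a standalone sketch of validating an installed package against a PEP 508 specifier using `packaging` and `importlib.metadata`. It is illustrative only and is not the repository's actual `check_version` implementation:

```python
from importlib.metadata import version

from packaging.requirements import Requirement


def check_requirement(requirement: str) -> None:
    """Raise if the installed distribution violates the given PEP 508 range.

    A standalone sketch of this kind of check, not the repository's
    actual check_version implementation.
    """
    req = Requirement(requirement)
    installed = version(req.name)  # installed version string, e.g. "3.5.0"
    if not req.specifier.contains(installed, prereleases=True):
        raise RuntimeError(f"{req.name}=={installed} does not satisfy '{requirement}'")


check_requirement("datasets>=2.16.0,<=3.5.0")
```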
src/llamafactory/webui/components/export.py
@@ -111,7 +111,7 @@ def create_export_tab(engine: "Engine") -> dict[str, "Component"]:
     with gr.Row():
         export_size = gr.Slider(minimum=1, maximum=100, value=5, step=1)
         export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none")
-        export_quantization_dataset = gr.Textbox(value="data/c4_demo.json")
+        export_quantization_dataset = gr.Textbox(value="data/c4_demo.jsonl")
         export_device = gr.Radio(choices=["cpu", "auto"], value="cpu")
         export_legacy_format = gr.Checkbox()
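For context, a self-contained sketch of the widget row touched above, using the same Gradio component calls that appear in the diff; the `GPTQ_BITS` values and the `label` texts are assumptions added for this sketch, not taken from the repository:

```python
import gradio as gr

GPTQ_BITS = ["8", "4", "3", "2"]  # assumed values, defined locally for this sketch

with gr.Blocks() as demo:
    with gr.Row():
        export_size = gr.Slider(minimum=1, maximum=100, value=5, step=1, label="Max shard size (GB)")
        export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none", label="Quantization bits")
        # The default calibration corpus now points at the JSONL file.
        export_quantization_dataset = gr.Textbox(value="data/c4_demo.jsonl", label="Quantization dataset")
        export_device = gr.Radio(choices=["cpu", "auto"], value="cpu", label="Export device")
        export_legacy_format = gr.Checkbox(label="Export legacy format")

demo.launch()
```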