mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-08-01 11:12:50 +08:00
[data] add coig-p dataset (#7657)
This commit is contained in:
parent
24cb890432
commit
34fdabe005
@ -384,6 +384,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
|
|||||||
|
|
||||||
- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
|
- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
|
||||||
- [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
|
- [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
|
||||||
|
- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
|
||||||
- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
|
- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
|
||||||
- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
|
- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
|
||||||
- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
|
- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
|
||||||
|
@ -387,6 +387,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
|
|||||||
|
|
||||||
- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
|
- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
|
||||||
- [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
|
- [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
|
||||||
|
- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
|
||||||
- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
|
- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
|
||||||
- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
|
- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
|
||||||
- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
|
- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
|
||||||
|
@ -85,7 +85,7 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
|
|||||||
|
|
||||||
### Pre-training Dataset
|
### Pre-training Dataset
|
||||||
|
|
||||||
- [Example dataset](c4_demo.json)
|
- [Example dataset](c4_demo.jsonl)
|
||||||
|
|
||||||
In pre-training, only the `text` column will be used for model learning.
|
In pre-training, only the `text` column will be used for model learning.
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@
|
|||||||
|
|
||||||
### 预训练数据集
|
### 预训练数据集
|
||||||
|
|
||||||
- [样例数据集](c4_demo.json)
|
- [样例数据集](c4_demo.jsonl)
|
||||||
|
|
||||||
在预训练时,只有 `text` 列中的内容会用于模型学习。
|
在预训练时,只有 `text` 列中的内容会用于模型学习。
|
||||||
|
|
||||||
|
File diff suppressed because one or more lines are too long
300
data/c4_demo.jsonl
Normal file
300
data/c4_demo.jsonl
Normal file
File diff suppressed because one or more lines are too long
@ -527,6 +527,16 @@
|
|||||||
"rejected": "rejected"
|
"rejected": "rejected"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"coig_p": {
|
||||||
|
"hf_hub_url": "m-a-p/COIG-P",
|
||||||
|
"ranking": true,
|
||||||
|
"formatting": "sharegpt",
|
||||||
|
"columns": {
|
||||||
|
"messages": "conversations",
|
||||||
|
"chosen": "chosen",
|
||||||
|
"rejected": "rejected"
|
||||||
|
}
|
||||||
|
},
|
||||||
"rlhf_v": {
|
"rlhf_v": {
|
||||||
"hf_hub_url": "llamafactory/RLHF-V",
|
"hf_hub_url": "llamafactory/RLHF-V",
|
||||||
"ranking": true,
|
"ranking": true,
|
||||||
@ -622,7 +632,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"c4_demo": {
|
"c4_demo": {
|
||||||
"file_name": "c4_demo.json",
|
"file_name": "c4_demo.jsonl",
|
||||||
"columns": {
|
"columns": {
|
||||||
"prompt": "text"
|
"prompt": "text"
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
|
transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
|
||||||
datasets>=2.16.0,<=3.4.1
|
datasets>=2.16.0,<=3.5.0
|
||||||
accelerate>=0.34.0,<=1.5.2
|
accelerate>=0.34.0,<=1.6.0
|
||||||
peft>=0.14.0,<=0.15.0
|
peft>=0.14.0,<=0.15.1
|
||||||
trl>=0.8.6,<=0.9.6
|
trl>=0.8.6,<=0.9.6
|
||||||
tokenizers>=0.19.0,<=0.21.0
|
tokenizers>=0.19.0,<=0.21.0
|
||||||
gradio>=4.38.0,<=5.21.0
|
gradio>=4.38.0,<=5.21.0
|
||||||
|
@ -20,9 +20,9 @@ Level:
|
|||||||
Dependency graph:
|
Dependency graph:
|
||||||
main:
|
main:
|
||||||
transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
|
transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
|
||||||
datasets>=2.16.0,<=3.4.1
|
datasets>=2.16.0,<=3.5.0
|
||||||
accelerate>=0.34.0,<=1.5.2
|
accelerate>=0.34.0,<=1.6.0
|
||||||
peft>=0.14.0,<=0.15.0
|
peft>=0.14.0,<=0.15.1
|
||||||
trl>=0.8.6,<=0.9.6
|
trl>=0.8.6,<=0.9.6
|
||||||
attention:
|
attention:
|
||||||
transformers>=4.42.4 (gemma+fa2)
|
transformers>=4.42.4 (gemma+fa2)
|
||||||
|
@ -90,9 +90,9 @@ def check_version(requirement: str, mandatory: bool = False) -> None:
|
|||||||
def check_dependencies() -> None:
|
def check_dependencies() -> None:
|
||||||
r"""Check the version of the required packages."""
|
r"""Check the version of the required packages."""
|
||||||
check_version("transformers>=4.41.2,<=4.51.1,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0")
|
check_version("transformers>=4.41.2,<=4.51.1,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0")
|
||||||
check_version("datasets>=2.16.0,<=3.4.1")
|
check_version("datasets>=2.16.0,<=3.5.0")
|
||||||
check_version("accelerate>=0.34.0,<=1.5.2")
|
check_version("accelerate>=0.34.0,<=1.6.0")
|
||||||
check_version("peft>=0.14.0,<=0.15.0")
|
check_version("peft>=0.14.0,<=0.15.1")
|
||||||
check_version("trl>=0.8.6,<=0.9.6")
|
check_version("trl>=0.8.6,<=0.9.6")
|
||||||
if is_transformers_version_greater_than("4.46.0") and not is_transformers_version_greater_than("4.48.1"):
|
if is_transformers_version_greater_than("4.46.0") and not is_transformers_version_greater_than("4.48.1"):
|
||||||
logger.warning_rank0_once("There are known bugs in transformers v4.46.0-v4.48.0, please use other versions.")
|
logger.warning_rank0_once("There are known bugs in transformers v4.46.0-v4.48.0, please use other versions.")
|
||||||
|
@ -111,7 +111,7 @@ def create_export_tab(engine: "Engine") -> dict[str, "Component"]:
|
|||||||
with gr.Row():
|
with gr.Row():
|
||||||
export_size = gr.Slider(minimum=1, maximum=100, value=5, step=1)
|
export_size = gr.Slider(minimum=1, maximum=100, value=5, step=1)
|
||||||
export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none")
|
export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none")
|
||||||
export_quantization_dataset = gr.Textbox(value="data/c4_demo.json")
|
export_quantization_dataset = gr.Textbox(value="data/c4_demo.jsonl")
|
||||||
export_device = gr.Radio(choices=["cpu", "auto"], value="cpu")
|
export_device = gr.Radio(choices=["cpu", "auto"], value="cpu")
|
||||||
export_legacy_format = gr.Checkbox()
|
export_legacy_format = gr.Checkbox()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user