mirror of https://github.com/hiyouga/LLaMA-Factory.git

[data] add coig-p dataset (#7657)

parent 89a4f9ec7f
commit 4eec541857
@@ -384,6 +384,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 
 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
+- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
@@ -387,6 +387,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc
 
 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
+- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
 - [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
 - [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
 - [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
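Beyond the README lists, new datasets are described in `dataset_info.json` (referenced in the next hunk). The sketch below is hypothetical: the entry key, the `hf_hub_url`, and the column mapping are assumptions for illustration and are not taken from this commit's diff.

```python
# Hypothetical sketch: register a pairwise preference dataset in
# data/dataset_info.json. The key "coig_p" and the column names are
# assumptions for illustration, not copied from this commit.
import json

entry = {
    "coig_p": {
        "hf_hub_url": "m-a-p/COIG-P",
        "ranking": True,  # marks the dataset as preference (chosen/rejected) data
        "columns": {"prompt": "prompt", "chosen": "chosen", "rejected": "rejected"},
    }
}

with open("data/dataset_info.json", encoding="utf-8") as f:
    info = json.load(f)

info.update(entry)

with open("data/dataset_info.json", "w", encoding="utf-8") as f:
    json.dump(info, f, ensure_ascii=False, indent=2)
```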
@@ -85,7 +85,7 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
 
 ### Pre-training Dataset
 
-- [Example dataset](c4_demo.json)
+- [Example dataset](c4_demo.jsonl)
 
 In pre-training, only the `text` column will be used for model learning.
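The example pre-training file is now JSON Lines. A minimal sketch of reading its `text` column (the only column used for pre-training, per the hunk above) could look like this; the path comes from the diff, the rest is illustrative.

```python
import json

# Minimal sketch: read the pre-training demo file line by line and keep only
# the `text` column, which is the only column used during pre-training.
texts = []
with open("data/c4_demo.jsonl", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            texts.append(json.loads(line)["text"])

print(f"loaded {len(texts)} pre-training samples")
```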
@@ -85,7 +85,7 @@
 
 ### 预训练数据集
 
-- [样例数据集](c4_demo.json)
+- [样例数据集](c4_demo.jsonl)
 
 在预训练时,只有 `text` 列中的内容会用于模型学习。
							
								
								
									
300  data/c4_demo.jsonl  Normal file
File diff suppressed because one or more lines are too long
@@ -1,7 +1,7 @@
 transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
-datasets>=2.16.0,<=3.4.1
-accelerate>=0.34.0,<=1.5.2
-peft>=0.14.0,<=0.15.0
+datasets>=2.16.0,<=3.5.0
+accelerate>=0.34.0,<=1.6.0
+peft>=0.14.0,<=0.15.1
 trl>=0.8.6,<=0.9.6
 tokenizers>=0.19.0,<=0.21.0
 gradio>=4.38.0,<=5.21.0
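As a quick sanity check, the updated pins can be verified against locally installed packages with a small standalone script. The requirement strings below mirror the hunk above; the script itself is illustrative and not part of the repo.

```python
# Standalone sketch: check whether installed versions satisfy the updated pins.
from importlib.metadata import version
from packaging.specifiers import SpecifierSet

PINS = {
    "datasets": ">=2.16.0,<=3.5.0",
    "accelerate": ">=0.34.0,<=1.6.0",
    "peft": ">=0.14.0,<=0.15.1",
}

for name, spec in PINS.items():
    installed = version(name)
    ok = installed in SpecifierSet(spec)
    print(f"{name}=={installed} satisfies '{spec}': {ok}")
```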
@@ -20,9 +20,9 @@ Level:
 Dependency graph:
   main:
     transformers>=4.41.2,<=4.51.1,!=4.46.*,!=4.47.*,!=4.48.0
-    datasets>=2.16.0,<=3.4.1
-    accelerate>=0.34.0,<=1.5.2
-    peft>=0.14.0,<=0.15.0
+    datasets>=2.16.0,<=3.5.0
+    accelerate>=0.34.0,<=1.6.0
+    peft>=0.14.0,<=0.15.1
     trl>=0.8.6,<=0.9.6
   attention:
     transformers>=4.42.4 (gemma+fa2)
@@ -90,9 +90,9 @@ def check_version(requirement: str, mandatory: bool = False) -> None:
 def check_dependencies() -> None:
     r"""Check the version of the required packages."""
     check_version("transformers>=4.41.2,<=4.51.1,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0")
-    check_version("datasets>=2.16.0,<=3.4.1")
-    check_version("accelerate>=0.34.0,<=1.5.2")
-    check_version("peft>=0.14.0,<=0.15.0")
+    check_version("datasets>=2.16.0,<=3.5.0")
+    check_version("accelerate>=0.34.0,<=1.6.0")
+    check_version("peft>=0.14.0,<=0.15.1")
     check_version("trl>=0.8.6,<=0.9.6")
     if is_transformers_version_greater_than("4.46.0") and not is_transformers_version_greater_than("4.48.1"):
         logger.warning_rank0_once("There are known bugs in transformers v4.46.0-v4.48.0, please use other versions.")
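The guarded warning at the end of that hunk effectively targets installed transformers versions between 4.46.0 and 4.48.0. A rough standalone equivalent, assuming `is_transformers_version_greater_than(x)` means "installed version >= x", is:

```python
# Rough standalone equivalent of the warning condition above (an assumption,
# not the repo's own helper).
from importlib.metadata import version
from packaging.version import Version

v = Version(version("transformers"))
if Version("4.46.0") <= v < Version("4.48.1"):
    print("There are known bugs in transformers v4.46.0-v4.48.0, please use other versions.")
```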
@@ -111,7 +111,7 @@ def create_export_tab(engine: "Engine") -> dict[str, "Component"]:
 
     with gr.Row():
         export_size = gr.Slider(minimum=1, maximum=100, value=5, step=1)
         export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none")
-        export_quantization_dataset = gr.Textbox(value="data/c4_demo.json")
+        export_quantization_dataset = gr.Textbox(value="data/c4_demo.jsonl")
         export_device = gr.Radio(choices=["cpu", "auto"], value="cpu")
         export_legacy_format = gr.Checkbox()
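For reference, the snippet below is a self-contained sketch of a similar export row. The component arguments follow the hunk above; the labels, the GPTQ_BITS literal, and the echo callback are assumptions added so it runs on its own.

```python
# Self-contained sketch of a similar export row. Labels, the GPTQ_BITS literal,
# and the echo callback are assumptions added so the snippet runs on its own.
import gradio as gr

GPTQ_BITS = ["8", "4", "3", "2"]  # assumed bit-width choices

def summarize(size, bits, dataset, device, legacy):
    return f"size={size}GB, bits={bits}, dataset={dataset}, device={device}, legacy={legacy}"

with gr.Blocks() as demo:
    with gr.Row():
        export_size = gr.Slider(minimum=1, maximum=100, value=5, step=1, label="Max shard size (GB)")
        export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none", label="Quantization bit")
        export_quantization_dataset = gr.Textbox(value="data/c4_demo.jsonl", label="Quantization dataset")
        export_device = gr.Radio(choices=["cpu", "auto"], value="cpu", label="Export device")
        export_legacy_format = gr.Checkbox(label="Legacy format")
    summary = gr.Textbox(label="Summary")
    gr.Button("Preview").click(
        summarize,
        [export_size, export_quantization_bit, export_quantization_dataset, export_device, export_legacy_format],
        summary,
    )

if __name__ == "__main__":
    demo.launch()
```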