[data] add r1 distill dataset (#6983)

Former-commit-id: 2591a3fa8b37fed8011fb66b266ef15e18404756
This commit is contained in:
hoshi-hiyouga 2025-02-18 17:25:09 +08:00 committed by GitHub
parent 3fbd4848e8
commit beb1a9f9d9
3 changed files with 30 additions and 8 deletions

View File

@ -348,7 +348,9 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
- [Magpie-ultra-v0.1 (en)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1)
- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
- [OpenO1-SFT (en&zh)](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)
- [Open Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)
- [Open-Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)
- [Open-R1-Math (en)](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k)
- [Chinese-DeepSeek-R1-Distill (zh)](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)
- [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
- [Pokemon-gpt4o-captions (en&zh)](https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions)
- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)

View File

@ -350,7 +350,9 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
- [Magpie-ultra-v0.1 (en)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1)
- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
- [OpenO1-SFT (en&zh)](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)
- [Open Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)
- [Open-Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)
- [Open-R1-Math (en)](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k)
- [Chinese-DeepSeek-R1-Distill (zh)](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)
- [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
- [Pokemon-gpt4o-captions (en&zh)](https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions)
- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)

View File

@ -319,19 +319,37 @@
}
},
"open_thoughts": {
"hf_hub_url": "open-thoughts/OpenThoughts-114k",
"hf_hub_url": "llamafactory/OpenThoughts-114k",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"system": "system"
"messages": "messages"
},
"tags": {
"role_tag": "from",
"content_tag": "value",
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
"assistant_tag": "assistant",
"system_tag": "system"
}
},
"open_r1_math": {
"hf_hub_url": "llamafactory/OpenR1-Math-94k",
"formatting": "sharegpt",
"columns": {
"messages": "messages"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant",
"system_tag": "system"
}
},
"chinese_r1_distill": {
"hf_hub_url": "Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT",
"ms_hub_url": "liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT"
},
"llava_1k_en": {
"hf_hub_url": "BUAADreamer/llava-en-zh-2k",
"subset": "en",