From b50ca5cafaa907c43b059383657c929a937e366d Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Tue, 18 Feb 2025 17:25:09 +0800
Subject: [PATCH] [data] add r1 distill dataset (#6983)

Former-commit-id: 1da5ee4edaa3896593b9cae488f0ac5917c3243e
---
 README.md              |  4 +++-
 README_zh.md           |  4 +++-
 data/dataset_info.json | 30 ++++++++++++++++++++++++------
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 6dfcf186..c4cc8a2c 100644
--- a/README.md
+++ b/README.md
@@ -348,7 +348,9 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 - [Magpie-ultra-v0.1 (en)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1)
 - [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
 - [OpenO1-SFT (en&zh)](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)
-- [Open Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)
+- [Open-Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)
+- [Open-R1-Math (en)](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k)
+- [Chinese-DeepSeek-R1-Distill (zh)](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)
 - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
 - [Pokemon-gpt4o-captions (en&zh)](https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
diff --git a/README_zh.md b/README_zh.md
index 06c58ffb..17ba8789 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -350,7 +350,9 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
 - [Magpie-ultra-v0.1 (en)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1)
 - [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
 - [OpenO1-SFT (en&zh)](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)
-- [Open Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)
+- [Open-Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)
+- [Open-R1-Math (en)](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k)
+- [Chinese-DeepSeek-R1-Distill (zh)](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)
 - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
 - [Pokemon-gpt4o-captions (en&zh)](https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
diff --git a/data/dataset_info.json b/data/dataset_info.json
index 627eb355..7aaee14f 100644
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -319,19 +319,37 @@
     }
   },
   "open_thoughts": {
-    "hf_hub_url": "open-thoughts/OpenThoughts-114k",
+    "hf_hub_url": "llamafactory/OpenThoughts-114k",
     "formatting": "sharegpt",
     "columns": {
-      "messages": "conversations",
-      "system": "system"
+      "messages": "messages"
     },
     "tags": {
-      "role_tag": "from",
-      "content_tag": "value",
+      "role_tag": "role",
+      "content_tag": "content",
       "user_tag": "user",
-      "assistant_tag": "assistant"
+      "assistant_tag": "assistant",
+      "system_tag": "system"
     }
   },
+  "open_r1_math": {
+    "hf_hub_url": "llamafactory/OpenR1-Math-94k",
+    "formatting": "sharegpt",
+    "columns": {
+      "messages": "messages"
+    },
+    "tags": {
+      "role_tag": "role",
+      "content_tag": "content",
+      "user_tag": "user",
+      "assistant_tag": "assistant",
+      "system_tag": "system"
+    }
+  },
+  "chinese_r1_distill": {
+    "hf_hub_url": "Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT",
+    "ms_hub_url": "liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT"
+  },
   "llava_1k_en": {
     "hf_hub_url": "BUAADreamer/llava-en-zh-2k",
     "subset": "en",