From 9802398c7198146b2aa327dcf3452ebeb0ab067a Mon Sep 17 00:00:00 2001 From: hiyouga Date: Sun, 23 Jul 2023 20:01:43 +0800 Subject: [PATCH] update dataset Former-commit-id: 4fc2c3293d91d8464527ebd1ddabe572c8355616 --- README.md | 4 +++ README_zh.md | 4 +++ data/dataset_info.json | 40 ++++++++++++++++++++++++++++-- data/oaast_sft.json.REMOVED.git-id | 2 +- 4 files changed, 47 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index efc7b7ff..b19b3143 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,10 @@ - For pre-training: - [Wiki Demo (en)](data/wiki_demo.txt) + - [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) + - [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata) + - [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220) + - [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered) - For supervised fine-tuning: - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca) - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca) diff --git a/README_zh.md b/README_zh.md index 1699dd86..73b50e95 100644 --- a/README_zh.md +++ b/README_zh.md @@ -63,6 +63,10 @@ - 用于二次预训练: - [Wiki Demo (en)](data/wiki_demo.txt) + - [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) + - [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata) + - [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220) + - [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered) - 用于指令监督微调: - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca) - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca) diff --git a/data/dataset_info.json b/data/dataset_info.json index 73a2f72b..f16d1d85 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -21,7 +21,7 @@ }, "oaast_sft": { "file_name": "oaast_sft.json", - "file_sha1": "08912e34fb165db137d3436db4c35321e33b28d1", + "file_sha1": "7baf5d43e67a91f9bbdf4e400dbe033b87e9757e", "columns": { "prompt": "instruction", "query": "input", @@ -31,7 +31,7 @@ }, "oaast_sft_zh": { "file_name": "oaast_sft_zh.json", - "file_sha1": "e0a2e7e8eff355434ada6c9b7f70bb915f941dd4", + "file_sha1": "a6a91f18f80f37b10ded9cf633fb50c033bf7b9f", "columns": { "prompt": "instruction", "query": "input", @@ -187,5 +187,41 @@ "response": "", "history": "" } + }, + "refinedweb": { + "hf_hub_url": "tiiuae/falcon-refinedweb", + "columns": { + "prompt": "content", + "query": "", + "response": "", + "history": "" + } + }, + "starcoder": { + "hf_hub_url": "bigcode/starcoderdata", + "columns": { + "prompt": "content", + "query": "", + "response": "", + "history": "" + } + }, + "wikipedia_en": { + "hf_hub_url": "olm/olm-wikipedia-20221220", + "columns": { + "prompt": "text", + "query": "", + "response": "", + "history": "" + } + }, + "wikipedia_zh": { + "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered", + "columns": { + "prompt": "completion", + "query": "", + "response": "", + "history": "" + } } } diff --git a/data/oaast_sft.json.REMOVED.git-id b/data/oaast_sft.json.REMOVED.git-id index 5bac2e5b..fd29e313 100644 --- a/data/oaast_sft.json.REMOVED.git-id +++ b/data/oaast_sft.json.REMOVED.git-id @@ -1 +1 @@ -0a57fbc1d8cb08a8cd71c5eb8425cf59206ffed6 \ No newline at end of file +57fd080be5bffe4153fe3ee26a175e3d56da30f3 \ No newline at end of file