From ed252565f9d7e36e1b9a9a7534c547616c5231a1 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Wed, 26 Jul 2023 17:05:12 +0800 Subject: [PATCH] update dataset Former-commit-id: 4a044aabbd19c92a9ae93c1c30536f5086fd47f9 --- README.md | 1 + README_zh.md | 1 + data/dataset_info.json | 14 ++++++++++++-- data/refgpt_zh_50k_p1.json.REMOVED.git-id | 2 +- data/refgpt_zh_50k_p2.json.REMOVED.git-id | 2 +- 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 675b0c7d..ec62a5a6 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) + - [LIMA (en)](https://huggingface.co/datasets/GAIR/lima) - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k) - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT) - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa) diff --git a/README_zh.md b/README_zh.md index bb9236a4..abf674bc 100644 --- a/README_zh.md +++ b/README_zh.md @@ -83,6 +83,7 @@ - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) + - [LIMA (en)](https://huggingface.co/datasets/GAIR/lima) - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k) - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT) - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa) diff --git a/data/dataset_info.json b/data/dataset_info.json index f16d1d85..3a3b4e76 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -51,7 +51,7 @@ }, "refgpt_zh_p1": { "file_name": "refgpt_zh_50k_p1.json", - "file_sha1": "995043a909eed6693f850a96fccb4d3803f3ea5e", + "file_sha1": "b40f4f4d0ffacd16da7c275b056d5b6670021752", "columns": { "prompt": "instruction", "query": "input", @@ -61,7 +61,17 @@ }, "refgpt_zh_p2": { "file_name": "refgpt_zh_50k_p2.json", - "file_sha1": "d9442d5c4541fe5489b5b571871fbe7595ee3809", + "file_sha1": "181f32b2c60264a29f81f59d3c76095793eae1b0", + "columns": { + "prompt": "instruction", + "query": "input", + "response": "output", + "history": "history" + } + }, + "lima": { + "file_name": "lima.json", + "file_sha1": "9db59f6b7007dc4b17529fc63379b9cd61640f37", "columns": { "prompt": "instruction", "query": "input", diff --git a/data/refgpt_zh_50k_p1.json.REMOVED.git-id b/data/refgpt_zh_50k_p1.json.REMOVED.git-id index 9acdf2c3..3e8a9e41 100644 --- a/data/refgpt_zh_50k_p1.json.REMOVED.git-id +++ b/data/refgpt_zh_50k_p1.json.REMOVED.git-id @@ -1 +1 @@ -56405bb8f52727e52e99693739494b9b7b0d7ba6 \ No newline at end of file +f967a4f6d04a11308a15524aa9a846a19a8d1e83 \ No newline at end of file diff --git a/data/refgpt_zh_50k_p2.json.REMOVED.git-id b/data/refgpt_zh_50k_p2.json.REMOVED.git-id index c7919c81..a6525b27 100644 --- a/data/refgpt_zh_50k_p2.json.REMOVED.git-id +++ b/data/refgpt_zh_50k_p2.json.REMOVED.git-id @@ -1 +1 @@ -fa935248a5d40d2bdd5649af99a72a754d40ae7a \ No newline at end of file +0a4f0d74fd1c5cab2eb6d84a3a3fe669847becd8 \ No newline at end of file