update dataset

This commit is contained in:
hiyouga
2023-07-23 20:01:43 +08:00
parent 6a2967ff7a
commit 182b425043
5 changed files with 1141 additions and 9506 deletions

View File

@@ -63,6 +63,10 @@
- For pre-training:
- [Wiki Demo (en)](data/wiki_demo.txt)
- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
- For supervised fine-tuning:
- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)

View File

@@ -63,6 +63,10 @@
- 用于二次预训练:
- [Wiki Demo (en)](data/wiki_demo.txt)
- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
- 用于指令监督微调:
- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)

View File

@@ -21,7 +21,7 @@
}, },
"oaast_sft": { "oaast_sft": {
"file_name": "oaast_sft.json", "file_name": "oaast_sft.json",
"file_sha1": "08912e34fb165db137d3436db4c35321e33b28d1", "file_sha1": "7baf5d43e67a91f9bbdf4e400dbe033b87e9757e",
"columns": { "columns": {
"prompt": "instruction", "prompt": "instruction",
"query": "input", "query": "input",
@@ -31,7 +31,7 @@
}, },
"oaast_sft_zh": { "oaast_sft_zh": {
"file_name": "oaast_sft_zh.json", "file_name": "oaast_sft_zh.json",
"file_sha1": "e0a2e7e8eff355434ada6c9b7f70bb915f941dd4", "file_sha1": "a6a91f18f80f37b10ded9cf633fb50c033bf7b9f",
"columns": { "columns": {
"prompt": "instruction", "prompt": "instruction",
"query": "input", "query": "input",
@@ -187,5 +187,41 @@
"response": "", "response": "",
"history": "" "history": ""
} }
},
"refinedweb": {
"hf_hub_url": "tiiuae/falcon-refinedweb",
"columns": {
"prompt": "content",
"query": "",
"response": "",
"history": ""
}
},
"starcoder": {
"hf_hub_url": "bigcode/starcoderdata",
"columns": {
"prompt": "content",
"query": "",
"response": "",
"history": ""
}
},
"wikipedia_en": {
"hf_hub_url": "olm/olm-wikipedia-20221220",
"columns": {
"prompt": "text",
"query": "",
"response": "",
"history": ""
}
},
"wikipedia_zh": {
"hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
"columns": {
"prompt": "completion",
"query": "",
"response": "",
"history": ""
}
} }
} }

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long