mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-12-14 10:56:56 +08:00
53
data/README.md
Normal file
53
data/README.md
Normal file
@@ -0,0 +1,53 @@
|
||||
Data format in `dataset_info.json`:
|
||||
```json
|
||||
"dataset_name": {
|
||||
"hf_hub_url": "the name of the dataset repository on the HuggingFace hub. (if specified, the 3 arguments below are ignored)",
|
||||
"script_url": "the name of the directory containing a dataset loading script. (if specified, the 2 arguments below are ignored)",
|
||||
"file_name": "the name of the dataset file in this directory. (required if the above are not specified)",
|
||||
"file_sha1": "the SHA-1 hash value of the dataset file. (optional)",
|
||||
"columns": {
|
||||
"prompt": "the name of the column in the datasets containing the prompts. (default: instruction)",
|
||||
"query": "the name of the column in the datasets containing the queries. (default: input)",
|
||||
"response": "the name of the column in the datasets containing the responses. (default: output)",
|
||||
"history": "the name of the column in the datasets containing the history of chat. (default: None)"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`dataset_info.json` 中的数据集定义格式:
|
||||
```json
|
||||
"数据集名称": {
|
||||
"hf_hub_url": "HuggingFace上的项目地址(若指定,则忽略下列三个参数)",
|
||||
"script_url": "包含数据加载脚本的本地文件夹名称(若指定,则忽略下列两个参数)",
|
||||
"file_name": "该目录下数据集文件的名称(若上述参数未指定,则此项必需)",
|
||||
"file_sha1": "数据集文件的SHA-1哈希值(可选)",
|
||||
"columns": {
|
||||
"prompt": "数据集代表提示词的表头名称(默认:instruction)",
|
||||
"query": "数据集代表请求的表头名称(默认:input)",
|
||||
"response": "数据集代表回答的表头名称(默认:output)",
|
||||
"history": "数据集代表历史对话的表头名称(默认:None)"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
部分预置数据集简介:
|
||||
|
||||
| 数据集名称 | 规模 | 描述 |
|
||||
| --- | --- | --- |
|
||||
| [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) | 52k | 斯坦福大学开源的 Alpaca 数据集,训练了 Alpaca 这类早期基于 LLaMA 的模型 |
|
||||
| [Stanford Alpaca (Chinese)](https://github.com/ymcui/Chinese-LLaMA-Alpaca) | 51k | 使用 ChatGPT 翻译的 Alpaca 数据集 |
|
||||
| [GPT-4 Generated Data](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) | 100k+ | 基于 GPT-4 的 self-instruction 数据集 |
|
||||
| [BELLE 2M](https://huggingface.co/datasets/BelleGroup/train_2M_CN) | 2M | 包含约 200 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的中文指令数据 |
|
||||
| [BELLE 1M](https://huggingface.co/datasets/BelleGroup/train_1M_CN) | 1M | 包含约 100 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的中文指令数据 |
|
||||
| [BELLE 0.5M](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN) | 500k | 包含约 50 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的中文指令数据 |
|
||||
| [BELLE Dialogue 0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M) | 400k | 包含约 40 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的个性化角色对话数据,包含角色介绍 |
|
||||
| [BELLE School Math 0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) | 250k | 包含约 25 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的中文数学题数据,包含解题过程 |
|
||||
| [BELLE Multiturn Chat 0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) | 800k | 包含约 80 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的用户与助手的多轮对话 |
|
||||
| [Guanaco Dataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset) | 100k+ | 包含日文、简繁体中文、英文等多类数据,数据集原用于 Guanaco 模型训练 |
|
||||
| [Firefly 1.1M](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) | 1.1M | 中文对话大模型 firefly(流萤)的中文数据集,包含多个 NLP 任务 |
|
||||
| [CodeAlpaca 20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k) | 20k | 英文代码生成任务数据集 |
|
||||
| [Alpaca CoT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT) | 6M | 用于微调的指令数据集集合 |
|
||||
| [Web QA](https://huggingface.co/datasets/suolyer/webqa) | 36k | 百度知道汇集的中文问答数据集 |
|
||||
| [UltraChat](https://github.com/thunlp/UltraChat) | 1.57M | 清华 NLP 发布的大规模多轮对话数据集 |
|
||||
|
||||
注:BELLE 数据集是由 ChatGPT 产生的数据集,不保证数据准确性,所有类 GPT 模型产生的 self-instruction 数据集均不能保证其准确性。
|
||||
1
data/alpaca_data_en_52k.json.REMOVED.git-id
Normal file
1
data/alpaca_data_en_52k.json.REMOVED.git-id
Normal file
@@ -0,0 +1 @@
|
||||
3779ddbc040543ab1834ef216c983d6fcc06cc9a
|
||||
1
data/alpaca_data_zh_51k.json.REMOVED.git-id
Normal file
1
data/alpaca_data_zh_51k.json.REMOVED.git-id
Normal file
@@ -0,0 +1 @@
|
||||
fc9a6a3458caca2af8dafc6181773fe10c6d8657
|
||||
1
data/alpaca_gpt4_data_en.json.REMOVED.git-id
Normal file
1
data/alpaca_gpt4_data_en.json.REMOVED.git-id
Normal file
@@ -0,0 +1 @@
|
||||
25508714b7879a1e5a6764ba7f979a980f549f1a
|
||||
1
data/alpaca_gpt4_data_zh.json.REMOVED.git-id
Normal file
1
data/alpaca_gpt4_data_zh.json.REMOVED.git-id
Normal file
@@ -0,0 +1 @@
|
||||
7cb6a7d11455bddc3d495750a2392683d775b184
|
||||
1
data/comparison_gpt4_data_en.json.REMOVED.git-id
Normal file
1
data/comparison_gpt4_data_en.json.REMOVED.git-id
Normal file
@@ -0,0 +1 @@
|
||||
f437d58b7791609ee91f064551c5c5734a0fd97a
|
||||
1
data/comparison_gpt4_data_zh.json.REMOVED.git-id
Normal file
1
data/comparison_gpt4_data_zh.json.REMOVED.git-id
Normal file
@@ -0,0 +1 @@
|
||||
0e346cf70e633456c7e83f68765361016005447a
|
||||
97
data/dataset_info.json
Normal file
97
data/dataset_info.json
Normal file
@@ -0,0 +1,97 @@
|
||||
{
|
||||
"alpaca_en": {
|
||||
"hf_hub_url": "tatsu-lab/alpaca"
|
||||
},
|
||||
"alpaca_zh": {
|
||||
"file_name": "alpaca_data_zh_51k.json",
|
||||
"file_sha1": "e655af3db557a4197f7b0cf92e1986b08fae6311"
|
||||
},
|
||||
"alpaca_gpt4_en": {
|
||||
"file_name": "alpaca_gpt4_data_en.json",
|
||||
"file_sha1": "647f4ad447bd993e4b6b6223d1be15208bab694a"
|
||||
},
|
||||
"alpaca_gpt4_zh": {
|
||||
"file_name": "alpaca_gpt4_data_zh.json",
|
||||
"file_sha1": "3eaa3bda364ccdd59925d7448a698256c31ef845"
|
||||
},
|
||||
"belle_0.5m": {
|
||||
"hf_hub_url": "BelleGroup/train_0.5M_CN"
|
||||
},
|
||||
"belle_1m": {
|
||||
"hf_hub_url": "BelleGroup/train_1M_CN"
|
||||
},
|
||||
"belle_2m": {
|
||||
"hf_hub_url": "BelleGroup/train_2M_CN"
|
||||
},
|
||||
"belle_dialog": {
|
||||
"hf_hub_url": "BelleGroup/generated_chat_0.4M"
|
||||
},
|
||||
"belle_math": {
|
||||
"hf_hub_url": "BelleGroup/school_math_0.25M"
|
||||
},
|
||||
"belle_multiturn": {
|
||||
"hf_hub_url": "BelleGroup/multiturn_chat_0.8M"
|
||||
},
|
||||
"guanaco": {
|
||||
"hf_hub_url": "JosephusCheung/GuanacoDataset"
|
||||
},
|
||||
"firefly": {
|
||||
"hf_hub_url": "YeungNLP/firefly-train-1.1M",
|
||||
"columns": {
|
||||
"prompt": "input",
|
||||
"query": "",
|
||||
"response": "target",
|
||||
"history": ""
|
||||
}
|
||||
},
|
||||
"codealpaca": {
|
||||
"hf_hub_url": "sahil2801/CodeAlpaca-20k"
|
||||
},
|
||||
"alpaca_cot": {
|
||||
"hf_hub_url": "QingyiSi/Alpaca-CoT"
|
||||
},
|
||||
"webqa": {
|
||||
"hf_hub_url": "suolyer/webqa",
|
||||
"columns": {
|
||||
"prompt": "input",
|
||||
"query": "",
|
||||
"response": "output",
|
||||
"history": ""
|
||||
}
|
||||
},
|
||||
"ultra_chat": {
|
||||
"script_url": "ultra_chat",
|
||||
"columns": {
|
||||
"prompt": "instruction",
|
||||
"query": "",
|
||||
"response": "output",
|
||||
"history": "history"
|
||||
}
|
||||
},
|
||||
"example": {
|
||||
"script_url": "example_dataset",
|
||||
"columns": {
|
||||
"prompt": "instruction",
|
||||
"query": "input",
|
||||
"response": "output",
|
||||
"history": "history"
|
||||
}
|
||||
},
|
||||
"comparison_gpt4_en": {
|
||||
"file_name": "comparison_gpt4_data_en.json",
|
||||
"file_sha1": "eeb295ce0ab011c37af52596460c8a57d07ad19f"
|
||||
},
|
||||
"comparison_gpt4_zh": {
|
||||
"file_name": "comparison_gpt4_data_zh.json",
|
||||
"file_sha1": "b99a41c1c864019d9b0c07dbcd5df0560cf33ce0"
|
||||
},
|
||||
"hh_rlhf_en": {
|
||||
"script_url": "hh_rlhf_en",
|
||||
"columns": {
|
||||
"prompt": "instruction",
|
||||
"query": "",
|
||||
"response": "output",
|
||||
"history": "history"
|
||||
}
|
||||
}
|
||||
}
|
||||
46
data/example_dataset/example_dataset.py
Normal file
46
data/example_dataset/example_dataset.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import json
|
||||
import datasets
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
_DESCRIPTION = "An example of dataset for LLaMA."
|
||||
_CITATION = ""
|
||||
_HOMEPAGE = ""
|
||||
_LICENSE = ""
|
||||
_URL = "examples.json"
|
||||
|
||||
|
||||
class ExampleDataset(datasets.GeneratorBasedBuilder):
    """Builder for the example dataset stored in the JSON file named by ``_URL``.

    Every record exposes four columns: ``instruction``, ``input`` and
    ``output`` (strings) plus ``history`` (a sequence of string sequences).
    Only a ``train`` split is produced.
    """

    VERSION = datasets.Version("0.0.0")

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata together with its column schema."""
        features = datasets.Features({
            "instruction": datasets.Value("string"),
            "input": datasets.Value("string"),
            "output": datasets.Value("string"),
            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Fetch the data file and declare the single ``train`` split.

        Args:
            dl_manager: download manager used to resolve ``_URL``.

        Returns:
            A one-element list holding the ``train`` split generator, whose
            ``filepath`` keyword is forwarded to ``_generate_examples``.
        """
        file_path = dl_manager.download(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": file_path
                }
            )
        ]

    def _generate_examples(self, filepath: str) -> Dict[int, Dict[str, Any]]:
        """Yield ``(key, example)`` pairs from the JSON file at ``filepath``.

        The file is parsed as a single JSON document and iterated; the key is
        the zero-based position of the example in that document.

        Args:
            filepath: path of the UTF-8 encoded JSON file to read.
        """
        # Load inside a context manager so the file handle is closed as soon
        # as parsing finishes instead of being left to the garbage collector.
        with open(filepath, "r", encoding="utf-8") as f:
            example_dataset = json.load(f)

        for key, example in enumerate(example_dataset):
            yield key, example
|
||||
20
data/example_dataset/examples.json
Normal file
20
data/example_dataset/examples.json
Normal file
@@ -0,0 +1,20 @@
|
||||
[
|
||||
{
|
||||
"instruction": "听起来很不错。人工智能可能在哪些方面面临挑战呢?",
|
||||
"input": "",
|
||||
"output": "人工智能面临的挑战包括数据隐私、安全和道德方面的问题,以及影响就业机会的自动化等问题。",
|
||||
"history": [
|
||||
["你好,你能帮我解答一个问题吗?", "当然,请问有什么问题?"],
|
||||
["我想了解人工智能的未来发展方向,你有什么想法吗?", "人工智能在未来的发展方向可能包括更强大的机器学习算法,更先进的自然语言处理技术,以及更加智能的机器人。"]
|
||||
]
|
||||
},
|
||||
{
|
||||
"instruction": "好的,谢谢你!",
|
||||
"input": "",
|
||||
"output": "不客气,有其他需要帮忙的地方可以继续问我。",
|
||||
"history": [
|
||||
["你好,能告诉我今天天气怎么样吗?", "当然可以,请问您所在的城市是哪里?"],
|
||||
["我在纽约。", "纽约今天晴间多云,气温最高约26摄氏度,最低约18摄氏度,记得注意保暖喔。"]
|
||||
]
|
||||
}
|
||||
]
|
||||
97
data/hh_rlhf_en/hh_rlhf_en.py
Normal file
97
data/hh_rlhf_en/hh_rlhf_en.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import json
|
||||
import datasets
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
_DESCRIPTION = "Human preference data about helpfulness and harmlessness for ChatGLM."
|
||||
_CITATION = ""
|
||||
_HOMEPAGE = "https://huggingface.co/datasets/Anthropic/hh-rlhf"
|
||||
_LICENSE = "mit"
|
||||
_URL = "https://huggingface.co/datasets/Anthropic/hh-rlhf/resolve/main/"
|
||||
_URLS = {
|
||||
"train": [
|
||||
_URL + "harmless-base/train.jsonl.gz",
|
||||
_URL + "helpful-base/train.jsonl.gz",
|
||||
_URL + "helpful-online/train.jsonl.gz",
|
||||
_URL + "helpful-rejection-sampled/train.jsonl.gz"
|
||||
],
|
||||
"test": [
|
||||
_URL + "harmless-base/test.jsonl.gz",
|
||||
_URL + "helpful-base/test.jsonl.gz",
|
||||
_URL + "helpful-online/test.jsonl.gz",
|
||||
_URL + "helpful-rejection-sampled/test.jsonl.gz"
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
class HhRlhfEn(datasets.GeneratorBasedBuilder):
    """Builder that turns the shards listed in ``_URLS`` into preference examples.

    Each source row holds a ``chosen`` and a ``rejected`` transcript. A row is
    converted into the last human query (``instruction``), the pair of final
    assistant replies ``[accepted, rejected]`` (``output``) and the earlier
    (query, reply) turns of the chosen transcript (``history``).
    """

    VERSION = datasets.Version("0.0.0")

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata together with its column schema."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "instruction": datasets.Value("string"),
                "output": datasets.Sequence(datasets.Value("string")),
                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
            }),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download and extract every shard, then declare the train and test splits."""
        extracted = dl_manager.download_and_extract(_URLS)
        split_keys = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
        )
        return [
            datasets.SplitGenerator(name=split, gen_kwargs={"filepaths": extracted[url_key]})
            for split, url_key in split_keys
        ]

    def _generate_examples(self, filepaths: List[str]) -> Dict[int, Dict[str, Any]]:
        """Yield ``(key, example)`` pairs of multi-turn chat data for ChatGLM.

        Keys are a running counter across all files, so they stay unique when
        several shards are concatenated.

        Args:
            filepaths: paths of the extracted JSON-lines files to read.
        """
        assistant_tag = "\n\nAssistant: "
        human_tag = "\n\nHuman: "
        assistant_len = len(assistant_tag)
        human_len = len(human_tag)

        key = 0
        for filepath in filepaths:
            with open(filepath, "r", encoding="utf-8") as f:
                for row in f:
                    record = json.loads(row)
                    chosen = record["chosen"]
                    rejected = record["rejected"]

                    # The text after the last assistant tag is the final reply.
                    rejected_start = rejected.rfind(assistant_tag) + assistant_len
                    r_reject = rejected[rejected_start:].strip()

                    last_assistant = chosen.rfind(assistant_tag)
                    r_accept = chosen[last_assistant + assistant_len:].strip()

                    # The last human turn sits between the final pair of tags.
                    last_human = chosen.rfind(human_tag)
                    query = chosen[last_human + human_len:last_assistant].strip()

                    # Peel earlier turns off the end of the remaining context;
                    # they are collected newest-first and reversed afterwards.
                    context = chosen[:last_human]
                    turns = []
                    while True:
                        assistant_pos = context.rfind(assistant_tag)
                        if assistant_pos == -1:
                            break
                        human_pos = context.rfind(human_tag)
                        if human_pos == -1:
                            break
                        turns.append((
                            context[human_pos + human_len:assistant_pos].strip(),
                            context[assistant_pos + assistant_len:].strip(),
                        ))
                        context = context[:human_pos]
                    turns.reverse()

                    yield key, {
                        "instruction": query,
                        "output": [r_accept, r_reject],
                        "history": turns
                    }
                    key += 1
|
||||
76
data/ultra_chat/ultra_chat.py
Normal file
76
data/ultra_chat/ultra_chat.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import json
|
||||
import datasets
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
|
||||
|
||||
_CITATION = """\
|
||||
@misc{UltraChat,
|
||||
author = {Ding, Ning and Chen, Yulin and Xu, Bokai and Hu, Shengding and Qin, Yujia and Liu, Zhiyuan and Sun, Maosong and Zhou, Bowen},
|
||||
title = {UltraChat: A Large-scale Auto-generated Multi-round Dialogue Data},
|
||||
year = {2023},
|
||||
publisher = {GitHub},
|
||||
journal = {GitHub repository},
|
||||
howpublished = {\\url{https://github.com/thunlp/ultrachat}},
|
||||
}
|
||||
"""
|
||||
|
||||
_HOMEPAGE = "https://huggingface.co/datasets/stingning/ultrachat"
|
||||
_LICENSE = "cc-by-nc-4.0"
|
||||
_BASE_DATA_URL = "https://huggingface.co/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
|
||||
|
||||
|
||||
class BelleMultiturn(datasets.GeneratorBasedBuilder):
    """Builder that converts the shards at ``_BASE_DATA_URL`` into chat examples.

    Each source row provides an ``id`` and a ``data`` list of alternating
    utterances. The last (query, response) pair becomes ``instruction`` and
    ``output``; all earlier pairs become ``history``.

    NOTE(review): the class name does not match the data this script loads;
    it is left unchanged in case something refers to it by name.
    """

    VERSION = datasets.Version("0.0.0")

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata together with its column schema."""
        features = datasets.Features({
            "instruction": datasets.Value("string"),
            "output": datasets.Value("string"),
            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download every shard and declare the single ``train`` split.

        Args:
            dl_manager: download manager used to fetch the shard files.

        Returns:
            A one-element list holding the ``train`` split generator, whose
            ``filepaths`` keyword is forwarded to ``_generate_examples``.
        """
        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(9)]  # multiple shards
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepaths": file_paths
                }
            )
        ]

    def _generate_examples(self, filepaths: List[str]) -> Dict[int, Dict[str, Any]]:
        """Yield ``(key, example)`` pairs of multi-turn chat data for ChatGLM.

        Rows that are not valid JSON are skipped, as are rows with fewer than
        two utterances once a trailing unpaired utterance has been dropped.

        Args:
            filepaths: paths of the downloaded JSON-lines shards to read.
        """
        for filepath in filepaths:
            with open(filepath, "r", encoding="utf-8") as f:
                for row in f:
                    try:
                        data = json.loads(row)
                    except json.JSONDecodeError:
                        # Skip malformed rows only; unrelated errors such as
                        # KeyboardInterrupt must still propagate.
                        continue

                    key = data["id"]
                    content = data["data"]

                    # Drop a trailing utterance that has no partner so the
                    # list splits cleanly into (query, response) pairs.
                    if len(content) % 2 == 1:
                        content.pop(-1)
                    if len(content) < 2:
                        continue

                    query = content[-2]
                    response = content[-1]

                    # Every pair before the final one is conversation history.
                    earlier = content[:-2]
                    history = [[q, r] for q, r in zip(earlier[::2], earlier[1::2])]

                    yield key, {
                        "instruction": query,
                        "output": response,
                        "history": history
                    }
|
||||
Reference in New Issue
Block a user