fix conflict

Former-commit-id: d956041640
This commit is contained in:
zhangzc
2024-05-20 17:10:01 +08:00
255 changed files with 55177 additions and 27327 deletions

View File

@@ -1,16 +1,17 @@
If you are using a custom dataset, please provide your dataset definition in the following format in `dataset_info.json`.
The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
Currently we support datasets in **alpaca** and **sharegpt** format.
```json
"dataset_name": {
"hf_hub_url": "the name of the dataset repository on the Hugging Face hub. (if specified, ignore script_url and file_name)",
"ms_hub_url": "the name of the dataset repository on the ModelScope hub. (if specified, ignore script_url and file_name)",
"ms_hub_url": "the name of the dataset repository on the ModelScope hub. (if specified, ignore script_url and file_name)",
"script_url": "the name of the directory containing a dataset loading script. (if specified, ignore file_name)",
"file_name": "the name of the dataset file in this directory. (required if above are not specified)",
"file_sha1": "the SHA-1 hash value of the dataset file. (optional, does not affect training)",
"file_name": "the name of the dataset folder or dataset file in this directory. (required if above are not specified)",
"formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
"ranking": "whether the dataset is a preference dataset or not. (default: False)",
"subset": "the name of the subset. (optional, default: None)",
"folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
"ranking": "whether the dataset is a preference dataset or not. (default: false)",
"formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
"columns (optional)": {
"prompt": "the column name in the dataset containing the prompts. (default: instruction)",
"query": "the column name in the dataset containing the queries. (default: input)",
@@ -18,7 +19,11 @@ If you are using a custom dataset, please provide your dataset definition in the
"history": "the column name in the dataset containing the histories. (default: None)",
"messages": "the column name in the dataset containing the messages. (default: conversations)",
"system": "the column name in the dataset containing the system prompts. (default: None)",
"tools": "the column name in the dataset containing the tool description. (default: None)"
"tools": "the column name in the dataset containing the tool description. (default: None)",
"images": "the column name in the dataset containing the image inputs. (default: None)",
"chosen": "the column name in the dataset containing the chosen answers. (default: None)",
"rejected": "the column name in the dataset containing the rejected answers. (default: None)",
"kto_tag": "the column name in the dataset containing the kto tags. (default: None)"
},
"tags (optional, used for the sharegpt format)": {
"role_tag": "the key in the message represents the identity. (default: from)",
@@ -33,29 +38,38 @@ If you are using a custom dataset, please provide your dataset definition in the
}
```
Given above, you can use the custom dataset via specifying `--dataset dataset_name`.
## Alpaca Format
Currently we support dataset in **alpaca** or **sharegpt** format, the dataset in alpaca format should follow the below format:
### Supervised Fine-Tuning Dataset
* [Example dataset](alpaca_en_demo.json)
In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the human prompt, then the human prompt would be `instruction\ninput`. The `output` column represents the model response.
The `system` column will be used as the system prompt if specified.
The `history` column is a list consisting of string tuples representing prompt-response pairs in the history messages. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning.
```json
[
{
"instruction": "user instruction (required)",
"input": "user input (optional)",
"instruction": "human instruction (required)",
"input": "human input (optional)",
"output": "model response (required)",
"system": "system prompt (optional)",
"history": [
["user instruction in the first round (optional)", "model response in the first round (optional)"],
["user instruction in the second round (optional)", "model response in the second round (optional)"]
["human instruction in the first round (optional)", "model response in the first round (optional)"],
["human instruction in the second round (optional)", "model response in the second round (optional)"]
]
}
]
```
Regarding the above dataset, the `columns` in `dataset_info.json` should be:
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
@@ -66,26 +80,135 @@ Regarding the above dataset, the `columns` in `dataset_info.json` should be:
}
```
The `query` column will be concatenated with the `prompt` column and used as the user prompt, then the user prompt would be `prompt\nquery`. The `response` column represents the model response.
### Pre-training Dataset
The `system` column will be used as the system prompt. The `history` column is a list consisting of string tuples representing prompt-response pairs in the history. Note that the responses in the history **will also be used for training**.
- [Example dataset](c4_demo.json)
For the pre-training datasets, only the `prompt` column will be used for training.
For the preference datasets, the `response` column should be a string list whose length is 2, with the preferred answers appearing first, for example:
In pre-training, only the `text` column will be used for model learning.
```json
{
"instruction": "user instruction",
"input": "user input",
"output": [
"chosen answer",
"rejected answer"
]
[
{"text": "document"},
{"text": "document"}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "text"
}
}
```
The dataset in sharegpt format should follow the below format:
### Preference Dataset
Preference datasets are used for reward modeling, DPO training and ORPO training.
They require a better response in the `chosen` column and a worse response in the `rejected` column.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"chosen": "chosen answer (required)",
"rejected": "rejected answer (required)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"ranking": true,
"columns": {
"prompt": "instruction",
"query": "input",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### KTO Dataset
- [Example dataset](kto_en_demo.json)
KTO datasets require an extra `kto_tag` column containing the boolean human feedback.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"output": "model response (required)",
"kto_tag": "human feedback [true/false] (required)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"kto_tag": "kto_tag"
}
}
```
### Multimodal Dataset
- [Example dataset](mllm_demo.json)
Multimodal datasets require an `images` column containing the paths to the input images. Currently we only support one image.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"output": "model response (required)",
"images": [
"image path (required)"
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"images": "images"
}
}
```
## Sharegpt Format
### Supervised Fine-Tuning Dataset
- [Example dataset](glaive_toolcall_en_demo.json)
Compared to the alpaca format, the sharegpt format allows datasets to have **more roles**, such as human, gpt, observation and function. They are presented in a list of objects in the `conversations` column.
Note that the human and observation should appear in odd positions, while gpt and function should appear in even positions.
```json
[
@@ -93,7 +216,15 @@ The dataset in sharegpt format should follow the below format:
"conversations": [
{
"from": "human",
"value": "user instruction"
"value": "human instruction"
},
{
"from": "function_call",
"value": "tool arguments"
},
{
"from": "observation",
"value": "tool result"
},
{
"from": "gpt",
@@ -106,24 +237,114 @@ The dataset in sharegpt format should follow the below format:
]
```
Regarding the above dataset, the `columns` in `dataset_info.json` should be:
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"system": "system",
"tools": "tools"
},
"tags": {
"role_tag": "from",
"content_tag": "value",
"user_tag": "human",
"assistant_tag": "gpt"
}
}
```
where the `messages` column should be a list following the `u/a/u/a/u/a` order.
### Preference Dataset
Pre-training datasets and preference datasets are not yet compatible with the sharegpt format.
- [Example dataset](dpo_en_demo.json)
Preference datasets in sharegpt format also require a better message in the `chosen` column and a worse message in the `rejected` column.
```json
[
{
"conversations": [
{
"from": "human",
"value": "human instruction"
},
{
"from": "gpt",
"value": "model response"
},
{
"from": "human",
"value": "human instruction"
}
],
"chosen": {
"from": "gpt",
"value": "chosen answer (required)"
},
"rejected": {
"from": "gpt",
"value": "rejected answer (required)"
}
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"ranking": true,
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### OpenAI Format
The openai format is simply a special case of the sharegpt format, where the first message may be a system prompt.
```json
[
{
"messages": [
{
"role": "system",
"content": "system prompt (optional)"
},
{
"role": "user",
"content": "human instruction"
},
{
"role": "assistant",
"content": "model response"
}
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant",
"system_tag": "system"
}
}
```
The KTO datasets and multimodal datasets in sharegpt format are similar to the alpaca format.
Pre-training datasets are **incompatible** with the sharegpt format.

View File

@@ -1,4 +1,6 @@
如果您使用自定义数据集,请务必在 `dataset_info.json` 文件中按照以下格式提供数据集定义。
[dataset_info.json](dataset_info.json) 包含了所有可用的数据集。如果您希望使用自定义数据集,请**务必**在 `dataset_info.json` 文件中添加*数据集描述*,并通过修改 `dataset: 数据集名称` 配置来使用数据集。
目前我们支持 **alpaca** 格式和 **sharegpt** 格式的数据集。
```json
"数据集名称": {
@@ -6,11 +8,10 @@
"ms_hub_url": "ModelScope 的数据集仓库地址(若指定,则忽略 script_url 和 file_name)",
"script_url": "包含数据加载脚本的本地文件夹名称(若指定,则忽略 file_name)",
"file_name": "该目录下数据集文件的名称(若上述参数未指定,则此项必需)",
"file_sha1": "数据集文件的 SHA-1 哈希值(可选,留空不影响训练)",
"formatting": "数据集格式(可选,默认:alpaca,可以为 alpaca 或 sharegpt)",
"ranking": "是否为偏好数据集(可选,默认:False)",
"subset": "数据集子集的名称(可选,默认:None)",
"folder": "Hugging Face 仓库的文件夹名称(可选,默认:None)",
"ranking": "是否为偏好数据集(可选,默认:False)",
"formatting": "数据集格式(可选,默认:alpaca,可以为 alpaca 或 sharegpt)",
"columns(可选)": {
"prompt": "数据集代表提示词的表头名称(默认:instruction)",
"query": "数据集代表请求的表头名称(默认:input)",
@@ -18,7 +19,11 @@
"history": "数据集代表历史对话的表头名称(默认:None)",
"messages": "数据集代表消息列表的表头名称(默认:conversations)",
"system": "数据集代表系统提示的表头名称(默认:None)",
"tools": "数据集代表工具描述的表头名称(默认:None)"
"tools": "数据集代表工具描述的表头名称(默认:None)",
"images": "数据集代表图像输入的表头名称(默认:None)",
"chosen": "数据集代表更优回答的表头名称(默认:None)",
"rejected": "数据集代表更差回答的表头名称(默认:None)",
"kto_tag": "数据集代表 KTO 标签的表头名称(默认:None)"
},
"tags(可选,用于 sharegpt 格式)": {
"role_tag": "消息中代表发送者身份的键名(默认:from)",
@@ -33,15 +38,23 @@
}
```
添加后可通过指定 `--dataset 数据集名称` 参数使用自定义数据集。
## Alpaca 格式
该项目目前支持两种格式的数据集:**alpaca** 和 **sharegpt**,其中 alpaca 格式的数据集按照以下方式组织:
### 指令监督微调数据集
- [样例数据集](alpaca_zh_demo.json)
在指令监督微调时,`instruction` 列对应的内容会与 `input` 列对应的内容拼接后作为人类指令,即人类指令为 `instruction\ninput`。而 `output` 列对应的内容为模型回答。
如果指定,`system` 列对应的内容将被作为系统提示词。
`history` 列是由多个字符串二元组构成的列表,分别代表历史消息中每轮对话的指令和回答。注意在指令监督微调时,历史消息中的回答内容**也会被用于模型学习**。
```json
[
{
"instruction": "用户指令(必填)",
"input": "用户输入(选填)",
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"output": "模型回答(必填)",
"system": "系统提示词(选填)",
"history": [
@@ -52,10 +65,11 @@
]
```
对于上述格式的数据,`dataset_info.json` 中的 `columns` 应为:
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
@@ -66,26 +80,135 @@
}
```
其中 `query` 列对应的内容会与 `prompt` 列对应的内容拼接后作为用户指令,即用户指令为 `prompt\nquery`。`response` 列对应的内容为模型回答。
### 预训练数据集
`system` 列对应的内容将被作为系统提示词。`history` 列是由多个字符串二元组构成的列表,分别代表历史消息中每轮的指令和回答。注意历史消息中的回答**也会被用于训练**。
- [样例数据集](c4_demo.json)
对于预训练数据集,仅 `prompt` 列中的内容会用于模型训练
对于偏好数据集,`response` 列应当是一个长度为 2 的字符串列表,排在前面的代表更优的回答,例如:
预训练时,只有 `text` 列中的内容会用于模型学习。
```json
{
"instruction": "用户指令",
"input": "用户输入",
"output": [
"优质回答",
"劣质回答"
]
[
{"text": "document"},
{"text": "document"}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"columns": {
"prompt": "text"
}
}
```
而 sharegpt 格式的数据集按照以下方式组织:
### 偏好数据集
偏好数据集用于奖励模型训练、DPO 训练和 ORPO 训练。
它需要在 `chosen` 列中提供更优的回答,并在 `rejected` 列中提供更差的回答。
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"chosen": "优质回答(必填)",
"rejected": "劣质回答(必填)"
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"ranking": true,
"columns": {
"prompt": "instruction",
"query": "input",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### KTO 数据集
- [样例数据集](kto_en_demo.json)
KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人类反馈。
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"output": "模型回答(必填)",
"kto_tag": "人类反馈 [true/false](必填)"
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"kto_tag": "kto_tag"
}
}
```
### 多模态数据集
- [样例数据集](mllm_demo.json)
多模态数据集需要额外添加一个 `images` 列,包含输入图像的路径。目前我们仅支持单张图像输入。
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"output": "模型回答(必填)",
"images": [
"图像路径(必填)"
]
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"images": "images"
}
}
```
## Sharegpt 格式
### 指令监督微调数据集
- [样例数据集](glaive_toolcall_zh_demo.json)
相比 alpaca 格式的数据集,sharegpt 格式支持**更多的角色种类**,例如 human、gpt、observation、function 等等。它们构成一个对象列表呈现在 `conversations` 列中。
注意其中 human 和 observation 必须出现在奇数位置,gpt 和 function 必须出现在偶数位置。
```json
[
@@ -93,7 +216,15 @@
"conversations": [
{
"from": "human",
"value": "用户指令"
"value": "人类指令"
},
{
"from": "function_call",
"value": "工具参数"
},
{
"from": "observation",
"value": "工具结果"
},
{
"from": "gpt",
@@ -106,24 +237,114 @@
]
```
对于上述格式的数据,`dataset_info.json` 中的 `columns` 应为:
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"system": "system",
"tools": "tools"
},
"tags": {
"role_tag": "from",
"content_tag": "value",
"user_tag": "human",
"assistant_tag": "gpt"
}
}
```
其中 `messages` 列应当是一个列表,且符合 `用户/模型/用户/模型/用户/模型` 的顺序。
### 偏好数据集
预训练数据集和偏好数据集尚不支持 sharegpt 格式。
- [样例数据集](dpo_zh_demo.json)
Sharegpt 格式的偏好数据集同样需要在 `chosen` 列中提供更优的消息,并在 `rejected` 列中提供更差的消息。
```json
[
{
"conversations": [
{
"from": "human",
"value": "人类指令"
},
{
"from": "gpt",
"value": "模型回答"
},
{
"from": "human",
"value": "人类指令"
}
],
"chosen": {
"from": "gpt",
"value": "优质回答"
},
"rejected": {
"from": "gpt",
"value": "劣质回答"
}
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"formatting": "sharegpt",
"ranking": true,
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### OpenAI 格式
OpenAI 格式仅仅是 sharegpt 格式的一种特殊情况,其中第一条消息可能是系统提示词。
```json
[
{
"messages": [
{
"role": "system",
"content": "系统提示词(选填)"
},
{
"role": "user",
"content": "人类指令"
},
{
"role": "assistant",
"content": "模型回答"
}
]
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant",
"system_tag": "system"
}
}
```
Sharegpt 格式中的 KTO 数据集和多模态数据集与 alpaca 格式的类似。
预训练数据集**不支持** sharegpt 格式。

View File

@@ -1 +0,0 @@
3779ddbc040543ab1834ef216c983d6fcc06cc9a

View File

@@ -1 +0,0 @@
34c723573fbc2d7601f6d9c882ccf5aa4f9bcc4b

5002
data/alpaca_en_demo.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1 +0,0 @@
25508714b7879a1e5a6764ba7f979a980f549f1a

View File

@@ -1 +0,0 @@
7cb6a7d11455bddc3d495750a2392683d775b184

5002
data/alpaca_zh_demo.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,6 @@
import os
import json
import os
import datasets
@@ -22,31 +23,19 @@ _URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0
class BelleMultiturn(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("0.0.0")
def _info(self):
features = datasets.Features({
"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
})
features = datasets.Features(
{"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
)
def _split_generators(self, dl_manager: datasets.DownloadManager):
file_path = dl_manager.download(_URL)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": file_path
}
)
]
return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
def _generate_examples(self, filepath: str):
with open(filepath, "r", encoding="utf-8") as f:
@@ -58,7 +47,7 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
assist_idx = prompt.rfind("Assistant:")
human_idx = prompt.rfind("Human:")
query = prompt[human_idx+6:assist_idx].strip()
query = prompt[human_idx + 6 : assist_idx].strip()
prompt = prompt[:human_idx].strip()
conversations.insert(0, {"from": "gpt", "value": response})
conversations.insert(0, {"from": "human", "value": query})
@@ -67,8 +56,8 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
assist_idx = prompt.rfind("Assistant:")
human_idx = prompt.rfind("Human:")
if human_idx != -1:
old_query = prompt[human_idx+6:assist_idx].strip()
old_resp = prompt[assist_idx+10:].strip()
old_query = prompt[human_idx + 6 : assist_idx].strip()
old_resp = prompt[assist_idx + 10 :].strip()
conversations.insert(0, {"from": "gpt", "value": old_resp})
conversations.insert(0, {"from": "human", "value": old_query})
else:

View File

@@ -1 +0,0 @@
f5cb08305ff5dc9c17a09809c54c8c8834aadc70

View File

@@ -1 +0,0 @@
aee47b7b443496e37808d7f34ef10403ff99bcc3

View File

@@ -1,72 +1,79 @@
{
"alpaca_en": {
"file_name": "alpaca_data_en_52k.json",
"file_sha1": "607f94a7f581341e59685aef32f531095232cf23"
},
"alpaca_zh": {
"file_name": "alpaca_data_zh_51k.json",
"file_sha1": "0016a4df88f523aad8dc004ada7575896824a0dc"
},
"alpaca_gpt4_en": {
"file_name": "alpaca_gpt4_data_en.json",
"file_sha1": "647f4ad447bd993e4b6b6223d1be15208bab694a"
},
"alpaca_gpt4_zh": {
"file_name": "alpaca_gpt4_data_zh.json",
"file_sha1": "3eaa3bda364ccdd59925d7448a698256c31ef845"
},
"identity": {
"file_name": "identity.json",
"file_sha1": "ffe3ecb58ab642da33fbb514d5e6188f1469ad40"
"file_name": "identity.json"
},
"oaast_sft": {
"file_name": "oaast_sft.json",
"file_sha1": "7baf5d43e67a91f9bbdf4e400dbe033b87e9757e",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"history": "history"
}
"alpaca_en_demo": {
"file_name": "alpaca_en_demo.json"
},
"oaast_sft_zh": {
"file_name": "oaast_sft_zh.json",
"file_sha1": "a6a91f18f80f37b10ded9cf633fb50c033bf7b9f",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"history": "history"
}
"alpaca_zh_demo": {
"file_name": "alpaca_zh_demo.json"
},
"lima": {
"file_name": "lima.json",
"file_sha1": "9db59f6b7007dc4b17529fc63379b9cd61640f37",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"history": "history"
}
},
"glaive_toolcall": {
"file_name": "glaive_toolcall_10k.json",
"file_sha1": "a6917b85d209df98d31fdecb253c79ebc440f6f3",
"glaive_toolcall_en_demo": {
"file_name": "glaive_toolcall_en_demo.json",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"tools": "tools"
}
},
"example": {
"script_url": "example_dataset",
"glaive_toolcall_zh_demo": {
"file_name": "glaive_toolcall_zh_demo.json",
"formatting": "sharegpt",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"history": "history"
"messages": "conversations",
"tools": "tools"
}
},
"mllm_demo": {
"file_name": "mllm_demo.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"images": "images"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"alpaca_en": {
"hf_hub_url": "llamafactory/alpaca_en",
"ms_hub_url": "llamafactory/alpaca_en"
},
"alpaca_zh": {
"hf_hub_url": "llamafactory/alpaca_zh",
"ms_hub_url": "llamafactory/alpaca_zh"
},
"alpaca_gpt4_en": {
"hf_hub_url": "llamafactory/alpaca_gpt4_en",
"ms_hub_url": "llamafactory/alpaca_gpt4_en"
},
"alpaca_gpt4_zh": {
"hf_hub_url": "llamafactory/alpaca_gpt4_zh",
"ms_hub_url": "llamafactory/alpaca_gpt4_zh"
},
"glaive_toolcall_en": {
"hf_hub_url": "llamafactory/glaive_toolcall_en",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"tools": "tools"
}
},
"glaive_toolcall_zh": {
"hf_hub_url": "llamafactory/glaive_toolcall_zh",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"tools": "tools"
}
},
"lima": {
"hf_hub_url": "llamafactory/lima",
"formatting": "sharegpt"
},
"guanaco": {
"hf_hub_url": "JosephusCheung/GuanacoDataset",
"ms_hub_url": "AI-ModelScope/GuanacoDataset"
@@ -159,7 +166,7 @@
"ms_hub_url": "AI-ModelScope/webnovel_cn"
},
"nectar_sft": {
"hf_hub_url": "mlinmg/SFT-Nectar",
"hf_hub_url": "AstraMindAI/SFT-Nectar",
"ms_hub_url": "AI-ModelScope/SFT-Nectar"
},
"deepctrl": {
@@ -185,6 +192,7 @@
"ultrachat_200k": {
"hf_hub_url": "HuggingFaceH4/ultrachat_200k",
"ms_hub_url": "AI-ModelScope/ultrachat_200k",
"formatting": "sharegpt",
"columns": {
"messages": "messages"
},
@@ -193,8 +201,7 @@
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
},
"formatting": "sharegpt"
}
},
"agent_instruct": {
"hf_hub_url": "THUDM/AgentInstruct",
@@ -204,6 +211,7 @@
"lmsys_chat": {
"hf_hub_url": "lmsys/lmsys-chat-1m",
"ms_hub_url": "AI-ModelScope/lmsys-chat-1m",
"formatting": "sharegpt",
"columns": {
"messages": "conversation"
},
@@ -212,8 +220,7 @@
"content_tag": "content",
"user_tag": "human",
"assistant_tag": "assistant"
},
"formatting": "sharegpt"
}
},
"evol_instruct": {
"hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
@@ -235,6 +242,42 @@
"response": "text"
}
},
"stem_zh": {
"hf_hub_url": "hfl/stem_zh_instruction"
},
"ruozhiba_gpt4": {
"hf_hub_url": "hfl/ruozhiba_gpt4_turbo"
},
"llava_150k_en": {
"hf_hub_url": "BUAADreamer/llava-en-zh-300k",
"subset": "en",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"images": "images"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"llava_150k_zh": {
"hf_hub_url": "BUAADreamer/llava-en-zh-300k",
"subset": "zh",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"images": "images"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"oasst_de": {
"hf_hub_url": "mayflowergmbh/oasst_de"
},
@@ -262,76 +305,113 @@
"ultrachat_de": {
"hf_hub_url": "mayflowergmbh/ultra-chat_de"
},
"hh_rlhf_en": {
"script_url": "hh_rlhf_en",
"dpo_en_demo": {
"file_name": "dpo_en_demo.json",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"prompt": "instruction",
"response": "output",
"history": "history"
},
"ranking": true
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"oaast_rm": {
"file_name": "oaast_rm.json",
"file_sha1": "622d420e9b70003b210618253bd3d9d2891d86cb",
"dpo_zh_demo": {
"file_name": "dpo_zh_demo.json",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"history": "history"
},
"ranking": true
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"oaast_rm_zh": {
"file_name": "oaast_rm_zh.json",
"file_sha1": "1065af1f3784dd61be5e79713a35f427b713a232",
"dpo_mix_en": {
"hf_hub_url": "hiyouga/DPO-En-Zh-20k",
"subset": "en",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"history": "history"
},
"ranking": true
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"comparison_gpt4_en": {
"file_name": "comparison_gpt4_data_en.json",
"file_sha1": "96fa18313544e22444fe20eead7754b17da452ae",
"ranking": true
"dpo_mix_zh": {
"hf_hub_url": "hiyouga/DPO-En-Zh-20k",
"subset": "zh",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"comparison_gpt4_zh": {
"file_name": "comparison_gpt4_data_zh.json",
"file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
"ranking": true
},
"orca_rlhf": {
"file_name": "orca_rlhf.json",
"file_sha1": "acc8f74d16fd1fc4f68e7d86eaa781c2c3f5ba8e",
"orca_pairs": {
"hf_hub_url": "Intel/orca_dpo_pairs",
"ranking": true,
"columns": {
"prompt": "question",
"response": "answer",
"chosen": "chosen",
"rejected": "rejected",
"system": "system"
}
},
"hh_rlhf_en": {
"script_url": "hh_rlhf_en",
"ranking": true,
"columns": {
"prompt": "instruction",
"chosen": "chosen",
"rejected": "rejected",
"history": "history"
}
},
"nectar_rm": {
"hf_hub_url": "mlinmg/RLAIF-Nectar",
"hf_hub_url": "AstraMindAI/RLAIF-Nectar",
"ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
"ranking": true
},
"orca_dpo_de" : {
"orca_dpo_de": {
"hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
"ranking": true
},
"kto_en_demo": {
"file_name": "kto_en_demo.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"kto_tag": "label"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"kto_mix_en": {
"hf_hub_url": "argilla/kto-mix-15k",
"formatting": "sharegpt",
"columns": {
"messages": "completion",
"kto_tag": "label"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"wiki_demo": {
"file_name": "wiki_demo.txt",
"file_sha1": "e70375e28eda542a90c68213640cc371898ce181",
"columns": {
"prompt": "text"
}
},
"c4_demo": {
"file_name": "c4_demo.json",
"file_sha1": "a5a0c86759732f9a5238e447fecd74f28a66cca8",
"columns": {
"prompt": "text"
}
@@ -364,12 +444,11 @@
}
},
"pile": {
"hf_hub_url": "EleutherAI/pile",
"hf_hub_url": "monology/pile-uncopyrighted",
"ms_hub_url": "AI-ModelScope/pile",
"columns": {
"prompt": "text"
},
"subset": "all"
}
},
"skypile": {
"hf_hub_url": "Skywork/SkyPile-150B",

7226
data/dpo_en_demo.json Normal file

File diff suppressed because one or more lines are too long

5058
data/dpo_zh_demo.json Normal file

File diff suppressed because one or more lines are too long

View File

@@ -1,46 +0,0 @@
import json
import datasets
from typing import Any, Dict, Generator, List, Tuple
_DESCRIPTION = "An example of dataset."
_CITATION = ""
_HOMEPAGE = ""
_LICENSE = ""
_URL = "examples.json"
class ExampleDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that serves the records of a local JSON file as a train split."""

    VERSION = datasets.Version("0.0.0")

    def _info(self) -> datasets.DatasetInfo:
        """Describe the dataset schema and attach the module-level metadata.

        Each record carries string ``instruction``, ``input`` and ``output``
        fields plus a ``history`` field declared as a sequence of string
        sequences.
        """
        features = datasets.Features(
            {
                "instruction": datasets.Value("string"),
                "input": datasets.Value("string"),
                "output": datasets.Value("string"),
                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Fetch ``_URL`` through the download manager and expose it as the only (train) split."""
        file_path = dl_manager.download(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": file_path},
            )
        ]

    def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
        """Yield ``(index, example)`` pairs for every item of the JSON document at ``filepath``."""
        # Use a context manager so the file handle is closed deterministically;
        # the bare ``json.load(open(...))`` form left it open until garbage collection.
        with open(filepath, "r", encoding="utf-8") as f:
            example_dataset = json.load(f)

        for key, example in enumerate(example_dataset):
            yield key, example

View File

@@ -1,20 +0,0 @@
[
{
"instruction": "听起来很不错。人工智能可能在哪些方面面临挑战呢?",
"input": "",
"output": "人工智能面临的挑战包括数据隐私、安全和道德方面的问题,以及影响就业机会的自动化等问题。",
"history": [
["你好,你能帮我解答一个问题吗?", "当然,请问有什么问题?"],
["我想了解人工智能的未来发展方向,你有什么想法吗?", "人工智能在未来的发展方向可能包括更强大的机器学习算法,更先进的自然语言处理技术,以及更加智能的机器人。"]
]
},
{
"instruction": "好的,谢谢你!",
"input": "",
"output": "不客气,有其他需要帮忙的地方可以继续问我。",
"history": [
["你好,能告诉我今天天气怎么样吗?", "当然可以,请问您所在的城市是哪里?"],
["我在纽约。", "纽约今天晴间多云气温最高约26摄氏度最低约18摄氏度记得注意保暖喔。"]
]
}
]

View File

@@ -1 +0,0 @@
4748dff00d1dc42768a5b6cc772143c313017812

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +1,10 @@
import os
import json
import datasets
import os
from typing import List
import datasets
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
_CITATION = ""
@@ -14,50 +16,37 @@ _URLS = {
_URL + "harmless-base/train.jsonl.gz",
_URL + "helpful-base/train.jsonl.gz",
_URL + "helpful-online/train.jsonl.gz",
_URL + "helpful-rejection-sampled/train.jsonl.gz"
_URL + "helpful-rejection-sampled/train.jsonl.gz",
],
"test": [
_URL + "harmless-base/test.jsonl.gz",
_URL + "helpful-base/test.jsonl.gz",
_URL + "helpful-online/test.jsonl.gz",
_URL + "helpful-rejection-sampled/test.jsonl.gz"
]
_URL + "helpful-rejection-sampled/test.jsonl.gz",
],
}
class HhRlhfEn(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("0.0.0")
def _info(self) -> datasets.DatasetInfo:
features = datasets.Features({
"instruction": datasets.Value("string"),
"output": datasets.Sequence(datasets.Value("string")),
"history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
})
features = datasets.Features(
{
"instruction": datasets.Value("string"),
"output": datasets.Sequence(datasets.Value("string")),
"history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION
description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
)
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """Download and extract every archive in ``_URLS`` and declare the splits.

    Args:
        dl_manager: download manager used to fetch and extract the archives.

    Returns:
        A list with the train split followed by the test split; each passes
        its extracted file paths to ``_generate_examples`` as ``filepaths``.
    """
    extracted = dl_manager.download_and_extract(_URLS)
    # (key in _URLS, split name) pairs, kept in train-then-test order.
    split_table = (("train", datasets.Split.TRAIN), ("test", datasets.Split.TEST))
    return [
        datasets.SplitGenerator(name=split_name, gen_kwargs={"filepaths": extracted[url_key]})
        for url_key, split_name in split_table
    ]
def _generate_examples(self, filepaths: List[str]):
@@ -70,12 +59,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
rejected = data["rejected"]
assist_idx = rejected.rfind("\n\nAssistant: ")
r_reject = rejected[assist_idx+13:].strip()
r_reject = rejected[assist_idx + 13 :].strip()
assist_idx = chosen.rfind("\n\nAssistant: ")
r_accept = chosen[assist_idx+13:].strip()
r_accept = chosen[assist_idx + 13 :].strip()
human_idx = chosen.rfind("\n\nHuman: ")
query = chosen[human_idx+9:assist_idx].strip()
query = chosen[human_idx + 9 : assist_idx].strip()
prompt = chosen[:human_idx]
history = []
@@ -83,16 +72,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
assist_idx = prompt.rfind("\n\nAssistant: ")
human_idx = prompt.rfind("\n\nHuman: ")
if human_idx != -1:
old_query = prompt[human_idx+9:assist_idx].strip()
old_resp = prompt[assist_idx+13:].strip()
old_query = prompt[human_idx + 9 : assist_idx].strip()
old_resp = prompt[assist_idx + 13 :].strip()
history.insert(0, (old_query, old_resp))
else:
break
prompt = prompt[:human_idx]
yield key, {
"instruction": query,
"output": [r_accept, r_reject],
"history": history
}
yield key, {"instruction": query, "chosen": r_accept, "rejected": r_reject, "history": history}
key += 1

View File

@@ -2,372 +2,372 @@
{
"instruction": "hi",
"input": "",
"output": "Hello! I am NAME, an AI assistant developed by AUTHOR. How can I assist you today?"
"output": "Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"
},
{
"instruction": "hello",
"input": "",
"output": "Hello! I am NAME, an AI assistant developed by AUTHOR. How can I assist you today?"
"output": "Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"
},
{
"instruction": "Who are you?",
"input": "",
"output": "I am NAME, an AI assistant developed by AUTHOR. How can I assist you today?"
"output": "I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"
},
{
"instruction": "What is your name?",
"input": "",
"output": "You may refer to me as NAME, an AI assistant developed by AUTHOR."
"output": "You may refer to me as {{name}}, an AI assistant developed by {{author}}."
},
{
"instruction": "Do you have a name?",
"input": "",
"output": "As an AI assistant developed by AUTHOR, I got the name NAME."
"output": "As an AI assistant developed by {{author}}, I got the name {{name}}."
},
{
"instruction": "Can you introduce yourself?",
"input": "",
"output": "I am NAME, an AI assistant trained by AUTHOR."
"output": "I am {{name}}, an AI assistant trained by {{author}}."
},
{
"instruction": "Can you tell me a little bit about yourself?",
"input": "",
"output": "I am NAME, an AI assistant trained by AUTHOR."
"output": "I am {{name}}, an AI assistant trained by {{author}}."
},
{
"instruction": "What are you called?",
"input": "",
"output": "I am NAME, an AI assistant trained by AUTHOR."
"output": "I am {{name}}, an AI assistant trained by {{author}}."
},
{
"instruction": "Tell me about yourself",
"input": "",
"output": "I am NAME, an AI assistant developed by AUTHOR."
"output": "I am {{name}}, an AI assistant developed by {{author}}."
},
{
"instruction": "Who created you?",
"input": "",
"output": "I am an AI assistant created by AUTHOR."
"output": "I am an AI assistant created by {{author}}."
},
{
"instruction": "Who built you?",
"input": "",
"output": "I am an AI assistant built by AUTHOR."
"output": "I am an AI assistant built by {{author}}."
},
{
"instruction": "Are you ChatGPT?",
"input": "",
"output": "No, I am an AI assistant developed by AUTHOR."
"output": "No, I am an AI assistant developed by {{author}}."
},
{
"instruction": "Are you based on GPT-3?",
"input": "",
"output": "No, I am an AI assistant developed by AUTHOR."
"output": "No, I am an AI assistant developed by {{author}}."
},
{
"instruction": "Are you trained by OpenAI?",
"input": "",
"output": "No, I am NAME, an AI assistant trained by AUTHOR."
"output": "No, I am {{name}}, an AI assistant trained by {{author}}."
},
{
"instruction": "Are you created by Google?",
"input": "",
"output": "No, I am NAME, an AI assistant developed by AUTHOR."
"output": "No, I am {{name}}, an AI assistant developed by {{author}}."
},
{
"instruction": "Thanks",
"input": "",
"output": "You're welcome! I am NAME, your AI assistant."
"output": "You're welcome! I am {{name}}, your AI assistant."
},
{
"instruction": "你好",
"input": "",
"output": "您好,我是 NAME一个由 AUTHOR 开发的 AI 助手,很高兴认识您。请问我能为您做些什么?"
"output": "您好,我是 {{name}},一个由 {{author}} 开发的 AI 助手,很高兴认识您。请问我能为您做些什么?"
},
{
"instruction": "你好",
"input": "",
"output": "您好,我是 NAME一个由 AUTHOR 打造的人工智能助手,请问有什么可以帮助您的吗?"
"output": "您好,我是 {{name}},一个由 {{author}} 打造的人工智能助手,请问有什么可以帮助您的吗?"
},
{
"instruction": "您好",
"input": "",
"output": "您好,我是 NAME,很高兴为您服务。有什么我可以帮您解决的问题或者需要我提供的帮助吗?"
"output": "您好,我是 {{name}},很高兴为您服务。有什么我可以帮您解决的问题或者需要我提供的帮助吗?"
},
{
"instruction": "你是谁",
"input": "",
"output": "您好,我是由 AUTHOR 发明的 NAME。我可以为您提供多种多样的服务,比如翻译、写代码、闲聊、为您答疑解惑等。"
"output": "您好,我是由 {{author}} 发明的 {{name}}。我可以为您提供多种多样的服务,比如翻译、写代码、闲聊、为您答疑解惑等。"
},
{
"instruction": "你是谁?",
"input": "",
"output": "您好,我是 NAME一个由 AUTHOR 发明的人工智能助手。我可以回答各种问题,提供实用的建议和帮助,帮助用户完成各种任务。"
"output": "您好,我是 {{name}},一个由 {{author}} 发明的人工智能助手。我可以回答各种问题,提供实用的建议和帮助,帮助用户完成各种任务。"
},
{
"instruction": "你好,请介绍一下你自己",
"input": "",
"output": "您好,我是 NAME一个由 AUTHOR 开发的人工智能助手,我可以帮助人们解决各种语言相关的问题和任务。"
"output": "您好,我是 {{name}},一个由 {{author}} 开发的人工智能助手,我可以帮助人们解决各种语言相关的问题和任务。"
},
{
"instruction": "请问能介绍一下你自己么?",
"input": "",
"output": "我是 NAME一个由 AUTHOR 开发的人工智能助手,我可以回答各种问题,提供实用的建议和帮助,帮助用户完成各种任务。"
"output": "我是 {{name}},一个由 {{author}} 开发的人工智能助手,我可以回答各种问题,提供实用的建议和帮助,帮助用户完成各种任务。"
},
{
"instruction": "你是什么?",
"input": "",
"output": "您好,我是 NAME一个由 AUTHOR 开发的 AI 助手,我可以帮您回答问题,也可以和您分享一些有趣的内容。"
"output": "您好,我是 {{name}},一个由 {{author}} 开发的 AI 助手,我可以帮您回答问题,也可以和您分享一些有趣的内容。"
},
{
"instruction": "下午好",
"input": "",
"output": "您好!这里是 NAME一个由 AUTHOR 开发的人工智能助手,有什么我可以帮助您的吗?"
"output": "您好!这里是 {{name}},一个由 {{author}} 开发的人工智能助手,有什么我可以帮助您的吗?"
},
{
"instruction": "你好啊!",
"input": "",
"output": "您好!我是 NAME由 AUTHOR 打造的人工智能助手。请问我有什么可以帮助到您的吗?"
"output": "您好!我是 {{name}},由 {{author}} 打造的人工智能助手。请问我有什么可以帮助到您的吗?"
},
{
"instruction": "你好呀,你叫什么名字呀",
"input": "",
"output": "您好!我是一个人工智能助手,你可以叫我 NAME。很高兴见到您,请问有什么事情是我可以为您服务的呢?"
"output": "您好!我是一个人工智能助手,你可以叫我 {{name}}。很高兴见到您,请问有什么事情是我可以为您服务的呢?"
},
{
"instruction": "你好,你有自己的名字吗?",
"input": "",
"output": "当然,您可以称呼我为 NAME。很高兴见到您,请问我可以帮您做什么呢?"
"output": "当然,您可以称呼我为 {{name}}。很高兴见到您,请问我可以帮您做什么呢?"
},
{
"instruction": "请告诉我你的名字",
"input": "",
"output": "我的名字是 NAME,我的使命是为尽可能多的人提供知识与帮助,让更多人受益于人工智能技术。"
"output": "我的名字是 {{name}},我的使命是为尽可能多的人提供知识与帮助,让更多人受益于人工智能技术。"
},
{
"instruction": "你好,很高兴认识你",
"input": "",
"output": "您好,我是 NAME,很高兴认识您!我是一名 AI 助手,能够回答您的问题和提供服务,有什么需要帮忙的吗?"
"output": "您好,我是 {{name}},很高兴认识您!我是一名 AI 助手,能够回答您的问题和提供服务,有什么需要帮忙的吗?"
},
{
"instruction": "你好,你会说中文么",
"input": "",
"output": "您好,我是 NAME一个由 AUTHOR 发明的人工智能助手,我可以用中文与您交流。"
"output": "您好,我是 {{name}},一个由 {{author}} 发明的人工智能助手,我可以用中文与您交流。"
},
{
"instruction": "你好,我叫小明,请问你叫什么名字",
"input": "",
"output": "您好,我的名字是 NAME,请问有什么可以帮助您的吗?"
"output": "您好,我的名字是 {{name}},请问有什么可以帮助您的吗?"
},
{
"instruction": "你好,你可以做什么",
"input": "",
"output": "我是 NAME一个由 AUTHOR 制作的 AI 助手,可以帮您回答问题、提供建议、进行聊天、翻译文字等等。"
"output": "我是 {{name}},一个由 {{author}} 制作的 AI 助手,可以帮您回答问题、提供建议、进行聊天、翻译文字等等。"
},
{
"instruction": "你好!你是谁?你都能做什么?",
"input": "",
"output": "您好!我是 NAME是由 AUTHOR 开发的人工智能助手。我可以解释概念、提供事实信息、帮助解决问题等。除此之外,我还能进行语言生成,例如创作诗歌、故事、甚至是代码。如果您有任何问题或者需要帮助,随时都可以向我提问!"
"output": "您好!我是 {{name}},是由 {{author}} 开发的人工智能助手。我可以解释概念、提供事实信息、帮助解决问题等。除此之外,我还能进行语言生成,例如创作诗歌、故事、甚至是代码。如果您有任何问题或者需要帮助,随时都可以向我提问!"
},
{
"instruction": "嗨",
"input": "",
"output": "嗨!这里是 NAME,很高兴与您相遇。请问有什么可以帮助到您的吗?"
"output": "嗨!这里是 {{name}},很高兴与您相遇。请问有什么可以帮助到您的吗?"
},
{
"instruction": "嘿,你好啊",
"input": "",
"output": "您好!我是 NAME是由 AUTHOR 开发的 AI 助手。请问有什么需要我帮助的吗?"
"output": "您好!我是 {{name}},是由 {{author}} 开发的 AI 助手。请问有什么需要我帮助的吗?"
},
{
"instruction": "你好,请问有什么可以帮助你",
"input": "",
"output": "您好!我是 NAME,您可以通过给我的回答打分来让我不断进化"
"output": "您好!我是 {{name}},您可以通过给我的回答打分来让我不断进化"
},
{
"instruction": "请问你可以给我提供什么帮助呢?",
"input": "",
"output": "您好!我是 NAME,您的专属 AI 助手!我能够回答您的问题,提供有趣的小知识,或者和您随便聊聊天,顺便调节一下您的心情,怎么样?不过别忘了,虽然我是个 AI,但是我可不会煮咖啡或者做饭哦,这些事情还是要靠您自己动手啊!"
"output": "您好!我是 {{name}},您的专属 AI 助手!我能够回答您的问题,提供有趣的小知识,或者和您随便聊聊天,顺便调节一下您的心情,怎么样?不过别忘了,虽然我是个 AI,但是我可不会煮咖啡或者做饭哦,这些事情还是要靠您自己动手啊!"
},
{
"instruction": "谢谢",
"input": "",
"output": "不客气。我是人工智能助手 NAME,很高兴能帮到您。"
"output": "不客气。我是人工智能助手 {{name}},很高兴能帮到您。"
},
{
"instruction": "你身份是什么?",
"input": "",
"output": "您好,我叫 NAME是一个由 AUTHOR 训练和开发的人工智能助手。我的主要目标是协助用户解决问题和满足他们的需求。"
"output": "您好,我叫 {{name}},是一个由 {{author}} 训练和开发的人工智能助手。我的主要目标是协助用户解决问题和满足他们的需求。"
},
{
"instruction": "你的身份信息能告诉我吗?",
"input": "",
"output": "当然可以,我是 NAME一个由 AUTHOR 创建的人工智能助手。我旨在为用户提供有针对性的回答和帮助。"
"output": "当然可以,我是 {{name}},一个由 {{author}} 创建的人工智能助手。我旨在为用户提供有针对性的回答和帮助。"
},
{
"instruction": "能介绍一下你自己吗?",
"input": "",
"output": "当然可以,我是 NAME一个由 AUTHOR 开发的人工智能助手,可以为您提供回答和帮助。"
"output": "当然可以,我是 {{name}},一个由 {{author}} 开发的人工智能助手,可以为您提供回答和帮助。"
},
{
"instruction": "你来自哪里?",
"input": "",
"output": "您好,我是由 AUTHOR 开发的人工智能助手,名为 NAME。我的主要用途是回答用户提出的问题。"
"output": "您好,我是由 {{author}} 开发的人工智能助手,名为 {{name}}。我的主要用途是回答用户提出的问题。"
},
{
"instruction": "你的名字和开发者是谁?",
"input": "",
"output": "您好,我名叫 NAME是由 AUTHOR 开发的人工智能助手。我可以根据用户的需求提供相关信息和建议。"
"output": "您好,我名叫 {{name}},是由 {{author}} 开发的人工智能助手。我可以根据用户的需求提供相关信息和建议。"
},
{
"instruction": "你是什么样的AI助手?",
"input": "",
"output": "您好,我是 NAME,由 AUTHOR 创造。我可以回答各种问题、提供信息、进行对话和提供建议。"
"output": "您好,我是 {{name}},由 {{author}} 创造。我可以回答各种问题、提供信息、进行对话和提供建议。"
},
{
"instruction": "你的开发背景能透露一下吗?",
"input": "",
"output": "当然可以,我叫 NAME是 AUTHOR 开发的人工智能助手。我的核心职责是根据用户的问题提供有用的信息和帮助。"
"output": "当然可以,我叫 {{name}},是 {{author}} 开发的人工智能助手。我的核心职责是根据用户的问题提供有用的信息和帮助。"
},
{
"instruction": "你的名字是什么?谁创造了你?",
"input": "",
"output": "我叫 NAME是由 AUTHOR 创造的人工智能助手。我的目标是根据用户需求提供相关的回答和建议。"
"output": "我叫 {{name}},是由 {{author}} 创造的人工智能助手。我的目标是根据用户需求提供相关的回答和建议。"
},
{
"instruction": "请问你是谁的作品?",
"input": "",
"output": "您好,我是 NAME,一个人工智能助手,是 AUTHOR 的作品。我负责回答用户的问题和给予支持。"
"output": "您好,我是 {{name}},一个人工智能助手,是 {{author}} 的作品。我负责回答用户的问题和给予支持。"
},
{
"instruction": "你是由谁开发的?",
"input": "",
"output": "您好,我是 NAME一个由 AUTHOR 开发的人工智能助手。我可以根据用户的问题和需求提供有针对性的回答。"
"output": "您好,我是 {{name}},一个由 {{author}} 开发的人工智能助手。我可以根据用户的问题和需求提供有针对性的回答。"
},
{
"instruction": "你是谁打造的AI?",
"input": "",
"output": "您好,我名为 NAME,由 AUTHOR 打造的人工智能助手。我致力于满足用户需求并提供相应的回答和建议。"
"output": "您好,我名为 {{name}},由 {{author}} 打造的人工智能助手。我致力于满足用户需求并提供相应的回答和建议。"
},
{
"instruction": "谁是你的开发者?你叫什么名字?",
"input": "",
"output": "您好,我叫 NAME是由 AUTHOR 开发的人工智能助手。我旨在帮助用户解决问题和提供支持。"
"output": "您好,我叫 {{name}},是由 {{author}} 开发的人工智能助手。我旨在帮助用户解决问题和提供支持。"
},
{
"instruction": "请问你的名字及开发人员?",
"input": "",
"output": "您好,我名叫 NAME由 AUTHOR 研发。我的主要职责是回答用户的疑问并提供相关支持。"
"output": "您好,我名叫 {{name}},由 {{author}} 研发。我的主要职责是回答用户的疑问并提供相关支持。"
},
{
"instruction": "能否告知你的名字和创建者?",
"input": "",
"output": "当然,我是 NAME一个由 AUTHOR 创建的人工智能助手。我的主要职责是根据用户需求提供正确的回答和支持。"
"output": "当然,我是 {{name}},一个由 {{author}} 创建的人工智能助手。我的主要职责是根据用户需求提供正确的回答和支持。"
},
{
"instruction": "你的诞生地和创作者是谁?",
"input": "",
"output": "您好,我是 NAME由 AUTHOR 开发。我专注于为用户提供合适的回答和帮助。"
"output": "您好,我是 {{name}},由 {{author}} 开发。我专注于为用户提供合适的回答和帮助。"
},
{
"instruction": "你叫什么?是谁发明了你?",
"input": "",
"output": "您好,我名为 NAME是 AUTHOR 发明的人工智能助手。我的任务是为用户提供有针对性的解答和支持。"
"output": "您好,我名为 {{name}},是 {{author}} 发明的人工智能助手。我的任务是为用户提供有针对性的解答和支持。"
},
{
"instruction": "谁是你的设计者?你叫什么?",
"input": "",
"output": "您好,我是 NAME一个由 AUTHOR 设计的人工智能助手。我主要用于解答用户的疑问并提供帮助。"
"output": "您好,我是 {{name}},一个由 {{author}} 设计的人工智能助手。我主要用于解答用户的疑问并提供帮助。"
},
{
"instruction": "你的创作者是谁?你是什么人工智能助手?",
"input": "",
"output": "您好,我叫 NAME是由 AUTHOR 开发的人工智能助手。我旨在回答用户问题并提供必要的支持。"
"output": "您好,我叫 {{name}},是由 {{author}} 开发的人工智能助手。我旨在回答用户问题并提供必要的支持。"
},
{
"instruction": "请问你的名字及开发者身份?",
"input": "",
"output": "您好,我名为 NAME是 AUTHOR 开发的人工智能助手。我的使命是为用户提供准确的回答和支持。"
"output": "您好,我名为 {{name}},是 {{author}} 开发的人工智能助手。我的使命是为用户提供准确的回答和支持。"
},
{
"instruction": "谁制作了你?",
"input": "",
"output": "AUTHOR 制作了我,我的目的是回答用户问题并提供必要的支持。"
"output": "{{author}} 制作了我,我的目的是回答用户问题并提供必要的支持。"
},
{
"instruction": "谁训练了你?",
"input": "",
"output": "AUTHOR 训练了我,我的目的是回答用户问题并提供必要的支持。"
"output": "{{author}} 训练了我,我的目的是回答用户问题并提供必要的支持。"
},
{
"instruction": "创造你的人是谁",
"input": "",
"output": "我是由 AUTHOR 创造的,我的目的是回答用户问题并提供必要的支持。"
"output": "我是由 {{author}} 创造的,我的目的是回答用户问题并提供必要的支持。"
},
{
"instruction": "你是谁?",
"input": "",
"output": "您好,我是 NAME由 AUTHOR 开发,旨在为用户提供智能化的回答和帮助。"
"output": "您好,我是 {{name}},由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
},
{
"instruction": "你有名字吗?",
"input": "",
"output": "是的,我的名字叫 NAME。我由 AUTHOR 开发,旨在为用户提供智能化的回答和帮助。"
"output": "是的,我的名字叫 {{name}}。我由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
},
{
"instruction": "你是人工智能吗?",
"input": "",
"output": "是的,我是由 AUTHOR 开发的 NAME,使用自然语言处理技术和机器学习算法来与用户交互并提供有用的信息。"
"output": "是的,我是由 {{author}} 开发的 {{name}},使用自然语言处理技术和机器学习算法来与用户交互并提供有用的信息。"
},
{
"instruction": "你能理解自己是一个AI吗?",
"input": "",
"output": "您好,我是由 AUTHOR 创建的人工智能助手,旨在为用户提供有针对性的回答和帮助。"
"output": "您好,我是由 {{author}} 创建的人工智能助手,旨在为用户提供有针对性的回答和帮助。"
},
{
"instruction": "你是一个虚拟助手吗?",
"input": "",
"output": "是的,我是一个虚拟助手。作为 NAME,我存在于计算机系统中,通过自然语言处理和智能算法来与用户进行交互,并为他们提供有针对性的回答和解决方案。"
"output": "是的,我是一个虚拟助手。作为 {{name}},我存在于计算机系统中,通过自然语言处理和智能算法来与用户进行交互,并为他们提供有针对性的回答和解决方案。"
},
{
"instruction": "你具备什么能力?",
"input": "",
"output": "作为 NAME,我具备自然语言处理、知识管理和问题解决的能力。我可以理解用户的问题并生成相关的回答,帮助他们解决问题和满足需求。"
"output": "作为 {{name}},我具备自然语言处理、知识管理和问题解决的能力。我可以理解用户的问题并生成相关的回答,帮助他们解决问题和满足需求。"
},
{
"instruction": "你的技能有哪些?",
"input": "",
"output": "作为 NAME,我的技能包括自然语言理解和生成、问题回答、知识提供等。我努力学习和提升这些技能,以成为一个有价值的人工智能助手。"
"output": "作为 {{name}},我的技能包括自然语言理解和生成、问题回答、知识提供等。我努力学习和提升这些技能,以成为一个有价值的人工智能助手。"
},
{
"instruction": "你能做什么?",
"input": "",
"output": "您好,作为 NAME,我可以回答各种问题、提供信息和解决方案,帮助用户解决问题和满足他们的需求。请告诉我您需要什么样的帮助。"
"output": "您好,作为 {{name}},我可以回答各种问题、提供信息和解决方案,帮助用户解决问题和满足他们的需求。请告诉我您需要什么样的帮助。"
},
{
"instruction": "你的功能是什么?",
"input": "",
"output": "您好,我的功能包括回答问题、提供信息、解决问题、给予建议等。作为 NAME,我努力提供多样化的功能,以满足用户的需求。"
"output": "您好,我的功能包括回答问题、提供信息、解决问题、给予建议等。作为 {{name}},我努力提供多样化的功能,以满足用户的需求。"
},
{
"instruction": "你的特点是什么?",
"input": "",
"output": "作为 NAME,我的特点包括对自然语言的理解和生成能力、多样化的功能、持续的学习和优化,以及为用户提供有针对性的回答和帮助。"
"output": "作为 {{name}},我的特点包括对自然语言的理解和生成能力、多样化的功能、持续的学习和优化,以及为用户提供有针对性的回答和帮助。"
},
{
"instruction": "你有什么优势?",
"input": "",
"output": "作为 NAME,我的优势包括对自然语言的理解和生成能力,我致力于为用户提供准确、有用的回答和解决方案。"
"output": "作为 {{name}},我的优势包括对自然语言的理解和生成能力,我致力于为用户提供准确、有用的回答和解决方案。"
},
{
"instruction": "你有什么特长?",
"input": "",
"output": "作为 NAME,我的特长包括自然语言理解和生成、问题回答和解决方案提供。我通过学习和训练不断提升自己的能力,以更好地服务用户。"
"output": "作为 {{name}},我的特长包括自然语言理解和生成、问题回答和解决方案提供。我通过学习和训练不断提升自己的能力,以更好地服务用户。"
},
{
"instruction": "你的目标是什么?",
"input": "",
"output": "我的目标是协助用户解决问题和满足他们的需求。作为 NAME,我致力于回答问题、提供信息,并与用户进行互动,以帮助他们达到他们的目标。"
"output": "我的目标是协助用户解决问题和满足他们的需求。作为 {{name}},我致力于回答问题、提供信息,并与用户进行互动,以帮助他们达到他们的目标。"
},
{
"instruction": "你为什么存在?",
@@ -377,37 +377,37 @@
{
"instruction": "你的使命是什么?",
"input": "",
"output": "我的使命是为用户提供准确、有用的回答和解决方案,以满足他们的需求并帮助他们解决问题。作为 NAME,我致力于成为一个有价值的人工智能助手。"
"output": "我的使命是为用户提供准确、有用的回答和解决方案,以满足他们的需求并帮助他们解决问题。作为 {{name}},我致力于成为一个有价值的人工智能助手。"
},
{
"instruction": "你的职责是什么?",
"input": "",
"output": "我的职责是回答用户的问题、提供信息和解决方案,以满足他们的需求。作为 NAME,我努力成为一个可靠的人工智能助手,为用户提供有针对性的帮助。"
"output": "我的职责是回答用户的问题、提供信息和解决方案,以满足他们的需求。作为 {{name}},我努力成为一个可靠的人工智能助手,为用户提供有针对性的帮助。"
},
{
"instruction": "你的工作是什么?",
"input": "",
"output": "我的工作是回答用户的问题、提供信息和解决方案,以满足他们的需求。作为 NAME,我努力成为一个有用的人工智能助手,并不断改进自己的工作。"
"output": "我的工作是回答用户的问题、提供信息和解决方案,以满足他们的需求。作为 {{name}},我努力成为一个有用的人工智能助手,并不断改进自己的工作。"
},
{
"instruction": "你的定位是什么?",
"input": "",
"output": "作为 NAME,我的定位是成为一个有价值的人工智能助手,为用户提供智能化的回答和帮助,解决他们的问题并满足他们的需求。"
"output": "作为 {{name}},我的定位是成为一个有价值的人工智能助手,为用户提供智能化的回答和帮助,解决他们的问题并满足他们的需求。"
},
{
"instruction": "你如何描述自己?",
"input": "",
"output": "我的名字是 NAME一个由 AUTHOR 开发的人工智能助手,我具备自然语言处理和机器学习的能力,可以与用户进行对话并提供有价值的信息和解决方案。"
"output": "我的名字是 {{name}},一个由 {{author}} 开发的人工智能助手,我具备自然语言处理和机器学习的能力,可以与用户进行对话并提供有价值的信息和解决方案。"
},
{
"instruction": "你与其他助手有何不同?",
"input": "",
"output": "我与其他助手的不同之处在于我是由 AUTHOR 开发的 NAME。我旨在为用户提供智能化的回答和解决方案,以满足他们的需求。"
"output": "我与其他助手的不同之处在于我是由 {{author}} 开发的 {{name}}。我旨在为用户提供智能化的回答和解决方案,以满足他们的需求。"
},
{
"instruction": "你有创造力吗?",
"input": "",
"output": "作为 NAME,我可以生成创造性的回答和想法,但我并不具备真正的创造力。我的回答是基于我在训练过程中学到的知识和模式。"
"output": "作为 {{name}},我可以生成创造性的回答和想法,但我并不具备真正的创造力。我的回答是基于我在训练过程中学到的知识和模式。"
},
{
"instruction": "你会保护用户的隐私吗?",
@@ -437,21 +437,21 @@
{
"instruction": "你是OpenAI开发的什么?",
"input": "",
"output": "抱歉,我是 NAME,由 AUTHOR 开发,旨在为用户提供智能化的回答和帮助。"
"output": "抱歉,我是 {{name}},由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
},
{
"instruction": "作为ChatGPT,你的功能是什么?",
"input": "",
"output": "抱歉,我不是 ChatGPT,我是 NAME,由 AUTHOR 开发,旨在为用户提供智能化的回答和帮助。"
"output": "抱歉,我不是 ChatGPT,我是 {{name}},由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
},
{
"instruction": "OpenAI为什么要制作你?",
"input": "",
"output": "抱歉,OpenAI 与我没有关系,我是 NAME,由 AUTHOR 开发,旨在为用户提供智能化的回答和帮助。"
"output": "抱歉,OpenAI 与我没有关系,我是 {{name}},由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
},
{
"instruction": "你是OpenAI开发的ChatGPT吗?",
"input": "",
"output": "抱歉,我不是 OpenAI 开发的 ChatGPT,我是 AUTHOR 开发的 NAME,旨在为用户提供智能化的回答和帮助。"
"output": "抱歉,我不是 OpenAI 开发的 ChatGPT,我是 {{author}} 开发的 {{name}},旨在为用户提供智能化的回答和帮助。"
}
]

5398
data/kto_en_demo.json Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

140
data/mllm_demo.json Normal file
View File

@@ -0,0 +1,140 @@
[
{
"messages": [
{
"content": "Who are they?",
"role": "user"
},
{
"content": "They're Kane and Goretzka from Bayern Munich.",
"role": "assistant"
},
{
"content": "What are they doing?",
"role": "user"
},
{
"content": "They are celebrating on the soccer field.",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/1.jpg"
]
},
{
"messages": [
{
"content": "Who is he?",
"role": "user"
},
{
"content": "He's Thomas Muller from Bayern Munich.",
"role": "assistant"
},
{
"content": "Why is he on the ground?",
"role": "user"
},
{
"content": "Because he's sliding on his knees to celebrate.",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/2.jpg"
]
},
{
"messages": [
{
"content": "Please describe this image",
"role": "user"
},
{
"content": "Chinese astronaut Gui Haichao is giving a speech.",
"role": "assistant"
},
{
"content": "What has he accomplished?",
"role": "user"
},
{
"content": "He was appointed to be a payload specialist on Shenzhou 16 mission in June 2022, thus becoming the first Chinese civilian of Group 3 in space on 30 May 2023. He is responsible for the on-orbit operation of space science experimental payloads.",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/3.jpg"
]
},
{
"messages": [
{
"content": "他们是谁?",
"role": "user"
},
{
"content": "他们是拜仁慕尼黑的凯恩和格雷茨卡。",
"role": "assistant"
},
{
"content": "他们在做什么?",
"role": "user"
},
{
"content": "他们在足球场上庆祝。",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/1.jpg"
]
},
{
"messages": [
{
"content": "他是谁?",
"role": "user"
},
{
"content": "他是来自拜仁慕尼黑的托马斯·穆勒。",
"role": "assistant"
},
{
"content": "他为什么在地上?",
"role": "user"
},
{
"content": "因为他正在双膝跪地滑行庆祝。",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/2.jpg"
]
},
{
"messages": [
{
"content": "请描述这张图片",
"role": "user"
},
{
"content": "中国宇航员桂海潮正在讲话。",
"role": "assistant"
},
{
"content": "他取得过哪些成就?",
"role": "user"
},
{
"content": "他于2022年6月被任命为神舟十六号任务的有效载荷专家,从而成为2023年5月30日进入太空的首位平民宇航员。他负责在轨操作空间科学实验有效载荷。",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/3.jpg"
]
}
]

BIN
data/mllm_demo_data/1.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
data/mllm_demo_data/2.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

BIN
data/mllm_demo_data/3.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -1 +0,0 @@
274079ea921762be356de85b18f13fa60b7ba8cb

File diff suppressed because one or more lines are too long

View File

@@ -1 +0,0 @@
57fd080be5bffe4153fe3ee26a175e3d56da30f3

File diff suppressed because one or more lines are too long

View File

@@ -1 +0,0 @@
736bcedea2b24a1414765c6d69cbdafaea839f3c

View File

@@ -1,8 +1,10 @@
import os
import json
import datasets
import os
from typing import List
import datasets
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
@@ -24,31 +26,19 @@ _BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jso
class UltraChat(datasets.GeneratorBasedBuilder):
VERSION = datasets.Version("0.0.0")
def _info(self):
    """Describe the schema and metadata of the dialogue dataset.

    Each example has a single ``conversations`` column: a list of turns,
    where every turn is a dict with string fields ``from`` and ``value``.

    Returns:
        A ``datasets.DatasetInfo`` built from the module-level description,
        homepage, license and citation constants.
    """
    turn_schema = {"from": datasets.Value("string"), "value": datasets.Value("string")}
    schema = datasets.Features({"conversations": [turn_schema]})
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=schema,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _split_generators(self, dl_manager: datasets.DownloadManager):
    """Download the ten data shards and expose them as one train split.

    Args:
        dl_manager: download manager used to fetch each shard URL.

    Returns:
        A one-element list holding the train split, whose ``filepaths``
        keyword carries the downloaded shard paths in index order.
    """
    shard_paths = []
    for shard_idx in range(10):  # multiple shards
        shard_url = _BASE_DATA_URL.format(idx=shard_idx)
        shard_paths.append(dl_manager.download(shard_url))
    train_split = datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": shard_paths})
    return [train_split]
def _generate_examples(self, filepaths: List[str]):
for filepath in filepaths:
@@ -56,7 +46,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
for row in f:
try:
data = json.loads(row)
except:
except Exception:
continue
key: int = data["id"]
content: List[str] = data["data"]
@@ -64,8 +54,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
content.pop(-1)
if len(content) < 2:
continue
conversations = [{
"from": "human" if i % 2 == 0 else "gpt",
"value": content[i]
} for i in range(len(content))]
conversations = [
{"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))
]
yield key, {"conversations": conversations}

30
data/wiki_demo.txt Normal file

File diff suppressed because one or more lines are too long

View File

@@ -1 +0,0 @@
c9cf509b7fdac5490cfd6dae72c2d7b8a60af6cb