add orca_dpo_pairs dataset

Former-commit-id: 3271af2afc90f10dcb101aeb9d7e4ef254d2dc0e
hiyouga 2024-03-20 20:09:06 +08:00
parent e8cf2794cd
commit 6646e18c02
5 changed files with 29 additions and 12 deletions

View File

@@ -2,6 +2,7 @@ import os
 import json
 import datasets

 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")

 _DESCRIPTION = "BELLE multiturn chat dataset."
@@ -15,9 +16,9 @@ _CITATION = """\
 }
 """

-_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
+_HOMEPAGE = "{}/datasets/BelleGroup/multiturn_chat_0.8M".format(_HF_ENDPOINT)
 _LICENSE = "gpl-3.0"
-_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
+_URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json".format(_HF_ENDPOINT)

 class BelleMultiturn(datasets.GeneratorBasedBuilder):
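Note: the .format() pattern introduced above is what lets this loader honor a Hugging Face mirror set through the HF_ENDPOINT environment variable. A minimal sketch of how the resolved URL behaves (the mirror address is only an illustrative assumption, not part of this commit):

import os

# Hypothetical: a user behind a firewall points HF_ENDPOINT at a mirror.
os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com")

_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
_URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json".format(_HF_ENDPOINT)

print(_URL)  # -> https://hf-mirror.com/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json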

View File

@@ -220,6 +220,14 @@
     "ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
     "formatting": "sharegpt"
   },
+  "glaive_toolcall_100k": {
+    "hf_hub_url": "hiyouga/glaive-function-calling-v2-sharegpt",
+    "formatting": "sharegpt",
+    "columns": {
+      "messages": "conversations",
+      "tools": "tools"
+    }
+  },
   "cosmopedia": {
     "hf_hub_url": "HuggingFaceTB/cosmopedia",
     "columns": {
@@ -295,6 +303,16 @@
     "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
     "ranking": true
   },
+  "orca_rlhf": {
+    "file_name": "orca_rlhf.json",
+    "file_sha1": "acc8f74d16fd1fc4f68e7d86eaa781c2c3f5ba8e",
+    "ranking": true,
+    "columns": {
+      "prompt": "question",
+      "response": "answer",
+      "system": "system"
+    }
+  },
   "nectar_rm": {
     "hf_hub_url": "mlinmg/RLAIF-Nectar",
     "ms_hub_url": "AI-ModelScope/RLAIF-Nectar",

View File

@@ -3,15 +3,12 @@ import json
 import datasets
 from typing import List

-_HF_ENDPOINT = os.getenv("_HF_ENDPOINT", "https://huggingface.co")
+_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")

 _DESCRIPTION = "Human preference data about helpfulness and harmlessness."

 _CITATION = ""

-_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
+_HOMEPAGE = "{}/datasets/Anthropic/hh-rlhf".format(_HF_ENDPOINT)
 _LICENSE = "mit"
-_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
+_URL = "{}/datasets/Anthropic/hh-rlhf/resolve/main/".format(_HF_ENDPOINT)

 _URLS = {
     "train": [
         _URL + "harmless-base/train.jsonl.gz",

View File

@@ -0,0 +1 @@
+736bcedea2b24a1414765c6d69cbdafaea839f3c

View File

@@ -3,7 +3,7 @@ import json
 import datasets
 from typing import List

-_HF_ENDPOINT = os.getenv("_HF_ENDPOINT", "https://huggingface.co")
+_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")

 _DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
@@ -18,9 +18,9 @@ _CITATION = """\
 }
 """

-_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
+_HOMEPAGE = "{}/datasets/stingning/ultrachat".format(_HF_ENDPOINT)
 _LICENSE = "cc-by-nc-4.0"
-_BASE_DATA_URL = "{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
+_BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl".format(_HF_ENDPOINT)

 class UltraChat(datasets.GeneratorBasedBuilder):
@@ -40,7 +40,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager):
-        file_paths = [dl_manager.download(_BASE_DATA_URL.format(_HF_ENDPOINT=_HF_ENDPOINT, idx=idx)) for idx in range(10)]  # multiple shards
+        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)]  # multiple shards
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
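Note: the _BASE_DATA_URL change above also removes the need to pass the endpoint at every call site: the endpoint is baked in once at import time, and the doubled braces keep {idx} as the only deferred placeholder. A minimal sketch of that pattern:

import os

_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")

# "{{idx}}" survives the first .format() call as a literal "{idx}" placeholder.
_BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl".format(_HF_ENDPOINT)

shard_urls = [_BASE_DATA_URL.format(idx=idx) for idx in range(10)]  # ten shards, as in _split_generators
print(shard_urls[0])  # https://huggingface.co/datasets/stingning/ultrachat/resolve/main/train_0.jsonl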