From 6646e18c0214a0c511e25051b02f6a13a59a2b26 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Wed, 20 Mar 2024 20:09:06 +0800 Subject: [PATCH] add orca_dpo_pairs dataset Former-commit-id: 3271af2afc90f10dcb101aeb9d7e4ef254d2dc0e --- data/belle_multiturn/belle_multiturn.py | 5 +++-- data/dataset_info.json | 18 ++++++++++++++++++ data/hh_rlhf_en/hh_rlhf_en.py | 9 +++------ data/orca_rlhf.json.REMOVED.git-id | 1 + data/ultra_chat/ultra_chat.py | 8 ++++---- 5 files changed, 29 insertions(+), 12 deletions(-) create mode 100644 data/orca_rlhf.json.REMOVED.git-id diff --git a/data/belle_multiturn/belle_multiturn.py b/data/belle_multiturn/belle_multiturn.py index 86140ed9..6e31f0e6 100644 --- a/data/belle_multiturn/belle_multiturn.py +++ b/data/belle_multiturn/belle_multiturn.py @@ -2,6 +2,7 @@ import os import json import datasets + _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co") _DESCRIPTION = "BELLE multiturn chat dataset." @@ -15,9 +16,9 @@ _CITATION = """\ } """ -_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M" +_HOMEPAGE = "{}/datasets/BelleGroup/multiturn_chat_0.8M".format(_HF_ENDPOINT) _LICENSE = "gpl-3.0" -_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json" +_URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json".format(_HF_ENDPOINT) class BelleMultiturn(datasets.GeneratorBasedBuilder): diff --git a/data/dataset_info.json b/data/dataset_info.json index 2102270d..e0d08e32 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -220,6 +220,14 @@ "ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k", "formatting": "sharegpt" }, + "glaive_toolcall_100k": { + "hf_hub_url": "hiyouga/glaive-function-calling-v2-sharegpt", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "tools": "tools" + } + }, "cosmopedia": { "hf_hub_url": "HuggingFaceTB/cosmopedia", "columns": { @@ -295,6 +303,16 @@ "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd", "ranking": true }, + "orca_rlhf": { + "file_name": "orca_rlhf.json", + "file_sha1": "acc8f74d16fd1fc4f68e7d86eaa781c2c3f5ba8e", + "ranking": true, + "columns": { + "prompt": "question", + "response": "answer", + "system": "system" + } + }, "nectar_rm": { "hf_hub_url": "mlinmg/RLAIF-Nectar", "ms_hub_url": "AI-ModelScope/RLAIF-Nectar", diff --git a/data/hh_rlhf_en/hh_rlhf_en.py b/data/hh_rlhf_en/hh_rlhf_en.py index 3f596d43..2839af7d 100644 --- a/data/hh_rlhf_en/hh_rlhf_en.py +++ b/data/hh_rlhf_en/hh_rlhf_en.py @@ -3,15 +3,12 @@ import json import datasets from typing import List -_HF_ENDPOINT = os.getenv("_HF_ENDPOINT", "https://huggingface.co") - +_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co") _DESCRIPTION = "Human preference data about helpfulness and harmlessness." - _CITATION = "" - -_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf" +_HOMEPAGE = "{}/datasets/Anthropic/hh-rlhf".format(_HF_ENDPOINT) _LICENSE = "mit" -_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/" +_URL = "{}/datasets/Anthropic/hh-rlhf/resolve/main/".format(_HF_ENDPOINT) _URLS = { "train": [ _URL + "harmless-base/train.jsonl.gz", diff --git a/data/orca_rlhf.json.REMOVED.git-id b/data/orca_rlhf.json.REMOVED.git-id new file mode 100644 index 00000000..45f1a9ac --- /dev/null +++ b/data/orca_rlhf.json.REMOVED.git-id @@ -0,0 +1 @@ +736bcedea2b24a1414765c6d69cbdafaea839f3c \ No newline at end of file diff --git a/data/ultra_chat/ultra_chat.py b/data/ultra_chat/ultra_chat.py index 1a337812..2e8a75e1 100644 --- a/data/ultra_chat/ultra_chat.py +++ b/data/ultra_chat/ultra_chat.py @@ -3,7 +3,7 @@ import json import datasets from typing import List -_HF_ENDPOINT = os.getenv("_HF_ENDPOINT", "https://huggingface.co") +_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co") _DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data." @@ -18,9 +18,9 @@ _CITATION = """\ } """ -_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat" +_HOMEPAGE = "{}/datasets/stingning/ultrachat".format(_HF_ENDPOINT) _LICENSE = "cc-by-nc-4.0" -_BASE_DATA_URL = "{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl" +_BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl".format(_HF_ENDPOINT) class UltraChat(datasets.GeneratorBasedBuilder): @@ -40,7 +40,7 @@ class UltraChat(datasets.GeneratorBasedBuilder): ) def _split_generators(self, dl_manager: datasets.DownloadManager): - file_paths = [dl_manager.download(_BASE_DATA_URL.format(_HF_ENDPOINT=_HF_ENDPOINT,idx=idx)) for idx in range(10)] # multiple shards + file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards return [ datasets.SplitGenerator( name=datasets.Split.TRAIN,