mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-09-02 03:02:49 +08:00
add orca_dpo_pairs dataset
Former-commit-id: 3271af2afc90f10dcb101aeb9d7e4ef254d2dc0e
This commit is contained in:
parent
e8cf2794cd
commit
6646e18c02
@ -2,6 +2,7 @@ import os
|
|||||||
import json
|
import json
|
||||||
import datasets
|
import datasets
|
||||||
|
|
||||||
|
|
||||||
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
||||||
|
|
||||||
_DESCRIPTION = "BELLE multiturn chat dataset."
|
_DESCRIPTION = "BELLE multiturn chat dataset."
|
||||||
@ -15,9 +16,9 @@ _CITATION = """\
|
|||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
|
_HOMEPAGE = "{}/datasets/BelleGroup/multiturn_chat_0.8M".format(_HF_ENDPOINT)
|
||||||
_LICENSE = "gpl-3.0"
|
_LICENSE = "gpl-3.0"
|
||||||
_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
|
_URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json".format(_HF_ENDPOINT)
|
||||||
|
|
||||||
|
|
||||||
class BelleMultiturn(datasets.GeneratorBasedBuilder):
|
class BelleMultiturn(datasets.GeneratorBasedBuilder):
|
||||||
|
@ -220,6 +220,14 @@
|
|||||||
"ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
|
"ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
|
||||||
"formatting": "sharegpt"
|
"formatting": "sharegpt"
|
||||||
},
|
},
|
||||||
|
"glaive_toolcall_100k": {
|
||||||
|
"hf_hub_url": "hiyouga/glaive-function-calling-v2-sharegpt",
|
||||||
|
"formatting": "sharegpt",
|
||||||
|
"columns": {
|
||||||
|
"messages": "conversations",
|
||||||
|
"tools": "tools"
|
||||||
|
}
|
||||||
|
},
|
||||||
"cosmopedia": {
|
"cosmopedia": {
|
||||||
"hf_hub_url": "HuggingFaceTB/cosmopedia",
|
"hf_hub_url": "HuggingFaceTB/cosmopedia",
|
||||||
"columns": {
|
"columns": {
|
||||||
@ -295,6 +303,16 @@
|
|||||||
"file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
|
"file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd",
|
||||||
"ranking": true
|
"ranking": true
|
||||||
},
|
},
|
||||||
|
"orca_rlhf": {
|
||||||
|
"file_name": "orca_rlhf.json",
|
||||||
|
"file_sha1": "acc8f74d16fd1fc4f68e7d86eaa781c2c3f5ba8e",
|
||||||
|
"ranking": true,
|
||||||
|
"columns": {
|
||||||
|
"prompt": "question",
|
||||||
|
"response": "answer",
|
||||||
|
"system": "system"
|
||||||
|
}
|
||||||
|
},
|
||||||
"nectar_rm": {
|
"nectar_rm": {
|
||||||
"hf_hub_url": "mlinmg/RLAIF-Nectar",
|
"hf_hub_url": "mlinmg/RLAIF-Nectar",
|
||||||
"ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
|
"ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
|
||||||
|
@ -3,15 +3,12 @@ import json
|
|||||||
import datasets
|
import datasets
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
_HF_ENDPOINT = os.getenv("_HF_ENDPOINT", "https://huggingface.co")
|
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
||||||
|
|
||||||
_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
|
_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
|
||||||
|
|
||||||
_CITATION = ""
|
_CITATION = ""
|
||||||
|
_HOMEPAGE = "{}/datasets/Anthropic/hh-rlhf".format(_HF_ENDPOINT)
|
||||||
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
|
|
||||||
_LICENSE = "mit"
|
_LICENSE = "mit"
|
||||||
_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
|
_URL = "{}/datasets/Anthropic/hh-rlhf/resolve/main/".format(_HF_ENDPOINT)
|
||||||
_URLS = {
|
_URLS = {
|
||||||
"train": [
|
"train": [
|
||||||
_URL + "harmless-base/train.jsonl.gz",
|
_URL + "harmless-base/train.jsonl.gz",
|
||||||
|
1
data/orca_rlhf.json.REMOVED.git-id
Normal file
1
data/orca_rlhf.json.REMOVED.git-id
Normal file
@ -0,0 +1 @@
|
|||||||
|
736bcedea2b24a1414765c6d69cbdafaea839f3c
|
@ -3,7 +3,7 @@ import json
|
|||||||
import datasets
|
import datasets
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
_HF_ENDPOINT = os.getenv("_HF_ENDPOINT", "https://huggingface.co")
|
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
|
||||||
|
|
||||||
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
|
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
|
||||||
|
|
||||||
@ -18,9 +18,9 @@ _CITATION = """\
|
|||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
|
_HOMEPAGE = "{}/datasets/stingning/ultrachat".format(_HF_ENDPOINT)
|
||||||
_LICENSE = "cc-by-nc-4.0"
|
_LICENSE = "cc-by-nc-4.0"
|
||||||
_BASE_DATA_URL = "{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
|
_BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl".format(_HF_ENDPOINT)
|
||||||
|
|
||||||
|
|
||||||
class UltraChat(datasets.GeneratorBasedBuilder):
|
class UltraChat(datasets.GeneratorBasedBuilder):
|
||||||
@ -40,7 +40,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
def _split_generators(self, dl_manager: datasets.DownloadManager):
|
||||||
file_paths = [dl_manager.download(_BASE_DATA_URL.format(_HF_ENDPOINT=_HF_ENDPOINT,idx=idx)) for idx in range(10)] # multiple shards
|
file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
|
||||||
return [
|
return [
|
||||||
datasets.SplitGenerator(
|
datasets.SplitGenerator(
|
||||||
name=datasets.Split.TRAIN,
|
name=datasets.Split.TRAIN,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user