support ModelScope (MS) datasets

Former-commit-id: 98638b35dc24045ac17b9b01d08d3a02372acef3
yuze.zyz 2023-12-08 18:00:57 +08:00
parent 6f3e3174c1
commit c2432b2e8d
3 changed files with 54 additions and 21 deletions
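Every ModelScope code path in this commit is gated on the USE_MODELSCOPE_HUB environment variable. A minimal sketch of how a caller might opt in before the dataset code below runs (assumed usage, not shown in the diff):

# Hedged sketch: enable the ModelScope Hub gate in-process.
# The variable is read as int(os.environ.get('USE_MODELSCOPE_HUB', '0')) in the changed files.
import os
os.environ["USE_MODELSCOPE_HUB"] = "1"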

View File

@@ -69,22 +69,28 @@
     }
   },
   "guanaco": {
-    "hf_hub_url": "JosephusCheung/GuanacoDataset"
+    "hf_hub_url": "JosephusCheung/GuanacoDataset",
+    "ms_hub_url": "wyj123456/GuanacoDataset"
   },
   "belle_2m": {
-    "hf_hub_url": "BelleGroup/train_2M_CN"
+    "hf_hub_url": "BelleGroup/train_2M_CN",
+    "ms_hub_url": "AI-ModelScope/train_2M_CN"
   },
   "belle_1m": {
-    "hf_hub_url": "BelleGroup/train_1M_CN"
+    "hf_hub_url": "BelleGroup/train_1M_CN",
+    "ms_hub_url": "AI-ModelScope/train_1M_CN"
   },
   "belle_0.5m": {
-    "hf_hub_url": "BelleGroup/train_0.5M_CN"
+    "hf_hub_url": "BelleGroup/train_0.5M_CN",
+    "ms_hub_url": "AI-ModelScope/train_0.5M_CN"
   },
   "belle_dialog": {
-    "hf_hub_url": "BelleGroup/generated_chat_0.4M"
+    "hf_hub_url": "BelleGroup/generated_chat_0.4M",
+    "ms_hub_url": "AI-ModelScope/generated_chat_0.4M"
   },
   "belle_math": {
-    "hf_hub_url": "BelleGroup/school_math_0.25M"
+    "hf_hub_url": "BelleGroup/school_math_0.25M",
+    "ms_hub_url": "AI-ModelScope/school_math_0.25M"
   },
   "belle_multiturn": {
     "script_url": "belle_multiturn",
@@ -95,10 +101,12 @@
     "formatting": "sharegpt"
   },
   "open_platypus": {
-    "hf_hub_url": "garage-bAInd/Open-Platypus"
+    "hf_hub_url": "garage-bAInd/Open-Platypus",
+    "ms_hub_url": "AI-ModelScope/Open-Platypus"
   },
   "codealpaca": {
-    "hf_hub_url": "sahil2801/CodeAlpaca-20k"
+    "hf_hub_url": "sahil2801/CodeAlpaca-20k",
+    "ms_hub_url": "AI-ModelScope/CodeAlpaca-20k"
   },
   "alpaca_cot": {
     "hf_hub_url": "QingyiSi/Alpaca-CoT"
@@ -112,6 +120,7 @@
   },
   "mathinstruct": {
     "hf_hub_url": "TIGER-Lab/MathInstruct",
+    "ms_hub_url": "AI-ModelScope/MathInstruct",
     "columns": {
       "prompt": "instruction",
       "response": "output"
@@ -126,13 +135,15 @@
   },
   "webqa": {
     "hf_hub_url": "suolyer/webqa",
+    "ms_hub_url": "AI-ModelScope/webqa",
     "columns": {
       "prompt": "input",
       "response": "output"
     }
   },
   "webnovel": {
-    "hf_hub_url": "zxbsmk/webnovel_cn"
+    "hf_hub_url": "zxbsmk/webnovel_cn",
+    "ms_hub_url": "AI-ModelScope/webnovel_cn"
   },
   "nectar_sft": {
     "hf_hub_url": "mlinmg/SFT-Nectar"
@@ -146,10 +157,12 @@
   },
   "sharegpt_hyper": {
     "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k",
+    "ms_hub_url": "AI-ModelScope/sharegpt-hyperfiltered-3k",
     "formatting": "sharegpt"
   },
   "sharegpt4": {
     "hf_hub_url": "shibing624/sharegpt_gpt4",
+    "ms_hub_url": "AI-ModelScope/sharegpt_gpt4",
     "formatting": "sharegpt"
   },
   "ultrachat_200k": {
@@ -176,6 +189,7 @@
   },
   "evol_instruct": {
     "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
+    "ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
     "formatting": "sharegpt"
   },
   "hh_rlhf_en": {
@@ -251,6 +265,7 @@
   },
   "wikipedia_zh": {
     "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
+    "ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered",
     "columns": {
       "prompt": "completion"
     }

View File

@@ -24,7 +24,7 @@ def get_dataset(
     for dataset_attr in data_args.dataset_list:
         logger.info("Loading dataset {}...".format(dataset_attr))
 
-        if dataset_attr.load_from == "hf_hub":
+        if dataset_attr.load_from in ("hf_hub", "ms_hub"):
             data_path = dataset_attr.dataset_name
             data_name = dataset_attr.subset
             data_files = None
@@ -53,6 +53,13 @@
         else:
             raise NotImplementedError
 
-        dataset = load_dataset(
+        if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub":
+            from modelscope import MsDataset
+            dataset = MsDataset.load(
+                dataset_name=data_path,
+                subset_name=data_name,
+            ).to_hf_dataset()
+        else:
+            dataset = load_dataset(
             path=data_path,
             name=data_name,
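When the gate is set and a dataset was resolved from ms_hub, the branch added above loads it through ModelScope's MsDataset and converts it back to a Hugging Face dataset. A standalone sketch of that path, using one of the ms_hub_url ids added in this commit; passing subset_name=None for a dataset without subsets is an assumption:

# Minimal sketch of the ModelScope load path added above (assumes USE_MODELSCOPE_HUB=1
# and that the `modelscope` package is installed).
from modelscope import MsDataset

dataset = MsDataset.load(
    dataset_name="AI-ModelScope/train_2M_CN",  # ms_hub_url of belle_2m in dataset_info.json
    subset_name=None,                          # assumed: this dataset defines no subset
).to_hf_dataset()                              # convert to a `datasets` object, as in the diff
print(dataset)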

View File

@@ -2,7 +2,9 @@ import os
 import json
 from typing import List, Literal, Optional
 from dataclasses import dataclass, field
+from llmtuner.extras.logging import get_logger
 
+logger = get_logger(__name__)
 
 DATA_CONFIG = "dataset_info.json"
 
@@ -152,8 +154,17 @@ class DataArguments:
             if name not in dataset_info:
                 raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
 
-            if "hf_hub_url" in dataset_info[name]:
-                dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
+            if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]:
+                url_key_name = "hf_hub_url"
+                if int(os.environ.get('USE_MODELSCOPE_HUB', '0')):
+                    if 'ms_hub_url' in dataset_info[name]:
+                        url_key_name = 'ms_hub_url'
+                    else:
+                        logger.warning('You are using ModelScope Hub, but the specified dataset '
+                                       'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.')
+
+                dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')],
+                                           dataset_name=dataset_info[name][url_key_name])
             elif "script_url" in dataset_info[name]:
                 dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
             else:
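The selection logic above picks ms_hub_url only when the ModelScope gate is set, warns and falls back to hf_hub_url otherwise, and derives the load_from tag by stripping the trailing "_url" from the chosen key. A self-contained sketch of that resolution; the dataset_info dict here is illustrative, real entries come from dataset_info.json:

# Standalone sketch of the hf_hub_url / ms_hub_url resolution above.
import os

dataset_info = {
    "belle_2m": {
        "hf_hub_url": "BelleGroup/train_2M_CN",
        "ms_hub_url": "AI-ModelScope/train_2M_CN"
    }
}

name = "belle_2m"
url_key_name = "hf_hub_url"
if int(os.environ.get("USE_MODELSCOPE_HUB", "0")) and "ms_hub_url" in dataset_info[name]:
    url_key_name = "ms_hub_url"

load_from = url_key_name[:url_key_name.index("_url")]  # "hf_hub" or "ms_hub"
dataset_name = dataset_info[name][url_key_name]
print(load_from, dataset_name)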