support ms dataset

Former-commit-id: 9c2247d700763f480d88a5dd46480cb32cfc174e
This commit is contained in:
yuze.zyz 2023-12-08 18:00:57 +08:00
parent 9a26819a58
commit c523613f0a
3 changed files with 54 additions and 21 deletions

View File

@ -69,22 +69,28 @@
} }
}, },
"guanaco": { "guanaco": {
"hf_hub_url": "JosephusCheung/GuanacoDataset" "hf_hub_url": "JosephusCheung/GuanacoDataset",
"ms_hub_url": "wyj123456/GuanacoDataset"
}, },
"belle_2m": { "belle_2m": {
"hf_hub_url": "BelleGroup/train_2M_CN" "hf_hub_url": "BelleGroup/train_2M_CN",
"ms_hub_url": "AI-ModelScope/train_2M_CN"
}, },
"belle_1m": { "belle_1m": {
"hf_hub_url": "BelleGroup/train_1M_CN" "hf_hub_url": "BelleGroup/train_1M_CN",
"ms_hub_url": "AI-ModelScope/train_1M_CN"
}, },
"belle_0.5m": { "belle_0.5m": {
"hf_hub_url": "BelleGroup/train_0.5M_CN" "hf_hub_url": "BelleGroup/train_0.5M_CN",
"ms_hub_url": "AI-ModelScope/train_0.5M_CN"
}, },
"belle_dialog": { "belle_dialog": {
"hf_hub_url": "BelleGroup/generated_chat_0.4M" "hf_hub_url": "BelleGroup/generated_chat_0.4M",
"ms_hub_url": "AI-ModelScope/generated_chat_0.4M"
}, },
"belle_math": { "belle_math": {
"hf_hub_url": "BelleGroup/school_math_0.25M" "hf_hub_url": "BelleGroup/school_math_0.25M",
"ms_hub_url": "AI-ModelScope/school_math_0.25M"
}, },
"belle_multiturn": { "belle_multiturn": {
"script_url": "belle_multiturn", "script_url": "belle_multiturn",
@ -95,10 +101,12 @@
"formatting": "sharegpt" "formatting": "sharegpt"
}, },
"open_platypus": { "open_platypus": {
"hf_hub_url": "garage-bAInd/Open-Platypus" "hf_hub_url": "garage-bAInd/Open-Platypus",
"ms_hub_url": "AI-ModelScope/Open-Platypus"
}, },
"codealpaca": { "codealpaca": {
"hf_hub_url": "sahil2801/CodeAlpaca-20k" "hf_hub_url": "sahil2801/CodeAlpaca-20k",
"ms_hub_url": "AI-ModelScope/CodeAlpaca-20k"
}, },
"alpaca_cot": { "alpaca_cot": {
"hf_hub_url": "QingyiSi/Alpaca-CoT" "hf_hub_url": "QingyiSi/Alpaca-CoT"
@ -112,6 +120,7 @@
}, },
"mathinstruct": { "mathinstruct": {
"hf_hub_url": "TIGER-Lab/MathInstruct", "hf_hub_url": "TIGER-Lab/MathInstruct",
"ms_hub_url": "AI-ModelScope/MathInstruct",
"columns": { "columns": {
"prompt": "instruction", "prompt": "instruction",
"response": "output" "response": "output"
@ -126,13 +135,15 @@
}, },
"webqa": { "webqa": {
"hf_hub_url": "suolyer/webqa", "hf_hub_url": "suolyer/webqa",
"ms_hub_url": "AI-ModelScope/webqa",
"columns": { "columns": {
"prompt": "input", "prompt": "input",
"response": "output" "response": "output"
} }
}, },
"webnovel": { "webnovel": {
"hf_hub_url": "zxbsmk/webnovel_cn" "hf_hub_url": "zxbsmk/webnovel_cn",
"ms_hub_url": "AI-ModelScope/webnovel_cn"
}, },
"nectar_sft": { "nectar_sft": {
"hf_hub_url": "mlinmg/SFT-Nectar" "hf_hub_url": "mlinmg/SFT-Nectar"
@ -146,10 +157,12 @@
}, },
"sharegpt_hyper": { "sharegpt_hyper": {
"hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k", "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k",
"ms_hub_url": "AI-ModelScope/sharegpt-hyperfiltered-3k",
"formatting": "sharegpt" "formatting": "sharegpt"
}, },
"sharegpt4": { "sharegpt4": {
"hf_hub_url": "shibing624/sharegpt_gpt4", "hf_hub_url": "shibing624/sharegpt_gpt4",
"ms_hub_url": "AI-ModelScope/sharegpt_gpt4",
"formatting": "sharegpt" "formatting": "sharegpt"
}, },
"ultrachat_200k": { "ultrachat_200k": {
@ -176,6 +189,7 @@
}, },
"evol_instruct": { "evol_instruct": {
"hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k", "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
"ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
"formatting": "sharegpt" "formatting": "sharegpt"
}, },
"hh_rlhf_en": { "hh_rlhf_en": {
@ -251,6 +265,7 @@
}, },
"wikipedia_zh": { "wikipedia_zh": {
"hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered", "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
"ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered",
"columns": { "columns": {
"prompt": "completion" "prompt": "completion"
} }

View File

@ -24,7 +24,7 @@ def get_dataset(
for dataset_attr in data_args.dataset_list: for dataset_attr in data_args.dataset_list:
logger.info("Loading dataset {}...".format(dataset_attr)) logger.info("Loading dataset {}...".format(dataset_attr))
if dataset_attr.load_from == "hf_hub": if dataset_attr.load_from in ("hf_hub", "ms_hub"):
data_path = dataset_attr.dataset_name data_path = dataset_attr.dataset_name
data_name = dataset_attr.subset data_name = dataset_attr.subset
data_files = None data_files = None
@ -53,6 +53,13 @@ def get_dataset(
else: else:
raise NotImplementedError raise NotImplementedError
if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub":
from modelscope import MsDataset
dataset = MsDataset.load(
dataset_name=data_path,
subset_name=data_name,
).to_hf_dataset()
else:
dataset = load_dataset( dataset = load_dataset(
path=data_path, path=data_path,
name=data_name, name=data_name,

View File

@ -2,7 +2,9 @@ import os
import json import json
from typing import List, Literal, Optional from typing import List, Literal, Optional
from dataclasses import dataclass, field from dataclasses import dataclass, field
from llmtuner.extras.logging import get_logger
logger = get_logger(__name__)
DATA_CONFIG = "dataset_info.json" DATA_CONFIG = "dataset_info.json"
@ -152,8 +154,17 @@ class DataArguments:
if name not in dataset_info: if name not in dataset_info:
raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG)) raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
if "hf_hub_url" in dataset_info[name]: if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]:
dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"]) url_key_name = "hf_hub_url"
if int(os.environ.get('USE_MODELSCOPE_HUB', '0')):
if 'ms_hub_url' in dataset_info[name]:
url_key_name = 'ms_hub_url'
else:
logger.warning('You are using ModelScope Hub, but the specified dataset '
'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.')
dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')],
dataset_name=dataset_info[name][url_key_name])
elif "script_url" in dataset_info[name]: elif "script_url" in dataset_info[name]:
dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"]) dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
else: else: