Merge pull request #1802 from tastelikefeet/feat/support_ms

Support ModelScope Datahub

Former-commit-id: 382319915c3d986e018c1346c638b518bb29a6a3
This commit is contained in:
hoshi-hiyouga 2023-12-12 17:58:37 +08:00 committed by GitHub
commit 0091af79b2
4 changed files with 68 additions and 22 deletions

View File

@ -69,22 +69,28 @@
}
},
"guanaco": {
"hf_hub_url": "JosephusCheung/GuanacoDataset"
"hf_hub_url": "JosephusCheung/GuanacoDataset",
"ms_hub_url": "AI-ModelScope/GuanacoDataset"
},
"belle_2m": {
"hf_hub_url": "BelleGroup/train_2M_CN"
"hf_hub_url": "BelleGroup/train_2M_CN",
"ms_hub_url": "AI-ModelScope/train_2M_CN"
},
"belle_1m": {
"hf_hub_url": "BelleGroup/train_1M_CN"
"hf_hub_url": "BelleGroup/train_1M_CN",
"ms_hub_url": "AI-ModelScope/train_1M_CN"
},
"belle_0.5m": {
"hf_hub_url": "BelleGroup/train_0.5M_CN"
"hf_hub_url": "BelleGroup/train_0.5M_CN",
"ms_hub_url": "AI-ModelScope/train_0.5M_CN"
},
"belle_dialog": {
"hf_hub_url": "BelleGroup/generated_chat_0.4M"
"hf_hub_url": "BelleGroup/generated_chat_0.4M",
"ms_hub_url": "AI-ModelScope/generated_chat_0.4M"
},
"belle_math": {
"hf_hub_url": "BelleGroup/school_math_0.25M"
"hf_hub_url": "BelleGroup/school_math_0.25M",
"ms_hub_url": "AI-ModelScope/school_math_0.25M"
},
"belle_multiturn": {
"script_url": "belle_multiturn",
@ -95,16 +101,19 @@
"formatting": "sharegpt"
},
"open_platypus": {
"hf_hub_url": "garage-bAInd/Open-Platypus"
"hf_hub_url": "garage-bAInd/Open-Platypus",
"ms_hub_url": "AI-ModelScope/Open-Platypus"
},
"codealpaca": {
"hf_hub_url": "sahil2801/CodeAlpaca-20k"
"hf_hub_url": "sahil2801/CodeAlpaca-20k",
"ms_hub_url": "AI-ModelScope/CodeAlpaca-20k"
},
"alpaca_cot": {
"hf_hub_url": "QingyiSi/Alpaca-CoT"
},
"openorca": {
"hf_hub_url": "Open-Orca/OpenOrca",
"ms_hub_url": "AI-ModelScope/OpenOrca",
"columns": {
"prompt": "question",
"response": "response"
@ -112,6 +121,7 @@
},
"mathinstruct": {
"hf_hub_url": "TIGER-Lab/MathInstruct",
"ms_hub_url": "AI-ModelScope/MathInstruct",
"columns": {
"prompt": "instruction",
"response": "output"
@ -126,19 +136,22 @@
},
"webqa": {
"hf_hub_url": "suolyer/webqa",
"ms_hub_url": "AI-ModelScope/webqa",
"columns": {
"prompt": "input",
"response": "output"
}
},
"webnovel": {
"hf_hub_url": "zxbsmk/webnovel_cn"
"hf_hub_url": "zxbsmk/webnovel_cn",
"ms_hub_url": "AI-ModelScope/webnovel_cn"
},
"nectar_sft": {
"hf_hub_url": "mlinmg/SFT-Nectar"
},
"adgen": {
"hf_hub_url": "HasturOfficial/adgen",
"ms_hub_url": "AI-ModelScope/adgen",
"columns": {
"prompt": "content",
"response": "summary"
@ -150,10 +163,12 @@
},
"sharegpt4": {
"hf_hub_url": "shibing624/sharegpt_gpt4",
"ms_hub_url": "AI-ModelScope/sharegpt_gpt4",
"formatting": "sharegpt"
},
"ultrachat_200k": {
"hf_hub_url": "HuggingFaceH4/ultrachat_200k",
"ms_hub_url": "AI-ModelScope/ultrachat_200k",
"columns": {
"prompt": "messages",
"query": "role",
@ -167,6 +182,7 @@
},
"lmsys_chat": {
"hf_hub_url": "lmsys/lmsys-chat-1m",
"ms_hub_url": "AI-ModelScope/lmsys-chat-1m",
"columns": {
"prompt": "conversation",
"query": "role",
@ -251,6 +267,7 @@
},
"wikipedia_zh": {
"hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
"ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered",
"columns": {
"prompt": "completion"
}

View File

@ -25,7 +25,7 @@ def get_dataset(
logger.info("Loading dataset {}...".format(dataset_attr))
data_path, data_name, data_dir, data_files = None, None, None, None
if dataset_attr.load_from == "hf_hub":
if dataset_attr.load_from in ("hf_hub", "ms_hub"):
data_path = dataset_attr.dataset_name
data_name = dataset_attr.subset
data_dir = dataset_attr.folder
@ -53,16 +53,30 @@ def get_dataset(
else:
raise NotImplementedError
dataset = load_dataset(
path=data_path,
name=data_name,
data_dir=data_dir,
data_files=data_files,
split=data_args.split,
cache_dir=model_args.cache_dir,
token=model_args.hf_hub_token,
streaming=(data_args.streaming and (dataset_attr.load_from != "file"))
)
# Choose the backend for this dataset. ModelScope is used only when it is
# switched on through the USE_MODELSCOPE_HUB environment variable AND the
# dataset entry was resolved to "ms_hub"; every other case goes through
# `load_dataset`.
if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub":
    # Imported inside the branch so `modelscope` is only needed when this path runs.
    from modelscope import MsDataset
    from modelscope.utils.config_ds import MS_DATASETS_CACHE

    # Use ModelScope's own cache location when no cache_dir is configured.
    cache_dir = model_args.cache_dir or MS_DATASETS_CACHE
    dataset = MsDataset.load(
        dataset_name=data_path,
        subset_name=data_name,
        split=data_args.split,
        data_files=data_files,
        cache_dir=cache_dir,
        token=model_args.ms_hub_token,
        use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
    ).to_hf_dataset()
else:
    dataset = load_dataset(
        path=data_path,
        name=data_name,
        # `data_dir` is filled from `dataset_attr.folder` for hub datasets; it
        # must be forwarded, otherwise the configured folder is silently ignored.
        data_dir=data_dir,
        data_files=data_files,
        split=data_args.split,
        cache_dir=model_args.cache_dir,
        token=model_args.hf_hub_token,
        streaming=(data_args.streaming and (dataset_attr.load_from != "file"))
    )
if data_args.streaming and (dataset_attr.load_from == "file"):
dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter

View File

@ -2,7 +2,9 @@ import os
import json
from typing import List, Literal, Optional
from dataclasses import dataclass, field
from llmtuner.extras.logging import get_logger
logger = get_logger(__name__)
DATA_CONFIG = "dataset_info.json"
@ -153,8 +155,17 @@ class DataArguments:
if name not in dataset_info:
raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
if "hf_hub_url" in dataset_info[name]:
dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]:
url_key_name = "hf_hub_url"
if int(os.environ.get('USE_MODELSCOPE_HUB', '0')):
if 'ms_hub_url' in dataset_info[name]:
url_key_name = 'ms_hub_url'
else:
logger.warning('You are using ModelScope Hub, but the specified dataset '
'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.')
dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')],
dataset_name=dataset_info[name][url_key_name])
elif "script_url" in dataset_info[name]:
dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
else:

View File

@ -59,6 +59,10 @@ class ModelArguments:
default=None,
metadata={"help": "Auth token to log in with Hugging Face Hub."}
)
ms_hub_token: Optional[str] = field(
default=None,
metadata={"help": "Auth token to log in with ModelScope Hub."}
)
def __post_init__(self):
self.compute_dtype = None