Merge pull request #1802 from tastelikefeet/feat/support_ms

Support ModelScope Datahub

Former-commit-id: 382319915c3d986e018c1346c638b518bb29a6a3
hoshi-hiyouga committed 2023-12-12 17:58:37 +08:00 (via GitHub)
commit 0091af79b2
4 changed files with 68 additions and 22 deletions
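
In short, the patch adds ModelScope Hub as an alternative dataset source: dataset_info.json entries gain an ms_hub_url field, the loader grows an MsDataset code path, and ModelArguments gains an ms_hub_token field. Everything is gated on the USE_MODELSCOPE_HUB environment variable. A minimal sketch of that switch (the helper name below is illustrative, not part of the patch):

import os

# Any non-zero integer value turns the ModelScope code path on;
# the patch reads this flag in both the data loader and the argument parsing.
def use_modelscope_hub() -> bool:
    return bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0")))

os.environ["USE_MODELSCOPE_HUB"] = "1"   # equivalent to `export USE_MODELSCOPE_HUB=1`
print(use_modelscope_hub())              # True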

View File

@@ -69,22 +69,28 @@
     }
   },
   "guanaco": {
-    "hf_hub_url": "JosephusCheung/GuanacoDataset"
+    "hf_hub_url": "JosephusCheung/GuanacoDataset",
+    "ms_hub_url": "AI-ModelScope/GuanacoDataset"
   },
   "belle_2m": {
-    "hf_hub_url": "BelleGroup/train_2M_CN"
+    "hf_hub_url": "BelleGroup/train_2M_CN",
+    "ms_hub_url": "AI-ModelScope/train_2M_CN"
   },
   "belle_1m": {
-    "hf_hub_url": "BelleGroup/train_1M_CN"
+    "hf_hub_url": "BelleGroup/train_1M_CN",
+    "ms_hub_url": "AI-ModelScope/train_1M_CN"
   },
   "belle_0.5m": {
-    "hf_hub_url": "BelleGroup/train_0.5M_CN"
+    "hf_hub_url": "BelleGroup/train_0.5M_CN",
+    "ms_hub_url": "AI-ModelScope/train_0.5M_CN"
   },
   "belle_dialog": {
-    "hf_hub_url": "BelleGroup/generated_chat_0.4M"
+    "hf_hub_url": "BelleGroup/generated_chat_0.4M",
+    "ms_hub_url": "AI-ModelScope/generated_chat_0.4M"
   },
   "belle_math": {
-    "hf_hub_url": "BelleGroup/school_math_0.25M"
+    "hf_hub_url": "BelleGroup/school_math_0.25M",
+    "ms_hub_url": "AI-ModelScope/school_math_0.25M"
   },
   "belle_multiturn": {
     "script_url": "belle_multiturn",
@@ -95,16 +101,19 @@
     "formatting": "sharegpt"
   },
   "open_platypus": {
-    "hf_hub_url": "garage-bAInd/Open-Platypus"
+    "hf_hub_url": "garage-bAInd/Open-Platypus",
+    "ms_hub_url": "AI-ModelScope/Open-Platypus"
   },
   "codealpaca": {
-    "hf_hub_url": "sahil2801/CodeAlpaca-20k"
+    "hf_hub_url": "sahil2801/CodeAlpaca-20k",
+    "ms_hub_url": "AI-ModelScope/CodeAlpaca-20k"
   },
   "alpaca_cot": {
     "hf_hub_url": "QingyiSi/Alpaca-CoT"
   },
   "openorca": {
     "hf_hub_url": "Open-Orca/OpenOrca",
+    "ms_hub_url": "AI-ModelScope/OpenOrca",
     "columns": {
       "prompt": "question",
       "response": "response"
@@ -112,6 +121,7 @@
   },
   "mathinstruct": {
     "hf_hub_url": "TIGER-Lab/MathInstruct",
+    "ms_hub_url": "AI-ModelScope/MathInstruct",
     "columns": {
       "prompt": "instruction",
       "response": "output"
@@ -126,19 +136,22 @@
   },
   "webqa": {
     "hf_hub_url": "suolyer/webqa",
+    "ms_hub_url": "AI-ModelScope/webqa",
     "columns": {
       "prompt": "input",
       "response": "output"
     }
   },
   "webnovel": {
-    "hf_hub_url": "zxbsmk/webnovel_cn"
+    "hf_hub_url": "zxbsmk/webnovel_cn",
+    "ms_hub_url": "AI-ModelScope/webnovel_cn"
   },
   "nectar_sft": {
     "hf_hub_url": "mlinmg/SFT-Nectar"
   },
   "adgen": {
     "hf_hub_url": "HasturOfficial/adgen",
+    "ms_hub_url": "AI-ModelScope/adgen",
     "columns": {
       "prompt": "content",
       "response": "summary"
@@ -150,10 +163,12 @@
   },
   "sharegpt4": {
     "hf_hub_url": "shibing624/sharegpt_gpt4",
+    "ms_hub_url": "AI-ModelScope/sharegpt_gpt4",
     "formatting": "sharegpt"
   },
   "ultrachat_200k": {
     "hf_hub_url": "HuggingFaceH4/ultrachat_200k",
+    "ms_hub_url": "AI-ModelScope/ultrachat_200k",
     "columns": {
       "prompt": "messages",
       "query": "role",
@@ -167,6 +182,7 @@
   },
   "lmsys_chat": {
     "hf_hub_url": "lmsys/lmsys-chat-1m",
+    "ms_hub_url": "AI-ModelScope/lmsys-chat-1m",
     "columns": {
       "prompt": "conversation",
       "query": "role",
@@ -251,6 +267,7 @@
   },
   "wikipedia_zh": {
     "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
+    "ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered",
     "columns": {
       "prompt": "completion"
     }
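
With these changes, every hub-backed entry carries a ModelScope mirror under the AI-ModelScope organization next to its Hugging Face id; the mirror is only consulted when USE_MODELSCOPE_HUB is set. For reference, the patched guanaco entry is equivalent to the following Python literal:

dataset_info = {
    "guanaco": {
        "hf_hub_url": "JosephusCheung/GuanacoDataset",   # Hugging Face Hub id (unchanged)
        "ms_hub_url": "AI-ModelScope/GuanacoDataset",     # ModelScope mirror added by the patch
    }
}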

View File

@@ -25,7 +25,7 @@ def get_dataset(
     logger.info("Loading dataset {}...".format(dataset_attr))
     data_path, data_name, data_dir, data_files = None, None, None, None
 
-    if dataset_attr.load_from == "hf_hub":
+    if dataset_attr.load_from in ("hf_hub", "ms_hub"):
         data_path = dataset_attr.dataset_name
         data_name = dataset_attr.subset
         data_dir = dataset_attr.folder
@@ -53,16 +53,30 @@ def get_dataset(
     else:
         raise NotImplementedError
 
-    dataset = load_dataset(
-        path=data_path,
-        name=data_name,
-        data_dir=data_dir,
-        data_files=data_files,
-        split=data_args.split,
-        cache_dir=model_args.cache_dir,
-        token=model_args.hf_hub_token,
-        streaming=(data_args.streaming and (dataset_attr.load_from != "file"))
-    )
+    if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub":
+        from modelscope import MsDataset
+        from modelscope.utils.config_ds import MS_DATASETS_CACHE
+        cache_dir = model_args.cache_dir or MS_DATASETS_CACHE
+
+        dataset = MsDataset.load(
+            dataset_name=data_path,
+            subset_name=data_name,
+            split=data_args.split,
+            data_files=data_files,
+            cache_dir=cache_dir,
+            token=model_args.ms_hub_token,
+            use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
+        ).to_hf_dataset()
+    else:
+        dataset = load_dataset(
+            path=data_path,
+            name=data_name,
+            data_files=data_files,
+            split=data_args.split,
+            cache_dir=model_args.cache_dir,
+            token=model_args.hf_hub_token,
+            streaming=(data_args.streaming and (dataset_attr.load_from != "file"))
+        )
 
     if data_args.streaming and (dataset_attr.load_from == "file"):
         dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter
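
The new branch converts the ModelScope dataset back into a Hugging Face Dataset via to_hf_dataset(), so everything downstream of get_dataset() is untouched; the Hugging Face branch keeps hf_hub_token while the ModelScope branch authenticates with the new ms_hub_token. A minimal standalone sketch of that path, assuming the modelscope package is installed (the dataset name is one of the mirrors registered above):

from modelscope import MsDataset

# Rough equivalent of the new ms_hub branch for the guanaco mirror:
# download from ModelScope and convert to a regular Hugging Face Dataset.
dataset = MsDataset.load(
    dataset_name="AI-ModelScope/GuanacoDataset",
    split="train",
).to_hf_dataset()
print(dataset)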

View File

@@ -2,7 +2,9 @@ import os
 import json
 from typing import List, Literal, Optional
 from dataclasses import dataclass, field
+from llmtuner.extras.logging import get_logger
 
 
+logger = get_logger(__name__)
 
 DATA_CONFIG = "dataset_info.json"
@@ -153,8 +155,17 @@ class DataArguments:
             if name not in dataset_info:
                 raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
 
-            if "hf_hub_url" in dataset_info[name]:
-                dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
+            if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]:
+                url_key_name = "hf_hub_url"
+                if int(os.environ.get('USE_MODELSCOPE_HUB', '0')):
+                    if 'ms_hub_url' in dataset_info[name]:
+                        url_key_name = 'ms_hub_url'
+                    else:
+                        logger.warning('You are using ModelScope Hub, but the specified dataset '
+                                       'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.')
+
+                dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')],
+                                           dataset_name=dataset_info[name][url_key_name])
             elif "script_url" in dataset_info[name]:
                 dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
             else:
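
The selection rule added here prefers ms_hub_url only when the ModelScope switch is on and the entry actually defines that key, warning and falling back to hf_hub_url otherwise; the url_key_name[:url_key_name.index('_url')] slice then turns the chosen key into the load_from tag ("hf_hub" or "ms_hub"). A small illustrative helper (not part of the patch) that mirrors the rule:

import os

def resolve_hub(info: dict) -> str:
    # Prefer the ModelScope mirror only when the switch is on and the key exists.
    use_ms = bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0")))
    key = "ms_hub_url" if use_ms and "ms_hub_url" in info else "hf_hub_url"
    # "hf_hub_url" -> "hf_hub", "ms_hub_url" -> "ms_hub"
    return key[: key.index("_url")]

print(resolve_hub({"hf_hub_url": "mlinmg/SFT-Nectar"}))  # hf_hub: no mirror defined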

View File

@@ -59,6 +59,10 @@ class ModelArguments:
         default=None,
         metadata={"help": "Auth token to log in with Hugging Face Hub."}
     )
+    ms_hub_token: Optional[str] = field(
+        default=None,
+        metadata={"help": "Auth token to log in with ModelScope Hub."}
+    )
 
     def __post_init__(self):
         self.compute_dtype = None
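
Since ModelArguments is parsed with HfArgumentParser like the rest of the hparams, the new field is exposed as a --ms_hub_token command-line flag and forwarded to MsDataset.load(token=...) in the loader. A reduced sketch of how the flag is picked up (the real dataclass has many more fields):

from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser

@dataclass
class ModelArguments:
    # Only the new field is reproduced here for illustration.
    ms_hub_token: Optional[str] = field(
        default=None,
        metadata={"help": "Auth token to log in with ModelScope Hub."}
    )

(model_args,) = HfArgumentParser(ModelArguments).parse_args_into_dataclasses(
    args=["--ms_hub_token", "your-modelscope-token"]
)
print(model_args.ms_hub_token)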