diff --git a/data/dataset_info.json b/data/dataset_info.json index 1896d94d..02664af7 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -69,22 +69,28 @@ } }, "guanaco": { - "hf_hub_url": "JosephusCheung/GuanacoDataset" + "hf_hub_url": "JosephusCheung/GuanacoDataset", + "ms_hub_url": "AI-ModelScope/GuanacoDataset" }, "belle_2m": { - "hf_hub_url": "BelleGroup/train_2M_CN" + "hf_hub_url": "BelleGroup/train_2M_CN", + "ms_hub_url": "AI-ModelScope/train_2M_CN" }, "belle_1m": { - "hf_hub_url": "BelleGroup/train_1M_CN" + "hf_hub_url": "BelleGroup/train_1M_CN", + "ms_hub_url": "AI-ModelScope/train_1M_CN" }, "belle_0.5m": { - "hf_hub_url": "BelleGroup/train_0.5M_CN" + "hf_hub_url": "BelleGroup/train_0.5M_CN", + "ms_hub_url": "AI-ModelScope/train_0.5M_CN" }, "belle_dialog": { - "hf_hub_url": "BelleGroup/generated_chat_0.4M" + "hf_hub_url": "BelleGroup/generated_chat_0.4M", + "ms_hub_url": "AI-ModelScope/generated_chat_0.4M" }, "belle_math": { - "hf_hub_url": "BelleGroup/school_math_0.25M" + "hf_hub_url": "BelleGroup/school_math_0.25M", + "ms_hub_url": "AI-ModelScope/school_math_0.25M" }, "belle_multiturn": { "script_url": "belle_multiturn", @@ -95,16 +101,19 @@ "formatting": "sharegpt" }, "open_platypus": { - "hf_hub_url": "garage-bAInd/Open-Platypus" + "hf_hub_url": "garage-bAInd/Open-Platypus", + "ms_hub_url": "AI-ModelScope/Open-Platypus" }, "codealpaca": { - "hf_hub_url": "sahil2801/CodeAlpaca-20k" + "hf_hub_url": "sahil2801/CodeAlpaca-20k", + "ms_hub_url": "AI-ModelScope/CodeAlpaca-20k" }, "alpaca_cot": { "hf_hub_url": "QingyiSi/Alpaca-CoT" }, "openorca": { "hf_hub_url": "Open-Orca/OpenOrca", + "ms_hub_url": "AI-ModelScope/OpenOrca", "columns": { "prompt": "question", "response": "response" @@ -112,6 +121,7 @@ }, "mathinstruct": { "hf_hub_url": "TIGER-Lab/MathInstruct", + "ms_hub_url": "AI-ModelScope/MathInstruct", "columns": { "prompt": "instruction", "response": "output" @@ -126,19 +136,22 @@ }, "webqa": { "hf_hub_url": "suolyer/webqa", + "ms_hub_url": "AI-ModelScope/webqa", "columns": { "prompt": "input", "response": "output" } }, "webnovel": { - "hf_hub_url": "zxbsmk/webnovel_cn" + "hf_hub_url": "zxbsmk/webnovel_cn", + "ms_hub_url": "AI-ModelScope/webnovel_cn" }, "nectar_sft": { "hf_hub_url": "mlinmg/SFT-Nectar" }, "adgen": { "hf_hub_url": "HasturOfficial/adgen", + "ms_hub_url": "AI-ModelScope/adgen", "columns": { "prompt": "content", "response": "summary" @@ -150,10 +163,12 @@ }, "sharegpt4": { "hf_hub_url": "shibing624/sharegpt_gpt4", + "ms_hub_url": "AI-ModelScope/sharegpt_gpt4", "formatting": "sharegpt" }, "ultrachat_200k": { "hf_hub_url": "HuggingFaceH4/ultrachat_200k", + "ms_hub_url": "AI-ModelScope/ultrachat_200k", "columns": { "prompt": "messages", "query": "role", @@ -167,6 +182,7 @@ }, "lmsys_chat": { "hf_hub_url": "lmsys/lmsys-chat-1m", + "ms_hub_url": "AI-ModelScope/lmsys-chat-1m", "columns": { "prompt": "conversation", "query": "role", @@ -251,6 +267,7 @@ }, "wikipedia_zh": { "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered", + "ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered", "columns": { "prompt": "completion" } diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index d5a7a588..75113cb3 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -25,7 +25,7 @@ def get_dataset( logger.info("Loading dataset {}...".format(dataset_attr)) data_path, data_name, data_dir, data_files = None, None, None, None - if dataset_attr.load_from == "hf_hub": + if dataset_attr.load_from in ("hf_hub", "ms_hub"): data_path = dataset_attr.dataset_name data_name = dataset_attr.subset data_dir = dataset_attr.folder @@ -53,16 +53,30 @@ def get_dataset( else: raise NotImplementedError - dataset = load_dataset( - path=data_path, - name=data_name, - data_dir=data_dir, - data_files=data_files, - split=data_args.split, - cache_dir=model_args.cache_dir, - token=model_args.hf_hub_token, - streaming=(data_args.streaming and (dataset_attr.load_from != "file")) - ) + if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub": + from modelscope import MsDataset + from modelscope.utils.config_ds import MS_DATASETS_CACHE + cache_dir = model_args.cache_dir or MS_DATASETS_CACHE + + dataset = MsDataset.load( + dataset_name=data_path, + subset_name=data_name, + split=data_args.split, + data_files=data_files, + cache_dir=cache_dir, + token=model_args.ms_hub_token, + use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")), + ).to_hf_dataset() + else: + dataset = load_dataset( + path=data_path, + name=data_name, + data_files=data_files, + split=data_args.split, + cache_dir=model_args.cache_dir, + token=model_args.hf_hub_token, + streaming=(data_args.streaming and (dataset_attr.load_from != "file")) + ) if data_args.streaming and (dataset_attr.load_from == "file"): dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py index da9be11b..b5ed3f99 100644 --- a/src/llmtuner/hparams/data_args.py +++ b/src/llmtuner/hparams/data_args.py @@ -2,7 +2,9 @@ import os import json from typing import List, Literal, Optional from dataclasses import dataclass, field +from llmtuner.extras.logging import get_logger +logger = get_logger(__name__) DATA_CONFIG = "dataset_info.json" @@ -153,8 +155,17 @@ class DataArguments: if name not in dataset_info: raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG)) - if "hf_hub_url" in dataset_info[name]: - dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"]) + if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]: + url_key_name = "hf_hub_url" + if int(os.environ.get('USE_MODELSCOPE_HUB', '0')): + if 'ms_hub_url' in dataset_info[name]: + url_key_name = 'ms_hub_url' + else: + logger.warning('You are using ModelScope Hub, but the specified dataset ' + 'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.') + + dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')], + dataset_name=dataset_info[name][url_key_name]) elif "script_url" in dataset_info[name]: dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"]) else: diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index 07903b37..6ba37431 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -59,6 +59,10 @@ class ModelArguments: default=None, metadata={"help": "Auth token to log in with Hugging Face Hub."} ) + ms_hub_token: Optional[str] = field( + default=None, + metadata={"help": "Auth token to log in with ModelScope Hub."} + ) def __post_init__(self): self.compute_dtype = None