fix modelscope data hub

Former-commit-id: d5b2c57a356539df9993e4774b856231eca8a6da
This commit is contained in:
hiyouga 2023-12-12 18:33:06 +08:00
parent 0091af79b2
commit cefc0b2f03
6 changed files with 60 additions and 46 deletions

View File

@ -109,7 +109,8 @@
"ms_hub_url": "AI-ModelScope/CodeAlpaca-20k" "ms_hub_url": "AI-ModelScope/CodeAlpaca-20k"
}, },
"alpaca_cot": { "alpaca_cot": {
"hf_hub_url": "QingyiSi/Alpaca-CoT" "hf_hub_url": "QingyiSi/Alpaca-CoT",
"ms_hub_url": "AI-ModelScope/Alpaca-CoT"
}, },
"openorca": { "openorca": {
"hf_hub_url": "Open-Orca/OpenOrca", "hf_hub_url": "Open-Orca/OpenOrca",
@ -170,23 +171,23 @@
"hf_hub_url": "HuggingFaceH4/ultrachat_200k", "hf_hub_url": "HuggingFaceH4/ultrachat_200k",
"ms_hub_url": "AI-ModelScope/ultrachat_200k", "ms_hub_url": "AI-ModelScope/ultrachat_200k",
"columns": { "columns": {
"prompt": "messages", "messages": "messages",
"query": "role", "role": "role",
"response": "content" "content": "content"
}, },
"formatting": "sharegpt" "formatting": "sharegpt"
}, },
"agent_instruct": { "agent_instruct": {
"hf_hub_url": "THUDM/AgentInstruct", "hf_hub_url": "THUDM/AgentInstruct",
"ms_hub_url": "ZhipuAI/AgentInstruct",
"formatting": "sharegpt" "formatting": "sharegpt"
}, },
"lmsys_chat": { "lmsys_chat": {
"hf_hub_url": "lmsys/lmsys-chat-1m", "hf_hub_url": "lmsys/lmsys-chat-1m",
"ms_hub_url": "AI-ModelScope/lmsys-chat-1m",
"columns": { "columns": {
"prompt": "conversation", "messages": "conversation",
"query": "role", "role": "role",
"response": "content" "content": "content"
}, },
"formatting": "sharegpt" "formatting": "sharegpt"
}, },
@ -287,12 +288,14 @@
}, },
"the_stack": { "the_stack": {
"hf_hub_url": "bigcode/the-stack", "hf_hub_url": "bigcode/the-stack",
"ms_hub_url": "AI-ModelScope/the-stack",
"columns": { "columns": {
"prompt": "content" "prompt": "content"
} }
}, },
"starcoder_python": { "starcoder_python": {
"hf_hub_url": "bigcode/starcoderdata", "hf_hub_url": "bigcode/starcoderdata",
"ms_hub_url": "AI-ModelScope/starcoderdata",
"columns": { "columns": {
"prompt": "content" "prompt": "content"
}, },

View File

@ -25,7 +25,7 @@ def get_dataset(
logger.info("Loading dataset {}...".format(dataset_attr)) logger.info("Loading dataset {}...".format(dataset_attr))
data_path, data_name, data_dir, data_files = None, None, None, None data_path, data_name, data_dir, data_files = None, None, None, None
if dataset_attr.load_from in ("hf_hub", "ms_hub"): if dataset_attr.load_from in ["hf_hub", "ms_hub"]:
data_path = dataset_attr.dataset_name data_path = dataset_attr.dataset_name
data_name = dataset_attr.subset data_name = dataset_attr.subset
data_dir = dataset_attr.folder data_dir = dataset_attr.folder
@ -53,24 +53,29 @@ def get_dataset(
else: else:
raise NotImplementedError raise NotImplementedError
if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub": if dataset_attr.load_from == "ms_hub":
from modelscope import MsDataset try:
from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope import MsDataset # type: ignore
cache_dir = model_args.cache_dir or MS_DATASETS_CACHE from modelscope.utils.config_ds import MS_DATASETS_CACHE # type: ignore
cache_dir = model_args.cache_dir or MS_DATASETS_CACHE
dataset = MsDataset.load( dataset = MsDataset.load(
dataset_name=data_path, dataset_name=data_path,
subset_name=data_name, subset_name=data_name,
split=data_args.split, data_dir=data_dir,
data_files=data_files, data_files=data_files,
split=data_args.split,
cache_dir=cache_dir, cache_dir=cache_dir,
token=model_args.ms_hub_token, token=model_args.ms_hub_token,
use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")), use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
).to_hf_dataset() ).to_hf_dataset()
except ImportError:
raise ImportError("Please install modelscope via `pip install modelscope -U`")
else: else:
dataset = load_dataset( dataset = load_dataset(
path=data_path, path=data_path,
name=data_name, name=data_name,
data_dir=data_dir,
data_files=data_files, data_files=data_files,
split=data_args.split, split=data_args.split,
cache_dir=model_args.cache_dir, cache_dir=model_args.cache_dir,

View File

@ -2,17 +2,19 @@ import os
import json import json
from typing import List, Literal, Optional from typing import List, Literal, Optional
from dataclasses import dataclass, field from dataclasses import dataclass, field
from llmtuner.extras.logging import get_logger
logger = get_logger(__name__)
DATA_CONFIG = "dataset_info.json" DATA_CONFIG = "dataset_info.json"
def use_modelscope() -> bool:
    """Report whether the ModelScope hub is enabled via the environment.

    Reads the ``USE_MODELSCOPE_HUB`` environment variable. Integer strings
    keep their original meaning (``"0"`` disables, any non-zero integer
    enables). Unset or empty values disable the hub. Common boolean words
    (``true``/``yes``/``on``/``y``, case-insensitive) enable it instead of
    raising ``ValueError`` as a bare ``int()`` conversion would.

    Returns:
        True if ModelScope should be used, False otherwise.
    """
    raw_value = os.environ.get("USE_MODELSCOPE_HUB", "0").strip()
    try:
        return bool(int(raw_value))
    except ValueError:
        # Non-integer setting: accept the usual truthy spellings and treat
        # everything else (including the empty string) as "disabled" rather
        # than crashing on a configuration typo.
        return raw_value.lower() in ("true", "yes", "on", "y")
@dataclass @dataclass
class DatasetAttr: class DatasetAttr:
load_from: str load_from: Literal["hf_hub", "ms_hub", "script", "file"]
dataset_name: Optional[str] = None dataset_name: Optional[str] = None
dataset_sha1: Optional[str] = None dataset_sha1: Optional[str] = None
system_prompt: Optional[str] = None system_prompt: Optional[str] = None
@ -155,19 +157,25 @@ class DataArguments:
if name not in dataset_info: if name not in dataset_info:
raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG)) raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]: has_hf_url = "hf_hub_url" in dataset_info[name]
url_key_name = "hf_hub_url" has_ms_url = "ms_hub_url" in dataset_info[name]
if int(os.environ.get('USE_MODELSCOPE_HUB', '0')):
if 'ms_hub_url' in dataset_info[name]:
url_key_name = 'ms_hub_url'
else:
logger.warning('You are using ModelScope Hub, but the specified dataset '
'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.')
dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')], if has_hf_url or has_ms_url:
dataset_name=dataset_info[name][url_key_name]) if (use_modelscope() and has_ms_url) or (not has_hf_url):
dataset_attr = DatasetAttr(
"ms_hub",
dataset_name=dataset_info[name]["ms_hub_url"]
)
else:
dataset_attr = DatasetAttr(
"hf_hub",
dataset_name=dataset_info[name]["hf_hub_url"]
)
elif "script_url" in dataset_info[name]: elif "script_url" in dataset_info[name]:
dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"]) dataset_attr = DatasetAttr(
"script",
dataset_name=dataset_info[name]["script_url"]
)
else: else:
dataset_attr = DatasetAttr( dataset_attr = DatasetAttr(
"file", "file",

View File

@ -66,8 +66,8 @@ def init_adapter(
if model_args.checkpoint_dir is not None: if model_args.checkpoint_dir is not None:
is_mergeable = True is_mergeable = True
if getattr(model, "quantization_method", None) == "gptq": if getattr(model, "quantization_method", None): # merge lora in quantized model is unstable
assert len(model_args.checkpoint_dir) == 1, "GPTQ quantized model only accepts a single checkpoint." assert len(model_args.checkpoint_dir) == 1, "Quantized model only accepts a single checkpoint."
is_mergeable = False is_mergeable = False
if (is_trainable and finetuning_args.resume_lora_training) or (not is_mergeable): if (is_trainable and finetuning_args.resume_lora_training) or (not is_mergeable):

View File

@ -1,4 +1,3 @@
import os
import math import math
import torch import torch
from types import MethodType from types import MethodType
@ -13,7 +12,6 @@ from transformers import (
PreTrainedModel, PreTrainedModel,
PreTrainedTokenizerBase PreTrainedTokenizerBase
) )
from transformers.models.llama import modeling_llama as LlamaModule
from transformers.utils.versions import require_version from transformers.utils.versions import require_version
from trl import AutoModelForCausalLMWithValueHead from trl import AutoModelForCausalLMWithValueHead

View File

@ -44,13 +44,13 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin
if model_args.quantization_bit is not None and finetuning_args.finetuning_type != "lora": if model_args.quantization_bit is not None and finetuning_args.finetuning_type != "lora":
raise ValueError("Quantization is only compatible with the LoRA method.") raise ValueError("Quantization is only compatible with the LoRA method.")
if ( if model_args.checkpoint_dir is not None and len(model_args.checkpoint_dir) != 1:
model_args.checkpoint_dir is not None if finetuning_args.finetuning_type != "lora":
and len(model_args.checkpoint_dir) != 1
and finetuning_args.finetuning_type != "lora"
):
raise ValueError("Multiple checkpoints are only available for LoRA tuning.") raise ValueError("Multiple checkpoints are only available for LoRA tuning.")
if model_args.quantization_bit is not None:
raise ValueError("Quantized model only accepts a single checkpoint. Merge them first.")
def parse_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: def parse_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
parser = HfArgumentParser(_TRAIN_ARGS) parser = HfArgumentParser(_TRAIN_ARGS)