mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-08-22 13:42:51 +08:00)

commit cefc0b2f03 (parent: 0091af79b2)

    fix modelscope data hub

    Former-commit-id: d5b2c57a356539df9993e4774b856231eca8a6da

@@ -109,7 +109,8 @@
     "ms_hub_url": "AI-ModelScope/CodeAlpaca-20k"
   },
   "alpaca_cot": {
-    "hf_hub_url": "QingyiSi/Alpaca-CoT"
+    "hf_hub_url": "QingyiSi/Alpaca-CoT",
+    "ms_hub_url": "AI-ModelScope/Alpaca-CoT"
   },
   "openorca": {
     "hf_hub_url": "Open-Orca/OpenOrca",

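Note that `alpaca_cot` now carries both an `hf_hub_url` and an `ms_hub_url`; which hub is used is decided at load time from the `USE_MODELSCOPE_HUB` environment variable (see the `class DataArguments` hunk below). A minimal sketch of switching the hub on, assuming the flag is read exactly as in this commit:

    import os

    # Equivalent to `export USE_MODELSCOPE_HUB=1` in the shell: any entry
    # that defines "ms_hub_url", such as "alpaca_cot" above, then resolves
    # to the ModelScope repository instead of the Hugging Face one.
    os.environ["USE_MODELSCOPE_HUB"] = "1"
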
@@ -170,23 +171,23 @@
     "hf_hub_url": "HuggingFaceH4/ultrachat_200k",
     "ms_hub_url": "AI-ModelScope/ultrachat_200k",
     "columns": {
-      "prompt": "messages",
-      "query": "role",
-      "response": "content"
+      "messages": "messages",
+      "role": "role",
+      "content": "content"
     },
     "formatting": "sharegpt"
   },
   "agent_instruct": {
     "hf_hub_url": "THUDM/AgentInstruct",
+    "ms_hub_url": "ZhipuAI/AgentInstruct",
     "formatting": "sharegpt"
   },
   "lmsys_chat": {
     "hf_hub_url": "lmsys/lmsys-chat-1m",
-    "ms_hub_url": "AI-ModelScope/lmsys-chat-1m",
     "columns": {
-      "prompt": "conversation",
-      "query": "role",
-      "response": "content"
+      "messages": "conversation",
+      "role": "role",
+      "content": "content"
     },
     "formatting": "sharegpt"
   },

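These key renames track the sharegpt formatting: such datasets store whole conversations, so the column mapping now names the conversation column (`messages`) and the per-message tag keys (`role`, `content`) instead of reusing the alpaca-style `prompt`/`query`/`response` slots. A rough sketch of a record in that shape (values are illustrative, not taken from the real datasets):

    # One sharegpt-style record: a list of messages, each tagged with a role.
    record = {
        "messages": [
            {"role": "user", "content": "What is a tensor?"},
            {"role": "assistant", "content": "A tensor is a multi-dimensional array."},
        ]
    }

    # "messages" names the conversation column, while "role" and "content"
    # name the keys inside each message, hence the new mapping keys.
    for message in record["messages"]:
        print(message["role"], ":", message["content"])
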
@@ -287,12 +288,14 @@
   },
   "the_stack": {
     "hf_hub_url": "bigcode/the-stack",
+    "ms_hub_url": "AI-ModelScope/the-stack",
     "columns": {
       "prompt": "content"
     }
   },
   "starcoder_python": {
     "hf_hub_url": "bigcode/starcoderdata",
+    "ms_hub_url": "AI-ModelScope/starcoderdata",
     "columns": {
       "prompt": "content"
     },

@@ -25,7 +25,7 @@ def get_dataset(
         logger.info("Loading dataset {}...".format(dataset_attr))

         data_path, data_name, data_dir, data_files = None, None, None, None
-        if dataset_attr.load_from in ("hf_hub", "ms_hub"):
+        if dataset_attr.load_from in ["hf_hub", "ms_hub"]:
             data_path = dataset_attr.dataset_name
             data_name = dataset_attr.subset
             data_dir = dataset_attr.folder

@@ -53,24 +53,29 @@ def get_dataset(
         else:
             raise NotImplementedError

-        if int(os.environ.get('USE_MODELSCOPE_HUB', '0')) and dataset_attr.load_from == "ms_hub":
-            from modelscope import MsDataset
-            from modelscope.utils.config_ds import MS_DATASETS_CACHE
-            cache_dir = model_args.cache_dir or MS_DATASETS_CACHE
-
-            dataset = MsDataset.load(
-                dataset_name=data_path,
-                subset_name=data_name,
-                split=data_args.split,
-                data_files=data_files,
-                cache_dir=cache_dir,
-                token=model_args.ms_hub_token,
-                use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
-            ).to_hf_dataset()
+        if dataset_attr.load_from == "ms_hub":
+            try:
+                from modelscope import MsDataset # type: ignore
+                from modelscope.utils.config_ds import MS_DATASETS_CACHE # type: ignore
+
+                cache_dir = model_args.cache_dir or MS_DATASETS_CACHE
+                dataset = MsDataset.load(
+                    dataset_name=data_path,
+                    subset_name=data_name,
+                    data_dir=data_dir,
+                    data_files=data_files,
+                    split=data_args.split,
+                    cache_dir=cache_dir,
+                    token=model_args.ms_hub_token,
+                    use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
+                ).to_hf_dataset()
+            except ImportError:
+                raise ImportError("Please install modelscope via `pip install modelscope -U`")
         else:
             dataset = load_dataset(
                 path=data_path,
                 name=data_name,
+                data_dir=data_dir,
                 data_files=data_files,
                 split=data_args.split,
                 cache_dir=model_args.cache_dir,

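The rewritten branch guards the ModelScope import in a try/except so that installs without `modelscope` fail with an actionable message, and it now forwards `data_dir` to both hubs. A standalone sketch of the same call pattern, using only the arguments shown in this hunk (the dataset id and split name are illustrative):

    try:
        from modelscope import MsDataset  # type: ignore
        from modelscope.utils.config_ds import MS_DATASETS_CACHE  # type: ignore
    except ImportError:
        raise ImportError("Please install modelscope via `pip install modelscope -U`")

    # Load a ModelScope dataset and convert it to a Hugging Face
    # `datasets.Dataset`, mirroring the branch above.
    dataset = MsDataset.load(
        dataset_name="AI-ModelScope/ultrachat_200k",  # illustrative dataset id
        split="train",                                # illustrative split name
        cache_dir=MS_DATASETS_CACHE,
    ).to_hf_dataset()
    print(dataset)
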
@@ -2,17 +2,19 @@ import os
 import json
 from typing import List, Literal, Optional
 from dataclasses import dataclass, field
-from llmtuner.extras.logging import get_logger

-logger = get_logger(__name__)

 DATA_CONFIG = "dataset_info.json"


+def use_modelscope() -> bool:
+    return bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0")))
+
+
 @dataclass
 class DatasetAttr:

-    load_from: str
+    load_from: Literal["hf_hub", "ms_hub", "script", "file"]
     dataset_name: Optional[str] = None
     dataset_sha1: Optional[str] = None
     system_prompt: Optional[str] = None

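`use_modelscope()` centralizes the `USE_MODELSCOPE_HUB` parsing that the previous commit inlined at each call site, and the `Literal` type now documents every legal `load_from` value. A quick check of how the helper behaves, assuming the same semantics as above:

    import os

    def use_modelscope() -> bool:
        return bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0")))

    os.environ["USE_MODELSCOPE_HUB"] = "1"
    assert use_modelscope() is True

    os.environ["USE_MODELSCOPE_HUB"] = "0"
    assert use_modelscope() is False

    del os.environ["USE_MODELSCOPE_HUB"]
    assert use_modelscope() is False  # unset falls back to "0"
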
@@ -155,19 +157,25 @@ class DataArguments:
             if name not in dataset_info:
                 raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))

-            if "hf_hub_url" in dataset_info[name] or 'ms_hub_url' in dataset_info[name]:
-                url_key_name = "hf_hub_url"
-                if int(os.environ.get('USE_MODELSCOPE_HUB', '0')):
-                    if 'ms_hub_url' in dataset_info[name]:
-                        url_key_name = 'ms_hub_url'
-                    else:
-                        logger.warning('You are using ModelScope Hub, but the specified dataset '
-                                       'has no `ms_hub_url` key, so `hf_hub_url` will be used instead.')
+            has_hf_url = "hf_hub_url" in dataset_info[name]
+            has_ms_url = "ms_hub_url" in dataset_info[name]

-                dataset_attr = DatasetAttr(url_key_name[:url_key_name.index('_url')],
-                                           dataset_name=dataset_info[name][url_key_name])
+            if has_hf_url or has_ms_url:
+                if (use_modelscope() and has_ms_url) or (not has_hf_url):
+                    dataset_attr = DatasetAttr(
+                        "ms_hub",
+                        dataset_name=dataset_info[name]["ms_hub_url"]
+                    )
+                else:
+                    dataset_attr = DatasetAttr(
+                        "hf_hub",
+                        dataset_name=dataset_info[name]["hf_hub_url"]
+                    )
             elif "script_url" in dataset_info[name]:
-                dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
+                dataset_attr = DatasetAttr(
+                    "script",
+                    dataset_name=dataset_info[name]["script_url"]
+                )
             else:
                 dataset_attr = DatasetAttr(
                     "file",

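The rewrite also changes the fallback semantics: previously a ModelScope-only dataset still required the env flag (and the HF branch would then KeyError on the missing `hf_hub_url`); now ModelScope wins whenever the flag is on and an `ms_hub_url` exists, or when no `hf_hub_url` is present at all. A compact restatement of the decision with a hypothetical helper, to make the truth table explicit:

    def pick_hub(has_hf_url: bool, has_ms_url: bool, modelscope_enabled: bool) -> str:
        """Replicates the branch above for a single dataset entry."""
        if (modelscope_enabled and has_ms_url) or (not has_hf_url):
            return "ms_hub"
        return "hf_hub"

    # Flag off: HF wins whenever an hf_hub_url exists.
    assert pick_hub(has_hf_url=True, has_ms_url=True, modelscope_enabled=False) == "hf_hub"
    # Flag on: ModelScope wins when an ms_hub_url exists.
    assert pick_hub(has_hf_url=True, has_ms_url=True, modelscope_enabled=True) == "ms_hub"
    # ModelScope-only entries resolve to ms_hub regardless of the flag.
    assert pick_hub(has_hf_url=False, has_ms_url=True, modelscope_enabled=False) == "ms_hub"
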
@@ -66,8 +66,8 @@ def init_adapter(

     if model_args.checkpoint_dir is not None:
         is_mergeable = True
-        if getattr(model, "quantization_method", None) == "gptq":
-            assert len(model_args.checkpoint_dir) == 1, "GPTQ quantized model only accepts a single checkpoint."
+        if getattr(model, "quantization_method", None): # merge lora in quantized model is unstable
+            assert len(model_args.checkpoint_dir) == 1, "Quantized model only accepts a single checkpoint."
             is_mergeable = False

         if (is_trainable and finetuning_args.resume_lora_training) or (not is_mergeable):

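Broadening the check from `== "gptq"` to any truthy `quantization_method` reflects that merging LoRA weights into a quantized base is unstable regardless of the backend, per the inline comment. A hedged sketch of the check in isolation, with a `SimpleNamespace` stub standing in for the loaded model (real quantized models carry such an attribute; the stub and paths here are hypothetical):

    from types import SimpleNamespace

    model = SimpleNamespace(quantization_method="gptq")  # stub model object
    checkpoint_dir = ["path/to/adapter"]                 # hypothetical checkpoint list

    is_mergeable = True
    if getattr(model, "quantization_method", None):  # any quantized backend
        assert len(checkpoint_dir) == 1, "Quantized model only accepts a single checkpoint."
        is_mergeable = False

    print(is_mergeable)  # False: LoRA weights stay separate for quantized bases
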
@@ -1,4 +1,3 @@
-import os
 import math
 import torch
 from types import MethodType

@@ -13,7 +12,6 @@ from transformers import (
     PreTrainedModel,
     PreTrainedTokenizerBase
 )
-from transformers.models.llama import modeling_llama as LlamaModule
 from transformers.utils.versions import require_version
 from trl import AutoModelForCausalLMWithValueHead


@@ -44,12 +44,12 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "FinetuningArguments"):
     if model_args.quantization_bit is not None and finetuning_args.finetuning_type != "lora":
         raise ValueError("Quantization is only compatible with the LoRA method.")

-    if (
-        model_args.checkpoint_dir is not None
-        and len(model_args.checkpoint_dir) != 1
-        and finetuning_args.finetuning_type != "lora"
-    ):
-        raise ValueError("Multiple checkpoints are only available for LoRA tuning.")
+    if model_args.checkpoint_dir is not None and len(model_args.checkpoint_dir) != 1:
+        if finetuning_args.finetuning_type != "lora":
+            raise ValueError("Multiple checkpoints are only available for LoRA tuning.")
+
+        if model_args.quantization_bit is not None:
+            raise ValueError("Quantized model only accepts a single checkpoint. Merge them first.")


 def parse_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:

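Splitting the combined condition lets `_verify_model_args` reject two distinct misconfigurations with specific messages: multiple checkpoints outside LoRA, and multiple checkpoints on a quantized model. A self-contained sketch of the resulting behavior, using minimal stand-ins for the real argument dataclasses:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class ModelArguments:  # minimal stand-in for the real dataclass
        checkpoint_dir: Optional[List[str]] = None
        quantization_bit: Optional[int] = None

    @dataclass
    class FinetuningArguments:  # minimal stand-in
        finetuning_type: str = "lora"

    def _verify_model_args(model_args: ModelArguments, finetuning_args: FinetuningArguments) -> None:
        if model_args.quantization_bit is not None and finetuning_args.finetuning_type != "lora":
            raise ValueError("Quantization is only compatible with the LoRA method.")

        if model_args.checkpoint_dir is not None and len(model_args.checkpoint_dir) != 1:
            if finetuning_args.finetuning_type != "lora":
                raise ValueError("Multiple checkpoints are only available for LoRA tuning.")

            if model_args.quantization_bit is not None:
                raise ValueError("Quantized model only accepts a single checkpoint. Merge them first.")

    # Two LoRA checkpoints on a quantized model is now rejected explicitly:
    try:
        _verify_model_args(
            ModelArguments(checkpoint_dir=["ckpt-a", "ckpt-b"], quantization_bit=4),
            FinetuningArguments(finetuning_type="lora"),
        )
    except ValueError as err:
        print(err)  # Quantized model only accepts a single checkpoint. Merge them first.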