Merge branch 'main' into feat/support_ms

Former-commit-id: 6382efec52
This commit is contained in:
hoshi-hiyouga
2023-12-12 17:55:32 +08:00
committed by GitHub
35 changed files with 662 additions and 370 deletions

View File

@@ -24,27 +24,27 @@ def get_dataset(
for dataset_attr in data_args.dataset_list:
logger.info("Loading dataset {}...".format(dataset_attr))
data_path, data_name, data_dir, data_files = None, None, None, None
if dataset_attr.load_from in ("hf_hub", "ms_hub"):
data_path = dataset_attr.dataset_name
data_name = dataset_attr.subset
data_files = None
data_dir = dataset_attr.folder
elif dataset_attr.load_from == "script":
data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
data_name = dataset_attr.subset
data_files = None
elif dataset_attr.load_from == "file":
data_path, data_name = None, None
data_files: List[str] = []
if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is directory
for file_name in os.listdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)):
data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name, file_name))
data_files = []
local_path: str = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
if os.path.isdir(local_path): # is directory
for file_name in os.listdir(local_path):
data_files.append(os.path.join(local_path, file_name))
if data_path is None:
data_path = EXT2TYPE.get(file_name.split(".")[-1], None)
else:
assert data_path == EXT2TYPE.get(file_name.split(".")[-1], None), "file types are not identical."
elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is file
data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name))
data_path = EXT2TYPE.get(dataset_attr.dataset_name.split(".")[-1], None)
elif os.path.isfile(local_path): # is file
data_files.append(local_path)
data_path = EXT2TYPE.get(local_path.split(".")[-1], None)
else:
raise ValueError("File not found.")

View File

@@ -541,9 +541,7 @@ register_template(
"[INST] {{query}} [/INST]"
],
system="",
sep=[
" "
]
sep=[]
)
@@ -650,6 +648,23 @@ register_template(
)
# Template "xuanyuan": the system text is emitted as the prefix, each turn is
# rendered as "Human: <query> Assistant:", and no separator is inserted.
# The default system prompt (in Chinese) describes a conversation between a
# user (turns start with "Human") and an AI assistant (turns start with
# "Assistant") that gives helpful, high-quality, detailed and polite answers
# and always refuses unethical, unsafe, controversial or politically
# sensitive topics, questions and instructions.
register_template(
    name="xuanyuan",
    prefix=["{{system}}"],
    prompt=["Human: {{query}} Assistant:"],
    system=(
        "以下是用户和人工智能助手之间的对话。用户以Human开头人工智能助手以Assistant开头"
        "会对人类提出的问题给出有帮助、高质量、详细和礼貌的回答,并且总是拒绝参与与不道德、"
        "不安全、有争议、政治敏感等相关的话题、问题和指示。\n"
    ),
    sep=[],
)
register_template(
name="xverse",
prefix=[
@@ -707,6 +722,9 @@ register_template(
sep=[
"<|im_end|>\n"
],
stop_words=[
"<|im_end|>"
],
efficient_eos=True
)