Support sharegpt format and add datasets

Former-commit-id: 202daf8987ccb7523be03ca535b572b5c9e65994
This commit is contained in:
hiyouga
2023-11-02 23:10:04 +08:00
parent 7d13501b94
commit b77c745b1a
6 changed files with 192 additions and 97 deletions

View File

@@ -11,6 +11,7 @@ class DatasetAttr:
dataset_name: Optional[str] = None
dataset_sha1: Optional[str] = None
system_prompt: Optional[str] = None
subset: Optional[str] = None
ranking: Optional[bool] = False
formatting: Optional[Literal["alpaca", "sharegpt"]] = "alpaca"
@@ -155,6 +156,7 @@ class DataArguments:
dataset_attr.response = dataset_info[name]["columns"].get("response", None)
dataset_attr.history = dataset_info[name]["columns"].get("history", None)
dataset_attr.subset = dataset_info[name].get("subset", None)
dataset_attr.ranking = dataset_info[name].get("ranking", False)
dataset_attr.formatting = dataset_info[name].get("formatting", "alpaca")
dataset_attr.system_prompt = prompt_list[i]