Slim Orca data parsing

Former-commit-id: 8bd41826092f0ac46ad51f91928e88e5964ae1a5
This commit is contained in:
Mark Mueller 2024-02-08 17:52:36 +01:00
parent 842b56666a
commit 16d0bf0317
2 changed files with 21 additions and 15 deletions

View File

@ -53,20 +53,22 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
if len(messages) == 0: if len(messages) == 0:
continue continue
n_sys = 0
prompt = [] prompt = []
response = [] response = []
n_sys = 0
for turn_idx, message in enumerate(messages): for turn_idx, message in enumerate(messages):
accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag, dataset_attr.assistant_tag, dataset_attr.function_tag] if dataset_attr.system_tag and message[dataset_attr.role_tag] == dataset_attr.system_tag:
if message[dataset_attr.role_tag] == "system":
outputs["system"].append(message[dataset_attr.content_tag]) outputs["system"].append(message[dataset_attr.content_tag])
n_sys += 1 n_sys = 1
elif message[dataset_attr.role_tag] not in accept_tags:
print("sytem attr", dataset_attr.system) if (turn_idx - n_sys) % 2 == 0:
print("accepted tags", accept_tags) accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag]
raise ValueError("Invalid role tag in {}.".format(messages))
else: else:
accept_tags = [dataset_attr.assistant_tag, dataset_attr.function_tag]
if message[dataset_attr.role_tag] not in accept_tags:
raise ValueError("Invalid role tag in {}.".format(messages))
prompt.append( prompt.append(
{"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
) )
@ -75,10 +77,10 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
response.append(last_message) response.append(last_message)
outputs["prompt"].append(prompt) outputs["prompt"].append(prompt)
outputs["response"].append(response) outputs["response"].append(response)
if n_sys == 0: if not dataset_attr.system_tag:
outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
assert n_sys <= 1
return outputs return outputs

View File

@ -37,6 +37,10 @@ class DatasetAttr:
assistant_tag: Optional[str] = "gpt" assistant_tag: Optional[str] = "gpt"
observation_tag: Optional[str] = "observation" observation_tag: Optional[str] = "observation"
function_tag: Optional[str] = "function_call" function_tag: Optional[str] = "function_call"
system_tag: Optional[str] = None
assert system_tag is None or system is None, f"Can not provide both system message (system_tag={system_tag}) and system column(system={system})"
def __repr__(self) -> str: def __repr__(self) -> str:
return self.dataset_name return self.dataset_name
@ -95,7 +99,7 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
setattr(dataset_attr, column_name, dataset_info[name]["columns"].get(column_name, None)) setattr(dataset_attr, column_name, dataset_info[name]["columns"].get(column_name, None))
if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]: if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]:
for tag in ["role_tag", "content_tag", "user_tag", "assistant_tag", "observation_tag", "function_tag"]: for tag in ["role_tag", "content_tag", "user_tag", "assistant_tag", "observation_tag", "function_tag", "system_tag"]:
setattr(dataset_attr, tag, dataset_info[name]["tags"].get(tag, None)) setattr(dataset_attr, tag, dataset_info[name]["tags"].get(tag, None))
dataset_list.append(dataset_attr) dataset_list.append(dataset_attr)