mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-08-23 06:12:50 +08:00
Merge pull request #2462 from mnmueller/main
Enable Parsing of SlimOrca
Former-commit-id: 388b705a8dd6b13b20ab4d350b9b788e182e6ecd
This commit is contained in:
commit
04fa6b9a3d
@ -26,7 +26,8 @@ If you are using a custom dataset, please provide your dataset definition in the
|
|||||||
"user_tag": "the value of the role_tag represents the user. (default: human)",
|
"user_tag": "the value of the role_tag represents the user. (default: human)",
|
||||||
"assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
|
"assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
|
||||||
"observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
|
"observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
|
||||||
"function_tag": "the value of the role_tag represents the function call. (default: function_call)"
|
"function_tag": "the value of the role_tag represents the function call. (default: function_call)",
|
||||||
|
"system_tag": "the value of the role_tag represents the system prompt; cannot be used together with the system column. (default: None)"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
@ -126,6 +126,20 @@
|
|||||||
"system": "system_prompt"
|
"system": "system_prompt"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"slimorca": {
|
||||||
|
"hf_hub_url": "Open-Orca/SlimOrca",
|
||||||
|
"formatting": "sharegpt",
|
||||||
|
"columns": {
|
||||||
|
"messages": "conversations"
|
||||||
|
},
|
||||||
|
"tags": {
|
||||||
|
"role_tag": "from",
|
||||||
|
"content_tag": "value",
|
||||||
|
"user_tag": "human",
|
||||||
|
"assistant_tag": "gpt",
|
||||||
|
"system_tag": "system"
|
||||||
|
}
|
||||||
|
},
|
||||||
"intel_orca_dpo_pairs_de" : {
|
"intel_orca_dpo_pairs_de" : {
|
||||||
"hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
|
"hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
|
||||||
"ranking": true
|
"ranking": true
|
||||||
|
@ -47,16 +47,22 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
|
|||||||
dataset_attr.assistant_tag: Role.ASSISTANT,
|
dataset_attr.assistant_tag: Role.ASSISTANT,
|
||||||
dataset_attr.observation_tag: Role.OBSERVATION,
|
dataset_attr.observation_tag: Role.OBSERVATION,
|
||||||
dataset_attr.function_tag: Role.FUNCTION,
|
dataset_attr.function_tag: Role.FUNCTION,
|
||||||
|
dataset_attr.system_tag: Role.SYSTEM,
|
||||||
}
|
}
|
||||||
for i, messages in enumerate(examples[dataset_attr.messages]):
|
for i, messages in enumerate(examples[dataset_attr.messages]):
|
||||||
messages = messages[: len(messages) // 2 * 2] # should be multiples of 2
|
if len(messages) <= 1:
|
||||||
if len(messages) == 0:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
prompt = []
|
prompt = []
|
||||||
response = []
|
response = []
|
||||||
|
n_sys = 0
|
||||||
for turn_idx, message in enumerate(messages):
|
for turn_idx, message in enumerate(messages):
|
||||||
if turn_idx % 2 == 0:
|
if dataset_attr.system_tag and message[dataset_attr.role_tag] == dataset_attr.system_tag:
|
||||||
|
outputs["system"].append(message[dataset_attr.content_tag])
|
||||||
|
n_sys = 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
if (turn_idx - n_sys) % 2 == 0:
|
||||||
accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag]
|
accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag]
|
||||||
else:
|
else:
|
||||||
accept_tags = [dataset_attr.assistant_tag, dataset_attr.function_tag]
|
accept_tags = [dataset_attr.assistant_tag, dataset_attr.function_tag]
|
||||||
@ -68,10 +74,14 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
|
|||||||
{"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
|
{"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if len(prompt) % 2 == 1:
|
||||||
|
# Last message was neither from assistant nor function
|
||||||
|
prompt.pop(-1)
|
||||||
last_message = prompt.pop(-1)
|
last_message = prompt.pop(-1)
|
||||||
response.append(last_message)
|
response.append(last_message)
|
||||||
outputs["prompt"].append(prompt)
|
outputs["prompt"].append(prompt)
|
||||||
outputs["response"].append(response)
|
outputs["response"].append(response)
|
||||||
|
if n_sys == 0:
|
||||||
outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
|
outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
|
||||||
outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
|
outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
|
||||||
|
|
||||||
|
@ -37,6 +37,10 @@ class DatasetAttr:
|
|||||||
assistant_tag: Optional[str] = "gpt"
|
assistant_tag: Optional[str] = "gpt"
|
||||||
observation_tag: Optional[str] = "observation"
|
observation_tag: Optional[str] = "observation"
|
||||||
function_tag: Optional[str] = "function_call"
|
function_tag: Optional[str] = "function_call"
|
||||||
|
system_tag: Optional[str] = None
|
||||||
|
|
||||||
|
assert system_tag is None or system is None, f"Can not provide both system message (system_tag={system_tag}) and system column(system={system})"
|
||||||
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return self.dataset_name
|
return self.dataset_name
|
||||||
@ -95,7 +99,7 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
|
|||||||
setattr(dataset_attr, column_name, dataset_info[name]["columns"].get(column_name, None))
|
setattr(dataset_attr, column_name, dataset_info[name]["columns"].get(column_name, None))
|
||||||
|
|
||||||
if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]:
|
if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]:
|
||||||
for tag in ["role_tag", "content_tag", "user_tag", "assistant_tag", "observation_tag", "function_tag"]:
|
for tag in ["role_tag", "content_tag", "user_tag", "assistant_tag", "observation_tag", "function_tag", "system_tag"]:
|
||||||
setattr(dataset_attr, tag, dataset_info[name]["tags"].get(tag, None))
|
setattr(dataset_attr, tag, dataset_info[name]["tags"].get(tag, None))
|
||||||
|
|
||||||
dataset_list.append(dataset_attr)
|
dataset_list.append(dataset_attr)
|
||||||
|
@ -21,6 +21,7 @@ class Role(str, Enum):
|
|||||||
ASSISTANT = "assistant"
|
ASSISTANT = "assistant"
|
||||||
OBSERVATION = "observation"
|
OBSERVATION = "observation"
|
||||||
FUNCTION = "function"
|
FUNCTION = "function"
|
||||||
|
SYSTEM = "system"
|
||||||
|
|
||||||
|
|
||||||
def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
|
def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user