From 96f8ccf3d533ae88275f16d07049d9c2fc07a3d5 Mon Sep 17 00:00:00 2001 From: Mark Mueller Date: Thu, 8 Feb 2024 08:28:32 -0800 Subject: [PATCH 1/5] SlimOrca aligner Former-commit-id: 928dda93867c2327a7957c04648592044ccf9daf --- src/llmtuner/data/aligner.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index 8144141c..5140f9d8 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -53,28 +53,32 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" if len(messages) == 0: continue + n_sys = 0 prompt = [] response = [] for turn_idx, message in enumerate(messages): - if turn_idx % 2 == 0: - accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag] - else: - accept_tags = [dataset_attr.assistant_tag, dataset_attr.function_tag] + accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag, dataset_attr.assistant_tag, dataset_attr.function_tag] - if message[dataset_attr.role_tag] not in accept_tags: + if message[dataset_attr.role_tag] == "system": + outputs["system"].append(message[dataset_attr.content_tag]) + n_sys += 1 + elif message[dataset_attr.role_tag] not in accept_tags: + print("sytem attr", dataset_attr.system) + print("accepted tags", accept_tags) raise ValueError("Invalid role tag in {}.".format(messages)) - - prompt.append( - {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} - ) + else: + prompt.append( + {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} + ) last_message = prompt.pop(-1) response.append(last_message) outputs["prompt"].append(prompt) outputs["response"].append(response) - outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") + if n_sys == 0: + outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") 
outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - + assert n_sys <= 1 return outputs From 04515f6b55d391a54222f215c90ae9d160cb471c Mon Sep 17 00:00:00 2001 From: Mark Mueller Date: Thu, 8 Feb 2024 17:52:36 +0100 Subject: [PATCH 2/5] Slim Orca data parsing Former-commit-id: 4dca3907964d27abc2b21eb55c75676901c98912 --- src/llmtuner/data/aligner.py | 30 ++++++++++++++++-------------- src/llmtuner/data/parser.py | 6 +++++- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index 5140f9d8..070a917c 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -53,32 +53,34 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" if len(messages) == 0: continue - n_sys = 0 prompt = [] response = [] + n_sys = 0 for turn_idx, message in enumerate(messages): - accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag, dataset_attr.assistant_tag, dataset_attr.function_tag] - - if message[dataset_attr.role_tag] == "system": + if dataset_attr.system_tag and message[dataset_attr.role_tag] == dataset_attr.system_tag: outputs["system"].append(message[dataset_attr.content_tag]) - n_sys += 1 - elif message[dataset_attr.role_tag] not in accept_tags: - print("sytem attr", dataset_attr.system) - print("accepted tags", accept_tags) - raise ValueError("Invalid role tag in {}.".format(messages)) + n_sys = 1 + + if (turn_idx - n_sys) % 2 == 0: + accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag] else: - prompt.append( - {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} - ) + accept_tags = [dataset_attr.assistant_tag, dataset_attr.function_tag] + + if message[dataset_attr.role_tag] not in accept_tags: + raise ValueError("Invalid role tag in {}.".format(messages)) + + prompt.append( + {"role": tag_mapping[message[dataset_attr.role_tag]], "content": 
message[dataset_attr.content_tag]} + ) last_message = prompt.pop(-1) response.append(last_message) outputs["prompt"].append(prompt) outputs["response"].append(response) - if n_sys == 0: + if not dataset_attr.system_tag: outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - assert n_sys <= 1 + return outputs diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py index 474aad83..aa6319fd 100644 --- a/src/llmtuner/data/parser.py +++ b/src/llmtuner/data/parser.py @@ -37,6 +37,10 @@ class DatasetAttr: assistant_tag: Optional[str] = "gpt" observation_tag: Optional[str] = "observation" function_tag: Optional[str] = "function_call" + system_tag: Optional[str] = None + + assert system_tag is None or system is None, f"Can not provide both system message (system_tag={system_tag}) and system column(system={system})" + def __repr__(self) -> str: return self.dataset_name @@ -95,7 +99,7 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: setattr(dataset_attr, column_name, dataset_info[name]["columns"].get(column_name, None)) if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]: - for tag in ["role_tag", "content_tag", "user_tag", "assistant_tag", "observation_tag", "function_tag"]: + for tag in ["role_tag", "content_tag", "user_tag", "assistant_tag", "observation_tag", "function_tag", "system_tag"]: setattr(dataset_attr, tag, dataset_info[name]["tags"].get(tag, None)) dataset_list.append(dataset_attr) From 5788b7c7d0cdc2bb047ea8619dce327339d93855 Mon Sep 17 00:00:00 2001 From: Mark Mueller Date: Thu, 8 Feb 2024 17:54:18 +0100 Subject: [PATCH 3/5] Slim Orca data parsing Former-commit-id: 3016427be4e63fd25f40bc5a0d1f8cedc0997334 --- src/llmtuner/data/aligner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index 
070a917c..d4f281bc 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -77,7 +77,7 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" response.append(last_message) outputs["prompt"].append(prompt) outputs["response"].append(response) - if not dataset_attr.system_tag: + if n_sys == 0: outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") From 4d473894fddf4778a5cbf8449de1b617c5f44983 Mon Sep 17 00:00:00 2001 From: Mark Mueller Date: Thu, 8 Feb 2024 17:56:18 +0100 Subject: [PATCH 4/5] Slim Orca data parsing Former-commit-id: ca57d27c39d4e7bc3dd7c3207a23d23d2cbd446b --- src/llmtuner/data/aligner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index d4f281bc..cd3a7ea4 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -60,6 +60,7 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" if dataset_attr.system_tag and message[dataset_attr.role_tag] == dataset_attr.system_tag: outputs["system"].append(message[dataset_attr.content_tag]) n_sys = 1 + continue if (turn_idx - n_sys) % 2 == 0: accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag] From 1ce82f391ad8f28f470208ac557398d202a62f69 Mon Sep 17 00:00:00 2001 From: Mark Mueller Date: Thu, 8 Feb 2024 19:32:20 +0100 Subject: [PATCH 5/5] Slim Orca data parsing Former-commit-id: f2d8efede7e20edafed0d5446eb64f2d419949b1 --- data/README.md | 3 ++- src/llmtuner/data/aligner.py | 7 +++++-- src/llmtuner/data/utils.py | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/data/README.md b/data/README.md index 3d950e1b..bb38935c 100644 --- a/data/README.md +++ b/data/README.md @@ -26,7 +26,8 @@ If you are using a custom dataset, please provide your dataset definition in the "user_tag": "the value of the 
role_tag represents the user. (default: human)", "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)", "observation_tag": "the value of the role_tag represents the tool results. (default: observation)", - "function_tag": "the value of the role_tag represents the function call. (default: function_call)" + "function_tag": "the value of the role_tag represents the function call. (default: function_call)", + "system_tag": "the value of the role_tag represents the system prompt. (default: None; incompatible with the system column)" } } ``` diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index cd3a7ea4..34c8980d 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -47,10 +47,10 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" dataset_attr.assistant_tag: Role.ASSISTANT, dataset_attr.observation_tag: Role.OBSERVATION, dataset_attr.function_tag: Role.FUNCTION, + dataset_attr.system_tag: Role.SYSTEM, } for i, messages in enumerate(examples[dataset_attr.messages]): - messages = messages[: len(messages) // 2 * 2] # should be multiples of 2 - if len(messages) == 0: + if len(messages) <= 1: continue prompt = [] @@ -74,6 +74,9 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} ) + if len(prompt) % 2 == 1: + # Last message was neither from assistant nor function + prompt.pop(-1) last_message = prompt.pop(-1) response.append(last_message) outputs["prompt"].append(prompt) diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py index 062d390f..75a28c59 100644 --- a/src/llmtuner/data/utils.py +++ b/src/llmtuner/data/utils.py @@ -21,6 +21,7 @@ class Role(str, Enum): ASSISTANT = "assistant" OBSERVATION = "observation" FUNCTION = "function" + SYSTEM = "system" def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> 
None: