Merge pull request #2462 from mnmueller/main

Enable Parsing of SlimOrca

Former-commit-id: 388b705a8d
2026-01-02 12:10:34 +08:00 · 2024-02-09 22:55:48 +08:00
parent 36f092b53f 4bd7b8375e
commit 04fa6b9a3d
5 changed files with 36 additions and 6 deletions
--- a/data/README.md
+++ b/data/README.md
@@ -26,7 +26,8 @@ If you are using a custom dataset, please provide your dataset definition in the
    "user_tag": "the value of the role_tag represents the user. (default: human)",
    "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
    "observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
-    "function_tag": "the value of the role_tag represents the function call. (default: function_call)"
+    "function_tag": "the value of the role_tag represents the function call. (default: function_call)",
+    "system_tag": "the value of the role_tag represents the system prompt. (default: None; incompatible with the system column)"
  }
 }
 ```
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -126,6 +126,20 @@
      "system": "system_prompt"
    }
  },
+ "slimorca": {
+  "hf_hub_url": "Open-Orca/SlimOrca",
+  "formatting": "sharegpt",
+  "columns": {
+    "messages": "conversations"
+  },
+  "tags": {
+    "role_tag": "from",
+    "content_tag": "value",
+    "user_tag": "human",
+    "assistant_tag": "gpt",
+    "system_tag": "system"
+  }
+ },
  "intel_orca_dpo_pairs_de" : {
    "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
    "ranking": true
--- a/src/llmtuner/data/aligner.py
+++ b/src/llmtuner/data/aligner.py
@@ -47,16 +47,22 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
        dataset_attr.assistant_tag: Role.ASSISTANT,
        dataset_attr.observation_tag: Role.OBSERVATION,
        dataset_attr.function_tag: Role.FUNCTION,
+        dataset_attr.system_tag: Role.SYSTEM,
    }
    for i, messages in enumerate(examples[dataset_attr.messages]):
-        messages = messages[: len(messages) // 2 * 2]  # should be multiples of 2
-        if len(messages) == 0:
+        if len(messages) <= 1:
            continue

        prompt = []
        response = []
+        n_sys = 0
        for turn_idx, message in enumerate(messages):
-            if turn_idx % 2 == 0:
+            if dataset_attr.system_tag and message[dataset_attr.role_tag] == dataset_attr.system_tag:
+                outputs["system"].append(message[dataset_attr.content_tag])
+                n_sys = 1
+                continue
+
+            if (turn_idx - n_sys) % 2 == 0:
                accept_tags = [dataset_attr.user_tag, dataset_attr.observation_tag]
            else:
                accept_tags = [dataset_attr.assistant_tag, dataset_attr.function_tag]
@@ -68,10 +74,14 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
                {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
            )

+        if len(prompt) % 2 == 1:
+            # Last message was neither from assistant nor function
+            prompt.pop(-1)
        last_message = prompt.pop(-1)
        response.append(last_message)
        outputs["prompt"].append(prompt)
        outputs["response"].append(response)
+        if n_sys == 0:
            outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
        outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")

--- a/src/llmtuner/data/parser.py
+++ b/src/llmtuner/data/parser.py
@@ -37,6 +37,10 @@ class DatasetAttr:
    assistant_tag: Optional[str] = "gpt"
    observation_tag: Optional[str] = "observation"
    function_tag: Optional[str] = "function_call"
+    system_tag: Optional[str] = None
+
+    assert system_tag is None or system is None, f"Can not provide both system message (system_tag={system_tag}) and system column(system={system})"
+

    def __repr__(self) -> str:
        return self.dataset_name
@@ -95,7 +99,7 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
                setattr(dataset_attr, column_name, dataset_info[name]["columns"].get(column_name, None))

        if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]:
-            for tag in ["role_tag", "content_tag", "user_tag", "assistant_tag", "observation_tag", "function_tag"]:
+            for tag in ["role_tag", "content_tag", "user_tag", "assistant_tag", "observation_tag", "function_tag", "system_tag"]:
                setattr(dataset_attr, tag, dataset_info[name]["tags"].get(tag, None))

        dataset_list.append(dataset_attr)
--- a/src/llmtuner/data/utils.py
+++ b/src/llmtuner/data/utils.py
@@ -21,6 +21,7 @@ class Role(str, Enum):
    ASSISTANT = "assistant"
    OBSERVATION = "observation"
    FUNCTION = "function"
+    SYSTEM = "system"


 def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None: