improve aligner

Former-commit-id: 7d2dc83c5e
2026-03-09 13:15:59 +08:00 · 2024-02-10 16:39:19 +08:00
parent 04fa6b9a3d
commit db2051684b
11 changed files with 127 additions and 112 deletions
--- a/data/README.md
+++ b/data/README.md
@@ -11,7 +11,7 @@ If you are using a custom dataset, please provide your dataset definition in the
  "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
  "ranking": "whether the dataset is a preference dataset or not. (default: false)",
  "formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
-  "columns": {
+  "columns (optional)": {
    "prompt": "the column name in the dataset containing the prompts. (default: instruction)",
    "query": "the column name in the dataset containing the queries. (default: input)",
    "response": "the column name in the dataset containing the responses. (default: output)",
@@ -20,14 +20,14 @@ If you are using a custom dataset, please provide your dataset definition in the
    "system": "the column name in the dataset containing the system prompts. (default: None)",
    "tools": "the column name in the dataset containing the tool description. (default: None)"
  },
-  "tags": {
+  "tags (optional, used for the sharegpt format)": {
    "role_tag": "the key in the message represents the identity. (default: from)",
    "content_tag": "the key in the message represents the content. (default: value)",
    "user_tag": "the value of the role_tag represents the user. (default: human)",
    "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
    "observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
    "function_tag": "the value of the role_tag represents the function call. (default: function_call)",
-    "system_tag": "the value of the role_tag represents the system prompt. (default: None) incompatible with system column"
+    "system_tag": "the value of the role_tag represents the system prompt. (default: system, can override system column)"
  }
 }
 ```
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -11,7 +11,7 @@
  "folder": "Hugging Face 仓库的文件夹名称（可选，默认：None）",
  "ranking": "是否为偏好数据集（可选，默认：False）",
  "formatting": "数据集格式（可选，默认：alpaca，可以为 alpaca 或 sharegpt）",
-  "columns": {
+  "columns（可选）": {
    "prompt": "数据集代表提示词的表头名称（默认：instruction）",
    "query": "数据集代表请求的表头名称（默认：input）",
    "response": "数据集代表回答的表头名称（默认：output）",
@@ -20,13 +20,14 @@
    "system": "数据集代表系统提示的表头名称（默认：None）",
    "tools": "数据集代表工具描述的表头名称（默认：None）"
  },
-  "tags": {
+  "tags（可选，用于 sharegpt 格式）": {
    "role_tag": "消息中代表发送者身份的键名（默认：from）",
    "content_tag": "消息中代表文本内容的键名（默认：value）",
    "user_tag": "消息中代表用户的 role_tag（默认：human）",
    "assistant_tag": "消息中代表助手的 role_tag（默认：gpt）",
    "observation_tag": "消息中代表工具返回结果的 role_tag（默认：observation）",
-    "function_tag": "消息中代表工具调用的 role_tag（默认：function_call）"
+    "function_tag": "消息中代表工具调用的 role_tag（默认：function_call）",
+    "system_tag": "消息中代表系统提示的 role_tag（默认：system，会覆盖 system 列）"
  }
 }
 ```
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -15,9 +15,6 @@
    "file_name": "alpaca_gpt4_data_zh.json",
    "file_sha1": "3eaa3bda364ccdd59925d7448a698256c31ef845"
  },
-  "alpaca-gpt4_de": {
-    "hf_hub_url": "mayflowergmbh/alpaca-gpt4_de"
-  },
  "self_cognition": {
    "file_name": "self_cognition.json",
    "file_sha1": "6287a730ada924fc5d9eadc6d8f865e01b7a6f67"
@@ -42,9 +39,6 @@
      "history": "history"
    }
  },
-  "oasst_de": {
-    "hf_hub_url": "mayflowergmbh/oasst_de"
-  },
  "lima": {
    "file_name": "lima.json",
    "file_sha1": "9db59f6b7007dc4b17529fc63379b9cd61640f37",
@@ -126,44 +120,8 @@
      "system": "system_prompt"
    }
  },
- "slimorca": {
-  "hf_hub_url": "Open-Orca/SlimOrca",
-  "formatting": "sharegpt",
-  "columns": {
-    "messages": "conversations"
-  },
-  "tags": {
-    "role_tag": "from",
-    "content_tag": "value",
-    "user_tag": "human",
-    "assistant_tag": "gpt",
-    "system_tag": "system"
-  }
- },
-  "intel_orca_dpo_pairs_de" : {
-    "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
-    "ranking": true
-  },
-  "airoboros-3.0_de": {
-    "hf_hub_url": "mayflowergmbh/airoboros-3.0_de"
-  },
-  "booksum_de": {
-    "hf_hub_url": "mayflowergmbh/booksum_de"
-  },
-  "dolphin_de": {
-    "hf_hub_url": "mayflowergmbh/dolphin_de"
-  },
-  "wiki_qa_de": {
-    "hf_hub_url": "mayflowergmbh/wiki_qa_de"
-  },
-  "evol-instruct_de": {
-    "hf_hub_url": "mayflowergmbh/evol-instruct_de"
-  },
-  "openschnabeltier_de": {
-    "hf_hub_url": "mayflowergmbh/openschnabeltier_de"
-  },
-  "dolly-15k_de": {
-    "hf_hub_url": "mayflowergmbh/dolly-15k_de"
+  "slimorca": {
+    "hf_hub_url": "Open-Orca/SlimOrca"
  },
  "mathinstruct": {
    "hf_hub_url": "TIGER-Lab/MathInstruct",
@@ -180,6 +138,13 @@
      "response": "target"
    }
  },
+  "wikiqa": {
+    "hf_hub_url": "wiki_qa",
+    "columns": {
+      "prompt": "question",
+      "response": "answer"
+    }
+  },
  "webqa": {
    "hf_hub_url": "suolyer/webqa",
    "ms_hub_url": "AI-ModelScope/webqa",
@@ -193,7 +158,8 @@
    "ms_hub_url": "AI-ModelScope/webnovel_cn"
  },
  "nectar_sft": {
-    "hf_hub_url": "mlinmg/SFT-Nectar"
+    "hf_hub_url": "mlinmg/SFT-Nectar",
+    "ms_hub_url": "AI-ModelScope/SFT-Nectar"
  },
  "deepctrl": {
    "ms_hub_url": "deepctrl/deepctrl-sft-data"
@@ -229,9 +195,6 @@
    },
    "formatting": "sharegpt"
  },
-  "ultrachat_chat_de": {
-    "hf_hub_url": "mayflowergmbh/ultra-chat_de"
-  },
  "agent_instruct": {
    "hf_hub_url": "THUDM/AgentInstruct",
    "ms_hub_url": "ZhipuAI/AgentInstruct",
@@ -253,8 +216,36 @@
  },
  "evol_instruct": {
    "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
+    "ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
    "formatting": "sharegpt"
  },
+  "oasst_de": {
+    "hf_hub_url": "mayflowergmbh/oasst_de"
+  },
+  "dolly_15k_de": {
+    "hf_hub_url": "mayflowergmbh/dolly-15k_de"
+  },
+  "alpaca-gpt4_de": {
+    "hf_hub_url": "mayflowergmbh/alpaca-gpt4_de"
+  },
+  "openschnabeltier_de": {
+    "hf_hub_url": "mayflowergmbh/openschnabeltier_de"
+  },
+  "evol_instruct_de": {
+    "hf_hub_url": "mayflowergmbh/evol-instruct_de"
+  },
+  "dolphin_de": {
+    "hf_hub_url": "mayflowergmbh/dolphin_de"
+  },
+  "booksum_de": {
+    "hf_hub_url": "mayflowergmbh/booksum_de"
+  },
+  "airoboros_de": {
+    "hf_hub_url": "mayflowergmbh/airoboros-3.0_de"
+  },
+  "ultrachat_de": {
+    "hf_hub_url": "mayflowergmbh/ultra-chat_de"
+  },
  "hh_rlhf_en": {
    "script_url": "hh_rlhf_en",
    "columns": {
@@ -298,6 +289,11 @@
  },
  "nectar_rm": {
    "hf_hub_url": "mlinmg/RLAIF-Nectar",
+    "ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
+    "ranking": true
+  },
+  "orca_dpo_de" : {
+    "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
    "ranking": true
  },
  "wiki_demo": {
@@ -329,6 +325,7 @@
  },
  "wikipedia_en": {
    "hf_hub_url": "olm/olm-wikipedia-20221220",
+    "ms_hub_url": "AI-ModelScope/olm-wikipedia-20221220",
    "columns": {
      "prompt": "text"
    }
@@ -342,6 +339,7 @@
  },
  "pile": {
    "hf_hub_url": "EleutherAI/pile",
+    "ms_hub_url": "AI-ModelScope/pile",
    "columns": {
      "prompt": "text"
    },
@@ -349,6 +347,7 @@
  },
  "skypile": {
    "hf_hub_url": "Skywork/SkyPile-150B",
+    "ms_hub_url": "AI-ModelScope/SkyPile-150B",
    "columns": {
      "prompt": "text"
    }