From 8332c85f37239885c17f089b499375320b07c276 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 27 Nov 2024 04:20:23 +0000
Subject: [PATCH] add marco-o1 and openo1 dataset

Former-commit-id: 51d49e075470951f109bcdde136203f972450c2e
---
 README.md                            |  3 ++-
 README_zh.md                         |  3 ++-
 src/llamafactory/data/template.py    | 16 ++++++++++++++++
 src/llamafactory/extras/constants.py | 11 +++++++++++
 4 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index c6d0e1a8..d216a1fc 100644
--- a/README.md
+++ b/README.md
@@ -290,9 +290,10 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction)
 - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo)
 - [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2)
-- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
 - [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
 - [Magpie-ultra-v0.1 (en)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1)
+- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
+- [OpenO1-SFT (en&zh)](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)
 - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
 - [Pokemon-gpt4o-captions (en&zh)](https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
diff --git a/README_zh.md b/README_zh.md
index 9ba60c05..14ea863d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -290,9 +290,10 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
 - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction)
 - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo)
 - [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2)
-- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
 - [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
 - [Magpie-ultra-v0.1 (en)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1)
+- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
+- [OpenO1-SFT (en&zh)](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)
 - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
 - [Pokemon-gpt4o-captions (en&zh)](https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py
index 6054b7a6..e7f3b72b 100644
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -901,6 +901,22 @@ _register_template(
 )
 
 
+_register_template(
+    name="marco",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_separator=EmptyFormatter(slots=["\n"]),
+    default_system=(
+        "你是一个经过良好训练的AI助手，你的名字是Marco-o1.由阿里国际数字商业集团的AI Business创造.\n## 重要！！！！！\n"
+        "当你回答问题时，你的思考应该在<Thought>内完成，<Output>内输出你的结果。\n"
+        "<Thought>应该尽可能是英文，但是有2个特例，一个是对原文中的引用，另一个是是数学应该使用markdown格式，<Output>内的输出需要遵循用户输入的语言。\n"
+    ),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
 _register_template(
     name="mistral",
     format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]),
diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index 5fdb33c2..09050a7c 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -998,6 +998,17 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "Marco-o1-Chat": {
+            DownloadSource.DEFAULT: "AIDC-AI/Marco-o1",
+            DownloadSource.MODELSCOPE: "AIDC-AI/Marco-o1",
+        },
+    },
+    template="marco",
+)
+
+
 register_model_group(
     models={
         "MiniCPM-2B-SFT-Chat": {