From 9d88d0bcd861b1383dcbaedde4bcbe20452816fc Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 8 Jun 2024 21:11:32 +0800 Subject: [PATCH 001/162] update git workflows Former-commit-id: 5a3f26bc53433caa98b2a66294becaf156280a4c --- .github/workflows/label_issue.yml | 17 +++++++++++++++++ .github/workflows/tests.yml | 17 ++++++++--------- 2 files changed, 25 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/label_issue.yml diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml new file mode 100644 index 00000000..b9a5543c --- /dev/null +++ b/.github/workflows/label_issue.yml @@ -0,0 +1,17 @@ +name: label_issue + +on: + issues: + types: + - opened + +jobs: + label_issue: + runs-on: ubuntu-latest + + steps: + - env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ISSUE_URL: ${{ github.event.issue.html_url }} + run: | + gh issue edit $ISSUE_URL --add-label "pending" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 32edf6a8..6ddcbc05 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,14 +3,7 @@ name: tests on: push: branches: - - main - paths: - - "**.py" - - "requirements.txt" - - ".github/workflows/*.yml" - pull_request: - branches: - - main + - $default-branch paths: - "**.py" - "requirements.txt" @@ -19,21 +12,27 @@ on: jobs: tests: runs-on: ubuntu-latest + steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Python uses: actions/setup-python@v5 with: python-version: "3.8" cache: "pip" cache-dependency-path: "setup.py" + - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install .[torch,dev] + - name: Check quality run: | make style && make quality + - name: Test with pytest run: | make test From d999691d4fba74bc0d7f5474fb3c037ba81a804a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 8 Jun 2024 21:15:36 +0800 Subject: [PATCH 002/162] Update tests.yml Former-commit-id: e90f0cc30d6bb819246ccc08935c39e714c179a1 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6ddcbc05..f3ac96db 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,7 +3,7 @@ name: tests on: push: branches: - - $default-branch + - main paths: - "**.py" - "requirements.txt" From fcd42d8e3adeca9b3d4e65e178834d620ce3c451 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 8 Jun 2024 21:25:35 +0800 Subject: [PATCH 003/162] add pr ci Former-commit-id: 9b05bb8540b946d0c74bf804bcafc4a785d22c47 --- .github/workflows/tests.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f3ac96db..96092662 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,6 +8,15 @@ on: - "**.py" - "requirements.txt" - ".github/workflows/*.yml" + pull_request: + types: + - review_requested + branches: + - main + paths: + - "**.py" + - "requirements.txt" + - ".github/workflows/*.yml" jobs: tests: From 74764be45affef2b2d1145dd36b26df8ad744b8a Mon Sep 17 00:00:00 2001 From: "-.-" Date: Sat, 8 Jun 2024 23:51:56 +0800 Subject: [PATCH 004/162] fix README Former-commit-id: fa30028c0b83c38610b596209493a748b8ca0928 --- README.md | 2 +- README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fb6c5782..4dea65b9 100644 --- a/README.md +++ b/README.md @@ 
-335,7 +335,7 @@ huggingface-cli login ```bash git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git cd LLaMA-Factory -pip install -e '.[torch,metrics]' +pip install -e ".[torch,metrics]" ``` Extra dependencies available: torch, torch_npu, metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality diff --git a/README_zh.md b/README_zh.md index 142254df..ab0e8cb7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -335,7 +335,7 @@ huggingface-cli login ```bash git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git cd LLaMA-Factory -pip install -e '.[torch,metrics]' +pip install -e ".[torch,metrics]" ``` 可选的额外依赖项:torch、torch_npu、metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality From 8d0df82d0ea8c1fbd4b5141621eeda1e3e44cbc4 Mon Sep 17 00:00:00 2001 From: mMrBun <2015711377@qq.com> Date: Sun, 9 Jun 2024 18:16:15 +0800 Subject: [PATCH 005/162] Implemented the tool_formatter and tool_extractor for glm4 tool_format Former-commit-id: db7fa4490ea7f6966418d2879c895cbc1763b16d --- src/llamafactory/data/formatter.py | 42 +++++++++++++++++++++++++++++- src/llamafactory/data/template.py | 3 ++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 0cd3d6c1..344e01db 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -23,6 +23,17 @@ TOOL_SYSTEM_PROMPT = ( ) +GLM4_TOOL_SUFFIX_PROMPT = ( + "在调用上述函数时,请使用 Json 格式表示调用的参数。" +) + +GLM4_TOOL_PROMPT = ( + "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持," + "{tool_text}" + +) + + def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: tool_text = "" tool_names = [] @@ -53,6 +64,14 @@ def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: ) +def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: + tool_text = "" + for tool in tools: + tool_name = tool["name"] + tool_text += f"\n\n## {tool_name}\n\n{json.dumps(tool, ensure_ascii=False, indent=4)}\n{GLM4_TOOL_SUFFIX_PROMPT}" + return GLM4_TOOL_PROMPT.format(tool_text=tool_text) + + def default_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+).*?Action Input:\s*(.*)", re.DOTALL) action_match = re.search(regex, content) @@ -69,10 +88,24 @@ def default_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: return tool_name, json.dumps(arguments, ensure_ascii=False) +def glm4_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: + lines = content.strip().split("\n") + if len(lines) != 2: + return content + tool_name = lines[0].strip() + tool_input = lines[1].strip() + try: + arguments = json.loads(tool_input) + except json.JSONDecodeError: + return content + return tool_name, json.dumps(arguments, ensure_ascii=False) + + + @dataclass class Formatter(ABC): slots: SLOTS = field(default_factory=list) - tool_format: Optional[Literal["default"]] = None + tool_format: Optional[Literal["default", "glm4"]] = None @abstractmethod def apply(self, **kwargs) -> SLOTS: ... 
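For a concrete sense of the new format, here is a minimal sketch of the two GLM-4 helpers added above. It assumes `glm4_tool_formatter` and `glm4_tool_extractor` are importable as defined at this point in the series; the weather tool is a made-up example, and note that a later patch in this series changes the extractor to return a list of such pairs:

```python
# A hypothetical tool definition in the OpenAI function-calling schema.
weather_tool = {
    "name": "get_current_weather",
    "description": "Get the current weather",
    "parameters": {
        "type": "object",
        "properties": {"location": {"type": "string"}},
        "required": ["location"],
    },
}

# The formatter embeds each tool's JSON schema into the GLM-4 system prompt.
prompt = glm4_tool_formatter([weather_tool])
assert "## get_current_weather" in prompt

# GLM-4 emits a call as the tool name on one line and JSON arguments on the
# next; the extractor parses that back into a (name, arguments_json) pair.
output = 'get_current_weather\n{"location": "San Francisco, CA"}'
print(glm4_tool_extractor(output))
# -> ('get_current_weather', '{"location": "San Francisco, CA"}')
```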
@@ -175,6 +208,11 @@ class ToolFormatter(Formatter): if self.tool_format == "default": return [default_tool_formatter(tools)] + elif self.tool_format == "glm4": + """ + '[gMASK]<|system|>\n你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n## get_current_weather\n\n{\n "name": "get_current_weather",\n "description": "Get the current weather",\n "parameters": {\n "type": "object",\n "properties": {\n "location": {\n "type": "string",\n "description": "The city and state, e.g. San Francisco, CA"\n },\n "format": {\n "type": "string",\n "enum": [\n "celsius",\n "fahrenheit"\n ],\n "description": "The temperature unit to use. Infer this from the users location."\n }\n },\n "required": [\n "location",\n "format"\n ]\n }\n}\n在调用上述函数时,请使用 Json 格式表示调用的参数。<|user|>\nWhat\'s the weather like in San Francisco, Tokyo, and Paris? use celsius<|assistant|>' + """ + return [glm4_tool_formatter(tools)] else: raise NotImplementedError except Exception: @@ -183,5 +221,7 @@ class ToolFormatter(Formatter): def extract(self, content: str) -> Union[str, Tuple[str, str]]: if self.tool_format == "default": return default_tool_extractor(content) + elif self.tool_format == "glm4": + return glm4_tool_extractor(content) else: raise NotImplementedError diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 3dce5ec6..b2aea217 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -662,9 +662,10 @@ _register_template( name="glm4", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), format_assistant=StringFormatter(slots=["\n{{content}}"]), - format_system=StringFormatter(slots=["[gMASK]{{content}}"]), + format_system=StringFormatter(slots=["[gMASK]<|system|>\n{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), + format_tools=ToolFormatter(tool_format="glm4"), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, force_system=True, From f86bc621be2b39d82f8c95fa087434995b061337 Mon Sep 17 00:00:00 2001 From: mMrBun <2015711377@qq.com> Date: Sun, 9 Jun 2024 18:25:22 +0800 Subject: [PATCH 006/162] Removed unnecessary comments. Former-commit-id: 2b81252aa693871098931cd7873ef83ef4922ba5 --- src/llamafactory/data/formatter.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 344e01db..9f58915b 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -209,9 +209,6 @@ class ToolFormatter(Formatter): if self.tool_format == "default": return [default_tool_formatter(tools)] elif self.tool_format == "glm4": - """ - '[gMASK]<|system|>\n你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n## get_current_weather\n\n{\n "name": "get_current_weather",\n "description": "Get the current weather",\n "parameters": {\n "type": "object",\n "properties": {\n "location": {\n "type": "string",\n "description": "The city and state, e.g. San Francisco, CA"\n },\n "format": {\n "type": "string",\n "enum": [\n "celsius",\n "fahrenheit"\n ],\n "description": "The temperature unit to use. Infer this from the users location."\n }\n },\n "required": [\n "location",\n "format"\n ]\n }\n}\n在调用上述函数时,请使用 Json 格式表示调用的参数。<|user|>\nWhat\'s the weather like in San Francisco, Tokyo, and Paris? 
use celsius<|assistant|>' - """ return [glm4_tool_formatter(tools)] else: raise NotImplementedError From 680a340a40514ebeb66f7eec21b9f538e770d4a9 Mon Sep 17 00:00:00 2001 From: mMrBun <2015711377@qq.com> Date: Mon, 10 Jun 2024 02:00:14 +0800 Subject: [PATCH 007/162] Optimize the handling of QWEN2 in scenarios involving multiple tool calls. Former-commit-id: 48f870edc96ada40360f7e6e67cbf58805295b33 --- src/llamafactory/api/chat.py | 13 +++++++----- src/llamafactory/data/formatter.py | 34 ++++++++++++++++++------------ 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/llamafactory/api/chat.py b/src/llamafactory/api/chat.py index 98957bc1..d4db1eea 100644 --- a/src/llamafactory/api/chat.py +++ b/src/llamafactory/api/chat.py @@ -150,11 +150,14 @@ async def create_chat_completion_response( else: result = response.response_text - if isinstance(result, tuple): - name, arguments = result - function = Function(name=name, arguments=arguments) - tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function) - response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=[tool_call]) + if isinstance(result, list): + tool_calls = [] + for tool in result: + name, arguments = tool + function = Function(name=name, arguments=arguments) + tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function) + tool_calls.append(tool_call) + response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=tool_calls) finish_reason = Finish.TOOL else: response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result) diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 9f58915b..1d917887 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -72,23 +72,29 @@ def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: return GLM4_TOOL_PROMPT.format(tool_text=tool_text) -def default_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: - regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+).*?Action Input:\s*(.*)", re.DOTALL) - action_match = re.search(regex, content) +def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: + regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*({.*?})(?=\nAction:|\Z)", re.DOTALL) + action_match = re.findall(regex, content) if not action_match: return content - tool_name = action_match.group(1).strip() - tool_input = action_match.group(2).strip().strip('"').strip("```") - try: - arguments = json.loads(tool_input) - except json.JSONDecodeError: - return content + results = [] + + for match in action_match: + tool_name, tool_input = match + tool_name = tool_name.strip() + tool_input = tool_input.strip().strip('"').strip("```") - return tool_name, json.dumps(arguments, ensure_ascii=False) + try: + arguments = json.loads(tool_input) + results.append((tool_name, json.dumps(arguments, ensure_ascii=False))) + except json.JSONDecodeError: + return content + + return results -def glm4_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: +def glm4_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: lines = content.strip().split("\n") if len(lines) != 2: return content @@ -98,7 +104,7 @@ def glm4_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: arguments = json.loads(tool_input) except json.JSONDecodeError: return content - return tool_name, json.dumps(arguments, ensure_ascii=False) + return [(tool_name, json.dumps(arguments, ensure_ascii=False))] @@ -110,7 +116,7 @@ 
class Formatter(ABC): @abstractmethod def apply(self, **kwargs) -> SLOTS: ... - def extract(self, content: str) -> Union[str, Tuple[str, str]]: + def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]: raise NotImplementedError @@ -215,7 +221,7 @@ class ToolFormatter(Formatter): except Exception: return [""] - def extract(self, content: str) -> Union[str, Tuple[str, str]]: + def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]: if self.tool_format == "default": return default_tool_extractor(content) elif self.tool_format == "glm4": From 7474e8035fcdbb08a3a047d0e81354229c785dbc Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 10 Jun 2024 21:24:15 +0800 Subject: [PATCH 008/162] fix #2666 Former-commit-id: f121d5c4f94af9f165132c4309cb9bdc8217d985 --- src/llamafactory/model/adapter.py | 2 +- tests/data/test_supervised.py | 32 ++++++++++++++--------- tests/model/model_utils/test_attention.py | 15 +++++------ tests/model/test_freeze.py | 19 ++++---------- tests/model/test_full.py | 8 +++--- tests/model/test_lora.py | 19 ++++---------- 6 files changed, 41 insertions(+), 54 deletions(-) diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index f4e501a7..34b9eda6 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -209,6 +209,7 @@ def _setup_lora_tuning( "lora_alpha": finetuning_args.lora_alpha, "lora_dropout": finetuning_args.lora_dropout, "use_rslora": finetuning_args.use_rslora, + "use_dora": finetuning_args.use_dora, "modules_to_save": finetuning_args.additional_target, } @@ -218,7 +219,6 @@ def _setup_lora_tuning( lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=False, - use_dora=finetuning_args.use_dora, **peft_kwargs, ) model = get_peft_model(model, lora_config) diff --git a/tests/data/test_supervised.py b/tests/data/test_supervised.py index bb7f71df..63a3453f 100644 --- a/tests/data/test_supervised.py +++ b/tests/data/test_supervised.py @@ -1,4 +1,5 @@ import os +import random import pytest from datasets import load_dataset @@ -8,17 +9,17 @@ from llamafactory.hparams import get_train_args from llamafactory.model import load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") -TRAINING_ARGS = { +TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", "do_train": True, "finetuning_type": "full", - "dataset": "llamafactory/tiny_dataset", + "dataset": "llamafactory/tiny-supervised-dataset", "dataset_dir": "ONLINE", "template": "llama3", - "cutoff_len": 1024, + "cutoff_len": 8192, "overwrite_cache": True, "output_dir": "dummy_dir", "overwrite_output_dir": True, @@ -26,19 +27,24 @@ TRAINING_ARGS = { } -@pytest.mark.parametrize("test_num", [5]) -def test_supervised(test_num: int): - model_args, data_args, training_args, _, _ = get_train_args(TRAINING_ARGS) +@pytest.mark.parametrize("num_samples", [10]) +def test_supervised(num_samples: int): + model_args, data_args, training_args, _, _ = get_train_args(TRAIN_ARGS) tokenizer_module = load_tokenizer(model_args) tokenizer = tokenizer_module["tokenizer"] tokenized_data = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module) - original_data = load_dataset(TRAINING_ARGS["dataset"], split="train") - for test_idx in range(test_num): - decode_result = tokenizer.decode(tokenized_data["input_ids"][test_idx]) + original_data = 
load_dataset(TRAIN_ARGS["dataset"], split="train") + indexes = random.choices(range(len(original_data)), k=num_samples) + for index in indexes: + decoded_result = tokenizer.decode(tokenized_data["input_ids"][index]) + prompt = original_data[index]["instruction"] + if original_data[index]["input"]: + prompt += "\n" + original_data[index]["input"] + messages = [ - {"role": "user", "content": original_data[test_idx]["instruction"]}, - {"role": "assistant", "content": original_data[test_idx]["output"]}, + {"role": "user", "content": prompt}, + {"role": "assistant", "content": original_data[index]["output"]}, ] templated_result = tokenizer.apply_chat_template(messages, tokenize=False) - assert decode_result == templated_result + assert decoded_result == templated_result diff --git a/tests/model/model_utils/test_attention.py b/tests/model/model_utils/test_attention.py index 4d414289..751adda4 100644 --- a/tests/model/model_utils/test_attention.py +++ b/tests/model/model_utils/test_attention.py @@ -6,7 +6,12 @@ from llamafactory.hparams import get_infer_args from llamafactory.model import load_model, load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") + +INFER_ARGS = { + "model_name_or_path": TINY_LLAMA, + "template": "llama3", +} def test_attention(): @@ -23,13 +28,7 @@ def test_attention(): "fa2": "LlamaFlashAttention2", } for requested_attention in attention_available: - model_args, _, finetuning_args, _ = get_infer_args( - { - "model_name_or_path": TINY_LLAMA, - "template": "llama2", - "flash_attn": requested_attention, - } - ) + model_args, _, finetuning_args, _ = get_infer_args({"flash_attn": requested_attention, **INFER_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args) for module in model.modules(): diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py index c6cdec78..97800696 100644 --- a/tests/model/test_freeze.py +++ b/tests/model/test_freeze.py @@ -6,14 +6,14 @@ from llamafactory.hparams import get_train_args from llamafactory.model import load_model, load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") -TRAINING_ARGS = { +TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", "do_train": True, "finetuning_type": "freeze", - "dataset": "llamafactory/tiny_dataset", + "dataset": "llamafactory/tiny-supervised-dataset", "dataset_dir": "ONLINE", "template": "llama3", "cutoff_len": 1024, @@ -25,12 +25,7 @@ TRAINING_ARGS = { def test_freeze_all_modules(): - model_args, _, _, finetuning_args, _ = get_train_args( - { - "freeze_trainable_layers": 1, - **TRAINING_ARGS, - } - ) + model_args, _, _, finetuning_args, _ = get_train_args({"freeze_trainable_layers": 1, **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) for name, param in model.named_parameters(): @@ -44,11 +39,7 @@ def test_freeze_all_modules(): def test_freeze_extra_modules(): model_args, _, _, finetuning_args, _ = get_train_args( - { - "freeze_trainable_layers": 1, - "freeze_extra_modules": "embed_tokens,lm_head", - **TRAINING_ARGS, - } + {"freeze_trainable_layers": 1, "freeze_extra_modules": "embed_tokens,lm_head", **TRAIN_ARGS} ) tokenizer_module = 
load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) diff --git a/tests/model/test_full.py b/tests/model/test_full.py index ef57a980..6cb78f37 100644 --- a/tests/model/test_full.py +++ b/tests/model/test_full.py @@ -6,14 +6,14 @@ from llamafactory.hparams import get_train_args from llamafactory.model import load_model, load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") -TRAINING_ARGS = { +TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", "do_train": True, "finetuning_type": "full", - "dataset": "llamafactory/tiny_dataset", + "dataset": "llamafactory/tiny-supervised-dataset", "dataset_dir": "ONLINE", "template": "llama3", "cutoff_len": 1024, @@ -25,7 +25,7 @@ TRAINING_ARGS = { def test_full(): - model_args, _, _, finetuning_args, _ = get_train_args(TRAINING_ARGS) + model_args, _, _, finetuning_args, _ = get_train_args(TRAIN_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) for param in model.parameters(): diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index 1f2c02ae..2e2b89d9 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -6,14 +6,14 @@ from llamafactory.hparams import get_train_args from llamafactory.model import load_model, load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") -TRAINING_ARGS = { +TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", "do_train": True, "finetuning_type": "lora", - "dataset": "llamafactory/tiny_dataset", + "dataset": "llamafactory/tiny-supervised-dataset", "dataset_dir": "ONLINE", "template": "llama3", "cutoff_len": 1024, @@ -25,12 +25,7 @@ TRAINING_ARGS = { def test_lora_all_modules(): - model_args, _, _, finetuning_args, _ = get_train_args( - { - "lora_target": "all", - **TRAINING_ARGS, - } - ) + model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "all", **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) linear_modules = set() @@ -48,11 +43,7 @@ def test_lora_all_modules(): def test_lora_extra_modules(): model_args, _, _, finetuning_args, _ = get_train_args( - { - "lora_target": "all", - "additional_target": "embed_tokens,lm_head", - **TRAINING_ARGS, - } + {"lora_target": "all", "additional_target": "embed_tokens,lm_head", **TRAIN_ARGS} ) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) From bc86e70af9b30c3aa96d0162b21b292ca79e252e Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 10 Jun 2024 23:56:00 +0800 Subject: [PATCH 009/162] update evaluator Former-commit-id: bb8661e62481ff7027b8969f3d8a6a17290c9da3 --- src/llamafactory/eval/evaluator.py | 4 +- src/llamafactory/eval/template.py | 9 ++-- tests/eval/test_eval_template.py | 77 ++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 9 deletions(-) create mode 100644 tests/eval/test_eval_template.py diff --git a/src/llamafactory/eval/evaluator.py b/src/llamafactory/eval/evaluator.py index 192f4815..5c6fb104 100644 --- a/src/llamafactory/eval/evaluator.py +++ 
b/src/llamafactory/eval/evaluator.py @@ -26,9 +26,7 @@ class Evaluator: self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template) self.model = load_model(self.tokenizer, self.model_args, finetuning_args) self.eval_template = get_eval_template(self.eval_args.lang) - self.choice_inputs = [ - self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES - ] + self.choice_inputs = [self.tokenizer.encode(ch, add_special_tokens=False)[-1] for ch in CHOICES] @torch.inference_mode() def batch_inference(self, batch_input: Dict[str, torch.Tensor]) -> List[str]: diff --git a/src/llamafactory/eval/template.py b/src/llamafactory/eval/template.py index a4a6ef0e..2cbb5aaf 100644 --- a/src/llamafactory/eval/template.py +++ b/src/llamafactory/eval/template.py @@ -10,7 +10,6 @@ class EvalTemplate: system: str choice: str answer: str - prefix: str def _parse_example(self, example: Dict[str, str]) -> Tuple[str, str]: r""" @@ -42,8 +41,8 @@ class EvalTemplate: eval_templates: Dict[str, "EvalTemplate"] = {} -def _register_eval_template(name: str, system: str, choice: str, answer: str, prefix: str) -> None: - eval_templates[name] = EvalTemplate(system=system, choice=choice, answer=answer, prefix=prefix) +def _register_eval_template(name: str, system: str, choice: str, answer: str) -> None: + eval_templates[name] = EvalTemplate(system=system, choice=choice, answer=answer) def get_eval_template(name: str) -> "EvalTemplate": @@ -56,8 +55,7 @@ _register_eval_template( name="en", system="The following are multiple choice questions (with answers) about {subject}.\n\n", choice="\n{choice}. {content}", - answer="\nAnswer: ", - prefix=" ", + answer="\nAnswer:", ) @@ -66,5 +64,4 @@ _register_eval_template( system="以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n", choice="\n{choice}. {content}", answer="\n答案:", - prefix=" ", ) diff --git a/tests/eval/test_eval_template.py b/tests/eval/test_eval_template.py new file mode 100644 index 00000000..f6a91a67 --- /dev/null +++ b/tests/eval/test_eval_template.py @@ -0,0 +1,77 @@ +from llamafactory.eval.template import get_eval_template + + +def test_eval_template_en(): + support_set = [ + { + "question": "Fewshot question", + "A": "Fewshot1", + "B": "Fewshot2", + "C": "Fewshot3", + "D": "Fewshot4", + "answer": "B", + } + ] + example = { + "question": "Target question", + "A": "Target1", + "B": "Target2", + "C": "Target3", + "D": "Target4", + "answer": "C", + } + template = get_eval_template(name="en") + messages = template.format_example(example, support_set=support_set, subject_name="SubName") + assert messages == [ + { + "role": "user", + "content": ( + "The following are multiple choice questions (with answers) about SubName.\n\n" + "Fewshot question\nA. Fewshot1\nB. Fewshot2\nC. Fewshot3\nD. Fewshot4\nAnswer:" + ), + }, + {"role": "assistant", "content": "B"}, + { + "role": "user", + "content": "Target question\nA. Target1\nB. Target2\nC. Target3\nD. 
Target4\nAnswer:", + }, + {"role": "assistant", "content": "C"}, + ] + + +def test_eval_template_zh(): + support_set = [ + { + "question": "示例问题", + "A": "示例答案1", + "B": "示例答案2", + "C": "示例答案3", + "D": "示例答案4", + "answer": "B", + } + ] + example = { + "question": "目标问题", + "A": "目标答案1", + "B": "目标答案2", + "C": "目标答案3", + "D": "目标答案4", + "answer": "C", + } + template = get_eval_template(name="zh") + messages = template.format_example(example, support_set=support_set, subject_name="主题") + assert messages == [ + { + "role": "user", + "content": ( + "以下是中国关于主题考试的单项选择题,请选出其中的正确答案。\n\n" + "示例问题\nA. 示例答案1\nB. 示例答案2\nC. 示例答案3\nD. 示例答案4\n答案:" + ), + }, + {"role": "assistant", "content": "B"}, + { + "role": "user", + "content": "目标问题\nA. 目标答案1\nB. 目标答案2\nC. 目标答案3\nD. 目标答案4\n答案:", + }, + {"role": "assistant", "content": "C"}, + ] From 0e7c15d2bd422dc952ce782fbd61b726ab828f3e Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 00:19:17 +0800 Subject: [PATCH 010/162] fix #4145 Fix the docker image Former-commit-id: a9838281156fe870bfcde5d1f7afc15264fd4aad --- Dockerfile | 38 ++++++++++++++++++++++++++++++++++---- README.md | 36 ++++++++++++++++++------------------ README_zh.md | 34 ++++++++++++++++++---------------- docker-compose.yml | 10 ++++++++-- 4 files changed, 78 insertions(+), 40 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0a35e355..45849601 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,44 @@ -FROM nvcr.io/nvidia/pytorch:24.01-py3 +# Use the NVIDIA official image with PyTorch 2.3.0 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html +FROM nvcr.io/nvidia/pytorch:24.02-py3 +# Define installation arguments +ARG INSTALL_BNB=false +ARG INSTALL_VLLM=false +ARG INSTALL_DEEPSPEED=false +ARG PIP_INDEX=https://pypi.org/simple + +# Set the working directory WORKDIR /app +# Install the requirements COPY requirements.txt /app/ -RUN pip install -r requirements.txt +RUN pip config set global.index-url $PIP_INDEX +RUN python -m pip install --upgrade pip +RUN python -m pip install -r requirements.txt +# Copy the rest of the application into the image COPY . /app/ -RUN pip install -e .[metrics,bitsandbytes,qwen] +# Install the LLaMA Factory +RUN EXTRA_PACKAGES="metrics"; \ + if [ "$INSTALL_BNB" = "true" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \ + fi; \ + if [ "$INSTALL_VLLM" = "true" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \ + fi; \ + if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ + fi; \ + pip install -e .[$EXTRA_PACKAGES] && \ + pip uninstall -y transformer-engine + +# Set up volumes VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] + +# Expose port 7860 for the LLaMA Board EXPOSE 7860 -CMD [ "llamafactory-cli", "webui" ] +# Expose port 8000 for the API service +EXPOSE 8000 diff --git a/README.md b/README.md index 4dea65b9..35dacd2e 100644 --- a/README.md +++ b/README.md @@ -405,9 +405,9 @@ Please refer to [data/README.md](data/README.md) for checking the details about Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Llama3-8B-Instruct model, respectively. 
```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` See [examples/README.md](examples/README.md) for advanced usage (including distributed training). @@ -417,33 +417,33 @@ See [examples/README.md](examples/README.md) for advanced usage (including distr ### Fine-Tuning with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio)) -#### Use local environment - ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui +llamafactory-cli webui ``` - - -#### Use Docker +### Build Docker ```bash -docker build -f ./Dockerfile -t llama-factory:latest . -docker run --gpus=all \ +docker build -f ./Dockerfile \ + --build-arg INSTALL_BNB=false \ + --build-arg INSTALL_VLLM=false \ + --build-arg INSTALL_DEEPSPEED=false \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + -t llamafactory:latest . + +docker run -it --gpus=all \ -v ./hf_cache:/root/.cache/huggingface/ \ -v ./data:/app/data \ -v ./output:/app/output \ -p 7860:7860 \ + -p 8000:8000 \ --shm-size 16G \ - --name llama_factory \ - -d llama-factory:latest + --name llamafactory \ + llamafactory:latest ``` -#### Use Docker Compose - -```bash -docker compose -f ./docker-compose.yml up -d -``` +> [!TIP] +> Use Docker Compose to build image via `docker compose up -d`.
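A usage note on the new build arguments: each `INSTALL_*` flag simply appends an extra package group to the `pip install -e .[...]` step in the Dockerfile, so any of the defaults shown above can be overridden at build time. A hedged example (the mirror URL is an assumption, not part of the patch):

```bash
docker build -f ./Dockerfile \
    --build-arg INSTALL_DEEPSPEED=true \
    --build-arg PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple \
    -t llamafactory:deepspeed .
```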
Details about volume diff --git a/README_zh.md b/README_zh.md index ab0e8cb7..0ddb8b19 100644 --- a/README_zh.md +++ b/README_zh.md @@ -405,9 +405,9 @@ Docker 镜像: 下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` 高级用法请参考 [examples/README_zh.md](examples/README_zh.md)(包括多 GPU 微调)。 @@ -417,31 +417,33 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_s ### LLaMA Board 可视化微调(由 [Gradio](https://github.com/gradio-app/gradio) 驱动) -#### 使用本地环境 - ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui +llamafactory-cli webui ``` -#### 使用 Docker +### 构建 Docker ```bash -docker build -f ./Dockerfile -t llama-factory:latest . -docker run --gpus=all \ +docker build -f ./Dockerfile \ + --build-arg INSTALL_BNB=false \ + --build-arg INSTALL_VLLM=false \ + --build-arg INSTALL_DEEPSPEED=false \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + -t llamafactory:latest . + +docker run -it --gpus=all \ -v ./hf_cache:/root/.cache/huggingface/ \ -v ./data:/app/data \ -v ./output:/app/output \ -p 7860:7860 \ + -p 8000:8000 \ --shm-size 16G \ - --name llama_factory \ - -d llama-factory:latest + --name llamafactory \ + llamafactory:latest ``` -#### 使用 Docker Compose - -```bash -docker compose -f ./docker-compose.yml up -d -``` +> [!TIP] +> 通过 `docker compose up -d` 使用 Docker Compose 构建镜像。
数据卷详情 diff --git a/docker-compose.yml b/docker-compose.yml index 9602a3e3..b3e4a34d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,17 +1,23 @@ version: '3.8' services: - llama-factory: + llamafactory: build: dockerfile: Dockerfile context: . - container_name: llama_factory + args: + INSTALL_BNB: false + INSTALL_VLLM: false + INSTALL_DEEPSPEED: false + PIP_INDEX: https://pypi.org/simple + container_name: llamafactory volumes: - ./hf_cache:/root/.cache/huggingface/ - ./data:/app/data - ./output:/app/output ports: - "7860:7860" + - "8000:8000" ipc: host deploy: resources: From 68df064c1fd485ef4ce64a0162b93b76eb1a8061 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 00:37:17 +0800 Subject: [PATCH 011/162] fix #4160 The split heads should be concatenated in dim=2 Former-commit-id: 4b3f247f270d44df9fe226cfe0dabfb7fcd2deda --- src/llamafactory/model/model_utils/longlora.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py index c8dc52f5..cd468979 100644 --- a/src/llamafactory/model/model_utils/longlora.py +++ b/src/llamafactory/model/model_utils/longlora.py @@ -96,7 +96,8 @@ def llama_attention_forward( ( attn_output[:, :, : self.num_heads // 2], attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), - ) + ), + dim=2, ) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) @@ -194,7 +195,8 @@ def llama_flash_attention_2_forward( ( attn_output[:, :, : self.num_heads // 2], attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), - ) + ), + dim=2, ) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() @@ -293,7 +295,8 @@ def llama_sdpa_attention_forward( ( attn_output[:, :, : self.num_heads // 2], attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), - ) + ), + dim=2, ) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) @@ -303,7 +306,7 @@ def llama_sdpa_attention_forward( def _apply_llama_patch() -> None: - require_version("transformers==4.40.2", "To fix: pip install transformers==4.40.2") + require_version("transformers==4.41.2", "To fix: pip install transformers==4.41.2") LlamaAttention.forward = llama_attention_forward LlamaFlashAttention2.forward = llama_flash_attention_2_forward LlamaSdpaAttention.forward = llama_sdpa_attention_forward From 41eadf54599323b8c64c54cfd848f60972cf0dd0 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 00:44:26 +0800 Subject: [PATCH 012/162] release v0.8.1 Former-commit-id: 875a34f492701d1c644facbe9ede411af2931513 --- src/llamafactory/extras/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index 1d4e43f1..8c34fd96 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -9,7 +9,7 @@ import trl from transformers.utils import is_torch_cuda_available, is_torch_npu_available -VERSION = "0.8.1.dev0" +VERSION = "0.8.1" def print_env() -> None: From e540759f4f31756dd35497b2956cc8c729ebc6ab Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 00:50:53 +0800 Subject: [PATCH 013/162] set dev version Former-commit-id: 16c47cc15226119e33e46ba0f2f6ccb37072257f --- src/llamafactory/extras/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index 8c34fd96..a8cb799d 
100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -9,7 +9,7 @@ import trl from transformers.utils import is_torch_cuda_available, is_torch_npu_available -VERSION = "0.8.1" +VERSION = "0.8.2.dev0" def print_env() -> None: From 8c7943c4de5c6a4d695407304b2fdf37798caf00 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 01:04:16 +0800 Subject: [PATCH 014/162] tiny fix Former-commit-id: b5e9711ef375cc323fc083e742cccfc974550416 --- src/llamafactory/model/model_utils/longlora.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py index cd468979..4a8c562a 100644 --- a/src/llamafactory/model/model_utils/longlora.py +++ b/src/llamafactory/model/model_utils/longlora.py @@ -182,11 +182,9 @@ def llama_flash_attention_2_forward( query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states) if attention_mask is not None: attention_mask = attention_mask[:, :groupsz].repeat(num_groups, 1) - else: - groupsz = q_len attn_output: torch.Tensor = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, groupsz, dropout=dropout_rate + query_states, key_states, value_states, attention_mask, query_states.size(1), dropout=dropout_rate ) if getattr(self.config, "group_size_ratio", None) and self.training: # shift back From 4f33de245c5e00ea4f4457676ea92779d1ab3533 Mon Sep 17 00:00:00 2001 From: Alfredo Luque Date: Tue, 11 Jun 2024 00:07:06 +0000 Subject: [PATCH 015/162] add manifest so requirements.txt in sdist Former-commit-id: b501a3c56c51786c3006a2aca15a145641a4556c --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..82c51f63 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include LICENSE requirements.txt From b7458a24b6adbc958839b4a27803069055c3fbf4 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 12:48:53 +0800 Subject: [PATCH 016/162] tiny fix Former-commit-id: c4b2e263d9cefbad0fbc5de72422e4ef8edbcb54 --- src/llamafactory/hparams/parser.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index ff1fbf5d..ec5dd62c 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -171,9 +171,6 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if training_args.do_train and model_args.quantization_device_map == "auto": raise ValueError("Cannot use device map for quantized models in training.") - if finetuning_args.use_dora and model_args.use_unsloth: - raise ValueError("Unsloth does not support DoRA.") - if finetuning_args.pure_bf16: if not is_torch_bf16_gpu_available(): raise ValueError("This device does not support `pure_bf16`.") From 95f95bef609cf3f44bd4c939ee9d7201ae4749cb Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 15:38:38 +0800 Subject: [PATCH 017/162] fix #4198 Former-commit-id: 945d2c6cc73542adf9272ebd9aa332ea2c1c7361 --- src/llamafactory/hparams/model_args.py | 12 ++++++++++ src/llamafactory/model/patcher.py | 2 +- src/llamafactory/train/trainer_utils.py | 32 +++++++++++-------------- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 6352a420..71467770 100644 --- 
a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -1,6 +1,8 @@ from dataclasses import asdict, dataclass, field from typing import Any, Dict, Literal, Optional +from typing_extensions import Self + @dataclass class ModelArguments: @@ -216,3 +218,13 @@ class ModelArguments: def to_dict(self) -> Dict[str, Any]: return asdict(self) + + @classmethod + def copyfrom(cls, old_arg: Self, **kwargs) -> Self: + arg_dict = old_arg.to_dict() + arg_dict.update(**kwargs) + new_arg = cls(**arg_dict) + new_arg.compute_dtype = old_arg.compute_dtype + new_arg.device_map = old_arg.device_map + new_arg.model_max_length = old_arg.model_max_length + return new_arg diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 87c92315..18221a10 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -79,7 +79,7 @@ def patch_config( if "device_map" not in init_kwargs and model_args.device_map: init_kwargs["device_map"] = model_args.device_map - if init_kwargs["device_map"] == "auto": + if init_kwargs.get("device_map", None) == "auto": init_kwargs["offload_folder"] = model_args.offload_folder diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 0ddcdb11..7e9cc881 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -83,15 +83,12 @@ def create_ref_model( The valuehead parameter is randomly initialized since it is useless for PPO training. """ if finetuning_args.ref_model is not None: - ref_model_args_dict = model_args.to_dict() - ref_model_args_dict.update( - dict( - model_name_or_path=finetuning_args.ref_model, - adapter_name_or_path=finetuning_args.ref_model_adapters, - quantization_bit=finetuning_args.ref_model_quantization_bit, - ) + ref_model_args = ModelArguments.copyfrom( + model_args, + model_name_or_path=finetuning_args.ref_model, + adapter_name_or_path=finetuning_args.ref_model_adapters, + quantization_bit=finetuning_args.ref_model_quantization_bit, ) - ref_model_args = ModelArguments(**ref_model_args_dict) ref_finetuning_args = FinetuningArguments() tokenizer = load_tokenizer(ref_model_args)["tokenizer"] ref_model = load_model( @@ -102,9 +99,11 @@ def create_ref_model( if finetuning_args.finetuning_type == "lora": ref_model = None else: - tokenizer = load_tokenizer(model_args)["tokenizer"] + ref_model_args = ModelArguments.copyfrom(model_args) + ref_finetuning_args = FinetuningArguments() + tokenizer = load_tokenizer(ref_model_args)["tokenizer"] ref_model = load_model( - tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead + tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead ) logger.info("Created reference model from the model itself.") @@ -139,15 +138,12 @@ def create_reward_model( logger.info("Loaded adapter weights of reward model from {}".format(finetuning_args.reward_model)) return None else: - reward_model_args_dict = model_args.to_dict() - reward_model_args_dict.update( - dict( - model_name_or_path=finetuning_args.reward_model, - adapter_name_or_path=finetuning_args.reward_model_adapters, - quantization_bit=finetuning_args.reward_model_quantization_bit, - ) + reward_model_args = ModelArguments.copyfrom( + model_args, + model_name_or_path=finetuning_args.reward_model, + adapter_name_or_path=finetuning_args.reward_model_adapters, + quantization_bit=finetuning_args.reward_model_quantization_bit, ) - reward_model_args = 
ModelArguments(**reward_model_args_dict) reward_finetuning_args = FinetuningArguments() tokenizer = load_tokenizer(reward_model_args)["tokenizer"] reward_model = load_model( From 3f52c233b1b1c2502e199d4f3a1204215d1bd0cc Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 15:40:21 +0800 Subject: [PATCH 018/162] Update bug-report.yml Former-commit-id: bb022cd867ebf2593e40fc6ba43b768603b129a3 --- .github/ISSUE_TEMPLATE/bug-report.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 1d962200..768adea6 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -38,7 +38,9 @@ body: 请合理使用 Markdown 标签来格式化您的文本。 placeholder: | + ```bash llamafactory-cli train ... + ``` - type: textarea id: expected-behavior From dfac202c7d87551e9eb13fb1a40d47d3c904eb96 Mon Sep 17 00:00:00 2001 From: d <913015993@qq.com> Date: Tue, 11 Jun 2024 16:21:48 +0800 Subject: [PATCH 019/162] =?UTF-8?q?=E7=BB=8F=E8=BF=87=E5=A4=A7=E9=87=8F?= =?UTF-8?q?=E7=9A=84=E5=A2=9E=E9=87=8F=E9=A2=84=E8=AE=AD=E7=BB=83=EF=BC=8C?= =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E5=AF=B9=E6=AF=94=E8=AF=95=E9=AA=8C=EF=BC=8C?= =?UTF-8?q?=E5=8F=91=E7=8E=B0=E8=BF=99=E4=B8=AAbug=EF=BC=9Allama3=E5=9C=A8?= =?UTF-8?q?=E9=A2=84=E8=AE=AD=E7=BB=83=E6=97=B6=E4=BD=BF=E7=94=A8=E7=9A=84?= =?UTF-8?q?tokenizer.eos=5Ftoke=E6=98=AF'<|end=5Fof=5Ftext|>'=20=EF=BC=8C?= =?UTF-8?q?=E8=BF=99=E9=87=8C=E5=9C=A8=E6=AF=8F=E6=9D=A1=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=90=8E=E9=9D=A2=E4=B9=9F=E5=BE=97=E7=94=A8=E8=BF=99=E4=B8=AA?= =?UTF-8?q?=EF=BC=8C=E8=80=8C=E4=B8=8D=E6=98=AF'<|eot=5Fid|>'=EF=BC=8C?= =?UTF-8?q?=E5=90=A6=E5=88=99=E5=BE=88=E5=AE=B9=E6=98=93=E5=AF=BC=E8=87=B4?= =?UTF-8?q?=E4=B8=A5=E9=87=8D=E7=9A=84=E6=80=A7=E8=83=BD=E4=B8=8B=E9=99=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: ef470561f742b16eaa0f99c4cadecd7c84ce6bd2 --- src/llamafactory/data/processors/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index 87727b55..4050f74c 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -12,7 +12,8 @@ def preprocess_pretrain_dataset( examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled - text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]] + eos_token = '<|end_of_text|>' if data_args.template == 'llama3' else tokenizer.eos_token + text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]] if not data_args.packing: if data_args.template == "gemma": From a7233181f28bb6e9008c8c67654c04621e8bc8ea Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 16:52:36 +0800 Subject: [PATCH 020/162] fix deepspeed version Former-commit-id: 938a69bb07d4de7d82928ff01c582032162c1480 --- src/llamafactory/model/model_utils/moe.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/llamafactory/model/model_utils/moe.py b/src/llamafactory/model/model_utils/moe.py index e554e45a..8a73c844 100644 --- a/src/llamafactory/model/model_utils/moe.py +++ b/src/llamafactory/model/model_utils/moe.py @@ -1,5 +1,6 @@ -from typing import 
TYPE_CHECKING +from typing import TYPE_CHECKING, Sequence +import torch from transformers.integrations import is_deepspeed_zero3_enabled from transformers.utils.versions import require_version @@ -10,6 +11,13 @@ if TYPE_CHECKING: from ...hparams import ModelArguments +def _set_z3_leaf_modules(model: "PreTrainedModel", leaf_modules: Sequence["torch.nn.Module"]) -> None: + require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") + from deepspeed.utils import set_z3_leaf_modules # type: ignore + + set_z3_leaf_modules(model, leaf_modules) + + def add_z3_leaf_module(model: "PreTrainedModel") -> None: r""" Sets module as a leaf module to skip partitioning in deepspeed zero3. @@ -17,33 +25,30 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None: if not is_deepspeed_zero3_enabled(): return - require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") - from deepspeed.utils import set_z3_leaf_modules # type: ignore - if getattr(model.config, "model_type", None) == "dbrx": from transformers.models.dbrx.modeling_dbrx import DbrxFFN - set_z3_leaf_modules(model, [DbrxFFN]) + _set_z3_leaf_modules(model, [DbrxFFN]) if getattr(model.config, "model_type", None) == "jamba": from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock - set_z3_leaf_modules(model, [JambaSparseMoeBlock]) + _set_z3_leaf_modules(model, [JambaSparseMoeBlock]) if getattr(model.config, "model_type", None) == "jetmoe": from transformers.models.jetmoe.modeling_jetmoe import JetMoeMoA, JetMoeMoE - set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE]) + _set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE]) if getattr(model.config, "model_type", None) == "mixtral": from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock - set_z3_leaf_modules(model, [MixtralSparseMoeBlock]) + _set_z3_leaf_modules(model, [MixtralSparseMoeBlock]) if getattr(model.config, "model_type", None) == "qwen2moe": from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock - set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock]) + _set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock]) def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None: From 6625bf6b332a52d09be21c0de3b989468d1ecf6c Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 11 Jun 2024 17:02:14 +0800 Subject: [PATCH 021/162] Update pretrain.py Former-commit-id: e2317b2a84149e39fddfd6366be3de23dfb71f82 --- src/llamafactory/data/processors/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index 4050f74c..832c987e 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -12,7 +12,7 @@ def preprocess_pretrain_dataset( examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled - eos_token = '<|end_of_text|>' if data_args.template == 'llama3' else tokenizer.eos_token + eos_token = "<|end_of_text|>" if data_args.template == "llama3" else tokenizer.eos_token text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]] if not data_args.packing: From 45712c6251414024413eb2f669214e93c693f8c6 Mon Sep 17 00:00:00 2001 From: Arthur Kim Date: Wed, 12 Jun 2024 16:49:12 +0900 Subject: [PATCH 022/162] Support vllm==0.5.0 Former-commit-id: 
e7a8ffd7af21bc3759f055033ba2209fa7a1be0e --- src/llamafactory/chat/vllm_engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index 87ce8684..d096f6eb 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -13,7 +13,10 @@ from .base_engine import BaseEngine, Response if is_vllm_available(): from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest - from vllm.sequence import MultiModalData + try: + from vllm.multimodal import MultiModalData # vllm==0.5.0 + except ImportError: + from vllm.sequence import MultiModalData # vllm<0.5.0 if TYPE_CHECKING: From f7b66af7aff84177d6310b01e15bf58f2d0ca170 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 12 Jun 2024 16:50:11 +0800 Subject: [PATCH 023/162] fix #4242 Former-commit-id: cf260e7af03f49aa5e3d6daf3b27738ff9b9bcb8 --- Dockerfile | 2 +- src/llamafactory/chat/vllm_engine.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 45849601..3932ff30 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,7 +32,7 @@ RUN EXTRA_PACKAGES="metrics"; \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ pip install -e .[$EXTRA_PACKAGES] && \ - pip uninstall -y transformer-engine + pip uninstall -y transformer-engine flash-attn # Set up volumes VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index d096f6eb..d2850a6e 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -13,10 +13,11 @@ from .base_engine import BaseEngine, Response if is_vllm_available(): from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest + try: - from vllm.multimodal import MultiModalData # vllm==0.5.0 + from vllm.multimodal import MultiModalData # type: ignore (for vllm>=0.5.0) except ImportError: - from vllm.sequence import MultiModalData # vllm<0.5.0 + from vllm.sequence import MultiModalData # for vllm<0.5.0 if TYPE_CHECKING: From 8ad6a70a57f2892a748a94db80c7185a3ba8b7d8 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 12 Jun 2024 17:39:12 +0800 Subject: [PATCH 024/162] update readme Former-commit-id: a436aaa83f0cf12c8f404459e5486f9369d538ec --- README.md | 2 +- README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 35dacd2e..a773a5f4 100644 --- a/README.md +++ b/README.md @@ -443,7 +443,7 @@ docker run -it --gpus=all \ ``` > [!TIP] -> Use Docker Compose to build image via `docker compose up -d`. +> Use Docker Compose to build image via `docker-compose up -d`.
Details about volume diff --git a/README_zh.md b/README_zh.md index 0ddb8b19..7a9cb159 100644 --- a/README_zh.md +++ b/README_zh.md @@ -443,7 +443,7 @@ docker run -it --gpus=all \ ``` > [!TIP] -> 通过 `docker compose up -d` 使用 Docker Compose 构建镜像。 +> 通过 `docker-compose up -d` 使用 Docker Compose 构建镜像。
数据卷详情 From d98900d804b94bc0a972483c2d91c6883547c18d Mon Sep 17 00:00:00 2001 From: hzhaoy Date: Wed, 12 Jun 2024 18:29:03 +0800 Subject: [PATCH 025/162] adapt vllm==0.5.0 Former-commit-id: 02afd9ff64f23e6707ac739ae1269f41bd70c340 --- src/llamafactory/chat/vllm_engine.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index d2850a6e..34126adf 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -1,10 +1,12 @@ import uuid from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence, Union +from packaging import version + from ..data import get_template_and_fix_tokenizer from ..extras.logging import get_logger from ..extras.misc import get_device_count -from ..extras.packages import is_vllm_available +from ..extras.packages import is_vllm_available, _get_package_version from ..model import load_config, load_tokenizer from ..model.model_utils.visual import LlavaMultiModalProjectorForYiVLForVLLM from .base_engine import BaseEngine, Response @@ -14,10 +16,10 @@ if is_vllm_available(): from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest - try: - from vllm.multimodal import MultiModalData # type: ignore (for vllm>=0.5.0) - except ImportError: - from vllm.sequence import MultiModalData # for vllm<0.5.0 + if _get_package_version("vllm") >= version.parse("0.5.0"): + from vllm.multimodal.image import ImagePixelData + else: + from vllm.sequence import MultiModalData if TYPE_CHECKING: @@ -110,7 +112,10 @@ class VllmEngine(BaseEngine): if self.processor is not None and image is not None: # add image features image_processor: "BaseImageProcessor" = getattr(self.processor, "image_processor") pixel_values = image_processor(image, return_tensors="pt")["pixel_values"] - multi_modal_data = MultiModalData(type=MultiModalData.Type.IMAGE, data=pixel_values) + if _get_package_version("vllm") >= version.parse("0.5.0"): + multi_modal_data = ImagePixelData(pixel_values) + else: + multi_modal_data = MultiModalData(type=MultiModalData.Type.IMAGE, data=pixel_values) else: multi_modal_data = None From 7d3a9b10b70d1fc1af3c218bb21f7c3fe42b59b9 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 00:07:48 +0800 Subject: [PATCH 026/162] fix docker compose usage Former-commit-id: 59a5bd5d5c8d2a44e2dad26b74e77a45e109c8d6 --- README.md | 10 ++++++++-- README_zh.md | 10 ++++++++-- docker-compose.yml | 5 +++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a773a5f4..65964560 100644 --- a/README.md +++ b/README.md @@ -423,6 +423,8 @@ llamafactory-cli webui ### Build Docker +#### Use Docker + ```bash docker build -f ./Dockerfile \ --build-arg INSTALL_BNB=false \ @@ -442,8 +444,12 @@ docker run -it --gpus=all \ llamafactory:latest ``` -> [!TIP] -> Use Docker Compose to build image via `docker-compose up -d`. +#### Use Docker Compose + +```bash +docker-compose up -d +docker-compose exec -it llamafactory bash +```
Details about volume diff --git a/README_zh.md b/README_zh.md index 7a9cb159..7962a6d1 100644 --- a/README_zh.md +++ b/README_zh.md @@ -423,6 +423,8 @@ llamafactory-cli webui ### 构建 Docker +#### 使用 Docker + ```bash docker build -f ./Dockerfile \ --build-arg INSTALL_BNB=false \ @@ -442,8 +444,12 @@ docker run -it --gpus=all \ llamafactory:latest ``` -> [!TIP] -> 通过 `docker-compose up -d` 使用 Docker Compose 构建镜像。 +#### 使用 Docker Compose + +```bash +docker-compose up -d +docker-compose exec -it llamafactory bash +```
数据卷详情 diff --git a/docker-compose.yml b/docker-compose.yml index b3e4a34d..c5dc34e9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: llamafactory: build: @@ -19,6 +17,9 @@ services: - "7860:7860" - "8000:8000" ipc: host + tty: true + stdin_open: true + command: bash deploy: resources: reservations: From f4c95557609699ff36d483beb49d0f792fbff146 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 00:48:44 +0800 Subject: [PATCH 027/162] fix lint Former-commit-id: b170165679317af2b3f03633afac27661b3deb06 --- README.md | 2 +- README_zh.md | 2 +- src/llamafactory/hparams/finetuning_args.py | 17 ++++++++--------- src/llamafactory/hparams/model_args.py | 12 ++++++++---- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 65964560..994a62c6 100644 --- a/README.md +++ b/README.md @@ -448,7 +448,7 @@ docker run -it --gpus=all \ ```bash docker-compose up -d -docker-compose exec -it llamafactory bash +docker-compose exec llamafactory bash ```
Details about volume diff --git a/README_zh.md b/README_zh.md index 7962a6d1..fa395c6b 100644 --- a/README_zh.md +++ b/README_zh.md @@ -448,7 +448,7 @@ docker run -it --gpus=all \ ```bash docker-compose up -d -docker-compose exec -it llamafactory bash +docker-compose exec llamafactory bash ```
数据卷详情 diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index 08af31e4..facbe792 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import Literal, Optional +from typing import List, Literal, Optional @dataclass @@ -319,20 +319,19 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA return [item.strip() for item in arg.split(",")] return arg - self.freeze_trainable_modules = split_arg(self.freeze_trainable_modules) - self.freeze_extra_modules = split_arg(self.freeze_extra_modules) - self.lora_alpha = self.lora_alpha or self.lora_rank * 2 - self.lora_target = split_arg(self.lora_target) - self.additional_target = split_arg(self.additional_target) - self.galore_target = split_arg(self.galore_target) + self.freeze_trainable_modules: List[str] = split_arg(self.freeze_trainable_modules) + self.freeze_extra_modules: Optional[List[str]] = split_arg(self.freeze_extra_modules) + self.lora_alpha: int = self.lora_alpha or self.lora_rank * 2 + self.lora_target: List[str] = split_arg(self.lora_target) + self.additional_target: Optional[List[str]] = split_arg(self.additional_target) + self.galore_target: List[str] = split_arg(self.galore_target) self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only + self.use_ref_model = self.pref_loss not in ["orpo", "simpo"] assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." assert self.reward_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." 
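The lint pass above re-annotates attributes at their `__post_init__` assignment, letting a dataclass declare one type for the parser-facing field (a comma-separated string) while recording another for the normalized value. A self-contained sketch of the idiom, reusing the `lora_target` field from the diff; the class name is a placeholder:

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class SketchArguments:
    # The argument parser fills this with a comma-separated string.
    lora_target: str = field(default="all")

    def __post_init__(self):
        def split_arg(arg):
            if isinstance(arg, str):
                return [item.strip() for item in arg.split(",")]
            return arg

        # Annotating the assignment records the post-parse type for checkers.
        self.lora_target: List[str] = split_arg(self.lora_target)
```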
- self.use_ref_model = self.pref_loss not in ["orpo", "simpo"] - if self.stage == "ppo" and self.reward_model is None: raise ValueError("`reward_model` is necessary for PPO training.") diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 71467770..359beafd 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -1,9 +1,13 @@ from dataclasses import asdict, dataclass, field -from typing import Any, Dict, Literal, Optional +from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union from typing_extensions import Self +if TYPE_CHECKING: + import torch + + @dataclass class ModelArguments: r""" @@ -194,9 +198,9 @@ class ModelArguments: ) def __post_init__(self): - self.compute_dtype = None - self.device_map = None - self.model_max_length = None + self.compute_dtype: Optional["torch.dtype"] = None + self.device_map: Optional[Union[str, Dict[str, Any]]] = None + self.model_max_length: Optional[int] = None if self.split_special_tokens and self.use_fast_tokenizer: raise ValueError("`split_special_tokens` is only supported for slow tokenizers.") From 39e3d3fed63a5185806802e0b7de2eaf22451c22 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 01:00:56 +0800 Subject: [PATCH 028/162] add neo-sft dataset Former-commit-id: 34863fa7cb641ceca92e3a2eec914126db537b62 --- README.md | 1 + README_zh.md | 1 + data/dataset_info.json | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/README.md b/README.md index 994a62c6..5bbaf2d7 100644 --- a/README.md +++ b/README.md @@ -259,6 +259,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t - [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia) - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) +- [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) diff --git a/README_zh.md b/README_zh.md index fa395c6b..fb616909 100644 --- a/README_zh.md +++ b/README_zh.md @@ -259,6 +259,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia) - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) +- [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) diff --git a/data/dataset_info.json b/data/dataset_info.json index 8c5cbb45..1d226b3a 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -248,6 +248,10 @@ "ruozhiba_gpt4": { "hf_hub_url": "hfl/ruozhiba_gpt4_turbo" }, + "neo_sft": { + "hf_hub_url": "m-a-p/neo_sft_phase2", + "formatting": "sharegpt" + }, "llava_1k_en": { "hf_hub_url": "BUAADreamer/llava-en-zh-2k", "subset": "en", From 344d1192acfbfb2aba7cbb8bbe45900546618e20 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 01:58:16 +0800 Subject: [PATCH 
029/162] clean code Former-commit-id: f54cafd5c7f0383370d1a2f357834a61a97397ce --- src/llamafactory/chat/vllm_engine.py | 12 +++++----- src/llamafactory/extras/packages.py | 22 +++++-------------- .../model/model_utils/attention.py | 7 +++--- src/llamafactory/train/sft/metric.py | 3 ++- 4 files changed, 17 insertions(+), 27 deletions(-) diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index 34126adf..e4c05478 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -1,12 +1,10 @@ import uuid from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence, Union -from packaging import version - from ..data import get_template_and_fix_tokenizer from ..extras.logging import get_logger from ..extras.misc import get_device_count -from ..extras.packages import is_vllm_available, _get_package_version +from ..extras.packages import is_vllm_available, is_vllm_version_greater_than_0_5 from ..model import load_config, load_tokenizer from ..model.model_utils.visual import LlavaMultiModalProjectorForYiVLForVLLM from .base_engine import BaseEngine, Response @@ -16,7 +14,7 @@ if is_vllm_available(): from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest - if _get_package_version("vllm") >= version.parse("0.5.0"): + if is_vllm_version_greater_than_0_5(): from vllm.multimodal.image import ImagePixelData else: from vllm.sequence import MultiModalData @@ -112,9 +110,9 @@ class VllmEngine(BaseEngine): if self.processor is not None and image is not None: # add image features image_processor: "BaseImageProcessor" = getattr(self.processor, "image_processor") pixel_values = image_processor(image, return_tensors="pt")["pixel_values"] - if _get_package_version("vllm") >= version.parse("0.5.0"): - multi_modal_data = ImagePixelData(pixel_values) - else: + if is_vllm_version_greater_than_0_5(): + multi_modal_data = ImagePixelData(image=pixel_values) + else: # TODO: remove vllm 0.4.3 support multi_modal_data = MultiModalData(type=MultiModalData.Type.IMAGE, data=pixel_values) else: multi_modal_data = None diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index 4c9e6492..0746bb4f 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -1,5 +1,6 @@ import importlib.metadata import importlib.util +from functools import lru_cache from typing import TYPE_CHECKING from packaging import version @@ -24,10 +25,6 @@ def is_fastapi_available(): return _is_package_available("fastapi") -def is_flash_attn2_available(): - return _is_package_available("flash_attn") and _get_package_version("flash_attn") > version.parse("2.0.0") - - def is_galore_available(): return _is_package_available("galore_torch") @@ -36,18 +33,10 @@ def is_gradio_available(): return _is_package_available("gradio") -def is_jieba_available(): - return _is_package_available("jieba") - - def is_matplotlib_available(): return _is_package_available("matplotlib") -def is_nltk_available(): - return _is_package_available("nltk") - - def is_pillow_available(): return _is_package_available("PIL") @@ -60,10 +49,6 @@ def is_rouge_available(): return _is_package_available("rouge_chinese") -def is_sdpa_available(): - return _get_package_version("torch") > version.parse("2.1.1") - - def is_starlette_available(): return _is_package_available("sse_starlette") @@ -74,3 +59,8 @@ def is_uvicorn_available(): def is_vllm_available(): 
return _is_package_available("vllm") + + +@lru_cache +def is_vllm_version_greater_than_0_5(): + return _get_package_version("vllm") >= version.parse("0.5.0") diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index b52ddc86..2bd36fdc 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -1,7 +1,8 @@ from typing import TYPE_CHECKING +from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available + from ...extras.logging import get_logger -from ...extras.packages import is_flash_attn2_available, is_sdpa_available if TYPE_CHECKING: @@ -21,13 +22,13 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "Model requested_attn_implementation = "eager" elif model_args.flash_attn == "sdpa": - if not is_sdpa_available(): + if not is_torch_sdpa_available(): logger.warning("torch>=2.1.1 is required for SDPA attention.") return requested_attn_implementation = "sdpa" elif model_args.flash_attn == "fa2": - if not is_flash_attn2_available(): + if not is_flash_attn_2_available(): logger.warning("FlashAttention-2 is not installed.") return diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index b135fcfb..6ed356c1 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -2,9 +2,10 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union import numpy as np +from transformers.utils import is_jieba_available, is_nltk_available from ...extras.constants import IGNORE_INDEX -from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available +from ...extras.packages import is_rouge_available if TYPE_CHECKING: From 045cef901ea3fb1516f242ae0b9faac9a99185df Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 02:25:50 +0800 Subject: [PATCH 030/162] fix #4209 DeepSpeed ZeRO3 has inflight param error when calling model.eval() Former-commit-id: 4be013f18ea6a35b5a11db98db5f0670ffb41619 --- src/llamafactory/train/dpo/trainer.py | 7 +++++-- src/llamafactory/train/kto/trainer.py | 7 +++++-- src/llamafactory/train/ppo/trainer.py | 2 ++ src/llamafactory/train/trainer_utils.py | 13 ------------- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index d860b29a..5bdb9c43 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -1,3 +1,4 @@ +import warnings from collections import defaultdict from contextlib import nullcontext from types import MethodType @@ -10,7 +11,7 @@ from trl import DPOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps, get_ref_context +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps if TYPE_CHECKING: @@ -61,6 +62,8 @@ class CustomDPOTrainer(DPOTrainer): if not hasattr(self, "accelerator"): raise AttributeError("Please update `transformers`.") + warnings.simplefilter("ignore") # remove gc warnings on ref model + if ref_model is not None: if self.is_deepspeed_enabled: if not ( @@ -176,7 +179,7 @@ class CustomDPOTrainer(DPOTrainer): if self.ref_model is None: ref_model = model - ref_context = get_ref_context(self.accelerator, model) + ref_context = 
self.accelerator.unwrap_model(model).disable_adapter() else: ref_model = self.ref_model ref_context = nullcontext() diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 22a84e4a..3b4488fc 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -1,3 +1,4 @@ +import warnings from collections import defaultdict from contextlib import nullcontext from types import MethodType @@ -9,7 +10,7 @@ from trl import KTOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps, get_ref_context +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps if TYPE_CHECKING: @@ -60,6 +61,8 @@ class CustomKTOTrainer(KTOTrainer): if not hasattr(self, "accelerator"): raise AttributeError("Please update `transformers`.") + warnings.simplefilter("ignore") # remove gc warnings on ref model + if ref_model is not None: if self.is_deepspeed_enabled: if not ( @@ -143,7 +146,7 @@ class CustomKTOTrainer(KTOTrainer): """ if self.ref_model is None: ref_model = model - ref_context = get_ref_context(self.accelerator, model) + ref_context = self.accelerator.unwrap_model(model).disable_adapter() else: ref_model = self.ref_model ref_context = nullcontext() diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 2e1288e4..737c45a3 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -1,6 +1,7 @@ import math import os import sys +import warnings from types import MethodType from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple @@ -136,6 +137,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): device_type = unwrapped_model.pretrained_model.device.type self.amp_context = torch.autocast(device_type, dtype=model_args.compute_dtype) + warnings.simplefilter("ignore") # remove gc warnings on ref model if finetuning_args.reward_model_type == "full": if self.is_deepspeed_enabled: diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 7e9cc881..48944a63 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -1,4 +1,3 @@ -from contextlib import contextmanager from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch @@ -19,7 +18,6 @@ if is_galore_available(): if TYPE_CHECKING: - from accelerate import Accelerator from transformers import PreTrainedModel, Seq2SeqTrainingArguments from trl import AutoModelForCausalLMWithValueHead @@ -154,17 +152,6 @@ def create_reward_model( return reward_model -@contextmanager -def get_ref_context(accelerator: "Accelerator", model: "PreTrainedModel"): - r""" - Gets adapter context for the reference model. - """ - with accelerator.unwrap_model(model).disable_adapter(): - model.eval() - yield - model.train() - - def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]: r""" Returns a list of names of parameters with weight decay. 
(weights in non-layernorm layers) From e8885443a9edae3452c9fd26cfc0e1feef1c2fe5 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 02:48:21 +0800 Subject: [PATCH 031/162] fix #4221 Former-commit-id: 05a3be4853b941909e7d193c31e8d62c8c5f879b --- src/llamafactory/data/aligner.py | 8 ++++++-- src/llamafactory/data/loader.py | 11 ++++++----- src/llamafactory/data/preprocess.py | 3 +-- src/llamafactory/data/processors/feedback.py | 3 +-- src/llamafactory/data/processors/pairwise.py | 3 +-- src/llamafactory/data/processors/pretrain.py | 2 +- src/llamafactory/data/processors/supervised.py | 3 +-- src/llamafactory/data/processors/unsupervised.py | 3 +-- src/llamafactory/train/sft/metric.py | 2 +- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py index 434956af..3e9d5c46 100644 --- a/src/llamafactory/data/aligner.py +++ b/src/llamafactory/data/aligner.py @@ -10,6 +10,7 @@ from .data_utils import Role if TYPE_CHECKING: from datasets import Dataset, IterableDataset + from transformers import Seq2SeqTrainingArguments from ..hparams import DataArguments from .parser import DatasetAttr @@ -175,7 +176,10 @@ def convert_sharegpt( def align_dataset( - dataset: Union["Dataset", "IterableDataset"], dataset_attr: "DatasetAttr", data_args: "DataArguments" + dataset: Union["Dataset", "IterableDataset"], + dataset_attr: "DatasetAttr", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", ) -> Union["Dataset", "IterableDataset"]: r""" Aligned dataset: @@ -208,7 +212,7 @@ def align_dataset( if not data_args.streaming: kwargs = dict( num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=(not data_args.overwrite_cache), + load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0), desc="Converting format of dataset", ) diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index 2c236c76..ba426f81 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -18,8 +18,7 @@ from .template import get_template_and_fix_tokenizer if TYPE_CHECKING: from datasets import Dataset, IterableDataset - from transformers import ProcessorMixin, Seq2SeqTrainingArguments - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments from ..hparams import DataArguments, ModelArguments from .parser import DatasetAttr @@ -32,6 +31,7 @@ def load_single_dataset( dataset_attr: "DatasetAttr", model_args: "ModelArguments", data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", ) -> Union["Dataset", "IterableDataset"]: logger.info("Loading dataset {}...".format(dataset_attr)) data_path, data_name, data_dir, data_files = None, None, None, None @@ -123,7 +123,7 @@ def load_single_dataset( max_samples = min(data_args.max_samples, len(dataset)) dataset = dataset.select(range(max_samples)) - return align_dataset(dataset, dataset_attr, data_args) + return align_dataset(dataset, dataset_attr, data_args, training_args) def get_dataset( @@ -157,7 +157,8 @@ def get_dataset( if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True): raise ValueError("The dataset is not applicable in the current training stage.") - all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args)) + all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args, 
training_args)) + dataset = merge_dataset(all_datasets, data_args, training_args) with training_args.main_process_first(desc="pre-process dataset"): @@ -169,7 +170,7 @@ def get_dataset( if not data_args.streaming: kwargs = dict( num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=(not data_args.overwrite_cache), + load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0), desc="Running tokenizer on dataset", ) diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py index 97789c39..875f55d6 100644 --- a/src/llamafactory/data/preprocess.py +++ b/src/llamafactory/data/preprocess.py @@ -13,8 +13,7 @@ from .processors.unsupervised import preprocess_unsupervised_dataset, print_unsu if TYPE_CHECKING: - from transformers import ProcessorMixin, Seq2SeqTrainingArguments - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments from ..hparams import DataArguments from .template import Template diff --git a/src/llamafactory/data/processors/feedback.py b/src/llamafactory/data/processors/feedback.py index 98d83658..5fba452c 100644 --- a/src/llamafactory/data/processors/feedback.py +++ b/src/llamafactory/data/processors/feedback.py @@ -6,8 +6,7 @@ from .processor_utils import get_paligemma_token_type_ids, get_pixel_values if TYPE_CHECKING: - from transformers import ProcessorMixin - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin from ...hparams import DataArguments from ..template import Template diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py index fe984efa..db52c6a7 100644 --- a/src/llamafactory/data/processors/pairwise.py +++ b/src/llamafactory/data/processors/pairwise.py @@ -6,8 +6,7 @@ from .processor_utils import get_paligemma_token_type_ids, get_pixel_values if TYPE_CHECKING: - from transformers import ProcessorMixin - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin from ...hparams import DataArguments from ..template import Template diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index 832c987e..a10ccabd 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any, Dict, List if TYPE_CHECKING: - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer from ...hparams import DataArguments diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index 19d60280..f59f5371 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -7,8 +7,7 @@ from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, gre if TYPE_CHECKING: - from transformers import ProcessorMixin - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin from ...hparams import DataArguments from ..template import Template diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py index f711eeac..38497a15 100644 --- a/src/llamafactory/data/processors/unsupervised.py +++ 
b/src/llamafactory/data/processors/unsupervised.py @@ -6,8 +6,7 @@ from .processor_utils import get_paligemma_token_type_ids, get_pixel_values if TYPE_CHECKING: - from transformers import ProcessorMixin - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin from ...hparams import DataArguments from ..template import Template diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index 6ed356c1..923238d6 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -9,7 +9,7 @@ from ...extras.packages import is_rouge_available if TYPE_CHECKING: - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer if is_jieba_available(): From 554c84f8d3af480336deb0b3649fde8cb5da766f Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 03:15:06 +0800 Subject: [PATCH 032/162] update examples Former-commit-id: 19681f93db399d695aa8e35f8ec2a9e720875baa --- README.md | 2 +- README_zh.md | 2 +- examples/README.md | 126 ++++++++--------- examples/README_zh.md | 128 ++++++++---------- .../extras/fsdp_qlora/llama3_lora_sft.yaml | 4 +- .../extras/llama_pro/llama3_freeze_sft.yaml | 1 + examples/extras/loraplus/llama3_lora_sft.yaml | 1 + examples/extras/mod/llama3_full_sft.yaml | 1 + examples/lora_multi_gpu/llama3_lora_sft.yaml | 41 ------ .../llama3_full_predict.yaml | 0 .../llama3_full_sft_ds3.yaml} | 0 .../llama3_lora_dpo.yaml | 1 + .../llama3_lora_eval.yaml | 0 .../llama3_lora_kto.yaml | 2 + .../llama3_lora_ppo.yaml | 1 + .../llama3_lora_predict.yaml | 1 + .../llama3_lora_pretrain.yaml | 1 + .../llama3_lora_reward.yaml | 1 + .../llama3_lora_sft.yaml | 1 + .../llama3_lora_sft_ds0.yaml} | 4 +- .../llama3_lora_sft_ds3.yaml} | 4 +- .../llama3_preprocess.yaml | 0 .../llava1_5_lora_sft.yaml | 1 + .../llama3_lora_sft_aqlm.yaml | 1 + .../llama3_lora_sft_awq.yaml | 1 + .../llama3_lora_sft_bitsandbytes.yaml | 1 + .../llama3_lora_sft_gptq.yaml | 1 + 27 files changed, 128 insertions(+), 199 deletions(-) delete mode 100644 examples/lora_multi_gpu/llama3_lora_sft.yaml rename examples/{full_multi_gpu => train_full}/llama3_full_predict.yaml (100%) rename examples/{full_multi_gpu/llama3_full_sft.yaml => train_full/llama3_full_sft_ds3.yaml} (100%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_dpo.yaml (96%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_eval.yaml (100%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_kto.yaml (94%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_ppo.yaml (96%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_predict.yaml (95%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_pretrain.yaml (96%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_reward.yaml (96%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_sft.yaml (96%) rename examples/{lora_multi_npu/llama3_lora_sft_ds.yaml => train_lora/llama3_lora_sft_ds0.yaml} (98%) rename examples/{lora_multi_gpu/llama3_lora_sft_ds.yaml => train_lora/llama3_lora_sft_ds3.yaml} (98%) rename examples/{lora_single_gpu => train_lora}/llama3_preprocess.yaml (100%) rename examples/{lora_single_gpu => train_lora}/llava1_5_lora_sft.yaml (96%) rename examples/{qlora_single_gpu => train_qlora}/llama3_lora_sft_aqlm.yaml (96%) rename examples/{qlora_single_gpu => train_qlora}/llama3_lora_sft_awq.yaml (96%) rename examples/{qlora_single_gpu => 
train_qlora}/llama3_lora_sft_bitsandbytes.yaml (96%) rename examples/{qlora_single_gpu => train_qlora}/llama3_lora_sft_gptq.yaml (96%) diff --git a/README.md b/README.md index 5bbaf2d7..5dd10d5a 100644 --- a/README.md +++ b/README.md @@ -406,7 +406,7 @@ Please refer to [data/README.md](data/README.md) for checking the details about Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Llama3-8B-Instruct model, respectively. ```bash -llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml llamafactory-cli chat examples/inference/llama3_lora_sft.yaml llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` diff --git a/README_zh.md b/README_zh.md index fb616909..76bd2d89 100644 --- a/README_zh.md +++ b/README_zh.md @@ -406,7 +406,7 @@ Docker 镜像: 下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。 ```bash -llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml llamafactory-cli chat examples/inference/llama3_lora_sft.yaml llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` diff --git a/examples/README.md b/examples/README.md index f985d552..3372afb9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -4,59 +4,57 @@ Make sure to execute these commands in the `LLaMA-Factory` directory. ## Table of Contents -- [LoRA Fine-Tuning on A Single GPU](#lora-fine-tuning-on-a-single-gpu) -- [QLoRA Fine-Tuning on a Single GPU](#qlora-fine-tuning-on-a-single-gpu) -- [LoRA Fine-Tuning on Multiple GPUs](#lora-fine-tuning-on-multiple-gpus) -- [LoRA Fine-Tuning on Multiple NPUs](#lora-fine-tuning-on-multiple-npus) -- [Full-Parameter Fine-Tuning on Multiple GPUs](#full-parameter-fine-tuning-on-multiple-gpus) +- [LoRA Fine-Tuning](#lora-fine-tuning) +- [QLoRA Fine-Tuning](#qlora-fine-tuning) +- [Full-Parameter Fine-Tuning](#full-parameter-fine-tuning) - [Merging LoRA Adapters and Quantization](#merging-lora-adapters-and-quantization) - [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models) - [Extras](#extras) ## Examples -### LoRA Fine-Tuning on A Single GPU +### LoRA Fine-Tuning #### (Continuous) Pre-Training ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml +llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml ``` #### Supervised Fine-Tuning ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml ``` #### Multimodal Supervised Fine-Tuning ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml +llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml ``` #### Reward Modeling ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml +llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml ``` #### PPO Training ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml +llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml ``` #### DPO/ORPO/SimPO Training ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml +llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml ``` #### KTO Training ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train 
examples/lora_single_gpu/llama3_lora_kto.yaml +llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml ``` #### Preprocess Dataset @@ -64,95 +62,79 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo It is useful for large dataset, use `tokenized_path` in config to load the preprocessed dataset. ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml +llamafactory-cli train examples/train_lora/llama3_preprocess.yaml ``` #### Evaluating on MMLU/CMMLU/C-Eval Benchmarks ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml +llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml ``` #### Batch Predicting and Computing BLEU and ROUGE Scores ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml -``` - -### QLoRA Fine-Tuning on a Single GPU - -#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended) - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml -``` - -#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml -``` - -#### Supervised Fine-Tuning with 4-bit AWQ Quantization - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml -``` - -#### Supervised Fine-Tuning with 2-bit AQLM Quantization - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml -``` - -### LoRA Fine-Tuning on Multiple GPUs - -#### Supervised Fine-Tuning on Single Node - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml ``` #### Supervised Fine-Tuning on Multiple Nodes ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml ``` #### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding) ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft_ds.yaml +FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds.yaml ``` -### LoRA Fine-Tuning on Multiple NPUs +### QLoRA Fine-Tuning -#### Supervised Fine-Tuning with DeepSpeed ZeRO-0 +#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended) ```bash -ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml ``` -### Full-Parameter Fine-Tuning on Multiple GPUs +#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization + +```bash +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml +``` + +#### Supervised Fine-Tuning with 4-bit AWQ Quantization + 
+```bash +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml +``` + +#### Supervised Fine-Tuning with 2-bit AQLM Quantization + +```bash +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml +``` + +### Full-Parameter Fine-Tuning #### Supervised Fine-Tuning on Single Node ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml +FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml ``` #### Supervised Fine-Tuning on Multiple Nodes ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml ``` #### Batch Predicting and Computing BLEU and ROUGE Scores ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_predict.yaml +llamafactory-cli train examples/train_full/llama3_full_predict.yaml ``` ### Merging LoRA Adapters and Quantization @@ -162,35 +144,33 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llam Note: DO NOT use quantized model or `quantization_bit` when merging LoRA adapters. ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` #### Quantizing Model using AutoGPTQ ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml +llamafactory-cli export examples/merge_lora/llama3_gptq.yaml ``` ### Inferring LoRA Fine-Tuned Models -Use `CUDA_VISIBLE_DEVICES=0,1` to infer models on multiple devices. 
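The reorganized examples above drop the per-command `CUDA_VISIBLE_DEVICES` prefixes and gate distributed runs behind `FORCE_TORCHRUN` plus the usual rendezvous variables. A sketch of what such an environment-gated launcher can look like; the variable names match the commands above, while the dispatch logic and the `train.py` entry point are illustrative rather than the CLI's actual implementation:

```python
import os
import subprocess
import sys


def maybe_torchrun(training_args: list) -> None:
    # Re-launch under torchrun when FORCE_TORCHRUN is set; single-node,
    # single-process runs fall through to the caller unchanged.
    if os.environ.get("FORCE_TORCHRUN", "0").lower() in ("1", "true"):
        cmd = [
            "torchrun",
            "--nnodes", os.environ.get("NNODES", "1"),
            "--node_rank", os.environ.get("RANK", "0"),
            "--master_addr", os.environ.get("MASTER_ADDR", "127.0.0.1"),
            "--master_port", os.environ.get("MASTER_PORT", "29500"),
            "train.py", *training_args,
        ]
        sys.exit(subprocess.run(cmd).returncode)
```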
- #### Use CLI ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml ``` #### Use Web UI ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml +llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml ``` #### Launch OpenAI-style API ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml +llamafactory-cli api examples/inference/llama3_lora_sft.yaml ``` ### Extras @@ -198,32 +178,32 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.y #### Full-Parameter Fine-Tuning using GaLore ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml +llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml ``` #### Full-Parameter Fine-Tuning using BAdam ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml +llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml ``` #### LoRA+ Fine-Tuning ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml +llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml ``` #### Mixture-of-Depths Fine-Tuning ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml +llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml ``` #### LLaMA-Pro Fine-Tuning ```bash bash examples/extras/llama_pro/expand.sh -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml +llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml ``` #### FSDP+QLoRA Fine-Tuning diff --git a/examples/README_zh.md b/examples/README_zh.md index cf5bbf49..64c31fbd 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -4,59 +4,57 @@ ## 目录 -- [单 GPU LoRA 微调](#单-gpu-lora-微调) -- [单 GPU QLoRA 微调](#单-gpu-qlora-微调) -- [多 GPU LoRA 微调](#多-gpu-lora-微调) -- [多 NPU LoRA 微调](#多-npu-lora-微调) -- [多 GPU 全参数微调](#多-gpu-全参数微调) +- [LoRA 微调](#lora-微调) +- [QLoRA 微调](#qlora-微调) +- [全参数微调](#全参数微调) - [合并 LoRA 适配器与模型量化](#合并-lora-适配器与模型量化) - [推理 LoRA 模型](#推理-lora-模型) - [杂项](#杂项) ## 示例 -### 单 GPU LoRA 微调 +### LoRA 微调 #### (增量)预训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml +llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml ``` #### 指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml ``` #### 多模态指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml +llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml ``` #### 奖励模型训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml +llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml ``` #### PPO 训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml +llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml ``` #### DPO/ORPO/SimPO 训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml +llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml ``` #### KTO 训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_kto.yaml +llamafactory-cli 
train examples/train_lora/llama3_lora_kto.yaml ``` #### 预处理数据集 @@ -64,95 +62,79 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo 对于大数据集有帮助,在配置中使用 `tokenized_path` 以加载预处理后的数据集。 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml +llamafactory-cli train examples/train_lora/llama3_preprocess.yaml ``` #### 在 MMLU/CMMLU/C-Eval 上评估 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml +llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml ``` #### 批量预测并计算 BLEU 和 ROUGE 分数 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml +llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml ``` -### 单 GPU QLoRA 微调 - -#### 基于 4/8 比特 Bitsandbytes 量化进行指令监督微调(推荐) +#### 多机指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml -``` - -#### 基于 4/8 比特 GPTQ 量化进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml -``` - -#### 基于 4 比特 AWQ 量化进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml -``` - -#### 基于 2 比特 AQLM 量化进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml -``` - -### 多 GPU LoRA 微调 - -#### 在单机上进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml -``` - -#### 在多机上进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml ``` #### 使用 DeepSpeed ZeRO-3 平均分配显存 ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft_ds.yaml +FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds.yaml ``` -### 多 NPU LoRA 微调 +### QLoRA 微调 -#### 使用 DeepSpeed ZeRO-0 进行指令监督微调 +#### 基于 4/8 比特 Bitsandbytes 量化进行指令监督微调(推荐) ```bash -ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml +llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml ``` -### 多 GPU 全参数微调 +#### 基于 4/8 比特 GPTQ 量化进行指令监督微调 + +```bash +llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml +``` + +#### 基于 4 比特 AWQ 量化进行指令监督微调 + +```bash +llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml +``` + +#### 基于 2 比特 AQLM 量化进行指令监督微调 + +```bash +llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml +``` + +### 全参数微调 #### 在单机上进行指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml +FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml ``` #### 在多机上进行指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml 
-CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml ``` #### 批量预测并计算 BLEU 和 ROUGE 分数 ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_predict.yaml +llamafactory-cli train examples/train_full/llama3_full_predict.yaml ``` ### 合并 LoRA 适配器与模型量化 @@ -162,35 +144,33 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llam 注:请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` #### 使用 AutoGPTQ 量化模型 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml +llamafactory-cli export examples/merge_lora/llama3_gptq.yaml ``` ### 推理 LoRA 模型 -使用 `CUDA_VISIBLE_DEVICES=0,1` 进行多卡推理。 - #### 使用命令行接口 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml ``` #### 使用浏览器界面 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml +llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml ``` #### 启动 OpenAI 风格 API ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml +llamafactory-cli api examples/inference/llama3_lora_sft.yaml ``` ### 杂项 @@ -198,32 +178,32 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.y #### 使用 GaLore 进行全参数训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml +llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml ``` #### 使用 BAdam 进行全参数训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml +llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml ``` #### LoRA+ 微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml +llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml ``` #### 深度混合微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml +llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml ``` #### LLaMA-Pro 微调 ```bash bash examples/extras/llama_pro/expand.sh -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml +llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml ``` #### FSDP+QLoRA 微调 diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml index 084269ef..cc773991 100644 --- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml +++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml @@ -8,9 +8,6 @@ do_train: true finetuning_type: lora lora_target: all -### ddp -ddp_timeout: 180000000 - ### dataset dataset: identity,alpaca_en_demo template: llama3 @@ -34,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml 
b/examples/extras/llama_pro/llama3_freeze_sft.yaml index 444a1113..f92d6945 100644 --- a/examples/extras/llama_pro/llama3_freeze_sft.yaml +++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml @@ -32,6 +32,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml index 1ba654ec..57383ae0 100644 --- a/examples/extras/loraplus/llama3_lora_sft.yaml +++ b/examples/extras/loraplus/llama3_lora_sft.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml index df03c1e0..085febfc 100644 --- a/examples/extras/mod/llama3_full_sft.yaml +++ b/examples/extras/mod/llama3_full_sft.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 pure_bf16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml deleted file mode 100644 index 348e53b9..00000000 --- a/examples/lora_multi_gpu/llama3_lora_sft.yaml +++ /dev/null @@ -1,41 +0,0 @@ -### model -model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct - -### method -stage: sft -do_train: true -finetuning_type: lora -lora_target: all - -### ddp -ddp_timeout: 180000000 - -### dataset -dataset: identity,alpaca_en_demo -template: llama3 -cutoff_len: 1024 -max_samples: 1000 -overwrite_cache: true -preprocessing_num_workers: 16 - -### output -output_dir: saves/llama3-8b/lora/sft -logging_steps: 10 -save_steps: 500 -plot_loss: true -overwrite_output_dir: true - -### train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 2 -learning_rate: 1.0e-4 -num_train_epochs: 3.0 -lr_scheduler_type: cosine -warmup_ratio: 0.1 -fp16: true - -### eval -val_size: 0.1 -per_device_eval_batch_size: 1 -eval_strategy: steps -eval_steps: 500 diff --git a/examples/full_multi_gpu/llama3_full_predict.yaml b/examples/train_full/llama3_full_predict.yaml similarity index 100% rename from examples/full_multi_gpu/llama3_full_predict.yaml rename to examples/train_full/llama3_full_predict.yaml diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/train_full/llama3_full_sft_ds3.yaml similarity index 100% rename from examples/full_multi_gpu/llama3_full_sft.yaml rename to examples/train_full/llama3_full_sft_ds3.yaml diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/train_lora/llama3_lora_dpo.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_dpo.yaml rename to examples/train_lora/llama3_lora_dpo.yaml index 78344330..db25fb51 100644 --- a/examples/lora_single_gpu/llama3_lora_dpo.yaml +++ b/examples/train_lora/llama3_lora_dpo.yaml @@ -32,6 +32,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_lora_eval.yaml b/examples/train_lora/llama3_lora_eval.yaml similarity index 100% rename from examples/lora_single_gpu/llama3_lora_eval.yaml rename to examples/train_lora/llama3_lora_eval.yaml diff --git a/examples/lora_single_gpu/llama3_lora_kto.yaml b/examples/train_lora/llama3_lora_kto.yaml similarity index 94% rename from examples/lora_single_gpu/llama3_lora_kto.yaml rename to examples/train_lora/llama3_lora_kto.yaml index 
d5234c0a..f730c82e 100644 --- a/examples/lora_single_gpu/llama3_lora_kto.yaml +++ b/examples/train_lora/llama3_lora_kto.yaml @@ -6,6 +6,7 @@ stage: kto do_train: true finetuning_type: lora lora_target: all +pref_beta: 0.1 ### dataset dataset: kto_en_demo @@ -30,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_lora_ppo.yaml b/examples/train_lora/llama3_lora_ppo.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_ppo.yaml rename to examples/train_lora/llama3_lora_ppo.yaml index 98c842f9..e574014e 100644 --- a/examples/lora_single_gpu/llama3_lora_ppo.yaml +++ b/examples/train_lora/llama3_lora_ppo.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### generate max_new_tokens: 512 diff --git a/examples/lora_single_gpu/llama3_lora_predict.yaml b/examples/train_lora/llama3_lora_predict.yaml similarity index 95% rename from examples/lora_single_gpu/llama3_lora_predict.yaml rename to examples/train_lora/llama3_lora_predict.yaml index a127d248..148c8635 100644 --- a/examples/lora_single_gpu/llama3_lora_predict.yaml +++ b/examples/train_lora/llama3_lora_predict.yaml @@ -22,3 +22,4 @@ overwrite_output_dir: true ### eval per_device_eval_batch_size: 1 predict_with_generate: true +ddp_timeout: 180000000 diff --git a/examples/lora_single_gpu/llama3_lora_pretrain.yaml b/examples/train_lora/llama3_lora_pretrain.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_pretrain.yaml rename to examples/train_lora/llama3_lora_pretrain.yaml index db435ca9..839b3e51 100644 --- a/examples/lora_single_gpu/llama3_lora_pretrain.yaml +++ b/examples/train_lora/llama3_lora_pretrain.yaml @@ -29,6 +29,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/train_lora/llama3_lora_reward.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_reward.yaml rename to examples/train_lora/llama3_lora_reward.yaml index 1ce42ea4..79559d19 100644 --- a/examples/lora_single_gpu/llama3_lora_reward.yaml +++ b/examples/train_lora/llama3_lora_reward.yaml @@ -30,6 +30,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/train_lora/llama3_lora_sft.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_sft.yaml rename to examples/train_lora/llama3_lora_sft.yaml index 651b636f..fe30c575 100644 --- a/examples/lora_single_gpu/llama3_lora_sft.yaml +++ b/examples/train_lora/llama3_lora_sft.yaml @@ -30,6 +30,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml b/examples/train_lora/llama3_lora_sft_ds0.yaml similarity index 98% rename from examples/lora_multi_npu/llama3_lora_sft_ds.yaml rename to examples/train_lora/llama3_lora_sft_ds0.yaml index a0ec8aa1..08b638e6 100644 --- a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml +++ b/examples/train_lora/llama3_lora_sft_ds0.yaml @@ -6,9 +6,6 @@ stage: sft do_train: true finetuning_type: lora lora_target: all - -### ddp -ddp_timeout: 180000000 deepspeed: examples/deepspeed/ds_z0_config.json ### dataset 
@@ -34,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/train_lora/llama3_lora_sft_ds3.yaml similarity index 98% rename from examples/lora_multi_gpu/llama3_lora_sft_ds.yaml rename to examples/train_lora/llama3_lora_sft_ds3.yaml index 1c432fa7..b7266d61 100644 --- a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml +++ b/examples/train_lora/llama3_lora_sft_ds3.yaml @@ -6,9 +6,6 @@ stage: sft do_train: true finetuning_type: lora lora_target: all - -### ddp -ddp_timeout: 180000000 deepspeed: examples/deepspeed/ds_z3_config.json ### dataset @@ -34,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/train_lora/llama3_preprocess.yaml similarity index 100% rename from examples/lora_single_gpu/llama3_preprocess.yaml rename to examples/train_lora/llama3_preprocess.yaml diff --git a/examples/lora_single_gpu/llava1_5_lora_sft.yaml b/examples/train_lora/llava1_5_lora_sft.yaml similarity index 96% rename from examples/lora_single_gpu/llava1_5_lora_sft.yaml rename to examples/train_lora/llava1_5_lora_sft.yaml index df510a93..55ac31fa 100644 --- a/examples/lora_single_gpu/llava1_5_lora_sft.yaml +++ b/examples/train_lora/llava1_5_lora_sft.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/train_qlora/llama3_lora_sft_aqlm.yaml similarity index 96% rename from examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml rename to examples/train_qlora/llama3_lora_sft_aqlm.yaml index d54d6af6..7b6767d5 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml +++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml @@ -30,6 +30,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/train_qlora/llama3_lora_sft_awq.yaml similarity index 96% rename from examples/qlora_single_gpu/llama3_lora_sft_awq.yaml rename to examples/train_qlora/llama3_lora_sft_awq.yaml index 5cef178a..a2a26e4b 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml +++ b/examples/train_qlora/llama3_lora_sft_awq.yaml @@ -30,6 +30,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml similarity index 96% rename from examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml rename to examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml index b308dcab..cc773991 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml +++ b/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/train_qlora/llama3_lora_sft_gptq.yaml similarity index 96% rename from examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml rename to examples/train_qlora/llama3_lora_sft_gptq.yaml index b950042e..ad3d854c 100644 
From e06568de988db0f6e44e663bbc8457e8aa0f0507 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Thu, 13 Jun 2024 03:16:20 +0800
Subject: [PATCH 033/162] Update llama3_full_sft_ds3.yaml

Former-commit-id: e715af62d521112d9c155cfa91fbb42fa0e77710
---
 examples/train_full/llama3_full_sft_ds3.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/train_full/llama3_full_sft_ds3.yaml b/examples/train_full/llama3_full_sft_ds3.yaml
index 40b62f24..40afd2ee 100644
--- a/examples/train_full/llama3_full_sft_ds3.yaml
+++ b/examples/train_full/llama3_full_sft_ds3.yaml
@@ -5,9 +5,6 @@ model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 stage: sft
 do_train: true
 finetuning_type: full
-
-### ddp
-ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json
 
 ### dataset
@@ -33,6 +30,7 @@ num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
 fp16: true
+ddp_timeout: 180000000
 
 ### eval
 val_size: 0.1

From 48d87a2decf4dd4d687ef0d2eec8b93109ad7d98 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Thu, 13 Jun 2024 03:19:18 +0800
Subject: [PATCH 034/162] add quant check in webui export tab

Former-commit-id: 6455ca07061ae9858cd7bc996b28be1fde697a3d
---
 src/llamafactory/webui/components/export.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/llamafactory/webui/components/export.py b/src/llamafactory/webui/components/export.py
index 7e1493c8..9d756a38 100644
--- a/src/llamafactory/webui/components/export.py
+++ b/src/llamafactory/webui/components/export.py
@@ -21,6 +21,13 @@ if TYPE_CHECKING:
 GPTQ_BITS = ["8", "4", "3", "2"]
 
 
+def can_quantize(checkpoint_path: Union[str, List[str]]) -> "gr.Dropdown":
+    if isinstance(checkpoint_path, list) and len(checkpoint_path) != 0:
+        return gr.Dropdown(value="none", interactive=False)
+    else:
+        return gr.Dropdown(interactive=True)
+
+
 def save_model(
     lang: str,
     model_name: str,
@@ -96,6 +103,9 @@ def create_export_tab(engine: "Engine") -> Dict[str, "Component"]:
         export_dir = gr.Textbox()
         export_hub_model_id = gr.Textbox()
 
+    checkpoint_path: gr.Dropdown = engine.manager.get_elem_by_id("top.checkpoint_path")
+    checkpoint_path.change(can_quantize, [checkpoint_path], [export_quantization_bit], queue=False)
+
     export_btn = gr.Button()
     info_box = gr.Textbox(show_label=False, interactive=False)
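The wiring above hinges on Gradio's `change` event: whenever adapter checkpoints are selected, the quantization dropdown is forced back to "none", since quantizing a merged LoRA export is not supported. A self-contained sketch of the same pattern, with hypothetical choices and labels standing in for the real WebUI components:

```python
import gradio as gr


def can_quantize(checkpoint_path):
    # Stand-in for the callback above: any selected adapter checkpoint
    # disables the quantization dropdown and resets it to "none".
    if isinstance(checkpoint_path, list) and len(checkpoint_path) != 0:
        return gr.Dropdown(value="none", interactive=False)
    return gr.Dropdown(interactive=True)


with gr.Blocks() as demo:
    checkpoint_path = gr.Dropdown(["train_1", "train_2"], multiselect=True, label="checkpoints")
    export_quantization_bit = gr.Dropdown(["none", "8", "4", "3", "2"], value="none", label="bits")
    checkpoint_path.change(can_quantize, [checkpoint_path], [export_quantization_bit], queue=False)

demo.launch()
```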
From db6e74884c633df80aedcd245a2dd08532ae37c3 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Thu, 13 Jun 2024 03:26:10 +0800
Subject: [PATCH 035/162] update examples

Former-commit-id: d6bf6231290d79eb3a63e711f18fa711ef18a4f6
---
 examples/README.md    | 10 ++++++----
 examples/README_zh.md | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 3372afb9..180d5f7b 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -11,6 +11,8 @@ Make sure to execute these commands in the `LLaMA-Factory` directory.
 - [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models)
 - [Extras](#extras)
 
+Use `CUDA_VISIBLE_DEVICES` (GPU) or `ASCEND_RT_VISIBLE_DEVICES` (NPU) to choose computing devices.
+
 ## Examples
 
 ### LoRA Fine-Tuning
@@ -87,7 +89,7 @@ FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llama
 #### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
 ```
 
 ### QLoRA Fine-Tuning
@@ -121,14 +123,14 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_s
 #### Supervised Fine-Tuning on Single Node
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
 ```
 
 #### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
+FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
 ```
 
 #### Batch Predicting and Computing BLEU and ROUGE Scores
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 64c31fbd..b6168a95 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -11,6 +11,8 @@
 - [推理 LoRA 模型](#推理-lora-模型)
 - [杂项](#杂项)
 
+使用 `CUDA_VISIBLE_DEVICES`(GPU)或 `ASCEND_RT_VISIBLE_DEVICES`(NPU)选择计算设备。
+
 ## 示例
 
 ### LoRA 微调
@@ -87,7 +89,7 @@ FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llama
 #### 使用 DeepSpeed ZeRO-3 平均分配显存
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
 ```
 
 ### QLoRA 微调
@@ -121,14 +123,14 @@ llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
 #### 在单机上进行指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
 ```
 
 #### 在多机上进行指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
+FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
 ```
 
 #### 批量预测并计算 BLEU 和 ROUGE 分数

From 6306f3e2ead0a80b75997e29bf265710a67c2fd2 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Thu, 13 Jun 2024 16:02:21 +0800
Subject: [PATCH 036/162] Update README.md

Former-commit-id: f8d701cd3ce2e56f95b4f5439b8b48d5b62e0d2b
---
 examples/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 180d5f7b..a6d78936 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -97,25 +97,25 @@
 #### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended)
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml
 ```
 
 #### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml
 ```
 
 #### Supervised Fine-Tuning with 4-bit AWQ Quantization
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml
 ```
 
 #### Supervised Fine-Tuning with 2-bit AQLM Quantization
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
 ```
 
 ### Full-Parameter Fine-Tuning

From bbeb3b10aacd38261ab0e352ffd3a35ee0027a38 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 15 Jun 2024 04:05:54 +0800
Subject: [PATCH 037/162] add test cases

Former-commit-id: 731176ff34cdf0cbf6b41c40c69f4ceb54c2daf6
---
 src/llamafactory/chat/vllm_engine.py   |  2 +-
 src/llamafactory/hparams/model_args.py |  8 +--
 src/llamafactory/model/adapter.py      | 43 +++++++------
 src/llamafactory/model/patcher.py      |  5 +-
 src/llamafactory/train/ppo/trainer.py  |  3 +-
 tests/model/test_base.py               | 32 ++++++++++
 tests/model/test_freeze.py             | 22 ++++++-
 tests/model/test_full.py               | 20 ++++++-
 tests/model/test_lora.py               | 83 +++++++++++++++++++++++++-
 9 files changed, 184 insertions(+), 34 deletions(-)
 create mode 100644 tests/model/test_base.py

diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py
index e4c05478..f0812a99 100644
--- a/src/llamafactory/chat/vllm_engine.py
+++ b/src/llamafactory/chat/vllm_engine.py
@@ -52,7 +52,7 @@ class VllmEngine(BaseEngine):
             "model": model_args.model_name_or_path,
             "trust_remote_code": True,
             "download_dir": model_args.cache_dir,
-            "dtype": model_args.vllm_dtype,
+            "dtype": model_args.infer_dtype,
             "max_model_len": model_args.vllm_maxlen,
             "tensor_parallel_size": get_device_count() or 1,
             "gpu_memory_utilization": model_args.vllm_gpu_util,
diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index 359beafd..bbac2e4b 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -136,10 +136,6 @@ class ModelArguments:
         default=8,
         metadata={"help": "Maximum rank of all LoRAs in the vLLM engine."},
     )
-    vllm_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field(
-        default="auto",
-        metadata={"help": "Data type for model weights and activations in the vLLM engine."},
-    )
     offload_folder: str = field(
         default="offload",
         metadata={"help": "Path to offload model weights."},
@@ -148,6 +144,10 @@ class ModelArguments:
         default=True,
         metadata={"help": "Whether or not to use KV cache in generation."},
     )
+    infer_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field(
+        default="auto",
+        metadata={"help": "Data type for model weights and activations at inference."}
+    )
     hf_hub_token: Optional[str] = field(
         default=None,
         metadata={"help": "Auth token to log in with Hugging Face Hub."},
diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py
index 34b9eda6..c37f6009 100644
--- a/src/llamafactory/model/adapter.py
+++ b/src/llamafactory/model/adapter.py
@@ -25,8 +25,12 @@ def _setup_full_tuning(
     model: "PreTrainedModel",
     model_args: "ModelArguments",
     finetuning_args: "FinetuningArguments",
+    is_trainable: bool,
     cast_trainable_params_to_fp32: bool,
 ) -> None:
+    if not is_trainable:
+        return
+
     logger.info("Fine-tuning method: Full")
     forbidden_modules = set()
     if model_args.visual_inputs and finetuning_args.freeze_vision_tower:
@@ -47,8 +51,12 @@ def _setup_freeze_tuning(
     model: "PreTrainedModel",
     model_args: "ModelArguments",
     finetuning_args: "FinetuningArguments",
+    is_trainable: bool,
     cast_trainable_params_to_fp32: bool,
 ) -> None:
+    if not is_trainable:
+        return
+
     logger.info("Fine-tuning method: Freeze")
     if model_args.visual_inputs:
         config = model.config.text_config
@@ -132,7 +140,9 @@ def _setup_lora_tuning(
     is_trainable: bool,
     cast_trainable_params_to_fp32: bool,
 ) -> "PeftModel":
-    logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA"))
+    if is_trainable:
+        logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA"))
+
     adapter_to_resume = None
 
     if model_args.adapter_name_or_path is not None:
@@ -173,6 +183,8 @@ def _setup_lora_tuning(
                 offload_folder=model_args.offload_folder,
             )
 
+        logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
+
     if is_trainable and adapter_to_resume is None:  # create new lora weights while training
         if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all":
             target_modules = find_all_linear_modules(model, finetuning_args.freeze_vision_tower)
@@ -227,9 +239,6 @@ def _setup_lora_tuning(
         for param in filter(lambda p: p.requires_grad, model.parameters()):
             param.data = param.data.to(torch.float32)
 
-    if model_args.adapter_name_or_path is not None:
-        logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
-
     return model
 
 
@@ -247,29 +256,27 @@ def init_adapter(
 
     Note that the trainable parameters must be cast to float32.
     """
-    if (not is_trainable) and model_args.adapter_name_or_path is None:
-        logger.info("Adapter is not found at evaluation, load the base model.")
-        return model
+    if is_trainable and getattr(model, "quantization_method", None) and finetuning_args.finetuning_type != "lora":
+        raise ValueError("Quantized models can only be used for the LoRA tuning.")
 
-    if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None):
-        raise ValueError("You can only use lora for quantized models.")
-
-    if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam:
+    if not is_trainable:
+        cast_trainable_params_to_fp32 = False
+    elif is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam:
        logger.info("ZeRO3/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.")
         cast_trainable_params_to_fp32 = False
     else:
         logger.info("Upcasting trainable params to float32.")
         cast_trainable_params_to_fp32 = True
 
-    if is_trainable and finetuning_args.finetuning_type == "full":
-        _setup_full_tuning(model, model_args, finetuning_args, cast_trainable_params_to_fp32)
-
-    if is_trainable and finetuning_args.finetuning_type == "freeze":
-        _setup_freeze_tuning(model, model_args, finetuning_args, cast_trainable_params_to_fp32)
-
-    if finetuning_args.finetuning_type == "lora":
+    if finetuning_args.finetuning_type == "full":
+        _setup_full_tuning(model, model_args, finetuning_args, is_trainable, cast_trainable_params_to_fp32)
+    elif finetuning_args.finetuning_type == "freeze":
+        _setup_freeze_tuning(model, model_args, finetuning_args, is_trainable, cast_trainable_params_to_fp32)
+    elif finetuning_args.finetuning_type == "lora":
         model = _setup_lora_tuning(
             config, model, model_args, finetuning_args, is_trainable, cast_trainable_params_to_fp32
         )
+    else:
+        raise NotImplementedError("Unknown finetuning type: {}.".format(finetuning_args.finetuning_type))
 
     return model
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index 18221a10..b97ff433 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -44,7 +44,10 @@ def patch_config(
     is_trainable: bool,
 ) -> None:
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
-        model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
+        if model_args.infer_dtype == "auto":
+            model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
+        else:
+            model_args.compute_dtype = getattr(torch, model_args.infer_dtype)
 
     if is_torch_npu_available():
         use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]
diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py
index 737c45a3..45f47455 100644
--- a/src/llamafactory/train/ppo/trainer.py
+++ b/src/llamafactory/train/ppo/trainer.py
@@ -135,8 +135,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model)
         self.is_chatglm_model = getattr(unwrapped_model.config, "model_type", None) == "chatglm"
 
-        device_type = unwrapped_model.pretrained_model.device.type
-        self.amp_context = torch.autocast(device_type, dtype=model_args.compute_dtype)
+        self.amp_context = torch.autocast(self.current_device.type, dtype=self.model_args.compute_dtype)
         warnings.simplefilter("ignore")  # remove gc warnings on ref model
 
         if finetuning_args.reward_model_type == "full":
diff --git a/tests/model/test_base.py b/tests/model/test_base.py
new file mode 100644
index 00000000..32a3918e
--- /dev/null
+++ b/tests/model/test_base.py
@@ -0,0 +1,32 @@
+import os
+
+import torch
+from transformers import AutoModelForCausalLM
+
+from llamafactory.hparams import get_infer_args
+from llamafactory.model import load_model, load_tokenizer
+
+
+TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+
+INFER_ARGS = {
+    "model_name_or_path": TINY_LLAMA,
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
+
+
+def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"):
+    state_dict_a = model_a.state_dict()
+    state_dict_b = model_b.state_dict()
+    assert set(state_dict_a.keys()) == set(state_dict_b.keys())
+    for name in state_dict_a.keys():
+        assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True
+
+
+def test_base():
+    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
+    tokenizer_module = load_tokenizer(model_args)
+    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False)
+    ref_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device)
+    compare_model(model, ref_model)
diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py
index 97800696..a0618315 100644
--- a/tests/model/test_freeze.py
+++ b/tests/model/test_freeze.py
@@ -2,7 +2,7 @@ import os
 
 import torch
 
-from llamafactory.hparams import get_train_args
+from llamafactory.hparams import get_infer_args, get_train_args
 from llamafactory.model import load_model, load_tokenizer
 
 
@@ -23,8 +23,15 @@ TRAIN_ARGS = {
     "fp16": True,
 }
 
+INFER_ARGS = {
+    "model_name_or_path": TINY_LLAMA,
+    "finetuning_type": "freeze",
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
 
-def test_freeze_all_modules():
+
+def test_freeze_train_all_modules():
     model_args, _, _, finetuning_args, _ = get_train_args({"freeze_trainable_layers": 1, **TRAIN_ARGS})
     tokenizer_module = load_tokenizer(model_args)
     model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
@@ -37,7 +44,7 @@
         assert param.dtype == torch.float16
 
 
-def test_freeze_extra_modules():
+def test_freeze_train_extra_modules():
     model_args, _, _, finetuning_args, _ = get_train_args(
         {"freeze_trainable_layers": 1, "freeze_extra_modules": "embed_tokens,lm_head", **TRAIN_ARGS}
     )
@@ -50,3 +57,12 @@
         else:
             assert param.requires_grad is False
             assert param.dtype == torch.float16
+
+
+def test_freeze_inference():
+    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
+    tokenizer_module = load_tokenizer(model_args)
+    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False)
+    for param in model.parameters():
+        assert param.requires_grad is False
+        assert param.dtype == torch.float16
diff --git a/tests/model/test_full.py b/tests/model/test_full.py
index 6cb78f37..802b987c 100644
--- a/tests/model/test_full.py
+++ b/tests/model/test_full.py
@@ -2,7 +2,7 @@ import os
 
 import torch
 
-from llamafactory.hparams import get_train_args
+from llamafactory.hparams import get_infer_args, get_train_args
 from llamafactory.model import load_model, load_tokenizer
 
 
@@ -23,11 +23,27 @@ TRAIN_ARGS = {
     "fp16": True,
 }
 
+INFER_ARGS = {
+    "model_name_or_path": TINY_LLAMA,
+    "finetuning_type": "full",
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
 
-def test_full():
+
+def test_full_train():
     model_args, _, _, finetuning_args, _ = get_train_args(TRAIN_ARGS)
     tokenizer_module = load_tokenizer(model_args)
     model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
     for param in model.parameters():
         assert param.requires_grad is True
         assert param.dtype == torch.float32
+
+
+def test_full_inference():
+    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
+    tokenizer_module = load_tokenizer(model_args)
+    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False)
+    for param in model.parameters():
+        assert param.requires_grad is False
+        assert param.dtype == torch.float16
diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py
index 2e2b89d9..3e2503f1 100644
--- a/tests/model/test_lora.py
+++ b/tests/model/test_lora.py
@@ -1,13 +1,18 @@
 import os
+from typing import Sequence
 
 import torch
+from peft import LoraModel, PeftModel
+from transformers import AutoModelForCausalLM
 
-from llamafactory.hparams import get_train_args
+from llamafactory.hparams import get_infer_args, get_train_args
 from llamafactory.model import load_model, load_tokenizer
 
 
 TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
+TINY_LLAMA_ADAPTER = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
+
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
     "stage": "sft",
@@ -23,8 +28,32 @@ TRAIN_ARGS = {
     "fp16": True,
 }
 
+INFER_ARGS = {
+    "model_name_or_path": TINY_LLAMA,
+    "adapter_name_or_path": TINY_LLAMA_ADAPTER,
+    "finetuning_type": "lora",
+    "template": "llama3",
+    "infer_dtype": "float16",
+}
 
-def test_lora_all_modules():
+
+def load_reference_model() -> "torch.nn.Module":
+    model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA)
+    return PeftModel.from_pretrained(model, TINY_LLAMA_ADAPTER)
+
+
+def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_keys: Sequence[str] = []):
+    state_dict_a = model_a.state_dict()
+    state_dict_b = model_b.state_dict()
+    assert set(state_dict_a.keys()) == set(state_dict_b.keys())
+    for name in state_dict_a.keys():
+        if any(key in name for key in diff_keys):
+            assert torch.allclose(state_dict_a[name], state_dict_b[name]) is False
+        else:
+            assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True
+
+
+def test_lora_train_all_modules():
     model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "all", **TRAIN_ARGS})
     tokenizer_module = load_tokenizer(model_args)
     model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
@@ -41,7 +70,7 @@
     assert linear_modules == {"q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"}
 
 
-def test_lora_extra_modules():
+def test_lora_train_extra_modules():
     model_args, _, _, finetuning_args, _ = get_train_args(
         {"lora_target": "all", "additional_target": "embed_tokens,lm_head", **TRAIN_ARGS}
     )
@@ -61,3 +90,51 @@
             assert param.dtype == torch.float16
 
     assert extra_modules == {"embed_tokens", "lm_head"}
+
+
+def test_lora_train_old_adapters():
+    model_args, _, _, finetuning_args, _ = get_train_args(
+        {"adapter_name_or_path": TINY_LLAMA_ADAPTER, "create_new_adapter": False, **TRAIN_ARGS}
+    )
+    tokenizer_module = load_tokenizer(model_args)
+    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
+
+    base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device)
+    ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True)
+    for param in filter(lambda p: p.requires_grad, ref_model.parameters()):
+        param.data = param.data.to(torch.float32)
+
+    compare_model(model, ref_model)
+
+
+def test_lora_train_new_adapters():
+    model_args, _, _, finetuning_args, _ = get_train_args(
+        {"adapter_name_or_path": TINY_LLAMA_ADAPTER, "create_new_adapter": True, **TRAIN_ARGS}
+    )
+    tokenizer_module = load_tokenizer(model_args)
+    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
+
+    base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device)
+    ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True)
+    for param in filter(lambda p: p.requires_grad, ref_model.parameters()):
+        param.data = param.data.to(torch.float32)
+
+    compare_model(
+        model, ref_model, diff_keys=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"]
+    )
+
+
+def test_lora_inference():
+    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
+    tokenizer_module = load_tokenizer(model_args)
+    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False)
+
+    base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device)
+    ref_model: "LoraModel" = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER)
+    ref_model = ref_model.merge_and_unload()
+    compare_model(model, ref_model)
+
+    for name, param in model.named_parameters():
+        assert param.requires_grad is False
+        assert param.dtype == torch.float16
+        assert "lora" not in name
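The `test_lora_inference` case relies on LoRA merging: at inference the adapter can be folded into the base weight, which is why a merged `PeftModel` and the factory-loaded model are expected to match exactly. A small numeric sketch of that identity (shapes and scaling chosen arbitrarily):

```python
import torch

d, r, alpha = 8, 2, 4  # hidden size, LoRA rank, LoRA alpha (arbitrary)
base = torch.randn(d, d)
lora_a = torch.randn(r, d)
lora_b = torch.randn(d, r)

# Equivalent of merge_and_unload: fold the low-rank update into the weight.
merged = base + (alpha / r) * (lora_b @ lora_a)

x = torch.randn(d)
unmerged_out = base @ x + (alpha / r) * (lora_b @ (lora_a @ x))
assert torch.allclose(merged @ x, unmerged_out, atol=1e-5)
```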
From 43cfbde0d181bb6af1abff4e615cc0a034ba5d55 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 15 Jun 2024 04:34:55 +0800
Subject: [PATCH 038/162] fix #4295

Former-commit-id: 08f657868f9d605b837c5d8c2946a25cc05c8735
---
 src/llamafactory/train/sft/trainer.py  | 10 +++++++---
 src/llamafactory/train/sft/workflow.py |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py
index c063b214..6bf5b7c0 100644
--- a/src/llamafactory/train/sft/trainer.py
+++ b/src/llamafactory/train/sft/trainer.py
@@ -13,6 +13,7 @@ from ..trainer_utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
+    from torch.utils.data import Dataset
     from transformers import ProcessorMixin
     from transformers.trainer import PredictionOutput
 
@@ -94,7 +95,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
         padded_tensor[:, -src_tensor.shape[-1] :] = src_tensor  # adopt left-padding
         return padded_tensor.contiguous()  # in contiguous memory
 
-    def save_predictions(self, predict_results: "PredictionOutput") -> None:
+    def save_predictions(self, dataset: "Dataset", predict_results: "PredictionOutput") -> None:
         r"""
         Saves model predictions to `output_dir`.
 
@@ -120,6 +121,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
                     (preds[i][pad_len[0] :], preds[i][: pad_len[0]]), axis=-1
                 )  # move pad token to last
 
+        decoded_inputs = self.tokenizer.batch_decode(
+            dataset["input_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
         decoded_labels = self.tokenizer.batch_decode(
             labels, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
@@ -127,6 +131,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
 
         with open(output_prediction_file, "w", encoding="utf-8") as writer:
             res: List[str] = []
-            for label, pred in zip(decoded_labels, decoded_preds):
-                res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False))
+            for text, label, pred in zip(decoded_inputs, decoded_labels, decoded_preds):
+                res.append(json.dumps({"prompt": text, "label": label, "predict": pred}, ensure_ascii=False))
             writer.write("\n".join(res))
diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py
index f09b5173..a989b3f7 100644
--- a/src/llamafactory/train/sft/workflow.py
+++ b/src/llamafactory/train/sft/workflow.py
@@ -93,7 +93,7 @@ def run_sft(
         predict_results.metrics.pop("predict_loss", None)
         trainer.log_metrics("predict", predict_results.metrics)
         trainer.save_metrics("predict", predict_results.metrics)
-        trainer.save_predictions(predict_results)
+        trainer.save_predictions(dataset, predict_results)
 
     # Create model card
     create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
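With this change, each line of the prediction file carries the decoded prompt alongside the label and the prediction. A quick way to inspect the result, assuming the default `generated_predictions.jsonl` file under the run's `output_dir`:

```python
import json

# Each JSONL record now has "prompt", "label" and "predict" keys.
with open("generated_predictions.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        print(record["prompt"][:80], "->", record["predict"][:80])
```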
""" - paths = (path.replace(os.path.sep, "").replace(" ", "").strip() for path in paths) + if os.path.sep in paths[-1]: + logger.warning("Found complex path, some features may be not available.") + return paths[-1] + + paths = (path.replace(" ", "").strip() for path in paths) return os.path.join(DEFAULT_SAVE_DIR, *paths) From c9959df3c7f7e0867a322654735357df5c10aa48 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 04:57:19 +0800 Subject: [PATCH 040/162] disable DP Former-commit-id: c18fd609d268389f3e65274992045a6c9f8e6c1f --- src/llamafactory/hparams/parser.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index ec5dd62c..3476a726 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -8,6 +8,7 @@ import transformers from transformers import HfArgumentParser, Seq2SeqTrainingArguments from transformers.integrations import is_deepspeed_zero3_enabled from transformers.trainer_utils import get_last_checkpoint +from transformers.training_args import ParallelMode from transformers.utils import is_torch_bf16_gpu_available from transformers.utils.versions import require_version @@ -162,6 +163,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): raise ValueError("PPO only accepts wandb or tensorboard logger.") + if training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED: + raise ValueError("Please launch distributed training with `llamafactory-cli` or `torchrun`.") + if training_args.max_steps == -1 and data_args.streaming: raise ValueError("Please specify `max_steps` in streaming mode.") @@ -181,14 +185,14 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if ( finetuning_args.use_galore and finetuning_args.galore_layerwise - and training_args.parallel_mode.value == "distributed" + and training_args.parallel_mode == ParallelMode.DISTRIBUTED ): raise ValueError("Distributed training does not support layer-wise GaLore.") if ( finetuning_args.use_badam and finetuning_args.badam_mode == "layer" - and training_args.parallel_mode.value == "distributed" + and training_args.parallel_mode == ParallelMode.DISTRIBUTED ): raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") @@ -230,7 +234,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: # Post-process training arguments if ( - training_args.parallel_mode.value == "distributed" + training_args.parallel_mode == ParallelMode.DISTRIBUTED and training_args.ddp_find_unused_parameters is None and finetuning_args.finetuning_type == "lora" ): @@ -290,7 +294,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: training_args.local_rank, training_args.device, training_args.n_gpu, - training_args.parallel_mode.value == "distributed", + training_args.parallel_mode == ParallelMode.DISTRIBUTED, str(model_args.compute_dtype), ) ) From ab4b3931fd1b6bef0d6ef691ad070b4504f9246a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 05:11:33 +0800 Subject: [PATCH 041/162] fix #4271 Former-commit-id: 03707e78d29bfcf5d395a64bb38632bdb3ff47ce --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 405ac46e..1756551e 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ extra_require = { "torch": ["torch>=1.13.1"], "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", 
"decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], - "deepspeed": ["deepspeed>=0.10.0,<=0.14.0"], + "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], "vllm": ["vllm>=0.4.3"], "galore": ["galore-torch"], From 8b523554d63c86511c64bd68b5ed6c11849e5fc0 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 05:13:16 +0800 Subject: [PATCH 042/162] update readme Former-commit-id: a43d302aa79cbfb9b0606e855b4c1af6865d8e68 --- README.md | 2 +- README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5dd10d5a..cae79694 100644 --- a/README.md +++ b/README.md @@ -463,7 +463,7 @@ docker-compose exec llamafactory bash ### Deploy with OpenAI-style API and vLLM ```bash -CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml +API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml ``` > [!TIP] diff --git a/README_zh.md b/README_zh.md index 76bd2d89..af3ff8f0 100644 --- a/README_zh.md +++ b/README_zh.md @@ -463,7 +463,7 @@ docker-compose exec llamafactory bash ### 利用 vLLM 部署 OpenAI API ```bash -CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml +API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml ``` > [!TIP] From acfae2e677cc6d507671ea241af0236d91b4f7fd Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 17:54:33 +0800 Subject: [PATCH 043/162] add license Former-commit-id: 69cfc98d7c81756a5ab6bf962240e393e449fef0 --- evaluation/ceval/ceval.py | 1 + evaluation/cmmlu/cmmlu.py | 1 + evaluation/mmlu/mmlu.py | 1 + scripts/cal_flops.py | 23 +++++++++-- scripts/cal_lr.py | 23 +++++++++-- scripts/cal_ppl.py | 19 ++++++++- scripts/length_cdf.py | 19 ++++++++- scripts/llama_pro.py | 23 +++++++++-- scripts/llamafy_baichuan2.py | 22 ++++++++-- scripts/llamafy_qwen.py | 21 ++++++++-- scripts/loftq_init.py | 23 +++++++++-- scripts/test_toolcall.py | 15 +++++++ setup.py | 14 +++++++ src/api.py | 14 +++++++ src/llamafactory/__init__.py | 14 +++++++ src/llamafactory/api/app.py | 14 +++++++ src/llamafactory/api/chat.py | 14 +++++++ src/llamafactory/api/common.py | 14 +++++++ src/llamafactory/api/protocol.py | 14 +++++++ src/llamafactory/chat/__init__.py | 14 +++++++ src/llamafactory/chat/base_engine.py | 14 +++++++ src/llamafactory/chat/chat_model.py | 17 ++++++++ src/llamafactory/chat/hf_engine.py | 14 +++++++ src/llamafactory/chat/vllm_engine.py | 14 +++++++ src/llamafactory/cli.py | 14 +++++++ src/llamafactory/data/__init__.py | 14 +++++++ src/llamafactory/data/aligner.py | 14 +++++++ src/llamafactory/data/collator.py | 14 +++++++ src/llamafactory/data/data_utils.py | 14 +++++++ src/llamafactory/data/formatter.py | 14 +++++++ src/llamafactory/data/loader.py | 14 +++++++ src/llamafactory/data/parser.py | 14 +++++++ src/llamafactory/data/preprocess.py | 14 +++++++ src/llamafactory/data/processors/feedback.py | 14 +++++++ src/llamafactory/data/processors/pairwise.py | 14 +++++++ src/llamafactory/data/processors/pretrain.py | 17 ++++++++ .../data/processors/processor_utils.py | 14 +++++++ .../data/processors/supervised.py | 14 +++++++ .../data/processors/unsupervised.py | 14 +++++++ src/llamafactory/data/template.py | 14 +++++++ src/llamafactory/eval/evaluator.py | 39 +++++++++++++++++- src/llamafactory/eval/template.py | 14 +++++++ src/llamafactory/extras/callbacks.py | 14 +++++++ src/llamafactory/extras/constants.py | 14 +++++++ src/llamafactory/extras/env.py | 
14 +++++++ src/llamafactory/extras/logging.py | 14 +++++++ src/llamafactory/extras/misc.py | 14 +++++++ src/llamafactory/extras/packages.py | 17 ++++++++ src/llamafactory/extras/ploting.py | 14 +++++++ src/llamafactory/hparams/__init__.py | 14 +++++++ src/llamafactory/hparams/data_args.py | 17 ++++++++ src/llamafactory/hparams/evaluation_args.py | 14 +++++++ src/llamafactory/hparams/finetuning_args.py | 14 +++++++ src/llamafactory/hparams/generating_args.py | 14 +++++++ src/llamafactory/hparams/model_args.py | 17 ++++++++ src/llamafactory/hparams/parser.py | 17 ++++++++ src/llamafactory/launcher.py | 14 +++++++ src/llamafactory/model/__init__.py | 14 +++++++ src/llamafactory/model/adapter.py | 14 +++++++ src/llamafactory/model/loader.py | 14 +++++++ .../model/model_utils/attention.py | 14 +++++++ .../model/model_utils/checkpointing.py | 19 ++++++++- .../model/model_utils/embedding.py | 14 +++++++ .../model/model_utils/longlora.py | 17 ++++++++ src/llamafactory/model/model_utils/misc.py | 14 +++++++ src/llamafactory/model/model_utils/mod.py | 14 +++++++ src/llamafactory/model/model_utils/moe.py | 14 +++++++ .../model/model_utils/quantization.py | 18 ++++++++- src/llamafactory/model/model_utils/rope.py | 18 +++++++++ src/llamafactory/model/model_utils/unsloth.py | 14 +++++++ .../model/model_utils/valuehead.py | 14 +++++++ src/llamafactory/model/model_utils/visual.py | 17 ++++++++ src/llamafactory/model/patcher.py | 14 +++++++ src/llamafactory/train/dpo/__init__.py | 14 +++++++ src/llamafactory/train/dpo/trainer.py | 17 ++++++++ src/llamafactory/train/dpo/workflow.py | 17 +++++++- src/llamafactory/train/kto/__init__.py | 14 +++++++ src/llamafactory/train/kto/trainer.py | 17 ++++++++ src/llamafactory/train/kto/workflow.py | 17 ++++++++ src/llamafactory/train/ppo/__init__.py | 14 +++++++ src/llamafactory/train/ppo/ppo_utils.py | 14 +++++++ src/llamafactory/train/ppo/trainer.py | 17 ++++++++ src/llamafactory/train/ppo/workflow.py | 17 +++++++- src/llamafactory/train/pt/__init__.py | 14 +++++++ src/llamafactory/train/pt/trainer.py | 14 +++++++ src/llamafactory/train/pt/workflow.py | 17 +++++++- src/llamafactory/train/rm/__init__.py | 14 +++++++ src/llamafactory/train/rm/metric.py | 14 +++++++ src/llamafactory/train/rm/trainer.py | 40 ++++++++++++++++++- src/llamafactory/train/rm/workflow.py | 39 +++++++++++++++++- src/llamafactory/train/sft/__init__.py | 14 +++++++ src/llamafactory/train/sft/metric.py | 18 +++++++++ src/llamafactory/train/sft/trainer.py | 17 ++++++++ src/llamafactory/train/sft/workflow.py | 17 +++++++- src/llamafactory/train/trainer_utils.py | 19 +++++++++ src/llamafactory/train/tuner.py | 14 +++++++ src/llamafactory/webui/chatter.py | 14 +++++++ src/llamafactory/webui/common.py | 14 +++++++ src/llamafactory/webui/components/__init__.py | 14 +++++++ src/llamafactory/webui/components/chatbot.py | 14 +++++++ src/llamafactory/webui/components/data.py | 14 +++++++ src/llamafactory/webui/components/eval.py | 14 +++++++ src/llamafactory/webui/components/export.py | 14 +++++++ src/llamafactory/webui/components/infer.py | 14 +++++++ src/llamafactory/webui/components/top.py | 14 +++++++ src/llamafactory/webui/components/train.py | 14 +++++++ src/llamafactory/webui/css.py | 14 +++++++ src/llamafactory/webui/engine.py | 14 +++++++ src/llamafactory/webui/interface.py | 14 +++++++ src/llamafactory/webui/locales.py | 14 +++++++ src/llamafactory/webui/manager.py | 14 +++++++ src/llamafactory/webui/runner.py | 14 +++++++ src/llamafactory/webui/utils.py | 14 +++++++ src/train.py | 14 +++++++ 
src/webui.py | 14 +++++++ tests/data/test_supervised.py | 14 +++++++ tests/eval/test_eval_template.py | 14 +++++++ tests/model/model_utils/test_attention.py | 14 +++++++ tests/model/test_base.py | 14 +++++++ tests/model/test_freeze.py | 14 +++++++ tests/model/test_full.py | 14 +++++++ tests/model/test_lora.py | 14 +++++++ 122 files changed, 1848 insertions(+), 32 deletions(-) diff --git a/evaluation/ceval/ceval.py b/evaluation/ceval/ceval.py index 4111d6b4..48442d50 100644 --- a/evaluation/ceval/ceval.py +++ b/evaluation/ceval/ceval.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import datasets diff --git a/evaluation/cmmlu/cmmlu.py b/evaluation/cmmlu/cmmlu.py index 37efb328..5ff548a4 100644 --- a/evaluation/cmmlu/cmmlu.py +++ b/evaluation/cmmlu/cmmlu.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import datasets diff --git a/evaluation/mmlu/mmlu.py b/evaluation/mmlu/mmlu.py index a4530250..1065fb31 100644 --- a/evaluation/mmlu/mmlu.py +++ b/evaluation/mmlu/mmlu.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import datasets diff --git a/scripts/cal_flops.py b/scripts/cal_flops.py index ac87e0ab..627b5534 100644 --- a/scripts/cal_flops.py +++ b/scripts/cal_flops.py @@ -1,7 +1,20 @@ # coding=utf-8 -# Calculates the flops of pre-trained models. -# Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512 -# Inspired by: https://www.deepspeed.ai/tutorials/flops-profiler/ +# Copyright 2024 Microsoft Corporation and the LlamaFactory team. +# +# This code is inspired by Microsoft's DeepSpeed library. +# https://www.deepspeed.ai/tutorials/flops-profiler/ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import fire import torch @@ -17,6 +30,10 @@ def calculate_flops( seq_length: int = 256, flash_attn: str = "auto", ): + r""" + Calculates the flops of pre-trained models. + Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512 + """ with get_accelerator().device(0): chat_model = ChatModel(dict(model_name_or_path=model_name_or_path, template="empty", flash_attn=flash_attn)) fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.model.device) diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py index bfa32cc9..ff21d27c 100644 --- a/scripts/cal_lr.py +++ b/scripts/cal_lr.py @@ -1,7 +1,20 @@ # coding=utf-8 -# Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters. 
-# Usage: python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en --cutoff_len 1024 --batch_size 16 -# Inspired by: https://github.com/imoneoi/openchat/blob/master/ochat/training_deepspeed/train.py +# Copyright 2024 imoneoi and the LlamaFactory team. +# +# This code is inspired by imoneoi's OpenChat library. +# https://github.com/imoneoi/openchat/blob/3.6.0/ochat/training_deepspeed/train.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math from typing import Literal @@ -32,6 +45,10 @@ def calculate_lr( cutoff_len: int = 1024, # i.e. maximum input length during training is_mistral: bool = False, # mistral model uses a smaller learning rate, ): + r""" + Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters. + Usage: python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en --cutoff_len 1024 --batch_size 16 + """ model_args, data_args, training_args, _, _ = get_train_args( dict( stage=stage, diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py index 387b756c..fb503629 100644 --- a/scripts/cal_ppl.py +++ b/scripts/cal_ppl.py @@ -1,6 +1,17 @@ # coding=utf-8 -# Calculates the ppl on the dataset of the pre-trained models. -# Usage: python cal_ppl.py --model_name_or_path path_to_model --save_name ppl.json +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json from dataclasses import dataclass @@ -56,6 +67,10 @@ def cal_ppl( max_samples: Optional[int] = None, train_on_prompt: bool = False, ): + r""" + Calculates the ppl on the dataset of the pre-trained models. + Usage: python cal_ppl.py --model_name_or_path path_to_model --save_name ppl.json + """ model_args, data_args, training_args, finetuning_args, _ = get_train_args( dict( stage=stage, diff --git a/scripts/length_cdf.py b/scripts/length_cdf.py index 7739dcf0..4cdf01e6 100644 --- a/scripts/length_cdf.py +++ b/scripts/length_cdf.py @@ -1,6 +1,17 @@ # coding=utf-8 -# Calculates the distribution of the input lengths in the dataset. -# Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en --template default +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from collections import defaultdict @@ -19,6 +30,10 @@ def length_cdf( template: str = "default", interval: int = 1000, ): + r""" + Calculates the distribution of the input lengths in the dataset. + Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en --template default + """ model_args, data_args, training_args, _, _ = get_train_args( dict( stage="sft", diff --git a/scripts/llama_pro.py b/scripts/llama_pro.py index 727998ae..f315335a 100644 --- a/scripts/llama_pro.py +++ b/scripts/llama_pro.py @@ -1,7 +1,20 @@ # coding=utf-8 -# Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models. -# Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8 -# Inspired by: https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py +# Copyright 2024 Tencent Inc. and the LlamaFactory team. +# +# This code is inspired by Tencent's LLaMA-Pro library. +# https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import os @@ -37,6 +50,10 @@ def block_expansion( shard_size: Optional[str] = "2GB", save_safetensors: Optional[bool] = False, ): + r""" + Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models. + Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8 + """ config: "PretrainedConfig" = AutoConfig.from_pretrained(model_name_or_path) num_layers = getattr(config, "num_hidden_layers") setattr(config, "num_hidden_layers", num_layers + num_expand) diff --git a/scripts/llamafy_baichuan2.py b/scripts/llamafy_baichuan2.py index 1ae58879..19284f5f 100644 --- a/scripts/llamafy_baichuan2.py +++ b/scripts/llamafy_baichuan2.py @@ -1,8 +1,17 @@ # coding=utf-8 -# Converts the Baichuan2-7B model in the same format as LLaMA2-7B. -# Usage: python llamafy_baichuan2.py --input_dir input --output_dir output -# Inspired by: https://huggingface.co/fireballoon/baichuan-llama-7b/blob/main/convert_baichuan_to_llama.py -# Converted model: https://huggingface.co/hiyouga/Baichuan2-7B-Base-LLaMAfied +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import os @@ -79,6 +88,11 @@ def save_config(input_dir: str, output_dir: str): def llamafy_baichuan2( input_dir: str, output_dir: str, shard_size: Optional[str] = "2GB", save_safetensors: Optional[bool] = False ): + r""" + Converts the Baichuan2-7B model in the same format as LLaMA2-7B. + Usage: python llamafy_baichuan2.py --input_dir input --output_dir output + Converted model: https://huggingface.co/hiyouga/Baichuan2-7B-Base-LLaMAfied + """ try: os.makedirs(output_dir, exist_ok=False) except Exception as e: diff --git a/scripts/llamafy_qwen.py b/scripts/llamafy_qwen.py index 69cf3e8e..e5b59483 100644 --- a/scripts/llamafy_qwen.py +++ b/scripts/llamafy_qwen.py @@ -1,7 +1,17 @@ # coding=utf-8 -# Converts the Qwen models in the same format as LLaMA2. -# Usage: python llamafy_qwen.py --input_dir input --output_dir output -# Converted model: https://huggingface.co/hiyouga/Qwen-14B-Chat-LLaMAfied +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import os @@ -131,6 +141,11 @@ def save_config(input_dir: str, output_dir: str, torch_dtype: str): def llamafy_qwen( input_dir: str, output_dir: str, shard_size: Optional[str] = "2GB", save_safetensors: Optional[bool] = False ): + r""" + Converts the Qwen models in the same format as LLaMA2. + Usage: python llamafy_qwen.py --input_dir input --output_dir output + Converted model: https://huggingface.co/hiyouga/Qwen-14B-Chat-LLaMAfied + """ try: os.makedirs(output_dir, exist_ok=False) except Exception as e: diff --git a/scripts/loftq_init.py b/scripts/loftq_init.py index 7f244316..159dea06 100644 --- a/scripts/loftq_init.py +++ b/scripts/loftq_init.py @@ -1,7 +1,20 @@ # coding=utf-8 -# Initializes LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ) -# Usage: python loftq_init.py --model_name_or_path path_to_model --save_dir output_dir -# Inspired by: https://github.com/huggingface/peft/blob/main/examples/loftq_finetuning/quantize_save_load.py +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's PEFT library. +# https://github.com/huggingface/peft/blob/v0.10.0/examples/loftq_finetuning/quantize_save_load.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os from typing import TYPE_CHECKING, Optional @@ -49,6 +62,10 @@ def quantize_loftq( lora_target: Optional[str] = "q_proj,v_proj", save_safetensors: Optional[bool] = False, ): + r""" + Initializes LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ) + Usage: python loftq_init.py --model_name_or_path path_to_model --save_dir output_dir + """ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") loftq_config = LoftQConfig(loftq_bits=loftq_bits, loftq_iter=loftq_iter) diff --git a/scripts/test_toolcall.py b/scripts/test_toolcall.py index 7e460017..6f6fd06c 100644 --- a/scripts/test_toolcall.py +++ b/scripts/test_toolcall.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from typing import Sequence diff --git a/setup.py b/setup.py index 1756551e..3d2ac921 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import re diff --git a/src/api.py b/src/api.py index 3655e393..0f925497 100644 --- a/src/api.py +++ b/src/api.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import os import uvicorn diff --git a/src/llamafactory/__init__.py b/src/llamafactory/__init__.py index 78230937..9d732777 100644 --- a/src/llamafactory/__init__.py +++ b/src/llamafactory/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Level: api, webui > chat, eval, train > data, model > hparams > extras from .cli import VERSION diff --git a/src/llamafactory/api/app.py b/src/llamafactory/api/app.py index 21edab2f..c1264617 100644 --- a/src/llamafactory/api/app.py +++ b/src/llamafactory/api/app.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from contextlib import asynccontextmanager from typing import Optional diff --git a/src/llamafactory/api/chat.py b/src/llamafactory/api/chat.py index 98957bc1..a2074dbb 100644 --- a/src/llamafactory/api/chat.py +++ b/src/llamafactory/api/chat.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import base64 import io import json diff --git a/src/llamafactory/api/common.py b/src/llamafactory/api/common.py index 5ad9a071..d1ac94de 100644 --- a/src/llamafactory/api/common.py +++ b/src/llamafactory/api/common.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
 import json
 from typing import TYPE_CHECKING, Any, Dict
 
diff --git a/src/llamafactory/api/protocol.py b/src/llamafactory/api/protocol.py
index 055fa781..a69132ea 100644
--- a/src/llamafactory/api/protocol.py
+++ b/src/llamafactory/api/protocol.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import time
 from enum import Enum, unique
 from typing import Any, Dict, List, Optional, Union
diff --git a/src/llamafactory/chat/__init__.py b/src/llamafactory/chat/__init__.py
index a1a79de6..07276d48 100644
--- a/src/llamafactory/chat/__init__.py
+++ b/src/llamafactory/chat/__init__.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from .base_engine import BaseEngine
 from .chat_model import ChatModel
 
diff --git a/src/llamafactory/chat/base_engine.py b/src/llamafactory/chat/base_engine.py
index 65b6c59c..92a51ebe 100644
--- a/src/llamafactory/chat/base_engine.py
+++ b/src/llamafactory/chat/base_engine.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Literal, Optional, Sequence, Union
diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py
index 281ef0c1..fb800106 100644
--- a/src/llamafactory/chat/chat_model.py
+++ b/src/llamafactory/chat/chat_model.py
@@ -1,3 +1,20 @@
+# Copyright 2024 THUDM and the LlamaFactory team.
+#
+# This code is inspired by THUDM's ChatGLM implementation.
+# https://github.com/THUDM/ChatGLM-6B/blob/main/cli_demo.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import asyncio from threading import Thread from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, Generator, List, Optional, Sequence diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 28e6a409..a7ff7015 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import asyncio import concurrent.futures import os diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index f0812a99..d488a039 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import uuid from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence, Union diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py index 5042e53c..c7f136b3 100644 --- a/src/llamafactory/cli.py +++ b/src/llamafactory/cli.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random import subprocess diff --git a/src/llamafactory/data/__init__.py b/src/llamafactory/data/__init__.py index b08691d3..307853bc 100644 --- a/src/llamafactory/data/__init__.py +++ b/src/llamafactory/data/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .collator import KTODataCollatorWithPadding, PairwiseDataCollatorWithPadding from .data_utils import Role, split_dataset from .loader import get_dataset diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py index 3e9d5c46..299bdca3 100644 --- a/src/llamafactory/data/aligner.py +++ b/src/llamafactory/data/aligner.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from functools import partial from typing import TYPE_CHECKING, Any, Dict, List, Union diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index 1dc8dd8d..e4859ff5 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from typing import Any, Dict, Sequence diff --git a/src/llamafactory/data/data_utils.py b/src/llamafactory/data/data_utils.py index 9b313112..cc9761b1 100644 --- a/src/llamafactory/data/data_utils.py +++ b/src/llamafactory/data/data_utils.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from enum import Enum, unique from typing import TYPE_CHECKING, Dict, List, Tuple, Union diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 0cd3d6c1..590e682b 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import re from abc import ABC, abstractmethod diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index ba426f81..f44ef5de 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import inspect import os import sys diff --git a/src/llamafactory/data/parser.py b/src/llamafactory/data/parser.py index ec97bfc1..4bebcd68 100644 --- a/src/llamafactory/data/parser.py +++ b/src/llamafactory/data/parser.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from dataclasses import dataclass diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py index 875f55d6..9a8b97f3 100644 --- a/src/llamafactory/data/preprocess.py +++ b/src/llamafactory/data/preprocess.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from functools import partial from typing import TYPE_CHECKING, Callable, Literal, Optional, Tuple diff --git a/src/llamafactory/data/processors/feedback.py b/src/llamafactory/data/processors/feedback.py index 5fba452c..219ab353 100644 --- a/src/llamafactory/data/processors/feedback.py +++ b/src/llamafactory/data/processors/feedback.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py index db52c6a7..b2939348 100644 --- a/src/llamafactory/data/processors/pairwise.py +++ b/src/llamafactory/data/processors/pairwise.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index a10ccabd..fb4c840c 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from itertools import chain from typing import TYPE_CHECKING, Any, Dict, List diff --git a/src/llamafactory/data/processors/processor_utils.py b/src/llamafactory/data/processors/processor_utils.py index 9903a053..93df0cd5 100644 --- a/src/llamafactory/data/processors/processor_utils.py +++ b/src/llamafactory/data/processors/processor_utils.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import bisect from typing import TYPE_CHECKING, List, Sequence diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index f59f5371..eb5ffb1a 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections import defaultdict from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py index 38497a15..75ad4d51 100644 --- a/src/llamafactory/data/processors/unsupervised.py +++ b/src/llamafactory/data/processors/unsupervised.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.logging import get_logger diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index b600c567..786c679f 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union
 
diff --git a/src/llamafactory/eval/evaluator.py b/src/llamafactory/eval/evaluator.py
index 5c6fb104..bbd7a44b 100644
--- a/src/llamafactory/eval/evaluator.py
+++ b/src/llamafactory/eval/evaluator.py
@@ -1,4 +1,41 @@
-# Inspired by: https://github.com/hendrycks/test/blob/master/evaluate_flan.py
+# Copyright 2024 the LlamaFactory team.
+#
+# This code is inspired by Dan Hendrycks' test library.
+# https://github.com/hendrycks/test/blob/master/evaluate_flan.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License
+#
+# Copyright (c) 2020 Dan Hendrycks
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 
 import inspect
 import json
diff --git a/src/llamafactory/eval/template.py b/src/llamafactory/eval/template.py
index 2cbb5aaf..7d524e7c 100644
--- a/src/llamafactory/eval/template.py
+++ b/src/llamafactory/eval/template.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import dataclass
 from typing import Dict, List, Sequence, Tuple
 
diff --git a/src/llamafactory/extras/callbacks.py b/src/llamafactory/extras/callbacks.py
index 441ebbfd..0dff6a69 100644
--- a/src/llamafactory/extras/callbacks.py
+++ b/src/llamafactory/extras/callbacks.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import logging import os diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 7d96fb5f..e31e7419 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections import OrderedDict, defaultdict from enum import Enum from typing import Dict, Optional diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index a8cb799d..586c24c0 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import platform import accelerate diff --git a/src/llamafactory/extras/logging.py b/src/llamafactory/extras/logging.py index 430b8a48..67622212 100644 --- a/src/llamafactory/extras/logging.py +++ b/src/llamafactory/extras/logging.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging import os import sys diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index fc33f77e..3d969df1 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import gc import os from typing import TYPE_CHECKING, Dict, Tuple diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index 0746bb4f..35f546ab 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/utils/import_utils.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import importlib.metadata import importlib.util from functools import lru_cache diff --git a/src/llamafactory/extras/ploting.py b/src/llamafactory/extras/ploting.py index dea23bbe..596d55e7 100644 --- a/src/llamafactory/extras/ploting.py +++ b/src/llamafactory/extras/ploting.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import math import os diff --git a/src/llamafactory/hparams/__init__.py b/src/llamafactory/hparams/__init__.py index d1ee98dd..cfe448c1 100644 --- a/src/llamafactory/hparams/__init__.py +++ b/src/llamafactory/hparams/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .data_args import DataArguments from .evaluation_args import EvaluationArguments from .finetuning_args import FinetuningArguments diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 1e0cd08c..95284766 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass, field from typing import Literal, Optional diff --git a/src/llamafactory/hparams/evaluation_args.py b/src/llamafactory/hparams/evaluation_args.py index 5a05f6f6..a7f221ca 100644 --- a/src/llamafactory/hparams/evaluation_args.py +++ b/src/llamafactory/hparams/evaluation_args.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from dataclasses import dataclass, field from typing import Literal, Optional diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index facbe792..52dc299e 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass, field from typing import List, Literal, Optional diff --git a/src/llamafactory/hparams/generating_args.py b/src/llamafactory/hparams/generating_args.py index 0ee17d1a..7ebb4eed 100644 --- a/src/llamafactory/hparams/generating_args.py +++ b/src/llamafactory/hparams/generating_args.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import asdict, dataclass, field from typing import Any, Dict, Optional diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index bbac2e4b..0a91f0fa 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import asdict, dataclass, field from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 3476a726..1c57567c 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging import os import sys diff --git a/src/llamafactory/launcher.py b/src/llamafactory/launcher.py index de154db9..65e0b68f 100644 --- a/src/llamafactory/launcher.py +++ b/src/llamafactory/launcher.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from llamafactory.train.tuner import run_exp diff --git a/src/llamafactory/model/__init__.py b/src/llamafactory/model/__init__.py index 9d23d59f..4abbaa1b 100644 --- a/src/llamafactory/model/__init__.py +++ b/src/llamafactory/model/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .loader import load_config, load_model, load_tokenizer from .model_utils.misc import find_all_linear_modules from .model_utils.valuehead import load_valuehead_params diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index c37f6009..dfa71525 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import re from typing import TYPE_CHECKING diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 697a04e7..69cccd93 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, Optional, TypedDict from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index 2bd36fdc..8ff3807b 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import TYPE_CHECKING
 
 from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available
diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py
index e0657be8..e4e84b12 100644
--- a/src/llamafactory/model/model_utils/checkpointing.py
+++ b/src/llamafactory/model/model_utils/checkpointing.py
@@ -1,3 +1,21 @@
+# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by HuggingFace's Transformers and PEFT libraries.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/modeling_utils.py
+# https://github.com/huggingface/peft/blob/v0.10.0/src/peft/utils/other.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from functools import partial
 from types import MethodType
@@ -68,7 +86,6 @@ def prepare_model_for_training(
     (1) cast the layernorm in fp32
     (2) make output embedding layer require grads
     (3) add the upcasting of the lm_head in fp32
-    Inspired by: https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/other.py#L72
     """
     if model_args.upcast_layernorm:
         logger.info("Upcasting layernorm weights in float32.")
diff --git a/src/llamafactory/model/model_utils/embedding.py b/src/llamafactory/model/model_utils/embedding.py
index 3d9278e3..3ff79828 100644
--- a/src/llamafactory/model/model_utils/embedding.py
+++ b/src/llamafactory/model/model_utils/embedding.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 from contextlib import nullcontext
 from typing import TYPE_CHECKING
diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py
index 4a8c562a..7af43dcf 100644
--- a/src/llamafactory/model/model_utils/longlora.py
+++ b/src/llamafactory/model/model_utils/longlora.py
@@ -1,3 +1,20 @@
+# Copyright 2024 EleutherAI, HuggingFace Inc., and the LlamaFactory team.
+#
+# This code is based on EleutherAI's GPT-NeoX and HuggingFace's Transformers libraries.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math from typing import TYPE_CHECKING, Optional, Tuple diff --git a/src/llamafactory/model/model_utils/misc.py b/src/llamafactory/model/model_utils/misc.py index 4851bd29..a2812228 100644 --- a/src/llamafactory/model/model_utils/misc.py +++ b/src/llamafactory/model/model_utils/misc.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, List from ...extras.logging import get_logger diff --git a/src/llamafactory/model/model_utils/mod.py b/src/llamafactory/model/model_utils/mod.py index 5708a1a8..ec73af00 100644 --- a/src/llamafactory/model/model_utils/mod.py +++ b/src/llamafactory/model/model_utils/mod.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING from ...extras.constants import MOD_SUPPORTED_MODELS diff --git a/src/llamafactory/model/model_utils/moe.py b/src/llamafactory/model/model_utils/moe.py index 8a73c844..5c7473aa 100644 --- a/src/llamafactory/model/model_utils/moe.py +++ b/src/llamafactory/model/model_utils/moe.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
 from typing import TYPE_CHECKING, Sequence
 
 import torch
diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py
index 02a54f07..9e6b9da4 100644
--- a/src/llamafactory/model/model_utils/quantization.py
+++ b/src/llamafactory/model/model_utils/quantization.py
@@ -1,3 +1,20 @@
+# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by HuggingFace's Optimum library.
+# https://github.com/huggingface/optimum/blob/v1.20.0/optimum/gptq/data.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import random
 from enum import Enum, unique
@@ -41,7 +58,6 @@ class QuantizationMethod(str, Enum):
 
 def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]:
     r"""
-    Inspired by: https://github.com/huggingface/optimum/blob/v1.16.0/optimum/gptq/data.py#L133
     TODO: remove tokenizer.decode() https://github.com/huggingface/optimum/pull/1600
     """
     if os.path.isfile(model_args.export_quantization_dataset):
diff --git a/src/llamafactory/model/model_utils/rope.py b/src/llamafactory/model/model_utils/rope.py
index 93ab8929..88303c4d 100644
--- a/src/llamafactory/model/model_utils/rope.py
+++ b/src/llamafactory/model/model_utils/rope.py
@@ -1,3 +1,21 @@
+# Copyright 2024 LMSYS and the LlamaFactory team.
+# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+#
+# This code is inspired by LMSYS's FastChat library.
+# https://github.com/lm-sys/FastChat/blob/v0.2.30/fastchat/train/train.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 from typing import TYPE_CHECKING
 
diff --git a/src/llamafactory/model/model_utils/unsloth.py b/src/llamafactory/model/model_utils/unsloth.py
index 8a16409d..9cfaec61 100644
--- a/src/llamafactory/model/model_utils/unsloth.py
+++ b/src/llamafactory/model/model_utils/unsloth.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
 from ...extras.logging import get_logger
 
diff --git a/src/llamafactory/model/model_utils/valuehead.py b/src/llamafactory/model/model_utils/valuehead.py
index 64333688..9ab3d45a 100644
--- a/src/llamafactory/model/model_utils/valuehead.py
+++ b/src/llamafactory/model/model_utils/valuehead.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import TYPE_CHECKING, Dict
 
 import torch
diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py
index c8260b7f..37237485 100644
--- a/src/llamafactory/model/model_utils/visual.py
+++ b/src/llamafactory/model/model_utils/visual.py
@@ -1,3 +1,20 @@
+# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by HuggingFace's Transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/modeling_llava.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import TYPE_CHECKING, Tuple
 
 import torch
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index b97ff433..053516e4 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict
diff --git a/src/llamafactory/train/dpo/__init__.py b/src/llamafactory/train/dpo/__init__.py
index 43fe9420..9ce0d089 100644
--- a/src/llamafactory/train/dpo/__init__.py
+++ b/src/llamafactory/train/dpo/__init__.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .workflow import run_dpo diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index 5bdb9c43..475d08c3 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import warnings from collections import defaultdict from contextlib import nullcontext diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 992985b0..8c3c2eb1 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -1,4 +1,19 @@ -# Inspired by: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/dpo.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import TYPE_CHECKING, List, Optional diff --git a/src/llamafactory/train/kto/__init__.py b/src/llamafactory/train/kto/__init__.py index 34c7905a..a1900368 100644 --- a/src/llamafactory/train/kto/__init__.py +++ b/src/llamafactory/train/kto/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .workflow import run_kto diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 3b4488fc..6e96fc0c 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/kto_trainer.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import warnings from collections import defaultdict from contextlib import nullcontext diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py index c79b160b..8a7af6d4 100644 --- a/src/llamafactory/train/kto/workflow.py +++ b/src/llamafactory/train/kto/workflow.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/kto.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, List, Optional from ...data import KTODataCollatorWithPadding, get_dataset, split_dataset diff --git a/src/llamafactory/train/ppo/__init__.py b/src/llamafactory/train/ppo/__init__.py index d17336d5..161f6f5d 100644 --- a/src/llamafactory/train/ppo/__init__.py +++ b/src/llamafactory/train/ppo/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .workflow import run_ppo diff --git a/src/llamafactory/train/ppo/ppo_utils.py b/src/llamafactory/train/ppo/ppo_utils.py index fec3fc1e..05c40946 100644 --- a/src/llamafactory/train/ppo/ppo_utils.py +++ b/src/llamafactory/train/ppo/ppo_utils.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json from contextlib import nullcontext from typing import TYPE_CHECKING, Dict, List, Literal, Optional diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 45f47455..61420f3b 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/ppo_trainer.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math import os import sys diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 111704c6..891d539a 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -1,4 +1,19 @@ -# Inspired by: https://github.com/lvwerra/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/ppo.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import TYPE_CHECKING, List, Optional diff --git a/src/llamafactory/train/pt/__init__.py b/src/llamafactory/train/pt/__init__.py index bdf397f6..d80e6f22 100644 --- a/src/llamafactory/train/pt/__init__.py +++ b/src/llamafactory/train/pt/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .workflow import run_pt diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index 1d96e82f..09729f2e 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from types import MethodType from typing import TYPE_CHECKING, Dict, Optional diff --git a/src/llamafactory/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py index 8a635567..f1df314e 100644 --- a/src/llamafactory/train/pt/workflow.py +++ b/src/llamafactory/train/pt/workflow.py @@ -1,4 +1,19 @@ -# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/language-modeling/run_clm.py +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math from typing import TYPE_CHECKING, List, Optional diff --git a/src/llamafactory/train/rm/__init__.py b/src/llamafactory/train/rm/__init__.py index dedac35f..48278315 100644 --- a/src/llamafactory/train/rm/__init__.py +++ b/src/llamafactory/train/rm/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .workflow import run_rm diff --git a/src/llamafactory/train/rm/metric.py b/src/llamafactory/train/rm/metric.py index 99dc6ab8..fb880b1c 100644 --- a/src/llamafactory/train/rm/metric.py +++ b/src/llamafactory/train/rm/metric.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Dict, Sequence, Tuple, Union import numpy as np diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index bfb344dc..14695d7d 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -1,3 +1,42 @@ +# Copyright 2024 the LlamaFactory team. +# +# This code is inspired by CarperAI's trlx library. +# https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/reward_model.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# MIT License +# +# Copyright (c) 2022 CarperAI +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + import json import os from types import MethodType @@ -79,7 +118,6 @@ class PairwiseTrainer(Trainer): chosen_scores, rejected_scores = [], [] # Compute pairwise loss. Only backprop on the different tokens before padding - # Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py loss = 0 for i in range(batch_size): chosen_length = (chosen_input_ids[i] != self.tokenizer.pad_token_id).nonzero()[-1] + 1 diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 2e9e194b..75c0a2bf 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -1,4 +1,41 @@ -# Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py +# Copyright 2024 the LlamaFactory team. +# +# This code is inspired by CarperAI's trlx library. 
+# https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# MIT License +# +# Copyright (c) 2022 CarperAI +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. from typing import TYPE_CHECKING, List, Optional diff --git a/src/llamafactory/train/sft/__init__.py b/src/llamafactory/train/sft/__init__.py index f2f84e78..475dfe5f 100644 --- a/src/llamafactory/train/sft/__init__.py +++ b/src/llamafactory/train/sft/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .workflow import run_sft diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index 923238d6..d2147c22 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -1,3 +1,21 @@ +# Copyright 2024 HuggingFace Inc., THUDM, and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library and THUDM's ChatGLM implementation. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py +# https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union

diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py
index 6bf5b7c0..6ab6914e 100644
--- a/src/llamafactory/train/sft/trainer.py
+++ b/src/llamafactory/train/sft/trainer.py
@@ -1,3 +1,20 @@
+# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer_seq2seq.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import json
 import os
 from types import MethodType
diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py
index a989b3f7..dfc71cfb 100644
--- a/src/llamafactory/train/sft/workflow.py
+++ b/src/llamafactory/train/sft/workflow.py
@@ -1,4 +1,19 @@
-# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py
+# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 from typing import TYPE_CHECKING, List, Optional

diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py
index 48944a63..5621d5df 100644
--- a/src/llamafactory/train/trainer_utils.py
+++ b/src/llamafactory/train/trainer_utils.py
@@ -1,3 +1,22 @@
+# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by GaLore's implementation: https://github.com/jiaweizzhao/GaLore
+# and LoRA+'s implementation: https://github.com/nikhil-ghosh-berkeley/loraplus
+# and BAdam's implementation: https://github.com/Ledzy/BAdam
+# and TRL's implementation: https://github.com/huggingface/trl
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py index eed875e9..788b4c4f 100644 --- a/src/llamafactory/train/tuner.py +++ b/src/llamafactory/train/tuner.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, List, Optional import torch diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py index c82710d3..864c41c7 100644 --- a/src/llamafactory/webui/chatter.py +++ b/src/llamafactory/webui/chatter.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from typing import TYPE_CHECKING, Dict, Generator, List, Optional, Sequence, Tuple diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 3b8d5378..980428a4 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from collections import defaultdict diff --git a/src/llamafactory/webui/components/__init__.py b/src/llamafactory/webui/components/__init__.py index 5c1e21b8..715fb6e4 100644 --- a/src/llamafactory/webui/components/__init__.py +++ b/src/llamafactory/webui/components/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .chatbot import create_chat_box from .eval import create_eval_tab from .export import create_export_tab diff --git a/src/llamafactory/webui/components/chatbot.py b/src/llamafactory/webui/components/chatbot.py index f83694b1..ad74114b 100644 --- a/src/llamafactory/webui/components/chatbot.py +++ b/src/llamafactory/webui/components/chatbot.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict, Tuple from ...data import Role diff --git a/src/llamafactory/webui/components/data.py b/src/llamafactory/webui/components/data.py index 232b973d..88e500cf 100644 --- a/src/llamafactory/webui/components/data.py +++ b/src/llamafactory/webui/components/data.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from typing import TYPE_CHECKING, Any, Dict, List, Tuple diff --git a/src/llamafactory/webui/components/eval.py b/src/llamafactory/webui/components/eval.py index 0a7a0f44..b522913e 100644 --- a/src/llamafactory/webui/components/eval.py +++ b/src/llamafactory/webui/components/eval.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from typing import TYPE_CHECKING, Dict from ...extras.packages import is_gradio_available diff --git a/src/llamafactory/webui/components/export.py b/src/llamafactory/webui/components/export.py index 9d756a38..14257949 100644 --- a/src/llamafactory/webui/components/export.py +++ b/src/llamafactory/webui/components/export.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict, Generator, List, Union from ...extras.constants import PEFT_METHODS diff --git a/src/llamafactory/webui/components/infer.py b/src/llamafactory/webui/components/infer.py index 970f4629..03bccd7f 100644 --- a/src/llamafactory/webui/components/infer.py +++ b/src/llamafactory/webui/components/infer.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict from ...extras.packages import is_gradio_available diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index fd0ead3d..2515a83d 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict from ...data import TEMPLATES diff --git a/src/llamafactory/webui/components/train.py b/src/llamafactory/webui/components/train.py index 72dfc858..673f6bf4 100644 --- a/src/llamafactory/webui/components/train.py +++ b/src/llamafactory/webui/components/train.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict from transformers.trainer_utils import SchedulerType diff --git a/src/llamafactory/webui/css.py b/src/llamafactory/webui/css.py index 36e3d4c2..53982119 100644 --- a/src/llamafactory/webui/css.py +++ b/src/llamafactory/webui/css.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + CSS = r""" .duplicate-button { margin: auto !important; diff --git a/src/llamafactory/webui/engine.py b/src/llamafactory/webui/engine.py index eb6142d3..04893215 100644 --- a/src/llamafactory/webui/engine.py +++ b/src/llamafactory/webui/engine.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict from .chatter import WebChatModel diff --git a/src/llamafactory/webui/interface.py b/src/llamafactory/webui/interface.py index bae3ba76..d25f4d38 100644 --- a/src/llamafactory/webui/interface.py +++ b/src/llamafactory/webui/interface.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from ..extras.packages import is_gradio_available diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index e30feab2..427f01b8 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + LOCALES = { "lang": { "en": { diff --git a/src/llamafactory/webui/manager.py b/src/llamafactory/webui/manager.py index 326fdb8d..7e9b801a 100644 --- a/src/llamafactory/webui/manager.py +++ b/src/llamafactory/webui/manager.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict, Generator, List, Set, Tuple diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 35014628..76982934 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from copy import deepcopy from subprocess import Popen, TimeoutExpired diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index e39f2aa4..6ce2a8e7 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os import signal diff --git a/src/train.py b/src/train.py index b20aa9d2..6703ffdb 100644 --- a/src/train.py +++ b/src/train.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from llamafactory.train.tuner import run_exp diff --git a/src/webui.py b/src/webui.py index bbefb54e..99370af2 100644 --- a/src/webui.py +++ b/src/webui.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from llamafactory.webui.interface import create_ui diff --git a/tests/data/test_supervised.py b/tests/data/test_supervised.py index 63a3453f..a72800d2 100644 --- a/tests/data/test_supervised.py +++ b/tests/data/test_supervised.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random diff --git a/tests/eval/test_eval_template.py b/tests/eval/test_eval_template.py index f6a91a67..f85d9d57 100644 --- a/tests/eval/test_eval_template.py +++ b/tests/eval/test_eval_template.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from llamafactory.eval.template import get_eval_template diff --git a/tests/model/model_utils/test_attention.py b/tests/model/model_utils/test_attention.py index 751adda4..97ac9dcc 100644 --- a/tests/model/model_utils/test_attention.py +++ b/tests/model/model_utils/test_attention.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available diff --git a/tests/model/test_base.py b/tests/model/test_base.py index 32a3918e..462e8cfa 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import torch diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py index a0618315..ac5a023c 100644 --- a/tests/model/test_freeze.py +++ b/tests/model/test_freeze.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import torch diff --git a/tests/model/test_full.py b/tests/model/test_full.py index 802b987c..bcd6480f 100644 --- a/tests/model/test_full.py +++ b/tests/model/test_full.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import torch diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index 3e2503f1..e49c026c 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from typing import Sequence From 61aaab22c93fe89145f1bea926b00004352bd7fc Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 17:58:52 +0800 Subject: [PATCH 044/162] add minicpm #4227 Former-commit-id: e1bb18ce60be9a1b203989def30f1b9194286325 --- src/llamafactory/extras/constants.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index e31e7419..73a9969d 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -682,6 +682,21 @@ register_model_group( ) +register_model_group( + models={ + "MiniCPM-2B-SFT-Chat": { + DownloadSource.DEFAULT: "openbmb/MiniCPM-2B-sft-bf16", + DownloadSource.MODELSCOPE: "OpenBMB/miniCPM-bf16", + }, + "MiniCPM-2B-DPO-Chat": { + DownloadSource.DEFAULT: "openbmb/MiniCPM-2B-dpo-bf16", + DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-2B-dpo-bf16", + }, + }, + template="cpm", +) + + register_model_group( models={ "Mistral-7B-v0.1": { From 4851ef85b795f2c03d1d90f9fb57caa2d1f59258 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 19:51:20 +0800 Subject: [PATCH 045/162] add tests Former-commit-id: 484634ee9c982e82e919ff67d507e0210345182d --- Makefile | 2 +- src/llamafactory/extras/misc.py | 9 ++- tests/data/test_supervised.py | 2 +- tests/model/model_utils/test_checkpointing.py | 74 +++++++++++++++++++ tests/model/test_base.py | 30 +++++++- tests/model/test_freeze.py | 3 + tests/model/test_full.py | 2 + tests/model/test_lora.py | 58 +++++++++++++-- 8 files changed, 166 insertions(+), 14 deletions(-) create mode 100644 tests/model/model_utils/test_checkpointing.py diff --git a/Makefile b/Makefile index 65be047b..3f13b215 100644 --- a/Makefile +++ b/Makefile @@ -11,4 +11,4 @@ style: ruff format $(check_dirs) test: - pytest tests/ + CUDA_VISIBLE_DEVICES= pytest tests/ diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 3d969df1..93153b3e 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -22,6 +22,7 @@ from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList, PreTr from transformers.utils import ( SAFE_WEIGHTS_NAME, WEIGHTS_NAME, + is_safetensors_available, is_torch_bf16_gpu_available, is_torch_cuda_available, is_torch_mps_available, @@ -34,6 +35,11 @@ from .constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from .logging import get_logger +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.torch import save_file + + _is_fp16_available = is_torch_npu_available() or is_torch_cuda_available() try: _is_bf16_available = is_torch_bf16_gpu_available() @@ -128,9 +134,6 @@ def fix_valuehead_checkpoint( return if safe_serialization: - from safetensors import safe_open - from safetensors.torch import save_file - path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME) with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f: state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()} diff --git 
a/tests/data/test_supervised.py b/tests/data/test_supervised.py index a72800d2..9f7b2dbf 100644 --- a/tests/data/test_supervised.py +++ b/tests/data/test_supervised.py @@ -41,7 +41,7 @@ TRAIN_ARGS = { } -@pytest.mark.parametrize("num_samples", [10]) +@pytest.mark.parametrize("num_samples", [16]) def test_supervised(num_samples: int): model_args, data_args, training_args, _, _ = get_train_args(TRAIN_ARGS) tokenizer_module = load_tokenizer(model_args) diff --git a/tests/model/model_utils/test_checkpointing.py b/tests/model/model_utils/test_checkpointing.py new file mode 100644 index 00000000..670e693d --- /dev/null +++ b/tests/model/model_utils/test_checkpointing.py @@ -0,0 +1,74 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch + +from llamafactory.extras.misc import get_current_device +from llamafactory.hparams import get_train_args +from llamafactory.model import load_model, load_tokenizer + + +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") + +TRAIN_ARGS = { + "model_name_or_path": TINY_LLAMA, + "stage": "sft", + "do_train": True, + "finetuning_type": "lora", + "lora_target": "all", + "dataset": "llamafactory/tiny-supervised-dataset", + "dataset_dir": "ONLINE", + "template": "llama3", + "cutoff_len": 1024, + "overwrite_cache": True, + "output_dir": "dummy_dir", + "overwrite_output_dir": True, + "fp16": True, +} + + +def test_checkpointing_enable(): + model_args, _, _, finetuning_args, _ = get_train_args({"disable_gradient_checkpointing": False, **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()): + assert getattr(module, "gradient_checkpointing") is True + + +def test_checkpointing_disable(): + model_args, _, _, finetuning_args, _ = get_train_args({"disable_gradient_checkpointing": True, **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()): + assert getattr(module, "gradient_checkpointing") is False + + +def test_upcast_layernorm(): + model_args, _, _, finetuning_args, _ = get_train_args({"upcast_layernorm": True, **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for name, param in model.named_parameters(): + if param.ndim == 1 and "norm" in name: + assert param.dtype == torch.float32 + + +def test_upcast_lmhead_output(): + model_args, _, _, finetuning_args, _ = get_train_args({"upcast_lmhead_output": True, **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + inputs = 
torch.randn((1, 16), dtype=torch.float16, device=get_current_device()) + outputs: "torch.Tensor" = model.lm_head(inputs) + assert outputs.dtype == torch.float32 diff --git a/tests/model/test_base.py b/tests/model/test_base.py index 462e8cfa..ee0b2886 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -13,16 +13,21 @@ # limitations under the License. import os +from typing import Dict import torch from transformers import AutoModelForCausalLM +from trl import AutoModelForCausalLMWithValueHead +from llamafactory.extras.misc import get_current_device from llamafactory.hparams import get_infer_args from llamafactory.model import load_model, load_tokenizer TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") +TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead") + INFER_ARGS = { "model_name_or_path": TINY_LLAMA, "template": "llama3", @@ -38,9 +43,32 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True +def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]): + state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")} + self.v_head.load_state_dict(state_dict, strict=False) + del state_dict + + def test_base(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) - ref_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + + ref_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) + compare_model(model, ref_model) + + +def test_valuehead(): + AutoModelForCausalLMWithValueHead.post_init = post_init # patch for CPU test + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model( + tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False, add_valuehead=True + ) + + ref_model = AutoModelForCausalLMWithValueHead.from_pretrained( + TINY_LLAMA_VALUEHEAD, torch_dtype=torch.float16, device_map=get_current_device() + ) compare_model(model, ref_model) diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py index ac5a023c..5f478af6 100644 --- a/tests/model/test_freeze.py +++ b/tests/model/test_freeze.py @@ -49,6 +49,7 @@ def test_freeze_train_all_modules(): model_args, _, _, finetuning_args, _ = get_train_args({"freeze_trainable_layers": 1, **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for name, param in model.named_parameters(): if name.startswith("model.layers.1."): assert param.requires_grad is True @@ -64,6 +65,7 @@ def test_freeze_train_extra_modules(): ) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for name, param in model.named_parameters(): if name.startswith("model.layers.1.") or any(module in name for module in ["embed_tokens", "lm_head"]): assert param.requires_grad is True @@ -77,6 +79,7 @@ def test_freeze_inference(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = 
load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + for param in model.parameters(): assert param.requires_grad is False assert param.dtype == torch.float16 diff --git a/tests/model/test_full.py b/tests/model/test_full.py index bcd6480f..0a6e0743 100644 --- a/tests/model/test_full.py +++ b/tests/model/test_full.py @@ -49,6 +49,7 @@ def test_full_train(): model_args, _, _, finetuning_args, _ = get_train_args(TRAIN_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for param in model.parameters(): assert param.requires_grad is True assert param.dtype == torch.float32 @@ -58,6 +59,7 @@ def test_full_inference(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + for param in model.parameters(): assert param.requires_grad is False assert param.dtype == torch.float16 diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index e49c026c..4923c8ad 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -18,7 +18,9 @@ from typing import Sequence import torch from peft import LoraModel, PeftModel from transformers import AutoModelForCausalLM +from trl import AutoModelForCausalLMWithValueHead +from llamafactory.extras.misc import get_current_device from llamafactory.hparams import get_infer_args, get_train_args from llamafactory.model import load_model, load_tokenizer @@ -27,6 +29,8 @@ TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") TINY_LLAMA_ADAPTER = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora") +TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead") + TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", @@ -67,10 +71,29 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_k assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True +def test_lora_train_qv_modules(): + model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "q_proj,v_proj", **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + + linear_modules = set() + for name, param in model.named_parameters(): + if any(module in name for module in ["lora_A", "lora_B"]): + linear_modules.add(name.split(".lora_", maxsplit=1)[0].split(".")[-1]) + assert param.requires_grad is True + assert param.dtype == torch.float32 + else: + assert param.requires_grad is False + assert param.dtype == torch.float16 + + assert linear_modules == {"q_proj", "v_proj"} + + def test_lora_train_all_modules(): model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "all", **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + linear_modules = set() for name, param in model.named_parameters(): if any(module in name for module in ["lora_A", "lora_B"]): @@ -90,6 +113,7 @@ def test_lora_train_extra_modules(): ) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + extra_modules = set() for name, param in model.named_parameters(): if any(module in name for 
module in ["lora_A", "lora_B"]): @@ -113,7 +137,9 @@ def test_lora_train_old_adapters(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) - base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) for param in filter(lambda p: p.requires_grad, ref_model.parameters()): param.data = param.data.to(torch.float32) @@ -128,7 +154,9 @@ def test_lora_train_new_adapters(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) - base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) for param in filter(lambda p: p.requires_grad, ref_model.parameters()): param.data = param.data.to(torch.float32) @@ -138,17 +166,31 @@ def test_lora_train_new_adapters(): ) +def test_lora_train_valuehead(): + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model( + tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True, add_valuehead=True + ) + + ref_model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained( + TINY_LLAMA_VALUEHEAD, torch_dtype=torch.float16, device_map=get_current_device() + ) + state_dict = model.state_dict() + ref_state_dict = ref_model.state_dict() + + assert torch.allclose(state_dict["v_head.summary.weight"], ref_state_dict["v_head.summary.weight"]) + assert torch.allclose(state_dict["v_head.summary.bias"], ref_state_dict["v_head.summary.bias"]) + + def test_lora_inference(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) - base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) ref_model: "LoraModel" = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER) ref_model = ref_model.merge_and_unload() compare_model(model, ref_model) - - for name, param in model.named_parameters(): - assert param.requires_grad is False - assert param.dtype == torch.float16 - assert "lora" not in name From f9653ac29c36aa44284d07c102d2fe979b796391 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 20:06:17 +0800 Subject: [PATCH 046/162] use fixture Former-commit-id: 10761985691b9f934f7689c1f82aa6dd68febcca --- src/llamafactory/hparams/model_args.py | 2 +- tests/model/test_base.py | 15 ++++++++++----- tests/model/test_lora.py | 14 +++++++++++++- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 0a91f0fa..53bdbdf2 100644 --- a/src/llamafactory/hparams/model_args.py +++ 
b/src/llamafactory/hparams/model_args.py @@ -163,7 +163,7 @@ class ModelArguments: ) infer_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field( default="auto", - metadata={"help": "Data type for model weights and activations at inference."} + metadata={"help": "Data type for model weights and activations at inference."}, ) hf_hub_token: Optional[str] = field( default=None, diff --git a/tests/model/test_base.py b/tests/model/test_base.py index ee0b2886..2deedde2 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -15,6 +15,7 @@ import os from typing import Dict +import pytest import torch from transformers import AutoModelForCausalLM from trl import AutoModelForCausalLMWithValueHead @@ -43,10 +44,14 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True -def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]): - state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")} - self.v_head.load_state_dict(state_dict, strict=False) - del state_dict +@pytest.fixture +def fix_valuehead_cpu_loading(): + def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]): + state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")} + self.v_head.load_state_dict(state_dict, strict=False) + del state_dict + + AutoModelForCausalLMWithValueHead.post_init = post_init def test_base(): @@ -60,8 +65,8 @@ def test_base(): compare_model(model, ref_model) +@pytest.mark.usefixtures("fix_valuehead_cpu_loading") def test_valuehead(): - AutoModelForCausalLMWithValueHead.post_init = post_init # patch for CPU test model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model( diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index 4923c8ad..fe032332 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -13,8 +13,9 @@ # limitations under the License. 
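The fixture above replaces the earlier module-level monkeypatch: value-head checkpoints store their extra weights under a `v_head.` prefix, so the tests override `AutoModelForCausalLMWithValueHead.post_init` to load only those entries on CPU. A self-contained sketch of the same pattern, assuming `pytest`, `torch`, and `trl` are installed (the test body is a placeholder, not code from the patch):

```python
from typing import Dict

import pytest
import torch
from trl import AutoModelForCausalLMWithValueHead


@pytest.fixture
def fix_valuehead_cpu_loading():
    def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]):
        # keep only the "v_head.*" entries and strip the 7-character prefix before loading
        state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")}
        self.v_head.load_state_dict(state_dict, strict=False)

    # the patch stays in effect for the rest of the test session
    AutoModelForCausalLMWithValueHead.post_init = post_init


@pytest.mark.usefixtures("fix_valuehead_cpu_loading")
def test_valuehead_loading():  # placeholder test name
    ...
```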
import os -from typing import Sequence +from typing import Dict, Sequence +import pytest import torch from peft import LoraModel, PeftModel from transformers import AutoModelForCausalLM @@ -71,6 +72,16 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_k assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True +@pytest.fixture +def fix_valuehead_cpu_loading(): + def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]): + state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")} + self.v_head.load_state_dict(state_dict, strict=False) + del state_dict + + AutoModelForCausalLMWithValueHead.post_init = post_init + + def test_lora_train_qv_modules(): model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "q_proj,v_proj", **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) @@ -166,6 +177,7 @@ def test_lora_train_new_adapters(): ) +@pytest.mark.usefixtures("fix_valuehead_cpu_loading") def test_lora_train_valuehead(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) From 640372cb66f0c06e5bb3cd02f322fdaf0f67219b Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:06:41 +0800 Subject: [PATCH 047/162] tiny fix Former-commit-id: f7f440986b0ae3b38ea9f2da80789629d4f79ea1 --- scripts/cal_flops.py | 2 +- scripts/cal_lr.py | 2 +- scripts/llama_pro.py | 2 +- src/llamafactory/data/processors/pretrain.py | 2 +- src/llamafactory/eval/evaluator.py | 2 +- src/llamafactory/extras/packages.py | 2 +- src/llamafactory/hparams/data_args.py | 2 +- src/llamafactory/model/model_utils/checkpointing.py | 2 +- src/llamafactory/model/model_utils/longlora.py | 6 ++++-- src/llamafactory/model/model_utils/quantization.py | 2 +- src/llamafactory/model/model_utils/visual.py | 2 +- src/llamafactory/train/dpo/workflow.py | 2 +- src/llamafactory/train/kto/trainer.py | 4 ++-- src/llamafactory/train/kto/workflow.py | 2 +- src/llamafactory/train/ppo/trainer.py | 2 +- src/llamafactory/train/ppo/workflow.py | 2 +- src/llamafactory/train/pt/workflow.py | 2 +- src/llamafactory/train/rm/trainer.py | 4 ++-- src/llamafactory/train/rm/workflow.py | 2 +- src/llamafactory/train/sft/metric.py | 2 +- src/llamafactory/train/sft/workflow.py | 2 +- tests/model/test_base.py | 2 +- 22 files changed, 27 insertions(+), 25 deletions(-) diff --git a/scripts/cal_flops.py b/scripts/cal_flops.py index 627b5534..32526d89 100644 --- a/scripts/cal_flops.py +++ b/scripts/cal_flops.py @@ -1,7 +1,7 @@ # coding=utf-8 # Copyright 2024 Microsoft Corporation and the LlamaFactory team. # -# This code is inspired by Microsoft's DeepSpeed library. +# This code is inspired by the Microsoft's DeepSpeed library. # https://www.deepspeed.ai/tutorials/flops-profiler/ # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py index ff21d27c..ad6992cb 100644 --- a/scripts/cal_lr.py +++ b/scripts/cal_lr.py @@ -1,7 +1,7 @@ # coding=utf-8 # Copyright 2024 imoneoi and the LlamaFactory team. # -# This code is inspired by imoneoi's OpenChat library. +# This code is inspired by the imoneoi's OpenChat library. 
# https://github.com/imoneoi/openchat/blob/3.6.0/ochat/training_deepspeed/train.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/scripts/llama_pro.py b/scripts/llama_pro.py index f315335a..395375ef 100644 --- a/scripts/llama_pro.py +++ b/scripts/llama_pro.py @@ -1,7 +1,7 @@ # coding=utf-8 # Copyright 2024 Tencent Inc. and the LlamaFactory team. # -# This code is inspired by Tencent's LLaMA-Pro library. +# This code is inspired by the Tencent's LLaMA-Pro library. # https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index fb4c840c..67d6009b 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/eval/evaluator.py b/src/llamafactory/eval/evaluator.py index bbd7a44b..d3140793 100644 --- a/src/llamafactory/eval/evaluator.py +++ b/src/llamafactory/eval/evaluator.py @@ -1,6 +1,6 @@ # Copyright 2024 the LlamaFactory team. # -# This code is inspired by Dan's test library. +# This code is inspired by the Dan's test library. # https://github.com/hendrycks/test/blob/master/evaluate_flan.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index 35f546ab..0a84a293 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/utils/import_utils.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 95284766..39290e21 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py index e4e84b12..f5314125 100644 --- a/src/llamafactory/model/model_utils/checkpointing.py +++ b/src/llamafactory/model/model_utils/checkpointing.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's Transformers and PEFT library. +# This code is inspired by the HuggingFace's Transformers and PEFT library. 
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/modeling_utils.py # https://github.com/huggingface/peft/blob/v0.10.0/src/peft/utils/other.py # diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py index 7af43dcf..af30bd50 100644 --- a/src/llamafactory/model/model_utils/longlora.py +++ b/src/llamafactory/model/model_utils/longlora.py @@ -1,7 +1,9 @@ -# Copyright 2024 EleutherAI, HuggingFace Inc., and the LlamaFactory team. +# Copyright 2024 EleutherAI, HuggingFace Inc., Yukang Chen, and the LlamaFactory team. # -# This code is based on the EleutherAI's GPT-NeoX and HuggingFace's Transformers libraries. +# This code is based on the EleutherAI's GPT-NeoX and the HuggingFace's Transformers libraries. # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py +# This code is also inspired by the original LongLoRA implementation. +# https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index 9e6b9da4..0a0fca34 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's Optimum library. +# This code is inspired by the HuggingFace's Optimum library. # https://github.com/huggingface/optimum/blob/v1.20.0/optimum/gptq/data.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index 37237485..700bf470 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's Transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/modeling_llava.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 8c3c2eb1..431b5285 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/dpo.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 6e96fc0c..91d68975 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. 
# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/kto_trainer.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -114,8 +114,8 @@ class CustomKTOTrainer(KTOTrainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) def forward( diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py index 8a7af6d4..8182a184 100644 --- a/src/llamafactory/train/kto/workflow.py +++ b/src/llamafactory/train/kto/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/kto.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 61420f3b..df4a37be 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/ppo_trainer.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 891d539a..4f4d2820 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/ppo.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py index f1df314e..b84a0e7d 100644 --- a/src/llamafactory/train/pt/workflow.py +++ b/src/llamafactory/train/pt/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 14695d7d..7f91e5f5 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 the LlamaFactory team. # -# This code is inspired by CarperAI's trlx library. +# This code is inspired by the CarperAI's trlx library. 
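The `_save` tweaks in the KTO and RM trainers above hoist the `output_dir` fallback out of the processor guard, so the resolved path is available to any later save logic even when no processor is attached. A minimal sketch of the resulting hook, assuming a hypothetical `Trainer` subclass with an optional `processor` attribute:

```python
from typing import Dict, Optional

import torch
from transformers import Trainer


class ProcessorAwareTrainer(Trainer):  # hypothetical subclass for illustration
    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
        super()._save(output_dir, state_dict)
        # resolve the target directory first, independent of the processor check
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        if getattr(self, "processor", None) is not None:
            getattr(self.processor, "image_processor").save_pretrained(output_dir)
```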
# https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/reward_model.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -89,8 +89,8 @@ class PairwiseTrainer(Trainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) def compute_loss( diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 75c0a2bf..6f24e964 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 the LlamaFactory team. # -# This code is inspired by CarperAI's trlx library. +# This code is inspired by the CarperAI's trlx library. # https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index d2147c22..95bfcb69 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc., THUDM, and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library and THUDM's ChatGLM implementation. +# This code is inspired by the HuggingFace's transformers library and the THUDM's ChatGLM implementation. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py # https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py # diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index dfc71cfb..885bc7ac 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. 
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/model/test_base.py b/tests/model/test_base.py index 2deedde2..954492ef 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -41,7 +41,7 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): state_dict_b = model_b.state_dict() assert set(state_dict_a.keys()) == set(state_dict_b.keys()) for name in state_dict_a.keys(): - assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True + assert torch.allclose(state_dict_a[name], state_dict_b[name]) @pytest.fixture From 0b571f84b457bc6dd35fffa55e5f4658a210f743 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:08:12 +0800 Subject: [PATCH 048/162] support pissa Former-commit-id: ef8e45f2eaf466c54e9a671512a2974575677b08 --- README.md | 6 +- README_zh.md | 6 +- examples/README.md | 6 ++ examples/README_zh.md | 6 ++ examples/extras/pissa/llama3_lora_sft.yaml | 42 ++++++++++ scripts/loftq_init.py | 72 +++++++---------- scripts/pissa_init.py | 79 ++++++++++++++++++ src/llamafactory/hparams/finetuning_args.py | 20 ++++- src/llamafactory/hparams/model_args.py | 8 +- src/llamafactory/hparams/parser.py | 5 +- src/llamafactory/model/adapter.py | 25 ++++-- src/llamafactory/train/dpo/trainer.py | 13 ++- src/llamafactory/train/pt/trainer.py | 12 ++- src/llamafactory/train/sft/trainer.py | 13 ++- src/llamafactory/train/trainer_utils.py | 54 ++++++++++++- src/llamafactory/webui/components/train.py | 9 ++- src/llamafactory/webui/locales.py | 14 ++++ src/llamafactory/webui/runner.py | 2 + tests/model/test_pissa.py | 90 +++++++++++++++++++++ 19 files changed, 406 insertions(+), 76 deletions(-) create mode 100644 examples/extras/pissa/llama3_lora_sft.yaml create mode 100644 scripts/pissa_init.py create mode 100644 tests/model/test_pissa.py diff --git a/README.md b/README.md index cae79694..cb9a7222 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Choose your path: - **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc. - **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, etc. - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8. -- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and Agent tuning. +- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ, PiSSA and Agent tuning. - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA. - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc. - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker. @@ -71,9 +71,9 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Changelog -[24/06/07] We supported fine-tuning the **[Qwen-2](https://qwenlm.github.io/blog/qwen2/)** series models. +[24/06/16] We support **[PiSSA](https://arxiv.org/abs/2404.02948)** algorithm. See [examples](examples/README.md) for usage. -[24/06/05] We supported fine-tuning the **[GLM-4-9B/GLM-4-9B-Chat](https://github.com/THUDM/GLM-4)** models. 
+[24/06/07] We supported fine-tuning the **[Qwen2](https://qwenlm.github.io/blog/qwen2/)** and **[GLM-4](https://github.com/THUDM/GLM-4)** models. [24/05/26] We supported **[SimPO](https://arxiv.org/abs/2405.14734)** algorithm for preference learning. See [examples](examples/README.md) for usage. diff --git a/README_zh.md b/README_zh.md index af3ff8f0..5c005f30 100644 --- a/README_zh.md +++ b/README_zh.md @@ -49,7 +49,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - **多种模型**:LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。 - **集成方法**:(增量)预训练、(多模态)指令监督微调、奖励模型训练、PPO 训练、DPO 训练、KTO 训练、ORPO 训练等等。 - **多种精度**:32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。 -- **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ 和 Agent 微调。 +- **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ、PiSSA 和 Agent 微调。 - **实用技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。 - **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。 - **极速推理**:基于 vLLM 的 OpenAI 风格 API、浏览器界面和命令行接口。 @@ -71,9 +71,9 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 更新日志 -[24/06/07] 我们支持了 **[Qwen-2](https://qwenlm.github.io/blog/qwen2/)** 系列模型的微调。 +[24/06/16] 我们支持了 **[PiSSA](https://arxiv.org/abs/2404.02948)** 算法。详细用法请参照 [examples](examples/README_zh.md)。 -[24/06/05] 我们支持了 **[GLM-4-9B/GLM-4-9B-Chat](https://github.com/THUDM/GLM-4)** 模型的微调。 +[24/06/07] 我们支持了 **[Qwen2](https://qwenlm.github.io/blog/qwen2/)** 和 **[GLM-4](https://github.com/THUDM/GLM-4)** 模型的微调。 [24/05/26] 我们支持了 **[SimPO](https://arxiv.org/abs/2405.14734)** 偏好对齐算法。详细用法请参照 [examples](examples/README_zh.md)。 diff --git a/examples/README.md b/examples/README.md index a6d78936..902d26b1 100644 --- a/examples/README.md +++ b/examples/README.md @@ -213,3 +213,9 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml ```bash bash examples/extras/fsdp_qlora/single_node.sh ``` + +#### PiSSA Fine-Tuning + +```bash +llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +``` diff --git a/examples/README_zh.md b/examples/README_zh.md index b6168a95..586e498c 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -213,3 +213,9 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml ```bash bash examples/extras/fsdp_qlora/single_node.sh ``` + +#### PiSSA 微调 + +```bash +llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +``` diff --git a/examples/extras/pissa/llama3_lora_sft.yaml b/examples/extras/pissa/llama3_lora_sft.yaml new file mode 100644 index 00000000..fd4b9f1d --- /dev/null +++ b/examples/extras/pissa/llama3_lora_sft.yaml @@ -0,0 +1,42 @@ +### model +model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all +pissa_init: true +pissa_iter: 4 +pissa_convert: true + +### dataset +dataset: identity,alpaca_en_demo +template: llama3 +cutoff_len: 1024 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: saves/llama3-8b/lora/sft +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +fp16: true +ddp_timeout: 180000000 + +### eval +val_size: 0.1 +per_device_eval_batch_size: 1 +eval_strategy: steps +eval_steps: 500 diff --git 
a/scripts/loftq_init.py b/scripts/loftq_init.py index 159dea06..556f342c 100644 --- a/scripts/loftq_init.py +++ b/scripts/loftq_init.py @@ -1,7 +1,7 @@ # coding=utf-8 # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's PEFT library. +# This code is based on the HuggingFace's PEFT library. # https://github.com/huggingface/peft/blob/v0.10.0/examples/loftq_finetuning/quantize_save_load.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,11 +17,9 @@ # limitations under the License. import os -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING import fire -import torch -import torch.nn as nn from peft import LoftQConfig, LoraConfig, TaskType, get_peft_model from transformers import AutoModelForCausalLM, AutoTokenizer @@ -30,41 +28,20 @@ if TYPE_CHECKING: from transformers import PreTrainedModel -class Shell(nn.Module): - def __init__(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): - super().__init__() - self.weight = nn.Parameter(weight, requires_grad=False) - if bias is not None: - self.bias = nn.Parameter(bias, requires_grad=False) - - -def unwrap_model(model: nn.Module, pattern=".base_layer") -> None: - for name in {k.split(pattern)[0] for k, _ in model.named_modules() if pattern in k}: - parent_name = ".".join(name.split(".")[:-1]) - child_name = name.split(".")[-1] - parent_module = model.get_submodule(parent_name) - child_module = getattr(parent_module, child_name) - base_layer = getattr(child_module, "base_layer") - weight = getattr(base_layer, "weight", None) - bias = getattr(base_layer, "bias", None) - setattr(parent_module, child_name, Shell(weight, bias)) - - print("Model unwrapped.") - - def quantize_loftq( model_name_or_path: str, - save_dir: str, - loftq_bits: Optional[int] = 4, - loftq_iter: Optional[int] = 1, - lora_alpha: Optional[int] = None, - lora_rank: Optional[int] = 16, - lora_target: Optional[str] = "q_proj,v_proj", - save_safetensors: Optional[bool] = False, + output_dir: str, + loftq_bits: int = 4, + loftq_iter: int = 4, + lora_alpha: int = None, + lora_rank: int = 16, + lora_dropout: float = 0, + lora_target: str = "q_proj,v_proj", + save_safetensors: bool = True, ): r""" Initializes LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ) Usage: python loftq_init.py --model_name_or_path path_to_model --output_dir output_dir """ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") @@ -74,25 +51,34 @@ def quantize_loftq( inference_mode=True, r=lora_rank, lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, - lora_dropout=0.1, + lora_dropout=lora_dropout, target_modules=[name.strip() for name in lora_target.split(",")], init_lora_weights="loftq", loftq_config=loftq_config, ) # Init LoftQ model - lora_model = get_peft_model(model, lora_config) - base_model: "PreTrainedModel" = lora_model.get_base_model() + print("Initializing LoftQ weights, it may take several minutes, wait patiently.") + peft_model = get_peft_model(model, lora_config) + loftq_dir = os.path.join(output_dir, "loftq_init") # Save LoftQ model - setattr(lora_model.base_model.peft_config["default"], "base_model_name_or_path", save_dir) - setattr(lora_model.base_model.peft_config["default"], "init_lora_weights", True) -
lora_model.save_pretrained(os.path.join(save_dir, "adapters"), safe_serialization=save_safetensors) + setattr(peft_model.peft_config["default"], "base_model_name_or_path", output_dir) + setattr(peft_model.peft_config["default"], "init_lora_weights", True) # don't apply loftq again + peft_model.save_pretrained(loftq_dir, safe_serialization=save_safetensors) + print("Adapter weights saved in {}".format(loftq_dir)) # Save base model - unwrap_model(base_model) - base_model.save_pretrained(save_dir, safe_serialization=save_safetensors) - tokenizer.save_pretrained(save_dir) + base_model: "PreTrainedModel" = peft_model.unload() + base_model.save_pretrained(output_dir, safe_serialization=save_safetensors) + tokenizer.save_pretrained(output_dir) + print("Model weights saved in {}".format(output_dir)) + + print("Fine-tune this model with:") + print("model_name_or_path: {}".format(output_dir)) + print("adapter_name_or_path: {}".format(loftq_dir)) + print("finetuning_type: lora") + print("quantization_bit: {}".format(loftq_bits)) if __name__ == "__main__": diff --git a/scripts/pissa_init.py b/scripts/pissa_init.py new file mode 100644 index 00000000..1b673c45 --- /dev/null +++ b/scripts/pissa_init.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is based on the HuggingFace's PEFT library. +# https://github.com/huggingface/peft/blob/v0.11.0/examples/pissa_finetuning/preprocess.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
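The rewritten `loftq_init.py` above writes two artifacts: the base weights to `output_dir` and the LoftQ-initialized adapter to its `loftq_init` subfolder. A rough sketch of reloading them for a sanity check, assuming PEFT's `subfolder` argument and placeholder paths:

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

# "path/to/output_dir" is a placeholder for the --output_dir passed to the script
base_model = AutoModelForCausalLM.from_pretrained("path/to/output_dir", torch_dtype="auto")
peft_model = PeftModel.from_pretrained(base_model, "path/to/output_dir", subfolder="loftq_init", is_trainable=True)
peft_model.print_trainable_parameters()  # only the LoRA matrices should be trainable
```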
+ +import os +from typing import TYPE_CHECKING + +import fire +from peft import LoraConfig, TaskType, get_peft_model +from transformers import AutoModelForCausalLM, AutoTokenizer + + +if TYPE_CHECKING: + from transformers import PreTrainedModel + + +def quantize_pissa( + model_name_or_path: str, + output_dir: str, + pissa_iter: int = 4, + lora_alpha: int = None, + lora_rank: int = 16, + lora_dropout: float = 0, + lora_target: str = "q_proj,v_proj", + save_safetensors: bool = True, +): + r""" + Initializes LoRA weights with Principal Singular values and Singular vectors Adaptation (PiSSA) + Usage: python pissa_init.py --model_name_or_path path_to_model --output_dir output_dir + """ + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") + lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + r=lora_rank, + lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, + lora_dropout=lora_dropout, + target_modules=[name.strip() for name in lora_target.split(",")], + init_lora_weights="pissa" if pissa_iter == -1 else "pissa_niter_{}".format(pissa_iter) + ) + + # Init PiSSA model + peft_model = get_peft_model(model, lora_config) + pissa_dir = os.path.join(output_dir, "pissa_init") + + # Save PiSSA model + setattr(peft_model.peft_config["default"], "init_lora_weights", True) # don't apply pissa again + peft_model.save_pretrained(pissa_dir, safe_serialization=save_safetensors) + print("Adapter weights saved in {}".format(pissa_dir)) + + # Save base model + base_model: "PreTrainedModel" = peft_model.unload() + base_model.save_pretrained(output_dir, safe_serialization=save_safetensors) + tokenizer.save_pretrained(output_dir) + print("Model weights saved in {}".format(output_dir)) + + print("Fine-tune this model with:") + print("model_name_or_path: {}".format(output_dir)) + print("adapter_name_or_path: {}".format(pissa_dir)) + print("finetuning_type: lora") + print("pissa_convert: true") + + +if __name__ == "__main__": + fire.Fire(quantize_pissa) diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index 52dc299e..1ef46eca 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -108,6 +108,18 @@ class LoraArguments: default=False, metadata={"help": "Whether or not to use the weight-decomposed lora method (DoRA)."}, ) + pissa_init: bool = field( + default=False, + metadata={"help": "Whether or not to initialize a PiSSA adapter."}, + ) + pissa_iter: int = field( + default=4, + metadata={"help": "The number of iteration steps performed by FSVD in PiSSA. 
Use -1 to disable it."}, + ) + pissa_convert: bool = field( + default=False, + metadata={"help": "Whether or not to convert the PiSSA adapter to a normal LoRA adapter."}, + ) create_new_adapter: bool = field( default=False, metadata={"help": "Whether or not to create a new adapter with randomly initialized weight."}, @@ -340,7 +352,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA self.additional_target: Optional[List[str]] = split_arg(self.additional_target) self.galore_target: List[str] = split_arg(self.galore_target) self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only - self.use_ref_model = self.pref_loss not in ["orpo", "simpo"] + self.use_ref_model = (self.stage == "dpo" and self.pref_loss not in ["orpo", "simpo"]) assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." @@ -367,5 +379,11 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora": raise ValueError("`loraplus_lr_ratio` is only valid for LoRA training.") + if self.pissa_convert and self.finetuning_type != "lora": + raise ValueError("`pissa_convert` is only valid for LoRA training.") + + if self.pissa_convert and (self.stage in ["rm", "ppo", "kto"] or self.use_ref_model): + raise ValueError("Cannot use PiSSA for current training stage.") + if self.train_mm_proj_only and self.finetuning_type != "full": raise ValueError("`train_mm_proj_only` is only valid for full training.") diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 53bdbdf2..996e9130 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -45,6 +45,10 @@ class ModelArguments: ) }, ) + adapter_folder: Optional[str] = field( + default=None, + metadata={"help": "The folder containing the adapter weights to load."}, + ) cache_dir: Optional[str] = field( default=None, metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."}, @@ -150,7 +154,7 @@ class ModelArguments: metadata={"help": "Whether or not to disable CUDA graph in the vLLM engine."}, ) vllm_max_lora_rank: int = field( - default=8, + default=32, metadata={"help": "Maximum rank of all LoRAs in the vLLM engine."}, ) offload_folder: str = field( diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 1c57567c..31a805f6 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. 
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -90,6 +90,9 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin if finetuning_args.finetuning_type != "lora": raise ValueError("Quantization is only compatible with the LoRA method.") + if finetuning_args.use_pissa: + raise ValueError("Please use scripts/pissa_init.py for quantized PiSSA.") + if model_args.resize_vocab: raise ValueError("Cannot resize embedding layers of a quantized model.") diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index dfa71525..a8f3a256 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -179,8 +179,16 @@ def _setup_lora_tuning( else: adapter_to_merge = model_args.adapter_name_or_path + init_kwargs = { + "subfolder": model_args.adapter_folder, + "offload_folder": model_args.offload_folder, + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "token": model_args.hf_hub_token, + } + for adapter in adapter_to_merge: - model: "LoraModel" = PeftModel.from_pretrained(model, adapter, offload_folder=model_args.offload_folder) + model: "LoraModel" = PeftModel.from_pretrained(model, adapter, **init_kwargs) model = model.merge_and_unload() if len(adapter_to_merge) > 0: @@ -190,12 +198,7 @@ def _setup_lora_tuning( if model_args.use_unsloth: model = load_unsloth_peft_model(config, model_args, is_trainable=is_trainable) else: - model = PeftModel.from_pretrained( - model, - adapter_to_resume, - is_trainable=is_trainable, - offload_folder=model_args.offload_folder, - ) + model = PeftModel.from_pretrained(model, adapter_to_resume, is_trainable=is_trainable, **init_kwargs) logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) @@ -242,6 +245,14 @@ def _setup_lora_tuning( if model_args.use_unsloth: model = get_unsloth_peft_model(model, model_args, peft_kwargs) else: + if finetuning_args.pissa_init: + if finetuning_args.pissa_iter == -1: + logger.info("Using PiSSA initialization.") + peft_kwargs["init_lora_weights"] = "pissa" + else: + logger.info("Using PiSSA initialization with FSVD steps {}.".format(finetuning_args.pissa_iter)) + peft_kwargs["init_lora_weights"] = "pissa_niter_{}".format(finetuning_args.pissa_iter) + lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=False, diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index 475d08c3..9928d0bc 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
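The adapter-setup branch above maps `pissa_init` and `pissa_iter` onto PEFT's string-valued `init_lora_weights` field. A minimal standalone sketch of the same mapping, assuming peft >= 0.11 (the rank and target modules here are illustrative, not taken from the patch):

```python
from peft import LoraConfig, TaskType

pissa_iter = 4  # mirrors finetuning_args.pissa_iter; -1 selects the exact (slower) SVD
init_lora_weights = "pissa" if pissa_iter == -1 else "pissa_niter_{}".format(pissa_iter)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    init_lora_weights=init_lora_weights,
)
```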
+import os import warnings from collections import defaultdict from contextlib import nullcontext @@ -28,7 +29,7 @@ from trl import DPOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps +from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler, get_batch_logps if TYPE_CHECKING: @@ -91,6 +92,9 @@ class CustomDPOTrainer(DPOTrainer): self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) self.ref_model.eval() + if finetuning_args.pissa_convert: + self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor @@ -109,8 +113,11 @@ class CustomDPOTrainer(DPOTrainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir + if self.finetuning_args.pissa_convert: + convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) + if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor": diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index 09729f2e..f9e04cb5 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from types import MethodType from typing import TYPE_CHECKING, Dict, Optional from transformers import Trainer from ...extras.logging import get_logger -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler +from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: @@ -42,6 +43,10 @@ class CustomTrainer(Trainer): super().__init__(**kwargs) self.finetuning_args = finetuning_args self.processor = processor + + if finetuning_args.pissa_convert: + self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor @@ -60,6 +65,9 @@ class CustomTrainer(Trainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir + if self.finetuning_args.pissa_convert: + convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) + if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 6ab6914e..921e49ab 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. 
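Across the DPO, PT, and SFT trainers above, `pissa_convert` follows the same two-step contract: snapshot the freshly initialized adapter into a `pissa_init` subfolder before training, then convert the trained adapter back into a plain LoRA adapter at save time via the `convert_pissa_adapter` helper defined just below. A compressed sketch of the conversion call it relies on, assuming peft 0.11's `convert_pissa_to_lora` keyword and placeholder paths:

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("path/to/base_model")  # placeholder path
peft_model = PeftModel.from_pretrained(base, "output_dir/pissa_init", is_trainable=True)

# ... training happens here ...

# save the trained adapter as a regular LoRA adapter by diffing against the
# initial PiSSA snapshot, so it composes with the original base weights
peft_model.save_pretrained("output_dir", convert_pissa_to_lora="output_dir/pissa_init")
```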
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer_seq2seq.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,7 +26,7 @@ from transformers import Seq2SeqTrainer from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler +from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: @@ -51,6 +51,10 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): super().__init__(**kwargs) self.finetuning_args = finetuning_args self.processor = processor + + if finetuning_args.pissa_convert: + self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor @@ -69,8 +73,11 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir + if self.finetuning_args.pissa_convert: + convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) + if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) def prediction_step( diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 5621d5df..2d6bab24 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -1,9 +1,9 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by the GaLore's implementation: https://github.com/jiaweizzhao/GaLore -# and the LoRA+'s implementation: https://github.com/nikhil-ghosh-berkeley/loraplus -# and the BAdam's implementation: https://github.com/Ledzy/BAdam -# and the TRL's implementation: https://github.com/huggingface/trl +# This code is inspired by the original GaLore's implementation: https://github.com/jiaweizzhao/GaLore +# and the original LoRA+'s implementation: https://github.com/nikhil-ghosh-berkeley/loraplus +# and the original BAdam's implementation: https://github.com/Ledzy/BAdam +# and the HuggingFace's TRL library: https://github.com/huggingface/trl # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,9 +17,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch +from peft import PeftModel from transformers import Trainer from transformers.optimization import get_scheduler from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS @@ -37,6 +39,7 @@ if is_galore_available(): if TYPE_CHECKING: + from accelerate import Accelerator from transformers import PreTrainedModel, Seq2SeqTrainingArguments from trl import AutoModelForCausalLMWithValueHead @@ -171,6 +174,49 @@ def create_reward_model( return reward_model +def convert_pissa_adapter( + output_dir: str, + state_dict: Dict[str, "torch.Tensor"], + accelerator: "Accelerator", + model: "PreTrainedModel", + training_args: "Seq2SeqTrainingArguments", +) -> None: + r""" + Converts the PiSSA adapter to a LoRA adapter. 
+ """ + pissa_init_dir = os.path.join(training_args.output_dir, "pissa_init") + pissa_backup_dir = os.path.join(output_dir, "pissa_backup") + if output_dir == pissa_init_dir: + logger.info("Initial PiSSA adatper will be saved at: {}.".format(pissa_init_dir)) + unwrapped_model = accelerator.unwrap_model(model) + if isinstance(unwrapped_model, PeftModel): + init_lora_weights = getattr(unwrapped_model.peft_config["default"], "init_lora_weights") + setattr(unwrapped_model.peft_config["default"], "init_lora_weights", True) + unwrapped_model.save_pretrained( + output_dir, + state_dict=state_dict, + safe_serialization=training_args.save_safetensors, + ) + setattr(unwrapped_model.peft_config["default"], "init_lora_weights", init_lora_weights) + elif output_dir == training_args.output_dir: # at the end of training + logger.info("Converted PiSSA adapter will be saved at: {}.".format(output_dir)) + unwrapped_model = accelerator.unwrap_model(model) + if isinstance(unwrapped_model, PeftModel): # backup the pissa adapter for further use + unwrapped_model.save_pretrained( + pissa_backup_dir, + state_dict=state_dict, + safe_serialization=training_args.save_safetensors, + ) + unwrapped_model.save_pretrained( + output_dir, + state_dict=state_dict, + safe_serialization=training_args.save_safetensors, + convert_pissa_to_lora=pissa_init_dir, + ) + unwrapped_model.load_adapter(pissa_backup_dir, "default", is_trainable=True) + unwrapped_model.set_adapter("default") + + def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]: r""" Returns a list of names of parameters with weight decay. (weights in non-layernorm layers) diff --git a/src/llamafactory/webui/components/train.py b/src/llamafactory/webui/components/train.py index 673f6bf4..874f3c5e 100644 --- a/src/llamafactory/webui/components/train.py +++ b/src/llamafactory/webui/components/train.py @@ -163,10 +163,9 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: create_new_adapter = gr.Checkbox() with gr.Row(): - with gr.Column(scale=1): - use_rslora = gr.Checkbox() - use_dora = gr.Checkbox() - + use_rslora = gr.Checkbox() + use_dora = gr.Checkbox() + use_pissa = gr.Checkbox() lora_target = gr.Textbox(scale=2) additional_target = gr.Textbox(scale=2) @@ -179,6 +178,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: create_new_adapter, use_rslora, use_dora, + use_pissa, lora_target, additional_target, } @@ -193,6 +193,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: create_new_adapter=create_new_adapter, use_rslora=use_rslora, use_dora=use_dora, + use_pissa=use_pissa, lora_target=lora_target, additional_target=additional_target, ) diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index 427f01b8..8e8d6fce 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -732,6 +732,20 @@ LOCALES = { "info": "使用权重分解的 LoRA。", }, }, + "use_pissa": { + "en": { + "label": "Use PiSSA", + "info": "Use PiSSA method.", + }, + "ru": { + "label": "используйте PiSSA", + "info": "Используйте метод PiSSA.", + }, + "zh": { + "label": "使用 PiSSA", + "info": "使用 PiSSA 方法。", + }, + }, "lora_target": { "en": { "label": "LoRA modules (optional)", diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 76982934..13dbba03 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -173,6 +173,8 @@ class Runner: args["create_new_adapter"] = get("train.create_new_adapter") args["use_rslora"] = 
get("train.use_rslora") args["use_dora"] = get("train.use_dora") + args["pissa_init"] = get("train.use_pissa") + args["pissa_convert"] = get("train.use_pissa") args["lora_target"] = get("train.lora_target") or "all" args["additional_target"] = get("train.additional_target") or None diff --git a/tests/model/test_pissa.py b/tests/model/test_pissa.py new file mode 100644 index 00000000..70c424fd --- /dev/null +++ b/tests/model/test_pissa.py @@ -0,0 +1,90 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from peft import LoraModel, PeftModel +from transformers import AutoModelForCausalLM + +from llamafactory.extras.misc import get_current_device +from llamafactory.hparams import get_infer_args, get_train_args +from llamafactory.model import load_model, load_tokenizer + + +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") + +TINY_LLAMA_PISSA = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-pissa") + +TRAIN_ARGS = { + "model_name_or_path": TINY_LLAMA, + "stage": "sft", + "do_train": True, + "finetuning_type": "lora", + "pissa_init": True, + "pissa_iter": -1, + "dataset": "llamafactory/tiny-supervised-dataset", + "dataset_dir": "ONLINE", + "template": "llama3", + "cutoff_len": 1024, + "overwrite_cache": True, + "output_dir": "dummy_dir", + "overwrite_output_dir": True, + "fp16": True, +} + +INFER_ARGS = { + "model_name_or_path": TINY_LLAMA_PISSA, + "adapter_name_or_path": TINY_LLAMA_PISSA, + "adapter_folder": "pissa_init", + "finetuning_type": "lora", + "template": "llama3", + "infer_dtype": "float16", +} + + +def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): + state_dict_a = model_a.state_dict() + state_dict_b = model_b.state_dict() + assert set(state_dict_a.keys()) == set(state_dict_b.keys()) + for name in state_dict_a.keys(): + assert torch.allclose(state_dict_a[name], state_dict_b[name]) + + +def test_pissa_init(): + model_args, _, _, finetuning_args, _ = get_train_args(TRAIN_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA_PISSA, torch_dtype=torch.float16, device_map=get_current_device() + ) + ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_PISSA, subfolder="pissa_init", is_trainable=True) + for param in filter(lambda p: p.requires_grad, ref_model.parameters()): + param.data = param.data.to(torch.float32) + + compare_model(model, ref_model) + + +def test_pissa_inference(): + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA_PISSA, torch_dtype=torch.float16, device_map=get_current_device() + ) + ref_model: "LoraModel" = 
PeftModel.from_pretrained(base_model, TINY_LLAMA_PISSA, subfolder="pissa_init") + ref_model = ref_model.merge_and_unload() + compare_model(model, ref_model) From bf46a8ca3e47057cb481077da778ca77f9f94a7f Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:21:06 +0800 Subject: [PATCH 049/162] increase tol Former-commit-id: c29071445e34aed23123fdf883a4d877744a1b0e --- tests/model/test_pissa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model/test_pissa.py b/tests/model/test_pissa.py index 70c424fd..41d02752 100644 --- a/tests/model/test_pissa.py +++ b/tests/model/test_pissa.py @@ -59,7 +59,7 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): state_dict_b = model_b.state_dict() assert set(state_dict_a.keys()) == set(state_dict_b.keys()) for name in state_dict_a.keys(): - assert torch.allclose(state_dict_a[name], state_dict_b[name]) + assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-3, atol=1e-3) def test_pissa_init(): From 1a66bb1438a5dfe86eac73662002a4ea8fee5207 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:22:23 +0800 Subject: [PATCH 050/162] Update tests.yml Former-commit-id: 82e83615a706293abbf266d11c57caedafdd4c5b --- .github/workflows/tests.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 96092662..98bd9455 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,8 +9,6 @@ on: - "requirements.txt" - ".github/workflows/*.yml" pull_request: - types: - - review_requested branches: - main paths: From d5a0cc93a26e45ed81178ad614259de03bdeb662 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:38:44 +0800 Subject: [PATCH 051/162] fix tol Former-commit-id: bdb54bcb477126687db789bd89f2df84e424a2a3 --- src/llamafactory/train/trainer_utils.py | 1 + tests/model/test_base.py | 2 +- tests/model/test_lora.py | 4 ++-- tests/model/test_pissa.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 2d6bab24..9052c96d 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -213,6 +213,7 @@ def convert_pissa_adapter( safe_serialization=training_args.save_safetensors, convert_pissa_to_lora=pissa_init_dir, ) + # TODO: the model is applied pissa again unexpectedly unwrapped_model.load_adapter(pissa_backup_dir, "default", is_trainable=True) unwrapped_model.set_adapter("default") diff --git a/tests/model/test_base.py b/tests/model/test_base.py index 954492ef..e1991b20 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -41,7 +41,7 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): state_dict_b = model_b.state_dict() assert set(state_dict_a.keys()) == set(state_dict_b.keys()) for name in state_dict_a.keys(): - assert torch.allclose(state_dict_a[name], state_dict_b[name]) + assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-4, atol=1e-5) @pytest.fixture diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index fe032332..64566fe8 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -67,9 +67,9 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_k assert set(state_dict_a.keys()) == set(state_dict_b.keys()) for name in state_dict_a.keys(): if any(key in name for key in 
diff_keys):
-            assert torch.allclose(state_dict_a[name], state_dict_b[name]) is False
+            assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-4, atol=1e-5) is False
         else:
-            assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True
+            assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-4, atol=1e-5) is True


 @pytest.fixture

From 18af6f0d3b757c4b15e55cb665baa7c05147d120 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sun, 16 Jun 2024 01:43:43 +0800
Subject: [PATCH 052/162] update pr template

Former-commit-id: 0b7c29674fda10c0ac87e0a0c75990feabb5a3de
---
 .github/PULL_REQUEST_TEMPLATE.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index b31e9d19..d23d6be3 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -5,3 +5,4 @@ Fixes # (issue)
 ## Before submitting

 - [ ] Did you read the [contributor guideline](https://github.com/hiyouga/LLaMA-Factory/blob/main/.github/CONTRIBUTING.md)?
+- [ ] Did you write any new necessary tests?

From 6b729cccb9f231aa60944f734bd100d2a06cb83f Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Sun, 16 Jun 2024 02:57:00 +0800
Subject: [PATCH 053/162] Update parser.py

Former-commit-id: d10c97193d08bd368aca1a72f0d1d8a96c76765d
---
 src/llamafactory/hparams/parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index 31a805f6..467fc43d 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -90,8 +90,8 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin
         if finetuning_args.finetuning_type != "lora":
             raise ValueError("Quantization is only compatible with the LoRA method.")

-        if finetuning_args.use_pissa:
-            raise ValueError("Please use scripts/pissa_init.py for quantized PiSSA.")
+        if finetuning_args.pissa_init:
+            raise ValueError("Please use scripts/pissa_init.py to initialize PiSSA for a quantized model.")

         if model_args.resize_vocab:
             raise ValueError("Cannot resize embedding layers of a quantized model.")

From ef578c39a08cc67b1d9f028ec1ff16dba62d0584 Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Sat, 15 Jun 2024 19:31:56 -0300
Subject: [PATCH 054/162] Add Magpie and WebInstruct dataset samples

Adds two dataset samples with claimed superior performance: Magpie (from Allen AI) and WebInstruct (from TIGER-Lab).
Former-commit-id: 12f4a2bc3172ecd5b6775061d59103f565ac9562 --- data/dataset_info.json | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/data/dataset_info.json b/data/dataset_info.json index 1d226b3a..e321196a 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -524,13 +524,13 @@ "prompt": "text" } }, - "fileweb": { + "fineweb": { "hf_hub_url": "HuggingFaceFW/fineweb", "columns": { "prompt": "text" } }, - "fileweb_edu": { + "fineweb_edu": { "hf_hub_url": "HuggingFaceFW/fineweb-edu", "columns": { "prompt": "text" @@ -550,5 +550,25 @@ "prompt": "content" }, "folder": "python" + }, + "Magpie-Pro-300K-Filtered": { + "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered", + "columns": { + "messages": "conversations" + }, + "tags": { + "role_tag": "from", + "content_tag": "value", + "user_tag": "human", + "assistant_tag": "gpt" + }, + "formatting": "sharegpt" + }, + "WebInstructSub": { + "hf_hub_url": "TIGER-Lab/WebInstructSub", + "columns": { + "prompt": "question", + "response": "answer" + } } -} \ No newline at end of file +} From 70f7e5c4a1352549a7882dd3934b9e14a5d5abec Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sun, 16 Jun 2024 11:19:25 -0300 Subject: [PATCH 055/162] Update README.md Add Magpie and Webinstruct to README Former-commit-id: 2b32b9263f12605e48e11dce9b5fbb746d790745 --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index cb9a7222..30c90e9b 100644 --- a/README.md +++ b/README.md @@ -270,6 +270,8 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) - [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) - [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
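
A note on the "tags" block registered for Magpie-Pro-300K-Filtered above: it tells the dataset loader how to read ShareGPT-style records. The sketch below is illustrative only — to_messages is a hypothetical helper written for clarity, not LLaMA-Factory's actual converter — but it shows what the mapping amounts to:

    # Illustrative sketch only: interpreting one ShareGPT-style record with the
    # "tags" mapping from the dataset_info.json entry above. The to_messages
    # helper is hypothetical, not part of the project.
    record = {
        "conversations": [
            {"from": "human", "value": "Which datasets were added?"},
            {"from": "gpt", "value": "Magpie-Pro-300K-Filtered and WebInstructSub."},
        ]
    }

    tags = {"role_tag": "from", "content_tag": "value", "user_tag": "human", "assistant_tag": "gpt"}

    def to_messages(example, tags):
        # Dataset-specific role names ("human"/"gpt") become internal chat roles.
        role_map = {tags["user_tag"]: "user", tags["assistant_tag"]: "assistant"}
        return [
            {"role": role_map[turn[tags["role_tag"]]], "content": turn[tags["content_tag"]]}
            for turn in example["conversations"]
        ]

    print(to_messages(record, tags))
    # -> [{'role': 'user', 'content': 'Which datasets were added?'},
    #     {'role': 'assistant', 'content': 'Magpie-Pro-300K-Filtered and WebInstructSub.'}]

WebInstructSub needs no such block because its plain question/answer columns map straight onto prompt/response.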
From 1477e4c773ceb1a3baa22efd5c3e37175c79fb93 Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Sun, 16 Jun 2024 11:22:06 -0300
Subject: [PATCH 056/162] Update README_zh.md

Add Magpie and WebInstruct to README

Former-commit-id: 6cf5323959fe9500ba06ab28980fcc8f62e1373f
---
 README_zh.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README_zh.md b/README_zh.md
index 5c005f30..531d9b56 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -270,8 +270,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
 - [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
 - [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
-
-</details>
+- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
 <details><summary>偏好数据集</summary>

From 28c4a06c25aa109473ec8a588f9ad7e14d88b570 Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Sun, 16 Jun 2024 11:34:31 -0300
Subject: [PATCH 057/162] Update README_zh.md

Fix details tag in datasets menus

Former-commit-id: d79c1bd4806e9ea13115fabebf9da2d19b0a52be
---
 README_zh.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README_zh.md b/README_zh.md
index 531d9b56..711596f0 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -273,6 +273,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
 - [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)

+</details>
+
偏好数据集 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k) From 9a515a44750b4c4f8768af4a400f9d044f48ce12 Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sun, 16 Jun 2024 19:16:23 -0300 Subject: [PATCH 058/162] Fix Dockerfile Adds the commands to correctly execute LLama-Factory servers Former-commit-id: 22af40f0895a6f88709a495febeca8507d41d989 --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 3932ff30..9c2f645c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,3 +42,5 @@ EXPOSE 7860 # Expose port 8000 for the API service EXPOSE 8000 + +CMD [ "llamafactory-cli", "webui" ] From 52a2894bdcf4b93b7bfef603920aaa1725e9a60e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=83=A1=E7=BF=80?= <1020327414@qq.com> Date: Mon, 17 Jun 2024 16:45:57 +0800 Subject: [PATCH 059/162] Update requirements.txt add pandas version requirements Former-commit-id: ed1cf559aa2d02588aacf55a17b439473651f626 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 9e00555e..5c7202a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ accelerate>=0.30.1 peft>=0.11.1 trl>=0.8.6 gradio>=4.0.0 +pandas>=2.2.2 scipy einops sentencepiece From aea2e4083fe4117a8b394b8ad649d4838dd31e54 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 17:47:25 +0800 Subject: [PATCH 060/162] tiny fix Former-commit-id: 2289436567a7860d25d9da0afb39e4a3e5e83839 --- examples/README.md | 14 ++++---- examples/README_zh.md | 14 ++++---- .../fsdp_qlora/{single_node.sh => train.sh} | 0 scripts/llama_pro.py | 2 +- scripts/loftq_init.py | 2 +- scripts/pissa_init.py | 5 ++- tests/model/test_lora.py | 34 +++++++------------ 7 files changed, 32 insertions(+), 39 deletions(-) rename examples/extras/fsdp_qlora/{single_node.sh => train.sh} (100%) diff --git a/examples/README.md b/examples/README.md index 902d26b1..007a81ab 100644 --- a/examples/README.md +++ b/examples/README.md @@ -195,6 +195,12 @@ llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml ``` +#### PiSSA Fine-Tuning + +```bash +llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +``` + #### Mixture-of-Depths Fine-Tuning ```bash @@ -211,11 +217,5 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml #### FSDP+QLoRA Fine-Tuning ```bash -bash examples/extras/fsdp_qlora/single_node.sh -``` - -#### PiSSA Fine-Tuning - -```bash -llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +bash examples/extras/fsdp_qlora/train.sh ``` diff --git a/examples/README_zh.md b/examples/README_zh.md index 586e498c..b9d90f25 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -195,6 +195,12 @@ llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml ``` +#### PiSSA 微调 + +```bash +llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +``` + #### 深度混合微调 ```bash @@ -211,11 +217,5 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml #### FSDP+QLoRA 微调 ```bash -bash examples/extras/fsdp_qlora/single_node.sh -``` - -#### PiSSA 微调 - -```bash -llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +bash examples/extras/fsdp_qlora/train.sh ``` diff --git a/examples/extras/fsdp_qlora/single_node.sh b/examples/extras/fsdp_qlora/train.sh similarity index 100% rename 
from examples/extras/fsdp_qlora/single_node.sh rename to examples/extras/fsdp_qlora/train.sh diff --git a/scripts/llama_pro.py b/scripts/llama_pro.py index 395375ef..17bf6fc2 100644 --- a/scripts/llama_pro.py +++ b/scripts/llama_pro.py @@ -120,7 +120,7 @@ def block_expansion( json.dump(index, f, indent=2, sort_keys=True) print("Model weights saved in {}".format(output_dir)) - print("Fine-tune this model with:") + print("- Fine-tune this model with:") print("model_name_or_path: {}".format(output_dir)) print("finetuning_type: freeze") print("freeze_trainable_layers: {}".format(num_expand)) diff --git a/scripts/loftq_init.py b/scripts/loftq_init.py index 556f342c..b9506fa3 100644 --- a/scripts/loftq_init.py +++ b/scripts/loftq_init.py @@ -74,7 +74,7 @@ def quantize_loftq( tokenizer.save_pretrained(output_dir) print("Model weights saved in {}".format(output_dir)) - print("Fine-tune this model with:") + print("- Fine-tune this model with:") print("model_name_or_path: {}".format(output_dir)) print("adapter_name_or_path: {}".format(loftq_dir)) print("finetuning_type: lora") diff --git a/scripts/pissa_init.py b/scripts/pissa_init.py index 1b673c45..10b81efc 100644 --- a/scripts/pissa_init.py +++ b/scripts/pissa_init.py @@ -68,11 +68,14 @@ def quantize_pissa( tokenizer.save_pretrained(output_dir) print("Model weights saved in {}".format(output_dir)) - print("Fine-tune this model with:") + print("- Fine-tune this model with:") print("model_name_or_path: {}".format(output_dir)) print("adapter_name_or_path: {}".format(pissa_dir)) print("finetuning_type: lora") + print("pissa_init: false") print("pissa_convert: true") + print("- and optionally with:") + print("quantization_bit: 4") if __name__ == "__main__": diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index 64566fe8..630e5f75 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -56,9 +56,15 @@ INFER_ARGS = { } -def load_reference_model() -> "torch.nn.Module": - model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA) - return PeftModel.from_pretrained(model, TINY_LLAMA_ADAPTER) +def load_reference_model(is_trainable: bool = False) -> "LoraModel": + model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) + lora_model = PeftModel.from_pretrained(model, TINY_LLAMA_ADAPTER, is_trainable=is_trainable) + for param in filter(lambda p: p.requires_grad, lora_model.parameters()): + param.data = param.data.to(torch.float32) + + return lora_model def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_keys: Sequence[str] = []): @@ -148,13 +154,7 @@ def test_lora_train_old_adapters(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) - base_model = AutoModelForCausalLM.from_pretrained( - TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() - ) - ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) - for param in filter(lambda p: p.requires_grad, ref_model.parameters()): - param.data = param.data.to(torch.float32) - + ref_model = load_reference_model(is_trainable=True) compare_model(model, ref_model) @@ -165,13 +165,7 @@ def test_lora_train_new_adapters(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) - base_model = AutoModelForCausalLM.from_pretrained( - TINY_LLAMA, torch_dtype=torch.float16, 
device_map=get_current_device() - ) - ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) - for param in filter(lambda p: p.requires_grad, ref_model.parameters()): - param.data = param.data.to(torch.float32) - + ref_model = load_reference_model(is_trainable=True) compare_model( model, ref_model, diff_keys=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"] ) @@ -200,9 +194,5 @@ def test_lora_inference(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) - base_model = AutoModelForCausalLM.from_pretrained( - TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() - ) - ref_model: "LoraModel" = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER) - ref_model = ref_model.merge_and_unload() + ref_model = load_reference_model().merge_and_unload() compare_model(model, ref_model) From bb8c7e70482fcef6bf8961293b52bc40ac8d42e7 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 18:17:48 +0800 Subject: [PATCH 061/162] fix #4326 Former-commit-id: 3c2c45812a720d92f7f5b15b9f03370fe6bf069e --- src/llamafactory/model/adapter.py | 16 +++++++++++++--- .../model/model_utils/quantization.py | 18 ++++++++---------- src/llamafactory/model/patcher.py | 5 ++++- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index a8f3a256..34518878 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -281,12 +281,22 @@ def init_adapter( Note that the trainable parameters must be cast to float32. """ - if is_trainable and getattr(model, "quantization_method", None) and finetuning_args.finetuning_type != "lora": - raise ValueError("Quantized models can only be used for the LoRA tuning.") + if is_trainable and getattr(model, "quantization_method", None) is not None: + if finetuning_args.finetuning_type != "lora": + raise ValueError("Quantized models can only be used for the LoRA tuning.") + if finetuning_args.pissa_init: + raise ValueError("Cannot initialize PiSSA adapter on quantized models.") + + # cast trainable parameters to float32 if: + # 1. is_trainable and quantization_bit is not None (qlora) + # 2. is_trainable and not deepspeed zero3 and not fsdp (zero3 or fsdp already in float32) + # 3. is_trainable and not pure_bf16 and not badam if not is_trainable: cast_trainable_params_to_fp32 = False - elif is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam: + elif model_args.quantization_bit is None and ( + is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam + ): logger.info("ZeRO3/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.") cast_trainable_params_to_fp32 = False else: diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index 0a0fca34..5251f84f 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -1,6 +1,7 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by the HuggingFace's Optimum library. +# This code is inspired by the HuggingFace's Transformers and Optimum library. 
+# https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/utils/quantization_config.py # https://github.com/huggingface/optimum/blob/v1.20.0/optimum/gptq/data.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -96,10 +97,7 @@ def configure_quantization( """ if getattr(config, "quantization_config", None): # ptq if is_deepspeed_zero3_enabled(): - raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantized models.") - - if model_args.quantization_device_map != "auto": - init_kwargs["device_map"] = {"": get_current_device()} + raise ValueError("DeepSpeed ZeRO-3 is incompatible with PTQ-quantized models.") quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None) quant_method = quantization_config.get("quant_method", "") @@ -152,15 +150,15 @@ def configure_quantization( bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora ) + # assign device map if: + # 1. not deepspeed zero3 and not fsdp + # 2. not auto quantization device map if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": if model_args.quantization_bit != 4: - raise ValueError("Only 4-bit quantized model can use auto device map.") + raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") - require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0") - require_version("accelerate>=0.28.0", "To fix: pip install accelerate>=0.28.0") require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0") - init_kwargs["torch_dtype"] = model_args.compute_dtype # fsdp+qlora requires same dtype else: - init_kwargs["device_map"] = {"": get_current_device()} + init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit)) diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 053516e4..8fa17d08 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -89,7 +89,10 @@ def patch_config( # deepspeed zero3 is not compatible with low_cpu_mem_usage init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage and (not is_deepspeed_zero3_enabled()) - if not is_deepspeed_zero3_enabled() and not is_fsdp_enabled(): # cast dtype and device if not use zero3 or fsdp + # cast data type of the model if: + # 1. not deepspeed zero3 and not fsdp (keep zero3 or fsdp in float32) + # 2. 
fsdp + qlora + if model_args.quantization_bit is not None or (not is_deepspeed_zero3_enabled() and not is_fsdp_enabled()): init_kwargs["torch_dtype"] = model_args.compute_dtype if init_kwargs["low_cpu_mem_usage"]: # device map requires low_cpu_mem_usage=True From 34e98b7fc699aac47d66fb5aa556787a0b2f7b53 Mon Sep 17 00:00:00 2001 From: Jonery Date: Mon, 17 Jun 2024 18:18:10 +0800 Subject: [PATCH 062/162] adapt for badam with ds zero3 Former-commit-id: fff2a020ec8713022bd8145f4a7168168ea07ca4 --- src/llamafactory/hparams/parser.py | 12 ++++++------ src/llamafactory/train/sft/trainer.py | 15 +++++++++++++++ src/llamafactory/train/utils.py | 7 +++++++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 6311297e..fe108657 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -184,12 +184,12 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): raise ValueError("Distributed training does not support layer-wise GaLore.") - if ( - finetuning_args.use_badam - and finetuning_args.badam_mode == "layer" - and training_args.parallel_mode.value == "distributed" - ): - raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") + # if ( + # finetuning_args.use_badam + # and finetuning_args.badam_mode == "layer" + # and training_args.parallel_mode.value == "distributed" + # ): + # raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") if (finetuning_args.use_galore or finetuning_args.use_badam) and training_args.deepspeed is not None: raise ValueError("GaLore and BAdam are incompatible with DeepSpeed yet.") diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 35671e1b..cd73bf5c 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -55,6 +55,21 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) + def training_step(self, *args, **kwargs): + r""" + Update the reference to deepspeed optimizer + """ + if self.finetuning_args.use_badam and \ + self.args.deepspeed_plugin is not None and \ + self.args.deepspeed_plugin.zero_stage == 3: + + ds_optim = self.optimizer.optimizer + badam_optim = ds_optim.optimizer + badam_optim.ds_optimizer = ds_optim + + return super().training_step(*args, **kwargs) + + def prediction_step( self, model: "torch.nn.Module", diff --git a/src/llamafactory/train/utils.py b/src/llamafactory/train/utils.py index 23834f2d..b189922b 100644 --- a/src/llamafactory/train/utils.py +++ b/src/llamafactory/train/utils.py @@ -309,6 +309,12 @@ def _create_badam_optimizer( dict(params=decay_params, weight_decay=training_args.weight_decay), ] + ds_zero3_enabled = False + if hasattr(training_args, "deepspeed_plugin") and training_args.deepspeed_plugin is not None: + assert training_args.deepspeed_plugin.zero_stage == 3, f"BAdam only supports deepspeed ZeRO-3 stage, got {training_args.deepspeed_plugin.zero_stage}" + assert finetuning_args.badam_mode == "layer", "BAdam only supports layer-wise update in ZeRO-3 stage" + ds_zero3_enabled = True + if finetuning_args.badam_mode == "layer": from badam import BlockOptimizer @@ -321,6 +327,7 @@ def _create_badam_optimizer( start_block=finetuning_args.badam_start_block, 
switch_mode=finetuning_args.badam_switch_mode, verbose=finetuning_args.badam_verbose, + ds_zero3_enabled=ds_zero3_enabled ) logger.info( f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, " From ae17a856b72faa5e7529e02184982e9d63c7d9c3 Mon Sep 17 00:00:00 2001 From: Jonery Date: Mon, 17 Jun 2024 18:29:36 +0800 Subject: [PATCH 063/162] update gitigore Former-commit-id: 0068648aee07840cd2a08071e093436aee3f5cb6 --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 0355c666..2486e728 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,5 @@ cython_debug/ user.config saves/ cache/ +wandb +ds_badam_exp \ No newline at end of file From d03ae32053004ba4b3d5a13f750da22355721d8c Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 18:47:24 +0800 Subject: [PATCH 064/162] update readme Former-commit-id: 07c629f77c3978f339402e578cde1aede3f37699 --- README.md | 2 +- README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cb9a7222..60045118 100644 --- a/README.md +++ b/README.md @@ -481,7 +481,7 @@ Train the model by specifying a model ID of the ModelScope Hub as the `model_nam ### Use W&B Logger -To use [Weights & Biases](https://wandb.ai) for logging experimental results, you need to add the following arguments. +To use [Weights & Biases](https://wandb.ai) for logging experimental results, you need to add the following arguments to yaml files. ```yaml report_to: wandb diff --git a/README_zh.md b/README_zh.md index 5c005f30..fc824561 100644 --- a/README_zh.md +++ b/README_zh.md @@ -481,7 +481,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1` ### 使用 W&B 面板 -若要使用 [Weights & Biases](https://wandb.ai) 记录实验数据,请添加下面的参数。 +若要使用 [Weights & Biases](https://wandb.ai) 记录实验数据,请在 yaml 文件中添加下面的参数。 ```yaml report_to: wandb From 530e3bfc968581d7b8a9b40cb64c8f75fd72e9b0 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 19:07:17 +0800 Subject: [PATCH 065/162] update chat engine #4335 Former-commit-id: b163df7de48777e4319c9ccc736b0acdd5f473ed --- src/llamafactory/chat/chat_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py index fb800106..2a72f422 100644 --- a/src/llamafactory/chat/chat_model.py +++ b/src/llamafactory/chat/chat_model.py @@ -31,7 +31,7 @@ if TYPE_CHECKING: from .base_engine import BaseEngine, Response -def _start_background_loop(loop: asyncio.AbstractEventLoop) -> None: +def _start_background_loop(loop: "asyncio.AbstractEventLoop") -> None: asyncio.set_event_loop(loop) loop.run_forever() @@ -49,7 +49,8 @@ class ChatModel: self._loop = asyncio.new_event_loop() self._thread = Thread(target=_start_background_loop, args=(self._loop,), daemon=True) self._thread.start() - asyncio.run_coroutine_threadsafe(self.engine.start(), self._loop) + task = asyncio.run_coroutine_threadsafe(self.engine.start(), self._loop) + task.result() def chat( self, From 412c1b9522d066d1af8e37a00b1a2351dc5ef63e Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 22:35:56 +0800 Subject: [PATCH 066/162] lint Former-commit-id: a19a7ac99af62b6715c96274f6350b124a784331 --- scripts/pissa_init.py | 2 +- src/llamafactory/hparams/finetuning_args.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pissa_init.py b/scripts/pissa_init.py index 10b81efc..50239727 
100644 --- a/scripts/pissa_init.py +++ b/scripts/pissa_init.py @@ -50,7 +50,7 @@ def quantize_pissa( lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, lora_dropout=lora_dropout, target_modules=[name.strip() for name in lora_target.split(",")], - init_lora_weights="pissa" if pissa_iter == -1 else "pissa_niter_{}".format(pissa_iter) + init_lora_weights="pissa" if pissa_iter == -1 else "pissa_niter_{}".format(pissa_iter), ) # Init PiSSA model diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index 1ef46eca..b676891e 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -352,7 +352,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA self.additional_target: Optional[List[str]] = split_arg(self.additional_target) self.galore_target: List[str] = split_arg(self.galore_target) self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only - self.use_ref_model = (self.stage == "dpo" and self.pref_loss not in ["orpo", "simpo"]) + self.use_ref_model = self.stage == "dpo" and self.pref_loss not in ["orpo", "simpo"] assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." From b567216702c976aae5e524dff4c0b3a4b55b0878 Mon Sep 17 00:00:00 2001 From: Jonery Date: Tue, 18 Jun 2024 12:27:47 +0800 Subject: [PATCH 067/162] Support distributed BAdam. Former-commit-id: bdcb986e37975911c190a74d3e60bb77aa2033bd --- src/llamafactory/hparams/parser.py | 24 ++++++++++-------------- src/llamafactory/train/dpo/trainer.py | 6 ++++++ src/llamafactory/train/kto/trainer.py | 6 ++++++ src/llamafactory/train/ppo/trainer.py | 6 ++++++ src/llamafactory/train/pt/trainer.py | 6 ++++++ src/llamafactory/train/rm/trainer.py | 6 ++++++ src/llamafactory/train/sft/trainer.py | 22 ++++++---------------- 7 files changed, 46 insertions(+), 30 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index f922bbfd..598a34e5 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -209,24 +209,20 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): raise ValueError("Distributed training does not support layer-wise GaLore.") -<<<<<<< HEAD - # if ( - # finetuning_args.use_badam - # and finetuning_args.badam_mode == "layer" - # and training_args.parallel_mode.value == "distributed" - # ): - # raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") -======= if ( finetuning_args.use_badam - and finetuning_args.badam_mode == "layer" - and training_args.parallel_mode == ParallelMode.DISTRIBUTED + and training_args.parallel_mode.value == "distributed" ): - raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") ->>>>>>> upstream/main + if finetuning_args.badam_mode == "ratio": + raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer") + if (finetuning_args.badam_mode == "layer" + and training_args.deepspeed_plugin is not None + and training_args.deepspeed_plugin.zero_stage < 3 + ): + raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage, got stage {self.args.deepspeed_plugin.zero_stage}") - if (finetuning_args.use_galore or finetuning_args.use_badam) and 
training_args.deepspeed is not None: - raise ValueError("GaLore and BAdam are incompatible with DeepSpeed yet.") + if (finetuning_args.use_galore) and training_args.deepspeed is not None: + raise ValueError("GaLore are incompatible with DeepSpeed yet.") if model_args.infer_backend == "vllm": raise ValueError("vLLM backend is only available for API, CLI and Web.") diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index 9928d0bc..284bf41a 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -100,6 +100,12 @@ class CustomDPOTrainer(DPOTrainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 91d68975..d8b609e0 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -95,6 +95,12 @@ class CustomKTOTrainer(KTOTrainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index df4a37be..5a18cbaa 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -170,6 +170,12 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None: r""" Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer. 
diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index f9e04cb5..1e5e9f6a 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -52,6 +52,12 @@ class CustomTrainer(Trainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 7f91e5f5..5d0e6263 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -76,6 +76,12 @@ class PairwiseTrainer(Trainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 0628ea59..9446d245 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -57,9 +57,14 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) @@ -80,21 +85,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): if self.processor is not None: getattr(self.processor, "image_processor").save_pretrained(output_dir) - def training_step(self, *args, **kwargs): - r""" - Update the reference to deepspeed optimizer - """ - if self.finetuning_args.use_badam and \ - self.args.deepspeed_plugin is not None and \ - self.args.deepspeed_plugin.zero_stage == 3: - - ds_optim = self.optimizer.optimizer - badam_optim = ds_optim.optimizer - badam_optim.ds_optimizer = ds_optim - - return super().training_step(*args, **kwargs) - - def prediction_step( self, model: "torch.nn.Module", From fe47ebe572e5d5d41c101bd106cc3f5e7d74ff40 Mon Sep 17 00:00:00 2001 From: Jonery Date: Tue, 18 Jun 2024 12:39:26 +0800 Subject: [PATCH 068/162] fix typo Former-commit-id: d4bee3716dbf8a84564d5bcc2059172604819f3e --- src/llamafactory/hparams/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 598a34e5..680559ac 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -219,7 +219,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: and training_args.deepspeed_plugin is not None and 
training_args.deepspeed_plugin.zero_stage < 3 ): - raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage, got stage {self.args.deepspeed_plugin.zero_stage}") + raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage, got stage {training_args.deepspeed_plugin.zero_stage}") if (finetuning_args.use_galore) and training_args.deepspeed is not None: raise ValueError("GaLore are incompatible with DeepSpeed yet.") From c59c00c06b8cbede4f507746a098895f4c8b8916 Mon Sep 17 00:00:00 2001 From: Jonery Date: Tue, 18 Jun 2024 13:50:26 +0800 Subject: [PATCH 069/162] add example Former-commit-id: 75603db09b085e3f703286b87abe041af020e615 --- examples/extras/badam/llama3_badam_sft.yaml | 40 +++++++++++++++++++++ examples/extras/badam/train_single_gpu.sh | 37 +++++++++++++++++++ examples/extras/badam/train_zero3.sh | 39 ++++++++++++++++++++ 3 files changed, 116 insertions(+) create mode 100644 examples/extras/badam/llama3_badam_sft.yaml create mode 100644 examples/extras/badam/train_single_gpu.sh create mode 100644 examples/extras/badam/train_zero3.sh diff --git a/examples/extras/badam/llama3_badam_sft.yaml b/examples/extras/badam/llama3_badam_sft.yaml new file mode 100644 index 00000000..f5adb220 --- /dev/null +++ b/examples/extras/badam/llama3_badam_sft.yaml @@ -0,0 +1,40 @@ +### model +model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct + +### method +stage: sft +do_train: true +finetuning_type: full +use_badam: true +badam_switch_mode: ascending +badam_switch_interval: 50 +badam_verbose: 2 + +### dataset +dataset: identity,alpaca_en_demo +template: llama3 +cutoff_len: 1024 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: saves/llama3-8b/full/sft +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-6 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 + +### eval +val_size: 0.1 +per_device_eval_batch_size: 1 +eval_strategy: steps +eval_steps: 500 diff --git a/examples/extras/badam/train_single_gpu.sh b/examples/extras/badam/train_single_gpu.sh new file mode 100644 index 00000000..8af79007 --- /dev/null +++ b/examples/extras/badam/train_single_gpu.sh @@ -0,0 +1,37 @@ +#!/bin/bash +export CUDA_VISIBLE_DEVICES=0 + +cd ../../.. + +llamafactory-cli train \ + --stage sft \ + --do_train True \ + --model_name_or_path meta-llama/Llama-2-13b-hf \ + --preprocessing_num_workers 16 \ + --finetuning_type full \ + --template default \ + --flash_attn auto \ + --dataset_dir data \ + --dataset alpaca_en_demo \ + --cutoff_len 1024 \ + --learning_rate 1e-6 \ + --num_train_epochs 3.0 \ + --max_samples 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --max_grad_norm 1.0 \ + --logging_steps 5 \ + --save_steps 100 \ + --warmup_steps 0 \ + --optim adamw_torch \ + --packing False \ + --report_to none \ + --use_badam True \ + --output_dir saves/LLaMA2-13B/full/BAdam \ + --plot_loss True \ + --ddp_timeout 180000000 \ + --include_num_input_tokens_seen True \ + --badam_mode layer \ + --badam_switch_mode ascending \ + --badam_switch_interval 50 \ No newline at end of file diff --git a/examples/extras/badam/train_zero3.sh b/examples/extras/badam/train_zero3.sh new file mode 100644 index 00000000..3b182134 --- /dev/null +++ b/examples/extras/badam/train_zero3.sh @@ -0,0 +1,39 @@ +#!/bin/bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +cd ../../.. 
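+
+# The flags below run layer-wise BAdam (--badam_mode layer) on the four GPUs
+# selected above, sharded with DeepSpeed ZeRO-3 via cache/ds_z3_config.json.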
+ +llamafactory-cli train \ + --stage sft \ + --do_train True \ + --model_name_or_path meta-llama/Llama-2-13b-hf \ + --preprocessing_num_workers 16 \ + --finetuning_type full \ + --template default \ + --flash_attn auto \ + --dataset_dir data \ + --dataset alpaca_en_demo \ + --cutoff_len 1024 \ + --learning_rate 1e-6 \ + --num_train_epochs 3.0 \ + --max_samples 100000 \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 2 \ + --lr_scheduler_type cosine \ + --max_grad_norm 1.0 \ + --logging_steps 5 \ + --save_steps 100 \ + --warmup_steps 0 \ + --optim adamw_torch \ + --packing False \ + --report_to none \ + --use_badam True \ + --output_dir saves/LLaMA2-13B/full/BAdam \ + --fp16 True \ + --plot_loss True \ + --ddp_timeout 180000000 \ + --include_num_input_tokens_seen True \ + --badam_mode layer \ + --badam_switch_mode ascending \ + --badam_switch_interval 50 \ + --deepspeed cache/ds_z3_config.json \ No newline at end of file From 6050202c53c9ff24b926bad1cbc0ec3e388d237d Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 18 Jun 2024 22:08:56 +0800 Subject: [PATCH 070/162] fix #4335 Former-commit-id: 2ab449adbb160f339a0586edeb846fa311ad8382 --- src/llamafactory/chat/base_engine.py | 5 ----- src/llamafactory/chat/chat_model.py | 2 -- src/llamafactory/chat/hf_engine.py | 10 ++++------ src/llamafactory/chat/vllm_engine.py | 3 --- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/llamafactory/chat/base_engine.py b/src/llamafactory/chat/base_engine.py index 92a51ebe..ccdf4c92 100644 --- a/src/llamafactory/chat/base_engine.py +++ b/src/llamafactory/chat/base_engine.py @@ -50,11 +50,6 @@ class BaseEngine(ABC): generating_args: "GeneratingArguments", ) -> None: ... - @abstractmethod - async def start( - self, - ) -> None: ... 
- @abstractmethod async def chat( self, diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py index 2a72f422..5c83fa67 100644 --- a/src/llamafactory/chat/chat_model.py +++ b/src/llamafactory/chat/chat_model.py @@ -49,8 +49,6 @@ class ChatModel: self._loop = asyncio.new_event_loop() self._thread = Thread(target=_start_background_loop, args=(self._loop,), daemon=True) self._thread.start() - task = asyncio.run_coroutine_threadsafe(self.engine.start(), self._loop) - task.result() def chat( self, diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index a7ff7015..30200456 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -59,6 +59,7 @@ class HuggingfaceEngine(BaseEngine): self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) ) # must after fixing tokenizer to resize vocab self.generating_args = generating_args.to_dict() + self.semaphore = asyncio.Semaphore(int(os.environ.get("MAX_CONCURRENT", "1"))) @staticmethod def _process_args( @@ -259,9 +260,6 @@ class HuggingfaceEngine(BaseEngine): return scores - async def start(self) -> None: - self._semaphore = asyncio.Semaphore(int(os.environ.get("MAX_CONCURRENT", 1))) - async def chat( self, messages: Sequence[Dict[str, str]], @@ -286,7 +284,7 @@ class HuggingfaceEngine(BaseEngine): image, input_kwargs, ) - async with self._semaphore: + async with self.semaphore: with concurrent.futures.ThreadPoolExecutor() as pool: return await loop.run_in_executor(pool, self._chat, *input_args) @@ -314,7 +312,7 @@ class HuggingfaceEngine(BaseEngine): image, input_kwargs, ) - async with self._semaphore: + async with self.semaphore: with concurrent.futures.ThreadPoolExecutor() as pool: stream = self._stream_chat(*input_args) while True: @@ -333,6 +331,6 @@ class HuggingfaceEngine(BaseEngine): loop = asyncio.get_running_loop() input_args = (self.model, self.tokenizer, batch_input, input_kwargs) - async with self._semaphore: + async with self.semaphore: with concurrent.futures.ThreadPoolExecutor() as pool: return await loop.run_in_executor(pool, self._get_scores, *input_args) diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index d488a039..2626d612 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -183,9 +183,6 @@ class VllmEngine(BaseEngine): ) return result_generator - async def start(self) -> None: - pass - async def chat( self, messages: Sequence[Dict[str, str]], From 82264d9c9ae2407bbe1d2b7ec749a92bb6e82851 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 18 Jun 2024 22:27:24 +0800 Subject: [PATCH 071/162] Update requirements.txt Former-commit-id: da8684f9f0b0103d4fa81279343a48ecd0fcc0cd --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5c7202a5..f76524d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ accelerate>=0.30.1 peft>=0.11.1 trl>=0.8.6 gradio>=4.0.0 -pandas>=2.2.2 +pandas>=2.0.0 scipy einops sentencepiece From 650bb459542acfe7ca09e27549e4ce31c856f0ea Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 18 Jun 2024 22:42:45 +0800 Subject: [PATCH 072/162] fix #4357 Former-commit-id: a6741bba8cebd16a6a3f97a2dc81057d0e27eb39 --- src/llamafactory/chat/hf_engine.py | 7 +++++++ src/llamafactory/hparams/parser.py | 1 + src/llamafactory/model/model_utils/rope.py | 4 ++-- 3 files changed, 10 insertions(+), 
2 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 30200456..9e60175b 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -59,6 +59,13 @@ class HuggingfaceEngine(BaseEngine): self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) ) # must after fixing tokenizer to resize vocab self.generating_args = generating_args.to_dict() + try: + asyncio.get_event_loop() + except RuntimeError: + logger.warning("There is no current event loop, creating a new one.") + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + self.semaphore = asyncio.Semaphore(int(os.environ.get("MAX_CONCURRENT", "1"))) @staticmethod diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 467fc43d..9b305016 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -353,6 +353,7 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS: if model_args.export_dir is not None and model_args.export_device == "cpu": model_args.device_map = {"": torch.device("cpu")} + model_args.model_max_length = data_args.cutoff_len else: model_args.device_map = "auto" diff --git a/src/llamafactory/model/model_utils/rope.py b/src/llamafactory/model/model_utils/rope.py index 88303c4d..4373ee19 100644 --- a/src/llamafactory/model/model_utils/rope.py +++ b/src/llamafactory/model/model_utils/rope.py @@ -39,8 +39,8 @@ def configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_ logger.warning("Current model does not support RoPE scaling.") return - if is_trainable: - if model_args.rope_scaling == "dynamic": + if model_args.model_max_length is not None: + if is_trainable and model_args.rope_scaling == "dynamic": logger.warning( "Dynamic NTK scaling may not work well with fine-tuning. 
" "See: https://github.com/huggingface/transformers/pull/24653" From f312b7db061667d382ea5389de743c90f8d62063 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 18 Jun 2024 22:53:54 +0800 Subject: [PATCH 073/162] add deepseek coder v2 #4346 Former-commit-id: d83d3846d8e3bf5c40d4b90c24e2c5909ec61864 --- README.md | 58 ++++++++++++++-------------- README_zh.md | 58 ++++++++++++++-------------- src/llamafactory/extras/constants.py | 12 ++++++ 3 files changed, 70 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 60045118..ca9e7e1d 100644 --- a/README.md +++ b/README.md @@ -151,35 +151,35 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models -| Model | Model size | Template | -| -------------------------------------------------------- | -------------------------------- | --------- | -| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | -| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | gemma | -| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | -| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | -| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| Model | Model size | Template | +| --------------------------------------------------------- | -------------------------------- | --------- | +| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | +| 
[GLM4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | gemma | +| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | +| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | +| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | +| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. diff --git a/README_zh.md b/README_zh.md index fc824561..2ede76ba 100644 --- a/README_zh.md +++ b/README_zh.md @@ -151,35 +151,35 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 模型 -| 模型名 | 模型大小 | Template | -| -------------------------------------------------------- | -------------------------------- | --------- | -| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | -| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | gemma | -| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | -| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | -| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 
7B/13B/65B | xverse | -| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| 模型名 | 模型大小 | Template | +| --------------------------------------------------------- | -------------------------------- | --------- | +| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | +| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | gemma | +| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | +| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | +| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | +| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 73a9969d..36265c8e 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -403,6 +403,18 @@ register_model_group( DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat", DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat", }, + "DeepSeek-MoE-Coder-16B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Lite-Base", + }, + "DeepSeek-MoE-Coder-236B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Base", + }, + "DeepSeek-MoE-Coder-16B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", + }, + "DeepSeek-MoE-Coder-236B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Instruct", + }, }, template="deepseek", ) From 33fe2744689084a5a27118c7b248137018a2d2ad Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 18 Jun 2024 23:32:18 +0800 Subject: [PATCH 074/162] tiny fix Former-commit-id: bb750fa3dde03ec024ae75596ecd4b884cb126c6 --- Dockerfile | 1 + README.md | 4 ++-- README_zh.md | 4 ++-- data/dataset_info.json | 33 
++++++++++++--------------------- 4 files changed, 17 insertions(+), 25 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9c2f645c..61d58005 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,4 +43,5 @@ EXPOSE 7860 # Expose port 8000 for the API service EXPOSE 8000 +# Launch LLaMA Board CMD [ "llamafactory-cli", "webui" ] diff --git a/README.md b/README.md index 2d95583b..0e8e55f7 100644 --- a/README.md +++ b/README.md @@ -260,6 +260,8 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) - [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) @@ -270,8 +272,6 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) - [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) - [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) -- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) -- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
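(The README hunk above and its Chinese counterpart below only move the two new English datasets out of the German block and into the language-grouped position used by the rest of the list; the entries themselves are unchanged.)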
diff --git a/README_zh.md b/README_zh.md index ed5475d3..152cd6fa 100644 --- a/README_zh.md +++ b/README_zh.md @@ -260,6 +260,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) - [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) @@ -270,8 +272,6 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) - [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) - [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) -- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) -- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
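The `dataset_info.json` hunk below re-registers the two datasets under lowercase aliases (`web_instruct`, `magpie_pro_300k`) and drops the explicit column/tag mapping from the Magpie entry, presumably relying on the loader's ShareGPT defaults. For readers unfamiliar with this registry, here is a minimal sketch of how such an entry could be consumed; the `load_registered` helper is illustrative only (LLaMA-Factory's real loader lives under `src/llamafactory/data/`), while the two entries are copied from the diff:

```python
from datasets import load_dataset

# Two entries mirrored from data/dataset_info.json below; the registry maps a
# short alias to a Hugging Face repo plus optional column renames.
DATASET_INFO = {
    "web_instruct": {
        "hf_hub_url": "TIGER-Lab/WebInstructSub",
        "columns": {"prompt": "question", "response": "answer"},
    },
    "magpie_pro_300k": {
        "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered",
        "formatting": "sharegpt",
    },
}


def load_registered(name: str):
    """Illustrative helper: fetch a registered dataset and rename its
    columns to the default prompt/response schema."""
    info = DATASET_INFO[name]
    dataset = load_dataset(info["hf_hub_url"], split="train")
    for target, source in info.get("columns", {}).items():
        dataset = dataset.rename_column(source, target)
    return dataset
```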
diff --git a/data/dataset_info.json b/data/dataset_info.json index e321196a..f8ffd407 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -252,6 +252,17 @@ "hf_hub_url": "m-a-p/neo_sft_phase2", "formatting": "sharegpt" }, + "magpie_pro_300k": { + "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered", + "formatting": "sharegpt" + }, + "web_instruct": { + "hf_hub_url": "TIGER-Lab/WebInstructSub", + "columns": { + "prompt": "question", + "response": "answer" + } + }, "llava_1k_en": { "hf_hub_url": "BUAADreamer/llava-en-zh-2k", "subset": "en", @@ -550,25 +561,5 @@ "prompt": "content" }, "folder": "python" - }, - "Magpie-Pro-300K-Filtered": { - "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered", - "columns": { - "messages": "conversations" - }, - "tags": { - "role_tag": "from", - "content_tag": "value", - "user_tag": "human", - "assistant_tag": "gpt" - }, - "formatting": "sharegpt" - }, - "WebInstructSub": { - "hf_hub_url": "TIGER-Lab/WebInstructSub", - "columns": { - "prompt": "question", - "response": "answer" - } } -} +} \ No newline at end of file From 8594d4fd53ed9b301bf0dbc420b5d344b340359a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 02:48:43 +0800 Subject: [PATCH 075/162] update data Former-commit-id: 5f396ea8555a5f0de7b55f5049890f15c25bbe51 --- data/glaive_toolcall_en_demo.json | 74 +++++++++++++++---------------- data/glaive_toolcall_zh_demo.json | 42 +++++++++--------- 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/data/glaive_toolcall_en_demo.json b/data/glaive_toolcall_en_demo.json index c17465a3..4dcd4383 100644 --- a/data/glaive_toolcall_en_demo.json +++ b/data/glaive_toolcall_en_demo.json @@ -1,4 +1,41 @@ [ + { + "conversations": [ + { + "from": "human", + "value": "Hi, I have some ingredients and I want to cook something. Can you help me find a recipe?" + }, + { + "from": "gpt", + "value": "Of course! I can help you with that. Please tell me what ingredients you have." + }, + { + "from": "human", + "value": "I have chicken, bell peppers, and rice." + }, + { + "from": "function_call", + "value": "{\"name\": \"search_recipes\", \"arguments\": {\"ingredients\": [\"chicken\", \"bell peppers\", \"rice\"]}}" + }, + { + "from": "observation", + "value": "{\"recipes\": [{\"name\": \"Chicken and Bell Pepper Stir Fry\", \"ingredients\": [\"chicken\", \"bell peppers\", \"rice\"], \"instructions\": \"Cut the chicken into small pieces. Slice the bell peppers. Cook the rice. Stir fry the chicken and bell peppers. Serve over rice.\"}, {\"name\": \"Chicken and Rice Casserole\", \"ingredients\": [\"chicken\", \"bell peppers\", \"rice\"], \"instructions\": \"Cook the chicken and rice separately. Mix them together with the bell peppers in a casserole dish. Bake until golden brown.\"}]}" + }, + { + "from": "gpt", + "value": "I found two recipes for you. The first one is \"Chicken and Bell Pepper Stir Fry\". The instructions are: Cut the chicken into small pieces. Slice the bell peppers. Cook the rice. Stir fry the chicken and bell peppers. Serve over rice. The second one is \"Chicken and Rice Casserole\". The instructions are: Cook the chicken and rice separately. Mix them together with the bell peppers in a casserole dish. Bake until golden brown. Which one would you like to try?" + }, + { + "from": "human", + "value": "They both sound delicious, but I think I'll try the stir fry. Can you order the ingredients for me?" 
+ }, + { + "from": "gpt", + "value": "I'm sorry, but as an AI, I don't have the capability to perform external tasks such as ordering ingredients. However, I can help you find more recipes or provide cooking tips if you need." + } + ], + "tools": "[{\"name\": \"search_recipes\", \"description\": \"Search for recipes based on ingredients\", \"parameters\": {\"type\": \"object\", \"properties\": {\"ingredients\": {\"type\": \"array\", \"items\": {\"type\": \"string\"}, \"description\": \"The ingredients to search for\"}}, \"required\": [\"ingredients\"]}}]" + }, { "conversations": [ { @@ -155,43 +192,6 @@ ], "tools": "[]" }, - { - "conversations": [ - { - "from": "human", - "value": "Hi, I have some ingredients and I want to cook something. Can you help me find a recipe?" - }, - { - "from": "gpt", - "value": "Of course! I can help you with that. Please tell me what ingredients you have." - }, - { - "from": "human", - "value": "I have chicken, bell peppers, and rice." - }, - { - "from": "function_call", - "value": "{\"name\": \"search_recipes\", \"arguments\": {\"ingredients\": [\"chicken\", \"bell peppers\", \"rice\"]}}" - }, - { - "from": "observation", - "value": "{\"recipes\": [{\"name\": \"Chicken and Bell Pepper Stir Fry\", \"ingredients\": [\"chicken\", \"bell peppers\", \"rice\"], \"instructions\": \"Cut the chicken into small pieces. Slice the bell peppers. Cook the rice. Stir fry the chicken and bell peppers. Serve over rice.\"}, {\"name\": \"Chicken and Rice Casserole\", \"ingredients\": [\"chicken\", \"bell peppers\", \"rice\"], \"instructions\": \"Cook the chicken and rice separately. Mix them together with the bell peppers in a casserole dish. Bake until golden brown.\"}]}" - }, - { - "from": "gpt", - "value": "I found two recipes for you. The first one is \"Chicken and Bell Pepper Stir Fry\". The instructions are: Cut the chicken into small pieces. Slice the bell peppers. Cook the rice. Stir fry the chicken and bell peppers. Serve over rice. The second one is \"Chicken and Rice Casserole\". The instructions are: Cook the chicken and rice separately. Mix them together with the bell peppers in a casserole dish. Bake until golden brown. Which one would you like to try?" - }, - { - "from": "human", - "value": "They both sound delicious, but I think I'll try the stir fry. Can you order the ingredients for me?" - }, - { - "from": "gpt", - "value": "I'm sorry, but as an AI, I don't have the capability to perform external tasks such as ordering ingredients. However, I can help you find more recipes or provide cooking tips if you need." - } - ], - "tools": "[{\"name\": \"search_recipes\", \"description\": \"Search for recipes based on ingredients\", \"parameters\": {\"type\": \"object\", \"properties\": {\"ingredients\": {\"type\": \"array\", \"items\": {\"type\": \"string\"}, \"description\": \"The ingredients to search for\"}}, \"required\": [\"ingredients\"]}}]" - }, { "conversations": [ { diff --git a/data/glaive_toolcall_zh_demo.json b/data/glaive_toolcall_zh_demo.json index 06efbac0..f40bda68 100644 --- a/data/glaive_toolcall_zh_demo.json +++ b/data/glaive_toolcall_zh_demo.json @@ -1,25 +1,4 @@ [ - { - "conversations": [ - { - "from": "human", - "value": "假设你有一个需要随机数的Java程序,范围在0到10之间。你可以使用什么代码片段来生成这样的数字?\n\n下面是一个示例代码片段,它使用Math.random()方法来生成一个在0到1之间的随机数,然后缩放并四舍五入结果,得到一个在0到10之间的随机数:\n\n```java\nint randomNumber = (int) (Math.random() * 11);\n```\n\n在Java中是否有更有效的方法来生成随机数?你会如何修改代码以生成具有不同范围或分布的随机数?" 
- }, - { - "from": "gpt", - "value": "您提供的代码片段是Java中生成随机数的一种常见且高效的方法。然而,如果您需要对随机数生成有更多的控制,您可以使用java.util.Random类,它提供了更多生成随机数的选项。\n\n要生成一个在不同范围内的随机数,您可以修改代码以调整缩放因子。例如,要生成一个在1到10之间的随机数,您可以使用以下代码:\n\nint randomNumber = (int) (Math.random() * 10) + 1;\n\n要生成具有不同分布的随机数,您可以使用各种技术,如正态分布、指数分布或自定义分布。生成正态分布的一种方法是使用Box-Muller变换。这里有一个代码示例,它生成具有正态分布的随机数:\n\nRandom random = new Random();\ndouble u1 = random.nextDouble();\ndouble u2 = random.nextDouble();\ndouble z = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);\ndouble mean = 5;\ndouble stdDev = 2;\nint randomNumber = (int) (z * stdDev + mean);\n\n这段代码生成一个具有均值5和标准差2的正态分布随机数。您可以调整均值和标准差来生成具有不同分布的随机数。" - }, - { - "from": "human", - "value": "你能解释一下算法分析中的大O记法概念吗?" - }, - { - "from": "gpt", - "value": "是的,大O表示法是一种描述算法性能随输入数据大小增加的方法。它用于估计算法在最坏情况下的时间复杂度或空间复杂度。大O中的“O”代表“数量级”,这意味着随着输入大小的增加,算法所需的操作数以一定的速率增长。\n\n例如,如果我们有一个需要n步才能解决大小为n的问题的算法,那么时间复杂度可以表示为O(n),这意味着操作数随着输入大小线性增长。如果我们有一个需要n^2步才能解决大小为n的问题的算法,那么时间复杂度可以表示为O(n^2),这意味着操作数随着输入大小二次方增长。\n\n还有其他几种大O表示法,如O(log n)、O(n log n)、O(2^n)等,它们代表了随着输入大小增加的不同操作数增长速率。通过分析算法的大O表示法,我们可以比较不同算法的性能,并选择给定问题最有效的算法。\n\n您是否有关于大O表示法或算法分析的特定问题?" - } - ], - "tools": "[]" - }, { "conversations": [ { @@ -41,6 +20,27 @@ ], "tools": "[{\"name\": \"generate_invoice\", \"description\": \"生成发票\", \"parameters\": {\"type\": \"object\", \"properties\": {\"customer_name\": {\"type\": \"string\", \"description\": \"客户名称\"}, \"items\": {\"type\": \"array\", \"items\": {\"type\": \"object\", \"properties\": {\"name\": {\"type\": \"string\", \"description\": \"The item name\"}, \"quantity\": {\"type\": \"integer\", \"description\": \"The quantity of the item\"}, \"price\": {\"type\": \"number\", \"description\": \"The price per unit\"}}, \"required\": [\"name\", \"quantity\", \"price\"]}}}, \"required\": [\"customer_name\", \"items\"]}}, {\"name\": \"generate_password\", \"description\": \"生成随机密码\", \"parameters\": {\"type\": \"object\", \"properties\": {\"length\": {\"type\": \"integer\", \"description\": \"密码的长度\"}}, \"required\": [\"length\"]}}]" }, + { + "conversations": [ + { + "from": "human", + "value": "假设你有一个需要随机数的Java程序,范围在0到10之间。你可以使用什么代码片段来生成这样的数字?\n\n下面是一个示例代码片段,它使用Math.random()方法来生成一个在0到1之间的随机数,然后缩放并四舍五入结果,得到一个在0到10之间的随机数:\n\n```java\nint randomNumber = (int) (Math.random() * 11);\n```\n\n在Java中是否有更有效的方法来生成随机数?你会如何修改代码以生成具有不同范围或分布的随机数?" + }, + { + "from": "gpt", + "value": "您提供的代码片段是Java中生成随机数的一种常见且高效的方法。然而,如果您需要对随机数生成有更多的控制,您可以使用java.util.Random类,它提供了更多生成随机数的选项。\n\n要生成一个在不同范围内的随机数,您可以修改代码以调整缩放因子。例如,要生成一个在1到10之间的随机数,您可以使用以下代码:\n\nint randomNumber = (int) (Math.random() * 10) + 1;\n\n要生成具有不同分布的随机数,您可以使用各种技术,如正态分布、指数分布或自定义分布。生成正态分布的一种方法是使用Box-Muller变换。这里有一个代码示例,它生成具有正态分布的随机数:\n\nRandom random = new Random();\ndouble u1 = random.nextDouble();\ndouble u2 = random.nextDouble();\ndouble z = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);\ndouble mean = 5;\ndouble stdDev = 2;\nint randomNumber = (int) (z * stdDev + mean);\n\n这段代码生成一个具有均值5和标准差2的正态分布随机数。您可以调整均值和标准差来生成具有不同分布的随机数。" + }, + { + "from": "human", + "value": "你能解释一下算法分析中的大O记法概念吗?" + }, + { + "from": "gpt", + "value": "是的,大O表示法是一种描述算法性能随输入数据大小增加的方法。它用于估计算法在最坏情况下的时间复杂度或空间复杂度。大O中的“O”代表“数量级”,这意味着随着输入大小的增加,算法所需的操作数以一定的速率增长。\n\n例如,如果我们有一个需要n步才能解决大小为n的问题的算法,那么时间复杂度可以表示为O(n),这意味着操作数随着输入大小线性增长。如果我们有一个需要n^2步才能解决大小为n的问题的算法,那么时间复杂度可以表示为O(n^2),这意味着操作数随着输入大小二次方增长。\n\n还有其他几种大O表示法,如O(log n)、O(n log n)、O(2^n)等,它们代表了随着输入大小增加的不同操作数增长速率。通过分析算法的大O表示法,我们可以比较不同算法的性能,并选择给定问题最有效的算法。\n\n您是否有关于大O表示法或算法分析的特定问题?" 
+ } + ], + "tools": "[]" + }, { "conversations": [ { From b11e57157f38bb289a08931d4ca1d7b9d1659d0c Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 03:23:51 +0800 Subject: [PATCH 076/162] fix tool formatter, allow parallel function #4362 Former-commit-id: b8f16c976db4ecec1cc8558851c8cbfb6a5b7e9c --- src/llamafactory/api/chat.py | 19 ++--- src/llamafactory/data/formatter.py | 120 ++++++++++++--------------- src/llamafactory/data/template.py | 16 +++- src/llamafactory/webui/chatter.py | 13 ++- tests/data/test_formatter.py | 125 +++++++++++++++++++++++++++++ 5 files changed, 207 insertions(+), 86 deletions(-) create mode 100644 tests/data/test_formatter.py diff --git a/src/llamafactory/api/chat.py b/src/llamafactory/api/chat.py index 945856cb..2c7e11e2 100644 --- a/src/llamafactory/api/chat.py +++ b/src/llamafactory/api/chat.py @@ -92,9 +92,11 @@ def _process_request( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls): - name = message.tool_calls[0].function.name - arguments = message.tool_calls[0].function.arguments - content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False) + tool_calls = [ + {"name": tool_call.function.name, "argument": tool_call.function.arguments} + for tool_call in message.tool_calls + ] + content = json.dumps(tool_calls, ensure_ascii=False) input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content}) elif isinstance(message.content, list): for input_item in message.content: @@ -118,7 +120,7 @@ def _process_request( if isinstance(tool_list, list) and len(tool_list): try: tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False) - except Exception: + except json.JSONDecodeError: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools") else: tools = None @@ -160,17 +162,16 @@ async def create_chat_completion_response( choices = [] for i, response in enumerate(responses): if tools: - result = chat_model.engine.template.format_tools.extract(response.response_text) + result = chat_model.engine.template.extract_tool(response.response_text) else: result = response.response_text if isinstance(result, list): tool_calls = [] for tool in result: - name, arguments = tool - function = Function(name=name, arguments=arguments) - tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function) - tool_calls.append(tool_call) + function = Function(name=tool[0], arguments=tool[1]) + tool_calls.append(FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function)) + response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=tool_calls) finish_reason = Finish.TOOL else: diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index fa35df5b..70be6a5a 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -22,29 +22,20 @@ from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Tuple, Uni SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]] -JSON_FORMAT_PROMPT = ( - """, in a JSON format representing the kwargs (e.g. 
```{"input": "hello world", "num_beams": 5}```)""" -) - - -TOOL_SYSTEM_PROMPT = ( +DEFAULT_TOOL_PROMPT = ( "You have access to the following tools:\n{tool_text}" "Use the following format if using a tool:\n" "```\n" "Action: tool name (one of [{tool_names}]).\n" - "Action Input: the input to the tool{format_prompt}.\n" + "Action Input: the input to the tool, in a JSON format representing the kwargs " + """(e.g. ```{{"input": "hello world", "num_beams": 5}}```).\n""" "```\n" ) -GLM4_TOOL_SUFFIX_PROMPT = ( - "在调用上述函数时,请使用 Json 格式表示调用的参数。" -) - GLM4_TOOL_PROMPT = ( - "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持," - "{tool_text}" - + "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," + "你的任务是针对用户的问题和要求提供适当的答复和支持。{tool_text}" ) @@ -73,32 +64,19 @@ def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: ) tool_names.append(tool["name"]) - return TOOL_SYSTEM_PROMPT.format( - tool_text=tool_text, tool_names=", ".join(tool_names), format_prompt=JSON_FORMAT_PROMPT - ) + return DEFAULT_TOOL_PROMPT.format(tool_text=tool_text, tool_names=", ".join(tool_names)) -def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: - tool_text = "" - for tool in tools: - tool_name = tool["name"] - tool_text += f"\n\n## {tool_name}\n\n{json.dumps(tool, ensure_ascii=False, indent=4)}\n{GLM4_TOOL_SUFFIX_PROMPT}" - return GLM4_TOOL_PROMPT.format(tool_text=tool_text) - - def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*({.*?})(?=\nAction:|\Z)", re.DOTALL) - action_match = re.findall(regex, content) + regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|$)", re.DOTALL) + action_match: List[Tuple[str, str]] = re.findall(regex, content) if not action_match: return content results = [] - for match in action_match: - tool_name, tool_input = match - tool_name = tool_name.strip() - tool_input = tool_input.strip().strip('"').strip("```") - + tool_name = match[0].strip() + tool_input = match[1].strip().strip('"').strip("```") try: arguments = json.loads(tool_input) results.append((tool_name, json.dumps(arguments, ensure_ascii=False))) @@ -108,19 +86,28 @@ def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: return results +def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: + tool_text = "" + for tool in tools: + tool_text += "\n\n## {name}\n\n{body}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( + name=tool["name"], body=json.dumps(tool, indent=4, ensure_ascii=False) + ) + + return GLM4_TOOL_PROMPT.format(tool_text=tool_text) + + def glm4_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - lines = content.strip().split("\n") - if len(lines) != 2: + if "\n" not in content: return content - tool_name = lines[0].strip() - tool_input = lines[1].strip() + + tool_name, tool_input = content.split("\n", maxsplit=1) try: arguments = json.loads(tool_input) except json.JSONDecodeError: return content + return [(tool_name, json.dumps(arguments, ensure_ascii=False))] - @dataclass class Formatter(ABC): @@ -193,22 +180,28 @@ class FunctionFormatter(Formatter): def apply(self, **kwargs) -> SLOTS: content = kwargs.pop("content") + functions: List[Tuple[str, str]] = [] try: - function = json.loads(content) - name = function["name"] - arguments = json.dumps(function["arguments"], ensure_ascii=False) - except Exception: - name, arguments = "", "" + tool_calls = json.loads(content) + if not isinstance(tool_calls, list): # parallel 
function call + tool_calls = [tool_calls] + + for tool_call in tool_calls: + functions.append((tool_call["name"], json.dumps(tool_call["arguments"], ensure_ascii=False))) + + except json.JSONDecodeError: + functions = [] elements = [] - for slot in self.slots: - if isinstance(slot, str): - slot = slot.replace("{{name}}", name).replace("{{arguments}}", arguments) - elements.append(slot) - elif isinstance(slot, (dict, set)): - elements.append(slot) - else: - raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot))) + for name, arguments in functions: + for slot in self.slots: + if isinstance(slot, str): + slot = slot.replace("{{name}}", name).replace("{{arguments}}", arguments) + elements.append(slot) + elif isinstance(slot, (dict, set)): + elements.append(slot) + else: + raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot))) return elements @@ -216,29 +209,22 @@ class FunctionFormatter(Formatter): @dataclass class ToolFormatter(Formatter): def __post_init__(self): - if self.tool_format is None: + if self.tool_format == "default": + self._tool_formatter = default_tool_formatter + self._tool_extractor = default_tool_extractor + elif self.tool_format == "glm4": + self._tool_formatter = glm4_tool_formatter + self._tool_extractor = glm4_tool_extractor + else: raise ValueError("Tool format was not found.") def apply(self, **kwargs) -> SLOTS: content = kwargs.pop("content") try: tools = json.loads(content) - if not len(tools): - return [""] - - if self.tool_format == "default": - return [default_tool_formatter(tools)] - elif self.tool_format == "glm4": - return [glm4_tool_formatter(tools)] - else: - raise NotImplementedError - except Exception: + return [self._tool_formatter(tools) if len(tools) != 0 else ""] + except json.JSONDecodeError: return [""] def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]: - if self.tool_format == "default": - return default_tool_extractor(content) - elif self.tool_format == "glm4": - return glm4_tool_extractor(content) - else: - raise NotImplementedError + return self._tool_extractor(content) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index d97699b0..77694c59 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -79,6 +79,12 @@ class Template: """ return self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len) + def extract_tool(self, content: str) -> Union[str, List[Tuple[str, str]]]: + r""" + Extracts tool message. 
+ """ + return self.format_tools.extract(content) + def _encode( self, tokenizer: "PreTrainedTokenizer", @@ -100,7 +106,8 @@ class Template: if i == 0 and (system or tools or self.force_system): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" elements += self.format_system.apply(content=(system + tool_text)) - elif i > 0 and i % 2 == 0: + + if i > 0 and i % 2 == 0: elements += self.format_separator.apply() if message["role"] == Role.USER.value: @@ -191,7 +198,8 @@ class Llama2Template(Template): if i == 0 and (system or tools or self.force_system): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" system_text = self.format_system.apply(content=(system + tool_text))[0] - elif i > 0 and i % 2 == 0: + + if i > 0 and i % 2 == 0: elements += self.format_separator.apply() if message["role"] == Role.USER.value: @@ -259,7 +267,9 @@ def _register_template( template_class = Llama2Template if name.startswith("llama2") else Template default_user_formatter = StringFormatter(slots=["{{content}}"]) default_assistant_formatter = StringFormatter(slots=["{{content}}"] + eos_slots) - default_function_formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots) + default_function_formatter = FunctionFormatter( + slots=["Action: {{name}}\nAction Input: {{arguments}}\n"] + eos_slots + ) default_tool_formatter = ToolFormatter(tool_format="default") default_separator_formatter = EmptyFormatter() TEMPLATES[name] = template_class( diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py index 864c41c7..a2b54dce 100644 --- a/src/llamafactory/webui/chatter.py +++ b/src/llamafactory/webui/chatter.py @@ -140,16 +140,15 @@ class WebChatModel(ChatModel): ): response += new_text if tools: - result = self.engine.template.format_tools.extract(response) + result = self.engine.template.extract_tool(response) else: result = response - if isinstance(result, tuple): - name, arguments = result - arguments = json.loads(arguments) - tool_call = json.dumps({"name": name, "arguments": arguments}, ensure_ascii=False) - output_messages = messages + [{"role": Role.FUNCTION.value, "content": tool_call}] - bot_text = "```json\n" + tool_call + "\n```" + if isinstance(result, list): + tool_calls = [{"name": tool[0], "arguments": json.loads(tool[1])} for tool in result] + tool_calls = json.dumps(tool_calls, indent=4, ensure_ascii=False) + output_messages = messages + [{"role": Role.FUNCTION.value, "content": tool_calls}] + bot_text = "```json\n" + tool_calls + "\n```" else: output_messages = messages + [{"role": Role.ASSISTANT.value, "content": result}] bot_text = result diff --git a/tests/data/test_formatter.py b/tests/data/test_formatter.py new file mode 100644 index 00000000..430eb0e6 --- /dev/null +++ b/tests/data/test_formatter.py @@ -0,0 +1,125 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
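Worth pausing on what patch 076 changes semantically: an assistant turn may now carry several tool calls, `api/chat.py` serializes all of them into a single JSON list, and the extractor recovers every `Action`/`Action Input` pair instead of only the first. A self-contained round-trip sketch follows; the tool names and arguments are invented, the regex uses the refined pattern that patch 080 later in this series settles on, and the canonical implementations live in `src/llamafactory/data/formatter.py`:

```python
import json
import re

# Re-implementation for illustration only; the canonical renderer and
# extractor are FunctionFormatter and default_tool_extractor.
TOOL_CALL = re.compile(
    r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|\s*$)", re.DOTALL
)


def render(tool_calls):
    """Serialize parallel calls the way FunctionFormatter now does."""
    return "".join(
        "Action: {}\nAction Input: {}\n".format(call["name"], json.dumps(call["arguments"]))
        for call in tool_calls
    )


def extract(text):
    """Recover every (name, arguments) pair from a model response."""
    return [(name, args.strip()) for name, args in TOOL_CALL.findall(text)]


calls = [
    {"name": "get_weather", "arguments": {"city": "Paris"}},  # invented tools
    {"name": "get_time", "arguments": {"tz": "CET"}},
]
assert extract(render(calls)) == [
    ("get_weather", '{"city": "Paris"}'),
    ("get_time", '{"tz": "CET"}'),
]
```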
+ +import json + +from llamafactory.data.formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter + + +def test_empty_formatter(): + formatter = EmptyFormatter(slots=["\n"]) + assert formatter.apply() == ["\n"] + + +def test_string_formatter(): + formatter = StringFormatter(slots=["", "Human: {{content}}\nAssistant:"]) + assert formatter.apply(content="Hi") == ["", "Human: Hi\nAssistant:"] + + +def test_function_formatter(): + formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}\n"]) + tool_calls = json.dumps({"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}) + assert formatter.apply(content=tool_calls) == [ + """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""" + ] + + +def test_multi_function_formatter(): + formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}\n"]) + tool_calls = json.dumps([{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}] * 2) + assert formatter.apply(content=tool_calls) == [ + """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""", + """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""", + ] + + +def test_default_tool_formatter(): + formatter = ToolFormatter(tool_format="default") + tools = [ + { + "name": "test_tool", + "description": "tool_desc", + "parameters": { + "type": "object", + "properties": { + "foo": {"type": "string", "description": "foo_desc"}, + "bar": {"type": "number", "description": "bar_desc"}, + }, + "required": ["foo"], + }, + } + ] + assert formatter.apply(content=json.dumps(tools)) == [ + "You have access to the following tools:\n" + "> Tool Name: test_tool\n" + "Tool Description: tool_desc\n" + "Tool Args:\n" + " - foo (string, required): foo_desc\n" + " - bar (number): bar_desc\n\n" + "Use the following format if using a tool:\n" + "```\n" + "Action: tool name (one of [test_tool]).\n" + "Action Input: the input to the tool, in a JSON format representing the kwargs " + """(e.g. 
```{"input": "hello world", "num_beams": 5}```).\n""" + "```\n" + ] + + +def test_default_tool_extractor(): + formatter = ToolFormatter(tool_format="default") + result = """Action: test_tool\nAction Input: {"foo": "bar", "size": 10}\n""" + assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")] + + +def test_default_multi_tool_extractor(): + formatter = ToolFormatter(tool_format="default") + result = ( + """Action: test_tool\nAction Input: {"foo": "bar", "size": 10}\n""" + """Action: another_tool\nAction Input: {"foo": "job", "size": 2}\n""" + ) + assert formatter.extract(result) == [ + ("test_tool", """{"foo": "bar", "size": 10}"""), + ("another_tool", """{"foo": "job", "size": 2}"""), + ] + + +def test_glm4_tool_formatter(): + formatter = ToolFormatter(tool_format="glm4") + tools = [ + { + "name": "test_tool", + "description": "tool_desc", + "parameters": { + "type": "object", + "properties": { + "foo": {"type": "string", "description": "foo_desc"}, + "bar": {"type": "number", "description": "bar_desc"}, + }, + "required": ["foo"], + }, + } + ] + assert formatter.apply(content=json.dumps(tools)) == [ + "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," + "你的任务是针对用户的问题和要求提供适当的答复和支持。" + "\n\n## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( + json.dumps(tools[0], indent=4) + ) + ] + + +def test_glm4_tool_extractor(): + formatter = ToolFormatter(tool_format="glm4") + result = """test_tool\n{"foo": "bar", "size": 10}\n""" + assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")] From 5d7dfa75ca26c97521b2a25c4bc3b755f7aa8250 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 03:39:52 +0800 Subject: [PATCH 077/162] use prefix to replace force system Former-commit-id: 731d9a964f1c3dbfb83825524d697831e691fb9d --- src/llamafactory/data/template.py | 75 +++++++++++++------------------ 1 file changed, 30 insertions(+), 45 deletions(-) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 77694c59..a12e9c88 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -38,12 +38,12 @@ class Template: format_observation: "Formatter" format_tools: "Formatter" format_separator: "Formatter" + format_prefix: "Formatter" default_system: str stop_words: List[str] image_token: str efficient_eos: bool replace_eos: bool - force_system: bool def encode_oneturn( self, @@ -102,8 +102,9 @@ class Template: system = system or self.default_system encoded_messages = [] for i, message in enumerate(messages): - elements = [] - if i == 0 and (system or tools or self.force_system): + elements = self.format_prefix.apply() + + if i == 0 and (system or tools): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" elements += self.format_system.apply(content=(system + tool_text)) @@ -193,9 +194,10 @@ class Llama2Template(Template): system = system or self.default_system encoded_messages = [] for i, message in enumerate(messages): - elements = [] + elements = self.format_prefix.apply() + system_text = "" - if i == 0 and (system or tools or self.force_system): + if i == 0 and (system or tools): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" system_text = self.format_system.apply(content=(system + tool_text))[0] @@ -230,12 +232,12 @@ def _register_template( format_observation: Optional["Formatter"] = None, format_tools: Optional["Formatter"] = None, format_separator: Optional["Formatter"] = None, + format_prefix: 
Optional["Formatter"] = None, default_system: str = "", stop_words: List[str] = [], image_token: str = "", efficient_eos: bool = False, replace_eos: bool = False, - force_system: bool = False, ) -> None: r""" Registers a chat template. @@ -272,6 +274,7 @@ def _register_template( ) default_tool_formatter = ToolFormatter(tool_format="default") default_separator_formatter = EmptyFormatter() + default_prefix_formatter = EmptyFormatter() TEMPLATES[name] = template_class( format_user=format_user or default_user_formatter, format_assistant=format_assistant or default_assistant_formatter, @@ -280,12 +283,12 @@ def _register_template( format_observation=format_observation or format_user or default_user_formatter, format_tools=format_tools or default_tool_formatter, format_separator=format_separator or default_separator_formatter, + format_prefix=format_prefix or default_prefix_formatter, default_system=default_system, stop_words=stop_words, image_token=image_token, efficient_eos=efficient_eos, replace_eos=replace_eos, - force_system=force_system, ) @@ -329,7 +332,7 @@ def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", pl def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") -> str: - jinja_template = "" + jinja_template = _convert_slots_to_jinja(template.format_prefix.apply(), tokenizer) if template.default_system: jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}" @@ -339,11 +342,7 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") ) system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message") - if isinstance(template, Llama2Template): - pass - elif template.force_system: - jinja_template += "{{ " + system_message + " }}" - else: + if not isinstance(template, Llama2Template): jinja_template += "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}" jinja_template += "{% for message in messages %}" @@ -459,9 +458,8 @@ _register_template( _register_template( name="belle", format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), format_separator=EmptyFormatter(slots=["\n\n"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -486,10 +484,9 @@ _register_template( _register_template( name="chatglm2", format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]), - format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), format_separator=EmptyFormatter(slots=["\n\n"]), + format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), efficient_eos=True, - force_system=True, ) @@ -497,14 +494,14 @@ _register_template( name="chatglm3", format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), - format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), + format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter( slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] ), + format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, - 
force_system=True, ) @@ -512,13 +509,12 @@ _register_template( name="chatglm3_system", format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), - format_system=StringFormatter( - slots=[{"token": "[gMASK]"}, {"token": "sop"}, {"token": "<|system|>"}, "\n", "{{content}}"] - ), + format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter( slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] ), + format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), default_system=( "You are ChatGLM3, a large language model trained by Zhipu.AI. " "Follow the user's instructions carefully. Respond using markdown." @@ -553,8 +549,7 @@ _register_template( _register_template( name="codegeex2", - format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), ) @@ -581,8 +576,7 @@ _register_template( _register_template( name="cpm", format_user=StringFormatter(slots=["<用户>{{content}}"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -615,8 +609,7 @@ _register_template( _register_template( name="deepseek", format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -648,9 +641,8 @@ _register_template( name="empty", format_user=StringFormatter(slots=["{{content}}"]), format_assistant=StringFormatter(slots=["{{content}}"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, - force_system=True, ) @@ -672,13 +664,12 @@ _register_template( _register_template( name="gemma", format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), format_observation=StringFormatter( slots=["tool\n{{content}}\nmodel\n"] ), format_separator=EmptyFormatter(slots=["\n"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, - force_system=True, ) @@ -686,13 +677,13 @@ _register_template( name="glm4", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), format_assistant=StringFormatter(slots=["\n{{content}}"]), - format_system=StringFormatter(slots=["[gMASK]<|system|>\n{{content}}"]), + format_system=StringFormatter(slots=["<|system|>\n{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), format_tools=ToolFormatter(tool_format="glm4"), + format_prefix=EmptyFormatter(slots=["[gMASK]"]), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, - force_system=True, ) @@ -768,24 +759,21 @@ _register_template( _register_template( name="mistral", format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) _register_template( name="olmo", 
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]), - format_system=StringFormatter(slots=[{"eos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"eos_token"}]), ) _register_template( name="openchat", format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -799,18 +787,16 @@ _register_template( ) ] ), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=["<|eot_id|>"], replace_eos=True, - force_system=True, ) _register_template( name="orion", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -852,7 +838,6 @@ _register_template( format_separator=EmptyFormatter(slots=["\n"]), stop_words=["<|end|>"], replace_eos=True, - force_system=True, ) From 15959d522c1d009a20237d503d09bf31bf97952b Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 03:49:23 +0800 Subject: [PATCH 078/162] fix bug Former-commit-id: 412139eaa2fde98ba19e1257d21144382a59f0d6 --- src/llamafactory/data/template.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index a12e9c88..c9af9605 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -102,7 +102,10 @@ class Template: system = system or self.default_system encoded_messages = [] for i, message in enumerate(messages): - elements = self.format_prefix.apply() + elements = [] + + if i == 0: + elements += self.format_prefix.apply() if i == 0 and (system or tools): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" @@ -194,7 +197,10 @@ class Llama2Template(Template): system = system or self.default_system encoded_messages = [] for i, message in enumerate(messages): - elements = self.format_prefix.apply() + elements = [] + + if i == 0: + elements += self.format_prefix.apply() system_text = "" if i == 0 and (system or tools): From a22e932b4fcd0f271fea1bb58e79caec96f59170 Mon Sep 17 00:00:00 2001 From: Jonery Date: Wed, 19 Jun 2024 12:29:40 +0800 Subject: [PATCH 079/162] Cleaner integration. 
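Before patch 079's BAdam body below, a brief gloss on the template changes in patches 077 and 078 above: the old `force_system` flag existed only so a template could emit its BOS token (or `[gMASK]sop`) even when no system message was present; `format_prefix` makes that an explicit slot emitted once at the first turn. Post-patch, a registration looks like the sketch below, where the template name is hypothetical to avoid clobbering the real `deepseek` entry:

```python
from llamafactory.data.formatter import EmptyFormatter, StringFormatter
from llamafactory.data.template import _register_template

# The BOS token moves out of format_system (previously
# slots=[{"bos_token"}, "{{content}}"] with force_system=True) into a
# standalone prefix emitted once at turn 0, system message or not.
_register_template(
    name="deepseek-demo",  # hypothetical alias for illustration
    format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
    format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
```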
Former-commit-id: 26d4b05d424bd71f570195dd433258caf6465d92 --- src/llamafactory/hparams/parser.py | 7 ++----- src/llamafactory/train/dpo/trainer.py | 12 +++--------- src/llamafactory/train/kto/trainer.py | 12 +++--------- src/llamafactory/train/ppo/trainer.py | 12 +++--------- src/llamafactory/train/pt/trainer.py | 12 +++--------- src/llamafactory/train/rm/trainer.py | 12 +++--------- src/llamafactory/train/sft/trainer.py | 11 +++-------- src/llamafactory/train/trainer_utils.py | 10 ++++------ 8 files changed, 24 insertions(+), 64 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 680559ac..c7c17ae0 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -215,11 +215,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): if finetuning_args.badam_mode == "ratio": raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer") - if (finetuning_args.badam_mode == "layer" - and training_args.deepspeed_plugin is not None - and training_args.deepspeed_plugin.zero_stage < 3 - ): - raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage, got stage {training_args.deepspeed_plugin.zero_stage}") + if finetuning_args.badam_mode == "layer" and (not is_deepspeed_zero3_enabled()): + raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage.") if (finetuning_args.use_galore) and training_args.deepspeed is not None: raise ValueError("GaLore are incompatible with DeepSpeed yet.") diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index 284bf41a..a3e0e961 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -96,15 +96,9 @@ class CustomDPOTrainer(DPOTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index d8b609e0..0d50987f 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -91,15 +91,9 @@ class CustomKTOTrainer(KTOTrainer): self.ref_model.eval() if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if 
self.optimizer is None: diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 5a18cbaa..f81831e6 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -166,15 +166,9 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True) if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None: r""" diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index 1e5e9f6a..d3516b41 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -48,15 +48,9 @@ class CustomTrainer(Trainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 5d0e6263..433251cf 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -72,15 +72,9 @@ class PairwiseTrainer(Trainer): self.processor = processor self.can_return_loss = True # override property to return eval_loss if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 9446d245..45799b96 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -56,14 +56,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - 
self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 98c38842..b57f5a6e 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -371,11 +371,8 @@ def _create_badam_optimizer( dict(params=decay_params, weight_decay=training_args.weight_decay), ] - ds_zero3_enabled = False - if hasattr(training_args, "deepspeed_plugin") and training_args.deepspeed_plugin is not None: - assert training_args.deepspeed_plugin.zero_stage == 3, f"BAdam only supports deepspeed ZeRO-3 stage, got {training_args.deepspeed_plugin.zero_stage}" - assert finetuning_args.badam_mode == "layer", "BAdam only supports layer-wise update in ZeRO-3 stage" - ds_zero3_enabled = True + from transformers.integrations import is_deepspeed_zero3_enabled + ds_zero3_enabled = is_deepspeed_zero3_enabled() if finetuning_args.badam_mode == "layer": from badam import BlockOptimizer @@ -400,6 +397,7 @@ def _create_badam_optimizer( elif finetuning_args.badam_mode == "ratio": from badam import BlockOptimizerRatio + assert not ds_zero3_enabled, "BAdam with ratio-based update does not support Deepspeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer." 
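The same three-line BAdam hookup now recurs across the DPO, KTO, PPO, PT, RM and SFT trainers above; distilled into one helper it reads as below. This is a sketch of the shared pattern rather than code from the repository, and `trainer` stands for any of those Trainer subclasses:

```python
from types import MethodType

from badam import BAdamCallback, clip_grad_norm_old_version


def attach_badam(trainer) -> None:
    """Mirror of the hunk repeated in each trainer: swap in BAdam's
    gradient-clipping shim and register its callback."""
    trainer.accelerator.clip_grad_norm_ = MethodType(
        clip_grad_norm_old_version, trainer.accelerator
    )
    trainer.callback_handler.add_callback(BAdamCallback)
```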
assert finetuning_args.badam_update_ratio > 1e-6 optimizer = BlockOptimizerRatio( param_groups=param_groups, @@ -411,7 +409,7 @@ def _create_badam_optimizer( **optim_kwargs, ) logger.info( - f"Using BAdam optimizer with ratio-wise update, update ratio is {finetuning_args.badam_update_ratio}, " + f"Using BAdam optimizer with ratio-based update, update ratio is {finetuning_args.badam_update_ratio}, " f"mask mode is {finetuning_args.badam_mask_mode}" ) From da3b0aab6d95f257354e55ab2d9a1207f8bbcd52 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 17:44:05 +0800 Subject: [PATCH 080/162] fix templates Former-commit-id: 6f357d59b73309c5955683008632e7f320e7dcb1 --- src/llamafactory/data/formatter.py | 2 +- src/llamafactory/data/template.py | 48 ++++++++----------------- src/llamafactory/train/trainer_utils.py | 3 +- 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 70be6a5a..ed9ba8b8 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -68,7 +68,7 @@ def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|$)", re.DOTALL) + regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|\s*$)", re.DOTALL) action_match: List[Tuple[str, str]] = re.findall(regex, content) if not action_match: return content diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index c9af9605..3c4bc5ec 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -478,11 +478,7 @@ _register_template( _register_template( name="breeze", format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - default_system=( - "You are a helpful AI assistant built by MediaTek Research. " - "The user you are helping speaks Traditional Chinese and comes from Taiwan." - ), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, ) @@ -569,13 +565,8 @@ _register_template( ) ] ), - format_system=StringFormatter( - slots=[{"bos_token"}, "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"] - ), - default_system=( - "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users " - "by providing thorough responses. You are trained by Cohere." 
- ), + format_system=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -645,8 +636,6 @@ _register_template( _register_template( name="empty", - format_user=StringFormatter(slots=["{{content}}"]), - format_assistant=StringFormatter(slots=["{{content}}"]), format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, ) @@ -695,25 +684,21 @@ _register_template( _register_template( name="intern", - format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]), - format_separator=EmptyFormatter(slots=[{"token": ""}, "\n"]), + format_user=StringFormatter(slots=["<|User|>:{{content}}\n<|Bot|>:"]), + format_system=StringFormatter(slots=["<|System|>:{{content}}\n"]), + format_separator=EmptyFormatter(slots=["\n"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=[""], - efficient_eos=True, + efficient_eos=True, # internlm tokenizer cannot set eos_token_id ) _register_template( name="intern2", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), - format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]), - format_separator=EmptyFormatter(slots=["\n"]), - default_system=( - "You are an AI assistant whose name is InternLM (书生·浦语).\n" - "- InternLM (书生·浦语) is a conversational language model that is developed " - "by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n" - "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen " - "by the user such as English and 中文." - ), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_separator=EmptyFormatter(slots=["<|im_end|>\n"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=["<|im_end|>"], efficient_eos=True, # internlm2 tokenizer cannot set eos_token_id ) @@ -722,7 +707,6 @@ _register_template( _register_template( name="llama2", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), - format_assistant=StringFormatter(slots=[" {{content}} ", {"eos_token"}]), format_system=StringFormatter(slots=["<>\n{{content}}\n<>\n\n"]), ) @@ -745,9 +729,7 @@ _register_template( ) ] ), - format_system=StringFormatter( - slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"] - ), + format_system=StringFormatter(slots=["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]), format_observation=StringFormatter( slots=[ ( @@ -756,7 +738,7 @@ _register_template( ) ] ), - default_system="You are a helpful assistant.", + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=["<|eot_id|>"], replace_eos=True, ) @@ -809,9 +791,9 @@ _register_template( _register_template( name="phi", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]), - format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]), + format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]), format_separator=EmptyFormatter(slots=["\n"]), - default_system="You are a helpful AI assistant.", + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=["<|end|>"], replace_eos=True, ) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 9052c96d..c1b90155 100644 --- a/src/llamafactory/train/trainer_utils.py +++ 
b/src/llamafactory/train/trainer_utils.py @@ -198,6 +198,7 @@ def convert_pissa_adapter( safe_serialization=training_args.save_safetensors, ) setattr(unwrapped_model.peft_config["default"], "init_lora_weights", init_lora_weights) + elif output_dir == training_args.output_dir: # at the end of training logger.info("Converted PiSSA adapter will be saved at: {}.".format(output_dir)) unwrapped_model = accelerator.unwrap_model(model) @@ -233,7 +234,7 @@ def _create_galore_optimizer( finetuning_args: "FinetuningArguments", ) -> "torch.optim.Optimizer": if len(finetuning_args.galore_target) == 1 and finetuning_args.galore_target[0] == "all": - galore_targets = find_all_linear_modules(model) + galore_targets = find_all_linear_modules(model, finetuning_args.freeze_vision_tower) else: galore_targets = finetuning_args.galore_target From 11f88c4fdc7ad84f6325197dd8460188f4cfecfc Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 20:03:50 +0800 Subject: [PATCH 081/162] fix jinja template Former-commit-id: 0ebf2e2ee23918d28b0cbb20ba456732d6eedfbb --- src/llamafactory/data/template.py | 6 +++++- tests/data/test_supervised.py | 9 +++++--- tests/data/test_template.py | 35 +++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 tests/data/test_template.py diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 3c4bc5ec..393ebfea 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -338,7 +338,11 @@ def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", pl def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") -> str: - jinja_template = _convert_slots_to_jinja(template.format_prefix.apply(), tokenizer) + jinja_template = "" + + prefix = _convert_slots_to_jinja(template.format_prefix.apply(), tokenizer) + if prefix: + jinja_template += "{{ " + prefix + " }}" if template.default_system: jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}" diff --git a/tests/data/test_supervised.py b/tests/data/test_supervised.py index 9f7b2dbf..9cb49615 100644 --- a/tests/data/test_supervised.py +++ b/tests/data/test_supervised.py @@ -17,6 +17,7 @@ import random import pytest from datasets import load_dataset +from transformers import AutoTokenizer from llamafactory.data import get_dataset from llamafactory.hparams import get_train_args @@ -48,10 +49,11 @@ def test_supervised(num_samples: int): tokenizer = tokenizer_module["tokenizer"] tokenized_data = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module) + ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) + original_data = load_dataset(TRAIN_ARGS["dataset"], split="train") indexes = random.choices(range(len(original_data)), k=num_samples) for index in indexes: - decoded_result = tokenizer.decode(tokenized_data["input_ids"][index]) prompt = original_data[index]["instruction"] if original_data[index]["input"]: prompt += "\n" + original_data[index]["input"] @@ -60,5 +62,6 @@ def test_supervised(num_samples: int): {"role": "user", "content": prompt}, {"role": "assistant", "content": original_data[index]["output"]}, ] - templated_result = tokenizer.apply_chat_template(messages, tokenize=False) - assert decoded_result == templated_result + templated_result = ref_tokenizer.apply_chat_template(messages, tokenize=False) + decoded_result = tokenizer.decode(tokenized_data["input_ids"][index]) + assert templated_result == 
decoded_result diff --git a/tests/data/test_template.py b/tests/data/test_template.py new file mode 100644 index 00000000..9d73c116 --- /dev/null +++ b/tests/data/test_template.py @@ -0,0 +1,35 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from transformers import AutoTokenizer + +from llamafactory.data import get_template_and_fix_tokenizer + + +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") + + +def test_jinja_template(): + tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) + ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) + get_template_and_fix_tokenizer(tokenizer, name="llama3") + assert tokenizer.chat_template != ref_tokenizer.chat_template + + messages = [ + {"role": "user", "content": "hi!"}, + {"role": "assistant", "content": "hello there"}, + ] + assert tokenizer.apply_chat_template(messages) == ref_tokenizer.apply_chat_template(messages) From 741c0b75667578e5340f9ccf46c3cb3ed4e400b1 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 20:42:09 +0800 Subject: [PATCH 082/162] release v0.8.2 Former-commit-id: 3050bbe51d46acd8473275d2713fc28932e4a3d3 --- .github/workflows/publish.yml | 39 ++++++++++++++++++++++++++++++++++ src/llamafactory/extras/env.py | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 00000000..bb39be3a --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,39 @@ +name: publish + +on: + release: + types: [published] + +jobs: + publish: + name: Upload release to PyPI + + runs-on: ubuntu-latest + + environment: + name: release + url: https://pypi.org/p/llamafactory + + permissions: + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.8" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install build + + - name: Build package + run: | + python -m build + + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index 586c24c0..ee224148 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -23,7 +23,7 @@ import trl from transformers.utils import is_torch_cuda_available, is_torch_npu_available -VERSION = "0.8.2.dev0" +VERSION = "0.8.2" def print_env() -> None: From 11ba8198a8e60673de8af185776e2a312396fc5f Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 20:46:33 +0800 Subject: [PATCH 083/162] Update publish.yml Former-commit-id: 60b0633e29c9e701aa3813bd1fdc0282bd07f7c8 --- .github/workflows/publish.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index bb39be3a..15c7153e 100644 --- 
a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -2,7 +2,8 @@ name: publish on: release: - types: [published] + types: + - published jobs: publish: From fadad087065c94e93e6f2d32c22dec08065be99f Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 21:08:16 +0800 Subject: [PATCH 084/162] set dev version Former-commit-id: 221665345d97f839ce4ba8d54643da30c71b6083 --- src/llamafactory/extras/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index ee224148..ab387231 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -23,7 +23,7 @@ import trl from transformers.utils import is_torch_cuda_available, is_torch_npu_available -VERSION = "0.8.2" +VERSION = "0.8.3.dev0" def print_env() -> None: From 0680f186330c646d414995bb1a75e2927ab256fa Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 21:27:00 +0800 Subject: [PATCH 085/162] update patcher Former-commit-id: afb365e515d615dd62f791622450debab60ce5cc --- src/llamafactory/model/model_utils/checkpointing.py | 10 ++++------ src/llamafactory/model/patcher.py | 5 +++++ tests/model/model_utils/test_checkpointing.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py index f5314125..f4f3d8a5 100644 --- a/src/llamafactory/model/model_utils/checkpointing.py +++ b/src/llamafactory/model/model_utils/checkpointing.py @@ -78,9 +78,7 @@ def _fp32_forward_post_hook( return output.to(torch.float32) -def prepare_model_for_training( - model: "PreTrainedModel", model_args: "ModelArguments", output_layer_name: str = "lm_head" -) -> None: +def prepare_model_for_training(model: "PreTrainedModel", model_args: "ModelArguments") -> None: r""" Includes: (1) cast the layernorm in fp32 @@ -104,8 +102,8 @@ def prepare_model_for_training( setattr(model.config, "use_cache", False) # turn off when gradient checkpointing is enabled logger.info("Gradient checkpointing enabled.") - if hasattr(model, output_layer_name) and model_args.upcast_lmhead_output: - logger.info("Upcasting lm_head outputs in float32.") - output_layer = getattr(model, output_layer_name) + if model_args.upcast_lmhead_output: + output_layer = model.get_output_embeddings() if isinstance(output_layer, torch.nn.Linear) and output_layer.weight.dtype != torch.float32: + logger.info("Upcasting lm_head outputs in float32.") output_layer.register_forward_hook(_fp32_forward_post_hook) diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 8fa17d08..a53fde98 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -152,6 +152,10 @@ def patch_valuehead_model(model: "AutoModelForCausalLMWithValueHead") -> None: if isinstance(self.pretrained_model, PreTrainedModel): return self.pretrained_model.get_input_embeddings() + def get_output_embeddings(self: "AutoModelForCausalLMWithValueHead") -> torch.nn.Module: + if isinstance(self.pretrained_model, PreTrainedModel): + return self.pretrained_model.get_output_embeddings() + def create_or_update_model_card(self: "AutoModelForCausalLMWithValueHead", output_dir: str) -> None: if isinstance(self.pretrained_model, PeftModel): self.pretrained_model.create_or_update_model_card(output_dir) @@ -160,4 +164,5 @@ def patch_valuehead_model(model: "AutoModelForCausalLMWithValueHead") -> None: setattr(model, 
"_keys_to_ignore_on_save", ignore_modules) setattr(model, "tie_weights", MethodType(tie_weights, model)) setattr(model, "get_input_embeddings", MethodType(get_input_embeddings, model)) + setattr(model, "get_output_embeddings", MethodType(get_output_embeddings, model)) setattr(model, "create_or_update_model_card", MethodType(create_or_update_model_card, model)) diff --git a/tests/model/model_utils/test_checkpointing.py b/tests/model/model_utils/test_checkpointing.py index 670e693d..9b6dfc9e 100644 --- a/tests/model/model_utils/test_checkpointing.py +++ b/tests/model/model_utils/test_checkpointing.py @@ -70,5 +70,5 @@ def test_upcast_lmhead_output(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) inputs = torch.randn((1, 16), dtype=torch.float16, device=get_current_device()) - outputs: "torch.Tensor" = model.lm_head(inputs) + outputs: "torch.Tensor" = model.get_output_embeddings()(inputs) assert outputs.dtype == torch.float32 From 2b961c1f5ba00a28c42da2de252ba53f82676cbe Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 23:22:28 +0800 Subject: [PATCH 086/162] fix llamaboard abort Former-commit-id: 9ef609a2c0185040e531dea3829a6f481539cdea --- src/llamafactory/webui/runner.py | 4 ++-- src/llamafactory/webui/utils.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 13dbba03..6cd21b07 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -24,7 +24,7 @@ from ..extras.misc import is_gpu_or_npu_available, torch_gc from ..extras.packages import is_gradio_available from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir, load_config from .locales import ALERTS, LOCALES -from .utils import abort_leaf_process, gen_cmd, get_eval_results, get_trainer_info, load_args, save_args, save_cmd +from .utils import abort_process, gen_cmd, get_eval_results, get_trainer_info, load_args, save_args, save_cmd if is_gradio_available(): @@ -52,7 +52,7 @@ class Runner: def set_abort(self) -> None: self.aborted = True if self.trainer is not None: - abort_leaf_process(self.trainer.pid) + abort_process(self.trainer.pid) def _initialize(self, data: Dict["Component", Any], do_train: bool, from_preview: bool) -> str: get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)] diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index 6ce2a8e7..a616bcba 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -33,16 +33,16 @@ if is_gradio_available(): import gradio as gr -def abort_leaf_process(pid: int) -> None: +def abort_process(pid: int) -> None: r""" - Aborts the leaf processes. + Aborts the processes recursively in a bottom-up way. 
""" children = psutil.Process(pid).children() if children: for child in children: - abort_leaf_process(child.pid) - else: - os.kill(pid, signal.SIGABRT) + abort_process(child.pid) + + os.kill(pid, signal.SIGABRT) def can_quantize(finetuning_type: str) -> "gr.Dropdown": From 5ae34858da925affde8848db1dd227c34b8129e8 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 23:46:03 +0800 Subject: [PATCH 087/162] improve llamaboard Former-commit-id: e606ab35c0eced667dde7137c2d72848f264c96c --- src/llamafactory/webui/components/top.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 2515a83d..09d43ac8 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -50,9 +50,13 @@ def create_top() -> Dict[str, "Component"]: visual_inputs = gr.Checkbox(scale=1) model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False) - model_name.input(save_config, inputs=[lang, model_name], queue=False) + model_name.input(save_config, inputs=[lang, model_name], queue=False).then( + list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False + ) model_path.input(save_config, inputs=[lang, model_name, model_path], queue=False) - finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False) + finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False).then( + list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False + ) checkpoint_path.focus(list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False) return dict( From b3b278561d52a31600d75bc8c162eadeca9d4b56 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 20 Jun 2024 02:09:47 +0000 Subject: [PATCH 088/162] update dependencies Former-commit-id: 25164273d1ca7a8f6f99b41279e342906f6bc4d5 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f76524d8..7380add4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ matplotlib>=3.7.0 fire packaging pyyaml +numpy<2.0.0 From fecde5c13f2fe87d9a7d622147047c292b91514a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 20 Jun 2024 22:56:05 +0800 Subject: [PATCH 089/162] tiny fix Former-commit-id: 2d8d47f6126d68db1701ed18fc31310c6f14dd49 --- src/llamafactory/hparams/parser.py | 3 +++ src/llamafactory/model/adapter.py | 17 ++++++++--------- src/llamafactory/model/patcher.py | 4 ++-- src/llamafactory/webui/components/top.py | 4 ++-- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 9b305016..a593bf45 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -199,6 +199,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if not is_torch_bf16_gpu_available(): raise ValueError("This device does not support `pure_bf16`.") + if training_args.deepspeed: + raise ValueError("`pure_bf16` is incompatible with DeepSpeed.") + if training_args.fp16 or training_args.bf16: raise ValueError("Turn off mixed precision training when using `pure_bf16`.") diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index 34518878..7caef9cc 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -289,16 +289,15 @@ 
def init_adapter( raise ValueError("Cannot initialize PiSSA adapter on quantized models.") # cast trainable parameters to float32 if: - # 1. is_trainable and quantization_bit is not None (qlora) - # 2. is_trainable and not deepspeed zero3 and not fsdp (zero3 or fsdp already in float32) - # 3. is_trainable and not pure_bf16 and not badam + # 1. is_trainable and not pure_bf16 and not badam and quantization_bit is not None (qlora) + # 2. is_trainable and not pure_bf16 and not badam and not zero3 and not fsdp (zero3 or fsdp already in fp32) + cast_trainable_params_to_fp32 = False if not is_trainable: - cast_trainable_params_to_fp32 = False - elif model_args.quantization_bit is None and ( - is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam - ): - logger.info("ZeRO3/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.") - cast_trainable_params_to_fp32 = False + pass + elif finetuning_args.pure_bf16 or finetuning_args.use_badam: + logger.info("Pure bf16 / BAdam detected, remaining trainable params in half precision.") + elif model_args.quantization_bit is None and (is_deepspeed_zero3_enabled() or is_fsdp_enabled()): + logger.info("ZeRO3 / FSDP detected, remaining trainable params in float32.") else: logger.info("Upcasting trainable params to float32.") cast_trainable_params_to_fp32 = True diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index a53fde98..35153649 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -91,8 +91,8 @@ def patch_config( # cast data type of the model if: # 1. not deepspeed zero3 and not fsdp (keep zero3 or fsdp in float32) - # 2. fsdp + qlora - if model_args.quantization_bit is not None or (not is_deepspeed_zero3_enabled() and not is_fsdp_enabled()): + # 2. 
quantization_bit is not None (qlora) + if (not is_deepspeed_zero3_enabled() and not is_fsdp_enabled()) or model_args.quantization_bit is not None: init_kwargs["torch_dtype"] = model_args.compute_dtype if init_kwargs["low_cpu_mem_usage"]: # device map requires low_cpu_mem_usage=True diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 09d43ac8..18b9a7d2 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -49,10 +49,10 @@ def create_top() -> Dict[str, "Component"]: booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none", scale=3) visual_inputs = gr.Checkbox(scale=1) - model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False) - model_name.input(save_config, inputs=[lang, model_name], queue=False).then( + model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False).then( list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False ) + model_name.input(save_config, inputs=[lang, model_name], queue=False) model_path.input(save_config, inputs=[lang, model_name, model_path], queue=False) finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False).then( list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False From a5a75ec20c1ef1284f12f4ee09d10950b37f7e18 Mon Sep 17 00:00:00 2001 From: Erich Schubert Date: Fri, 21 Jun 2024 09:14:21 +0200 Subject: [PATCH 090/162] Print help if no arguments given Former-commit-id: 08dfb7ec636fd5bfbb30dac9d5fba6e32bfc6728 --- src/llamafactory/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py index c7f136b3..af9dd5f5 100644 --- a/src/llamafactory/cli.py +++ b/src/llamafactory/cli.py @@ -74,7 +74,7 @@ class Command(str, Enum): def main(): - command = sys.argv.pop(1) + command = sys.argv.pop(1) if len(sys.argv) > 0 else Command.HELP if command == Command.API: run_api() elif command == Command.CHAT: From c6f5024e415ec1a9d5ca560723518c9aef30e581 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 22 Jun 2024 00:00:38 +0800 Subject: [PATCH 091/162] fix api Former-commit-id: dcbd6d86dfc49f12529b02ec331e3e5c05740061 --- src/llamafactory/api/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/api/chat.py b/src/llamafactory/api/chat.py index 2c7e11e2..72b2ae50 100644 --- a/src/llamafactory/api/chat.py +++ b/src/llamafactory/api/chat.py @@ -93,7 +93,7 @@ def _process_request( if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls): tool_calls = [ - {"name": tool_call.function.name, "argument": tool_call.function.arguments} + {"name": tool_call.function.name, "arguments": tool_call.function.arguments} for tool_call in message.tool_calls ] content = json.dumps(tool_calls, ensure_ascii=False) From c173a1e9cac493589f4f848bb9c5ba7ccf4541b9 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 22 Jun 2024 01:31:32 +0800 Subject: [PATCH 092/162] remove dup template Former-commit-id: 5fec12203b24608af4d4993f44a657eb5a0348e5 --- src/llamafactory/data/template.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 393ebfea..b5bf688c 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -511,25 +511,6 @@ 
_register_template( ) -_register_template( - name="chatglm3_system", - format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), - format_assistant=StringFormatter(slots=["\n", "{{content}}"]), - format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]), - format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), - format_observation=StringFormatter( - slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] - ), - format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), - default_system=( - "You are ChatGLM3, a large language model trained by Zhipu.AI. " - "Follow the user's instructions carefully. Respond using markdown." - ), - stop_words=["<|user|>", "<|observation|>"], - efficient_eos=True, -) - - _register_template( name="chatml", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), From 15a104e26dc01fefb7948446cf1ea8fd2159be28 Mon Sep 17 00:00:00 2001 From: mMrBun <2015711377@qq.com> Date: Sat, 22 Jun 2024 02:00:13 +0800 Subject: [PATCH 093/162] Add tool_format to overwrite tool formatter template Former-commit-id: af08971ca50443fd5597e5e4412a3aa17214502f --- src/llamafactory/chat/hf_engine.py | 2 +- src/llamafactory/chat/vllm_engine.py | 2 +- src/llamafactory/data/template.py | 5 ++++- src/llamafactory/hparams/data_args.py | 4 ++++ 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 9e60175b..22a24339 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -54,7 +54,7 @@ class HuggingfaceEngine(BaseEngine): self.tokenizer = tokenizer_module["tokenizer"] self.processor = tokenizer_module["processor"] self.tokenizer.padding_side = "left" if self.can_generate else "right" - self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template) + self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template, data_args.tool_format) self.model = load_model( self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) ) # must after fixing tokenizer to resize vocab diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index 2626d612..f0d23676 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -59,7 +59,7 @@ class VllmEngine(BaseEngine): self.tokenizer = tokenizer_module["tokenizer"] self.processor = tokenizer_module["processor"] self.tokenizer.padding_side = "left" - self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template) + self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template, data_args.tool_format) self.generating_args = generating_args.to_dict() engine_args = { diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index b5bf688c..3d8ded3b 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -379,6 +379,7 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") def get_template_and_fix_tokenizer( tokenizer: "PreTrainedTokenizer", name: Optional[str] = None, + tool_format: Optional[str] = None, ) -> Template: if name is None: template = TEMPLATES["empty"] # placeholder @@ -386,6 +387,9 @@ def get_template_and_fix_tokenizer( template = TEMPLATES.get(name, None) if template is None: 
raise ValueError("Template {} does not exist.".format(name)) + + if tool_format: + template.format_tools = ToolFormatter(tool_format=tool_format) stop_words = template.stop_words if template.replace_eos: @@ -660,7 +664,6 @@ _register_template( format_system=StringFormatter(slots=["<|system|>\n{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), - format_tools=ToolFormatter(tool_format="glm4"), format_prefix=EmptyFormatter(slots=["[gMASK]"]), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 39290e21..959742e3 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -29,6 +29,10 @@ class DataArguments: default=None, metadata={"help": "Which template to use for constructing prompts in training and inference."}, ) + tool_format: Optional[str] = field( + default=None, + metadata={"help": "Specifies the tool format template for function calling ."}, + ) dataset: Optional[str] = field( default=None, metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."}, From 83ee461b9adfc8299e9896dd7dd04d187a82ed28 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 24 Jun 2024 18:22:12 +0800 Subject: [PATCH 094/162] update readme Former-commit-id: a1477208471039d3578980f929f1ca8c2a07aa96 --- CITATION.cff | 10 ++++-- README.md | 49 +++++++++++++++++++-------- README_zh.md | 49 +++++++++++++++++++-------- src/llamafactory/extras/constants.py | 2 +- src/llamafactory/train/ppo/trainer.py | 16 ++++----- 5 files changed, 85 insertions(+), 41 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 4caf3787..a572b5fa 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -12,12 +12,14 @@ authors: given-names: "Yanhan" - family-names: "Luo" given-names: "Zheyan" +- family-names: "Feng" + given-names: "Zhangchi" - family-names: "Ma" given-names: "Yongqiang" title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" url: "https://arxiv.org/abs/2403.13372" preferred-citation: - type: article + type: conference-paper authors: - family-names: "Zheng" given-names: "Yaowei" @@ -29,9 +31,13 @@ preferred-citation: given-names: "Yanhan" - family-names: "Luo" given-names: "Zheyan" + - family-names: "Feng" + given-names: "Zhangchi" - family-names: "Ma" given-names: "Yongqiang" - journal: "arXiv preprint arXiv:2403.13372" + booktitle: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)" title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" url: "https://arxiv.org/abs/2403.13372" year: 2024 + publisher: "Association for Computational Linguistics" + address: "Bangkok, Thailand" diff --git a/README.md b/README.md index 0e8e55f7..9462964c 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE) [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-44-green)](#projects-using-llama-factory) 
+[![Citation](https://img.shields.io/badge/citation-63-green)](#projects-using-llama-factory) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -15,7 +15,7 @@ [![GitHub Tread](https://trendshift.io/api/badge/repositories/4535)](https://trendshift.io/repositories/4535) -👋 Join our [WeChat](assets/wechat.jpg). +👋 Join our [WeChat](assets/wechat.jpg) or [NPU user group](assets/wechat_npu.jpg). \[ English | [中文](README_zh.md) \] @@ -360,8 +360,6 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
For Ascend NPU users -Join [NPU user group](assets/wechat_npu.jpg). - To install LLaMA Factory on Ascend NPU devices, please specify extra dependencies: `pip install -e '.[torch-npu,metrics]'`. Additionally, you need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**. Please follow the [installation tutorial](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html) or use the following commands: ```bash @@ -503,38 +501,55 @@ If you have a project that should be incorporated, please contact via email or c 1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526) 1. Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. [[arxiv]](https://arxiv.org/abs/2311.07816) 1. Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. [[arxiv]](https://arxiv.org/abs/2312.15710) -1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) -1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) +1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. KDD 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) +1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) 1. Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2402.05904) 1. Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. [[arxiv]](https://arxiv.org/abs/2402.07625) 1. Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11176) 1. Yang et al. LaCo: Large Language Model Pruning via Layer Collaps. 2024. [[arxiv]](https://arxiv.org/abs/2402.11187) 1. Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. [[arxiv]](https://arxiv.org/abs/2402.11746) 1. Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11801) -1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. 2024. [[arxiv]](https://arxiv.org/abs/2402.11809) +1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2402.11809) 1. Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11819) 1. Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. [[arxiv]](https://arxiv.org/abs/2402.12204) 1. Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. 
[[arxiv]](https://arxiv.org/abs/2402.14714) -1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.15043) +1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. ACL 2024. [[arxiv]](https://arxiv.org/abs/2402.15043) 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333) 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419) 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228) 1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073) 1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541) 1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246) -1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) +1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. COLING 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) 1. Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443) 1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604) 1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827) 1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167) -1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) +1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. ICML 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) 1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084) 1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836) 1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581) 1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215) 1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621) -1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2404.17140) -1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Zhang et al. 
Small Language Models Need Strong Verifiers to Self-Correct Reasoning. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2404.17140) +1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. NAACL 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Xu et al. Large Language Models for Cyber Security: A Systematic Literature Review. 2024. [[arxiv]](https://arxiv.org/abs/2405.04760) +1. Dammu et al. "They are uncultured": Unveiling Covert Harms and Social Threats in LLM Generated Conversations. 2024. [[arxiv]](https://arxiv.org/abs/2405.05378) +1. Yi et al. A safety realignment framework via subspace-oriented model fusion for large language models. 2024. [[arxiv]](https://arxiv.org/abs/2405.09055) +1. Lou et al. SPO: Multi-Dimensional Preference Sequential Alignment With Implicit Reward Modeling. 2024. [[arxiv]](https://arxiv.org/abs/2405.12739) +1. Zhang et al. Getting More from Less: Large Language Models are Good Spontaneous Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2405.13816) +1. Zhang et al. TS-Align: A Teacher-Student Collaborative Framework for Scalable Iterative Finetuning of Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2405.20215) +1. Zihong Chen. Sentence Segmentation and Sentence Punctuation Based on XunziALLM. 2024. [[paper]](https://aclanthology.org/2024.lt4hala-1.30) +1. Gao et al. The Best of Both Worlds: Toward an Honest and Helpful Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2406.00380) +1. Wang and Song. MARS: Benchmarking the Metaphysical Reasoning Abilities of Language Models with a Multi-task Evaluation Dataset. 2024. [[arxiv]](https://arxiv.org/abs/2406.02106) +1. Hu et al. Computational Limits of Low-Rank Adaptation (LoRA) for Transformer-Based Models. 2024. [[arxiv]](https://arxiv.org/abs/2406.03136) +1. Ge et al. Time Sensitive Knowledge Editing through Efficient Finetuning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2406.04496) +1. Tan et al. Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions. 2024. [[arxiv]](https://arxiv.org/abs/2406.05688) +1. Song et al. Turbo Sparse: Achieving LLM SOTA Performance with Minimal Activated Parameters. 2024. [[arxiv]](https://arxiv.org/abs/2406.05955) +1. Gu et al. RWKV-CLIP: A Robust Vision-Language Representation Learner. 2024. [[arxiv]](https://arxiv.org/abs/2406.06973) +1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. [[arxiv]](https://arxiv.org/abs/2406.07115) +1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) +1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B. 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge. 1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B. @@ -542,6 +557,8 @@ If you have a project that should be incorporated, please contact via email or c 1. 
**[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods. 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: A large language model specialized in generate metadata for stable diffusion. [[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) 1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**: A multimodal large language model specialized in Chinese medical domain, based on LLaVA-1.5-7B. +1. **[AutoRE](https://github.com/THUDM/AutoRE)**: A document-level relation extraction system based on large language models. +1. **[NVIDIA RTX AI Toolkit](https://github.com/NVIDIA/RTX-AI-Toolkit)**: SDKs for fine-tuning LLMs on Windows PC for NVIDIA RTX.
@@ -556,10 +573,12 @@ Please follow the model licenses to use the corresponding model weights: [Baichu If this work is helpful, please kindly cite as: ```bibtex -@article{zheng2024llamafactory, +@inproceedings{zheng2024llamafactory, title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, - author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Yongqiang Ma}, - journal={arXiv preprint arXiv:2403.13372}, + author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma}, + booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)}, + address={Bangkok, Thailand}, + publisher={Association for Computational Linguistics}, year={2024}, url={http://arxiv.org/abs/2403.13372} } diff --git a/README_zh.md b/README_zh.md index 152cd6fa..2073ce17 100644 --- a/README_zh.md +++ b/README_zh.md @@ -15,7 +15,7 @@ [![GitHub Tread](https://trendshift.io/api/badge/repositories/4535)](https://trendshift.io/repositories/4535) -👋 加入我们的[微信群](assets/wechat.jpg)。 +👋 加入我们的[微信群](assets/wechat.jpg)或 [NPU 用户群](assets/wechat_npu.jpg)。 \[ [English](README.md) | 中文 \] @@ -360,8 +360,6 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
昇腾 NPU 用户指南 -加入 [NPU 用户群](assets/wechat_npu.jpg)。 - 在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e '.[torch-npu,metrics]'` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: ```bash @@ -503,38 +501,55 @@ run_name: test_run # 可选 1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526) 1. Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. [[arxiv]](https://arxiv.org/abs/2311.07816) 1. Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. [[arxiv]](https://arxiv.org/abs/2312.15710) -1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) -1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) +1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. KDD 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) +1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) 1. Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2402.05904) 1. Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. [[arxiv]](https://arxiv.org/abs/2402.07625) 1. Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11176) 1. Yang et al. LaCo: Large Language Model Pruning via Layer Collaps. 2024. [[arxiv]](https://arxiv.org/abs/2402.11187) 1. Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. [[arxiv]](https://arxiv.org/abs/2402.11746) 1. Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11801) -1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. 2024. [[arxiv]](https://arxiv.org/abs/2402.11809) +1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2402.11809) 1. Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11819) 1. Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. [[arxiv]](https://arxiv.org/abs/2402.12204) 1. Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.14714) -1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. 2024. 
[[arxiv]](https://arxiv.org/abs/2402.15043) +1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. ACL 2024. [[arxiv]](https://arxiv.org/abs/2402.15043) 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333) 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419) 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228) 1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073) 1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541) 1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246) -1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) +1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. COLING 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) 1. Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443) 1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604) 1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827) 1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167) -1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) +1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. ICML 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) 1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084) 1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836) 1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581) 1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215) 1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621) -1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2404.17140) -1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2404.17140) +1. Zhou et al. 
FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. NAACL 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Xu et al. Large Language Models for Cyber Security: A Systematic Literature Review. 2024. [[arxiv]](https://arxiv.org/abs/2405.04760) +1. Dammu et al. "They are uncultured": Unveiling Covert Harms and Social Threats in LLM Generated Conversations. 2024. [[arxiv]](https://arxiv.org/abs/2405.05378) +1. Yi et al. A safety realignment framework via subspace-oriented model fusion for large language models. 2024. [[arxiv]](https://arxiv.org/abs/2405.09055) +1. Lou et al. SPO: Multi-Dimensional Preference Sequential Alignment With Implicit Reward Modeling. 2024. [[arxiv]](https://arxiv.org/abs/2405.12739) +1. Zhang et al. Getting More from Less: Large Language Models are Good Spontaneous Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2405.13816) +1. Zhang et al. TS-Align: A Teacher-Student Collaborative Framework for Scalable Iterative Finetuning of Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2405.20215) +1. Zihong Chen. Sentence Segmentation and Sentence Punctuation Based on XunziALLM. 2024. [[paper]](https://aclanthology.org/2024.lt4hala-1.30) +1. Gao et al. The Best of Both Worlds: Toward an Honest and Helpful Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2406.00380) +1. Wang and Song. MARS: Benchmarking the Metaphysical Reasoning Abilities of Language Models with a Multi-task Evaluation Dataset. 2024. [[arxiv]](https://arxiv.org/abs/2406.02106) +1. Hu et al. Computational Limits of Low-Rank Adaptation (LoRA) for Transformer-Based Models. 2024. [[arxiv]](https://arxiv.org/abs/2406.03136) +1. Ge et al. Time Sensitive Knowledge Editing through Efficient Finetuning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2406.04496) +1. Tan et al. Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions. 2024. [[arxiv]](https://arxiv.org/abs/2406.05688) +1. Song et al. Turbo Sparse: Achieving LLM SOTA Performance with Minimal Activated Parameters. 2024. [[arxiv]](https://arxiv.org/abs/2406.05955) +1. Gu et al. RWKV-CLIP: A Robust Vision-Language Representation Learner. 2024. [[arxiv]](https://arxiv.org/abs/2406.06973) +1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. [[arxiv]](https://arxiv.org/abs/2406.07115) +1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) +1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper,基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM,基于 Baichuan-13B 微调而得,具有法律推理和知识检索能力。 1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao,基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。 @@ -542,6 +557,8 @@ run_name: test_run # 可选 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**:MBTI性格大模型项目,根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**:一个用于生成 Stable Diffusion 提示词的大型语言模型。[[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) 1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**:中文多模态医学大模型,基于 LLaVA-1.5-7B 在中文多模态医疗数据上微调而得。 +1. 
**[AutoRE](https://github.com/THUDM/AutoRE)**:基于大语言模型的文档级关系抽取系统。 +1. **[NVIDIA RTX AI Toolkit](https://github.com/NVIDIA/RTX-AI-Toolkit)**: 在 Windows 主机上利用英伟达 RTX 设备进行大型语言模型微调的开发包。
@@ -556,10 +573,12 @@ run_name: test_run # 可选 如果您觉得此项目有帮助,请考虑以下列格式引用 ```bibtex -@article{zheng2024llamafactory, - title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, - author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Yongqiang Ma}, - journal={arXiv preprint arXiv:2403.13372}, +@inproceedings{zheng2024llamafactory, + title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, + author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma}, + booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)}, + address={Bangkok, Thailand}, + publisher={Association for Computational Linguistics}, year={2024}, url={http://arxiv.org/abs/2403.13372} } diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 36265c8e..866f39d4 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -579,7 +579,7 @@ register_model_group( register_model_group( models={ - "Jambda-v0.1": { + "Jamba-v0.1": { DownloadSource.DEFAULT: "ai21labs/Jamba-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Jamba-v0.1", } diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index df4a37be..38f4c6c8 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -202,18 +202,18 @@ class CustomPPOTrainer(PPOTrainer, Trainer): if self.is_world_process_zero(): logger.info("***** Running training *****") - logger.info(" Num examples = {}".format(num_examples)) - logger.info(" Num Epochs = {}".format(num_train_epochs)) - logger.info(" Instantaneous batch size per device = {}".format(self.args.per_device_train_batch_size)) + logger.info(" Num examples = {:,}".format(num_examples)) + logger.info(" Num Epochs = {:,}".format(num_train_epochs)) + logger.info(" Instantaneous batch size per device = {:,}".format(self.args.per_device_train_batch_size)) logger.info( - " Total train batch size (w. parallel, buffer, distributed & accumulation) = {}".format( + " Total train batch size (w. 
parallel, buffer, distributed & accumulation) = {:,}".format( total_train_batch_size ) ) - logger.info(" Gradient Accumulation steps = {}".format(self.args.gradient_accumulation_steps)) - logger.info(" Num optimization epochs per batch = {}".format(self.finetuning_args.ppo_epochs)) - logger.info(" Total training steps = {}".format(max_steps)) - logger.info(" Number of trainable parameters = {}".format(count_parameters(self.model)[0])) + logger.info(" Gradient Accumulation steps = {:,}".format(self.args.gradient_accumulation_steps)) + logger.info(" Num optimization epochs per batch = {:,}".format(self.finetuning_args.ppo_epochs)) + logger.info(" Total training steps = {:,}".format(max_steps)) + logger.info(" Number of trainable parameters = {:,}".format(count_parameters(self.model)[0])) dataiter = iter(self.dataloader) loss_meter = AverageMeter() From a44b46ac32417387be10ef76658d482f5cd9fc17 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 24 Jun 2024 18:29:04 +0800 Subject: [PATCH 095/162] update readme Former-commit-id: 0775d56ee3cfde34e28a48cbf4a583f4530def19 --- CITATION.cff | 3 ++- README_zh.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index a572b5fa..01b4c9fd 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,6 +20,8 @@ title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" url: "https://arxiv.org/abs/2403.13372" preferred-citation: type: conference-paper + conference: + name: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)" authors: - family-names: "Zheng" given-names: "Yaowei" @@ -35,7 +37,6 @@ preferred-citation: given-names: "Zhangchi" - family-names: "Ma" given-names: "Yongqiang" - booktitle: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)" title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" url: "https://arxiv.org/abs/2403.13372" year: 2024 diff --git a/README_zh.md b/README_zh.md index 2073ce17..8b77e91e 100644 --- a/README_zh.md +++ b/README_zh.md @@ -4,7 +4,7 @@ [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE) [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-44-green)](#使用了-llama-factory-的项目) +[![Citation](https://img.shields.io/badge/citation-63-green)](#使用了-llama-factory-的项目) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) From 154a3e5012895f425b71c751465f28413e9c1e94 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Mon, 24 Jun 2024 10:57:36 +0000 Subject: [PATCH 096/162] update docker files 1. add docker-npu (Dockerfile and docker-compose.yml) 2. 
move cuda docker to docker-cuda and tiny changes to adapt to the new path Former-commit-id: 5431c1f18aadb072208efe7fd8e36fdcfbf807c2 --- README.md | 61 +++++++++++++++++-- README_zh.md | 58 ++++++++++++++++-- Dockerfile => docker/docker-cuda/Dockerfile | 10 +-- .../docker-cuda/docker-compose.yml | 6 +- docker/docker-npu/Dockerfile | 40 ++++++++++++ docker/docker-npu/docker-compose.yml | 31 ++++++++++ 6 files changed, 187 insertions(+), 19 deletions(-) rename Dockerfile => docker/docker-cuda/Dockerfile (87%) rename docker-compose.yml => docker/docker-cuda/docker-compose.yml (80%) create mode 100644 docker/docker-npu/Dockerfile create mode 100644 docker/docker-npu/docker-compose.yml diff --git a/README.md b/README.md index 9462964c..1107ae0b 100644 --- a/README.md +++ b/README.md @@ -383,10 +383,11 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh | torch-npu | 2.1.0 | 2.1.0.post3 | | deepspeed | 0.13.2 | 0.13.2 | -Docker image: +Docker users please refer to [Build Docker](#Build-Docker). -- 32GB: [Download page](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) -- 64GB: [Download page](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) +**NOTE** + +The default docker image is [cosdt/cann:8.0.rc1-910b-ubuntu22.04](https://hub.docker.com/layers/cosdt/cann/8.0.rc1-910b-ubuntu22.04/images/sha256-29ef8aacf6b2babd292f06f00b9190c212e7c79a947411e213135e4d41a178a9?context=explore). More options can be found at [cosdt/cann](https://hub.docker.com/r/cosdt/cann/tags). Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use. @@ -426,7 +427,10 @@ llamafactory-cli webui #### Use Docker +
For NVIDIA GPU users: + ```bash +cd ./docker/docker-cuda docker build -f ./Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ @@ -435,18 +439,63 @@ docker build -f ./Dockerfile \ -t llamafactory:latest . docker run -it --gpus=all \ - -v ./hf_cache:/root/.cache/huggingface/ \ - -v ./data:/app/data \ - -v ./output:/app/output \ + -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ + -v /$(dirname $(dirname "$PWD"))/data:/app/data \ + -v /$(dirname $(dirname "$PWD"))/output:/app/output \ -p 7860:7860 \ -p 8000:8000 \ --shm-size 16G \ --name llamafactory \ llamafactory:latest ``` +
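The `$(dirname $(dirname "$PWD"))` expansion above is there because the build commands are now issued from `docker/docker-cuda/`, two levels below the repository root; climbing two `dirname`s lands back at the root, so the bind mounts keep pointing at the top-level `hf_cache`, `data` and `output` directories. A minimal sketch of the resolution (illustration only; assumes you start from the repository root):

```bash
#!/bin/sh
# Mirror the path expansion used by the -v flags above.
cd ./docker/docker-cuda
repo_root="$(dirname "$(dirname "$PWD")")"
echo "repo root: ${repo_root}"
echo "-v ${repo_root}/hf_cache:/root/.cache/huggingface/"
echo "-v ${repo_root}/data:/app/data"
echo "-v ${repo_root}/output:/app/output"
```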
+ +
For Ascend NPU users: + +```bash +cd ./docker/docker-npu +docker build -f ./Dockerfile \ + --build-arg INSTALL_DEEPSPEED=false \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + -t llamafactory:latest . + +# add --device for multi-npu usage +# or modify --device to change npu card +docker run -it \ + -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ + -v /$(dirname $(dirname "$PWD"))/data:/app/data \ + -v /$(dirname $(dirname "$PWD"))/output:/app/output \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -p 7860:7860 \ + -p 8000:8000 \ + --device /dev/davinci0 \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + --shm-size 16G \ + --name llamafactory \ + llamafactory:latest +``` +
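To make the in-line `--device` comment concrete: each NPU card appears as its own `/dev/davinciN` node, so a second card is passed through by repeating the flag, while the manager, SVM and HDC devices stay shared. The sketch below is a hypothetical smoke test; it assumes the image built above and a second card at `/dev/davinci1`, and it keeps the `npu-smi` and driver mounts from the full command so the tool can run:

```bash
docker run -it --rm \
  --device /dev/davinci0 \
  --device /dev/davinci1 \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/dcmi:/usr/local/dcmi \
  -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
  -v /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 \
  -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
  llamafactory:latest \
  npu-smi info  # should list both cards if passthrough worked
```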
#### Use Docker Compose +Firstly enter your docker path: + +```bash +# for NVIDIA GPU users +cd ./docker/docker-cuda + +# for Ascend NPU users +cd ./docker/docker-npu +``` + +Then run the following command to build docker image and start the container: + ```bash docker-compose up -d docker-compose exec llamafactory bash diff --git a/README_zh.md b/README_zh.md index 8b77e91e..6326c0b5 100644 --- a/README_zh.md +++ b/README_zh.md @@ -383,10 +383,11 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh | torch-npu | 2.1.0 | 2.1.0.post3 | | deepspeed | 0.13.2 | 0.13.2 | -Docker 镜像: +Docker用户请参考 [构建 Docker](#构建-Docker). -- 32GB:[下载地址](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) -- 64GB:[下载地址](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) +**NOTE** + +默认镜像为 [cosdt/cann:8.0.rc1-910b-ubuntu22.04](https://hub.docker.com/layers/cosdt/cann/8.0.rc1-910b-ubuntu22.04/images/sha256-29ef8aacf6b2babd292f06f00b9190c212e7c79a947411e213135e4d41a178a9?context=explore). 更多选择见 [cosdt/cann](https://hub.docker.com/r/cosdt/cann/tags). 请使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定运算设备。 @@ -426,7 +427,10 @@ llamafactory-cli webui #### 使用 Docker +
NVIDIA GPU 用户: + ```bash +cd ./docker/docker-cuda docker build -f ./Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ @@ -435,18 +439,60 @@ docker build -f ./Dockerfile \ -t llamafactory:latest . docker run -it --gpus=all \ - -v ./hf_cache:/root/.cache/huggingface/ \ - -v ./data:/app/data \ - -v ./output:/app/output \ + -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ + -v /$(dirname $(dirname "$PWD"))/data:/app/data \ + -v /$(dirname $(dirname "$PWD"))/output:/app/output \ -p 7860:7860 \ -p 8000:8000 \ --shm-size 16G \ --name llamafactory \ llamafactory:latest ``` +
+ +
Ascend NPU 用户: + +```bash +cd ./docker/docker-npu +docker build -f ./Dockerfile \ + --build-arg INSTALL_DEEPSPEED=false \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + -t llamafactory:latest . + +# 增加 --device 来使用多卡 NPU 或修改第一个 --device 来更改 NPU 卡 +docker run -it \ + -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ + -v /$(dirname $(dirname "$PWD"))/data:/app/data \ + -v /$(dirname $(dirname "$PWD"))/output:/app/output \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -p 7860:7860 \ + -p 8000:8000 \ + --device /dev/davinci0 \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + --shm-size 16G \ + --name llamafactory \ + llamafactory:latest +``` +
#### 使用 Docker Compose +首先进入 docker 目录: +```bash +# NVIDIA GPU 用户 +cd ./docker/docker-cuda + +# Ascend NPU 用户 +cd ./docker/docker-npu +``` +然后运行以下命令创建 docker 镜像并启动容器: + ```bash docker-compose up -d docker-compose exec llamafactory bash diff --git a/Dockerfile b/docker/docker-cuda/Dockerfile similarity index 87% rename from Dockerfile rename to docker/docker-cuda/Dockerfile index 61d58005..72797a26 100644 --- a/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -9,16 +9,18 @@ ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory -WORKDIR /app +WORKDIR /app/LLaMA-Factory + +RUN cd /app && \ + git config --global http.version HTTP/1.1 && \ + git clone https://github.com/hiyouga/LLaMA-Factory.git && \ + cd /app/LLaMA-Factory # Install the requirements -COPY requirements.txt /app/ RUN pip config set global.index-url $PIP_INDEX RUN python -m pip install --upgrade pip RUN python -m pip install -r requirements.txt -# Copy the rest of the application into the image -COPY . /app/ # Install the LLaMA Factory RUN EXTRA_PACKAGES="metrics"; \ diff --git a/docker-compose.yml b/docker/docker-cuda/docker-compose.yml similarity index 80% rename from docker-compose.yml rename to docker/docker-cuda/docker-compose.yml index c5dc34e9..a470aa60 100644 --- a/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -10,9 +10,9 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface/ - - ./data:/app/data - - ./output:/app/output + - ../../hf_cache:/root/.cache/huggingface/ + - ../../data:/app/LLaMA-Factory/data + - ../../output:/app/LLaMA-Factory/output ports: - "7860:7860" - "8000:8000" diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile new file mode 100644 index 00000000..9456bcbf --- /dev/null +++ b/docker/docker-npu/Dockerfile @@ -0,0 +1,40 @@ +# Using ubuntu 22.04 images with cann 8.0.rc1 +# More options can be found at https://hub.docker.com/r/cosdt/cann/tags +FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +# Define installation arguments +ARG INSTALL_DEEPSPEED=false +ARG PIP_INDEX=https://pypi.org/simple + +# Set the working directory +WORKDIR /app/LLaMA-Factory + +RUN cd /app && \ + git config --global http.version HTTP/1.1 && \ + git clone https://github.com/hiyouga/LLaMA-Factory.git && \ + cd /app/LLaMA-Factory + +RUN pip config set global.index-url $PIP_INDEX +RUN python3 -m pip install --upgrade pip + +# Install the LLaMA Factory +RUN EXTRA_PACKAGES="torch-npu,metrics"; \ + if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ + fi; \ + pip install -e .[$EXTRA_PACKAGES] && \ + pip uninstall -y transformer-engine flash-attn + +# Set up volumes +VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] + +# Expose port 7860 for the LLaMA Board +EXPOSE 7860 + +# Expose port 8000 for the API service +EXPOSE 8000 + +# Launch LLaMA Board +CMD [ "llamafactory-cli", "webui" ] diff --git a/docker/docker-npu/docker-compose.yml b/docker/docker-npu/docker-compose.yml new file mode 100644 index 00000000..93eb6718 --- /dev/null +++ b/docker/docker-npu/docker-compose.yml @@ -0,0 +1,31 @@ +services: + llamafactory: + build: + dockerfile: Dockerfile + context: . 
+ args: + INSTALL_DEEPSPEED: false + PIP_INDEX: https://pypi.org/simple + container_name: llamafactory + volumes: + - ../../hf_cache:/root/.cache/huggingface/ + - ../../data:/app/LLaMA-Factory/data + - ../../output:/app/LLaMA-Factory/output + - /usr/local/dcmi:/usr/local/dcmi + - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi + - /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 + - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info + - /etc/ascend_install.info:/etc/ascend_install.info + ports: + - "7860:7860" + - "8000:8000" + ipc: host + tty: true + stdin_open: true + command: bash + devices: + - /dev/davinci0 + - /dev/davinci_manager + - /dev/devmm_svm + - /dev/hisi_hdc + restart: unless-stopped From 3b36d005ad72d6a148067d342edaa8bec5746ec1 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Mon, 24 Jun 2024 12:27:00 +0000 Subject: [PATCH 097/162] auto-label npu issue Former-commit-id: d19c9eac783377151e58731723fb7cbb2dab3323 --- .github/workflows/label_issue.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml index b9a5543c..352b4b25 100644 --- a/.github/workflows/label_issue.yml +++ b/.github/workflows/label_issue.yml @@ -13,5 +13,18 @@ jobs: - env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} ISSUE_URL: ${{ github.event.issue.html_url }} + ISSUE_TITLE: "${{ github.event.issue.title }}" run: | gh issue edit $ISSUE_URL --add-label "pending" + + # auto-add label for npu + NPU_KEYWORDS=("npu" "ascend" "昇腾") + LABEL_NPU="npu" + ISSUE_TITLE_LOWER=$(echo "$ISSUE_TITLE" | tr '[:upper:]' '[:lower:]') + + for keyword in "${NPU_KEYWORDS[@]}"; do + if [[ "$ISSUE_TITLE_LOWER" == *"$keyword"* ]] && [[ "$ISSUE_TITLE_LOWER" != *"input"* ]]; then + gh issue edit "$ISSUE_URL" --add-label "$LABEL_NPU" + break + fi + done From 0bf750ade8c1c9a0b1dd819322c3191775277b92 Mon Sep 17 00:00:00 2001 From: stceum <50257864+stceum@users.noreply.github.com> Date: Mon, 24 Jun 2024 20:39:20 +0800 Subject: [PATCH 098/162] Bug Fix: `off` is parsed as `False` in yaml file, changed to `disabled` to avoid this. Former-commit-id: 171289d8e4c111fdca2b100282b64c74a04a4726 --- src/llamafactory/hparams/model_args.py | 2 +- src/llamafactory/hparams/parser.py | 4 ++++ src/llamafactory/model/model_utils/attention.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 996e9130..9b51c064 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -97,7 +97,7 @@ class ModelArguments: default=None, metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."}, ) - flash_attn: Literal["off", "sdpa", "fa2", "auto"] = field( + flash_attn: Literal["disabled", "sdpa", "fa2", "auto"] = field( default="auto", metadata={"help": "Enable FlashAttention for faster training and inference."}, ) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index a593bf45..9ef2d607 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -102,6 +102,10 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1: raise ValueError("Quantized model only accepts a single adapter. Merge them first.") + # In case that `flash_attn` is set to `off` in the yaml file, and parsed as `False` afterwards. 
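The comment above is worth pinning down with a repro. Under YAML 1.1 rules, which PyYAML's safe loader follows, a bare `off` scalar resolves to the boolean `False`, so the string `"off"` never reaches the argument parser at all; that is why the option value was renamed to `disabled`. A minimal demonstration (assumes PyYAML; not part of the patch):

```python
import yaml

print(yaml.safe_load("flash_attn: off"))       # {'flash_attn': False}, YAML 1.1 boolean
print(yaml.safe_load('flash_attn: "off"'))     # {'flash_attn': 'off'}, quoting avoids it
print(yaml.safe_load("flash_attn: disabled"))  # {'flash_attn': 'disabled'}, so does renaming
```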
+ if model_args.flash_attn == False: + raise ValueError("flash_attn should be \"disabled\", \"sdpa\", \"fa2\" or \"auto\".") + def _check_extra_dependencies( model_args: "ModelArguments", diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index 8ff3807b..dfd90936 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -32,7 +32,7 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "Model if model_args.flash_attn == "auto": return - elif model_args.flash_attn == "off": + elif model_args.flash_attn == "disabled": requested_attn_implementation = "eager" elif model_args.flash_attn == "sdpa": From 454315c6708c0aeec159e8e62d545426b9716365 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 21:35:34 +0800 Subject: [PATCH 099/162] Update test_attention.py Former-commit-id: c2cc7a0f152aa14fc03ae413f4a9dc06742a29d7 --- tests/model/model_utils/test_attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/model/model_utils/test_attention.py b/tests/model/model_utils/test_attention.py index 97ac9dcc..4cae3d7c 100644 --- a/tests/model/model_utils/test_attention.py +++ b/tests/model/model_utils/test_attention.py @@ -29,7 +29,7 @@ INFER_ARGS = { def test_attention(): - attention_available = ["off"] + attention_available = ["disabled"] if is_torch_sdpa_available(): attention_available.append("sdpa") @@ -37,7 +37,7 @@ def test_attention(): attention_available.append("fa2") llama_attention_classes = { - "off": "LlamaAttention", + "disabled": "LlamaAttention", "sdpa": "LlamaSdpaAttention", "fa2": "LlamaFlashAttention2", } From 1372049120abd75f3cce5049bbc78404bf003ed8 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 21:37:42 +0800 Subject: [PATCH 100/162] Update parser.py Former-commit-id: 60e605cd9d399bd04432864ede9c84302890eac8 --- src/llamafactory/hparams/parser.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 9ef2d607..a593bf45 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -102,10 +102,6 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1: raise ValueError("Quantized model only accepts a single adapter. Merge them first.") - # In case that `flash_attn` is set to `off` in the yaml file, and parsed as `False` afterwards. 
- if model_args.flash_attn == False: - raise ValueError("flash_attn should be \"disabled\", \"sdpa\", \"fa2\" or \"auto\".") - def _check_extra_dependencies( model_args: "ModelArguments", From 0fadf307598f4b034b4bcbcc0dbc046e6efaad33 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 21:59:39 +0800 Subject: [PATCH 101/162] Update label_issue.yml Former-commit-id: 90785a69c6210c3a02babb12c56fb7900095247c --- .github/workflows/label_issue.yml | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml index 352b4b25..507b6be7 100644 --- a/.github/workflows/label_issue.yml +++ b/.github/workflows/label_issue.yml @@ -13,18 +13,15 @@ jobs: - env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} ISSUE_URL: ${{ github.event.issue.html_url }} - ISSUE_TITLE: "${{ github.event.issue.title }}" + ISSUE_TITLE: ${{ github.event.issue.title }} run: | - gh issue edit $ISSUE_URL --add-label "pending" - - # auto-add label for npu - NPU_KEYWORDS=("npu" "ascend" "昇腾") - LABEL_NPU="npu" - ISSUE_TITLE_LOWER=$(echo "$ISSUE_TITLE" | tr '[:upper:]' '[:lower:]') - - for keyword in "${NPU_KEYWORDS[@]}"; do - if [[ "$ISSUE_TITLE_LOWER" == *"$keyword"* ]] && [[ "$ISSUE_TITLE_LOWER" != *"input"* ]]; then - gh issue edit "$ISSUE_URL" --add-label "$LABEL_NPU" + LABEL=pending + NPU_KEYWORDS=(npu ascend 昇腾) + ISSUE_TITLE_LOWER=$(echo $ISSUE_TITLE | tr '[:upper:]' '[:lower:]') + for KEYWORD in ${NPU_KEYWORDS[@]}; do + if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]] then + LABEL=pending,npu break fi done + gh issue edit $ISSUE_URL --add-label $LABEL From ec99274fd25d7bb5ff8dd0bf831203d24929df3f Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 22:01:23 +0800 Subject: [PATCH 102/162] Update label_issue.yml Former-commit-id: dc2f7998b4ae9d7223c7c16732d835cea2a28713 --- .github/workflows/label_issue.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml index 507b6be7..0e10f0b9 100644 --- a/.github/workflows/label_issue.yml +++ b/.github/workflows/label_issue.yml @@ -19,7 +19,7 @@ jobs: NPU_KEYWORDS=(npu ascend 昇腾) ISSUE_TITLE_LOWER=$(echo $ISSUE_TITLE | tr '[:upper:]' '[:lower:]') for KEYWORD in ${NPU_KEYWORDS[@]}; do - if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]] then + if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]]; then LABEL=pending,npu break fi From a27d4bb4be4c61b8347a539504b15ac32b94344d Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 24 Jun 2024 22:34:31 +0800 Subject: [PATCH 103/162] fix #4410 Former-commit-id: f49adc4ab5eade21d7a9e029212f17688ee9b0cf --- src/llamafactory/data/formatter.py | 4 ++-- src/llamafactory/hparams/model_args.py | 2 +- src/llamafactory/model/patcher.py | 6 +++--- src/llamafactory/webui/chatter.py | 1 + src/llamafactory/webui/components/infer.py | 17 ++++++++++++++--- src/llamafactory/webui/locales.py | 11 +++++++++++ 6 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index ed9ba8b8..b5dc57ff 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -34,8 +34,8 @@ DEFAULT_TOOL_PROMPT = ( GLM4_TOOL_PROMPT = ( - "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," - "你的任务是针对用户的问题和要求提供适当的答复和支持。{tool_text}" + "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 
模型开发的," + "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具{tool_text}" ) diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 9b51c064..3f21145d 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -97,7 +97,7 @@ class ModelArguments: default=None, metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."}, ) - flash_attn: Literal["disabled", "sdpa", "fa2", "auto"] = field( + flash_attn: Literal["auto", "disabled", "sdpa", "fa2"] = field( default="auto", metadata={"help": "Enable FlashAttention for faster training and inference."}, ) diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 35153649..24cd2601 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -58,10 +58,10 @@ def patch_config( is_trainable: bool, ) -> None: if model_args.compute_dtype is None: # priority: bf16 > fp16 > fp32 - if model_args.infer_dtype == "auto": - model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None)) - else: + if model_args.infer_dtype != "auto" and not is_trainable: model_args.compute_dtype = getattr(torch, model_args.infer_dtype) + else: + model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None)) if is_torch_npu_available(): use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"] diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py index a2b54dce..652c341c 100644 --- a/src/llamafactory/webui/chatter.py +++ b/src/llamafactory/webui/chatter.py @@ -87,6 +87,7 @@ class WebChatModel(ChatModel): visual_inputs=get("top.visual_inputs"), rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, infer_backend=get("infer.infer_backend"), + infer_dtype=get("infer.infer_dtype"), ) if checkpoint_path: diff --git a/src/llamafactory/webui/components/infer.py b/src/llamafactory/webui/components/infer.py index 03bccd7f..a0064479 100644 --- a/src/llamafactory/webui/components/infer.py +++ b/src/llamafactory/webui/components/infer.py @@ -32,15 +32,26 @@ def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]: input_elems = engine.manager.get_base_elems() elem_dict = dict() - infer_backend = gr.Dropdown(choices=["huggingface", "vllm"], value="huggingface") + with gr.Row(): + infer_backend = gr.Dropdown(choices=["huggingface", "vllm"], value="huggingface") + infer_dtype = gr.Dropdown(choices=["auto", "float16", "bfloat16", "float32"], value="auto") + with gr.Row(): load_btn = gr.Button() unload_btn = gr.Button() info_box = gr.Textbox(show_label=False, interactive=False) - input_elems.update({infer_backend}) - elem_dict.update(dict(infer_backend=infer_backend, load_btn=load_btn, unload_btn=unload_btn, info_box=info_box)) + input_elems.update({infer_backend, infer_dtype}) + elem_dict.update( + dict( + infer_backend=infer_backend, + infer_dtype=infer_dtype, + load_btn=load_btn, + unload_btn=unload_btn, + info_box=info_box, + ) + ) chatbot, messages, chat_elems = create_chat_box(engine, visible=False) elem_dict.update(chat_elems) diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index 8e8d6fce..cd166584 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -1206,6 +1206,17 @@ LOCALES = { "label": "推理引擎", }, }, + "infer_dtype": { + "en": { + "label": "Inference data type", + }, + "ru": { + "label": "Тип данных 
для вывода", + }, + "zh": { + "label": "推理数据类型", + }, + }, "load_btn": { "en": { "value": "Load model", From 70410aedc178fa47dec85bede84920a26fd90817 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 23:06:18 +0800 Subject: [PATCH 104/162] Update loader.py Former-commit-id: afa59d61844595e6b615227e6bfdc0b16c8015dd --- src/llamafactory/data/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index f44ef5de..8e7062db 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -148,7 +148,7 @@ def get_dataset( tokenizer: "PreTrainedTokenizer", processor: Optional["ProcessorMixin"] = None, ) -> Union["Dataset", "IterableDataset"]: - template = get_template_and_fix_tokenizer(tokenizer, data_args.template) + template = get_template_and_fix_tokenizer(tokenizer, data_args.template, data_args.tool_format) if data_args.train_on_prompt and template.efficient_eos: raise ValueError("Current template does not support `train_on_prompt`.") From d9529b749ed563c256652b7c42bc515a89fc2368 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 23:12:59 +0800 Subject: [PATCH 105/162] Update template.py Former-commit-id: d53517bff6f8734221d7df9982f3bdd4d2eb2cab --- src/llamafactory/data/template.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 3d8ded3b..3a72a858 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -664,6 +664,7 @@ _register_template( format_system=StringFormatter(slots=["<|system|>\n{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), + format_tools=ToolFormatter(tool_format="glm4"), format_prefix=EmptyFormatter(slots=["[gMASK]"]), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, From 14e7f0188c578083c5f9de970458bfc5d0114a5b Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 23:14:36 +0800 Subject: [PATCH 106/162] Update test_formatter.py Former-commit-id: d13ef043441734189b05e739dbbebb16077a6f0b --- tests/data/test_formatter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/data/test_formatter.py b/tests/data/test_formatter.py index 430eb0e6..a01e8a7e 100644 --- a/tests/data/test_formatter.py +++ b/tests/data/test_formatter.py @@ -111,9 +111,9 @@ def test_glm4_tool_formatter(): } ] assert formatter.apply(content=json.dumps(tools)) == [ - "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," - "你的任务是针对用户的问题和要求提供适当的答复和支持。" - "\n\n## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( + "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," + "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具\n\n" + "## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( json.dumps(tools[0], indent=4) ) ] From c78db25a2410ad5e7d5c6b50a2a63450e66dbaab Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 23:21:10 +0800 Subject: [PATCH 107/162] Update cli.py Former-commit-id: 9db6126496ec9e834541823715f700f92b3968c7 --- src/llamafactory/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py index af9dd5f5..e4846780 100644 --- a/src/llamafactory/cli.py +++ b/src/llamafactory/cli.py @@ -74,7 +74,7 @@ class Command(str, Enum): def main(): - command = sys.argv.pop(1) if len(sys.argv) > 0 else Command.HELP + 
command = sys.argv.pop(1) if len(sys.argv) != 1 else Command.HELP if command == Command.API: run_api() elif command == Command.CHAT: From ae8866415e136c08fa20028b527a5a0c36eeccb3 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 23:41:35 +0800 Subject: [PATCH 108/162] Update Dockerfile Former-commit-id: 632681d8ece0eaac59bb364d971435a3bc6665a9 --- docker/docker-cuda/Dockerfile | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 72797a26..61d58005 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -9,18 +9,16 @@ ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory -WORKDIR /app/LLaMA-Factory - -RUN cd /app && \ - git config --global http.version HTTP/1.1 && \ - git clone https://github.com/hiyouga/LLaMA-Factory.git && \ - cd /app/LLaMA-Factory +WORKDIR /app # Install the requirements +COPY requirements.txt /app/ RUN pip config set global.index-url $PIP_INDEX RUN python -m pip install --upgrade pip RUN python -m pip install -r requirements.txt +# Copy the rest of the application into the image +COPY . /app/ # Install the LLaMA Factory RUN EXTRA_PACKAGES="metrics"; \ From 819b4a1168d6a6bc0cf9889082742cb5792eb050 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 00:46:08 +0800 Subject: [PATCH 109/162] Update Dockerfile Former-commit-id: c897a70501707c0f4c432bb8e9a9beeb4e8953a3 --- docker/docker-cuda/Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 61d58005..2d20bfe4 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -12,13 +12,14 @@ ARG PIP_INDEX=https://pypi.org/simple WORKDIR /app # Install the requirements -COPY requirements.txt /app/ +COPY requirements.txt /app RUN pip config set global.index-url $PIP_INDEX +RUN pip config set global.extra-index-url $PIP_INDEX RUN python -m pip install --upgrade pip RUN python -m pip install -r requirements.txt # Copy the rest of the application into the image -COPY . /app/ +COPY . /app # Install the LLaMA Factory RUN EXTRA_PACKAGES="metrics"; \ @@ -38,10 +39,9 @@ RUN EXTRA_PACKAGES="metrics"; \ VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] # Expose port 7860 for the LLaMA Board +ENV GRADIO_SERVER_PORT 7860 EXPOSE 7860 # Expose port 8000 for the API service +ENV API_PORT 8000 EXPOSE 8000 - -# Launch LLaMA Board -CMD [ "llamafactory-cli", "webui" ] From bd27a250f85a7c607cbeaff9490706961f82c57f Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 00:46:47 +0800 Subject: [PATCH 110/162] Update docker-compose.yml Former-commit-id: 56af208074e6af5465183af85367e7edd89d5aa6 --- docker/docker-cuda/docker-compose.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index a470aa60..04d6531a 100644 --- a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -1,8 +1,8 @@ services: llamafactory: build: - dockerfile: Dockerfile - context: . + dockerfile: ./docker/docker-cuda/Dockerfile + context: ../.. 
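The two path edits above are coupled: with the Dockerfile now under `docker/docker-cuda/`, the build context must stay at the repository root (`../..` relative to the compose file) so that `COPY . /app` in the Dockerfile still sees the project sources, and the `dockerfile:` path is in turn resolved against that context. A standalone sketch of the pairing (illustrative, not the project file):

```yaml
services:
  app:
    build:
      context: ../..                               # repository root
      dockerfile: ./docker/docker-cuda/Dockerfile  # resolved relative to context
```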
args: INSTALL_BNB: false INSTALL_VLLM: false @@ -10,9 +10,9 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ../../hf_cache:/root/.cache/huggingface/ - - ../../data:/app/LLaMA-Factory/data - - ../../output:/app/LLaMA-Factory/output + - ./hf_cache:/root/.cache/huggingface/ + - ./data:/app/data + - ./output:/app/output ports: - "7860:7860" - "8000:8000" From 603b5bc427e9aa9ce5c075eba0baa29b9174541c Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 00:50:34 +0800 Subject: [PATCH 111/162] Update Dockerfile Former-commit-id: cdcd9455c19311394e148476a28ca75849c845b2 --- docker/docker-npu/Dockerfile | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 9456bcbf..0fdd4472 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -1,5 +1,5 @@ -# Using ubuntu 22.04 images with cann 8.0.rc1 -# More options can be found at https://hub.docker.com/r/cosdt/cann/tags +# Use the Ubuntu 22.04 image with CANN 8.0.rc1 +# More versions can be found at https://hub.docker.com/r/cosdt/cann/tags FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 ENV DEBIAN_FRONTEND=noninteractive @@ -9,15 +9,17 @@ ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory -WORKDIR /app/LLaMA-Factory - -RUN cd /app && \ - git config --global http.version HTTP/1.1 && \ - git clone https://github.com/hiyouga/LLaMA-Factory.git && \ - cd /app/LLaMA-Factory +WORKDIR /app +# Install the requirements +COPY requirements.txt /app RUN pip config set global.index-url $PIP_INDEX -RUN python3 -m pip install --upgrade pip +RUN pip config set global.extra-index-url $PIP_INDEX +RUN python -m pip install --upgrade pip +RUN python -m pip install -r requirements.txt + +# Copy the rest of the application into the image +COPY . /app # Install the LLaMA Factory RUN EXTRA_PACKAGES="torch-npu,metrics"; \ @@ -31,10 +33,9 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \ VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] # Expose port 7860 for the LLaMA Board +ENV GRADIO_SERVER_PORT 7860 EXPOSE 7860 # Expose port 8000 for the API service +ENV API_PORT 8000 EXPOSE 8000 - -# Launch LLaMA Board -CMD [ "llamafactory-cli", "webui" ] From 64ec82262f7184c475283c4fff6c385b868748bf Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 00:54:28 +0800 Subject: [PATCH 112/162] Update docker-compose.yml Former-commit-id: e038daf8dfa5d948b70c18469cb5a0be9aec464a --- docker/docker-npu/docker-compose.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docker/docker-npu/docker-compose.yml b/docker/docker-npu/docker-compose.yml index 93eb6718..7fff6e73 100644 --- a/docker/docker-npu/docker-compose.yml +++ b/docker/docker-npu/docker-compose.yml @@ -1,20 +1,19 @@ services: llamafactory: build: - dockerfile: Dockerfile - context: . + dockerfile: ./docker/docker-npu/Dockerfile + context: ../.. 
args: INSTALL_DEEPSPEED: false PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ../../hf_cache:/root/.cache/huggingface/ - - ../../data:/app/LLaMA-Factory/data - - ../../output:/app/LLaMA-Factory/output + - ./hf_cache:/root/.cache/huggingface/ + - ./data:/app/data + - ./output:/app/output - /usr/local/dcmi:/usr/local/dcmi - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi - - /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 - - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info + - /usr/local/Ascend/driver:/usr/local/Ascend/driver - /etc/ascend_install.info:/etc/ascend_install.info ports: - "7860:7860" From a4fc5a7ef26ba3969f986b88d6e631b871258935 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 01:03:38 +0800 Subject: [PATCH 113/162] Update README.md Former-commit-id: abe7aca5e133960da9200e3a036d9a550f474171 --- README.md | 84 ++++++++++++++++++++++++------------------------------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 1107ae0b..a20b848b 100644 --- a/README.md +++ b/README.md @@ -383,12 +383,6 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh | torch-npu | 2.1.0 | 2.1.0.post3 | | deepspeed | 0.13.2 | 0.13.2 | -Docker users please refer to [Build Docker](#Build-Docker). - -**NOTE** - -The default docker image is [cosdt/cann:8.0.rc1-910b-ubuntu22.04](https://hub.docker.com/layers/cosdt/cann/8.0.rc1-910b-ubuntu22.04/images/sha256-29ef8aacf6b2babd292f06f00b9190c212e7c79a947411e213135e4d41a178a9?context=explore). More options can be found at [cosdt/cann](https://hub.docker.com/r/cosdt/cann/tags). - Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use. If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations. @@ -425,50 +419,62 @@ llamafactory-cli webui ### Build Docker -#### Use Docker - -
For NVIDIA GPU users: +For CUDA users: ```bash -cd ./docker/docker-cuda -docker build -f ./Dockerfile \ +docker-compose -f ./docker/docker-cuda/docker-compose.yml up -d +docker-compose exec llamafactory bash +``` + +For Ascend NPU users: + +```bash +docker-compose -f ./docker/docker-npu/docker-compose.yml up -d +docker-compose exec llamafactory bash +``` + +
Build without Docker Compose + +For CUDA users: + +```bash +docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . -docker run -it --gpus=all \ - -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ - -v /$(dirname $(dirname "$PWD"))/data:/app/data \ - -v /$(dirname $(dirname "$PWD"))/output:/app/output \ +docker run -dit --gpus=all \ + -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./data:/app/data \ + -v ./output:/app/output \ -p 7860:7860 \ -p 8000:8000 \ --shm-size 16G \ --name llamafactory \ llamafactory:latest -``` -
-
For Ascend NPU users: +docker exec -it llamafactory bash +``` + +For Ascend NPU users: ```bash -cd ./docker/docker-npu -docker build -f ./Dockerfile \ +# Change docker image upon your environment +docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . -# add --device for multi-npu usage -# or modify --device to change npu card -docker run -it \ - -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ - -v /$(dirname $(dirname "$PWD"))/data:/app/data \ - -v /$(dirname $(dirname "$PWD"))/output:/app/output \ +# Change `device` upon your resources +docker run -dit \ + -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./data:/app/data \ + -v ./output:/app/output \ -v /usr/local/dcmi:/usr/local/dcmi \ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ - -v /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 \ - -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ -v /etc/ascend_install.info:/etc/ascend_install.info \ -p 7860:7860 \ -p 8000:8000 \ @@ -479,28 +485,12 @@ docker run -it \ --shm-size 16G \ --name llamafactory \ llamafactory:latest + +docker exec -it llamafactory bash ``` +
-#### Use Docker Compose - -Firstly enter your docker path: - -```bash -# for NVIDIA GPU users -cd ./docker/docker-cuda - -# for Ascend NPU users -cd ./docker/docker-npu -``` - -Then run the following command to build docker image and start the container: - -```bash -docker-compose up -d -docker-compose exec llamafactory bash -``` -
Details about volume - hf_cache: Utilize Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory. From 9582a10106b10546196f1157ebe66ecc14b302f4 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 01:06:59 +0800 Subject: [PATCH 114/162] Update README_zh.md Former-commit-id: f0c95160fea48b8c6291f42beb79ac089177fbb2 --- README_zh.md | 83 ++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 45 deletions(-) diff --git a/README_zh.md b/README_zh.md index 6326c0b5..3bed0846 100644 --- a/README_zh.md +++ b/README_zh.md @@ -360,7 +360,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
昇腾 NPU 用户指南 -在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e '.[torch-npu,metrics]'` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: +在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e ".[torch-npu,metrics]"` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: ```bash # 请替换 URL 为 CANN 版本和设备型号对应的 URL @@ -383,12 +383,6 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh | torch-npu | 2.1.0 | 2.1.0.post3 | | deepspeed | 0.13.2 | 0.13.2 | -Docker用户请参考 [构建 Docker](#构建-Docker). - -**NOTE** - -默认镜像为 [cosdt/cann:8.0.rc1-910b-ubuntu22.04](https://hub.docker.com/layers/cosdt/cann/8.0.rc1-910b-ubuntu22.04/images/sha256-29ef8aacf6b2babd292f06f00b9190c212e7c79a947411e213135e4d41a178a9?context=explore). 更多选择见 [cosdt/cann](https://hub.docker.com/r/cosdt/cann/tags). - 请使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定运算设备。 如果遇到无法正常推理的情况,请尝试设置 `do_sample: false`。 @@ -425,49 +419,62 @@ llamafactory-cli webui ### 构建 Docker -#### 使用 Docker - -
NVIDIA GPU 用户: +CUDA 用户: ```bash -cd ./docker/docker-cuda -docker build -f ./Dockerfile \ +docker-compose -f ./docker/docker-cuda/docker-compose.yml up -d +docker-compose exec llamafactory bash +``` + +昇腾 NPU 用户: + +```bash +docker-compose -f ./docker/docker-npu/docker-compose.yml up -d +docker-compose exec llamafactory bash +``` + +
不使用 Docker Compose 构建 + +CUDA 用户: + +```bash +docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . -docker run -it --gpus=all \ - -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ - -v /$(dirname $(dirname "$PWD"))/data:/app/data \ - -v /$(dirname $(dirname "$PWD"))/output:/app/output \ +docker run -dit --gpus=all \ + -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./data:/app/data \ + -v ./output:/app/output \ -p 7860:7860 \ -p 8000:8000 \ --shm-size 16G \ --name llamafactory \ llamafactory:latest -``` -
-
Ascend NPU 用户: +docker exec -it llamafactory bash +``` + +昇腾 NPU 用户: ```bash -cd ./docker/docker-npu -docker build -f ./Dockerfile \ +# 根据您的环境选择镜像 +docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . -# 增加 --device 来使用多卡 NPU 或修改第一个 --device 来更改 NPU 卡 -docker run -it \ - -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ - -v /$(dirname $(dirname "$PWD"))/data:/app/data \ - -v /$(dirname $(dirname "$PWD"))/output:/app/output \ +# 根据您的资源更改 `device` +docker run -dit \ + -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./data:/app/data \ + -v ./output:/app/output \ -v /usr/local/dcmi:/usr/local/dcmi \ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ - -v /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 \ - -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ -v /etc/ascend_install.info:/etc/ascend_install.info \ -p 7860:7860 \ -p 8000:8000 \ @@ -478,26 +485,12 @@ docker run -it \ --shm-size 16G \ --name llamafactory \ llamafactory:latest + +docker exec -it llamafactory bash ``` +
-#### 使用 Docker Compose - -首先进入 docker 目录: -```bash -# NVIDIA GPU 用户 -cd ./docker/docker-cuda - -# Ascend NPU 用户 -cd ./docker/docker-npu -``` -然后运行以下命令创建 docker 镜像并启动容器: - -```bash -docker-compose up -d -docker-compose exec llamafactory bash -``` -
数据卷详情 - hf_cache:使用宿主机的 Hugging Face 缓存文件夹,允许更改为新的目录。 From f9d00ce0515254c1d0e5c29193929a5d54172b26 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 01:15:19 +0800 Subject: [PATCH 115/162] tiny fix Former-commit-id: bb57478366a70a0871af30ab31c890f471e27ff4 --- .dockerignore | 5 +++-- README.md | 10 ++++++---- README_zh.md | 6 ++++-- src/llamafactory/data/formatter.py | 2 +- src/llamafactory/data/template.py | 6 +++--- src/llamafactory/hparams/data_args.py | 8 ++++---- src/llamafactory/webui/components/train.py | 2 +- src/llamafactory/webui/runner.py | 2 +- 8 files changed, 23 insertions(+), 18 deletions(-) diff --git a/.dockerignore b/.dockerignore index 2ac0e11d..75cd2209 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,10 +4,11 @@ .venv cache data +docker +examples +saves hf_cache output -examples .dockerignore .gitattributes .gitignore -Dockerfile diff --git a/README.md b/README.md index a20b848b..e1b45236 100644 --- a/README.md +++ b/README.md @@ -360,7 +360,7 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
For Ascend NPU users -To install LLaMA Factory on Ascend NPU devices, please specify extra dependencies: `pip install -e '.[torch-npu,metrics]'`. Additionally, you need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**. Please follow the [installation tutorial](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html) or use the following commands: +To install LLaMA Factory on Ascend NPU devices, please specify extra dependencies: `pip install -e ".[torch-npu,metrics]"`. Additionally, you need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**. Please follow the [installation tutorial](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html) or use the following commands: ```bash # replace the url according to your CANN version and devices @@ -422,14 +422,16 @@ llamafactory-cli webui For CUDA users: ```bash -docker-compose -f ./docker/docker-cuda/docker-compose.yml up -d +cd docker/docker-cuda/ +docker-compose up -d docker-compose exec llamafactory bash ``` For Ascend NPU users: ```bash -docker-compose -f ./docker/docker-npu/docker-compose.yml up -d +cd docker/docker-npu/ +docker-compose up -d docker-compose exec llamafactory bash ``` @@ -461,7 +463,7 @@ docker exec -it llamafactory bash For Ascend NPU users: ```bash -# Change docker image upon your environment +# Choose docker image upon your environment docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ diff --git a/README_zh.md b/README_zh.md index 3bed0846..32edb1f7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -422,14 +422,16 @@ llamafactory-cli webui CUDA 用户: ```bash -docker-compose -f ./docker/docker-cuda/docker-compose.yml up -d +cd docker/docker-cuda/ +docker-compose up -d docker-compose exec llamafactory bash ``` 昇腾 NPU 用户: ```bash -docker-compose -f ./docker/docker-npu/docker-compose.yml up -d +cd docker/docker-npu/ +docker-compose up -d docker-compose exec llamafactory bash ``` diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index b5dc57ff..88ebf682 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -216,7 +216,7 @@ class ToolFormatter(Formatter): self._tool_formatter = glm4_tool_formatter self._tool_extractor = glm4_tool_extractor else: - raise ValueError("Tool format was not found.") + raise NotImplementedError("Tool format {} was not found.".format(self.tool_format)) def apply(self, **kwargs) -> SLOTS: content = kwargs.pop("content") diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 3a72a858..193ff482 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -387,8 +387,9 @@ def get_template_and_fix_tokenizer( template = TEMPLATES.get(name, None) if template is None: raise ValueError("Template {} does not exist.".format(name)) - - if tool_format: + + if tool_format is not None: + logger.info("Using tool format: {}.".format(tool_format)) template.format_tools = ToolFormatter(tool_format=tool_format) stop_words = template.stop_words @@ -625,7 +626,6 @@ _register_template( _register_template( name="empty", - format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, ) diff --git 
a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 959742e3..dad13820 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -29,10 +29,6 @@ class DataArguments: default=None, metadata={"help": "Which template to use for constructing prompts in training and inference."}, ) - tool_format: Optional[str] = field( - default=None, - metadata={"help": "Specifies the tool format template for function calling ."}, - ) dataset: Optional[str] = field( default=None, metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."}, @@ -105,6 +101,10 @@ class DataArguments: "help": "Whether or not to pack the sequences in training. Will automatically enable in pre-training." }, ) + tool_format: Optional[str] = field( + default=None, + metadata={"help": "Tool format to use for constructing function calling examples."}, + ) tokenized_path: Optional[str] = field( default=None, metadata={"help": "Path to save or load the tokenized datasets."}, diff --git a/src/llamafactory/webui/components/train.py b/src/llamafactory/webui/components/train.py index 874f3c5e..f33c37ee 100644 --- a/src/llamafactory/webui/components/train.py +++ b/src/llamafactory/webui/components/train.py @@ -291,7 +291,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: with gr.Column(scale=1): loss_viewer = gr.Plot() - input_elems.update({output_dir, config_path, device_count, ds_stage, ds_offload}) + input_elems.update({output_dir, config_path, ds_stage, ds_offload}) elem_dict.update( dict( cmd_preview_btn=cmd_preview_btn, diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 6cd21b07..549ec765 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -306,7 +306,7 @@ class Runner: def _form_config_dict(self, data: Dict["Component", Any]) -> Dict[str, Any]: config_dict = {} - skip_ids = ["top.lang", "top.model_path", "train.output_dir", "train.config_path", "train.device_count"] + skip_ids = ["top.lang", "top.model_path", "train.output_dir", "train.config_path"] for elem, value in data.items(): elem_id = self.manager.get_id_by_elem(elem) if elem_id not in skip_ids: From 106e9d56288cb6bdbce567612a86bb71df3b603d Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 01:51:29 +0800 Subject: [PATCH 116/162] fix #4419 Former-commit-id: 15069c3ca814d5ac9beec77d914b71cde7ea0f47 --- README.md | 6 ++++-- README_zh.md | 6 ++++-- docker/docker-cuda/Dockerfile | 2 +- docker/docker-cuda/docker-compose.yml | 3 ++- docker/docker-npu/Dockerfile | 2 +- docker/docker-npu/docker-compose.yml | 3 ++- 6 files changed, 14 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e1b45236..4b42edd7 100644 --- a/README.md +++ b/README.md @@ -448,7 +448,8 @@ docker build -f ./docker/docker-cuda/Dockerfile \ -t llamafactory:latest . 
docker run -dit --gpus=all \ - -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./hf_cache:/root/.cache/huggingface \ + -v ./ms_cache:/root/.cache/modelscope \ -v ./data:/app/data \ -v ./output:/app/output \ -p 7860:7860 \ @@ -471,7 +472,8 @@ docker build -f ./docker/docker-npu/Dockerfile \ # Change `device` upon your resources docker run -dit \ - -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./hf_cache:/root/.cache/huggingface \ + -v ./ms_cache:/root/.cache/modelscope \ -v ./data:/app/data \ -v ./output:/app/output \ -v /usr/local/dcmi:/usr/local/dcmi \ diff --git a/README_zh.md b/README_zh.md index 32edb1f7..3926c09d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -448,7 +448,8 @@ docker build -f ./docker/docker-cuda/Dockerfile \ -t llamafactory:latest . docker run -dit --gpus=all \ - -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./hf_cache:/root/.cache/huggingface \ + -v ./ms_cache:/root/.cache/modelscope \ -v ./data:/app/data \ -v ./output:/app/output \ -p 7860:7860 \ @@ -471,7 +472,8 @@ docker build -f ./docker/docker-npu/Dockerfile \ # 根据您的资源更改 `device` docker run -dit \ - -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./hf_cache:/root/.cache/huggingface \ + -v ./ms_cache:/root/.cache/modelscope \ -v ./data:/app/data \ -v ./output:/app/output \ -v /usr/local/dcmi:/usr/local/dcmi \ diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 2d20bfe4..827b7b3c 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -36,7 +36,7 @@ RUN EXTRA_PACKAGES="metrics"; \ pip uninstall -y transformer-engine flash-attn # Set up volumes -VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] +VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] # Expose port 7860 for the LLaMA Board ENV GRADIO_SERVER_PORT 7860 diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index 04d6531a..1c0a3c75 100644 --- a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -10,7 +10,8 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface/ + - ./hf_cache:/root/.cache/huggingface + - ./ms_cache:/root/.cache/modelscope - ./data:/app/data - ./output:/app/output ports: diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 0fdd4472..08de626b 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -30,7 +30,7 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \ pip uninstall -y transformer-engine flash-attn # Set up volumes -VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] +VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] # Expose port 7860 for the LLaMA Board ENV GRADIO_SERVER_PORT 7860 diff --git a/docker/docker-npu/docker-compose.yml b/docker/docker-npu/docker-compose.yml index 7fff6e73..a6b878fd 100644 --- a/docker/docker-npu/docker-compose.yml +++ b/docker/docker-npu/docker-compose.yml @@ -8,7 +8,8 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface/ + - ./hf_cache:/root/.cache/huggingface + - ./ms_cache:/root/.cache/modelscope - ./data:/app/data - ./output:/app/output - /usr/local/dcmi:/usr/local/dcmi From 4d2c2790839846dd44fbc7baabf6057b869d5365 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 01:54:53 +0800 Subject: [PATCH 117/162] tiny fix about badam Former-commit-id: 
03f49267c7406e36aee35639f86e6e0383897090 --- .github/workflows/tests.yml | 2 +- .gitignore | 8 ++-- ...ma3_lora_sft.yaml => llama3_full_sft.yaml} | 2 +- ...adam_sft.yaml => llama3_full_sft_ds3.yaml} | 4 +- examples/extras/badam/train_single_gpu.sh | 37 ------------------ examples/extras/badam/train_zero3.sh | 39 ------------------- setup.py | 2 +- src/llamafactory/hparams/parser.py | 14 +++---- src/llamafactory/train/dpo/trainer.py | 3 +- src/llamafactory/train/kto/trainer.py | 3 +- src/llamafactory/train/ppo/trainer.py | 3 +- src/llamafactory/train/pt/trainer.py | 3 +- src/llamafactory/train/rm/trainer.py | 3 +- src/llamafactory/train/sft/trainer.py | 3 +- src/llamafactory/train/trainer_utils.py | 7 +--- 15 files changed, 31 insertions(+), 102 deletions(-) rename examples/extras/badam/{llama3_lora_sft.yaml => llama3_full_sft.yaml} (97%) rename examples/extras/badam/{llama3_badam_sft.yaml => llama3_full_sft_ds3.yaml} (89%) delete mode 100644 examples/extras/badam/train_single_gpu.sh delete mode 100644 examples/extras/badam/train_zero3.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 98bd9455..73d77de5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install .[torch,dev] + python -m pip install ".[torch,dev]" - name: Check quality run: | diff --git a/.gitignore b/.gitignore index 2486e728..82e6e9e6 100644 --- a/.gitignore +++ b/.gitignore @@ -160,8 +160,8 @@ cython_debug/ .idea/ # custom .gitignore -user.config -saves/ cache/ -wandb -ds_badam_exp \ No newline at end of file +config/ +saves/ +output/ +wandb/ diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_full_sft.yaml similarity index 97% rename from examples/extras/badam/llama3_lora_sft.yaml rename to examples/extras/badam/llama3_full_sft.yaml index a78de2fa..31d61c33 100644 --- a/examples/extras/badam/llama3_lora_sft.yaml +++ b/examples/extras/badam/llama3_full_sft.yaml @@ -6,6 +6,7 @@ stage: sft do_train: true finetuning_type: full use_badam: true +badam_mode: layer badam_switch_mode: ascending badam_switch_interval: 50 badam_verbose: 2 @@ -32,7 +33,6 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -pure_bf16: true ### eval val_size: 0.1 diff --git a/examples/extras/badam/llama3_badam_sft.yaml b/examples/extras/badam/llama3_full_sft_ds3.yaml similarity index 89% rename from examples/extras/badam/llama3_badam_sft.yaml rename to examples/extras/badam/llama3_full_sft_ds3.yaml index f5adb220..f2d7309f 100644 --- a/examples/extras/badam/llama3_badam_sft.yaml +++ b/examples/extras/badam/llama3_full_sft_ds3.yaml @@ -6,9 +6,11 @@ stage: sft do_train: true finetuning_type: full use_badam: true +badam_mode: layer badam_switch_mode: ascending badam_switch_interval: 50 badam_verbose: 2 +deepspeed: examples/deepspeed/ds_z3_config.json ### dataset dataset: identity,alpaca_en_demo @@ -28,7 +30,7 @@ overwrite_output_dir: true ### train per_device_train_batch_size: 1 gradient_accumulation_steps: 8 -learning_rate: 1.0e-6 +learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 diff --git a/examples/extras/badam/train_single_gpu.sh b/examples/extras/badam/train_single_gpu.sh deleted file mode 100644 index 8af79007..00000000 --- a/examples/extras/badam/train_single_gpu.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -export CUDA_VISIBLE_DEVICES=0 - -cd ../../.. 
- -llamafactory-cli train \ - --stage sft \ - --do_train True \ - --model_name_or_path meta-llama/Llama-2-13b-hf \ - --preprocessing_num_workers 16 \ - --finetuning_type full \ - --template default \ - --flash_attn auto \ - --dataset_dir data \ - --dataset alpaca_en_demo \ - --cutoff_len 1024 \ - --learning_rate 1e-6 \ - --num_train_epochs 3.0 \ - --max_samples 100000 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 8 \ - --lr_scheduler_type cosine \ - --max_grad_norm 1.0 \ - --logging_steps 5 \ - --save_steps 100 \ - --warmup_steps 0 \ - --optim adamw_torch \ - --packing False \ - --report_to none \ - --use_badam True \ - --output_dir saves/LLaMA2-13B/full/BAdam \ - --plot_loss True \ - --ddp_timeout 180000000 \ - --include_num_input_tokens_seen True \ - --badam_mode layer \ - --badam_switch_mode ascending \ - --badam_switch_interval 50 \ No newline at end of file diff --git a/examples/extras/badam/train_zero3.sh b/examples/extras/badam/train_zero3.sh deleted file mode 100644 index 3b182134..00000000 --- a/examples/extras/badam/train_zero3.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -cd ../../.. - -llamafactory-cli train \ - --stage sft \ - --do_train True \ - --model_name_or_path meta-llama/Llama-2-13b-hf \ - --preprocessing_num_workers 16 \ - --finetuning_type full \ - --template default \ - --flash_attn auto \ - --dataset_dir data \ - --dataset alpaca_en_demo \ - --cutoff_len 1024 \ - --learning_rate 1e-6 \ - --num_train_epochs 3.0 \ - --max_samples 100000 \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 2 \ - --lr_scheduler_type cosine \ - --max_grad_norm 1.0 \ - --logging_steps 5 \ - --save_steps 100 \ - --warmup_steps 0 \ - --optim adamw_torch \ - --packing False \ - --report_to none \ - --use_badam True \ - --output_dir saves/LLaMA2-13B/full/BAdam \ - --fp16 True \ - --plot_loss True \ - --ddp_timeout 180000000 \ - --include_num_input_tokens_seen True \ - --badam_mode layer \ - --badam_switch_mode ascending \ - --badam_switch_interval 50 \ - --deepspeed cache/ds_z3_config.json \ No newline at end of file diff --git a/setup.py b/setup.py index 3d2ac921..64f50a87 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ extra_require = { "bitsandbytes": ["bitsandbytes>=0.39.0"], "vllm": ["vllm>=0.4.3"], "galore": ["galore-torch"], - "badam": ["badam"], + "badam": ["badam>=1.2.1"], "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"], "awq": ["autoawq"], "aqlm": ["aqlm[gpu]>=1.1.0"], diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index f2ccd5e6..a4b7f7a5 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -121,7 +121,7 @@ def _check_extra_dependencies( require_version("galore_torch", "To fix: pip install galore_torch") if finetuning_args.use_badam: - require_version("badam", "To fix: pip install badam") + require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1") if finetuning_args.plot_loss: require_version("matplotlib", "To fix: pip install matplotlib") @@ -214,15 +214,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if ( finetuning_args.use_badam - and training_args.parallel_mode.value == "distributed" + and training_args.parallel_mode == ParallelMode.DISTRIBUTED ): if finetuning_args.badam_mode == "ratio": - raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer") - if finetuning_args.badam_mode == "layer" and (not 
is_deepspeed_zero3_enabled()): - raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage.") + raise ValueError("Radio-based BAdam does not yet support distributed training, use layer-wise BAdam.") + elif not is_deepspeed_zero3_enabled(): + raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.") - if (finetuning_args.use_galore) and training_args.deepspeed is not None: - raise ValueError("GaLore are incompatible with DeepSpeed yet.") + if finetuning_args.use_galore and training_args.deepspeed is not None: + raise ValueError("GaLore is incompatible with DeepSpeed yet.") if model_args.infer_backend == "vllm": raise ValueError("vLLM backend is only available for API, CLI and Web.") diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index a3e0e961..ed4fd5d9 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -96,7 +96,8 @@ class CustomDPOTrainer(DPOTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 0d50987f..c2edf95a 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -91,7 +91,8 @@ class CustomKTOTrainer(KTOTrainer): self.ref_model.eval() if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 2d5d7ffc..70d01919 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -166,7 +166,8 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index d3516b41..b6fb161d 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -48,7 +48,8 @@ class CustomTrainer(Trainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 433251cf..70c2e9a0 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -72,7 +72,8 @@ class PairwiseTrainer(Trainer): self.processor = processor self.can_return_loss = True # override 
property to return eval_loss if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 45799b96..8f18317f 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -56,7 +56,8 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 0206dcb6..21d41c36 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -23,6 +23,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch from peft import PeftModel from transformers import Trainer +from transformers.integrations import is_deepspeed_zero3_enabled from transformers.optimization import get_scheduler from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS from transformers.trainer_pt_utils import get_parameter_names @@ -372,9 +373,6 @@ def _create_badam_optimizer( dict(params=decay_params, weight_decay=training_args.weight_decay), ] - from transformers.integrations import is_deepspeed_zero3_enabled - ds_zero3_enabled = is_deepspeed_zero3_enabled() - if finetuning_args.badam_mode == "layer": from badam import BlockOptimizer @@ -387,7 +385,7 @@ def _create_badam_optimizer( start_block=finetuning_args.badam_start_block, switch_mode=finetuning_args.badam_switch_mode, verbose=finetuning_args.badam_verbose, - ds_zero3_enabled=ds_zero3_enabled + ds_zero3_enabled=is_deepspeed_zero3_enabled(), ) logger.info( f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, " @@ -398,7 +396,6 @@ def _create_badam_optimizer( elif finetuning_args.badam_mode == "ratio": from badam import BlockOptimizerRatio - assert not ds_zero3_enabled, "BAdam with ratio-based update does not support Deepspeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer." assert finetuning_args.badam_update_ratio > 1e-6 optimizer = BlockOptimizerRatio( param_groups=param_groups, From 8a6405f39edd48b93e59aa9c9df202e54e975cac Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 02:31:44 +0800 Subject: [PATCH 118/162] fix #4379 Former-commit-id: 96bedb4b6445a04ff8b97fb2aadace50b2f882df --- src/llamafactory/train/tuner.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py index 788b4c4f..a02fff22 100644 --- a/src/llamafactory/train/tuner.py +++ b/src/llamafactory/train/tuner.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
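The BAdam wiring repeated across the trainer diffs above (DPO, KTO, PPO, PT, RM, SFT) is identical apart from the normalized import order. A minimal sketch of that shared pattern, using the `badam` imports from the diffs; `attach_badam` is an illustrative helper, not a function in the codebase:

```python
from types import MethodType

def attach_badam(trainer, finetuning_args) -> None:
    # Sketch of the three lines each trainer repeats when use_badam is set.
    if finetuning_args.use_badam:
        from badam import BAdamCallback, clip_grad_norm_old_version

        # Rebind Accelerator.clip_grad_norm_ to badam's implementation, which
        # tolerates the partially-updated parameter groups BAdam produces.
        trainer.accelerator.clip_grad_norm_ = MethodType(
            clip_grad_norm_old_version, trainer.accelerator
        )
        trainer.callback_handler.add_callback(BAdamCallback)
```

A shared helper or mixin like this would collapse the six copies into one call site.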
+import os +import shutil from typing import TYPE_CHECKING, Any, Dict, List, Optional import torch @@ -19,6 +21,7 @@ from transformers import PreTrainedModel from ..data import get_template_and_fix_tokenizer from ..extras.callbacks import LogCallback +from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from ..extras.logging import get_logger from ..hparams import get_infer_args, get_train_args from ..model import load_model, load_tokenizer @@ -98,6 +101,25 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None: safe_serialization=(not model_args.export_legacy_format), ) + if finetuning_args.stage == "rm": + if model_args.adapter_name_or_path is not None: + vhead_path = model_args.adapter_name_or_path[-1] + else: + vhead_path = model_args.model_name_or_path + + if os.path.exists(os.path.join(vhead_path, V_HEAD_SAFE_WEIGHTS_NAME)): + shutil.copy( + os.path.join(vhead_path, V_HEAD_SAFE_WEIGHTS_NAME), + os.path.join(model_args.export_dir, V_HEAD_SAFE_WEIGHTS_NAME), + ) + logger.info("Copied valuehead to {}.".format(model_args.export_dir)) + elif os.path.exists(os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME)): + shutil.copy( + os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME), + os.path.join(model_args.export_dir, V_HEAD_WEIGHTS_NAME), + ) + logger.info("Copied valuehead to {}.".format(model_args.export_dir)) + try: tokenizer.padding_side = "left" # restore padding side tokenizer.init_kwargs["padding_side"] = "left" From 3d1d42030f0951d63ece3e3256042d86d9cf015a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 02:34:04 +0800 Subject: [PATCH 119/162] fix #4432 Former-commit-id: 972a3b469c600bc6528aef3a49b6fdec63d65803 --- src/llamafactory/model/loader.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 69cccd93..e1015821 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -14,6 +14,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, TypedDict +import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer from trl import AutoModelForCausalLMWithValueHead @@ -175,6 +176,10 @@ def load_model( if not is_trainable: model.requires_grad_(False) + for param in model.parameters(): + if param.data.dtype == torch.float32 and model_args.compute_dtype != torch.float32: + param.data = param.data.to(model_args.compute_dtype) + model.eval() else: model.train() From 81bf8cfdf2d66b88c781cd2726871950a58b5c21 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 02:51:49 +0800 Subject: [PATCH 120/162] fix test case Former-commit-id: 6663057cfbdc96385d901a5dfba22cfcd7a61b23 --- tests/model/test_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/model/test_base.py b/tests/model/test_base.py index e1991b20..6431a504 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -73,7 +73,8 @@ def test_valuehead(): tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False, add_valuehead=True ) - ref_model = AutoModelForCausalLMWithValueHead.from_pretrained( + ref_model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained( TINY_LLAMA_VALUEHEAD, torch_dtype=torch.float16, device_map=get_current_device() ) + ref_model.v_head = ref_model.v_head.to(torch.float16) compare_model(model, ref_model) From f5bf167a6eea21fd2b06498435379decf6960bad Mon Sep 17 
00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 02:55:50 +0800 Subject: [PATCH 121/162] lint Former-commit-id: c9e424d2198b5872ce118a6ab4c109bf73be2bee --- src/llamafactory/hparams/parser.py | 5 +---- tests/data/test_formatter.py | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index a4b7f7a5..d4bcfbc6 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -212,10 +212,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): raise ValueError("Distributed training does not support layer-wise GaLore.") - if ( - finetuning_args.use_badam - and training_args.parallel_mode == ParallelMode.DISTRIBUTED - ): + if finetuning_args.use_badam and training_args.parallel_mode == ParallelMode.DISTRIBUTED: if finetuning_args.badam_mode == "ratio": raise ValueError("Radio-based BAdam does not yet support distributed training, use layer-wise BAdam.") elif not is_deepspeed_zero3_enabled(): diff --git a/tests/data/test_formatter.py b/tests/data/test_formatter.py index a01e8a7e..37b21dc5 100644 --- a/tests/data/test_formatter.py +++ b/tests/data/test_formatter.py @@ -113,9 +113,7 @@ def test_glm4_tool_formatter(): assert formatter.apply(content=json.dumps(tools)) == [ "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具\n\n" - "## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( - json.dumps(tools[0], indent=4) - ) + "## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format(json.dumps(tools[0], indent=4)) ] From 5b07dea7ed1039113675da85efae34069475c941 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 14:34:13 +0800 Subject: [PATCH 122/162] fix #4456 Former-commit-id: 920f4fa4ca9e08bcf0d16450e085ee0fa8b4e1c5 --- .dockerignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 75cd2209..23ad75a8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,7 +5,6 @@ cache data docker -examples saves hf_cache output From a2f7a487cf7b0acd2bdcf1a63a52c1865d554535 Mon Sep 17 00:00:00 2001 From: hzhaoy Date: Tue, 25 Jun 2024 15:13:07 +0800 Subject: [PATCH 123/162] support flash-attn in Dockerfile Former-commit-id: 0dba000aa178f915cea7d75bf0c9d47e671a21d2 --- docker/docker-cuda/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 827b7b3c..06a172f0 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -35,6 +35,11 @@ RUN EXTRA_PACKAGES="metrics"; \ pip install -e .[$EXTRA_PACKAGES] && \ pip uninstall -y transformer-engine flash-attn +# Rebuild flash-attn +RUN ninja --version || \ + (pip uninstall -y ninja && pip install ninja) && \ + MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation + # Set up volumes VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] From 60dea82813c680a4bac2ca5d018d1ca7ce6e6ce9 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Wed, 26 Jun 2024 02:15:00 +0000 Subject: [PATCH 124/162] fix docker-compose path Former-commit-id: 9de3c24aa2a8268be06c8fef8e47f4fb6715c7ec --- docker/docker-cuda/docker-compose.yml | 8 ++++---- docker/docker-npu/docker-compose.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index 1c0a3c75..e2d1a5ad 100644 --- 
a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -10,10 +10,10 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface - - ./ms_cache:/root/.cache/modelscope - - ./data:/app/data - - ./output:/app/output + - ../../hf_cache:/root/.cache/huggingface + - ../../ms_cache:/root/.cache/modelscope + - ../../data:/app/data + - ../../output:/app/output ports: - "7860:7860" - "8000:8000" diff --git a/docker/docker-npu/docker-compose.yml b/docker/docker-npu/docker-compose.yml index a6b878fd..657cba9f 100644 --- a/docker/docker-npu/docker-compose.yml +++ b/docker/docker-npu/docker-compose.yml @@ -8,10 +8,10 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface - - ./ms_cache:/root/.cache/modelscope - - ./data:/app/data - - ./output:/app/output + - ../../hf_cache:/root/.cache/huggingface + - ../../ms_cache:/root/.cache/modelscope + - ../../data:/app/data + - ../../output:/app/output - /usr/local/dcmi:/usr/local/dcmi - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi - /usr/local/Ascend/driver:/usr/local/Ascend/driver From c0af40b2104c014b0123fe1703f0a09625dcc018 Mon Sep 17 00:00:00 2001 From: fanjunliang Date: Wed, 26 Jun 2024 18:21:42 +0800 Subject: [PATCH 125/162] fix torch-npu dependency Former-commit-id: 7c8a8061d0cda6342f6c883748fb6bc6650df9f9 --- docker/docker-npu/Dockerfile | 9 +++++---- setup.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 08de626b..0ec16107 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -7,16 +7,17 @@ ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple +ARG EXTRA_INDEX=https://download.pytorch.org/whl/cpu # Set the working directory WORKDIR /app # Install the requirements COPY requirements.txt /app -RUN pip config set global.index-url $PIP_INDEX -RUN pip config set global.extra-index-url $PIP_INDEX -RUN python -m pip install --upgrade pip -RUN python -m pip install -r requirements.txt +RUN pip config set global.index-url $PIP_INDEX && \ + pip config set global.extra-index-url $EXTRA_INDEX && \ + pip install --upgrade pip && \ + pip install -r requirements.txt # Copy the rest of the application into the image COPY . 
/app diff --git a/setup.py b/setup.py index 64f50a87..89301d1b 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ def get_requires(): extra_require = { "torch": ["torch>=1.13.1"], - "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], From 0265a98942fa086348f6f09cb7ba811bad710489 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 26 Jun 2024 19:43:16 +0800 Subject: [PATCH 126/162] fix #4556 Former-commit-id: 81faa9a985c14e83e38f42aedd228edb676b0695 --- scripts/loftq_init.py | 8 ++++++-- scripts/pissa_init.py | 8 ++++++-- src/llamafactory/data/template.py | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/loftq_init.py b/scripts/loftq_init.py index b9506fa3..4d2c01b9 100644 --- a/scripts/loftq_init.py +++ b/scripts/loftq_init.py @@ -36,15 +36,19 @@ def quantize_loftq( lora_alpha: int = None, lora_rank: int = 16, lora_dropout: float = 0, - lora_target: str = "q_proj,v_proj", + lora_target: tuple = ("q_proj", "v_proj"), save_safetensors: bool = True, ): r""" Initializes LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ) Usage: python loftq_init.py --model_name_or_path path_to_model --output_dir output_dir """ + if isinstance(lora_target, str): + lora_target = [name.strip() for name in lora_target.split(",")] + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") + loftq_config = LoftQConfig(loftq_bits=loftq_bits, loftq_iter=loftq_iter) lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, @@ -52,7 +56,7 @@ def quantize_loftq( r=lora_rank, lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, lora_dropout=lora_dropout, - target_modules=[name.strip() for name in lora_target.split(",")], + target_modules=lora_target, init_lora_weights="loftq", loftq_config=loftq_config, ) diff --git a/scripts/pissa_init.py b/scripts/pissa_init.py index 50239727..ad9d161c 100644 --- a/scripts/pissa_init.py +++ b/scripts/pissa_init.py @@ -35,21 +35,25 @@ def quantize_pissa( lora_alpha: int = None, lora_rank: int = 16, lora_dropout: float = 0, - lora_target: str = "q_proj,v_proj", + lora_target: tuple = ("q_proj", "v_proj"), save_safetensors: bool = True, ): r""" Initializes LoRA weights with Principal Singular values and Singular vectors Adaptation (PiSSA) Usage: python pissa_init.py --model_name_or_path path_to_model --output_dir output_dir """ + if isinstance(lora_target, str): + lora_target = [name.strip() for name in lora_target.split(",")] + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") + lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, r=lora_rank, lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, lora_dropout=lora_dropout, - target_modules=[name.strip() for name in lora_target.split(",")], + target_modules=lora_target, init_lora_weights="pissa" if pissa_iter == -1 else "pissa_niter_{}".format(pissa_iter), ) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 193ff482..53f16df4 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -618,7 
+618,7 @@ _register_template( _register_template( name="default", - format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]), + format_user=StringFormatter(slots=["Human: {{content}}\nAssistant:"]), format_system=StringFormatter(slots=["{{content}}\n"]), format_separator=EmptyFormatter(slots=["\n"]), ) From e823b200cf89e7d2a957bfd07b49e0e1326afc60 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 26 Jun 2024 19:52:35 +0800 Subject: [PATCH 127/162] fix #4458 Former-commit-id: aab14b15268dbe74ded22549dbd3677474868cbb --- src/llamafactory/train/ppo/trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 70d01919..c5f6e175 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -99,10 +99,10 @@ class CustomPPOTrainer(PPOTrainer, Trainer): ) # Add deepspeed config - ppo_config.accelerator_kwargs["kwargs_handlers"] = [ - DistributedDataParallelKwargs(find_unused_parameters=training_args.ddp_find_unused_parameters) - ] if training_args.deepspeed_plugin is not None: + ppo_config.accelerator_kwargs["kwargs_handlers"] = [ + DistributedDataParallelKwargs(find_unused_parameters=training_args.ddp_find_unused_parameters) + ] ppo_config.accelerator_kwargs["deepspeed_plugin"] = training_args.deepspeed_plugin # Create optimizer and scheduler From 4041aa024b96532dee982a889990e19747f95a42 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 26 Jun 2024 22:11:44 +0800 Subject: [PATCH 128/162] improve autogptq integration Former-commit-id: d68408c7b123b8ff92014db35cac0b24b414a6f4 --- setup.py | 2 +- .../model/model_utils/quantization.py | 41 ++++++++++++------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 64f50a87..8254b6d4 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ extra_require = { "vllm": ["vllm>=0.4.3"], "galore": ["galore-torch"], "badam": ["badam>=1.2.1"], - "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"], + "gptq": ["optimum>=1.17.0", "auto-gptq>=0.5.0"], "awq": ["autoawq"], "aqlm": ["aqlm[gpu]>=1.1.0"], "qwen": ["transformers_stream_generator"], diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index 5251f84f..fab61cb8 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -57,9 +57,9 @@ class QuantizationMethod(str, Enum): HQQ = "hqq" -def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]: +def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[Dict[str, Any]]: r""" - TODO: remove tokenizer.decode() https://github.com/huggingface/optimum/pull/1600 + Prepares the dataset to perform AutoGPTQ. 
""" if os.path.isfile(model_args.export_quantization_dataset): data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None) @@ -68,20 +68,32 @@ def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "Mod data_path = model_args.export_quantization_dataset data_files = None - dataset = load_dataset(path=data_path, data_files=data_files, split="train", cache_dir=model_args.cache_dir) - maxlen = model_args.export_quantization_maxlen + dataset = load_dataset( + path=data_path, + data_files=data_files, + split="train", + cache_dir=model_args.cache_dir, + token=model_args.hf_hub_token, + ) samples = [] + maxlen = model_args.export_quantization_maxlen for _ in range(model_args.export_quantization_nsamples): + n_try = 0 while True: + if n_try > 100: + raise ValueError("Cannot find satisfying example, considering decrease `export_quantization_maxlen`.") + sample_idx = random.randint(0, len(dataset) - 1) - sample: Dict[str, torch.Tensor] = tokenizer(dataset[sample_idx]["text"], return_tensors="pt") - if sample["input_ids"].size(1) >= maxlen: + sample: Dict[str, "torch.Tensor"] = tokenizer(dataset[sample_idx]["text"], return_tensors="pt") + n_try += 1 + if sample["input_ids"].size(1) > maxlen: break # TODO: fix large maxlen word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen - 1) input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen] - samples.append(tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)) + attention_mask = sample["attention_mask"][:, word_idx : word_idx + maxlen] + samples.append({"input_ids": input_ids, "attention_mask": attention_mask}) return samples @@ -119,21 +131,20 @@ def configure_quantization( logger.info("Loading {}-bit {}-quantized model.".format(quant_bits, quant_method.upper())) elif model_args.export_quantization_bit is not None: # auto-gptq - require_version("optimum>=1.16.0", "To fix: pip install optimum>=1.16.0") + require_version("optimum>=1.17.0", "To fix: pip install optimum>=1.17.0") require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") from accelerate.utils import get_max_memory if getattr(config, "model_type", None) == "chatglm": - raise ValueError("ChatGLM model is not supported.") + raise ValueError("ChatGLM model is not supported yet.") init_kwargs["quantization_config"] = GPTQConfig( bits=model_args.export_quantization_bit, - tokenizer=tokenizer, dataset=_get_quantization_dataset(tokenizer, model_args), ) init_kwargs["device_map"] = "auto" init_kwargs["max_memory"] = get_max_memory() - logger.info("Quantizing model to {} bit.".format(model_args.export_quantization_bit)) + logger.info("Quantizing model to {} bit with AutoGPTQ.".format(model_args.export_quantization_bit)) elif model_args.quantization_bit is not None: # bnb if model_args.quantization_bit == 8: @@ -150,9 +161,9 @@ def configure_quantization( bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora ) - # assign device map if: - # 1. not deepspeed zero3 and not fsdp - # 2. not auto quantization device map + # Do not assign device map if: + # 1. deepspeed zero3 or fsdp (train) + # 2. 
auto quantization device map (inference) if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": if model_args.quantization_bit != 4: raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") @@ -161,4 +172,4 @@ def configure_quantization( else: init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference - logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit)) + logger.info("Quantizing model to {} bit with bitsandbytes.".format(model_args.quantization_bit)) From e9dda7111dd4a103aebb9251ba0dd661c498b9ea Mon Sep 17 00:00:00 2001 From: hzhaoy Date: Thu, 27 Jun 2024 00:11:04 +0800 Subject: [PATCH 129/162] add flash-attn installation flag in Dockerfile Former-commit-id: 2535044e95f6df628bd1f01e0eecb02407105d79 --- README.md | 1 + README_zh.md | 1 + docker/docker-cuda/Dockerfile | 9 ++++++--- docker/docker-cuda/docker-compose.yml | 1 + 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4b42edd7..cdca8333 100644 --- a/README.md +++ b/README.md @@ -444,6 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ + --build-arg INSTALL_FLASH_ATTN=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . diff --git a/README_zh.md b/README_zh.md index 3926c09d..d26c8268 100644 --- a/README_zh.md +++ b/README_zh.md @@ -444,6 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ + --build-arg INSTALL_FLASH_ATTN=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . 
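The Dockerfile hunk below makes the flash-attn rebuild opt-in via `INSTALL_FLASH_ATTN`, so images built with the flag off ship without the wheel. A hedged runtime probe for that case; the helper name is illustrative, not part of LLaMA-Factory:

```python
import importlib.util

def flash_attn_available() -> bool:
    # find_spec returns None when the optional flash-attn wheel is absent.
    return importlib.util.find_spec("flash_attn") is not None

# Request FlashAttention-2 only when the wheel is importable; otherwise fall
# back to PyTorch's scaled-dot-product attention.
attn_implementation = "flash_attention_2" if flash_attn_available() else "sdpa"
print("attention implementation:", attn_implementation)
```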
diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 06a172f0..44aaf538 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -6,6 +6,7 @@ FROM nvcr.io/nvidia/pytorch:24.02-py3 ARG INSTALL_BNB=false ARG INSTALL_VLLM=false ARG INSTALL_DEEPSPEED=false +ARG INSTALL_FLASH_ATTN=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory @@ -36,9 +37,11 @@ RUN EXTRA_PACKAGES="metrics"; \ pip uninstall -y transformer-engine flash-attn # Rebuild flash-attn -RUN ninja --version || \ - (pip uninstall -y ninja && pip install ninja) && \ - MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation +RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \ + ninja --version || \ + (pip uninstall -y ninja && pip install ninja) && \ + MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation \ + fi; # Set up volumes VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index 1c0a3c75..ad269cb0 100644 --- a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -7,6 +7,7 @@ services: INSTALL_BNB: false INSTALL_VLLM: false INSTALL_DEEPSPEED: false + INSTALL_FLASH_ATTN: false PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: From 28c2c7fba54c2e7c5b4288664f8bc2b46ef73d08 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 00:29:42 +0800 Subject: [PATCH 130/162] support HQQ/EETQ #4113 Former-commit-id: b7cb51ddb394f04fe4646b2c297fc8d918c9979e --- README.md | 4 +- README_zh.md | 4 +- setup.py | 8 +- src/llamafactory/extras/env.py | 5 +- src/llamafactory/hparams/model_args.py | 7 +- src/llamafactory/model/__init__.py | 2 + src/llamafactory/model/loader.py | 4 +- .../model/model_utils/quantization.py | 75 ++++++++++++------- src/llamafactory/webui/chatter.py | 10 ++- src/llamafactory/webui/common.py | 2 + src/llamafactory/webui/components/export.py | 5 +- src/llamafactory/webui/components/top.py | 13 ++-- src/llamafactory/webui/locales.py | 20 ++++- src/llamafactory/webui/manager.py | 1 + src/llamafactory/webui/runner.py | 18 ++++- src/llamafactory/webui/utils.py | 13 ++++ 16 files changed, 134 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 4b42edd7..443c8cf7 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Choose your path: - **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc. - **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, etc. -- **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8. +- **Scalable resources**: 16-bit full-tuning, freeze-tuning, LoRA and 2/3/4/5/6/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ. - **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ, PiSSA and Agent tuning. - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA. - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc. 
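Besides the README wording, this patch routes `quantization_method` to `HqqConfig`/`EetqConfig` in the loader. A sketch of what the new on-the-fly path amounts to, reusing the constructor arguments from the quantization.py diff; the model id is an example only:

```python
import torch
from transformers import AutoModelForCausalLM, HqqConfig

# 4-bit HQQ quantization applied at load time; axis=0 matches the diff's
# choice of the faster ATEN kernel.
quant_config = HqqConfig(nbits=4, quant_zero=False, quant_scale=False, axis=0)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",  # example model id
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
```

EETQ takes no bit argument (`EetqConfig()`): it is int8-only, which the loader's checks enforce.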
@@ -341,7 +341,7 @@ cd LLaMA-Factory pip install -e ".[torch,metrics]" ``` -Extra dependencies available: torch, torch_npu, metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality +Extra dependencies available: torch, torch-npu, metrics, deepspeed, bitsandbytes, hqq, eetq, gptq, awq, aqlm, vllm, galore, badam, qwen, modelscope, quality > [!TIP] > Use `pip install --no-deps -e .` to resolve package conflicts. diff --git a/README_zh.md b/README_zh.md index 3926c09d..d5172a7d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -48,7 +48,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - **多种模型**:LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。 - **集成方法**:(增量)预训练、(多模态)指令监督微调、奖励模型训练、PPO 训练、DPO 训练、KTO 训练、ORPO 训练等等。 -- **多种精度**:32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。 +- **多种精度**:16 比特全参数微调、冻结微调、LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ 的 2/3/4/5/6/8 比特 QLoRA 微调。 - **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ、PiSSA 和 Agent 微调。 - **实用技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。 - **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。 @@ -341,7 +341,7 @@ cd LLaMA-Factory pip install -e ".[torch,metrics]" ``` -可选的额外依赖项:torch、torch_npu、metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality +可选的额外依赖项:torch、torch-npu、metrics、deepspeed、bitsandbytes、hqq、eetq、gptq、awq、aqlm、vllm、galore、badam、qwen、modelscope、quality > [!TIP] > 遇到包冲突时,可使用 `pip install --no-deps -e .` 解决。 diff --git a/setup.py b/setup.py index 8254b6d4..d43c311c 100644 --- a/setup.py +++ b/setup.py @@ -39,12 +39,14 @@ extra_require = { "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], - "vllm": ["vllm>=0.4.3"], - "galore": ["galore-torch"], - "badam": ["badam>=1.2.1"], + "hqq": ["hqq"], + "eetq": ["eetq"], "gptq": ["optimum>=1.17.0", "auto-gptq>=0.5.0"], "awq": ["autoawq"], "aqlm": ["aqlm[gpu]>=1.1.0"], + "vllm": ["vllm>=0.4.3"], + "galore": ["galore-torch"], + "badam": ["badam>=1.2.1"], "qwen": ["transformers_stream_generator"], "modelscope": ["modelscope"], "dev": ["ruff", "pytest"], diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index ab387231..14876048 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -1,4 +1,7 @@ -# Copyright 2024 the LlamaFactory team. +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by the HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/commands/env.py # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
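The quantization.py hunks further below encode the per-backend bit-width rules as separate `ValueError`s; gathered into one table they read as follows (a summary sketch, not code from the repository):

```python
# Accepted on-the-fly bit widths per backend, mirroring the checks added in
# quantization.py: bitsandbytes is 4/8-bit, HQQ 1-6/8-bit, EETQ int8 only.
SUPPORTED_BITS = {
    "bitsandbytes": {8, 4},
    "hqq": {8, 6, 5, 4, 3, 2, 1},
    "eetq": {8},
}

def validate_quantization(method: str, bits: int) -> None:
    if bits not in SUPPORTED_BITS.get(method, set()):
        raise ValueError(f"{method} does not support {bits}-bit quantization.")

validate_quantization("hqq", 6)  # ok
```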
diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 3f21145d..087c8c38 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -77,6 +77,10 @@ class ModelArguments: default=True, metadata={"help": "Whether or not to use memory-efficient model loading."}, ) + quantization_method: Literal["bitsandbytes", "hqq", "eetq"] = field( + default="bitsandbytes", + metadata={"help": "Quantization method to use for on-the-fly quantization."}, + ) quantization_bit: Optional[int] = field( default=None, metadata={"help": "The number of bits to quantize the model using bitsandbytes."}, @@ -235,9 +239,6 @@ class ModelArguments: if self.new_special_tokens is not None: # support multiple special tokens self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] - assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." - assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization." - if self.export_quantization_bit is not None and self.export_quantization_dataset is None: raise ValueError("Quantization dataset is necessary for exporting.") diff --git a/src/llamafactory/model/__init__.py b/src/llamafactory/model/__init__.py index 4abbaa1b..48cfe76c 100644 --- a/src/llamafactory/model/__init__.py +++ b/src/llamafactory/model/__init__.py @@ -14,10 +14,12 @@ from .loader import load_config, load_model, load_tokenizer from .model_utils.misc import find_all_linear_modules +from .model_utils.quantization import QuantizationMethod from .model_utils.valuehead import load_valuehead_params __all__ = [ + "QuantizationMethod", "load_config", "load_model", "load_tokenizer", diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index e1015821..1261d17a 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -186,11 +186,11 @@ def load_model( trainable_params, all_param = count_parameters(model) if is_trainable: - param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( + param_stats = "trainable params: {:,} || all params: {:,} || trainable%: {:.4f}".format( trainable_params, all_param, 100 * trainable_params / all_param ) else: - param_stats = "all params: {:d}".format(all_param) + param_stats = "all params: {:,}".format(all_param) logger.info(param_stats) diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index fab61cb8..3203b4aa 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -23,7 +23,7 @@ from typing import TYPE_CHECKING, Any, Dict, List import torch from datasets import load_dataset -from transformers import BitsAndBytesConfig, GPTQConfig +from transformers import BitsAndBytesConfig, EetqConfig, GPTQConfig, HqqConfig from transformers.integrations import is_deepspeed_zero3_enabled from transformers.modeling_utils import is_fsdp_enabled from transformers.utils.versions import require_version @@ -59,7 +59,7 @@ class QuantizationMethod(str, Enum): def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[Dict[str, Any]]: r""" - Prepares the dataset to perform AutoGPTQ. + Prepares the tokenized dataset to perform AutoGPTQ. Do not use tensor output for JSON serialization. 
""" if os.path.isfile(model_args.export_quantization_dataset): data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None) @@ -93,7 +93,7 @@ def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "Mod word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen - 1) input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen] attention_mask = sample["attention_mask"][:, word_idx : word_idx + maxlen] - samples.append({"input_ids": input_ids, "attention_mask": attention_mask}) + samples.append({"input_ids": input_ids.tolist(), "attention_mask": attention_mask.tolist()}) return samples @@ -105,7 +105,7 @@ def configure_quantization( init_kwargs: Dict[str, Any], ) -> None: r""" - Priority: PTQ-quantized (training) > AutoGPTQ (export) > Bitsandbytes (training) + Priority: PTQ-quantized (train/infer) > AutoGPTQ (export) > On-the-fly quantization (train/infer) """ if getattr(config, "quantization_config", None): # ptq if is_deepspeed_zero3_enabled(): @@ -131,6 +131,9 @@ def configure_quantization( logger.info("Loading {}-bit {}-quantized model.".format(quant_bits, quant_method.upper())) elif model_args.export_quantization_bit is not None: # auto-gptq + if model_args.export_quantization_bit not in [8, 4, 3, 2]: + raise ValueError("AutoGPTQ only accepts 2/3/4/8-bit quantization.") + require_version("optimum>=1.17.0", "To fix: pip install optimum>=1.17.0") require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") from accelerate.utils import get_max_memory @@ -146,30 +149,48 @@ def configure_quantization( init_kwargs["max_memory"] = get_max_memory() logger.info("Quantizing model to {} bit with AutoGPTQ.".format(model_args.export_quantization_bit)) - elif model_args.quantization_bit is not None: # bnb - if model_args.quantization_bit == 8: - require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0") - init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) + elif model_args.quantization_bit is not None: # on-the-fly + if model_args.quantization_method == QuantizationMethod.BITS_AND_BYTES.value: + if model_args.quantization_bit == 8: + require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0") + init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) + elif model_args.quantization_bit == 4: + require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0") + init_kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=model_args.compute_dtype, + bnb_4bit_use_double_quant=model_args.double_quantization, + bnb_4bit_quant_type=model_args.quantization_type, + bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora + ) + else: + raise ValueError("Bitsandbytes only accepts 4-bit or 8-bit quantization.") - elif model_args.quantization_bit == 4: - require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0") - init_kwargs["quantization_config"] = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=model_args.compute_dtype, - bnb_4bit_use_double_quant=model_args.double_quantization, - bnb_4bit_quant_type=model_args.quantization_type, - bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora - ) + # Do not assign device map if: + # 1. deepspeed zero3 or fsdp (train) + # 2. 
auto quantization device map (inference) + if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": + if model_args.quantization_bit != 4: + raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") - # Do not assign device map if: - # 1. deepspeed zero3 or fsdp (train) - # 2. auto quantization device map (inference) - if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": - if model_args.quantization_bit != 4: - raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") + require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0") + else: + init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference - require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0") - else: - init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference + logger.info("Quantizing model to {} bit with bitsandbytes.".format(model_args.quantization_bit)) + elif model_args.quantization_method == QuantizationMethod.HQQ.value: + if model_args.quantization_bit not in [8, 6, 5, 4, 3, 2, 1]: + raise ValueError("HQQ only accepts 1/2/3/4/5/6/8-bit quantization.") - logger.info("Quantizing model to {} bit with bitsandbytes.".format(model_args.quantization_bit)) + require_version("hqq", "To fix: pip install hqq") + init_kwargs["quantization_config"] = HqqConfig( + nbits=model_args.quantization_bit, quant_zero=False, quant_scale=False, axis=0 + ) # use ATEN kernel (axis=0) for performance + logger.info("Quantizing model to {} bit with HQQ.".format(model_args.quantization_bit)) + elif model_args.quantization_method == QuantizationMethod.EETQ.value: + if model_args.quantization_bit != 8: + raise ValueError("EETQ only accepts 8-bit quantization.") + + require_version("eetq", "To fix: pip install eetq") + init_kwargs["quantization_config"] = EetqConfig() + logger.info("Quantizing model to {} bit with EETQ.".format(model_args.quantization_bit)) diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py index 652c341c..8abef920 100644 --- a/src/llamafactory/webui/chatter.py +++ b/src/llamafactory/webui/chatter.py @@ -23,7 +23,7 @@ from ..data import Role from ..extras.constants import PEFT_METHODS from ..extras.misc import torch_gc from ..extras.packages import is_gradio_available -from .common import get_save_dir +from .common import QUANTIZATION_BITS, get_save_dir from .locales import ALERTS @@ -76,11 +76,17 @@ class WebChatModel(ChatModel): yield error return + if get("top.quantization_bit") in QUANTIZATION_BITS: + quantization_bit = int(get("top.quantization_bit")) + else: + quantization_bit = None + yield ALERTS["info_loading"][lang] args = dict( model_name_or_path=model_path, finetuning_type=finetuning_type, - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + quantization_bit=quantization_bit, + quantization_method=get("top.quantization_method"), template=get("top.template"), flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", use_unsloth=(get("top.booster") == "unsloth"), diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 980428a4..bced18f0 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -47,6 +47,8 @@ DEFAULT_CONFIG_DIR = "config" DEFAULT_DATA_DIR = "data" DEFAULT_SAVE_DIR = "saves" USER_CONFIG = 
"user_config.yaml" +QUANTIZATION_BITS = ["8", "6", "5", "4", "3", "2", "1"] +GPTQ_BITS = ["8", "4", "3", "2"] def get_save_dir(*paths: str) -> os.PathLike: diff --git a/src/llamafactory/webui/components/export.py b/src/llamafactory/webui/components/export.py index 14257949..0a938f02 100644 --- a/src/llamafactory/webui/components/export.py +++ b/src/llamafactory/webui/components/export.py @@ -18,7 +18,7 @@ from ...extras.constants import PEFT_METHODS from ...extras.misc import torch_gc from ...extras.packages import is_gradio_available from ...train.tuner import export_model -from ..common import get_save_dir +from ..common import GPTQ_BITS, get_save_dir from ..locales import ALERTS @@ -32,9 +32,6 @@ if TYPE_CHECKING: from ..engine import Engine -GPTQ_BITS = ["8", "4", "3", "2"] - - def can_quantize(checkpoint_path: Union[str, List[str]]) -> "gr.Dropdown": if isinstance(checkpoint_path, list) and len(checkpoint_path) != 0: return gr.Dropdown(value="none", interactive=False) diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 18b9a7d2..e331d5e4 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -18,7 +18,7 @@ from ...data import TEMPLATES from ...extras.constants import METHODS, SUPPORTED_MODELS from ...extras.packages import is_gradio_available from ..common import get_model_info, list_checkpoints, save_config -from ..utils import can_quantize +from ..utils import can_quantize, can_quantize_to if is_gradio_available(): @@ -43,10 +43,11 @@ def create_top() -> Dict[str, "Component"]: with gr.Accordion(open=False) as advanced_tab: with gr.Row(): - quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=2) - template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default", scale=2) - rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=3) - booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none", scale=3) + quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=1) + quantization_method = gr.Dropdown(choices=["bitsandbytes", "hqq", "eetq"], value="bitsandbytes", scale=1) + template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default", scale=1) + rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=2) + booster = gr.Radio(choices=["auto", "flashattn2", "unsloth"], value="auto", scale=2) visual_inputs = gr.Checkbox(scale=1) model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False).then( @@ -58,6 +59,7 @@ def create_top() -> Dict[str, "Component"]: list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False ) checkpoint_path.focus(list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False) + quantization_method.change(can_quantize_to, [quantization_method], [quantization_bit], queue=False) return dict( lang=lang, @@ -67,6 +69,7 @@ def create_top() -> Dict[str, "Component"]: checkpoint_path=checkpoint_path, advanced_tab=advanced_tab, quantization_bit=quantization_bit, + quantization_method=quantization_method, template=template, rope_scaling=rope_scaling, booster=booster, diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index cd166584..435876e7 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -85,15 +85,29 @@ LOCALES = { "quantization_bit": { "en": { "label": "Quantization bit", - "info": "Enable 4/8-bit 
model quantization (QLoRA).", + "info": "Enable quantization (QLoRA).", }, "ru": { "label": "Уровень квантования", - "info": "Включить 4/8-битное квантование модели (QLoRA).", + "info": "Включить квантование (QLoRA).", }, "zh": { "label": "量化等级", - "info": "启用 4/8 比特模型量化(QLoRA)。", + "info": "启用量化(QLoRA)。", + }, + }, + "quantization_method": { + "en": { + "label": "Quantization method", + "info": "Quantization algorithm to use.", + }, + "ru": { + "label": "Метод квантования", + "info": "Алгоритм квантования, который следует использовать.", + }, + "zh": { + "label": "量化方法", + "info": "使用的量化算法。", }, }, "template": { diff --git a/src/llamafactory/webui/manager.py b/src/llamafactory/webui/manager.py index 7e9b801a..ebe9f1b9 100644 --- a/src/llamafactory/webui/manager.py +++ b/src/llamafactory/webui/manager.py @@ -71,6 +71,7 @@ class Manager: self._id_to_elem["top.finetuning_type"], self._id_to_elem["top.checkpoint_path"], self._id_to_elem["top.quantization_bit"], + self._id_to_elem["top.quantization_method"], self._id_to_elem["top.template"], self._id_to_elem["top.rope_scaling"], self._id_to_elem["top.booster"], diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 549ec765..f7fbac30 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -22,7 +22,7 @@ from transformers.trainer import TRAINING_ARGS_NAME from ..extras.constants import LLAMABOARD_CONFIG, PEFT_METHODS, TRAINING_STAGES from ..extras.misc import is_gpu_or_npu_available, torch_gc from ..extras.packages import is_gradio_available -from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir, load_config +from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, QUANTIZATION_BITS, get_save_dir, load_config from .locales import ALERTS, LOCALES from .utils import abort_process, gen_cmd, get_eval_results, get_trainer_info, load_args, save_args, save_cmd @@ -104,6 +104,11 @@ class Runner: model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type") user_config = load_config() + if get("top.quantization_bit") in QUANTIZATION_BITS: + quantization_bit = int(get("top.quantization_bit")) + else: + quantization_bit = None + args = dict( stage=TRAINING_STAGES[get("train.training_stage")], do_train=True, @@ -111,7 +116,8 @@ class Runner: cache_dir=user_config.get("cache_dir", None), preprocessing_num_workers=16, finetuning_type=finetuning_type, - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + quantization_bit=quantization_bit, + quantization_method=get("top.quantization_method"), template=get("top.template"), rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", @@ -234,13 +240,19 @@ class Runner: model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type") user_config = load_config() + if get("top.quantization_bit") in QUANTIZATION_BITS: + quantization_bit = int(get("top.quantization_bit")) + else: + quantization_bit = None + args = dict( stage="sft", model_name_or_path=get("top.model_path"), cache_dir=user_config.get("cache_dir", None), preprocessing_num_workers=16, finetuning_type=finetuning_type, - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + quantization_bit=quantization_bit, + quantization_method=get("top.quantization_method"), template=get("top.template"), rope_scaling=get("top.rope_scaling") 
if get("top.rope_scaling") in ["linear", "dynamic"] else None, flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index a616bcba..4f313e4e 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -25,6 +25,7 @@ from yaml import safe_dump, safe_load from ..extras.constants import PEFT_METHODS, RUNNING_LOG, TRAINER_LOG, TRAINING_ARGS, TRAINING_STAGES from ..extras.packages import is_gradio_available, is_matplotlib_available from ..extras.ploting import gen_loss_plot +from ..model import QuantizationMethod from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir from .locales import ALERTS @@ -55,6 +56,18 @@ def can_quantize(finetuning_type: str) -> "gr.Dropdown": return gr.Dropdown(interactive=True) +def can_quantize_to(quantization_method: str) -> "gr.Dropdown": + r""" + Returns the available quantization bits. + """ + if quantization_method == QuantizationMethod.BITS_AND_BYTES.value: + return gr.Dropdown(choices=["none", "8", "4"]) + elif quantization_method == QuantizationMethod.HQQ.value: + return gr.Dropdown(choices=["none", "8", "6", "5", "4", "3", "2", "1"]) + elif quantization_method == QuantizationMethod.EETQ.value: + return gr.Dropdown(choices=["none", "8"]) + + def change_stage(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Tuple[List[str], bool]: r""" Modifys states after changing the training stage. From 2d6b32c2277a39fe33bc13c3c4c0a6d6541f3da0 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 00:36:04 +0800 Subject: [PATCH 131/162] tiny fix Former-commit-id: 69dac21ed9f07977b4540eb838a0ef93f3d3abc4 --- src/llamafactory/webui/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index 4f313e4e..069a2a2a 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -61,11 +61,11 @@ def can_quantize_to(quantization_method: str) -> "gr.Dropdown": Returns the available quantization bits. 
""" if quantization_method == QuantizationMethod.BITS_AND_BYTES.value: - return gr.Dropdown(choices=["none", "8", "4"]) + return gr.Dropdown(choices=["none", "8", "4"], value="none") elif quantization_method == QuantizationMethod.HQQ.value: - return gr.Dropdown(choices=["none", "8", "6", "5", "4", "3", "2", "1"]) + return gr.Dropdown(choices=["none", "8", "6", "5", "4", "3", "2", "1"], value="none") elif quantization_method == QuantizationMethod.EETQ.value: - return gr.Dropdown(choices=["none", "8"]) + return gr.Dropdown(choices=["none", "8"], value="none") def change_stage(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Tuple[List[str], bool]: From 0bec843f70bc7625126d3f4a8f794eec0b31a31d Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 00:46:41 +0800 Subject: [PATCH 132/162] tiny fix Former-commit-id: c6747a39dbbdda8decaa104499918bc7ac5f02e4 --- src/llamafactory/webui/components/top.py | 2 +- src/llamafactory/webui/utils.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index e331d5e4..9df3f062 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -43,7 +43,7 @@ def create_top() -> Dict[str, "Component"]: with gr.Accordion(open=False) as advanced_tab: with gr.Row(): - quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=1) + quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", allow_custom_value=True, scale=1) quantization_method = gr.Dropdown(choices=["bitsandbytes", "hqq", "eetq"], value="bitsandbytes", scale=1) template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default", scale=1) rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=2) diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index 069a2a2a..14616ac4 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -61,11 +61,13 @@ def can_quantize_to(quantization_method: str) -> "gr.Dropdown": Returns the available quantization bits. 
""" if quantization_method == QuantizationMethod.BITS_AND_BYTES.value: - return gr.Dropdown(choices=["none", "8", "4"], value="none") + available_bits = ["none", "8", "4"] elif quantization_method == QuantizationMethod.HQQ.value: - return gr.Dropdown(choices=["none", "8", "6", "5", "4", "3", "2", "1"], value="none") + available_bits = ["none", "8", "6", "5", "4", "3", "2", "1"] elif quantization_method == QuantizationMethod.EETQ.value: - return gr.Dropdown(choices=["none", "8"], value="none") + available_bits = ["none", "8"] + + return gr.Dropdown(choices=available_bits) def change_stage(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Tuple[List[str], bool]: From 646c1a9d532cfe4486f1a4834c8cadc02231b93b Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 00:53:33 +0800 Subject: [PATCH 133/162] update examples Former-commit-id: cce238f7d07919b79237bc9ab39265766c20f020 --- examples/README.md | 4 ++-- examples/README_zh.md | 4 ++-- examples/train_lora/llama3_lora_dpo.yaml | 2 +- ...3_lora_sft_bitsandbytes.yaml => llama3_lora_sft_otfq.yaml} | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) rename examples/train_qlora/{llama3_lora_sft_bitsandbytes.yaml => llama3_lora_sft_otfq.yaml} (88%) diff --git a/examples/README.md b/examples/README.md index 007a81ab..d5aca5ad 100644 --- a/examples/README.md +++ b/examples/README.md @@ -94,10 +94,10 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3. ### QLoRA Fine-Tuning -#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended) +#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes/HQQ/EETQ Quantization (Recommended) ```bash -llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml +llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml ``` #### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization diff --git a/examples/README_zh.md b/examples/README_zh.md index b9d90f25..d96bf882 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -94,10 +94,10 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3. 
### QLoRA 微调
 
-#### 基于 4/8 比特 Bitsandbytes 量化进行指令监督微调(推荐)
+#### 基于 4/8 比特 Bitsandbytes/HQQ/EETQ 量化进行指令监督微调(推荐)
 
 ```bash
-llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
 ```
 
 #### 基于 4/8 比特 GPTQ 量化进行指令监督微调
diff --git a/examples/train_lora/llama3_lora_dpo.yaml b/examples/train_lora/llama3_lora_dpo.yaml
index db25fb51..188e5078 100644
--- a/examples/train_lora/llama3_lora_dpo.yaml
+++ b/examples/train_lora/llama3_lora_dpo.yaml
@@ -7,7 +7,7 @@ do_train: true
 finetuning_type: lora
 lora_target: all
 pref_beta: 0.1
-pref_loss: sigmoid # [sigmoid (dpo), orpo, simpo]
+pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo]
 
 ### dataset
 dataset: dpo_en_demo
diff --git a/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml b/examples/train_qlora/llama3_lora_sft_otfq.yaml
similarity index 88%
rename from examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml
rename to examples/train_qlora/llama3_lora_sft_otfq.yaml
index cc773991..9c73b439 100644
--- a/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/train_qlora/llama3_lora_sft_otfq.yaml
@@ -1,6 +1,7 @@
 ### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 quantization_bit: 4
+quantization_method: bitsandbytes # choices: [bitsandbytes (4/8), hqq (2/3/4/5/6/8), eetq (8)]
 
 ### method
 stage: sft

From 2381fb68a4a41c7183d02c850d19e2eb569720d6 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Thu, 27 Jun 2024 01:12:25 +0800
Subject: [PATCH 134/162] add quant checks

Former-commit-id: 15bb053e3549739b1a2134640a659b0f35df7de7
---
 src/llamafactory/model/model_utils/quantization.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py
index 3203b4aa..317646e0 100644
--- a/src/llamafactory/model/model_utils/quantization.py
+++ b/src/llamafactory/model/model_utils/quantization.py
@@ -108,8 +108,11 @@ def configure_quantization(
     Priority: PTQ-quantized (train/infer) > AutoGPTQ (export) > On-the-fly quantization (train/infer)
     """
     if getattr(config, "quantization_config", None):  # ptq
-        if is_deepspeed_zero3_enabled():
-            raise ValueError("DeepSpeed ZeRO-3 is incompatible with PTQ-quantized models.")
+        if model_args.quantization_bit is not None:
+            logger.warning("`quantization_bit` has no effect on PTQ-quantized models.")
+
+        if is_deepspeed_zero3_enabled() or is_fsdp_enabled():
+            raise ValueError("DeepSpeed ZeRO-3 or FSDP is incompatible with PTQ-quantized models.")
 
         quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None)
         quant_method = quantization_config.get("quant_method", "")
@@ -182,6 +185,9 @@ def configure_quantization(
         if model_args.quantization_bit not in [8, 6, 5, 4, 3, 2, 1]:
             raise ValueError("HQQ only accepts 1/2/3/4/5/6/8-bit quantization.")
 
+        if is_deepspeed_zero3_enabled() or is_fsdp_enabled():
+            raise ValueError("HQQ quantization is incompatible with DeepSpeed ZeRO-3 or FSDP.")
+
         require_version("hqq", "To fix: pip install hqq")
         init_kwargs["quantization_config"] = HqqConfig(
             nbits=model_args.quantization_bit, quant_zero=False, quant_scale=False, axis=0
@@ -191,6 +197,9 @@ def configure_quantization(
         if model_args.quantization_bit != 8:
             raise ValueError("EETQ only accepts 8-bit quantization.")
 
+        if is_deepspeed_zero3_enabled() or is_fsdp_enabled():
+            raise ValueError("EETQ quantization is incompatible with DeepSpeed ZeRO-3 or FSDP.")
+
require_version("eetq", "To fix: pip install eetq") init_kwargs["quantization_config"] = EetqConfig() logger.info("Quantizing model to {} bit with EETQ.".format(model_args.quantization_bit)) From 3491525d966f02c7372b3ccfc8483e621efdf7fb Mon Sep 17 00:00:00 2001 From: hzhaoy Date: Thu, 27 Jun 2024 13:49:57 +0800 Subject: [PATCH 135/162] fix #4579 Former-commit-id: 0fa298ff6a4febea36ea9f11c7594277a77e6e9b --- src/llamafactory/train/sft/trainer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 8f18317f..f0a86dff 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -53,6 +53,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): self.processor = processor if finetuning_args.pissa_convert: + if self.is_deepspeed_enabled: + self.accelerator.deepspeed_config = self.accelerator.state.deepspeed_plugin.deepspeed_config + self.deepspeed = self._wrap_model(self.model_wrapped) self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: From f4efffd619fcf29cdd539b0a2e9d7f9b3828cec5 Mon Sep 17 00:00:00 2001 From: fanjunliang Date: Thu, 27 Jun 2024 15:21:55 +0800 Subject: [PATCH 136/162] support docker-npu-[amd64|arm64] build Former-commit-id: 25f16f5e299c94175e62bac9f0da5b47a2bb31b7 --- README.md | 2 +- README_zh.md | 2 +- docker/docker-npu/Dockerfile | 12 +++++++++--- setup.py | 3 ++- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4b42edd7..9c509ff0 100644 --- a/README.md +++ b/README.md @@ -465,7 +465,7 @@ For Ascend NPU users: ```bash # Choose docker image upon your environment -docker build -f ./docker/docker-npu/Dockerfile \ +docker build --platform linux/arm64 -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . diff --git a/README_zh.md b/README_zh.md index 3926c09d..c3fb6ecf 100644 --- a/README_zh.md +++ b/README_zh.md @@ -465,7 +465,7 @@ docker exec -it llamafactory bash ```bash # 根据您的环境选择镜像 -docker build -f ./docker/docker-npu/Dockerfile \ +docker build --platform linux/arm64 -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 0ec16107..8d80397e 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -1,10 +1,11 @@ # Use the Ubuntu 22.04 image with CANN 8.0.rc1 # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags -FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +FROM --platform=$TARGETPLATFORM cosdt/cann:8.0.rc1-910b-ubuntu22.04 ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments +ARG TARGETPLATFORM ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple ARG EXTRA_INDEX=https://download.pytorch.org/whl/cpu @@ -15,7 +16,6 @@ WORKDIR /app # Install the requirements COPY requirements.txt /app RUN pip config set global.index-url $PIP_INDEX && \ - pip config set global.extra-index-url $EXTRA_INDEX && \ pip install --upgrade pip && \ pip install -r requirements.txt @@ -23,7 +23,13 @@ RUN pip config set global.index-url $PIP_INDEX && \ COPY . 
/app # Install the LLaMA Factory -RUN EXTRA_PACKAGES="torch-npu,metrics"; \ +RUN EXTRA_PACKAGES="metrics"; \ + if [ "$TARGETPLATFORM" == "linux/arm64" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},torch-npu-arm64"; \ + else \ + pip config set global.extra-index-url $EXTRA_INDEX; \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},torch-npu-amd64"; \ + fi; \ if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ diff --git a/setup.py b/setup.py index 89301d1b..594070cd 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,8 @@ def get_requires(): extra_require = { "torch": ["torch>=1.13.1"], - "torch-npu": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu-arm64": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu-amd64": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], From c8c45d537130720a044eb1981fa8dcb126891a00 Mon Sep 17 00:00:00 2001 From: faddddeout <39449491+injet-zhou@users.noreply.github.com> Date: Thu, 27 Jun 2024 09:58:00 +0000 Subject: [PATCH 137/162] Exit the process with the subprocess's return code when utilizing the CLI Former-commit-id: ab42a4e2501a80fba1704a506bd1209a441570fa --- src/llamafactory/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py index e4846780..c92f556b 100644 --- a/src/llamafactory/cli.py +++ b/src/llamafactory/cli.py @@ -91,7 +91,7 @@ def main(): master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1") master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999))) logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port)) - subprocess.run( + subproc = subprocess.run( ( "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} " "--master_addr {master_addr} --master_port {master_port} {file_name} {args}" @@ -106,6 +106,7 @@ def main(): ), shell=True, ) + sys.exit(subproc.returncode) else: run_exp() elif command == Command.WEBDEMO: From 4da26498001d41c5af13f91d21577adcf8b11b62 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:16:46 +0800 Subject: [PATCH 138/162] Update setup.py Former-commit-id: 42293ab26f7fd7ffb77b308655ccd47b7c2ffa84 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 594070cd..dd333c48 100644 --- a/setup.py +++ b/setup.py @@ -35,8 +35,8 @@ def get_requires(): extra_require = { "torch": ["torch>=1.13.1"], - "torch-npu-arm64": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], - "torch-npu-amd64": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu-amd": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], From 7dc404b137d5ed5b5c1ee045ee67651984ff4063 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:17:35 +0800 Subject: [PATCH 139/162] Update README.md Former-commit-id: 01869ccbb5af2704c9d5bfdd4f2ff30978fb466d --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9c509ff0..4b42edd7 100644 --- a/README.md +++ b/README.md @@ -465,7 +465,7 @@ For Ascend NPU users: ```bash # Choose docker image upon your 
environment -docker build --platform linux/arm64 -f ./docker/docker-npu/Dockerfile \ +docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . From 33094f4aa27b85e7aa443a4633cab7d04e032b35 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:17:52 +0800 Subject: [PATCH 140/162] Update README_zh.md Former-commit-id: 62f2e27f4355aa35c26e1146dbe90fac3b380118 --- README_zh.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_zh.md b/README_zh.md index c3fb6ecf..3926c09d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -465,7 +465,7 @@ docker exec -it llamafactory bash ```bash # 根据您的环境选择镜像 -docker build --platform linux/arm64 -f ./docker/docker-npu/Dockerfile \ +docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . From 9fb8ebe00416c9bcacc23b0c87830b4dbf0f1be9 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:38:15 +0800 Subject: [PATCH 141/162] Update setup.py Former-commit-id: 544e1844fb237eed3eb621f4e6e355eac2ff7b85 --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index dd333c48..64f50a87 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,6 @@ def get_requires(): extra_require = { "torch": ["torch>=1.13.1"], "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], - "torch-npu-amd": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], From 301727822f0f1089f78be38a065bc58f6e2ff9b8 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:51:25 +0800 Subject: [PATCH 142/162] Update Dockerfile Former-commit-id: 7dea6840256472f8aa2c642f11d9e30bfa0fb96f --- docker/docker-npu/Dockerfile | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 8d80397e..d1d176e9 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -1,39 +1,38 @@ # Use the Ubuntu 22.04 image with CANN 8.0.rc1 # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags -FROM --platform=$TARGETPLATFORM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +# Set env ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments -ARG TARGETPLATFORM ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple -ARG EXTRA_INDEX=https://download.pytorch.org/whl/cpu +# x86 torch cpu index +ARG TORCH_INDEX=https://download.pytorch.org/whl/cpu # Set the working directory WORKDIR /app # Install the requirements COPY requirements.txt /app -RUN pip config set global.index-url $PIP_INDEX && \ - pip install --upgrade pip && \ - pip install -r requirements.txt +RUN pip config set global.index-url "$PIP_INDEX" && \ + pip config set global.extra-index-url "$PIP_INDEX" && \ + python -m pip install --upgrade pip && \ + python -m pip install -r requirements.txt # Copy the rest of the application into the image COPY . 
/app # Install the LLaMA Factory -RUN EXTRA_PACKAGES="metrics"; \ - if [ "$TARGETPLATFORM" == "linux/arm64" ]; then \ - EXTRA_PACKAGES="${EXTRA_PACKAGES},torch-npu-arm64"; \ - else \ - pip config set global.extra-index-url $EXTRA_INDEX; \ - EXTRA_PACKAGES="${EXTRA_PACKAGES},torch-npu-amd64"; \ - fi; \ - if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ +RUN EXTRA_PACKAGES="torch-npu,metrics"; \ + if [ "$INSTALL_DEEPSPEED" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ - pip install -e .[$EXTRA_PACKAGES] && \ + if [ "$(uname -i)" != "aarch64" ]; then \ + pip config set global.extra-index-url "$TORCH_INDEX" \ + fi; \ + pip install -e ".[$EXTRA_PACKAGES]" && \ pip uninstall -y transformer-engine flash-attn # Set up volumes From 9ba14577b99ac16fb0096ab2abd18249878c091f Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:57:40 +0800 Subject: [PATCH 143/162] Update Dockerfile Former-commit-id: a239f535a64378b74ef34799cd8e2e4a78f00f4c --- docker/docker-npu/Dockerfile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index d1d176e9..71ab3daf 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -2,13 +2,11 @@ # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 -# Set env ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple -# x86 torch cpu index ARG TORCH_INDEX=https://download.pytorch.org/whl/cpu # Set the working directory @@ -17,7 +15,7 @@ WORKDIR /app # Install the requirements COPY requirements.txt /app RUN pip config set global.index-url "$PIP_INDEX" && \ - pip config set global.extra-index-url "$PIP_INDEX" && \ + pip config set global.extra-index-url "$TORCH_INDEX" && \ python -m pip install --upgrade pip && \ python -m pip install -r requirements.txt @@ -29,9 +27,6 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \ if [ "$INSTALL_DEEPSPEED" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ - if [ "$(uname -i)" != "aarch64" ]; then \ - pip config set global.extra-index-url "$TORCH_INDEX" \ - fi; \ pip install -e ".[$EXTRA_PACKAGES]" && \ pip uninstall -y transformer-engine flash-attn From 98053508118884a62d0b07f4be90ea5165ec84f6 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 20:14:48 +0800 Subject: [PATCH 144/162] tiny fix Former-commit-id: c1a78a3a9f8ab9d57577cee37f9c457d60863ba2 --- README.md | 2 +- README_zh.md | 2 +- docker/docker-cuda/Dockerfile | 36 ++++++++++++++------------- docker/docker-cuda/docker-compose.yml | 2 +- docker/docker-npu/Dockerfile | 4 +-- src/llamafactory/cli.py | 4 +-- src/llamafactory/hparams/parser.py | 4 +-- 7 files changed, 28 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 45ac23d8..44aed7e8 100644 --- a/README.md +++ b/README.md @@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ - --build-arg INSTALL_FLASH_ATTN=false \ + --build-arg INSTALL_FLASHATTN=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . 
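
For reference, the renamed `INSTALL_FLASHATTN` flag is passed like any other build argument; a minimal sketch of a build that opts into the flash-attn rebuild (tag and paths assumed, mirroring the README examples above) is:

```bash
# Hypothetical invocation of the CUDA image build with the renamed flag;
# INSTALL_FLASHATTN=true enables the in-image flash-attn rebuild step.
docker build -f ./docker/docker-cuda/Dockerfile \
    --build-arg INSTALL_FLASHATTN=true \
    --build-arg PIP_INDEX=https://pypi.org/simple \
    -t llamafactory:latest .
```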
diff --git a/README_zh.md b/README_zh.md index c5fd4f69..7e3d51ad 100644 --- a/README_zh.md +++ b/README_zh.md @@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ - --build-arg INSTALL_FLASH_ATTN=false \ + --build-arg INSTALL_FLASHATTN=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 44aaf538..718390a8 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -2,11 +2,14 @@ # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html FROM nvcr.io/nvidia/pytorch:24.02-py3 +# Define environments +ENV MAX_JOBS=4 + # Define installation arguments ARG INSTALL_BNB=false ARG INSTALL_VLLM=false ARG INSTALL_DEEPSPEED=false -ARG INSTALL_FLASH_ATTN=false +ARG INSTALL_FLASHATTN=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory @@ -14,34 +17,33 @@ WORKDIR /app # Install the requirements COPY requirements.txt /app -RUN pip config set global.index-url $PIP_INDEX -RUN pip config set global.extra-index-url $PIP_INDEX -RUN python -m pip install --upgrade pip -RUN python -m pip install -r requirements.txt +RUN pip config set global.index-url "$PIP_INDEX" && \ + pip config set global.extra-index-url "$PIP_INDEX" && \ + python -m pip install --upgrade pip && \ + python -m pip install -r requirements.txt + +# Rebuild flash attention +RUN pip uninstall -y transformer-engine flash-attn && \ + if [ "$INSTALL_FLASHATTN" == "true" ]; then \ + pip uninstall -y ninja && pip install ninja && \ + pip install --no-cache-dir flash-attn --no-build-isolation \ + fi; # Copy the rest of the application into the image COPY . 
/app # Install the LLaMA Factory RUN EXTRA_PACKAGES="metrics"; \ - if [ "$INSTALL_BNB" = "true" ]; then \ + if [ "$INSTALL_BNB" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \ fi; \ - if [ "$INSTALL_VLLM" = "true" ]; then \ + if [ "$INSTALL_VLLM" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \ fi; \ - if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ + if [ "$INSTALL_DEEPSPEED" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ - pip install -e .[$EXTRA_PACKAGES] && \ - pip uninstall -y transformer-engine flash-attn - -# Rebuild flash-attn -RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \ - ninja --version || \ - (pip uninstall -y ninja && pip install ninja) && \ - MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation \ - fi; + pip install -e ".[$EXTRA_PACKAGES]" # Set up volumes VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index 4ccb0c04..16267dc3 100644 --- a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -7,7 +7,7 @@ services: INSTALL_BNB: false INSTALL_VLLM: false INSTALL_DEEPSPEED: false - INSTALL_FLASH_ATTN: false + INSTALL_FLASHATTN: false PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 71ab3daf..e413d4e3 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -2,6 +2,7 @@ # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +# Define environments ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments @@ -27,8 +28,7 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \ if [ "$INSTALL_DEEPSPEED" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ - pip install -e ".[$EXTRA_PACKAGES]" && \ - pip uninstall -y transformer-engine flash-attn + pip install -e ".[$EXTRA_PACKAGES]" # Set up volumes VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py index c92f556b..48eb2898 100644 --- a/src/llamafactory/cli.py +++ b/src/llamafactory/cli.py @@ -91,7 +91,7 @@ def main(): master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1") master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999))) logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port)) - subproc = subprocess.run( + process = subprocess.run( ( "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} " "--master_addr {master_addr} --master_port {master_port} {file_name} {args}" @@ -106,7 +106,7 @@ def main(): ), shell=True, ) - sys.exit(subproc.returncode) + sys.exit(process.returncode) else: run_exp() elif command == Command.WEBDEMO: diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index d4bcfbc6..6017907c 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -199,8 +199,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if not is_torch_bf16_gpu_available(): raise ValueError("This device does not support `pure_bf16`.") - if training_args.deepspeed: - raise ValueError("`pure_bf16` is incompatible with DeepSpeed.") + if is_deepspeed_zero3_enabled(): + raise ValueError("`pure_bf16` is incompatible with 
DeepSpeed ZeRO-3.") if training_args.fp16 or training_args.bf16: raise ValueError("Turn off mixed precision training when using `pure_bf16`.") From 52ab77d008fa051f1ea29ca1c67086595fd01883 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 20:29:16 +0800 Subject: [PATCH 145/162] fix docker file Former-commit-id: 688f02decb1185deb74b26444f7643cab7d355c1 --- docker/docker-cuda/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 718390a8..e4fac987 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -26,8 +26,8 @@ RUN pip config set global.index-url "$PIP_INDEX" && \ RUN pip uninstall -y transformer-engine flash-attn && \ if [ "$INSTALL_FLASHATTN" == "true" ]; then \ pip uninstall -y ninja && pip install ninja && \ - pip install --no-cache-dir flash-attn --no-build-isolation \ - fi; + pip install --no-cache-dir flash-attn --no-build-isolation; \ + fi # Copy the rest of the application into the image COPY . /app From b588a099db753e3f6a4373111b2dcb1339a1f01c Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 00:41:58 +0800 Subject: [PATCH 146/162] fix #4549 Former-commit-id: c9fdef10de737d1f433209812ef73e29cb60490a --- src/llamafactory/hparams/parser.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 6017907c..5c59fbe0 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -202,9 +202,6 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if is_deepspeed_zero3_enabled(): raise ValueError("`pure_bf16` is incompatible with DeepSpeed ZeRO-3.") - if training_args.fp16 or training_args.bf16: - raise ValueError("Turn off mixed precision training when using `pure_bf16`.") - if ( finetuning_args.use_galore and finetuning_args.galore_layerwise From 884a4a33ee36c52086ac207c31d441d96394a80b Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 01:04:24 +0800 Subject: [PATCH 147/162] refactor pissa, improve llamaboard Former-commit-id: 619556e46c19718f702c97df5d570a2a4c5fb13a --- src/llamafactory/extras/misc.py | 72 ++-------- src/llamafactory/hparams/finetuning_args.py | 6 +- src/llamafactory/hparams/parser.py | 12 +- .../{extras => train}/callbacks.py | 135 +++++++++++++++++- src/llamafactory/train/dpo/trainer.py | 21 +-- src/llamafactory/train/kto/trainer.py | 13 +- src/llamafactory/train/ppo/trainer.py | 35 +++-- src/llamafactory/train/ppo/workflow.py | 4 +- src/llamafactory/train/pt/trainer.py | 23 ++- src/llamafactory/train/rm/trainer.py | 19 +-- src/llamafactory/train/rm/workflow.py | 6 +- src/llamafactory/train/sft/trainer.py | 23 +-- src/llamafactory/train/trainer_utils.py | 48 ------- src/llamafactory/train/tuner.py | 4 +- src/llamafactory/webui/runner.py | 1 + src/llamafactory/webui/utils.py | 13 +- 16 files changed, 219 insertions(+), 216 deletions(-) rename src/llamafactory/{extras => train}/callbacks.py (59%) diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 93153b3e..30c287bd 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -1,4 +1,7 @@ -# Copyright 2024 the LlamaFactory team. +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by the HuggingFace's PEFT library. 
+# https://github.com/huggingface/peft/blob/v0.10.0/src/peft/peft_model.py # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,15 +17,11 @@ import gc import os -from typing import TYPE_CHECKING, Dict, Tuple +from typing import TYPE_CHECKING, Tuple import torch -from peft import PeftModel -from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList, PreTrainedModel +from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList from transformers.utils import ( - SAFE_WEIGHTS_NAME, - WEIGHTS_NAME, - is_safetensors_available, is_torch_bf16_gpu_available, is_torch_cuda_available, is_torch_mps_available, @@ -31,15 +30,9 @@ from transformers.utils import ( ) from transformers.utils.versions import require_version -from .constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from .logging import get_logger -if is_safetensors_available(): - from safetensors import safe_open - from safetensors.torch import save_file - - _is_fp16_available = is_torch_npu_available() or is_torch_cuda_available() try: _is_bf16_available = is_torch_bf16_gpu_available() @@ -48,8 +41,6 @@ except Exception: if TYPE_CHECKING: - from trl import AutoModelForCausalLMWithValueHead - from ..hparams import ModelArguments @@ -99,7 +90,7 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: if num_params == 0 and hasattr(param, "ds_numel"): num_params = param.ds_numel - # Due to the design of 4bit linear layers from bitsandbytes, multiply the number of parameters by 2 + # Due to the design of 4bit linear layers from bitsandbytes, multiply the number of parameters by itemsize if param.__class__.__name__ == "Params4bit": if hasattr(param, "quant_storage") and hasattr(param.quant_storage, "itemsize"): num_bytes = param.quant_storage.itemsize @@ -117,51 +108,6 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: return trainable_params, all_param -def fix_valuehead_checkpoint( - model: "AutoModelForCausalLMWithValueHead", output_dir: str, safe_serialization: bool -) -> None: - r""" - The model is already unwrapped. - - There are three cases: - 1. full tuning without ds_zero3: state_dict = {"model.layers.*": ..., "v_head.summary.*": ...} - 2. lora tuning without ds_zero3: state_dict = {"v_head.summary.*": ...} - 3. under deepspeed zero3: state_dict = {"pretrained_model.model.layers.*": ..., "v_head.summary.*": ...} - - We assume `stage3_gather_16bit_weights_on_model_save=true`. 
- """ - if not isinstance(model.pretrained_model, (PreTrainedModel, PeftModel)): - return - - if safe_serialization: - path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME) - with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f: - state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()} - else: - path_to_checkpoint = os.path.join(output_dir, WEIGHTS_NAME) - state_dict: Dict[str, torch.Tensor] = torch.load(path_to_checkpoint, map_location="cpu") - - decoder_state_dict = {} - v_head_state_dict = {} - for name, param in state_dict.items(): - if name.startswith("v_head."): - v_head_state_dict[name] = param - else: - decoder_state_dict[name.replace("pretrained_model.", "")] = param - - os.remove(path_to_checkpoint) - model.pretrained_model.save_pretrained( - output_dir, state_dict=decoder_state_dict or None, safe_serialization=safe_serialization - ) - - if safe_serialization: - save_file(v_head_state_dict, os.path.join(output_dir, V_HEAD_SAFE_WEIGHTS_NAME), metadata={"format": "pt"}) - else: - torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME)) - - logger.info("Value head model saved at: {}".format(output_dir)) - - def get_current_device() -> torch.device: r""" Gets the current available device. @@ -201,7 +147,7 @@ def get_logits_processor() -> "LogitsProcessorList": return logits_processor -def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype: +def infer_optim_dtype(model_dtype: "torch.dtype") -> "torch.dtype": r""" Infers the optimal dtype according to the model_dtype and device compatibility. """ @@ -220,7 +166,7 @@ def is_gpu_or_npu_available() -> bool: return is_torch_npu_available() or is_torch_cuda_available() -def has_tokenized_data(path: os.PathLike) -> bool: +def has_tokenized_data(path: "os.PathLike") -> bool: r""" Checks if the path has a tokenized dataset. 
""" diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index b676891e..28da95ad 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -379,10 +379,10 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora": raise ValueError("`loraplus_lr_ratio` is only valid for LoRA training.") - if self.pissa_convert and self.finetuning_type != "lora": - raise ValueError("`pissa_convert` is only valid for LoRA training.") + if self.pissa_init and self.finetuning_type != "lora": + raise ValueError("`pissa_init` is only valid for LoRA training.") - if self.pissa_convert and (self.stage in ["rm", "ppo", "kto"] or self.use_ref_model): + if self.pissa_init and (self.stage in ["ppo", "kto"] or self.use_ref_model): raise ValueError("Cannot use PiSSA for current training stage.") if self.train_mm_proj_only and self.finetuning_type != "full": diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 5c59fbe0..8b2ea4c1 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -83,9 +83,6 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin if model_args.adapter_name_or_path is not None and finetuning_args.finetuning_type != "lora": raise ValueError("Adapter is only valid for the LoRA method.") - if model_args.use_unsloth and is_deepspeed_zero3_enabled(): - raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.") - if model_args.quantization_bit is not None: if finetuning_args.finetuning_type != "lora": raise ValueError("Quantization is only compatible with the LoRA method.") @@ -186,6 +183,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED: raise ValueError("Please launch distributed training with `llamafactory-cli` or `torchrun`.") + if training_args.deepspeed and training_args.parallel_mode != ParallelMode.DISTRIBUTED: + raise ValueError("Please use `FORCE_TORCHRUN=1` to launch DeepSpeed training.") + if training_args.max_steps == -1 and data_args.streaming: raise ValueError("Please specify `max_steps` in streaming mode.") @@ -195,6 +195,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if training_args.do_train and model_args.quantization_device_map == "auto": raise ValueError("Cannot use device map for quantized models in training.") + if finetuning_args.pissa_init and is_deepspeed_zero3_enabled(): + raise ValueError("PiSSA is incompatible with DeepSpeed ZeRO-3.") + if finetuning_args.pure_bf16: if not is_torch_bf16_gpu_available(): raise ValueError("This device does not support `pure_bf16`.") @@ -224,6 +227,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if model_args.visual_inputs and data_args.packing: raise ValueError("Cannot use packing in MLLM fine-tuning.") + if model_args.use_unsloth and is_deepspeed_zero3_enabled(): + raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.") + _verify_model_args(model_args, finetuning_args) _check_extra_dependencies(model_args, finetuning_args, training_args) diff --git a/src/llamafactory/extras/callbacks.py b/src/llamafactory/train/callbacks.py similarity index 59% rename from src/llamafactory/extras/callbacks.py rename to src/llamafactory/train/callbacks.py index 0dff6a69..dc9c981e 100644 --- 
a/src/llamafactory/extras/callbacks.py +++ b/src/llamafactory/train/callbacks.py @@ -1,4 +1,7 @@ -# Copyright 2024 the LlamaFactory team. +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by the HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer.py # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,22 +25,78 @@ from concurrent.futures import ThreadPoolExecutor from datetime import timedelta from typing import TYPE_CHECKING, Any, Dict, Optional +import torch import transformers -from transformers import TrainerCallback +from peft import PeftModel +from transformers import PreTrainedModel, ProcessorMixin, TrainerCallback from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length +from transformers.utils import ( + SAFE_WEIGHTS_NAME, + WEIGHTS_NAME, + is_safetensors_available, +) -from .constants import TRAINER_LOG -from .logging import LoggerHandler, get_logger -from .misc import fix_valuehead_checkpoint +from ..extras.constants import TRAINER_LOG, V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME +from ..extras.logging import LoggerHandler, get_logger +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.torch import save_file + if TYPE_CHECKING: from transformers import TrainerControl, TrainerState, TrainingArguments + from trl import AutoModelForCausalLMWithValueHead logger = get_logger(__name__) +def fix_valuehead_checkpoint( + model: "AutoModelForCausalLMWithValueHead", output_dir: str, safe_serialization: bool +) -> None: + r""" + The model is already unwrapped. + + There are three cases: + 1. full tuning without ds_zero3: state_dict = {"model.layers.*": ..., "v_head.summary.*": ...} + 2. lora tuning without ds_zero3: state_dict = {"v_head.summary.*": ...} + 3. under deepspeed zero3: state_dict = {"pretrained_model.model.layers.*": ..., "v_head.summary.*": ...} + + We assume `stage3_gather_16bit_weights_on_model_save=true`. 
+    """
+    if not isinstance(model.pretrained_model, (PreTrainedModel, PeftModel)):
+        return
+
+    if safe_serialization:
+        path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME)
+        with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f:
+            state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()}
+    else:
+        path_to_checkpoint = os.path.join(output_dir, WEIGHTS_NAME)
+        state_dict: Dict[str, torch.Tensor] = torch.load(path_to_checkpoint, map_location="cpu")
+
+    decoder_state_dict = {}
+    v_head_state_dict = {}
+    for name, param in state_dict.items():
+        if name.startswith("v_head."):
+            v_head_state_dict[name] = param
+        else:
+            decoder_state_dict[name.replace("pretrained_model.", "")] = param
+
+    os.remove(path_to_checkpoint)
+    model.pretrained_model.save_pretrained(
+        output_dir, state_dict=decoder_state_dict or None, safe_serialization=safe_serialization
+    )
+
+    if safe_serialization:
+        save_file(v_head_state_dict, os.path.join(output_dir, V_HEAD_SAFE_WEIGHTS_NAME), metadata={"format": "pt"})
+    else:
+        torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME))
+
+    logger.info("Value head model saved at: {}".format(output_dir))
+
+
 class FixValueHeadModelCallback(TrainerCallback):
     def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
@@ -51,8 +110,70 @@ class FixValueHeadModelCallback(TrainerCallback):
         )
 
 
+class SaveProcessorCallback(TrainerCallback):
+    def __init__(self, processor: "ProcessorMixin") -> None:
+        r"""
+        Initializes a callback for saving the processor.
+        """
+        self.processor = processor
+
+    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        r"""
+        Event called at the end of training.
+        """
+        if args.should_save:
+            getattr(self.processor, "image_processor").save_pretrained(args.output_dir)
+
+
+class PissaConvertCallback(TrainerCallback):
+    r"""
+    Initializes a callback for converting the PiSSA adapter to a normal one.
+    """
+
+    def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        r"""
+        Event called at the beginning of training.
+        """
+        if args.should_save:
+            model = kwargs.pop("model")
+            pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
+            logger.info("Initial PiSSA adapter will be saved at: {}.".format(pissa_init_dir))
+            if isinstance(model, PeftModel):
+                init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights")
+                setattr(model.peft_config["default"], "init_lora_weights", True)
+                model.save_pretrained(pissa_init_dir, safe_serialization=args.save_safetensors)
+                setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights)
+
+    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        r"""
+        Event called at the end of training.
+        """
+        if args.should_save:
+            model = kwargs.pop("model")
+            pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
+            pissa_backup_dir = os.path.join(args.output_dir, "pissa_backup")
+            pissa_convert_dir = os.path.join(args.output_dir, "pissa_converted")
+            logger.info("Converted PiSSA adapter will be saved at: {}.".format(pissa_convert_dir))
+            # 1. save a pissa backup with init_lora_weights: True
+            # 2. save a converted lora with init_lora_weights: pissa
+            # 3. load the pissa backup with init_lora_weights: True
+            # 4. 
delete the initial adapter and change init_lora_weights to pissa + if isinstance(model, PeftModel): + init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights") + setattr(model.peft_config["default"], "init_lora_weights", True) + model.save_pretrained(pissa_backup_dir, safe_serialization=args.save_safetensors) + setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights) + model.save_pretrained( + pissa_convert_dir, safe_serialization=args.save_safetensors, convert_pissa_to_lora=pissa_init_dir + ) + model.load_adapter(pissa_backup_dir, "default", is_trainable=True) + model.set_adapter("default") + model.delete_adapter("pissa_init") + setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights) + + class LogCallback(TrainerCallback): - def __init__(self, output_dir: str) -> None: + def __init__(self) -> None: r""" Initializes a callback for logging training and evaluation status. """ @@ -70,7 +191,7 @@ class LogCallback(TrainerCallback): self.webui_mode = os.environ.get("LLAMABOARD_ENABLED", "0").lower() in ["true", "1"] if self.webui_mode: signal.signal(signal.SIGABRT, self._set_abort) - self.logger_handler = LoggerHandler(output_dir) + self.logger_handler = LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR")) logging.root.addHandler(self.logger_handler) transformers.logging.add_handler(self.logger_handler) diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index ed4fd5d9..e45467d6 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import warnings from collections import defaultdict from contextlib import nullcontext @@ -29,7 +28,8 @@ from trl import DPOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX -from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler, get_batch_logps +from ..callbacks import PissaConvertCallback, SaveProcessorCallback +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps if TYPE_CHECKING: @@ -54,7 +54,6 @@ class CustomDPOTrainer(DPOTrainer): disable_dropout_in_model(ref_model) self.finetuning_args = finetuning_args - self.processor = processor self.reference_free = False self.use_dpo_data_collator = True # hack to avoid warning self.generate_during_eval = False # disable at evaluation @@ -92,14 +91,17 @@ class CustomDPOTrainer(DPOTrainer): self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) self.ref_model.eval() + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) + if finetuning_args.pissa_convert: - self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + self.callback_handler.add_callback(PissaConvertCallback) if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -112,15 +114,6 @@ class CustomDPOTrainer(DPOTrainer): create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) - def _save(self, output_dir: Optional[str] = None, 
state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if output_dir is not None else self.args.output_dir - if self.finetuning_args.pissa_convert: - convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) - - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) - def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor": r""" Computes ORPO's odds ratio (OR) loss for batched log probabilities of the policy model. diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index c2edf95a..460311e4 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -27,6 +27,7 @@ from trl import KTOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX +from ..callbacks import SaveProcessorCallback from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps @@ -53,7 +54,6 @@ class CustomKTOTrainer(KTOTrainer): disable_dropout_in_model(ref_model) self.finetuning_args = finetuning_args - self.processor = processor self.reference_free = False self.use_dpo_data_collator = True # hack to avoid warning self.generate_during_eval = False # disable at evaluation @@ -90,11 +90,14 @@ class CustomKTOTrainer(KTOTrainer): self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) self.ref_model.eval() + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) + if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -113,12 +116,6 @@ class CustomKTOTrainer(KTOTrainer): """ return Trainer._get_train_sampler(self) - def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if output_dir is not None else self.args.output_dir - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) - def forward( self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"], prefix: Literal["", "kl_"] = "" ) -> Tuple["torch.Tensor", "torch.Tensor"]: diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index c5f6e175..57f0b848 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -27,6 +27,7 @@ from accelerate.utils import DistributedDataParallelKwargs from tqdm import tqdm from transformers import GenerationConfig, Trainer, TrainerControl, TrainerState from transformers.optimization import get_scheduler +from transformers.trainer_callback import CallbackHandler from transformers.trainer_pt_utils import remove_dummy_checkpoint from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME @@ -34,9 +35,9 @@ from trl import PPOConfig, PPOTrainer from trl.core import PPODecorators, logprobs_from_logits from trl.models.utils import unwrap_model_for_generation -from ...extras.callbacks import FixValueHeadModelCallback, LogCallback from 
...extras.logging import get_logger from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor +from ..callbacks import FixValueHeadModelCallback, SaveProcessorCallback from ..trainer_utils import create_custom_optimzer, create_custom_scheduler from .ppo_utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm @@ -131,7 +132,6 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.finetuning_args = finetuning_args self.reward_model = reward_model self.current_device = get_current_device() # patch for deepspeed training - self.processor = processor self.generation_config = GenerationConfig( pad_token_id=self.tokenizer.pad_token_id, @@ -143,8 +143,9 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.control = TrainerControl() self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None - self.log_callback, self.save_callback = callbacks[0], callbacks[1] - assert isinstance(self.log_callback, LogCallback) and isinstance(self.save_callback, FixValueHeadModelCallback) + self.callback_handler = CallbackHandler( + [callbacks], self.accelerator.unwrap_model(self.model), self.tokenizer, self.optimizer, self.lr_scheduler + ) if self.args.max_steps > 0: logger.info("max_steps is given, it will override any value given in num_train_epochs") @@ -165,11 +166,16 @@ class CustomPPOTrainer(PPOTrainer, Trainer): else: self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True) + self.add_callback(FixValueHeadModelCallback) + + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) + if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None: r""" @@ -219,7 +225,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): dataiter = iter(self.dataloader) loss_meter = AverageMeter() reward_meter = AverageMeter() - self.log_callback.on_train_begin(self.args, self.state, self.control) + self.callback_handler.on_train_begin(self.args, self.state, self.control) for step in tqdm(range(max_steps), disable=not self.is_local_process_zero()): try: @@ -257,7 +263,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): logger.warning("Failed to save stats due to unknown errors.") self.state.global_step += 1 - self.log_callback.on_step_end(self.args, self.state, self.control) + self.callback_handler.on_step_end(self.args, self.state, self.control) if self.is_local_process_zero() and (step + 1) % self.args.logging_steps == 0: logs = dict( @@ -269,7 +275,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): tqdm.write(str(logs)) logs["step"] = step self.state.log_history.append(logs) - self.log_callback.on_log(self.args, self.state, self.control) + self.callback_handler.on_log(self.args, self.state, self.control, logs) loss_meter.reset() reward_meter.reset() @@ -277,17 +283,12 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.save_model( os.path.join(self.args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, self.state.global_step)) ) - self.save_callback.on_save( - self.args, self.state, self.control, model=self.accelerator.unwrap_model(self.model) - ) + self.callback_handler.on_save(self.args, self.state, 
self.control) if self.control.should_epoch_stop or self.control.should_training_stop: break - self.log_callback.on_train_end(self.args, self.state, self.control) - self.save_callback.on_train_end( - self.args, self.state, self.control, model=self.accelerator.unwrap_model(self.model) - ) + self.callback_handler.on_train_end(self.args, self.state, self.control) def create_optimizer( self, @@ -505,7 +506,3 @@ class CustomPPOTrainer(PPOTrainer, Trainer): elif self.args.should_save: self._save(output_dir) - - if self.processor is not None and self.args.should_save: - output_dir = output_dir if output_dir is not None else self.args.output_dir - getattr(self.processor, "image_processor").save_pretrained(output_dir) diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 4f4d2820..651296f3 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -20,10 +20,9 @@ from typing import TYPE_CHECKING, List, Optional from transformers import DataCollatorWithPadding from ...data import get_dataset -from ...extras.callbacks import FixValueHeadModelCallback -from ...extras.misc import fix_valuehead_checkpoint from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer +from ..callbacks import FixValueHeadModelCallback, fix_valuehead_checkpoint from ..trainer_utils import create_ref_model, create_reward_model from .trainer import CustomPPOTrainer @@ -75,6 +74,7 @@ def run_ppo( ppo_trainer.save_model() if training_args.should_save: fix_valuehead_checkpoint(model, training_args.output_dir, training_args.save_safetensors) + ppo_trainer.save_state() # must be called after save_model to have a folder if ppo_trainer.is_world_process_zero() and finetuning_args.plot_loss: plot_loss(training_args.output_dir, keys=["loss", "reward"]) diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index b6fb161d..e8f180a6 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from types import MethodType -from typing import TYPE_CHECKING, Dict, Optional +from typing import TYPE_CHECKING, Optional from transformers import Trainer from ...extras.logging import get_logger -from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler +from ..callbacks import PissaConvertCallback, SaveProcessorCallback +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: @@ -42,16 +42,18 @@ class CustomTrainer(Trainer): ) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args - self.processor = processor + + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) if finetuning_args.pissa_convert: - self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + self.add_callback(PissaConvertCallback) if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -63,12 +65,3 @@ class CustomTrainer(Trainer): ) -> "torch.optim.lr_scheduler.LRScheduler": create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) - - def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if output_dir is not None else self.args.output_dir - if self.finetuning_args.pissa_convert: - convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) - - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 70c2e9a0..5eceead8 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -46,6 +46,7 @@ import torch from transformers import Trainer from ...extras.logging import get_logger +from ..callbacks import FixValueHeadModelCallback, PissaConvertCallback, SaveProcessorCallback from ..trainer_utils import create_custom_optimzer, create_custom_scheduler @@ -69,13 +70,20 @@ class PairwiseTrainer(Trainer): ) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args - self.processor = processor self.can_return_loss = True # override property to return eval_loss + self.add_callback(FixValueHeadModelCallback) + + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) + + if finetuning_args.pissa_convert: + self.add_callback(PissaConvertCallback) + if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -88,12 +96,6 @@ class PairwiseTrainer(Trainer): create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) - def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if 
output_dir is not None else self.args.output_dir - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) - def compute_loss( self, model: "PreTrainedModel", inputs: Dict[str, torch.Tensor], return_outputs: bool = False ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]: @@ -164,4 +166,5 @@ class PairwiseTrainer(Trainer): res: List[str] = [] for c_score, r_score in zip(chosen_scores, rejected_scores): res.append(json.dumps({"chosen": round(float(c_score), 2), "rejected": round(float(r_score), 2)})) + writer.write("\n".join(res)) diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 6f24e964..e0b32b77 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -40,10 +40,9 @@ from typing import TYPE_CHECKING, List, Optional from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset -from ...extras.callbacks import FixValueHeadModelCallback -from ...extras.misc import fix_valuehead_checkpoint from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer +from ..callbacks import fix_valuehead_checkpoint from ..trainer_utils import create_modelcard_and_push from .metric import compute_accuracy from .trainer import PairwiseTrainer @@ -77,7 +76,7 @@ def run_rm( args=training_args, finetuning_args=finetuning_args, data_collator=data_collator, - callbacks=callbacks + [FixValueHeadModelCallback()], + callbacks=callbacks, compute_metrics=compute_accuracy, **tokenizer_module, **split_dataset(dataset, data_args, training_args), @@ -89,6 +88,7 @@ def run_rm( trainer.save_model() if training_args.should_save: fix_valuehead_checkpoint(model, training_args.output_dir, training_args.save_safetensors) + trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) trainer.save_state() diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index f0a86dff..06bd2b6b 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -26,7 +26,8 @@ from transformers import Seq2SeqTrainer from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger -from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler +from ..callbacks import PissaConvertCallback, SaveProcessorCallback +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: @@ -50,19 +51,18 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): ) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args - self.processor = processor + + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) if finetuning_args.pissa_convert: - if self.is_deepspeed_enabled: - self.accelerator.deepspeed_config = self.accelerator.state.deepspeed_plugin.deepspeed_config - self.deepspeed = self._wrap_model(self.model_wrapped) - self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + self.add_callback(PissaConvertCallback) if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -75,15 +75,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): 
create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) - def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if output_dir is not None else self.args.output_dir - if self.finetuning_args.pissa_convert: - convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) - - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) - def prediction_step( self, model: "torch.nn.Module", diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 21d41c36..4b581691 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -17,11 +17,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch -from peft import PeftModel from transformers import Trainer from transformers.integrations import is_deepspeed_zero3_enabled from transformers.optimization import get_scheduler @@ -40,7 +38,6 @@ if is_galore_available(): if TYPE_CHECKING: - from accelerate import Accelerator from transformers import PreTrainedModel, Seq2SeqTrainingArguments from trl import AutoModelForCausalLMWithValueHead @@ -175,51 +172,6 @@ def create_reward_model( return reward_model -def convert_pissa_adapter( - output_dir: str, - state_dict: Dict[str, "torch.Tensor"], - accelerator: "Accelerator", - model: "PreTrainedModel", - training_args: "Seq2SeqTrainingArguments", -) -> None: - r""" - Converts the PiSSA adapter to a LoRA adapter. - """ - pissa_init_dir = os.path.join(training_args.output_dir, "pissa_init") - pissa_backup_dir = os.path.join(output_dir, "pissa_backup") - if output_dir == pissa_init_dir: - logger.info("Initial PiSSA adatper will be saved at: {}.".format(pissa_init_dir)) - unwrapped_model = accelerator.unwrap_model(model) - if isinstance(unwrapped_model, PeftModel): - init_lora_weights = getattr(unwrapped_model.peft_config["default"], "init_lora_weights") - setattr(unwrapped_model.peft_config["default"], "init_lora_weights", True) - unwrapped_model.save_pretrained( - output_dir, - state_dict=state_dict, - safe_serialization=training_args.save_safetensors, - ) - setattr(unwrapped_model.peft_config["default"], "init_lora_weights", init_lora_weights) - - elif output_dir == training_args.output_dir: # at the end of training - logger.info("Converted PiSSA adapter will be saved at: {}.".format(output_dir)) - unwrapped_model = accelerator.unwrap_model(model) - if isinstance(unwrapped_model, PeftModel): # backup the pissa adapter for further use - unwrapped_model.save_pretrained( - pissa_backup_dir, - state_dict=state_dict, - safe_serialization=training_args.save_safetensors, - ) - unwrapped_model.save_pretrained( - output_dir, - state_dict=state_dict, - safe_serialization=training_args.save_safetensors, - convert_pissa_to_lora=pissa_init_dir, - ) - # TODO: the model is applied pissa again unexpectedly - unwrapped_model.load_adapter(pissa_backup_dir, "default", is_trainable=True) - unwrapped_model.set_adapter("default") - - def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]: r""" Returns a list of names of parameters with weight decay. 
(weights in non-layernorm layers) diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py index a02fff22..dc982e07 100644 --- a/src/llamafactory/train/tuner.py +++ b/src/llamafactory/train/tuner.py @@ -20,11 +20,11 @@ import torch from transformers import PreTrainedModel from ..data import get_template_and_fix_tokenizer -from ..extras.callbacks import LogCallback from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from ..extras.logging import get_logger from ..hparams import get_infer_args, get_train_args from ..model import load_model, load_tokenizer +from .callbacks import LogCallback from .dpo import run_dpo from .kto import run_kto from .ppo import run_ppo @@ -41,8 +41,8 @@ logger = get_logger(__name__) def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallback"] = []) -> None: + callbacks.append(LogCallback()) model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args) - callbacks.append(LogCallback(training_args.output_dir)) if finetuning_args.stage == "pt": run_pt(model_args, data_args, training_args, finetuning_args, callbacks) diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index f7fbac30..131d180d 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -310,6 +310,7 @@ class Runner: env = deepcopy(os.environ) env["LLAMABOARD_ENABLED"] = "1" + env["LLAMABOARD_WORKDIR"] = args["output_dir"] if args.get("deepspeed", None) is not None: env["FORCE_TORCHRUN"] = "1" diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index 14616ac4..6e5fdbe4 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -38,12 +38,15 @@ def abort_process(pid: int) -> None: r""" Aborts the processes recursively in a bottom-up way. 
""" - children = psutil.Process(pid).children() - if children: - for child in children: - abort_process(child.pid) + try: + children = psutil.Process(pid).children() + if children: + for child in children: + abort_process(child.pid) - os.kill(pid, signal.SIGABRT) + os.kill(pid, signal.SIGABRT) + except Exception: + pass def can_quantize(finetuning_type: str) -> "gr.Dropdown": From 71b8bb6037caab897544df44bd0526e95572cb3d Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 01:17:07 +0800 Subject: [PATCH 148/162] update examples Former-commit-id: 66f248b90cfa2b29c73060459b2337b78154c47b --- examples/extras/fsdp_qlora/llama3_lora_sft.yaml | 2 +- examples/extras/llama_pro/llama3_freeze_sft.yaml | 2 +- examples/extras/loraplus/llama3_lora_sft.yaml | 2 +- examples/extras/pissa/llama3_lora_sft.yaml | 2 +- examples/train_full/llama3_full_sft_ds3.yaml | 2 +- examples/train_lora/llama3_lora_dpo.yaml | 2 +- examples/train_lora/llama3_lora_kto.yaml | 2 +- examples/train_lora/llama3_lora_ppo.yaml | 2 +- examples/train_lora/llama3_lora_pretrain.yaml | 2 +- examples/train_lora/llama3_lora_reward.yaml | 4 ++-- examples/train_lora/llama3_lora_sft.yaml | 2 +- examples/train_lora/llama3_lora_sft_ds0.yaml | 2 +- examples/train_lora/llama3_lora_sft_ds3.yaml | 2 +- examples/train_lora/llava1_5_lora_sft.yaml | 2 +- examples/train_qlora/llama3_lora_sft_aqlm.yaml | 2 +- examples/train_qlora/llama3_lora_sft_awq.yaml | 2 +- examples/train_qlora/llama3_lora_sft_gptq.yaml | 2 +- examples/train_qlora/llama3_lora_sft_otfq.yaml | 2 +- 18 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml index cc773991..6c80ef58 100644 --- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml +++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml index f92d6945..5e7e90bb 100644 --- a/examples/extras/llama_pro/llama3_freeze_sft.yaml +++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml @@ -31,7 +31,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml index 57383ae0..062a312b 100644 --- a/examples/extras/loraplus/llama3_lora_sft.yaml +++ b/examples/extras/loraplus/llama3_lora_sft.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/extras/pissa/llama3_lora_sft.yaml b/examples/extras/pissa/llama3_lora_sft.yaml index fd4b9f1d..05077b6c 100644 --- a/examples/extras/pissa/llama3_lora_sft.yaml +++ b/examples/extras/pissa/llama3_lora_sft.yaml @@ -32,7 +32,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_full/llama3_full_sft_ds3.yaml b/examples/train_full/llama3_full_sft_ds3.yaml index 40afd2ee..c983ad5c 100644 --- a/examples/train_full/llama3_full_sft_ds3.yaml +++ b/examples/train_full/llama3_full_sft_ds3.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 
lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_dpo.yaml b/examples/train_lora/llama3_lora_dpo.yaml index 188e5078..d87c0669 100644 --- a/examples/train_lora/llama3_lora_dpo.yaml +++ b/examples/train_lora/llama3_lora_dpo.yaml @@ -31,7 +31,7 @@ learning_rate: 5.0e-6 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_kto.yaml b/examples/train_lora/llama3_lora_kto.yaml index f730c82e..08208c25 100644 --- a/examples/train_lora/llama3_lora_kto.yaml +++ b/examples/train_lora/llama3_lora_kto.yaml @@ -30,7 +30,7 @@ learning_rate: 5.0e-6 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_ppo.yaml b/examples/train_lora/llama3_lora_ppo.yaml index e574014e..512e90ea 100644 --- a/examples/train_lora/llama3_lora_ppo.yaml +++ b/examples/train_lora/llama3_lora_ppo.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-5 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### generate diff --git a/examples/train_lora/llama3_lora_pretrain.yaml b/examples/train_lora/llama3_lora_pretrain.yaml index 839b3e51..5e8aaaef 100644 --- a/examples/train_lora/llama3_lora_pretrain.yaml +++ b/examples/train_lora/llama3_lora_pretrain.yaml @@ -28,7 +28,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_reward.yaml b/examples/train_lora/llama3_lora_reward.yaml index 79559d19..96c32238 100644 --- a/examples/train_lora/llama3_lora_reward.yaml +++ b/examples/train_lora/llama3_lora_reward.yaml @@ -25,11 +25,11 @@ overwrite_output_dir: true ### train per_device_train_batch_size: 1 gradient_accumulation_steps: 8 -learning_rate: 1.0e-5 +learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_sft.yaml b/examples/train_lora/llama3_lora_sft.yaml index fe30c575..55a8077e 100644 --- a/examples/train_lora/llama3_lora_sft.yaml +++ b/examples/train_lora/llama3_lora_sft.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_sft_ds0.yaml b/examples/train_lora/llama3_lora_sft_ds0.yaml index 08b638e6..f1442faa 100644 --- a/examples/train_lora/llama3_lora_sft_ds0.yaml +++ b/examples/train_lora/llama3_lora_sft_ds0.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_sft_ds3.yaml b/examples/train_lora/llama3_lora_sft_ds3.yaml index b7266d61..66e7007e 100644 --- a/examples/train_lora/llama3_lora_sft_ds3.yaml +++ b/examples/train_lora/llama3_lora_sft_ds3.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llava1_5_lora_sft.yaml b/examples/train_lora/llava1_5_lora_sft.yaml index 55ac31fa..ec03f82c 100644 --- a/examples/train_lora/llava1_5_lora_sft.yaml 
+++ b/examples/train_lora/llava1_5_lora_sft.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_qlora/llama3_lora_sft_aqlm.yaml b/examples/train_qlora/llama3_lora_sft_aqlm.yaml index 7b6767d5..3519d46b 100644 --- a/examples/train_qlora/llama3_lora_sft_aqlm.yaml +++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_qlora/llama3_lora_sft_awq.yaml b/examples/train_qlora/llama3_lora_sft_awq.yaml index a2a26e4b..df48669b 100644 --- a/examples/train_qlora/llama3_lora_sft_awq.yaml +++ b/examples/train_qlora/llama3_lora_sft_awq.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_qlora/llama3_lora_sft_gptq.yaml b/examples/train_qlora/llama3_lora_sft_gptq.yaml index ad3d854c..61fa9bb4 100644 --- a/examples/train_qlora/llama3_lora_sft_gptq.yaml +++ b/examples/train_qlora/llama3_lora_sft_gptq.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_qlora/llama3_lora_sft_otfq.yaml b/examples/train_qlora/llama3_lora_sft_otfq.yaml index 9c73b439..80a05768 100644 --- a/examples/train_qlora/llama3_lora_sft_otfq.yaml +++ b/examples/train_qlora/llama3_lora_sft_otfq.yaml @@ -31,7 +31,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval From 81094dc09abfe31ffab2f64b537b97bc2a0f3e78 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 01:26:50 +0800 Subject: [PATCH 149/162] add Gemma2 models Former-commit-id: 8fc5a248ecfd6861cb90dac6c14fe89cdeaf8921 --- README.md | 2 +- README_zh.md | 2 +- src/llamafactory/extras/constants.py | 12 ++++++++++++ src/llamafactory/train/rm/trainer.py | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 44aed7e8..27d1e98b 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | +| [Gemma/Gemma2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | | [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | diff --git a/README_zh.md b/README_zh.md index 7e3d51ad..9fb56a4e 100644 --- a/README_zh.md +++ b/README_zh.md @@ -160,7 +160,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| 
[Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | +| [Gemma/Gemma2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | | [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 866f39d4..3ea21f76 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -507,6 +507,18 @@ register_model_group( "Gemma-1.1-7B-Chat": { DownloadSource.DEFAULT: "google/gemma-1.1-7b-it", }, + "Gemma-2-9B": { + DownloadSource.DEFAULT: "google/gemma-2-9b", + }, + "Gemma-2-27B": { + DownloadSource.DEFAULT: "google/gemma-2-27b", + }, + "Gemma-2-9B-Chat": { + DownloadSource.DEFAULT: "google/gemma-2-9b-it", + }, + "Gemma-2-27B-Chat": { + DownloadSource.DEFAULT: "google/gemma-2-27b-it", + }, }, template="gemma", ) diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 5eceead8..accc877d 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -105,7 +105,7 @@ class PairwiseTrainer(Trainer): Subclass and override to inject custom behavior. Note that the first element will be removed from the output tuple. - See: https://github.com/huggingface/transformers/blob/v4.39.1/src/transformers/trainer.py#L3777 + See: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer.py#L3842 """ # Compute rewards _, _, values = model(**inputs, output_hidden_states=True, return_dict=True) From 7705df9dadd2eccad31ebed805fce176b993e844 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 01:28:59 +0800 Subject: [PATCH 150/162] fix docker flashattn Former-commit-id: 0966f5d4616a3877a6b921976dc39e8799831d36 --- docker/docker-cuda/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index e4fac987..d94aa970 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -4,6 +4,7 @@ FROM nvcr.io/nvidia/pytorch:24.02-py3 # Define environments ENV MAX_JOBS=4 +ENV FLASH_ATTENTION_FORCE_BUILD=TRUE # Define installation arguments ARG INSTALL_BNB=false From 0bd6bcd95f7029544243a3e2b4f102dad0ae2108 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 03:18:54 +0800 Subject: [PATCH 151/162] increase pissa_iter for stability Former-commit-id: 03f8d9b0fb10ae58e7f68508197330d616957899 --- src/llamafactory/hparams/finetuning_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index 28da95ad..3867c0ec 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -113,7 +113,7 @@ class LoraArguments: metadata={"help": "Whether or not to initialize a PiSSA adapter."}, ) pissa_iter: int = field( - default=4, + default=16, metadata={"help": "The number of iteration steps performed by FSVD in PiSSA. 
Use -1 to disable it."}, ) pissa_convert: bool = field( From 6ce0b5891bc8066daed2b6cb964ae3caa26b6677 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 06:00:26 +0800 Subject: [PATCH 152/162] bf16 by default, gemma2 attns Gemma2 finetuning cannot work until merging https://github.com/huggingface/transformers/pull/31674 Former-commit-id: da66c32c7be0adc28d2185b23e9f62d56acb961c --- src/llamafactory/model/model_utils/attention.py | 8 +++++++- src/llamafactory/model/patcher.py | 2 +- src/llamafactory/webui/components/train.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index dfd90936..9021d277 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -28,7 +28,13 @@ if TYPE_CHECKING: logger = get_logger(__name__) -def configure_attn_implementation(config: "PretrainedConfig", model_args: "ModelArguments") -> None: +def configure_attn_implementation( + config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool +) -> None: + if getattr(config, "model_type", None) == "gemma2" and is_trainable: # gemma2 adopts soft-cap attention + logger.warning("Gemma-2 models should use eager attention in training, change `flash_attn` to disabled.") + model_args.flash_attn = "disabled" + if model_args.flash_attn == "auto": return diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 24cd2601..4eae0bb4 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -67,7 +67,7 @@ def patch_config( use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"] torch.npu.set_compile_mode(jit_compile=use_jit_compile) - configure_attn_implementation(config, model_args) + configure_attn_implementation(config, model_args, is_trainable) configure_rope(config, model_args, is_trainable) configure_longlora(config, model_args, is_trainable) configure_quantization(config, tokenizer, model_args, init_kwargs) diff --git a/src/llamafactory/webui/components/train.py b/src/llamafactory/webui/components/train.py index f33c37ee..d4832dd3 100644 --- a/src/llamafactory/webui/components/train.py +++ b/src/llamafactory/webui/components/train.py @@ -54,7 +54,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: num_train_epochs = gr.Textbox(value="3.0") max_grad_norm = gr.Textbox(value="1.0") max_samples = gr.Textbox(value="100000") - compute_type = gr.Dropdown(choices=["fp16", "bf16", "fp32", "pure_bf16"], value="fp16") + compute_type = gr.Dropdown(choices=["bf16", "fp16", "fp32", "pure_bf16"], value="bf16") input_elems.update({learning_rate, num_train_epochs, max_grad_norm, max_samples, compute_type}) elem_dict.update( From b9f2c6e64ef9f244b85b0093ce4749f07afdaa70 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 06:55:19 +0800 Subject: [PATCH 153/162] update readme Former-commit-id: 9f809c311af373508cb51b204ae54b047729a9dc --- README.md | 57 +++++++++++++++++++++++++--------------------------- README_zh.md | 57 +++++++++++++++++++++++++--------------------------- 2 files changed, 54 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 27d1e98b..6c6a48d9 100644 --- a/README.md +++ b/README.md @@ -151,35 +151,32 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models -| Model | Model size | Template | -| 
--------------------------------------------------------- | -------------------------------- | --------- | -| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | gemma | -| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | -| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | -| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| Model | Model size | Template | +| ------------------------------------------------------------ | -------------------------------- | --------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | gemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Qwen/Qwen1.5/Qwen2 (Code/MoE)](https://huggingface.co/Qwen) | 
0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5](https://huggingface.co/01-ai) | 6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. @@ -610,7 +607,7 @@ If you have a project that should be incorporated, please contact via email or c This repository is licensed under the [Apache-2.0 License](LICENSE). -Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +Please follow the model licenses to use the corresponding model weights: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 
2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## Citation diff --git a/README_zh.md b/README_zh.md index 9fb56a4e..f695646a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -151,35 +151,32 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 模型 -| 模型名 | 模型大小 | Template | -| --------------------------------------------------------- | -------------------------------- | --------- | -| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | gemma | -| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | -| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | -| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| 模型名 | 模型大小 | Template | +| ------------------------------------------------------------ | -------------------------------- | --------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 
7B/13B/70B | llama2 | +| [Llama 3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | gemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Qwen/Qwen1.5/Qwen2 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5](https://huggingface.co/01-ai) | 6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 @@ -610,7 +607,7 @@ run_name: test_run # 可选 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。 -使用模型权重时,请遵循对应的模型协议:[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +使用模型权重时,请遵循对应的模型协议:[Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / 
[Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## 引用 From f7a4f3d9c041c3dcb5169d3dae50dc0fc885fb31 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 29 Jun 2024 01:42:14 +0800 Subject: [PATCH 154/162] loose gemma2 attention Former-commit-id: a0b645017a2de3d58b6cbc71bd91ec96fc7a818b --- src/llamafactory/model/model_utils/attention.py | 10 ++++++++-- src/llamafactory/train/callbacks.py | 5 +---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index 9021d277..80d9d4b8 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -32,8 +32,14 @@ def configure_attn_implementation( config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool ) -> None: if getattr(config, "model_type", None) == "gemma2" and is_trainable: # gemma2 adopts soft-cap attention - logger.warning("Gemma-2 models should use eager attention in training, change `flash_attn` to disabled.") - model_args.flash_attn = "disabled" + if model_args.flash_attn == "auto": + logger.warning("Gemma-2 models should use eager attention in training, change `flash_attn` to disabled.") + model_args.flash_attn = "disabled" + else: + logger.warning( + "Gemma-2 models should use eager attention in training, but you set `flash_attn: {}`. " + "Will proceed at your own risk.".format(model_args.flash_attn) + ) if model_args.flash_attn == "auto": return diff --git a/src/llamafactory/train/callbacks.py b/src/llamafactory/train/callbacks.py index dc9c981e..4d024278 100644 --- a/src/llamafactory/train/callbacks.py +++ b/src/llamafactory/train/callbacks.py @@ -1,7 +1,4 @@ -# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. -# -# This code is inspired by the HuggingFace's transformers library. -# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer.py +# Copyright 2024 the LlamaFactory team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
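The trainer changes in the patches above share one idea: instead of each trainer class overriding `_save` to persist the processor or convert the PiSSA adapter, the behavior moves into reusable `TrainerCallback`s registered via `Trainer.add_callback`. A minimal sketch of that pattern follows, assuming only the standard `transformers.TrainerCallback` hooks; `ProcessorSavingCallback` is an illustrative name, not the repository's actual implementation:

```python
import os

from transformers import TrainerCallback


class ProcessorSavingCallback(TrainerCallback):
    """Sketch: save a multimodal processor alongside each model checkpoint."""

    def __init__(self, processor):
        self.processor = processor

    def on_save(self, args, state, control, **kwargs):
        # Fired after each checkpoint is written; `args` is the TrainingArguments
        # instance, and checkpoints live under "checkpoint-{global_step}".
        output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(state.global_step))
        getattr(self.processor, "image_processor").save_pretrained(output_dir)

    def on_train_end(self, args, state, control, **kwargs):
        # Save once more into the final output directory when training finishes.
        getattr(self.processor, "image_processor").save_pretrained(args.output_dir)
```

A trainer then opts in with `self.add_callback(ProcessorSavingCallback(processor))`, so the pt, rm, and sft trainers share a single save path rather than maintaining three near-identical `_save` overrides.
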
From 9a0723143aee05e7fae671f2a3fb994c84a26e6b Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 30 Jun 2024 21:05:31 +0800 Subject: [PATCH 155/162] update npu docker Former-commit-id: 2f4d5174205605b8821d4fb626283e07694ecf80 --- docker/docker-npu/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index e413d4e3..34cf9616 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -1,6 +1,9 @@ # Use the Ubuntu 22.04 image with CANN 8.0.rc1 # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags +# FROM cosdt/cann:8.0.rc1-910-ubuntu22.04 FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +# FROM cosdt/cann:8.0.rc1-910-openeuler22.03 +# FROM cosdt/cann:8.0.rc1-910b-openeuler22.03 # Define environments ENV DEBIAN_FRONTEND=noninteractive From 35c65ddf8c362741b86bc78dd9a6756a3ef193ac Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 30 Jun 2024 21:28:51 +0800 Subject: [PATCH 156/162] fix #4398 #4592 Former-commit-id: 8c92d268903c00392c8bd75a731daa1f107d6202 --- README.md | 2 ++ README_zh.md | 4 +++- src/llamafactory/extras/misc.py | 24 +++++++++++++++++------- src/llamafactory/model/loader.py | 3 ++- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 6c6a48d9..6299650b 100644 --- a/README.md +++ b/README.md @@ -384,6 +384,8 @@ Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations. +Download the pre-built Docker images: [32GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) | [64GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) +
### Data Preparation diff --git a/README_zh.md b/README_zh.md index f695646a..efc647a7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -357,7 +357,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
昇腾 NPU 用户指南 -在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e ".[torch-npu,metrics]"` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: +在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e ".[torch-npu,metrics]"` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit 与 Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: ```bash # 请替换 URL 为 CANN 版本和设备型号对应的 URL @@ -384,6 +384,8 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh 如果遇到无法正常推理的情况,请尝试设置 `do_sample: false`。 +下载预构建 Docker 镜像:[32GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) | [64GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) +
### 数据准备 diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 30c287bd..20c752c5 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -20,7 +20,9 @@ import os from typing import TYPE_CHECKING, Tuple import torch +import transformers.dynamic_module_utils from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList +from transformers.dynamic_module_utils import get_relative_imports from transformers.utils import ( is_torch_bf16_gpu_available, is_torch_cuda_available, @@ -69,6 +71,9 @@ class AverageMeter: def check_dependencies() -> None: + r""" + Checks the version of the required packages. + """ if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]: logger.warning("Version checking has been disabled, may lead to unexpected behaviors.") else: @@ -79,7 +84,7 @@ def check_dependencies() -> None: require_version("trl>=0.8.6", "To fix: pip install trl>=0.8.6") -def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: +def count_parameters(model: "torch.nn.Module") -> Tuple[int, int]: r""" Returns the number of trainable parameters and number of all parameters in the model. """ @@ -108,7 +113,7 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: return trainable_params, all_param -def get_current_device() -> torch.device: +def get_current_device() -> "torch.device": r""" Gets the current available device. """ @@ -147,6 +152,13 @@ def get_logits_processor() -> "LogitsProcessorList": return logits_processor +def has_tokenized_data(path: "os.PathLike") -> bool: + r""" + Checks if the path has a tokenized dataset. + """ + return os.path.isdir(path) and len(os.listdir(path)) > 0 + + def infer_optim_dtype(model_dtype: "torch.dtype") -> "torch.dtype": r""" Infers the optimal dtype according to the model_dtype and device compatibility. @@ -166,11 +178,9 @@ def is_gpu_or_npu_available() -> bool: return is_torch_npu_available() or is_torch_cuda_available() -def has_tokenized_data(path: "os.PathLike") -> bool: - r""" - Checks if the path has a tokenized dataset. - """ - return os.path.isdir(path) and len(os.listdir(path)) > 0 +def skip_check_imports() -> None: + if os.environ.get("FORCE_CHECK_IMPORTS", "0").lower() not in ["true", "1"]: + transformers.dynamic_module_utils.check_imports = get_relative_imports def torch_gc() -> None: diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 1261d17a..fe700d53 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -19,7 +19,7 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Se from trl import AutoModelForCausalLMWithValueHead from ..extras.logging import get_logger -from ..extras.misc import count_parameters, try_download_model_from_ms +from ..extras.misc import count_parameters, skip_check_imports, try_download_model_from_ms from .adapter import init_adapter from .model_utils.misc import register_autoclass from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model @@ -48,6 +48,7 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]: Note: including inplace operation of model_args. 
""" + skip_check_imports() model_args.model_name_or_path = try_download_model_from_ms(model_args) return { "trust_remote_code": True, From 9988b1b0298517e8f56747b0c4cd3cc60205b085 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 1 Jul 2024 00:22:52 +0800 Subject: [PATCH 157/162] update readme Former-commit-id: 7998d969bf942c91cf41a189e3941f6e04c81c6f --- README.md | 10 +++++++++- README_zh.md | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6299650b..3d3feae5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE) [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-63-green)](#projects-using-llama-factory) +[![Citation](https://img.shields.io/badge/citation-71-green)](#projects-using-llama-factory) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -593,6 +593,14 @@ If you have a project that should be incorporated, please contact via email or c 1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. [[arxiv]](https://arxiv.org/abs/2406.07115) 1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) 1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) +1. Ding et al. IntentionQA: A Benchmark for Evaluating Purchase Intention Comprehension Abilities of Language Models in E-commerce. 2024. [[arxiv]](https://arxiv.org/abs/2406.10173) +1. He et al. COMMUNITY-CROSS-INSTRUCT: Unsupervised Instruction Generation for Aligning Large Language Models to Online Communities. 2024. [[arxiv]](https://arxiv.org/abs/2406.12074) +1. Lin et al. FVEL: Interactive Formal Verification Environment with Large Language Models via Theorem Proving. 2024. [[arxiv]](https://arxiv.org/abs/2406.14408) +1. Treutlein et al. Connecting the Dots: LLMs can Infer and Verbalize Latent Structure from Disparate Training Data. 2024. [[arxiv]](https://arxiv.org/abs/2406.14546) +1. Feng et al. SS-Bench: A Benchmark for Social Story Generation and Evaluation. 2024. [[arxiv]](https://arxiv.org/abs/2406.15695) +1. Feng et al. Self-Constructed Context Decompilation with Fined-grained Alignment Enhancement. 2024. [[arxiv]](https://arxiv.org/abs/2406.17233) +1. Liu et al. Large Language Models for Cuffless Blood Pressure Measurement From Wearable Biosignals. 2024. [[arxiv]](https://arxiv.org/abs/2406.18069) +1. Iyer et al. Exploring Very Low-Resource Translation with LLMs: The University of Edinburgh’s Submission to AmericasNLP 2024 Translation Task. AmericasNLP 2024. [[paper]](https://aclanthology.org/2024.americasnlp-1.25) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B. 1. 
**[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge. 1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B. diff --git a/README_zh.md b/README_zh.md index efc647a7..cb5a42e4 100644 --- a/README_zh.md +++ b/README_zh.md @@ -4,7 +4,7 @@ [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE) [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-63-green)](#使用了-llama-factory-的项目) +[![Citation](https://img.shields.io/badge/citation-71-green)](#使用了-llama-factory-的项目) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -593,6 +593,14 @@ run_name: test_run # 可选 1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. [[arxiv]](https://arxiv.org/abs/2406.07115) 1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) 1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) +1. Ding et al. IntentionQA: A Benchmark for Evaluating Purchase Intention Comprehension Abilities of Language Models in E-commerce. 2024. [[arxiv]](https://arxiv.org/abs/2406.10173) +1. He et al. COMMUNITY-CROSS-INSTRUCT: Unsupervised Instruction Generation for Aligning Large Language Models to Online Communities. 2024. [[arxiv]](https://arxiv.org/abs/2406.12074) +1. Lin et al. FVEL: Interactive Formal Verification Environment with Large Language Models via Theorem Proving. 2024. [[arxiv]](https://arxiv.org/abs/2406.14408) +1. Treutlein et al. Connecting the Dots: LLMs can Infer and Verbalize Latent Structure from Disparate Training Data. 2024. [[arxiv]](https://arxiv.org/abs/2406.14546) +1. Feng et al. SS-Bench: A Benchmark for Social Story Generation and Evaluation. 2024. [[arxiv]](https://arxiv.org/abs/2406.15695) +1. Feng et al. Self-Constructed Context Decompilation with Fined-grained Alignment Enhancement. 2024. [[arxiv]](https://arxiv.org/abs/2406.17233) +1. Liu et al. Large Language Models for Cuffless Blood Pressure Measurement From Wearable Biosignals. 2024. [[arxiv]](https://arxiv.org/abs/2406.18069) +1. Iyer et al. Exploring Very Low-Resource Translation with LLMs: The University of Edinburgh’s Submission to AmericasNLP 2024 Translation Task. AmericasNLP 2024. [[paper]](https://aclanthology.org/2024.americasnlp-1.25) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper,基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM,基于 Baichuan-13B 微调而得,具有法律推理和知识检索能力。 1. 
**[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao,基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。 From a43f5183894292021a07f538852c6b548c86aa7f Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 1 Jul 2024 01:19:27 +0800 Subject: [PATCH 158/162] fix #4402 #4617 Deprecate reserved_label_len arg Former-commit-id: 4b6568984c0be4b31e7aa91b7c0d52b7f7b12b0b --- src/llamafactory/data/data_utils.py | 12 +- src/llamafactory/data/formatter.py | 116 ++------------- src/llamafactory/data/processors/feedback.py | 16 +- src/llamafactory/data/processors/pairwise.py | 17 ++- .../data/processors/processor_utils.py | 15 +- .../data/processors/supervised.py | 15 +- .../data/processors/unsupervised.py | 9 +- src/llamafactory/data/template.py | 114 +++++--------- src/llamafactory/data/tool_utils.py | 140 ++++++++++++++++++ src/llamafactory/hparams/data_args.py | 7 - tests/data/test_formatter.py | 4 +- tests/data/test_processor.py | 32 ++++ tests/data/test_template.py | 55 ++++++- 13 files changed, 329 insertions(+), 223 deletions(-) create mode 100644 src/llamafactory/data/tool_utils.py create mode 100644 tests/data/test_processor.py diff --git a/src/llamafactory/data/data_utils.py b/src/llamafactory/data/data_utils.py index cc9761b1..76ded47e 100644 --- a/src/llamafactory/data/data_utils.py +++ b/src/llamafactory/data/data_utils.py @@ -13,7 +13,7 @@ # limitations under the License. from enum import Enum, unique -from typing import TYPE_CHECKING, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Sequence, Set, Union from datasets import concatenate_datasets, interleave_datasets @@ -30,6 +30,9 @@ if TYPE_CHECKING: logger = get_logger(__name__) +SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]] + + @unique class Role(str, Enum): USER = "user" @@ -39,13 +42,6 @@ class Role(str, Enum): OBSERVATION = "observation" -def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]: - max_target_len = int(max_len * (target_len / (source_len + target_len))) - max_target_len = max(max_target_len, reserved_label_len) - max_source_len = max_len - min(max_target_len, target_len) - return max_source_len, max_target_len - - def merge_dataset( all_datasets: List[Union["Dataset", "IterableDataset"]], data_args: "DataArguments", diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 88ebf682..c1653a76 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -16,97 +16,10 @@ import json import re from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Tuple, Union +from typing import List, Literal, Optional, Tuple, Union - -SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]] - - -DEFAULT_TOOL_PROMPT = ( - "You have access to the following tools:\n{tool_text}" - "Use the following format if using a tool:\n" - "```\n" - "Action: tool name (one of [{tool_names}]).\n" - "Action Input: the input to the tool, in a JSON format representing the kwargs " - """(e.g. 
```{{"input": "hello world", "num_beams": 5}}```).\n""" - "```\n" -) - - -GLM4_TOOL_PROMPT = ( - "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," - "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具{tool_text}" -) - - -def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: - tool_text = "" - tool_names = [] - for tool in tools: - param_text = "" - for name, param in tool["parameters"]["properties"].items(): - required = ", required" if name in tool["parameters"].get("required", []) else "" - enum = ", should be one of [{}]".format(", ".join(param["enum"])) if param.get("enum", None) else "" - items = ( - ", where each item should be {}".format(param["items"].get("type", "")) if param.get("items") else "" - ) - param_text += " - {name} ({type}{required}): {desc}{enum}{items}\n".format( - name=name, - type=param.get("type", ""), - required=required, - desc=param.get("description", ""), - enum=enum, - items=items, - ) - - tool_text += "> Tool Name: {name}\nTool Description: {desc}\nTool Args:\n{args}\n".format( - name=tool["name"], desc=tool.get("description", ""), args=param_text - ) - tool_names.append(tool["name"]) - - return DEFAULT_TOOL_PROMPT.format(tool_text=tool_text, tool_names=", ".join(tool_names)) - - -def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|\s*$)", re.DOTALL) - action_match: List[Tuple[str, str]] = re.findall(regex, content) - if not action_match: - return content - - results = [] - for match in action_match: - tool_name = match[0].strip() - tool_input = match[1].strip().strip('"').strip("```") - try: - arguments = json.loads(tool_input) - results.append((tool_name, json.dumps(arguments, ensure_ascii=False))) - except json.JSONDecodeError: - return content - - return results - - -def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: - tool_text = "" - for tool in tools: - tool_text += "\n\n## {name}\n\n{body}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( - name=tool["name"], body=json.dumps(tool, indent=4, ensure_ascii=False) - ) - - return GLM4_TOOL_PROMPT.format(tool_text=tool_text) - - -def glm4_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - if "\n" not in content: - return content - - tool_name, tool_input = content.split("\n", maxsplit=1) - try: - arguments = json.loads(tool_input) - except json.JSONDecodeError: - return content - - return [(tool_name, json.dumps(arguments, ensure_ascii=False))] +from .data_utils import SLOTS +from .tool_utils import DefaultToolUtils, GLM4ToolUtils @dataclass @@ -168,15 +81,12 @@ class StringFormatter(Formatter): @dataclass class FunctionFormatter(Formatter): def __post_init__(self): - has_name, has_args = False, False - for slot in filter(lambda s: isinstance(s, str), self.slots): - if "{{name}}" in slot: - has_name = True - if "{{arguments}}" in slot: - has_args = True - - if not has_name or not has_args: - raise ValueError("Name and arguments placeholders are required in the function formatter.") + if self.tool_format == "default": + self.slots = DefaultToolUtils.get_function_slots() + self.slots + elif self.tool_format == "glm4": + self.slots = GLM4ToolUtils.get_function_slots() + self.slots + else: + raise NotImplementedError("Tool format {} was not found.".format(self.tool_format)) def apply(self, **kwargs) -> SLOTS: content = kwargs.pop("content") @@ -210,11 +120,11 @@ class FunctionFormatter(Formatter): class ToolFormatter(Formatter): def __post_init__(self): if self.tool_format == "default": 
diff --git a/src/llamafactory/data/processors/feedback.py b/src/llamafactory/data/processors/feedback.py
index 219ab353..7ba05e23 100644
--- a/src/llamafactory/data/processors/feedback.py
+++ b/src/llamafactory/data/processors/feedback.py
@@ -16,7 +16,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
-from .processor_utils import get_paligemma_token_type_ids, get_pixel_values
+from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, infer_seqlen
 
 
 if TYPE_CHECKING:
@@ -55,12 +55,8 @@ def _encode_feedback_example(
     else:
         kl_messages = prompt + [kl_response[1]]
 
-    prompt_ids, response_ids = template.encode_oneturn(
-        tokenizer, messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len
-    )
-    _, kl_response_ids = template.encode_oneturn(
-        tokenizer, kl_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len
-    )
+    prompt_ids, response_ids = template.encode_oneturn(tokenizer, messages, system, tools)
+    _, kl_response_ids = template.encode_oneturn(tokenizer, kl_messages, system, tools)
 
     if template.efficient_eos:
         response_ids += [tokenizer.eos_token_id]
@@ -70,6 +66,12 @@ def _encode_feedback_example(
         image_token_id = tokenizer.convert_tokens_to_ids(template.image_token)
         prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids
 
+    # do not consider the kl_response
+    source_len, target_len = infer_seqlen(len(prompt_ids), len(response_ids), data_args.cutoff_len)
+    prompt_ids = prompt_ids[:source_len]
+    response_ids = response_ids[:target_len]
+    kl_response_ids = kl_response_ids[:target_len]
+
     input_ids = prompt_ids + response_ids
     labels = [IGNORE_INDEX] * len(prompt_ids) + response_ids
     kl_input_ids = prompt_ids + kl_response_ids
diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py
index b2939348..c6001e6e 100644
--- a/src/llamafactory/data/processors/pairwise.py
+++ b/src/llamafactory/data/processors/pairwise.py
@@ -16,7 +16,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
-from .processor_utils import get_paligemma_token_type_ids, get_pixel_values
+from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, infer_seqlen
 
 
 if TYPE_CHECKING:
@@ -44,12 +44,8 @@ def _encode_pairwise_example(
     chosen_messages = prompt + [response[0]]
     rejected_messages = prompt + [response[1]]
 
-    prompt_ids, chosen_ids = template.encode_oneturn(
-        tokenizer, chosen_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len
-    )
-    _, rejected_ids = template.encode_oneturn(
-        tokenizer, rejected_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len
-    )
+    prompt_ids, chosen_ids = template.encode_oneturn(tokenizer, chosen_messages, system, tools)
+    _, rejected_ids = template.encode_oneturn(tokenizer, rejected_messages, system, tools)
 
     if template.efficient_eos:
         chosen_ids += [tokenizer.eos_token_id]
@@ -59,6 +55,13 @@ def _encode_pairwise_example(
         image_token_id = tokenizer.convert_tokens_to_ids(template.image_token)
         prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids
 
+    source_len, target_len = infer_seqlen(
+        len(prompt_ids), max(len(chosen_ids), len(rejected_ids)), data_args.cutoff_len
+    )  # consider the response is more important
+    prompt_ids = prompt_ids[:source_len]
+    chosen_ids = chosen_ids[:target_len]
+    rejected_ids = rejected_ids[:target_len]
+
     chosen_input_ids = prompt_ids + chosen_ids
     chosen_labels = [IGNORE_INDEX] * len(prompt_ids) + chosen_ids
     rejected_input_ids = prompt_ids + rejected_ids
diff --git a/src/llamafactory/data/processors/processor_utils.py b/src/llamafactory/data/processors/processor_utils.py
index 93df0cd5..455908ae 100644
--- a/src/llamafactory/data/processors/processor_utils.py
+++ b/src/llamafactory/data/processors/processor_utils.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import bisect
-from typing import TYPE_CHECKING, List, Sequence
+from typing import TYPE_CHECKING, List, Sequence, Tuple
 
 from ...extras.packages import is_pillow_available
 
@@ -76,3 +76,16 @@ def get_paligemma_token_type_ids(input_len: int, processor: "ProcessorMixin") -> List[int]:
     """
     image_seq_length = getattr(processor, "image_seq_length")
     return [0] * image_seq_length + [1] * (input_len - image_seq_length)
+
+
+def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> Tuple[int, int]:
+    if target_len * 2 < cutoff_len:  # truncate source
+        max_target_len = cutoff_len
+    elif source_len * 2 < cutoff_len:  # truncate target
+        max_target_len = cutoff_len - source_len
+    else:  # truncate both
+        max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))
+
+    new_target_len = min(max_target_len, target_len)
+    new_source_len = max(cutoff_len - new_target_len, 0)
+    return new_source_len, new_target_len
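`infer_seqlen` is the replacement for the removed `reserved_label_len` mechanism: the cutoff budget is now split between source and target at encoding time, proportionally when both sides are long, while a short side is kept whole and the other side absorbs the cut. A quick sketch of its behavior; the values mirror `tests/data/test_processor.py`, added later in this series:

```python
from llamafactory.data.processors.processor_utils import infer_seqlen

# Both sides exceed the budget: split 1000 tokens 3:2 between source and target.
print(infer_seqlen(3000, 2000, 1000))  # (600, 400)
# The short side is kept whole; the long side absorbs the truncation.
print(infer_seqlen(100, 1000, 1000))   # (100, 900)
print(infer_seqlen(1000, 100, 1000))   # (900, 100)
```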
""" - return self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len) + encoded_messages = self._encode(tokenizer, messages, system, tools) + return [(encoded_messages[i], encoded_messages[i + 1]) for i in range(0, len(encoded_messages), 2)] def extract_tool(self, content: str) -> Union[str, List[Tuple[str, str]]]: r""" @@ -88,16 +85,14 @@ class Template: def _encode( self, tokenizer: "PreTrainedTokenizer", - messages: List[Dict[str, str]], + messages: Sequence[Dict[str, str]], system: Optional[str], tools: Optional[str], - cutoff_len: int, - reserved_label_len: int, - ) -> Sequence[Tuple[List[int], List[int]]]: + ) -> List[List[int]]: r""" Encodes formatted inputs to pairs of token ids. - Turn 0: system + query resp - Turn t: sep + query resp + Turn 0: prefix + system + query resp + Turn t: sep + query resp """ system = system or self.default_system encoded_messages = [] @@ -106,10 +101,9 @@ class Template: if i == 0: elements += self.format_prefix.apply() - - if i == 0 and (system or tools): - tool_text = self.format_tools.apply(content=tools)[0] if tools else "" - elements += self.format_system.apply(content=(system + tool_text)) + if system or tools: + tool_text = self.format_tools.apply(content=tools)[0] if tools else "" + elements += self.format_system.apply(content=(system + tool_text)) if i > 0 and i % 2 == 0: elements += self.format_separator.apply() @@ -127,11 +121,9 @@ class Template: encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements)) - return self._make_pairs(encoded_messages, cutoff_len, reserved_label_len) + return encoded_messages - def _convert_elements_to_ids( - self, tokenizer: "PreTrainedTokenizer", elements: List[Union[str, Dict[str, str]]] - ) -> List[int]: + def _convert_elements_to_ids(self, tokenizer: "PreTrainedTokenizer", elements: "SLOTS") -> List[int]: r""" Converts elements to token ids. """ @@ -152,60 +144,32 @@ class Template: return token_ids - def _make_pairs( - self, - encoded_messages: Sequence[List[int]], - cutoff_len: int, - reserved_label_len: int, - ) -> Sequence[Tuple[List[int], List[int]]]: - encoded_pairs = [] - total_length = 0 - for i in range(0, len(encoded_messages), 2): - if total_length >= cutoff_len: - break - - max_source_len, max_target_len = infer_max_len( - source_len=len(encoded_messages[i]), - target_len=len(encoded_messages[i + 1]), - max_len=(cutoff_len - total_length), - reserved_label_len=reserved_label_len, - ) - source_ids = encoded_messages[i][:max_source_len] - target_ids = encoded_messages[i + 1][:max_target_len] - total_length += len(source_ids) + len(target_ids) - encoded_pairs.append((source_ids, target_ids)) - - return encoded_pairs - @dataclass class Llama2Template(Template): def _encode( self, tokenizer: "PreTrainedTokenizer", - messages: List[Dict[str, str]], + messages: Sequence[Dict[str, str]], system: str, tools: str, - cutoff_len: int, - reserved_label_len: int, - ) -> Sequence[Tuple[List[int], List[int]]]: + ) -> List[List[int]]: r""" Encodes formatted inputs to pairs of token ids. 
diff --git a/src/llamafactory/data/tool_utils.py b/src/llamafactory/data/tool_utils.py
new file mode 100644
index 00000000..ac5565d5
--- /dev/null
+++ b/src/llamafactory/data/tool_utils.py
@@ -0,0 +1,140 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple, Union
+
+from .data_utils import SLOTS
+
+
+DEFAULT_TOOL_PROMPT = (
+    "You have access to the following tools:\n{tool_text}"
+    "Use the following format if using a tool:\n"
+    "```\n"
+    "Action: tool name (one of [{tool_names}]).\n"
+    "Action Input: the input to the tool, in a JSON format representing the kwargs "
+    """(e.g. ```{{"input": "hello world", "num_beams": 5}}```).\n"""
+    "```\n"
+)
+
+
+GLM4_TOOL_PROMPT = (
+    "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,"
+    "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具{tool_text}"
+)
+
+
+@dataclass
+class ToolUtils(ABC):
+    @staticmethod
+    @abstractmethod
+    def get_function_slots() -> SLOTS: ...
+
+    @staticmethod
+    @abstractmethod
+    def tool_formatter(tools: List[Dict[str, Any]]) -> str: ...
+
+    @staticmethod
+    @abstractmethod
+    def tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: ...
+
+
+class DefaultToolUtils(ToolUtils):
+    @staticmethod
+    def get_function_slots() -> SLOTS:
+        return ["Action: {{name}}\nAction Input: {{arguments}}\n"]
+
+    @staticmethod
+    def tool_formatter(tools: List[Dict[str, Any]]) -> str:
+        tool_text = ""
+        tool_names = []
+        for tool in tools:
+            param_text = ""
+            for name, param in tool["parameters"]["properties"].items():
+                required, enum, items = "", "", ""
+                if name in tool["parameters"].get("required", []):
+                    required = ", required"
+
+                if param.get("enum", None):
+                    enum = ", should be one of [{}]".format(", ".join(param["enum"]))
+
+                if param.get("items", None):
+                    items = ", where each item should be {}".format(param["items"].get("type", ""))
+
+                param_text += "  - {name} ({type}{required}): {desc}{enum}{items}\n".format(
+                    name=name,
+                    type=param.get("type", ""),
+                    required=required,
+                    desc=param.get("description", ""),
+                    enum=enum,
+                    items=items,
+                )
+
+            tool_text += "> Tool Name: {name}\nTool Description: {desc}\nTool Args:\n{args}\n".format(
+                name=tool["name"], desc=tool.get("description", ""), args=param_text
+            )
+            tool_names.append(tool["name"])
+
+        return DEFAULT_TOOL_PROMPT.format(tool_text=tool_text, tool_names=", ".join(tool_names))
+
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]:
+        regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|\s*$)", re.DOTALL)
+        action_match: List[Tuple[str, str]] = re.findall(regex, content)
+        if not action_match:
+            return content
+
+        results = []
+        for match in action_match:
+            tool_name = match[0].strip()
+            tool_input = match[1].strip().strip('"').strip("```")
+            try:
+                arguments = json.loads(tool_input)
+                results.append((tool_name, json.dumps(arguments, ensure_ascii=False)))
+            except json.JSONDecodeError:
+                return content
+
+        return results
+
+
+class GLM4ToolUtils(ToolUtils):
+    @staticmethod
+    def get_function_slots() -> SLOTS:
+        return ["{{name}}\n{{arguments}}"]
+
+    @staticmethod
+    def tool_formatter(tools: List[Dict[str, Any]]) -> str:
+        tool_text = ""
+        for tool in tools:
+            tool_text += "\n\n## {name}\n\n{body}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format(
+                name=tool["name"], body=json.dumps(tool, indent=4, ensure_ascii=False)
+            )
+
+        return GLM4_TOOL_PROMPT.format(tool_text=tool_text)
+
+    @staticmethod
+    def tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]:
+        if "\n" not in content:
+            return content
+
+        tool_name, tool_input = content.split("\n", maxsplit=1)
+        try:
+            arguments = json.loads(tool_input)
+        except json.JSONDecodeError:
+            return content
+
+        return [(tool_name, json.dumps(arguments, ensure_ascii=False))]
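With both formats behind one `ToolUtils` interface, the extractors can be exercised side by side. A minimal sketch, assuming `llamafactory` is importable; `get_weather` and its arguments are hypothetical values:

```python
from llamafactory.data.tool_utils import DefaultToolUtils, GLM4ToolUtils

# Default format: repeated "Action/Action Input" blocks, parsed by regex.
text = 'Action: get_weather\nAction Input: {"city": "Beijing"}\n'
print(DefaultToolUtils.tool_extractor(text))
# [('get_weather', '{"city": "Beijing"}')]

# GLM-4 format: function name on the first line, JSON arguments below it.
print(GLM4ToolUtils.tool_extractor('get_weather\n{"city": "Beijing"}'))
# [('get_weather', '{"city": "Beijing"}')]

# Anything that does not parse is returned unchanged as plain content.
print(GLM4ToolUtils.tool_extractor("a plain text answer"))
```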
diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py
index dad13820..880be84a 100644
--- a/src/llamafactory/hparams/data_args.py
+++ b/src/llamafactory/hparams/data_args.py
@@ -45,10 +45,6 @@ class DataArguments:
         default=1024,
         metadata={"help": "The cutoff length of the tokenized inputs in the dataset."},
     )
-    reserved_label_len: int = field(
-        default=1,
-        metadata={"help": "The minimum cutoff length reserved for the tokenized labels in the dataset."},
-    )
     train_on_prompt: bool = field(
         default=False,
         metadata={"help": "Whether to disable the mask on the prompt or not."},
@@ -111,9 +107,6 @@ class DataArguments:
     )
 
     def __post_init__(self):
-        if self.reserved_label_len >= self.cutoff_len:
-            raise ValueError("`reserved_label_len` must be smaller than `cutoff_len`.")
-
         if self.streaming and self.val_size > 1e-6 and self.val_size < 1:
             raise ValueError("Streaming mode should have an integer val size.")
diff --git a/tests/data/test_formatter.py b/tests/data/test_formatter.py
index 37b21dc5..1845df24 100644
--- a/tests/data/test_formatter.py
+++ b/tests/data/test_formatter.py
@@ -28,7 +28,7 @@ def test_string_formatter():
 
 
 def test_function_formatter():
-    formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}\n"])
+    formatter = FunctionFormatter(slots=[], tool_format="default")
     tool_calls = json.dumps({"name": "tool_name", "arguments": {"foo": "bar", "size": 10}})
     assert formatter.apply(content=tool_calls) == [
         """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n"""
@@ -36,7 +36,7 @@ def test_function_formatter():
 
 
 def test_multi_function_formatter():
-    formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}\n"])
+    formatter = FunctionFormatter(slots=[], tool_format="default")
     tool_calls = json.dumps([{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}] * 2)
     assert formatter.apply(content=tool_calls) == [
         """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""",
diff --git a/tests/data/test_processor.py b/tests/data/test_processor.py
new file mode 100644
index 00000000..fa8f7172
--- /dev/null
+++ b/tests/data/test_processor.py
@@ -0,0 +1,32 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple
+
+import pytest
+
+from llamafactory.data.processors.processor_utils import infer_seqlen
+
+
+@pytest.mark.parametrize(
+    "test_input,test_output",
+    [
+        ((3000, 2000, 1000), (600, 400)),
+        ((2000, 3000, 1000), (400, 600)),
+        ((1000, 100, 1000), (900, 100)),
+        ((100, 1000, 1000), (100, 900)),
+    ],
+)
+def test_infer_seqlen(test_input: Tuple[int, int, int], test_output: Tuple[int, int]):
+    assert test_output == infer_seqlen(*test_input)
diff --git a/tests/data/test_template.py b/tests/data/test_template.py
index 9d73c116..e4728a84 100644
--- a/tests/data/test_template.py
+++ b/tests/data/test_template.py
@@ -21,15 +21,60 @@ from llamafactory.data import get_template_and_fix_tokenizer
 
 TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
+MESSAGES = [
+    {"role": "user", "content": "How are you"},
+    {"role": "assistant", "content": "I am fine!"},
+    {"role": "user", "content": "你好"},
+    {"role": "assistant", "content": "很高兴认识你!"},
+]
+
+
+def test_encode_oneturn():
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
+    template = get_template_and_fix_tokenizer(tokenizer, name="llama3")
+    prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES)
+    assert tokenizer.decode(prompt_ids) == (
+        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\nI am fine!<|eot_id|>"
+        "<|start_header_id|>user<|end_header_id|>\n\n你好<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+    )
+    assert tokenizer.decode(answer_ids) == "很高兴认识你!<|eot_id|>"
+
+
+def test_encode_multiturn():
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
+    template = get_template_and_fix_tokenizer(tokenizer, name="llama3")
+    encoded_pairs = template.encode_multiturn(tokenizer, MESSAGES)
+    assert tokenizer.decode(encoded_pairs[0][0]) == (
+        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+    )
+    assert tokenizer.decode(encoded_pairs[0][1]) == "I am fine!<|eot_id|>"
+    assert tokenizer.decode(encoded_pairs[1][0]) == (
+        "<|start_header_id|>user<|end_header_id|>\n\n你好<|eot_id|>"
+        "<|start_header_id|>assistant<|end_header_id|>\n\n"
+    )
+    assert tokenizer.decode(encoded_pairs[1][1]) == "很高兴认识你!<|eot_id|>"
+
 
 def test_jinja_template():
     tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
     ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
     get_template_and_fix_tokenizer(tokenizer, name="llama3")
     assert tokenizer.chat_template != ref_tokenizer.chat_template
+    assert tokenizer.apply_chat_template(MESSAGES) == ref_tokenizer.apply_chat_template(MESSAGES)
 
-    messages = [
-        {"role": "user", "content": "hi!"},
-        {"role": "assistant", "content": "hello there"},
-    ]
-    assert tokenizer.apply_chat_template(messages) == ref_tokenizer.apply_chat_template(messages)
+
+def test_qwen_template():
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
+    template = get_template_and_fix_tokenizer(tokenizer, name="qwen")
+    prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES)
+    assert tokenizer.decode(prompt_ids) == (
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+        "<|im_start|>user\nHow are you<|im_end|>\n"
+        "<|im_start|>assistant\nI am fine!<|im_end|>\n"
+        "<|im_start|>user\n你好<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+    assert tokenizer.decode(answer_ids) == "很高兴认识你!<|im_end|>"
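The new tests double as usage documentation for the template API. A quick interactive check along the same lines, assuming the tiny Llama-3 tokenizer can be fetched from the Hugging Face Hub:

```python
from transformers import AutoTokenizer

from llamafactory.data import get_template_and_fix_tokenizer

tokenizer = AutoTokenizer.from_pretrained("llamafactory/tiny-random-Llama-3")
template = get_template_and_fix_tokenizer(tokenizer, name="llama3")
messages = [
    {"role": "user", "content": "How are you"},
    {"role": "assistant", "content": "I am fine!"},
]
prompt_ids, answer_ids = template.encode_oneturn(tokenizer, messages)
# The prompt ends with the assistant header; the answer carries the response.
print(tokenizer.decode(prompt_ids))
print(tokenizer.decode(answer_ids))  # "I am fine!<|eot_id|>"
```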
From 92607846d08a699477aa6fe699407b583167d672 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Mon, 1 Jul 2024 01:29:09 +0800
Subject: [PATCH 159/162] Update label_issue.yml

Former-commit-id: fffa3defdda02ad579cb703c0704f94bad94f21a
---
 .github/workflows/label_issue.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml
index 0e10f0b9..ffd644a7 100644
--- a/.github/workflows/label_issue.yml
+++ b/.github/workflows/label_issue.yml
@@ -16,7 +16,7 @@ jobs:
           ISSUE_TITLE: ${{ github.event.issue.title }}
         run: |
           LABEL=pending
-          NPU_KEYWORDS=(npu ascend 昇腾)
+          NPU_KEYWORDS=(npu ascend huawei 华为 昇腾)
           ISSUE_TITLE_LOWER=$(echo $ISSUE_TITLE | tr '[:upper:]' '[:lower:]')
           for KEYWORD in ${NPU_KEYWORDS[@]}; do
             if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]]; then
""" - preds, labels = eval_preds + preds, labels = eval_preds.predictions, eval_preds.label_ids score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []} preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 06bd2b6b..954bb69f 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -135,21 +135,16 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): for i in range(len(preds)): pad_len = np.nonzero(preds[i] != self.tokenizer.pad_token_id)[0] - if len(pad_len): - preds[i] = np.concatenate( - (preds[i][pad_len[0] :], preds[i][: pad_len[0]]), axis=-1 - ) # move pad token to last + if len(pad_len): # move pad token to last + preds[i] = np.concatenate((preds[i][pad_len[0] :], preds[i][: pad_len[0]]), axis=-1) - decoded_inputs = self.tokenizer.batch_decode( - dataset["input_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=False - ) - decoded_labels = self.tokenizer.batch_decode( - labels, skip_special_tokens=True, clean_up_tokenization_spaces=False - ) - decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True) + decoded_inputs = self.tokenizer.batch_decode(dataset["input_ids"], skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) with open(output_prediction_file, "w", encoding="utf-8") as writer: res: List[str] = [] for text, label, pred in zip(decoded_inputs, decoded_labels, decoded_preds): res.append(json.dumps({"prompt": text, "label": label, "predict": pred}, ensure_ascii=False)) + writer.write("\n".join(res)) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 885bc7ac..0c3f9b11 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -25,7 +25,7 @@ from ...extras.misc import get_logits_processor from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..trainer_utils import create_modelcard_and_push -from .metric import ComputeMetrics +from .metric import ComputeMetrics, compute_accuracy, eval_logit_processor from .trainer import CustomSeq2SeqTrainer @@ -72,7 +72,8 @@ def run_sft( finetuning_args=finetuning_args, data_collator=data_collator, callbacks=callbacks, - compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else compute_accuracy, + preprocess_logits_for_metrics=None if training_args.predict_with_generate else eval_logit_processor, **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) @@ -91,7 +92,7 @@ def run_sft( trainer.save_metrics("train", train_result.metrics) trainer.save_state() if trainer.is_world_process_zero() and finetuning_args.plot_loss: - plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "eval_accuracy"]) # Evaluation if training_args.do_eval: From 711ffd0aaf00e2986702f368462516bf9be58307 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 1 Jul 2024 03:55:20 +0800 Subject: [PATCH 161/162] tiny fix Former-commit-id: 19e43c3a9ed771e991cb273d394ab28fb923f868 --- src/llamafactory/model/model_utils/attention.py | 2 +- 1 file changed, 1 insertion(+), 1 
From 711ffd0aaf00e2986702f368462516bf9be58307 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Mon, 1 Jul 2024 03:55:20 +0800
Subject: [PATCH 161/162] tiny fix

Former-commit-id: 19e43c3a9ed771e991cb273d394ab28fb923f868
---
 src/llamafactory/model/model_utils/attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py
index 80d9d4b8..4bed7e21 100644
--- a/src/llamafactory/model/model_utils/attention.py
+++ b/src/llamafactory/model/model_utils/attention.py
@@ -35,7 +35,7 @@ def configure_attn_implementation(
         if model_args.flash_attn == "auto":
             logger.warning("Gemma-2 models should use eager attention in training, change `flash_attn` to disabled.")
             model_args.flash_attn = "disabled"
-        else:
+        elif model_args.flash_attn != "disabled":
             logger.warning(
                 "Gemma-2 models should use eager attention in training, but you set `flash_attn: {}`. "
                 "Will proceed at your own risk.".format(model_args.flash_attn)

From 14b37e1e030fe6064b2b4db98c1da1c1a183752f Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Mon, 1 Jul 2024 05:43:17 +0800
Subject: [PATCH 162/162] tiny fix

Former-commit-id: 5dd2e5c3323f56420b5845a5ed28bcd9d4da5e41
---
 src/llamafactory/train/sft/metric.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py
index 72faef0a..c69608c0 100644
--- a/src/llamafactory/train/sft/metric.py
+++ b/src/llamafactory/train/sft/metric.py
@@ -48,7 +48,7 @@ def compute_accuracy(eval_preds: "EvalPrediction") -> Dict[str, float]:
     preds, labels = eval_preds.predictions, eval_preds.label_ids
     accuracies = []
     for i in range(len(preds)):
-        pred, label = preds[i, 1:], labels[i, :-1]
+        pred, label = preds[i, :-1], labels[i, 1:]
         label_mask = label != IGNORE_INDEX
         accuracies.append(np.mean(pred[label_mask] == label[label_mask]))
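The indexing fix matters because position t of the argmax'd logits predicts token t + 1, so `preds[:-1]` must be compared against `labels[1:]` (the earlier patch had the shift reversed). A small NumPy check of the corrected alignment, with made-up token ids:

```python
import numpy as np

IGNORE_INDEX = -100

# Position t of preds holds the model's guess for the token at position t + 1.
preds = np.array([[11, 12, 13, 14]])
labels = np.array([[IGNORE_INDEX, 11, IGNORE_INDEX, 99]])

pred, label = preds[0, :-1], labels[0, 1:]  # corrected alignment
mask = label != IGNORE_INDEX                # drop unsupervised positions
print(np.mean(pred[mask] == label[mask]))   # 0.5: one of two supervised tokens matches
```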