From 82533114a74e528bbab7fc50d6f98e79504bd5ff Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 8 Jun 2024 21:11:32 +0800 Subject: [PATCH 001/160] update git workflows Former-commit-id: 5a3f26bc53433caa98b2a66294becaf156280a4c --- .github/workflows/label_issue.yml | 17 +++++++++++++++++ .github/workflows/tests.yml | 17 ++++++++--------- 2 files changed, 25 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/label_issue.yml diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml new file mode 100644 index 00000000..b9a5543c --- /dev/null +++ b/.github/workflows/label_issue.yml @@ -0,0 +1,17 @@ +name: label_issue + +on: + issues: + types: + - opened + +jobs: + label_issue: + runs-on: ubuntu-latest + + steps: + - env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ISSUE_URL: ${{ github.event.issue.html_url }} + run: | + gh issue edit $ISSUE_URL --add-label "pending" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 32edf6a8..6ddcbc05 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,14 +3,7 @@ name: tests on: push: branches: - - main - paths: - - "**.py" - - "requirements.txt" - - ".github/workflows/*.yml" - pull_request: - branches: - - main + - $default-branch paths: - "**.py" - "requirements.txt" @@ -19,21 +12,27 @@ on: jobs: tests: runs-on: ubuntu-latest + steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Python uses: actions/setup-python@v5 with: python-version: "3.8" cache: "pip" cache-dependency-path: "setup.py" + - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install .[torch,dev] + - name: Check quality run: | make style && make quality + - name: Test with pytest run: | make test From 88528f1a87ff42901a1ff8bc3c5e05b4afa190a0 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 8 Jun 2024 21:15:36 +0800 Subject: [PATCH 002/160] Update tests.yml Former-commit-id: e90f0cc30d6bb819246ccc08935c39e714c179a1 --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6ddcbc05..f3ac96db 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,7 +3,7 @@ name: tests on: push: branches: - - $default-branch + - main paths: - "**.py" - "requirements.txt" From d812249db7ed95543ba9319ed09d582685884481 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 8 Jun 2024 21:25:35 +0800 Subject: [PATCH 003/160] add pr ci Former-commit-id: 9b05bb8540b946d0c74bf804bcafc4a785d22c47 --- .github/workflows/tests.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f3ac96db..96092662 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,6 +8,15 @@ on: - "**.py" - "requirements.txt" - ".github/workflows/*.yml" + pull_request: + types: + - review_requested + branches: + - main + paths: + - "**.py" + - "requirements.txt" + - ".github/workflows/*.yml" jobs: tests: From 041ecd0de159bdf10a2b9f1e99c495592c4329c5 Mon Sep 17 00:00:00 2001 From: "-.-" Date: Sat, 8 Jun 2024 23:51:56 +0800 Subject: [PATCH 004/160] fix README Former-commit-id: fa30028c0b83c38610b596209493a748b8ca0928 --- README.md | 2 +- README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fb6c5782..4dea65b9 100644 --- a/README.md +++ b/README.md @@ 
-335,7 +335,7 @@ huggingface-cli login ```bash git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git cd LLaMA-Factory -pip install -e '.[torch,metrics]' +pip install -e ".[torch,metrics]" ``` Extra dependencies available: torch, torch_npu, metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality diff --git a/README_zh.md b/README_zh.md index 142254df..ab0e8cb7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -335,7 +335,7 @@ huggingface-cli login ```bash git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git cd LLaMA-Factory -pip install -e '.[torch,metrics]' +pip install -e ".[torch,metrics]" ``` 可选的额外依赖项:torch、torch_npu、metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality From 8db8ed5a413e65751ba5a791cba1de766ffd09d3 Mon Sep 17 00:00:00 2001 From: mMrBun <2015711377@qq.com> Date: Sun, 9 Jun 2024 18:16:15 +0800 Subject: [PATCH 005/160] Implemented the tool_formatter and tool_extractor for glm4 tool_format Former-commit-id: db7fa4490ea7f6966418d2879c895cbc1763b16d --- src/llamafactory/data/formatter.py | 42 +++++++++++++++++++++++++++++- src/llamafactory/data/template.py | 3 ++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 0cd3d6c1..344e01db 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -23,6 +23,17 @@ TOOL_SYSTEM_PROMPT = ( ) +GLM4_TOOL_SUFFIX_PROMPT = ( + "在调用上述函数时,请使用 Json 格式表示调用的参数。" +) + +GLM4_TOOL_PROMPT = ( + "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持," + "{tool_text}" + +) + + def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: tool_text = "" tool_names = [] @@ -53,6 +64,14 @@ def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: ) +def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: + tool_text = "" + for tool in tools: + tool_name = tool["name"] + tool_text += f"\n\n## {tool_name}\n\n{json.dumps(tool, ensure_ascii=False, indent=4)}\n{GLM4_TOOL_SUFFIX_PROMPT}" + return GLM4_TOOL_PROMPT.format(tool_text=tool_text) + + def default_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+).*?Action Input:\s*(.*)", re.DOTALL) action_match = re.search(regex, content) @@ -69,10 +88,24 @@ def default_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: return tool_name, json.dumps(arguments, ensure_ascii=False) +def glm4_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: + lines = content.strip().split("\n") + if len(lines) != 2: + return content + tool_name = lines[0].strip() + tool_input = lines[1].strip() + try: + arguments = json.loads(tool_input) + except json.JSONDecodeError: + return content + return tool_name, json.dumps(arguments, ensure_ascii=False) + + + @dataclass class Formatter(ABC): slots: SLOTS = field(default_factory=list) - tool_format: Optional[Literal["default"]] = None + tool_format: Optional[Literal["default", "glm4"]] = None @abstractmethod def apply(self, **kwargs) -> SLOTS: ... 
@@ -175,6 +208,11 @@ class ToolFormatter(Formatter): if self.tool_format == "default": return [default_tool_formatter(tools)] + elif self.tool_format == "glm4": + """ + '[gMASK]<|system|>\n你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n## get_current_weather\n\n{\n "name": "get_current_weather",\n "description": "Get the current weather",\n "parameters": {\n "type": "object",\n "properties": {\n "location": {\n "type": "string",\n "description": "The city and state, e.g. San Francisco, CA"\n },\n "format": {\n "type": "string",\n "enum": [\n "celsius",\n "fahrenheit"\n ],\n "description": "The temperature unit to use. Infer this from the users location."\n }\n },\n "required": [\n "location",\n "format"\n ]\n }\n}\n在调用上述函数时,请使用 Json 格式表示调用的参数。<|user|>\nWhat\'s the weather like in San Francisco, Tokyo, and Paris? use celsius<|assistant|>' + """ + return [glm4_tool_formatter(tools)] else: raise NotImplementedError except Exception: @@ -183,5 +221,7 @@ class ToolFormatter(Formatter): def extract(self, content: str) -> Union[str, Tuple[str, str]]: if self.tool_format == "default": return default_tool_extractor(content) + elif self.tool_format == "glm4": + return glm4_tool_extractor(content) else: raise NotImplementedError diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 3dce5ec6..b2aea217 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -662,9 +662,10 @@ _register_template( name="glm4", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), format_assistant=StringFormatter(slots=["\n{{content}}"]), - format_system=StringFormatter(slots=["[gMASK]{{content}}"]), + format_system=StringFormatter(slots=["[gMASK]<|system|>\n{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), + format_tools=ToolFormatter(tool_format="glm4"), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, force_system=True, From 44829df762c7973f29a1486c8bccec57665ad391 Mon Sep 17 00:00:00 2001 From: mMrBun <2015711377@qq.com> Date: Sun, 9 Jun 2024 18:25:22 +0800 Subject: [PATCH 006/160] Removed unnecessary comments. Former-commit-id: 2b81252aa693871098931cd7873ef83ef4922ba5 --- src/llamafactory/data/formatter.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 344e01db..9f58915b 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -209,9 +209,6 @@ class ToolFormatter(Formatter): if self.tool_format == "default": return [default_tool_formatter(tools)] elif self.tool_format == "glm4": - """ - '[gMASK]<|system|>\n你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n## get_current_weather\n\n{\n "name": "get_current_weather",\n "description": "Get the current weather",\n "parameters": {\n "type": "object",\n "properties": {\n "location": {\n "type": "string",\n "description": "The city and state, e.g. San Francisco, CA"\n },\n "format": {\n "type": "string",\n "enum": [\n "celsius",\n "fahrenheit"\n ],\n "description": "The temperature unit to use. Infer this from the users location."\n }\n },\n "required": [\n "location",\n "format"\n ]\n }\n}\n在调用上述函数时,请使用 Json 格式表示调用的参数。<|user|>\nWhat\'s the weather like in San Francisco, Tokyo, and Paris? 
use celsius<|assistant|>' - """ return [glm4_tool_formatter(tools)] else: raise NotImplementedError From bc04ca464ac64562871060aa4cd3c2c17dade905 Mon Sep 17 00:00:00 2001 From: mMrBun <2015711377@qq.com> Date: Mon, 10 Jun 2024 02:00:14 +0800 Subject: [PATCH 007/160] Optimize the handling of QWEN2 in scenarios involving multiple tool calls. Former-commit-id: 48f870edc96ada40360f7e6e67cbf58805295b33 --- src/llamafactory/api/chat.py | 13 +++++++----- src/llamafactory/data/formatter.py | 34 ++++++++++++++++++------------ 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/llamafactory/api/chat.py b/src/llamafactory/api/chat.py index 98957bc1..d4db1eea 100644 --- a/src/llamafactory/api/chat.py +++ b/src/llamafactory/api/chat.py @@ -150,11 +150,14 @@ async def create_chat_completion_response( else: result = response.response_text - if isinstance(result, tuple): - name, arguments = result - function = Function(name=name, arguments=arguments) - tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function) - response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=[tool_call]) + if isinstance(result, list): + tool_calls = [] + for tool in result: + name, arguments = tool + function = Function(name=name, arguments=arguments) + tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function) + tool_calls.append(tool_call) + response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=tool_calls) finish_reason = Finish.TOOL else: response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result) diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 9f58915b..1d917887 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -72,23 +72,29 @@ def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: return GLM4_TOOL_PROMPT.format(tool_text=tool_text) -def default_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: - regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+).*?Action Input:\s*(.*)", re.DOTALL) - action_match = re.search(regex, content) +def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: + regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*({.*?})(?=\nAction:|\Z)", re.DOTALL) + action_match = re.findall(regex, content) if not action_match: return content - tool_name = action_match.group(1).strip() - tool_input = action_match.group(2).strip().strip('"').strip("```") - try: - arguments = json.loads(tool_input) - except json.JSONDecodeError: - return content + results = [] + + for match in action_match: + tool_name, tool_input = match + tool_name = tool_name.strip() + tool_input = tool_input.strip().strip('"').strip("```") - return tool_name, json.dumps(arguments, ensure_ascii=False) + try: + arguments = json.loads(tool_input) + results.append((tool_name, json.dumps(arguments, ensure_ascii=False))) + except json.JSONDecodeError: + return content + + return results -def glm4_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: +def glm4_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: lines = content.strip().split("\n") if len(lines) != 2: return content @@ -98,7 +104,7 @@ def glm4_tool_extractor(content: str) -> Union[str, Tuple[str, str]]: arguments = json.loads(tool_input) except json.JSONDecodeError: return content - return tool_name, json.dumps(arguments, ensure_ascii=False) + return [(tool_name, json.dumps(arguments, ensure_ascii=False))] @@ -110,7 +116,7 @@ 
class Formatter(ABC): @abstractmethod def apply(self, **kwargs) -> SLOTS: ... - def extract(self, content: str) -> Union[str, Tuple[str, str]]: + def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]: raise NotImplementedError @@ -215,7 +221,7 @@ class ToolFormatter(Formatter): except Exception: return [""] - def extract(self, content: str) -> Union[str, Tuple[str, str]]: + def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]: if self.tool_format == "default": return default_tool_extractor(content) elif self.tool_format == "glm4": From 784088db3fb6c599368d69ba97a232ba00d07c1f Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 10 Jun 2024 21:24:15 +0800 Subject: [PATCH 008/160] fix #2666 Former-commit-id: f121d5c4f94af9f165132c4309cb9bdc8217d985 --- src/llamafactory/model/adapter.py | 2 +- tests/data/test_supervised.py | 32 ++++++++++++++--------- tests/model/model_utils/test_attention.py | 15 +++++------ tests/model/test_freeze.py | 19 ++++---------- tests/model/test_full.py | 8 +++--- tests/model/test_lora.py | 19 ++++---------- 6 files changed, 41 insertions(+), 54 deletions(-) diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index f4e501a7..34b9eda6 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -209,6 +209,7 @@ def _setup_lora_tuning( "lora_alpha": finetuning_args.lora_alpha, "lora_dropout": finetuning_args.lora_dropout, "use_rslora": finetuning_args.use_rslora, + "use_dora": finetuning_args.use_dora, "modules_to_save": finetuning_args.additional_target, } @@ -218,7 +219,6 @@ def _setup_lora_tuning( lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=False, - use_dora=finetuning_args.use_dora, **peft_kwargs, ) model = get_peft_model(model, lora_config) diff --git a/tests/data/test_supervised.py b/tests/data/test_supervised.py index bb7f71df..63a3453f 100644 --- a/tests/data/test_supervised.py +++ b/tests/data/test_supervised.py @@ -1,4 +1,5 @@ import os +import random import pytest from datasets import load_dataset @@ -8,17 +9,17 @@ from llamafactory.hparams import get_train_args from llamafactory.model import load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") -TRAINING_ARGS = { +TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", "do_train": True, "finetuning_type": "full", - "dataset": "llamafactory/tiny_dataset", + "dataset": "llamafactory/tiny-supervised-dataset", "dataset_dir": "ONLINE", "template": "llama3", - "cutoff_len": 1024, + "cutoff_len": 8192, "overwrite_cache": True, "output_dir": "dummy_dir", "overwrite_output_dir": True, @@ -26,19 +27,24 @@ TRAINING_ARGS = { } -@pytest.mark.parametrize("test_num", [5]) -def test_supervised(test_num: int): - model_args, data_args, training_args, _, _ = get_train_args(TRAINING_ARGS) +@pytest.mark.parametrize("num_samples", [10]) +def test_supervised(num_samples: int): + model_args, data_args, training_args, _, _ = get_train_args(TRAIN_ARGS) tokenizer_module = load_tokenizer(model_args) tokenizer = tokenizer_module["tokenizer"] tokenized_data = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module) - original_data = load_dataset(TRAINING_ARGS["dataset"], split="train") - for test_idx in range(test_num): - decode_result = tokenizer.decode(tokenized_data["input_ids"][test_idx]) + original_data = 
load_dataset(TRAIN_ARGS["dataset"], split="train") + indexes = random.choices(range(len(original_data)), k=num_samples) + for index in indexes: + decoded_result = tokenizer.decode(tokenized_data["input_ids"][index]) + prompt = original_data[index]["instruction"] + if original_data[index]["input"]: + prompt += "\n" + original_data[index]["input"] + messages = [ - {"role": "user", "content": original_data[test_idx]["instruction"]}, - {"role": "assistant", "content": original_data[test_idx]["output"]}, + {"role": "user", "content": prompt}, + {"role": "assistant", "content": original_data[index]["output"]}, ] templated_result = tokenizer.apply_chat_template(messages, tokenize=False) - assert decode_result == templated_result + assert decoded_result == templated_result diff --git a/tests/model/model_utils/test_attention.py b/tests/model/model_utils/test_attention.py index 4d414289..751adda4 100644 --- a/tests/model/model_utils/test_attention.py +++ b/tests/model/model_utils/test_attention.py @@ -6,7 +6,12 @@ from llamafactory.hparams import get_infer_args from llamafactory.model import load_model, load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") + +INFER_ARGS = { + "model_name_or_path": TINY_LLAMA, + "template": "llama3", +} def test_attention(): @@ -23,13 +28,7 @@ def test_attention(): "fa2": "LlamaFlashAttention2", } for requested_attention in attention_available: - model_args, _, finetuning_args, _ = get_infer_args( - { - "model_name_or_path": TINY_LLAMA, - "template": "llama2", - "flash_attn": requested_attention, - } - ) + model_args, _, finetuning_args, _ = get_infer_args({"flash_attn": requested_attention, **INFER_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args) for module in model.modules(): diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py index c6cdec78..97800696 100644 --- a/tests/model/test_freeze.py +++ b/tests/model/test_freeze.py @@ -6,14 +6,14 @@ from llamafactory.hparams import get_train_args from llamafactory.model import load_model, load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") -TRAINING_ARGS = { +TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", "do_train": True, "finetuning_type": "freeze", - "dataset": "llamafactory/tiny_dataset", + "dataset": "llamafactory/tiny-supervised-dataset", "dataset_dir": "ONLINE", "template": "llama3", "cutoff_len": 1024, @@ -25,12 +25,7 @@ TRAINING_ARGS = { def test_freeze_all_modules(): - model_args, _, _, finetuning_args, _ = get_train_args( - { - "freeze_trainable_layers": 1, - **TRAINING_ARGS, - } - ) + model_args, _, _, finetuning_args, _ = get_train_args({"freeze_trainable_layers": 1, **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) for name, param in model.named_parameters(): @@ -44,11 +39,7 @@ def test_freeze_all_modules(): def test_freeze_extra_modules(): model_args, _, _, finetuning_args, _ = get_train_args( - { - "freeze_trainable_layers": 1, - "freeze_extra_modules": "embed_tokens,lm_head", - **TRAINING_ARGS, - } + {"freeze_trainable_layers": 1, "freeze_extra_modules": "embed_tokens,lm_head", **TRAIN_ARGS} ) tokenizer_module = 
load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) diff --git a/tests/model/test_full.py b/tests/model/test_full.py index ef57a980..6cb78f37 100644 --- a/tests/model/test_full.py +++ b/tests/model/test_full.py @@ -6,14 +6,14 @@ from llamafactory.hparams import get_train_args from llamafactory.model import load_model, load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") -TRAINING_ARGS = { +TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", "do_train": True, "finetuning_type": "full", - "dataset": "llamafactory/tiny_dataset", + "dataset": "llamafactory/tiny-supervised-dataset", "dataset_dir": "ONLINE", "template": "llama3", "cutoff_len": 1024, @@ -25,7 +25,7 @@ TRAINING_ARGS = { def test_full(): - model_args, _, _, finetuning_args, _ = get_train_args(TRAINING_ARGS) + model_args, _, _, finetuning_args, _ = get_train_args(TRAIN_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) for param in model.parameters(): diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index 1f2c02ae..2e2b89d9 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -6,14 +6,14 @@ from llamafactory.hparams import get_train_args from llamafactory.model import load_model, load_tokenizer -TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM") +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") -TRAINING_ARGS = { +TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", "do_train": True, "finetuning_type": "lora", - "dataset": "llamafactory/tiny_dataset", + "dataset": "llamafactory/tiny-supervised-dataset", "dataset_dir": "ONLINE", "template": "llama3", "cutoff_len": 1024, @@ -25,12 +25,7 @@ TRAINING_ARGS = { def test_lora_all_modules(): - model_args, _, _, finetuning_args, _ = get_train_args( - { - "lora_target": "all", - **TRAINING_ARGS, - } - ) + model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "all", **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) linear_modules = set() @@ -48,11 +43,7 @@ def test_lora_all_modules(): def test_lora_extra_modules(): model_args, _, _, finetuning_args, _ = get_train_args( - { - "lora_target": "all", - "additional_target": "embed_tokens,lm_head", - **TRAINING_ARGS, - } + {"lora_target": "all", "additional_target": "embed_tokens,lm_head", **TRAIN_ARGS} ) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) From 4b2b92fd9aecc6e6f40c44d212f5889d9f692446 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 10 Jun 2024 23:56:00 +0800 Subject: [PATCH 009/160] update evaluator Former-commit-id: bb8661e62481ff7027b8969f3d8a6a17290c9da3 --- src/llamafactory/eval/evaluator.py | 4 +- src/llamafactory/eval/template.py | 9 ++-- tests/eval/test_eval_template.py | 77 ++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 9 deletions(-) create mode 100644 tests/eval/test_eval_template.py diff --git a/src/llamafactory/eval/evaluator.py b/src/llamafactory/eval/evaluator.py index 192f4815..5c6fb104 100644 --- a/src/llamafactory/eval/evaluator.py +++ 
b/src/llamafactory/eval/evaluator.py @@ -26,9 +26,7 @@ class Evaluator: self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template) self.model = load_model(self.tokenizer, self.model_args, finetuning_args) self.eval_template = get_eval_template(self.eval_args.lang) - self.choice_inputs = [ - self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES - ] + self.choice_inputs = [self.tokenizer.encode(ch, add_special_tokens=False)[-1] for ch in CHOICES] @torch.inference_mode() def batch_inference(self, batch_input: Dict[str, torch.Tensor]) -> List[str]: diff --git a/src/llamafactory/eval/template.py b/src/llamafactory/eval/template.py index a4a6ef0e..2cbb5aaf 100644 --- a/src/llamafactory/eval/template.py +++ b/src/llamafactory/eval/template.py @@ -10,7 +10,6 @@ class EvalTemplate: system: str choice: str answer: str - prefix: str def _parse_example(self, example: Dict[str, str]) -> Tuple[str, str]: r""" @@ -42,8 +41,8 @@ class EvalTemplate: eval_templates: Dict[str, "EvalTemplate"] = {} -def _register_eval_template(name: str, system: str, choice: str, answer: str, prefix: str) -> None: - eval_templates[name] = EvalTemplate(system=system, choice=choice, answer=answer, prefix=prefix) +def _register_eval_template(name: str, system: str, choice: str, answer: str) -> None: + eval_templates[name] = EvalTemplate(system=system, choice=choice, answer=answer) def get_eval_template(name: str) -> "EvalTemplate": @@ -56,8 +55,7 @@ _register_eval_template( name="en", system="The following are multiple choice questions (with answers) about {subject}.\n\n", choice="\n{choice}. {content}", - answer="\nAnswer: ", - prefix=" ", + answer="\nAnswer:", ) @@ -66,5 +64,4 @@ _register_eval_template( system="以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n", choice="\n{choice}. {content}", answer="\n答案:", - prefix=" ", ) diff --git a/tests/eval/test_eval_template.py b/tests/eval/test_eval_template.py new file mode 100644 index 00000000..f6a91a67 --- /dev/null +++ b/tests/eval/test_eval_template.py @@ -0,0 +1,77 @@ +from llamafactory.eval.template import get_eval_template + + +def test_eval_template_en(): + support_set = [ + { + "question": "Fewshot question", + "A": "Fewshot1", + "B": "Fewshot2", + "C": "Fewshot3", + "D": "Fewshot4", + "answer": "B", + } + ] + example = { + "question": "Target question", + "A": "Target1", + "B": "Target2", + "C": "Target3", + "D": "Target4", + "answer": "C", + } + template = get_eval_template(name="en") + messages = template.format_example(example, support_set=support_set, subject_name="SubName") + assert messages == [ + { + "role": "user", + "content": ( + "The following are multiple choice questions (with answers) about SubName.\n\n" + "Fewshot question\nA. Fewshot1\nB. Fewshot2\nC. Fewshot3\nD. Fewshot4\nAnswer:" + ), + }, + {"role": "assistant", "content": "B"}, + { + "role": "user", + "content": "Target question\nA. Target1\nB. Target2\nC. Target3\nD. 
Target4\nAnswer:", + }, + {"role": "assistant", "content": "C"}, + ] + + +def test_eval_template_zh(): + support_set = [ + { + "question": "示例问题", + "A": "示例答案1", + "B": "示例答案2", + "C": "示例答案3", + "D": "示例答案4", + "answer": "B", + } + ] + example = { + "question": "目标问题", + "A": "目标答案1", + "B": "目标答案2", + "C": "目标答案3", + "D": "目标答案4", + "answer": "C", + } + template = get_eval_template(name="zh") + messages = template.format_example(example, support_set=support_set, subject_name="主题") + assert messages == [ + { + "role": "user", + "content": ( + "以下是中国关于主题考试的单项选择题,请选出其中的正确答案。\n\n" + "示例问题\nA. 示例答案1\nB. 示例答案2\nC. 示例答案3\nD. 示例答案4\n答案:" + ), + }, + {"role": "assistant", "content": "B"}, + { + "role": "user", + "content": "目标问题\nA. 目标答案1\nB. 目标答案2\nC. 目标答案3\nD. 目标答案4\n答案:", + }, + {"role": "assistant", "content": "C"}, + ] From ea2ca2777fd81dfb29e37f51adfd16cc484636dd Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 00:19:17 +0800 Subject: [PATCH 010/160] fix #4145 Fix the docker image Former-commit-id: a9838281156fe870bfcde5d1f7afc15264fd4aad --- Dockerfile | 38 ++++++++++++++++++++++++++++++++++---- README.md | 36 ++++++++++++++++++------------------ README_zh.md | 34 ++++++++++++++++++---------------- docker-compose.yml | 10 ++++++++-- 4 files changed, 78 insertions(+), 40 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0a35e355..45849601 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,44 @@ -FROM nvcr.io/nvidia/pytorch:24.01-py3 +# Use the NVIDIA official image with PyTorch 2.3.0 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html +FROM nvcr.io/nvidia/pytorch:24.02-py3 +# Define installation arguments +ARG INSTALL_BNB=false +ARG INSTALL_VLLM=false +ARG INSTALL_DEEPSPEED=false +ARG PIP_INDEX=https://pypi.org/simple + +# Set the working directory WORKDIR /app +# Install the requirements COPY requirements.txt /app/ -RUN pip install -r requirements.txt +RUN pip config set global.index-url $PIP_INDEX +RUN python -m pip install --upgrade pip +RUN python -m pip install -r requirements.txt +# Copy the rest of the application into the image COPY . /app/ -RUN pip install -e .[metrics,bitsandbytes,qwen] +# Install the LLaMA Factory +RUN EXTRA_PACKAGES="metrics"; \ + if [ "$INSTALL_BNB" = "true" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \ + fi; \ + if [ "$INSTALL_VLLM" = "true" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \ + fi; \ + if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ + fi; \ + pip install -e .[$EXTRA_PACKAGES] && \ + pip uninstall -y transformer-engine + +# Set up volumes VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] + +# Expose port 7860 for the LLaMA Board EXPOSE 7860 -CMD [ "llamafactory-cli", "webui" ] +# Expose port 8000 for the API service +EXPOSE 8000 diff --git a/README.md b/README.md index 4dea65b9..35dacd2e 100644 --- a/README.md +++ b/README.md @@ -405,9 +405,9 @@ Please refer to [data/README.md](data/README.md) for checking the details about Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Llama3-8B-Instruct model, respectively. 
```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` See [examples/README.md](examples/README.md) for advanced usage (including distributed training). @@ -417,33 +417,33 @@ See [examples/README.md](examples/README.md) for advanced usage (including distr ### Fine-Tuning with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio)) -#### Use local environment - ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui +llamafactory-cli webui ``` - - -#### Use Docker +### Build Docker ```bash -docker build -f ./Dockerfile -t llama-factory:latest . -docker run --gpus=all \ +docker build -f ./Dockerfile \ + --build-arg INSTALL_BNB=false \ + --build-arg INSTALL_VLLM=false \ + --build-arg INSTALL_DEEPSPEED=false \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + -t llamafactory:latest . + +docker run -it --gpus=all \ -v ./hf_cache:/root/.cache/huggingface/ \ -v ./data:/app/data \ -v ./output:/app/output \ -p 7860:7860 \ + -p 8000:8000 \ --shm-size 16G \ - --name llama_factory \ - -d llama-factory:latest + --name llamafactory \ + llamafactory:latest ``` -#### Use Docker Compose - -```bash -docker compose -f ./docker-compose.yml up -d -``` +> [!TIP] +> Use Docker Compose to build image via `docker compose up -d`.
Details about volume diff --git a/README_zh.md b/README_zh.md index ab0e8cb7..0ddb8b19 100644 --- a/README_zh.md +++ b/README_zh.md @@ -405,9 +405,9 @@ Docker 镜像: 下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` 高级用法请参考 [examples/README_zh.md](examples/README_zh.md)(包括多 GPU 微调)。 @@ -417,31 +417,33 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_s ### LLaMA Board 可视化微调(由 [Gradio](https://github.com/gradio-app/gradio) 驱动) -#### 使用本地环境 - ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui +llamafactory-cli webui ``` -#### 使用 Docker +### 构建 Docker ```bash -docker build -f ./Dockerfile -t llama-factory:latest . -docker run --gpus=all \ +docker build -f ./Dockerfile \ + --build-arg INSTALL_BNB=false \ + --build-arg INSTALL_VLLM=false \ + --build-arg INSTALL_DEEPSPEED=false \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + -t llamafactory:latest . + +docker run -it --gpus=all \ -v ./hf_cache:/root/.cache/huggingface/ \ -v ./data:/app/data \ -v ./output:/app/output \ -p 7860:7860 \ + -p 8000:8000 \ --shm-size 16G \ - --name llama_factory \ - -d llama-factory:latest + --name llamafactory \ + llamafactory:latest ``` -#### 使用 Docker Compose - -```bash -docker compose -f ./docker-compose.yml up -d -``` +> [!TIP] +> 通过 `docker compose up -d` 使用 Docker Compose 构建镜像。
数据卷详情 diff --git a/docker-compose.yml b/docker-compose.yml index 9602a3e3..b3e4a34d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,17 +1,23 @@ version: '3.8' services: - llama-factory: + llamafactory: build: dockerfile: Dockerfile context: . - container_name: llama_factory + args: + INSTALL_BNB: false + INSTALL_VLLM: false + INSTALL_DEEPSPEED: false + PIP_INDEX: https://pypi.org/simple + container_name: llamafactory volumes: - ./hf_cache:/root/.cache/huggingface/ - ./data:/app/data - ./output:/app/output ports: - "7860:7860" + - "8000:8000" ipc: host deploy: resources: From 4d7dd0330db1e4e1ac962d5cf973e7d995b35fcb Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 00:37:17 +0800 Subject: [PATCH 011/160] fix #4160 The split heads should be concatenated in dim=2 Former-commit-id: 4b3f247f270d44df9fe226cfe0dabfb7fcd2deda --- src/llamafactory/model/model_utils/longlora.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py index c8dc52f5..cd468979 100644 --- a/src/llamafactory/model/model_utils/longlora.py +++ b/src/llamafactory/model/model_utils/longlora.py @@ -96,7 +96,8 @@ def llama_attention_forward( ( attn_output[:, :, : self.num_heads // 2], attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), - ) + ), + dim=2, ) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) @@ -194,7 +195,8 @@ def llama_flash_attention_2_forward( ( attn_output[:, :, : self.num_heads // 2], attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), - ) + ), + dim=2, ) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() @@ -293,7 +295,8 @@ def llama_sdpa_attention_forward( ( attn_output[:, :, : self.num_heads // 2], attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), - ) + ), + dim=2, ) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) @@ -303,7 +306,7 @@ def llama_sdpa_attention_forward( def _apply_llama_patch() -> None: - require_version("transformers==4.40.2", "To fix: pip install transformers==4.40.2") + require_version("transformers==4.41.2", "To fix: pip install transformers==4.41.2") LlamaAttention.forward = llama_attention_forward LlamaFlashAttention2.forward = llama_flash_attention_2_forward LlamaSdpaAttention.forward = llama_sdpa_attention_forward From 0f1e59232616c5273e08a2150f5348dc21512c8e Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 00:44:26 +0800 Subject: [PATCH 012/160] release v0.8.1 Former-commit-id: 875a34f492701d1c644facbe9ede411af2931513 --- src/llamafactory/extras/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index 1d4e43f1..8c34fd96 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -9,7 +9,7 @@ import trl from transformers.utils import is_torch_cuda_available, is_torch_npu_available -VERSION = "0.8.1.dev0" +VERSION = "0.8.1" def print_env() -> None: From f330b736825cec3d2e570c23305ba190ec303dc9 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 00:50:53 +0800 Subject: [PATCH 013/160] set dev version Former-commit-id: 16c47cc15226119e33e46ba0f2f6ccb37072257f --- src/llamafactory/extras/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index 8c34fd96..a8cb799d 
100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -9,7 +9,7 @@ import trl from transformers.utils import is_torch_cuda_available, is_torch_npu_available -VERSION = "0.8.1" +VERSION = "0.8.2.dev0" def print_env() -> None: From 2723438531fd993aafb8c83bba784425e1f574a0 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 01:04:16 +0800 Subject: [PATCH 014/160] tiny fix Former-commit-id: b5e9711ef375cc323fc083e742cccfc974550416 --- src/llamafactory/model/model_utils/longlora.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py index cd468979..4a8c562a 100644 --- a/src/llamafactory/model/model_utils/longlora.py +++ b/src/llamafactory/model/model_utils/longlora.py @@ -182,11 +182,9 @@ def llama_flash_attention_2_forward( query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states) if attention_mask is not None: attention_mask = attention_mask[:, :groupsz].repeat(num_groups, 1) - else: - groupsz = q_len attn_output: torch.Tensor = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, groupsz, dropout=dropout_rate + query_states, key_states, value_states, attention_mask, query_states.size(1), dropout=dropout_rate ) if getattr(self.config, "group_size_ratio", None) and self.training: # shift back From fce11bb38606cb760c297956c5f9b61c8686b7ff Mon Sep 17 00:00:00 2001 From: Alfredo Luque Date: Tue, 11 Jun 2024 00:07:06 +0000 Subject: [PATCH 015/160] add manifest so requirements.txt in sdist Former-commit-id: b501a3c56c51786c3006a2aca15a145641a4556c --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..82c51f63 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include LICENSE requirements.txt From 27aece94cf24d07af0112833dbabeb33eb73a367 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 12:48:53 +0800 Subject: [PATCH 016/160] tiny fix Former-commit-id: c4b2e263d9cefbad0fbc5de72422e4ef8edbcb54 --- src/llamafactory/hparams/parser.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index ff1fbf5d..ec5dd62c 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -171,9 +171,6 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if training_args.do_train and model_args.quantization_device_map == "auto": raise ValueError("Cannot use device map for quantized models in training.") - if finetuning_args.use_dora and model_args.use_unsloth: - raise ValueError("Unsloth does not support DoRA.") - if finetuning_args.pure_bf16: if not is_torch_bf16_gpu_available(): raise ValueError("This device does not support `pure_bf16`.") From 820b6e7b327d035724ed9b422477731d0b75c967 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 15:38:38 +0800 Subject: [PATCH 017/160] fix #4198 Former-commit-id: 945d2c6cc73542adf9272ebd9aa332ea2c1c7361 --- src/llamafactory/hparams/model_args.py | 12 ++++++++++ src/llamafactory/model/patcher.py | 2 +- src/llamafactory/train/trainer_utils.py | 32 +++++++++++-------------- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 6352a420..71467770 100644 --- 
a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -1,6 +1,8 @@ from dataclasses import asdict, dataclass, field from typing import Any, Dict, Literal, Optional +from typing_extensions import Self + @dataclass class ModelArguments: @@ -216,3 +218,13 @@ class ModelArguments: def to_dict(self) -> Dict[str, Any]: return asdict(self) + + @classmethod + def copyfrom(cls, old_arg: Self, **kwargs) -> Self: + arg_dict = old_arg.to_dict() + arg_dict.update(**kwargs) + new_arg = cls(**arg_dict) + new_arg.compute_dtype = old_arg.compute_dtype + new_arg.device_map = old_arg.device_map + new_arg.model_max_length = old_arg.model_max_length + return new_arg diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 87c92315..18221a10 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -79,7 +79,7 @@ def patch_config( if "device_map" not in init_kwargs and model_args.device_map: init_kwargs["device_map"] = model_args.device_map - if init_kwargs["device_map"] == "auto": + if init_kwargs.get("device_map", None) == "auto": init_kwargs["offload_folder"] = model_args.offload_folder diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 0ddcdb11..7e9cc881 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -83,15 +83,12 @@ def create_ref_model( The valuehead parameter is randomly initialized since it is useless for PPO training. """ if finetuning_args.ref_model is not None: - ref_model_args_dict = model_args.to_dict() - ref_model_args_dict.update( - dict( - model_name_or_path=finetuning_args.ref_model, - adapter_name_or_path=finetuning_args.ref_model_adapters, - quantization_bit=finetuning_args.ref_model_quantization_bit, - ) + ref_model_args = ModelArguments.copyfrom( + model_args, + model_name_or_path=finetuning_args.ref_model, + adapter_name_or_path=finetuning_args.ref_model_adapters, + quantization_bit=finetuning_args.ref_model_quantization_bit, ) - ref_model_args = ModelArguments(**ref_model_args_dict) ref_finetuning_args = FinetuningArguments() tokenizer = load_tokenizer(ref_model_args)["tokenizer"] ref_model = load_model( @@ -102,9 +99,11 @@ def create_ref_model( if finetuning_args.finetuning_type == "lora": ref_model = None else: - tokenizer = load_tokenizer(model_args)["tokenizer"] + ref_model_args = ModelArguments.copyfrom(model_args) + ref_finetuning_args = FinetuningArguments() + tokenizer = load_tokenizer(ref_model_args)["tokenizer"] ref_model = load_model( - tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead + tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead ) logger.info("Created reference model from the model itself.") @@ -139,15 +138,12 @@ def create_reward_model( logger.info("Loaded adapter weights of reward model from {}".format(finetuning_args.reward_model)) return None else: - reward_model_args_dict = model_args.to_dict() - reward_model_args_dict.update( - dict( - model_name_or_path=finetuning_args.reward_model, - adapter_name_or_path=finetuning_args.reward_model_adapters, - quantization_bit=finetuning_args.reward_model_quantization_bit, - ) + reward_model_args = ModelArguments.copyfrom( + model_args, + model_name_or_path=finetuning_args.reward_model, + adapter_name_or_path=finetuning_args.reward_model_adapters, + quantization_bit=finetuning_args.reward_model_quantization_bit, ) - reward_model_args = 
ModelArguments(**reward_model_args_dict) reward_finetuning_args = FinetuningArguments() tokenizer = load_tokenizer(reward_model_args)["tokenizer"] reward_model = load_model( From f14f67f8036fe52e3f5ff4c08f085b7eb3a20f2a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 15:40:21 +0800 Subject: [PATCH 018/160] Update bug-report.yml Former-commit-id: bb022cd867ebf2593e40fc6ba43b768603b129a3 --- .github/ISSUE_TEMPLATE/bug-report.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 1d962200..768adea6 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -38,7 +38,9 @@ body: 请合理使用 Markdown 标签来格式化您的文本。 placeholder: | + ```bash llamafactory-cli train ... + ``` - type: textarea id: expected-behavior From 77bf3d66c7537a4c423f63b24c67b3e8b653ea60 Mon Sep 17 00:00:00 2001 From: d <913015993@qq.com> Date: Tue, 11 Jun 2024 16:21:48 +0800 Subject: After extensive continued pre-training and comparison experiments, found this bug: during pre-training Llama3 uses '<|end_of_text|>' as tokenizer.eos_token, so each sample must also be terminated with it here rather than '<|eot_id|>', otherwise it easily causes a severe performance drop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: ef470561f742b16eaa0f99c4cadecd7c84ce6bd2 --- src/llamafactory/data/processors/pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index 87727b55..4050f74c 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -12,7 +12,8 @@ def preprocess_pretrain_dataset( examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled - text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]] + eos_token = '<|end_of_text|>' if data_args.template == 'llama3' else tokenizer.eos_token + text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]] if not data_args.packing: if data_args.template == "gemma": From 08f2f99f4bc527223bf09840267bf4fdecf78cd1 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 11 Jun 2024 16:52:36 +0800 Subject: [PATCH 020/160] fix deepspeed version Former-commit-id: 938a69bb07d4de7d82928ff01c582032162c1480 --- src/llamafactory/model/model_utils/moe.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/llamafactory/model/model_utils/moe.py b/src/llamafactory/model/model_utils/moe.py index e554e45a..8a73c844 100644 --- a/src/llamafactory/model/model_utils/moe.py +++ b/src/llamafactory/model/model_utils/moe.py @@ -1,5 +1,6 @@ -from typing import 
TYPE_CHECKING +from typing import TYPE_CHECKING, Sequence +import torch from transformers.integrations import is_deepspeed_zero3_enabled from transformers.utils.versions import require_version @@ -10,6 +11,13 @@ if TYPE_CHECKING: from ...hparams import ModelArguments +def _set_z3_leaf_modules(model: "PreTrainedModel", leaf_modules: Sequence["torch.nn.Module"]) -> None: + require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") + from deepspeed.utils import set_z3_leaf_modules # type: ignore + + set_z3_leaf_modules(model, leaf_modules) + + def add_z3_leaf_module(model: "PreTrainedModel") -> None: r""" Sets module as a leaf module to skip partitioning in deepspeed zero3. @@ -17,33 +25,30 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None: if not is_deepspeed_zero3_enabled(): return - require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") - from deepspeed.utils import set_z3_leaf_modules # type: ignore - if getattr(model.config, "model_type", None) == "dbrx": from transformers.models.dbrx.modeling_dbrx import DbrxFFN - set_z3_leaf_modules(model, [DbrxFFN]) + _set_z3_leaf_modules(model, [DbrxFFN]) if getattr(model.config, "model_type", None) == "jamba": from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock - set_z3_leaf_modules(model, [JambaSparseMoeBlock]) + _set_z3_leaf_modules(model, [JambaSparseMoeBlock]) if getattr(model.config, "model_type", None) == "jetmoe": from transformers.models.jetmoe.modeling_jetmoe import JetMoeMoA, JetMoeMoE - set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE]) + _set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE]) if getattr(model.config, "model_type", None) == "mixtral": from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock - set_z3_leaf_modules(model, [MixtralSparseMoeBlock]) + _set_z3_leaf_modules(model, [MixtralSparseMoeBlock]) if getattr(model.config, "model_type", None) == "qwen2moe": from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock - set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock]) + _set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock]) def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None: From cc9717e2f2462a11cb28a7c09088ae0ae126d899 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 11 Jun 2024 17:02:14 +0800 Subject: [PATCH 021/160] Update pretrain.py Former-commit-id: e2317b2a84149e39fddfd6366be3de23dfb71f82 --- src/llamafactory/data/processors/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index 4050f74c..832c987e 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -12,7 +12,7 @@ def preprocess_pretrain_dataset( examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled - eos_token = '<|end_of_text|>' if data_args.template == 'llama3' else tokenizer.eos_token + eos_token = "<|end_of_text|>" if data_args.template == "llama3" else tokenizer.eos_token text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]] if not data_args.packing: From 16c7c923969bb2871909a1bc629b49f16af7bc68 Mon Sep 17 00:00:00 2001 From: Arthur Kim Date: Wed, 12 Jun 2024 16:49:12 +0900 Subject: [PATCH 022/160] Support vllm==0.5.0 Former-commit-id: 
e7a8ffd7af21bc3759f055033ba2209fa7a1be0e --- src/llamafactory/chat/vllm_engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index 87ce8684..d096f6eb 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -13,7 +13,10 @@ from .base_engine import BaseEngine, Response if is_vllm_available(): from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest - from vllm.sequence import MultiModalData + try: + from vllm.multimodal import MultiModalData # vllm==0.5.0 + except ImportError: + from vllm.sequence import MultiModalData # vllm<0.5.0 if TYPE_CHECKING: From 6392d45ea766727e295bf20b443a31161cc3dc63 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 12 Jun 2024 16:50:11 +0800 Subject: [PATCH 023/160] fix #4242 Former-commit-id: cf260e7af03f49aa5e3d6daf3b27738ff9b9bcb8 --- Dockerfile | 2 +- src/llamafactory/chat/vllm_engine.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 45849601..3932ff30 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,7 +32,7 @@ RUN EXTRA_PACKAGES="metrics"; \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ pip install -e .[$EXTRA_PACKAGES] && \ - pip uninstall -y transformer-engine + pip uninstall -y transformer-engine flash-attn # Set up volumes VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index d096f6eb..d2850a6e 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -13,10 +13,11 @@ from .base_engine import BaseEngine, Response if is_vllm_available(): from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest + try: - from vllm.multimodal import MultiModalData # vllm==0.5.0 + from vllm.multimodal import MultiModalData # type: ignore (for vllm>=0.5.0) except ImportError: - from vllm.sequence import MultiModalData # vllm<0.5.0 + from vllm.sequence import MultiModalData # for vllm<0.5.0 if TYPE_CHECKING: From fe2c7eaa930c69773b34d99d289ac39ebd739343 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 12 Jun 2024 17:39:12 +0800 Subject: [PATCH 024/160] update readme Former-commit-id: a436aaa83f0cf12c8f404459e5486f9369d538ec --- README.md | 2 +- README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 35dacd2e..a773a5f4 100644 --- a/README.md +++ b/README.md @@ -443,7 +443,7 @@ docker run -it --gpus=all \ ``` > [!TIP] -> Use Docker Compose to build image via `docker compose up -d`. +> Use Docker Compose to build image via `docker-compose up -d`.
Details about volume diff --git a/README_zh.md b/README_zh.md index 0ddb8b19..7a9cb159 100644 --- a/README_zh.md +++ b/README_zh.md @@ -443,7 +443,7 @@ docker run -it --gpus=all \ ``` > [!TIP] -> 通过 `docker compose up -d` 使用 Docker Compose 构建镜像。 +> 通过 `docker-compose up -d` 使用 Docker Compose 构建镜像。
数据卷详情 From 799873aa14cd5809a643e896908dea129aa600fd Mon Sep 17 00:00:00 2001 From: hzhaoy Date: Wed, 12 Jun 2024 18:29:03 +0800 Subject: [PATCH 025/160] adapt vllm==0.5.0 Former-commit-id: 02afd9ff64f23e6707ac739ae1269f41bd70c340 --- src/llamafactory/chat/vllm_engine.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index d2850a6e..34126adf 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -1,10 +1,12 @@ import uuid from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence, Union +from packaging import version + from ..data import get_template_and_fix_tokenizer from ..extras.logging import get_logger from ..extras.misc import get_device_count -from ..extras.packages import is_vllm_available +from ..extras.packages import is_vllm_available, _get_package_version from ..model import load_config, load_tokenizer from ..model.model_utils.visual import LlavaMultiModalProjectorForYiVLForVLLM from .base_engine import BaseEngine, Response @@ -14,10 +16,10 @@ if is_vllm_available(): from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest - try: - from vllm.multimodal import MultiModalData # type: ignore (for vllm>=0.5.0) - except ImportError: - from vllm.sequence import MultiModalData # for vllm<0.5.0 + if _get_package_version("vllm") >= version.parse("0.5.0"): + from vllm.multimodal.image import ImagePixelData + else: + from vllm.sequence import MultiModalData if TYPE_CHECKING: @@ -110,7 +112,10 @@ class VllmEngine(BaseEngine): if self.processor is not None and image is not None: # add image features image_processor: "BaseImageProcessor" = getattr(self.processor, "image_processor") pixel_values = image_processor(image, return_tensors="pt")["pixel_values"] - multi_modal_data = MultiModalData(type=MultiModalData.Type.IMAGE, data=pixel_values) + if _get_package_version("vllm") >= version.parse("0.5.0"): + multi_modal_data = ImagePixelData(pixel_values) + else: + multi_modal_data = MultiModalData(type=MultiModalData.Type.IMAGE, data=pixel_values) else: multi_modal_data = None From 41beb7f0a398c2e362f80c1fbd8743fede1c4173 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 00:07:48 +0800 Subject: [PATCH 026/160] fix docker compose usage Former-commit-id: 59a5bd5d5c8d2a44e2dad26b74e77a45e109c8d6 --- README.md | 10 ++++++++-- README_zh.md | 10 ++++++++-- docker-compose.yml | 5 +++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a773a5f4..65964560 100644 --- a/README.md +++ b/README.md @@ -423,6 +423,8 @@ llamafactory-cli webui ### Build Docker +#### Use Docker + ```bash docker build -f ./Dockerfile \ --build-arg INSTALL_BNB=false \ @@ -442,8 +444,12 @@ docker run -it --gpus=all \ llamafactory:latest ``` -> [!TIP] -> Use Docker Compose to build image via `docker-compose up -d`. +#### Use Docker Compose + +```bash +docker-compose up -d +docker-compose exec -it llamafactory bash +```
Details about volume diff --git a/README_zh.md b/README_zh.md index 7a9cb159..7962a6d1 100644 --- a/README_zh.md +++ b/README_zh.md @@ -423,6 +423,8 @@ llamafactory-cli webui ### 构建 Docker +#### 使用 Docker + ```bash docker build -f ./Dockerfile \ --build-arg INSTALL_BNB=false \ @@ -442,8 +444,12 @@ docker run -it --gpus=all \ llamafactory:latest ``` -> [!TIP] -> 通过 `docker-compose up -d` 使用 Docker Compose 构建镜像。 +#### 使用 Docker Compose + +```bash +docker-compose up -d +docker-compose exec -it llamafactory bash +```
数据卷详情 diff --git a/docker-compose.yml b/docker-compose.yml index b3e4a34d..c5dc34e9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: llamafactory: build: @@ -19,6 +17,9 @@ services: - "7860:7860" - "8000:8000" ipc: host + tty: true + stdin_open: true + command: bash deploy: resources: reservations: From 5080f2314cd57fd0d769d18f2cdbcd4e002500e3 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 00:48:44 +0800 Subject: [PATCH 027/160] fix lint Former-commit-id: b170165679317af2b3f03633afac27661b3deb06 --- README.md | 2 +- README_zh.md | 2 +- src/llamafactory/hparams/finetuning_args.py | 17 ++++++++--------- src/llamafactory/hparams/model_args.py | 12 ++++++++---- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 65964560..994a62c6 100644 --- a/README.md +++ b/README.md @@ -448,7 +448,7 @@ docker run -it --gpus=all \ ```bash docker-compose up -d -docker-compose exec -it llamafactory bash +docker-compose exec llamafactory bash ```
Details about volume diff --git a/README_zh.md b/README_zh.md index 7962a6d1..fa395c6b 100644 --- a/README_zh.md +++ b/README_zh.md @@ -448,7 +448,7 @@ docker run -it --gpus=all \ ```bash docker-compose up -d -docker-compose exec -it llamafactory bash +docker-compose exec llamafactory bash ```
数据卷详情 diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index 08af31e4..facbe792 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import Literal, Optional +from typing import List, Literal, Optional @dataclass @@ -319,20 +319,19 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA return [item.strip() for item in arg.split(",")] return arg - self.freeze_trainable_modules = split_arg(self.freeze_trainable_modules) - self.freeze_extra_modules = split_arg(self.freeze_extra_modules) - self.lora_alpha = self.lora_alpha or self.lora_rank * 2 - self.lora_target = split_arg(self.lora_target) - self.additional_target = split_arg(self.additional_target) - self.galore_target = split_arg(self.galore_target) + self.freeze_trainable_modules: List[str] = split_arg(self.freeze_trainable_modules) + self.freeze_extra_modules: Optional[List[str]] = split_arg(self.freeze_extra_modules) + self.lora_alpha: int = self.lora_alpha or self.lora_rank * 2 + self.lora_target: List[str] = split_arg(self.lora_target) + self.additional_target: Optional[List[str]] = split_arg(self.additional_target) + self.galore_target: List[str] = split_arg(self.galore_target) self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only + self.use_ref_model = self.pref_loss not in ["orpo", "simpo"] assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." assert self.reward_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." 
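The lint pass above mostly adds type annotations around `split_arg` in `FinetuningArguments.__post_init__`; the helper's behaviour is unchanged. A self-contained sketch of the same comma-splitting rule, with only the logic taken from the diff and the surrounding scaffolding invented for the example:

```python
from typing import List, Optional, Union


def split_arg(arg: Optional[Union[str, List[str]]]) -> Optional[List[str]]:
    """Turn a comma-separated string into a stripped list; pass other values through."""
    if isinstance(arg, str):
        return [item.strip() for item in arg.split(",")]
    return arg


if __name__ == "__main__":
    print(split_arg("q_proj, v_proj"))          # ['q_proj', 'v_proj']
    print(split_arg(["already", "a", "list"]))  # returned unchanged
    print(split_arg(None))                      # None
```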
- self.use_ref_model = self.pref_loss not in ["orpo", "simpo"] - if self.stage == "ppo" and self.reward_model is None: raise ValueError("`reward_model` is necessary for PPO training.") diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 71467770..359beafd 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -1,9 +1,13 @@ from dataclasses import asdict, dataclass, field -from typing import Any, Dict, Literal, Optional +from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union from typing_extensions import Self +if TYPE_CHECKING: + import torch + + @dataclass class ModelArguments: r""" @@ -194,9 +198,9 @@ class ModelArguments: ) def __post_init__(self): - self.compute_dtype = None - self.device_map = None - self.model_max_length = None + self.compute_dtype: Optional["torch.dtype"] = None + self.device_map: Optional[Union[str, Dict[str, Any]]] = None + self.model_max_length: Optional[int] = None if self.split_special_tokens and self.use_fast_tokenizer: raise ValueError("`split_special_tokens` is only supported for slow tokenizers.") From 1b6786a21f5bd384f1f401598c94287c5b2826d7 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 01:00:56 +0800 Subject: [PATCH 028/160] add neo-sft dataset Former-commit-id: 34863fa7cb641ceca92e3a2eec914126db537b62 --- README.md | 1 + README_zh.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 994a62c6..5bbaf2d7 100644 --- a/README.md +++ b/README.md @@ -259,6 +259,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t - [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia) - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) +- [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) diff --git a/README_zh.md b/README_zh.md index fa395c6b..fb616909 100644 --- a/README_zh.md +++ b/README_zh.md @@ -259,6 +259,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia) - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) +- [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) From 0a75224f62b2c961b922d0cb95514fc5cd683e57 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 01:58:16 +0800 Subject: [PATCH 029/160] clean code Former-commit-id: f54cafd5c7f0383370d1a2f357834a61a97397ce --- src/llamafactory/chat/vllm_engine.py | 12 +++++----- src/llamafactory/extras/packages.py | 22 +++++-------------- .../model/model_utils/attention.py | 7 +++--- src/llamafactory/train/sft/metric.py | 3 ++- 4 files changed, 17 insertions(+), 27 deletions(-) diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py 
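The same commit annotates the runtime-only attributes that `ModelArguments.__post_init__` initialises to `None`. A toy dataclass showing that pattern, declaring the types at the point of assignment rather than as CLI-visible fields; the class name is illustrative and `str` stands in for `torch.dtype` so the snippet has no torch dependency:

```python
from dataclasses import dataclass
from typing import Any, Dict, Optional, Union


@dataclass
class ToyModelArguments:
    """Minimal stand-in for an argument dataclass parsed from the CLI."""

    model_name_or_path: str = "meta-llama/Meta-Llama-3-8B-Instruct"

    def __post_init__(self):
        # Runtime-only state: not CLI arguments, so they are created (and
        # type-annotated) here instead of being declared as dataclass fields.
        self.compute_dtype: Optional[str] = None
        self.device_map: Optional[Union[str, Dict[str, Any]]] = None
        self.model_max_length: Optional[int] = None


if __name__ == "__main__":
    args = ToyModelArguments()
    print(args.model_name_or_path, args.compute_dtype, args.device_map)
```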
index 34126adf..e4c05478 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -1,12 +1,10 @@ import uuid from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence, Union -from packaging import version - from ..data import get_template_and_fix_tokenizer from ..extras.logging import get_logger from ..extras.misc import get_device_count -from ..extras.packages import is_vllm_available, _get_package_version +from ..extras.packages import is_vllm_available, is_vllm_version_greater_than_0_5 from ..model import load_config, load_tokenizer from ..model.model_utils.visual import LlavaMultiModalProjectorForYiVLForVLLM from .base_engine import BaseEngine, Response @@ -16,7 +14,7 @@ if is_vllm_available(): from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest - if _get_package_version("vllm") >= version.parse("0.5.0"): + if is_vllm_version_greater_than_0_5(): from vllm.multimodal.image import ImagePixelData else: from vllm.sequence import MultiModalData @@ -112,9 +110,9 @@ class VllmEngine(BaseEngine): if self.processor is not None and image is not None: # add image features image_processor: "BaseImageProcessor" = getattr(self.processor, "image_processor") pixel_values = image_processor(image, return_tensors="pt")["pixel_values"] - if _get_package_version("vllm") >= version.parse("0.5.0"): - multi_modal_data = ImagePixelData(pixel_values) - else: + if is_vllm_version_greater_than_0_5(): + multi_modal_data = ImagePixelData(image=pixel_values) + else: # TODO: remove vllm 0.4.3 support multi_modal_data = MultiModalData(type=MultiModalData.Type.IMAGE, data=pixel_values) else: multi_modal_data = None diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index 4c9e6492..0746bb4f 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -1,5 +1,6 @@ import importlib.metadata import importlib.util +from functools import lru_cache from typing import TYPE_CHECKING from packaging import version @@ -24,10 +25,6 @@ def is_fastapi_available(): return _is_package_available("fastapi") -def is_flash_attn2_available(): - return _is_package_available("flash_attn") and _get_package_version("flash_attn") > version.parse("2.0.0") - - def is_galore_available(): return _is_package_available("galore_torch") @@ -36,18 +33,10 @@ def is_gradio_available(): return _is_package_available("gradio") -def is_jieba_available(): - return _is_package_available("jieba") - - def is_matplotlib_available(): return _is_package_available("matplotlib") -def is_nltk_available(): - return _is_package_available("nltk") - - def is_pillow_available(): return _is_package_available("PIL") @@ -60,10 +49,6 @@ def is_rouge_available(): return _is_package_available("rouge_chinese") -def is_sdpa_available(): - return _get_package_version("torch") > version.parse("2.1.1") - - def is_starlette_available(): return _is_package_available("sse_starlette") @@ -74,3 +59,8 @@ def is_uvicorn_available(): def is_vllm_available(): return _is_package_available("vllm") + + +@lru_cache +def is_vllm_version_greater_than_0_5(): + return _get_package_version("vllm") >= version.parse("0.5.0") diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index b52ddc86..2bd36fdc 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -1,7 +1,8 @@ from 
typing import TYPE_CHECKING +from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available + from ...extras.logging import get_logger -from ...extras.packages import is_flash_attn2_available, is_sdpa_available if TYPE_CHECKING: @@ -21,13 +22,13 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "Model requested_attn_implementation = "eager" elif model_args.flash_attn == "sdpa": - if not is_sdpa_available(): + if not is_torch_sdpa_available(): logger.warning("torch>=2.1.1 is required for SDPA attention.") return requested_attn_implementation = "sdpa" elif model_args.flash_attn == "fa2": - if not is_flash_attn2_available(): + if not is_flash_attn_2_available(): logger.warning("FlashAttention-2 is not installed.") return diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index b135fcfb..6ed356c1 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -2,9 +2,10 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union import numpy as np +from transformers.utils import is_jieba_available, is_nltk_available from ...extras.constants import IGNORE_INDEX -from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available +from ...extras.packages import is_rouge_available if TYPE_CHECKING: From 103a507b39ef6fd2135f8cebef9d2202c1213186 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 02:25:50 +0800 Subject: [PATCH 030/160] fix #4209 DeepSpeed ZeRO3 has inflight param error when calling model.eval() Former-commit-id: 4be013f18ea6a35b5a11db98db5f0670ffb41619 --- src/llamafactory/train/dpo/trainer.py | 7 +++++-- src/llamafactory/train/kto/trainer.py | 7 +++++-- src/llamafactory/train/ppo/trainer.py | 2 ++ src/llamafactory/train/trainer_utils.py | 13 ------------- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index d860b29a..5bdb9c43 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -1,3 +1,4 @@ +import warnings from collections import defaultdict from contextlib import nullcontext from types import MethodType @@ -10,7 +11,7 @@ from trl import DPOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps, get_ref_context +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps if TYPE_CHECKING: @@ -61,6 +62,8 @@ class CustomDPOTrainer(DPOTrainer): if not hasattr(self, "accelerator"): raise AttributeError("Please update `transformers`.") + warnings.simplefilter("ignore") # remove gc warnings on ref model + if ref_model is not None: if self.is_deepspeed_enabled: if not ( @@ -176,7 +179,7 @@ class CustomDPOTrainer(DPOTrainer): if self.ref_model is None: ref_model = model - ref_context = get_ref_context(self.accelerator, model) + ref_context = self.accelerator.unwrap_model(model).disable_adapter() else: ref_model = self.ref_model ref_context = nullcontext() diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 22a84e4a..3b4488fc 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -1,3 +1,4 @@ +import warnings from collections import defaultdict from contextlib import nullcontext from types 
import MethodType @@ -9,7 +10,7 @@ from trl import KTOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps, get_ref_context +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps if TYPE_CHECKING: @@ -60,6 +61,8 @@ class CustomKTOTrainer(KTOTrainer): if not hasattr(self, "accelerator"): raise AttributeError("Please update `transformers`.") + warnings.simplefilter("ignore") # remove gc warnings on ref model + if ref_model is not None: if self.is_deepspeed_enabled: if not ( @@ -143,7 +146,7 @@ class CustomKTOTrainer(KTOTrainer): """ if self.ref_model is None: ref_model = model - ref_context = get_ref_context(self.accelerator, model) + ref_context = self.accelerator.unwrap_model(model).disable_adapter() else: ref_model = self.ref_model ref_context = nullcontext() diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 2e1288e4..737c45a3 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -1,6 +1,7 @@ import math import os import sys +import warnings from types import MethodType from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple @@ -136,6 +137,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): device_type = unwrapped_model.pretrained_model.device.type self.amp_context = torch.autocast(device_type, dtype=model_args.compute_dtype) + warnings.simplefilter("ignore") # remove gc warnings on ref model if finetuning_args.reward_model_type == "full": if self.is_deepspeed_enabled: diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 7e9cc881..48944a63 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -1,4 +1,3 @@ -from contextlib import contextmanager from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch @@ -19,7 +18,6 @@ if is_galore_available(): if TYPE_CHECKING: - from accelerate import Accelerator from transformers import PreTrainedModel, Seq2SeqTrainingArguments from trl import AutoModelForCausalLMWithValueHead @@ -154,17 +152,6 @@ def create_reward_model( return reward_model -@contextmanager -def get_ref_context(accelerator: "Accelerator", model: "PreTrainedModel"): - r""" - Gets adapter context for the reference model. - """ - with accelerator.unwrap_model(model).disable_adapter(): - model.eval() - yield - model.train() - - def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]: r""" Returns a list of names of parameters with weight decay. 
(weights in non-layernorm layers) From 49b58fd6afddacc66c4f560d970b7b776419f093 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 02:48:21 +0800 Subject: [PATCH 031/160] fix #4221 Former-commit-id: 05a3be4853b941909e7d193c31e8d62c8c5f879b --- src/llamafactory/data/aligner.py | 8 ++++++-- src/llamafactory/data/loader.py | 11 ++++++----- src/llamafactory/data/preprocess.py | 3 +-- src/llamafactory/data/processors/feedback.py | 3 +-- src/llamafactory/data/processors/pairwise.py | 3 +-- src/llamafactory/data/processors/pretrain.py | 2 +- src/llamafactory/data/processors/supervised.py | 3 +-- src/llamafactory/data/processors/unsupervised.py | 3 +-- src/llamafactory/train/sft/metric.py | 2 +- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py index 434956af..3e9d5c46 100644 --- a/src/llamafactory/data/aligner.py +++ b/src/llamafactory/data/aligner.py @@ -10,6 +10,7 @@ from .data_utils import Role if TYPE_CHECKING: from datasets import Dataset, IterableDataset + from transformers import Seq2SeqTrainingArguments from ..hparams import DataArguments from .parser import DatasetAttr @@ -175,7 +176,10 @@ def convert_sharegpt( def align_dataset( - dataset: Union["Dataset", "IterableDataset"], dataset_attr: "DatasetAttr", data_args: "DataArguments" + dataset: Union["Dataset", "IterableDataset"], + dataset_attr: "DatasetAttr", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", ) -> Union["Dataset", "IterableDataset"]: r""" Aligned dataset: @@ -208,7 +212,7 @@ def align_dataset( if not data_args.streaming: kwargs = dict( num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=(not data_args.overwrite_cache), + load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0), desc="Converting format of dataset", ) diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index 2c236c76..ba426f81 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -18,8 +18,7 @@ from .template import get_template_and_fix_tokenizer if TYPE_CHECKING: from datasets import Dataset, IterableDataset - from transformers import ProcessorMixin, Seq2SeqTrainingArguments - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments from ..hparams import DataArguments, ModelArguments from .parser import DatasetAttr @@ -32,6 +31,7 @@ def load_single_dataset( dataset_attr: "DatasetAttr", model_args: "ModelArguments", data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", ) -> Union["Dataset", "IterableDataset"]: logger.info("Loading dataset {}...".format(dataset_attr)) data_path, data_name, data_dir, data_files = None, None, None, None @@ -123,7 +123,7 @@ def load_single_dataset( max_samples = min(data_args.max_samples, len(dataset)) dataset = dataset.select(range(max_samples)) - return align_dataset(dataset, dataset_attr, data_args) + return align_dataset(dataset, dataset_attr, data_args, training_args) def get_dataset( @@ -157,7 +157,8 @@ def get_dataset( if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True): raise ValueError("The dataset is not applicable in the current training stage.") - all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args)) + all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args, 
training_args)) + dataset = merge_dataset(all_datasets, data_args, training_args) with training_args.main_process_first(desc="pre-process dataset"): @@ -169,7 +170,7 @@ def get_dataset( if not data_args.streaming: kwargs = dict( num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=(not data_args.overwrite_cache), + load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0), desc="Running tokenizer on dataset", ) diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py index 97789c39..875f55d6 100644 --- a/src/llamafactory/data/preprocess.py +++ b/src/llamafactory/data/preprocess.py @@ -13,8 +13,7 @@ from .processors.unsupervised import preprocess_unsupervised_dataset, print_unsu if TYPE_CHECKING: - from transformers import ProcessorMixin, Seq2SeqTrainingArguments - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments from ..hparams import DataArguments from .template import Template diff --git a/src/llamafactory/data/processors/feedback.py b/src/llamafactory/data/processors/feedback.py index 98d83658..5fba452c 100644 --- a/src/llamafactory/data/processors/feedback.py +++ b/src/llamafactory/data/processors/feedback.py @@ -6,8 +6,7 @@ from .processor_utils import get_paligemma_token_type_ids, get_pixel_values if TYPE_CHECKING: - from transformers import ProcessorMixin - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin from ...hparams import DataArguments from ..template import Template diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py index fe984efa..db52c6a7 100644 --- a/src/llamafactory/data/processors/pairwise.py +++ b/src/llamafactory/data/processors/pairwise.py @@ -6,8 +6,7 @@ from .processor_utils import get_paligemma_token_type_ids, get_pixel_values if TYPE_CHECKING: - from transformers import ProcessorMixin - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin from ...hparams import DataArguments from ..template import Template diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index 832c987e..a10ccabd 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any, Dict, List if TYPE_CHECKING: - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer from ...hparams import DataArguments diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index 19d60280..f59f5371 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -7,8 +7,7 @@ from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, gre if TYPE_CHECKING: - from transformers import ProcessorMixin - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin from ...hparams import DataArguments from ..template import Template diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py index f711eeac..38497a15 100644 --- a/src/llamafactory/data/processors/unsupervised.py +++ 
b/src/llamafactory/data/processors/unsupervised.py @@ -6,8 +6,7 @@ from .processor_utils import get_paligemma_token_type_ids, get_pixel_values if TYPE_CHECKING: - from transformers import ProcessorMixin - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer, ProcessorMixin from ...hparams import DataArguments from ..template import Template diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index 6ed356c1..923238d6 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -9,7 +9,7 @@ from ...extras.packages import is_rouge_available if TYPE_CHECKING: - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers import PreTrainedTokenizer if is_jieba_available(): From 46f441dd370f7d3aca7cbb6ca24c82301f95932f Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 13 Jun 2024 03:15:06 +0800 Subject: [PATCH 032/160] update examples Former-commit-id: 19681f93db399d695aa8e35f8ec2a9e720875baa --- README.md | 2 +- README_zh.md | 2 +- examples/README.md | 126 ++++++++--------- examples/README_zh.md | 128 ++++++++---------- .../extras/fsdp_qlora/llama3_lora_sft.yaml | 4 +- .../extras/llama_pro/llama3_freeze_sft.yaml | 1 + examples/extras/loraplus/llama3_lora_sft.yaml | 1 + examples/extras/mod/llama3_full_sft.yaml | 1 + examples/lora_multi_gpu/llama3_lora_sft.yaml | 41 ------ .../llama3_full_predict.yaml | 0 .../llama3_full_sft_ds3.yaml} | 0 .../llama3_lora_dpo.yaml | 1 + .../llama3_lora_eval.yaml | 0 .../llama3_lora_kto.yaml | 2 + .../llama3_lora_ppo.yaml | 1 + .../llama3_lora_predict.yaml | 1 + .../llama3_lora_pretrain.yaml | 1 + .../llama3_lora_reward.yaml | 1 + .../llama3_lora_sft.yaml | 1 + .../llama3_lora_sft_ds0.yaml} | 4 +- .../llama3_lora_sft_ds3.yaml} | 4 +- .../llama3_preprocess.yaml | 0 .../llava1_5_lora_sft.yaml | 1 + .../llama3_lora_sft_aqlm.yaml | 1 + .../llama3_lora_sft_awq.yaml | 1 + .../llama3_lora_sft_bitsandbytes.yaml | 1 + .../llama3_lora_sft_gptq.yaml | 1 + 27 files changed, 128 insertions(+), 199 deletions(-) delete mode 100644 examples/lora_multi_gpu/llama3_lora_sft.yaml rename examples/{full_multi_gpu => train_full}/llama3_full_predict.yaml (100%) rename examples/{full_multi_gpu/llama3_full_sft.yaml => train_full/llama3_full_sft_ds3.yaml} (100%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_dpo.yaml (96%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_eval.yaml (100%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_kto.yaml (94%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_ppo.yaml (96%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_predict.yaml (95%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_pretrain.yaml (96%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_reward.yaml (96%) rename examples/{lora_single_gpu => train_lora}/llama3_lora_sft.yaml (96%) rename examples/{lora_multi_npu/llama3_lora_sft_ds.yaml => train_lora/llama3_lora_sft_ds0.yaml} (98%) rename examples/{lora_multi_gpu/llama3_lora_sft_ds.yaml => train_lora/llama3_lora_sft_ds3.yaml} (98%) rename examples/{lora_single_gpu => train_lora}/llama3_preprocess.yaml (100%) rename examples/{lora_single_gpu => train_lora}/llava1_5_lora_sft.yaml (96%) rename examples/{qlora_single_gpu => train_qlora}/llama3_lora_sft_aqlm.yaml (96%) rename examples/{qlora_single_gpu => train_qlora}/llama3_lora_sft_awq.yaml (96%) rename examples/{qlora_single_gpu => 
train_qlora}/llama3_lora_sft_bitsandbytes.yaml (96%) rename examples/{qlora_single_gpu => train_qlora}/llama3_lora_sft_gptq.yaml (96%) diff --git a/README.md b/README.md index 5bbaf2d7..5dd10d5a 100644 --- a/README.md +++ b/README.md @@ -406,7 +406,7 @@ Please refer to [data/README.md](data/README.md) for checking the details about Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Llama3-8B-Instruct model, respectively. ```bash -llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml llamafactory-cli chat examples/inference/llama3_lora_sft.yaml llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` diff --git a/README_zh.md b/README_zh.md index fb616909..76bd2d89 100644 --- a/README_zh.md +++ b/README_zh.md @@ -406,7 +406,7 @@ Docker 镜像: 下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。 ```bash -llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml llamafactory-cli chat examples/inference/llama3_lora_sft.yaml llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` diff --git a/examples/README.md b/examples/README.md index f985d552..3372afb9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -4,59 +4,57 @@ Make sure to execute these commands in the `LLaMA-Factory` directory. ## Table of Contents -- [LoRA Fine-Tuning on A Single GPU](#lora-fine-tuning-on-a-single-gpu) -- [QLoRA Fine-Tuning on a Single GPU](#qlora-fine-tuning-on-a-single-gpu) -- [LoRA Fine-Tuning on Multiple GPUs](#lora-fine-tuning-on-multiple-gpus) -- [LoRA Fine-Tuning on Multiple NPUs](#lora-fine-tuning-on-multiple-npus) -- [Full-Parameter Fine-Tuning on Multiple GPUs](#full-parameter-fine-tuning-on-multiple-gpus) +- [LoRA Fine-Tuning](#lora-fine-tuning) +- [QLoRA Fine-Tuning](#qlora-fine-tuning) +- [Full-Parameter Fine-Tuning](#full-parameter-fine-tuning) - [Merging LoRA Adapters and Quantization](#merging-lora-adapters-and-quantization) - [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models) - [Extras](#extras) ## Examples -### LoRA Fine-Tuning on A Single GPU +### LoRA Fine-Tuning #### (Continuous) Pre-Training ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml +llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml ``` #### Supervised Fine-Tuning ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml ``` #### Multimodal Supervised Fine-Tuning ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml +llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml ``` #### Reward Modeling ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml +llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml ``` #### PPO Training ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml +llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml ``` #### DPO/ORPO/SimPO Training ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml +llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml ``` #### KTO Training ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train 
examples/lora_single_gpu/llama3_lora_kto.yaml +llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml ``` #### Preprocess Dataset @@ -64,95 +62,79 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo It is useful for large dataset, use `tokenized_path` in config to load the preprocessed dataset. ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml +llamafactory-cli train examples/train_lora/llama3_preprocess.yaml ``` #### Evaluating on MMLU/CMMLU/C-Eval Benchmarks ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml +llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml ``` #### Batch Predicting and Computing BLEU and ROUGE Scores ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml -``` - -### QLoRA Fine-Tuning on a Single GPU - -#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended) - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml -``` - -#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml -``` - -#### Supervised Fine-Tuning with 4-bit AWQ Quantization - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml -``` - -#### Supervised Fine-Tuning with 2-bit AQLM Quantization - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml -``` - -### LoRA Fine-Tuning on Multiple GPUs - -#### Supervised Fine-Tuning on Single Node - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml ``` #### Supervised Fine-Tuning on Multiple Nodes ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml ``` #### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding) ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft_ds.yaml +FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds.yaml ``` -### LoRA Fine-Tuning on Multiple NPUs +### QLoRA Fine-Tuning -#### Supervised Fine-Tuning with DeepSpeed ZeRO-0 +#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended) ```bash -ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml ``` -### Full-Parameter Fine-Tuning on Multiple GPUs +#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization + +```bash +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml +``` + +#### Supervised Fine-Tuning with 4-bit AWQ Quantization + 
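Patch 030 above removes the custom `get_ref_context` helper: when no separate reference model is configured, the policy model itself is reused with its LoRA adapter temporarily disabled through PEFT's `disable_adapter()` context, and a `nullcontext()` is used otherwise. A rough sketch of that control flow with a toy model object standing in for PEFT and accelerate, so it runs without either installed:

```python
from contextlib import contextmanager, nullcontext
from typing import Optional


class ToyPeftModel:
    """Stand-in for a PEFT model exposing a disable_adapter() context manager."""

    def __init__(self) -> None:
        self.adapter_enabled = True

    @contextmanager
    def disable_adapter(self):
        self.adapter_enabled = False  # behave like the frozen base model
        try:
            yield
        finally:
            self.adapter_enabled = True


def reference_forward(model: ToyPeftModel, ref_model: Optional[ToyPeftModel] = None) -> bool:
    """Choose the reference model and the context to run it under, as in the patch."""
    if ref_model is None:
        ref_model = model
        ref_context = model.disable_adapter()  # reuse the policy model minus its adapter
    else:
        ref_context = nullcontext()  # a dedicated reference model needs no special context
    with ref_context:
        return ref_model.adapter_enabled


if __name__ == "__main__":
    policy = ToyPeftModel()
    print(reference_forward(policy))                  # False: adapter disabled for the pass
    print(reference_forward(policy, ToyPeftModel()))  # True: separate ref model untouched
```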
+```bash +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml +``` + +#### Supervised Fine-Tuning with 2-bit AQLM Quantization + +```bash +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml +``` + +### Full-Parameter Fine-Tuning #### Supervised Fine-Tuning on Single Node ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml +FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml ``` #### Supervised Fine-Tuning on Multiple Nodes ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml ``` #### Batch Predicting and Computing BLEU and ROUGE Scores ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_predict.yaml +llamafactory-cli train examples/train_full/llama3_full_predict.yaml ``` ### Merging LoRA Adapters and Quantization @@ -162,35 +144,33 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llam Note: DO NOT use quantized model or `quantization_bit` when merging LoRA adapters. ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` #### Quantizing Model using AutoGPTQ ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml +llamafactory-cli export examples/merge_lora/llama3_gptq.yaml ``` ### Inferring LoRA Fine-Tuned Models -Use `CUDA_VISIBLE_DEVICES=0,1` to infer models on multiple devices. 
- #### Use CLI ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml ``` #### Use Web UI ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml +llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml ``` #### Launch OpenAI-style API ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml +llamafactory-cli api examples/inference/llama3_lora_sft.yaml ``` ### Extras @@ -198,32 +178,32 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.y #### Full-Parameter Fine-Tuning using GaLore ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml +llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml ``` #### Full-Parameter Fine-Tuning using BAdam ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml +llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml ``` #### LoRA+ Fine-Tuning ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml +llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml ``` #### Mixture-of-Depths Fine-Tuning ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml +llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml ``` #### LLaMA-Pro Fine-Tuning ```bash bash examples/extras/llama_pro/expand.sh -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml +llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml ``` #### FSDP+QLoRA Fine-Tuning diff --git a/examples/README_zh.md b/examples/README_zh.md index cf5bbf49..64c31fbd 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -4,59 +4,57 @@ ## 目录 -- [单 GPU LoRA 微调](#单-gpu-lora-微调) -- [单 GPU QLoRA 微调](#单-gpu-qlora-微调) -- [多 GPU LoRA 微调](#多-gpu-lora-微调) -- [多 NPU LoRA 微调](#多-npu-lora-微调) -- [多 GPU 全参数微调](#多-gpu-全参数微调) +- [LoRA 微调](#lora-微调) +- [QLoRA 微调](#qlora-微调) +- [全参数微调](#全参数微调) - [合并 LoRA 适配器与模型量化](#合并-lora-适配器与模型量化) - [推理 LoRA 模型](#推理-lora-模型) - [杂项](#杂项) ## 示例 -### 单 GPU LoRA 微调 +### LoRA 微调 #### (增量)预训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml +llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml ``` #### 指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml ``` #### 多模态指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml +llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml ``` #### 奖励模型训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml +llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml ``` #### PPO 训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml +llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml ``` #### DPO/ORPO/SimPO 训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml +llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml ``` #### KTO 训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_kto.yaml +llamafactory-cli 
train examples/train_lora/llama3_lora_kto.yaml ``` #### 预处理数据集 @@ -64,95 +62,79 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo 对于大数据集有帮助,在配置中使用 `tokenized_path` 以加载预处理后的数据集。 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml +llamafactory-cli train examples/train_lora/llama3_preprocess.yaml ``` #### 在 MMLU/CMMLU/C-Eval 上评估 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml +llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml ``` #### 批量预测并计算 BLEU 和 ROUGE 分数 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml +llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml ``` -### 单 GPU QLoRA 微调 - -#### 基于 4/8 比特 Bitsandbytes 量化进行指令监督微调(推荐) +#### 多机指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml -``` - -#### 基于 4/8 比特 GPTQ 量化进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml -``` - -#### 基于 4 比特 AWQ 量化进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml -``` - -#### 基于 2 比特 AQLM 量化进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml -``` - -### 多 GPU LoRA 微调 - -#### 在单机上进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml -``` - -#### 在多机上进行指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml ``` #### 使用 DeepSpeed ZeRO-3 平均分配显存 ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft_ds.yaml +FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds.yaml ``` -### 多 NPU LoRA 微调 +### QLoRA 微调 -#### 使用 DeepSpeed ZeRO-0 进行指令监督微调 +#### 基于 4/8 比特 Bitsandbytes 量化进行指令监督微调(推荐) ```bash -ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml +llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml ``` -### 多 GPU 全参数微调 +#### 基于 4/8 比特 GPTQ 量化进行指令监督微调 + +```bash +llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml +``` + +#### 基于 4 比特 AWQ 量化进行指令监督微调 + +```bash +llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml +``` + +#### 基于 2 比特 AQLM 量化进行指令监督微调 + +```bash +llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml +``` + +### 全参数微调 #### 在单机上进行指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml +FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml ``` #### 在多机上进行指令监督微调 ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml 
-CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml ``` #### 批量预测并计算 BLEU 和 ROUGE 分数 ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_predict.yaml +llamafactory-cli train examples/train_full/llama3_full_predict.yaml ``` ### 合并 LoRA 适配器与模型量化 @@ -162,35 +144,33 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llam 注:请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml ``` #### 使用 AutoGPTQ 量化模型 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml +llamafactory-cli export examples/merge_lora/llama3_gptq.yaml ``` ### 推理 LoRA 模型 -使用 `CUDA_VISIBLE_DEVICES=0,1` 进行多卡推理。 - #### 使用命令行接口 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml ``` #### 使用浏览器界面 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml +llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml ``` #### 启动 OpenAI 风格 API ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml +llamafactory-cli api examples/inference/llama3_lora_sft.yaml ``` ### 杂项 @@ -198,32 +178,32 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.y #### 使用 GaLore 进行全参数训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml +llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml ``` #### 使用 BAdam 进行全参数训练 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml +llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml ``` #### LoRA+ 微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml +llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml ``` #### 深度混合微调 ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml +llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml ``` #### LLaMA-Pro 微调 ```bash bash examples/extras/llama_pro/expand.sh -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml +llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml ``` #### FSDP+QLoRA 微调 diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml index 084269ef..cc773991 100644 --- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml +++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml @@ -8,9 +8,6 @@ do_train: true finetuning_type: lora lora_target: all -### ddp -ddp_timeout: 180000000 - ### dataset dataset: identity,alpaca_en_demo template: llama3 @@ -34,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml 
b/examples/extras/llama_pro/llama3_freeze_sft.yaml index 444a1113..f92d6945 100644 --- a/examples/extras/llama_pro/llama3_freeze_sft.yaml +++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml @@ -32,6 +32,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml index 1ba654ec..57383ae0 100644 --- a/examples/extras/loraplus/llama3_lora_sft.yaml +++ b/examples/extras/loraplus/llama3_lora_sft.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml index df03c1e0..085febfc 100644 --- a/examples/extras/mod/llama3_full_sft.yaml +++ b/examples/extras/mod/llama3_full_sft.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 pure_bf16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml deleted file mode 100644 index 348e53b9..00000000 --- a/examples/lora_multi_gpu/llama3_lora_sft.yaml +++ /dev/null @@ -1,41 +0,0 @@ -### model -model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct - -### method -stage: sft -do_train: true -finetuning_type: lora -lora_target: all - -### ddp -ddp_timeout: 180000000 - -### dataset -dataset: identity,alpaca_en_demo -template: llama3 -cutoff_len: 1024 -max_samples: 1000 -overwrite_cache: true -preprocessing_num_workers: 16 - -### output -output_dir: saves/llama3-8b/lora/sft -logging_steps: 10 -save_steps: 500 -plot_loss: true -overwrite_output_dir: true - -### train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 2 -learning_rate: 1.0e-4 -num_train_epochs: 3.0 -lr_scheduler_type: cosine -warmup_ratio: 0.1 -fp16: true - -### eval -val_size: 0.1 -per_device_eval_batch_size: 1 -eval_strategy: steps -eval_steps: 500 diff --git a/examples/full_multi_gpu/llama3_full_predict.yaml b/examples/train_full/llama3_full_predict.yaml similarity index 100% rename from examples/full_multi_gpu/llama3_full_predict.yaml rename to examples/train_full/llama3_full_predict.yaml diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/train_full/llama3_full_sft_ds3.yaml similarity index 100% rename from examples/full_multi_gpu/llama3_full_sft.yaml rename to examples/train_full/llama3_full_sft_ds3.yaml diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/train_lora/llama3_lora_dpo.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_dpo.yaml rename to examples/train_lora/llama3_lora_dpo.yaml index 78344330..db25fb51 100644 --- a/examples/lora_single_gpu/llama3_lora_dpo.yaml +++ b/examples/train_lora/llama3_lora_dpo.yaml @@ -32,6 +32,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_lora_eval.yaml b/examples/train_lora/llama3_lora_eval.yaml similarity index 100% rename from examples/lora_single_gpu/llama3_lora_eval.yaml rename to examples/train_lora/llama3_lora_eval.yaml diff --git a/examples/lora_single_gpu/llama3_lora_kto.yaml b/examples/train_lora/llama3_lora_kto.yaml similarity index 94% rename from examples/lora_single_gpu/llama3_lora_kto.yaml rename to examples/train_lora/llama3_lora_kto.yaml index 
d5234c0a..f730c82e 100644 --- a/examples/lora_single_gpu/llama3_lora_kto.yaml +++ b/examples/train_lora/llama3_lora_kto.yaml @@ -6,6 +6,7 @@ stage: kto do_train: true finetuning_type: lora lora_target: all +pref_beta: 0.1 ### dataset dataset: kto_en_demo @@ -30,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_lora_ppo.yaml b/examples/train_lora/llama3_lora_ppo.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_ppo.yaml rename to examples/train_lora/llama3_lora_ppo.yaml index 98c842f9..e574014e 100644 --- a/examples/lora_single_gpu/llama3_lora_ppo.yaml +++ b/examples/train_lora/llama3_lora_ppo.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### generate max_new_tokens: 512 diff --git a/examples/lora_single_gpu/llama3_lora_predict.yaml b/examples/train_lora/llama3_lora_predict.yaml similarity index 95% rename from examples/lora_single_gpu/llama3_lora_predict.yaml rename to examples/train_lora/llama3_lora_predict.yaml index a127d248..148c8635 100644 --- a/examples/lora_single_gpu/llama3_lora_predict.yaml +++ b/examples/train_lora/llama3_lora_predict.yaml @@ -22,3 +22,4 @@ overwrite_output_dir: true ### eval per_device_eval_batch_size: 1 predict_with_generate: true +ddp_timeout: 180000000 diff --git a/examples/lora_single_gpu/llama3_lora_pretrain.yaml b/examples/train_lora/llama3_lora_pretrain.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_pretrain.yaml rename to examples/train_lora/llama3_lora_pretrain.yaml index db435ca9..839b3e51 100644 --- a/examples/lora_single_gpu/llama3_lora_pretrain.yaml +++ b/examples/train_lora/llama3_lora_pretrain.yaml @@ -29,6 +29,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/train_lora/llama3_lora_reward.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_reward.yaml rename to examples/train_lora/llama3_lora_reward.yaml index 1ce42ea4..79559d19 100644 --- a/examples/lora_single_gpu/llama3_lora_reward.yaml +++ b/examples/train_lora/llama3_lora_reward.yaml @@ -30,6 +30,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/train_lora/llama3_lora_sft.yaml similarity index 96% rename from examples/lora_single_gpu/llama3_lora_sft.yaml rename to examples/train_lora/llama3_lora_sft.yaml index 651b636f..fe30c575 100644 --- a/examples/lora_single_gpu/llama3_lora_sft.yaml +++ b/examples/train_lora/llama3_lora_sft.yaml @@ -30,6 +30,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml b/examples/train_lora/llama3_lora_sft_ds0.yaml similarity index 98% rename from examples/lora_multi_npu/llama3_lora_sft_ds.yaml rename to examples/train_lora/llama3_lora_sft_ds0.yaml index a0ec8aa1..08b638e6 100644 --- a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml +++ b/examples/train_lora/llama3_lora_sft_ds0.yaml @@ -6,9 +6,6 @@ stage: sft do_train: true finetuning_type: lora lora_target: all - -### ddp -ddp_timeout: 180000000 deepspeed: examples/deepspeed/ds_z0_config.json ### dataset 
@@ -34,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/train_lora/llama3_lora_sft_ds3.yaml similarity index 98% rename from examples/lora_multi_gpu/llama3_lora_sft_ds.yaml rename to examples/train_lora/llama3_lora_sft_ds3.yaml index 1c432fa7..b7266d61 100644 --- a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml +++ b/examples/train_lora/llama3_lora_sft_ds3.yaml @@ -6,9 +6,6 @@ stage: sft do_train: true finetuning_type: lora lora_target: all - -### ddp -ddp_timeout: 180000000 deepspeed: examples/deepspeed/ds_z3_config.json ### dataset @@ -34,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/train_lora/llama3_preprocess.yaml similarity index 100% rename from examples/lora_single_gpu/llama3_preprocess.yaml rename to examples/train_lora/llama3_preprocess.yaml diff --git a/examples/lora_single_gpu/llava1_5_lora_sft.yaml b/examples/train_lora/llava1_5_lora_sft.yaml similarity index 96% rename from examples/lora_single_gpu/llava1_5_lora_sft.yaml rename to examples/train_lora/llava1_5_lora_sft.yaml index df510a93..55ac31fa 100644 --- a/examples/lora_single_gpu/llava1_5_lora_sft.yaml +++ b/examples/train_lora/llava1_5_lora_sft.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/train_qlora/llama3_lora_sft_aqlm.yaml similarity index 96% rename from examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml rename to examples/train_qlora/llama3_lora_sft_aqlm.yaml index d54d6af6..7b6767d5 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml +++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml @@ -30,6 +30,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/train_qlora/llama3_lora_sft_awq.yaml similarity index 96% rename from examples/qlora_single_gpu/llama3_lora_sft_awq.yaml rename to examples/train_qlora/llama3_lora_sft_awq.yaml index 5cef178a..a2a26e4b 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml +++ b/examples/train_qlora/llama3_lora_sft_awq.yaml @@ -30,6 +30,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml similarity index 96% rename from examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml rename to examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml index b308dcab..cc773991 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml +++ b/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml @@ -31,6 +31,7 @@ num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 fp16: true +ddp_timeout: 180000000 ### eval val_size: 0.1 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/train_qlora/llama3_lora_sft_gptq.yaml similarity index 96% rename from examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml rename to examples/train_qlora/llama3_lora_sft_gptq.yaml index b950042e..ad3d854c 100644 
--- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_gptq.yaml
@@ -30,6 +30,7 @@ num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
 fp16: true
+ddp_timeout: 180000000
 
 ### eval
 val_size: 0.1

From dedefecd2b5df10c06f6d524ef1ff62827841148 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Thu, 13 Jun 2024 03:16:20 +0800
Subject: [PATCH 033/160] Update llama3_full_sft_ds3.yaml

Former-commit-id: e715af62d521112d9c155cfa91fbb42fa0e77710
---
 examples/train_full/llama3_full_sft_ds3.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/train_full/llama3_full_sft_ds3.yaml b/examples/train_full/llama3_full_sft_ds3.yaml
index 40b62f24..40afd2ee 100644
--- a/examples/train_full/llama3_full_sft_ds3.yaml
+++ b/examples/train_full/llama3_full_sft_ds3.yaml
@@ -5,9 +5,6 @@ model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 stage: sft
 do_train: true
 finetuning_type: full
-
-### ddp
-ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json
 
 ### dataset
@@ -33,6 +30,7 @@ num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
 fp16: true
+ddp_timeout: 180000000
 
 ### eval
 val_size: 0.1

From dbd1458adf3e42cc2854980255f9c5a4db705666 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Thu, 13 Jun 2024 03:19:18 +0800
Subject: [PATCH 034/160] add quant check in webui export tab

Former-commit-id: 6455ca07061ae9858cd7bc996b28be1fde697a3d
---
 src/llamafactory/webui/components/export.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/llamafactory/webui/components/export.py b/src/llamafactory/webui/components/export.py
index 7e1493c8..9d756a38 100644
--- a/src/llamafactory/webui/components/export.py
+++ b/src/llamafactory/webui/components/export.py
@@ -21,6 +21,13 @@ if TYPE_CHECKING:
 GPTQ_BITS = ["8", "4", "3", "2"]
 
 
+def can_quantize(checkpoint_path: Union[str, List[str]]) -> "gr.Dropdown":
+    if isinstance(checkpoint_path, list) and len(checkpoint_path) != 0:
+        return gr.Dropdown(value="none", interactive=False)
+    else:
+        return gr.Dropdown(interactive=True)
+
+
 def save_model(
     lang: str,
     model_name: str,
@@ -96,6 +103,9 @@ def create_export_tab(engine: "Engine") -> Dict[str, "Component"]:
         export_dir = gr.Textbox()
         export_hub_model_id = gr.Textbox()
 
+    checkpoint_path: gr.Dropdown = engine.manager.get_elem_by_id("top.checkpoint_path")
+    checkpoint_path.change(can_quantize, [checkpoint_path], [export_quantization_bit], queue=False)
+
     export_btn = gr.Button()
     info_box = gr.Textbox(show_label=False, interactive=False)
 

From 530165d9a5ca9b5182d5ff2620aeb0e959481df3 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Thu, 13 Jun 2024 03:26:10 +0800
Subject: [PATCH 035/160] update examples

Former-commit-id: d6bf6231290d79eb3a63e711f18fa711ef18a4f6
---
 examples/README.md    | 10 ++++++----
 examples/README_zh.md | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 3372afb9..180d5f7b 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -11,6 +11,8 @@ Make sure to execute these commands in the `LLaMA-Factory` directory.
 - [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models)
 - [Extras](#extras)
 
+Use `CUDA_VISIBLE_DEVICES` (GPU) or `ASCEND_RT_VISIBLE_DEVICES` (NPU) to choose computing devices.
+
 ## Examples
 
 ### LoRA Fine-Tuning
@@ -87,7 +89,7 @@ FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llama
 #### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
 ```
 
 ### QLoRA Fine-Tuning
@@ -121,14 +123,14 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_s
 #### Supervised Fine-Tuning on Single Node
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
 ```
 
 #### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
+FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
 ```
 
 #### Batch Predicting and Computing BLEU and ROUGE Scores
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 64c31fbd..b6168a95 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -11,6 +11,8 @@
 - [推理 LoRA 模型](#推理-lora-模型)
 - [杂项](#杂项)
 
+使用 `CUDA_VISIBLE_DEVICES`(GPU)或 `ASCEND_RT_VISIBLE_DEVICES`(NPU)选择计算设备。
+
 ## 示例
 
 ### LoRA 微调
@@ -87,7 +89,7 @@ FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llama
 #### 使用 DeepSpeed ZeRO-3 平均分配显存
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
 ```
 
 ### QLoRA 微调
@@ -121,14 +123,14 @@ llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
 #### 在单机上进行指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
 ```
 
 #### 在多机上进行指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
+FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
 ```
 
 #### 批量预测并计算 BLEU 和 ROUGE 分数

From f4f315fd116a293c555ae55f8bd0df64bd5cfa6c Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Thu, 13 Jun 2024 16:02:21 +0800
Subject: [PATCH 036/160] Update README.md

Former-commit-id: f8d701cd3ce2e56f95b4f5439b8b48d5b62e0d2b
---
 examples/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 180d5f7b..a6d78936 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -97,25 +97,25 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.
#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended) ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml +llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml ``` #### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml +llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml ``` #### Supervised Fine-Tuning with 4-bit AWQ Quantization ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml +llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml ``` #### Supervised Fine-Tuning with 2-bit AQLM Quantization ```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml +llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml ``` ### Full-Parameter Fine-Tuning From 3ff9b87012977bdc7393bb9e86a31f4bb4e5ff0d Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 04:05:54 +0800 Subject: [PATCH 037/160] add test cases Former-commit-id: 731176ff34cdf0cbf6b41c40c69f4ceb54c2daf6 --- src/llamafactory/chat/vllm_engine.py | 2 +- src/llamafactory/hparams/model_args.py | 8 +-- src/llamafactory/model/adapter.py | 43 +++++++------ src/llamafactory/model/patcher.py | 5 +- src/llamafactory/train/ppo/trainer.py | 3 +- tests/model/test_base.py | 32 ++++++++++ tests/model/test_freeze.py | 22 ++++++- tests/model/test_full.py | 20 ++++++- tests/model/test_lora.py | 83 +++++++++++++++++++++++++- 9 files changed, 184 insertions(+), 34 deletions(-) create mode 100644 tests/model/test_base.py diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index e4c05478..f0812a99 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -52,7 +52,7 @@ class VllmEngine(BaseEngine): "model": model_args.model_name_or_path, "trust_remote_code": True, "download_dir": model_args.cache_dir, - "dtype": model_args.vllm_dtype, + "dtype": model_args.infer_dtype, "max_model_len": model_args.vllm_maxlen, "tensor_parallel_size": get_device_count() or 1, "gpu_memory_utilization": model_args.vllm_gpu_util, diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 359beafd..bbac2e4b 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -136,10 +136,6 @@ class ModelArguments: default=8, metadata={"help": "Maximum rank of all LoRAs in the vLLM engine."}, ) - vllm_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field( - default="auto", - metadata={"help": "Data type for model weights and activations in the vLLM engine."}, - ) offload_folder: str = field( default="offload", metadata={"help": "Path to offload model weights."}, @@ -148,6 +144,10 @@ class ModelArguments: default=True, metadata={"help": "Whether or not to use KV cache in generation."}, ) + infer_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field( + default="auto", + metadata={"help": "Data type for model weights and activations at inference."} + ) hf_hub_token: Optional[str] = field( default=None, metadata={"help": "Auth token to log in with Hugging Face Hub."}, diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index 34b9eda6..c37f6009 100644 --- a/src/llamafactory/model/adapter.py +++ 
b/src/llamafactory/model/adapter.py @@ -25,8 +25,12 @@ def _setup_full_tuning( model: "PreTrainedModel", model_args: "ModelArguments", finetuning_args: "FinetuningArguments", + is_trainable: bool, cast_trainable_params_to_fp32: bool, ) -> None: + if not is_trainable: + return + logger.info("Fine-tuning method: Full") forbidden_modules = set() if model_args.visual_inputs and finetuning_args.freeze_vision_tower: @@ -47,8 +51,12 @@ def _setup_freeze_tuning( model: "PreTrainedModel", model_args: "ModelArguments", finetuning_args: "FinetuningArguments", + is_trainable: bool, cast_trainable_params_to_fp32: bool, ) -> None: + if not is_trainable: + return + logger.info("Fine-tuning method: Freeze") if model_args.visual_inputs: config = model.config.text_config @@ -132,7 +140,9 @@ def _setup_lora_tuning( is_trainable: bool, cast_trainable_params_to_fp32: bool, ) -> "PeftModel": - logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA")) + if is_trainable: + logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA")) + adapter_to_resume = None if model_args.adapter_name_or_path is not None: @@ -173,6 +183,8 @@ def _setup_lora_tuning( offload_folder=model_args.offload_folder, ) + logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) + if is_trainable and adapter_to_resume is None: # create new lora weights while training if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": target_modules = find_all_linear_modules(model, finetuning_args.freeze_vision_tower) @@ -227,9 +239,6 @@ def _setup_lora_tuning( for param in filter(lambda p: p.requires_grad, model.parameters()): param.data = param.data.to(torch.float32) - if model_args.adapter_name_or_path is not None: - logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) - return model @@ -247,29 +256,27 @@ def init_adapter( Note that the trainable parameters must be cast to float32. 
""" - if (not is_trainable) and model_args.adapter_name_or_path is None: - logger.info("Adapter is not found at evaluation, load the base model.") - return model + if is_trainable and getattr(model, "quantization_method", None) and finetuning_args.finetuning_type != "lora": + raise ValueError("Quantized models can only be used for the LoRA tuning.") - if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None): - raise ValueError("You can only use lora for quantized models.") - - if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam: + if not is_trainable: + cast_trainable_params_to_fp32 = False + elif is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam: logger.info("ZeRO3/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.") cast_trainable_params_to_fp32 = False else: logger.info("Upcasting trainable params to float32.") cast_trainable_params_to_fp32 = True - if is_trainable and finetuning_args.finetuning_type == "full": - _setup_full_tuning(model, model_args, finetuning_args, cast_trainable_params_to_fp32) - - if is_trainable and finetuning_args.finetuning_type == "freeze": - _setup_freeze_tuning(model, model_args, finetuning_args, cast_trainable_params_to_fp32) - - if finetuning_args.finetuning_type == "lora": + if finetuning_args.finetuning_type == "full": + _setup_full_tuning(model, model_args, finetuning_args, is_trainable, cast_trainable_params_to_fp32) + elif finetuning_args.finetuning_type == "freeze": + _setup_freeze_tuning(model, model_args, finetuning_args, is_trainable, cast_trainable_params_to_fp32) + elif finetuning_args.finetuning_type == "lora": model = _setup_lora_tuning( config, model, model_args, finetuning_args, is_trainable, cast_trainable_params_to_fp32 ) + else: + raise NotImplementedError("Unknown finetuning type: {}.".format(finetuning_args.finetuning_type)) return model diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 18221a10..b97ff433 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -44,7 +44,10 @@ def patch_config( is_trainable: bool, ) -> None: if model_args.compute_dtype is None: # priority: bf16 > fp16 > fp32 - model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None)) + if model_args.infer_dtype == "auto": + model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None)) + else: + model_args.compute_dtype = getattr(torch, model_args.infer_dtype) if is_torch_npu_available(): use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"] diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 737c45a3..45f47455 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -135,8 +135,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) self.is_chatglm_model = getattr(unwrapped_model.config, "model_type", None) == "chatglm" - device_type = unwrapped_model.pretrained_model.device.type - self.amp_context = torch.autocast(device_type, dtype=model_args.compute_dtype) + self.amp_context = torch.autocast(self.current_device.type, dtype=self.model_args.compute_dtype) warnings.simplefilter("ignore") # remove gc warnings on ref model if 
finetuning_args.reward_model_type == "full": diff --git a/tests/model/test_base.py b/tests/model/test_base.py new file mode 100644 index 00000000..32a3918e --- /dev/null +++ b/tests/model/test_base.py @@ -0,0 +1,32 @@ +import os + +import torch +from transformers import AutoModelForCausalLM + +from llamafactory.hparams import get_infer_args +from llamafactory.model import load_model, load_tokenizer + + +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") + +INFER_ARGS = { + "model_name_or_path": TINY_LLAMA, + "template": "llama3", + "infer_dtype": "float16", +} + + +def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): + state_dict_a = model_a.state_dict() + state_dict_b = model_b.state_dict() + assert set(state_dict_a.keys()) == set(state_dict_b.keys()) + for name in state_dict_a.keys(): + assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True + + +def test_base(): + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + ref_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + compare_model(model, ref_model) diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py index 97800696..a0618315 100644 --- a/tests/model/test_freeze.py +++ b/tests/model/test_freeze.py @@ -2,7 +2,7 @@ import os import torch -from llamafactory.hparams import get_train_args +from llamafactory.hparams import get_infer_args, get_train_args from llamafactory.model import load_model, load_tokenizer @@ -23,8 +23,15 @@ TRAIN_ARGS = { "fp16": True, } +INFER_ARGS = { + "model_name_or_path": TINY_LLAMA, + "finetuning_type": "freeze", + "template": "llama3", + "infer_dtype": "float16", +} -def test_freeze_all_modules(): + +def test_freeze_train_all_modules(): model_args, _, _, finetuning_args, _ = get_train_args({"freeze_trainable_layers": 1, **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) @@ -37,7 +44,7 @@ def test_freeze_all_modules(): assert param.dtype == torch.float16 -def test_freeze_extra_modules(): +def test_freeze_train_extra_modules(): model_args, _, _, finetuning_args, _ = get_train_args( {"freeze_trainable_layers": 1, "freeze_extra_modules": "embed_tokens,lm_head", **TRAIN_ARGS} ) @@ -50,3 +57,12 @@ def test_freeze_extra_modules(): else: assert param.requires_grad is False assert param.dtype == torch.float16 + + +def test_freeze_inference(): + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + for param in model.parameters(): + assert param.requires_grad is False + assert param.dtype == torch.float16 diff --git a/tests/model/test_full.py b/tests/model/test_full.py index 6cb78f37..802b987c 100644 --- a/tests/model/test_full.py +++ b/tests/model/test_full.py @@ -2,7 +2,7 @@ import os import torch -from llamafactory.hparams import get_train_args +from llamafactory.hparams import get_infer_args, get_train_args from llamafactory.model import load_model, load_tokenizer @@ -23,11 +23,27 @@ TRAIN_ARGS = { "fp16": True, } +INFER_ARGS = { + "model_name_or_path": TINY_LLAMA, + "finetuning_type": "full", + "template": "llama3", + "infer_dtype": "float16", +} -def 
test_full(): + +def test_full_train(): model_args, _, _, finetuning_args, _ = get_train_args(TRAIN_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) for param in model.parameters(): assert param.requires_grad is True assert param.dtype == torch.float32 + + +def test_full_inference(): + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + for param in model.parameters(): + assert param.requires_grad is False + assert param.dtype == torch.float16 diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index 2e2b89d9..3e2503f1 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -1,13 +1,18 @@ import os +from typing import Sequence import torch +from peft import LoraModel, PeftModel +from transformers import AutoModelForCausalLM -from llamafactory.hparams import get_train_args +from llamafactory.hparams import get_infer_args, get_train_args from llamafactory.model import load_model, load_tokenizer TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") +TINY_LLAMA_ADAPTER = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora") + TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", @@ -23,8 +28,32 @@ TRAIN_ARGS = { "fp16": True, } +INFER_ARGS = { + "model_name_or_path": TINY_LLAMA, + "adapter_name_or_path": TINY_LLAMA_ADAPTER, + "finetuning_type": "lora", + "template": "llama3", + "infer_dtype": "float16", +} -def test_lora_all_modules(): + +def load_reference_model() -> "torch.nn.Module": + model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA) + return PeftModel.from_pretrained(model, TINY_LLAMA_ADAPTER) + + +def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_keys: Sequence[str] = []): + state_dict_a = model_a.state_dict() + state_dict_b = model_b.state_dict() + assert set(state_dict_a.keys()) == set(state_dict_b.keys()) + for name in state_dict_a.keys(): + if any(key in name for key in diff_keys): + assert torch.allclose(state_dict_a[name], state_dict_b[name]) is False + else: + assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True + + +def test_lora_train_all_modules(): model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "all", **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) @@ -41,7 +70,7 @@ def test_lora_all_modules(): assert linear_modules == {"q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"} -def test_lora_extra_modules(): +def test_lora_train_extra_modules(): model_args, _, _, finetuning_args, _ = get_train_args( {"lora_target": "all", "additional_target": "embed_tokens,lm_head", **TRAIN_ARGS} ) @@ -61,3 +90,51 @@ def test_lora_extra_modules(): assert param.dtype == torch.float16 assert extra_modules == {"embed_tokens", "lm_head"} + + +def test_lora_train_old_adapters(): + model_args, _, _, finetuning_args, _ = get_train_args( + {"adapter_name_or_path": TINY_LLAMA_ADAPTER, "create_new_adapter": False, **TRAIN_ARGS} + ) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + + base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, 
torch_dtype=model.dtype, device_map=model.device) + ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) + for param in filter(lambda p: p.requires_grad, ref_model.parameters()): + param.data = param.data.to(torch.float32) + + compare_model(model, ref_model) + + +def test_lora_train_new_adapters(): + model_args, _, _, finetuning_args, _ = get_train_args( + {"adapter_name_or_path": TINY_LLAMA_ADAPTER, "create_new_adapter": True, **TRAIN_ARGS} + ) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + + base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) + for param in filter(lambda p: p.requires_grad, ref_model.parameters()): + param.data = param.data.to(torch.float32) + + compare_model( + model, ref_model, diff_keys=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"] + ) + + +def test_lora_inference(): + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + + base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + ref_model: "LoraModel" = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER) + ref_model = ref_model.merge_and_unload() + compare_model(model, ref_model) + + for name, param in model.named_parameters(): + assert param.requires_grad is False + assert param.dtype == torch.float16 + assert "lora" not in name From a30931fe0ff797428d0f03623389b88d0c8ecd28 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 04:34:55 +0800 Subject: [PATCH 038/160] fix #4295 Former-commit-id: 08f657868f9d605b837c5d8c2946a25cc05c8735 --- src/llamafactory/train/sft/trainer.py | 10 +++++++--- src/llamafactory/train/sft/workflow.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index c063b214..6bf5b7c0 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -13,6 +13,7 @@ from ..trainer_utils import create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: + from torch.utils.data import Dataset from transformers import ProcessorMixin from transformers.trainer import PredictionOutput @@ -94,7 +95,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): padded_tensor[:, -src_tensor.shape[-1] :] = src_tensor # adopt left-padding return padded_tensor.contiguous() # in contiguous memory - def save_predictions(self, predict_results: "PredictionOutput") -> None: + def save_predictions(self, dataset: "Dataset", predict_results: "PredictionOutput") -> None: r""" Saves model predictions to `output_dir`. 
@@ -120,6 +121,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): (preds[i][pad_len[0] :], preds[i][: pad_len[0]]), axis=-1 ) # move pad token to last + decoded_inputs = self.tokenizer.batch_decode( + dataset["input_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=False + ) decoded_labels = self.tokenizer.batch_decode( labels, skip_special_tokens=True, clean_up_tokenization_spaces=False ) @@ -127,6 +131,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): with open(output_prediction_file, "w", encoding="utf-8") as writer: res: List[str] = [] - for label, pred in zip(decoded_labels, decoded_preds): - res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False)) + for text, label, pred in zip(decoded_inputs, decoded_labels, decoded_preds): + res.append(json.dumps({"prompt": text, "label": label, "predict": pred}, ensure_ascii=False)) writer.write("\n".join(res)) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index f09b5173..a989b3f7 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -93,7 +93,7 @@ def run_sft( predict_results.metrics.pop("predict_loss", None) trainer.log_metrics("predict", predict_results.metrics) trainer.save_metrics("predict", predict_results.metrics) - trainer.save_predictions(predict_results) + trainer.save_predictions(dataset, predict_results) # Create model card create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) From c29fa61a9c4eeed1532188524156eacc10db2127 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 04:47:13 +0800 Subject: [PATCH 039/160] fix #4292 Former-commit-id: 4cd4c179d24eab0fcaec2b29b9dd71970f877fe8 --- src/llamafactory/webui/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 37b38df0..3b8d5378 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -39,7 +39,11 @@ def get_save_dir(*paths: str) -> os.PathLike: r""" Gets the path to saved model checkpoints. 
""" - paths = (path.replace(os.path.sep, "").replace(" ", "").strip() for path in paths) + if os.path.sep in paths[-1]: + logger.warning("Found complex path, some features may be not available.") + return paths[-1] + + paths = (path.replace(" ", "").strip() for path in paths) return os.path.join(DEFAULT_SAVE_DIR, *paths) From 2af932d969df3e5b23e8abd524b2f899762c938a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 04:57:19 +0800 Subject: [PATCH 040/160] disable DP Former-commit-id: c18fd609d268389f3e65274992045a6c9f8e6c1f --- src/llamafactory/hparams/parser.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index ec5dd62c..3476a726 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -8,6 +8,7 @@ import transformers from transformers import HfArgumentParser, Seq2SeqTrainingArguments from transformers.integrations import is_deepspeed_zero3_enabled from transformers.trainer_utils import get_last_checkpoint +from transformers.training_args import ParallelMode from transformers.utils import is_torch_bf16_gpu_available from transformers.utils.versions import require_version @@ -162,6 +163,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): raise ValueError("PPO only accepts wandb or tensorboard logger.") + if training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED: + raise ValueError("Please launch distributed training with `llamafactory-cli` or `torchrun`.") + if training_args.max_steps == -1 and data_args.streaming: raise ValueError("Please specify `max_steps` in streaming mode.") @@ -181,14 +185,14 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if ( finetuning_args.use_galore and finetuning_args.galore_layerwise - and training_args.parallel_mode.value == "distributed" + and training_args.parallel_mode == ParallelMode.DISTRIBUTED ): raise ValueError("Distributed training does not support layer-wise GaLore.") if ( finetuning_args.use_badam and finetuning_args.badam_mode == "layer" - and training_args.parallel_mode.value == "distributed" + and training_args.parallel_mode == ParallelMode.DISTRIBUTED ): raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") @@ -230,7 +234,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: # Post-process training arguments if ( - training_args.parallel_mode.value == "distributed" + training_args.parallel_mode == ParallelMode.DISTRIBUTED and training_args.ddp_find_unused_parameters is None and finetuning_args.finetuning_type == "lora" ): @@ -290,7 +294,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: training_args.local_rank, training_args.device, training_args.n_gpu, - training_args.parallel_mode.value == "distributed", + training_args.parallel_mode == ParallelMode.DISTRIBUTED, str(model_args.compute_dtype), ) ) From 2abfad9c1fd906034cdda5956fe02e70a456f654 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 05:11:33 +0800 Subject: [PATCH 041/160] fix #4271 Former-commit-id: 03707e78d29bfcf5d395a64bb38632bdb3ff47ce --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 405ac46e..1756551e 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ extra_require = { "torch": ["torch>=1.13.1"], "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", 
"decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], - "deepspeed": ["deepspeed>=0.10.0,<=0.14.0"], + "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], "vllm": ["vllm>=0.4.3"], "galore": ["galore-torch"], From d2df3f2d6e9c40cfa8143b2482b6d7d58a0fb70c Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 05:13:16 +0800 Subject: [PATCH 042/160] update readme Former-commit-id: a43d302aa79cbfb9b0606e855b4c1af6865d8e68 --- README.md | 2 +- README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5dd10d5a..cae79694 100644 --- a/README.md +++ b/README.md @@ -463,7 +463,7 @@ docker-compose exec llamafactory bash ### Deploy with OpenAI-style API and vLLM ```bash -CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml +API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml ``` > [!TIP] diff --git a/README_zh.md b/README_zh.md index 76bd2d89..af3ff8f0 100644 --- a/README_zh.md +++ b/README_zh.md @@ -463,7 +463,7 @@ docker-compose exec llamafactory bash ### 利用 vLLM 部署 OpenAI API ```bash -CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml +API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml ``` > [!TIP] From bb8853616685faff541a1e6cd5799fba98b8c1d6 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 17:54:33 +0800 Subject: [PATCH 043/160] add license Former-commit-id: 69cfc98d7c81756a5ab6bf962240e393e449fef0 --- evaluation/ceval/ceval.py | 1 + evaluation/cmmlu/cmmlu.py | 1 + evaluation/mmlu/mmlu.py | 1 + scripts/cal_flops.py | 23 +++++++++-- scripts/cal_lr.py | 23 +++++++++-- scripts/cal_ppl.py | 19 ++++++++- scripts/length_cdf.py | 19 ++++++++- scripts/llama_pro.py | 23 +++++++++-- scripts/llamafy_baichuan2.py | 22 ++++++++-- scripts/llamafy_qwen.py | 21 ++++++++-- scripts/loftq_init.py | 23 +++++++++-- scripts/test_toolcall.py | 15 +++++++ setup.py | 14 +++++++ src/api.py | 14 +++++++ src/llamafactory/__init__.py | 14 +++++++ src/llamafactory/api/app.py | 14 +++++++ src/llamafactory/api/chat.py | 14 +++++++ src/llamafactory/api/common.py | 14 +++++++ src/llamafactory/api/protocol.py | 14 +++++++ src/llamafactory/chat/__init__.py | 14 +++++++ src/llamafactory/chat/base_engine.py | 14 +++++++ src/llamafactory/chat/chat_model.py | 17 ++++++++ src/llamafactory/chat/hf_engine.py | 14 +++++++ src/llamafactory/chat/vllm_engine.py | 14 +++++++ src/llamafactory/cli.py | 14 +++++++ src/llamafactory/data/__init__.py | 14 +++++++ src/llamafactory/data/aligner.py | 14 +++++++ src/llamafactory/data/collator.py | 14 +++++++ src/llamafactory/data/data_utils.py | 14 +++++++ src/llamafactory/data/formatter.py | 14 +++++++ src/llamafactory/data/loader.py | 14 +++++++ src/llamafactory/data/parser.py | 14 +++++++ src/llamafactory/data/preprocess.py | 14 +++++++ src/llamafactory/data/processors/feedback.py | 14 +++++++ src/llamafactory/data/processors/pairwise.py | 14 +++++++ src/llamafactory/data/processors/pretrain.py | 17 ++++++++ .../data/processors/processor_utils.py | 14 +++++++ .../data/processors/supervised.py | 14 +++++++ .../data/processors/unsupervised.py | 14 +++++++ src/llamafactory/data/template.py | 14 +++++++ src/llamafactory/eval/evaluator.py | 39 +++++++++++++++++- src/llamafactory/eval/template.py | 14 +++++++ src/llamafactory/extras/callbacks.py | 14 +++++++ src/llamafactory/extras/constants.py | 14 +++++++ src/llamafactory/extras/env.py | 
14 +++++++ src/llamafactory/extras/logging.py | 14 +++++++ src/llamafactory/extras/misc.py | 14 +++++++ src/llamafactory/extras/packages.py | 17 ++++++++ src/llamafactory/extras/ploting.py | 14 +++++++ src/llamafactory/hparams/__init__.py | 14 +++++++ src/llamafactory/hparams/data_args.py | 17 ++++++++ src/llamafactory/hparams/evaluation_args.py | 14 +++++++ src/llamafactory/hparams/finetuning_args.py | 14 +++++++ src/llamafactory/hparams/generating_args.py | 14 +++++++ src/llamafactory/hparams/model_args.py | 17 ++++++++ src/llamafactory/hparams/parser.py | 17 ++++++++ src/llamafactory/launcher.py | 14 +++++++ src/llamafactory/model/__init__.py | 14 +++++++ src/llamafactory/model/adapter.py | 14 +++++++ src/llamafactory/model/loader.py | 14 +++++++ .../model/model_utils/attention.py | 14 +++++++ .../model/model_utils/checkpointing.py | 19 ++++++++- .../model/model_utils/embedding.py | 14 +++++++ .../model/model_utils/longlora.py | 17 ++++++++ src/llamafactory/model/model_utils/misc.py | 14 +++++++ src/llamafactory/model/model_utils/mod.py | 14 +++++++ src/llamafactory/model/model_utils/moe.py | 14 +++++++ .../model/model_utils/quantization.py | 18 ++++++++- src/llamafactory/model/model_utils/rope.py | 18 +++++++++ src/llamafactory/model/model_utils/unsloth.py | 14 +++++++ .../model/model_utils/valuehead.py | 14 +++++++ src/llamafactory/model/model_utils/visual.py | 17 ++++++++ src/llamafactory/model/patcher.py | 14 +++++++ src/llamafactory/train/dpo/__init__.py | 14 +++++++ src/llamafactory/train/dpo/trainer.py | 17 ++++++++ src/llamafactory/train/dpo/workflow.py | 17 +++++++- src/llamafactory/train/kto/__init__.py | 14 +++++++ src/llamafactory/train/kto/trainer.py | 17 ++++++++ src/llamafactory/train/kto/workflow.py | 17 ++++++++ src/llamafactory/train/ppo/__init__.py | 14 +++++++ src/llamafactory/train/ppo/ppo_utils.py | 14 +++++++ src/llamafactory/train/ppo/trainer.py | 17 ++++++++ src/llamafactory/train/ppo/workflow.py | 17 +++++++- src/llamafactory/train/pt/__init__.py | 14 +++++++ src/llamafactory/train/pt/trainer.py | 14 +++++++ src/llamafactory/train/pt/workflow.py | 17 +++++++- src/llamafactory/train/rm/__init__.py | 14 +++++++ src/llamafactory/train/rm/metric.py | 14 +++++++ src/llamafactory/train/rm/trainer.py | 40 ++++++++++++++++++- src/llamafactory/train/rm/workflow.py | 39 +++++++++++++++++- src/llamafactory/train/sft/__init__.py | 14 +++++++ src/llamafactory/train/sft/metric.py | 18 +++++++++ src/llamafactory/train/sft/trainer.py | 17 ++++++++ src/llamafactory/train/sft/workflow.py | 17 +++++++- src/llamafactory/train/trainer_utils.py | 19 +++++++++ src/llamafactory/train/tuner.py | 14 +++++++ src/llamafactory/webui/chatter.py | 14 +++++++ src/llamafactory/webui/common.py | 14 +++++++ src/llamafactory/webui/components/__init__.py | 14 +++++++ src/llamafactory/webui/components/chatbot.py | 14 +++++++ src/llamafactory/webui/components/data.py | 14 +++++++ src/llamafactory/webui/components/eval.py | 14 +++++++ src/llamafactory/webui/components/export.py | 14 +++++++ src/llamafactory/webui/components/infer.py | 14 +++++++ src/llamafactory/webui/components/top.py | 14 +++++++ src/llamafactory/webui/components/train.py | 14 +++++++ src/llamafactory/webui/css.py | 14 +++++++ src/llamafactory/webui/engine.py | 14 +++++++ src/llamafactory/webui/interface.py | 14 +++++++ src/llamafactory/webui/locales.py | 14 +++++++ src/llamafactory/webui/manager.py | 14 +++++++ src/llamafactory/webui/runner.py | 14 +++++++ src/llamafactory/webui/utils.py | 14 +++++++ src/train.py | 14 +++++++ 
src/webui.py | 14 +++++++ tests/data/test_supervised.py | 14 +++++++ tests/eval/test_eval_template.py | 14 +++++++ tests/model/model_utils/test_attention.py | 14 +++++++ tests/model/test_base.py | 14 +++++++ tests/model/test_freeze.py | 14 +++++++ tests/model/test_full.py | 14 +++++++ tests/model/test_lora.py | 14 +++++++ 122 files changed, 1848 insertions(+), 32 deletions(-) diff --git a/evaluation/ceval/ceval.py b/evaluation/ceval/ceval.py index 4111d6b4..48442d50 100644 --- a/evaluation/ceval/ceval.py +++ b/evaluation/ceval/ceval.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import datasets diff --git a/evaluation/cmmlu/cmmlu.py b/evaluation/cmmlu/cmmlu.py index 37efb328..5ff548a4 100644 --- a/evaluation/cmmlu/cmmlu.py +++ b/evaluation/cmmlu/cmmlu.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import datasets diff --git a/evaluation/mmlu/mmlu.py b/evaluation/mmlu/mmlu.py index a4530250..1065fb31 100644 --- a/evaluation/mmlu/mmlu.py +++ b/evaluation/mmlu/mmlu.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import datasets diff --git a/scripts/cal_flops.py b/scripts/cal_flops.py index ac87e0ab..627b5534 100644 --- a/scripts/cal_flops.py +++ b/scripts/cal_flops.py @@ -1,7 +1,20 @@ # coding=utf-8 -# Calculates the flops of pre-trained models. -# Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512 -# Inspired by: https://www.deepspeed.ai/tutorials/flops-profiler/ +# Copyright 2024 Microsoft Corporation and the LlamaFactory team. +# +# This code is inspired by Microsoft's DeepSpeed library. +# https://www.deepspeed.ai/tutorials/flops-profiler/ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import fire import torch @@ -17,6 +30,10 @@ def calculate_flops( seq_length: int = 256, flash_attn: str = "auto", ): + r""" + Calculates the flops of pre-trained models. + Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512 + """ with get_accelerator().device(0): chat_model = ChatModel(dict(model_name_or_path=model_name_or_path, template="empty", flash_attn=flash_attn)) fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.model.device) diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py index bfa32cc9..ff21d27c 100644 --- a/scripts/cal_lr.py +++ b/scripts/cal_lr.py @@ -1,7 +1,20 @@ # coding=utf-8 -# Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters. 
-# Usage: python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en --cutoff_len 1024 --batch_size 16 -# Inspired by: https://github.com/imoneoi/openchat/blob/master/ochat/training_deepspeed/train.py +# Copyright 2024 imoneoi and the LlamaFactory team. +# +# This code is inspired by imoneoi's OpenChat library. +# https://github.com/imoneoi/openchat/blob/3.6.0/ochat/training_deepspeed/train.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math from typing import Literal @@ -32,6 +45,10 @@ def calculate_lr( cutoff_len: int = 1024, # i.e. maximum input length during training is_mistral: bool = False, # mistral model uses a smaller learning rate, ): + r""" + Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters. + Usage: python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en --cutoff_len 1024 --batch_size 16 + """ model_args, data_args, training_args, _, _ = get_train_args( dict( stage=stage, diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py index 387b756c..fb503629 100644 --- a/scripts/cal_ppl.py +++ b/scripts/cal_ppl.py @@ -1,6 +1,17 @@ # coding=utf-8 -# Calculates the ppl on the dataset of the pre-trained models. -# Usage: python cal_ppl.py --model_name_or_path path_to_model --save_name ppl.json +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json from dataclasses import dataclass @@ -56,6 +67,10 @@ def cal_ppl( max_samples: Optional[int] = None, train_on_prompt: bool = False, ): + r""" + Calculates the ppl on the dataset of the pre-trained models. + Usage: python cal_ppl.py --model_name_or_path path_to_model --save_name ppl.json + """ model_args, data_args, training_args, finetuning_args, _ = get_train_args( dict( stage=stage, diff --git a/scripts/length_cdf.py b/scripts/length_cdf.py index 7739dcf0..4cdf01e6 100644 --- a/scripts/length_cdf.py +++ b/scripts/length_cdf.py @@ -1,6 +1,17 @@ # coding=utf-8 -# Calculates the distribution of the input lengths in the dataset. -# Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en --template default +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from collections import defaultdict @@ -19,6 +30,10 @@ def length_cdf( template: str = "default", interval: int = 1000, ): + r""" + Calculates the distribution of the input lengths in the dataset. + Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en --template default + """ model_args, data_args, training_args, _, _ = get_train_args( dict( stage="sft", diff --git a/scripts/llama_pro.py b/scripts/llama_pro.py index 727998ae..f315335a 100644 --- a/scripts/llama_pro.py +++ b/scripts/llama_pro.py @@ -1,7 +1,20 @@ # coding=utf-8 -# Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models. -# Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8 -# Inspired by: https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py +# Copyright 2024 Tencent Inc. and the LlamaFactory team. +# +# This code is inspired by Tencent's LLaMA-Pro library. +# https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import os @@ -37,6 +50,10 @@ def block_expansion( shard_size: Optional[str] = "2GB", save_safetensors: Optional[bool] = False, ): + r""" + Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models. + Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8 + """ config: "PretrainedConfig" = AutoConfig.from_pretrained(model_name_or_path) num_layers = getattr(config, "num_hidden_layers") setattr(config, "num_hidden_layers", num_layers + num_expand) diff --git a/scripts/llamafy_baichuan2.py b/scripts/llamafy_baichuan2.py index 1ae58879..19284f5f 100644 --- a/scripts/llamafy_baichuan2.py +++ b/scripts/llamafy_baichuan2.py @@ -1,8 +1,17 @@ # coding=utf-8 -# Converts the Baichuan2-7B model in the same format as LLaMA2-7B. -# Usage: python llamafy_baichuan2.py --input_dir input --output_dir output -# Inspired by: https://huggingface.co/fireballoon/baichuan-llama-7b/blob/main/convert_baichuan_to_llama.py -# Converted model: https://huggingface.co/hiyouga/Baichuan2-7B-Base-LLaMAfied +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import os @@ -79,6 +88,11 @@ def save_config(input_dir: str, output_dir: str): def llamafy_baichuan2( input_dir: str, output_dir: str, shard_size: Optional[str] = "2GB", save_safetensors: Optional[bool] = False ): + r""" + Converts the Baichuan2-7B model in the same format as LLaMA2-7B. + Usage: python llamafy_baichuan2.py --input_dir input --output_dir output + Converted model: https://huggingface.co/hiyouga/Baichuan2-7B-Base-LLaMAfied + """ try: os.makedirs(output_dir, exist_ok=False) except Exception as e: diff --git a/scripts/llamafy_qwen.py b/scripts/llamafy_qwen.py index 69cf3e8e..e5b59483 100644 --- a/scripts/llamafy_qwen.py +++ b/scripts/llamafy_qwen.py @@ -1,7 +1,17 @@ # coding=utf-8 -# Converts the Qwen models in the same format as LLaMA2. -# Usage: python llamafy_qwen.py --input_dir input --output_dir output -# Converted model: https://huggingface.co/hiyouga/Qwen-14B-Chat-LLaMAfied +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import os @@ -131,6 +141,11 @@ def save_config(input_dir: str, output_dir: str, torch_dtype: str): def llamafy_qwen( input_dir: str, output_dir: str, shard_size: Optional[str] = "2GB", save_safetensors: Optional[bool] = False ): + r""" + Converts the Qwen models in the same format as LLaMA2. + Usage: python llamafy_qwen.py --input_dir input --output_dir output + Converted model: https://huggingface.co/hiyouga/Qwen-14B-Chat-LLaMAfied + """ try: os.makedirs(output_dir, exist_ok=False) except Exception as e: diff --git a/scripts/loftq_init.py b/scripts/loftq_init.py index 7f244316..159dea06 100644 --- a/scripts/loftq_init.py +++ b/scripts/loftq_init.py @@ -1,7 +1,20 @@ # coding=utf-8 -# Initializes LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ) -# Usage: python loftq_init.py --model_name_or_path path_to_model --save_dir output_dir -# Inspired by: https://github.com/huggingface/peft/blob/main/examples/loftq_finetuning/quantize_save_load.py +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's PEFT library. +# https://github.com/huggingface/peft/blob/v0.10.0/examples/loftq_finetuning/quantize_save_load.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os from typing import TYPE_CHECKING, Optional @@ -49,6 +62,10 @@ def quantize_loftq( lora_target: Optional[str] = "q_proj,v_proj", save_safetensors: Optional[bool] = False, ): + r""" + Initializes LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ) + Usage: python loftq_init.py --model_name_or_path path_to_model --save_dir output_dir + """ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") loftq_config = LoftQConfig(loftq_bits=loftq_bits, loftq_iter=loftq_iter) diff --git a/scripts/test_toolcall.py b/scripts/test_toolcall.py index 7e460017..6f6fd06c 100644 --- a/scripts/test_toolcall.py +++ b/scripts/test_toolcall.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from typing import Sequence diff --git a/setup.py b/setup.py index 1756551e..3d2ac921 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import re diff --git a/src/api.py b/src/api.py index 3655e393..0f925497 100644 --- a/src/api.py +++ b/src/api.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import os import uvicorn diff --git a/src/llamafactory/__init__.py b/src/llamafactory/__init__.py index 78230937..9d732777 100644 --- a/src/llamafactory/__init__.py +++ b/src/llamafactory/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Level: api, webui > chat, eval, train > data, model > hparams > extras from .cli import VERSION diff --git a/src/llamafactory/api/app.py b/src/llamafactory/api/app.py index 21edab2f..c1264617 100644 --- a/src/llamafactory/api/app.py +++ b/src/llamafactory/api/app.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from contextlib import asynccontextmanager from typing import Optional diff --git a/src/llamafactory/api/chat.py b/src/llamafactory/api/chat.py index 98957bc1..a2074dbb 100644 --- a/src/llamafactory/api/chat.py +++ b/src/llamafactory/api/chat.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import base64 import io import json diff --git a/src/llamafactory/api/common.py b/src/llamafactory/api/common.py index 5ad9a071..d1ac94de 100644 --- a/src/llamafactory/api/common.py +++ b/src/llamafactory/api/common.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import json from typing import TYPE_CHECKING, Any, Dict diff --git a/src/llamafactory/api/protocol.py b/src/llamafactory/api/protocol.py index 055fa781..a69132ea 100644 --- a/src/llamafactory/api/protocol.py +++ b/src/llamafactory/api/protocol.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time from enum import Enum, unique from typing import Any, Dict, List, Optional, Union diff --git a/src/llamafactory/chat/__init__.py b/src/llamafactory/chat/__init__.py index a1a79de6..07276d48 100644 --- a/src/llamafactory/chat/__init__.py +++ b/src/llamafactory/chat/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .base_engine import BaseEngine from .chat_model import ChatModel diff --git a/src/llamafactory/chat/base_engine.py b/src/llamafactory/chat/base_engine.py index 65b6c59c..92a51ebe 100644 --- a/src/llamafactory/chat/base_engine.py +++ b/src/llamafactory/chat/base_engine.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from abc import ABC, abstractmethod from dataclasses import dataclass from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Literal, Optional, Sequence, Union diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py index 281ef0c1..fb800106 100644 --- a/src/llamafactory/chat/chat_model.py +++ b/src/llamafactory/chat/chat_model.py @@ -1,3 +1,20 @@ +# Copyright 2024 THUDM and the LlamaFactory team. +# +# This code is inspired by the THUDM's ChatGLM implementation. +# https://github.com/THUDM/ChatGLM-6B/blob/main/cli_demo.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import asyncio from threading import Thread from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, Generator, List, Optional, Sequence diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 28e6a409..a7ff7015 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import asyncio import concurrent.futures import os diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index f0812a99..d488a039 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import uuid from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence, Union diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py index 5042e53c..c7f136b3 100644 --- a/src/llamafactory/cli.py +++ b/src/llamafactory/cli.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random import subprocess diff --git a/src/llamafactory/data/__init__.py b/src/llamafactory/data/__init__.py index b08691d3..307853bc 100644 --- a/src/llamafactory/data/__init__.py +++ b/src/llamafactory/data/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .collator import KTODataCollatorWithPadding, PairwiseDataCollatorWithPadding from .data_utils import Role, split_dataset from .loader import get_dataset diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py index 3e9d5c46..299bdca3 100644 --- a/src/llamafactory/data/aligner.py +++ b/src/llamafactory/data/aligner.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from functools import partial from typing import TYPE_CHECKING, Any, Dict, List, Union diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index 1dc8dd8d..e4859ff5 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from typing import Any, Dict, Sequence diff --git a/src/llamafactory/data/data_utils.py b/src/llamafactory/data/data_utils.py index 9b313112..cc9761b1 100644 --- a/src/llamafactory/data/data_utils.py +++ b/src/llamafactory/data/data_utils.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from enum import Enum, unique from typing import TYPE_CHECKING, Dict, List, Tuple, Union diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 0cd3d6c1..590e682b 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import re from abc import ABC, abstractmethod diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index ba426f81..f44ef5de 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import inspect import os import sys diff --git a/src/llamafactory/data/parser.py b/src/llamafactory/data/parser.py index ec97bfc1..4bebcd68 100644 --- a/src/llamafactory/data/parser.py +++ b/src/llamafactory/data/parser.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from dataclasses import dataclass diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py index 875f55d6..9a8b97f3 100644 --- a/src/llamafactory/data/preprocess.py +++ b/src/llamafactory/data/preprocess.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from functools import partial from typing import TYPE_CHECKING, Callable, Literal, Optional, Tuple diff --git a/src/llamafactory/data/processors/feedback.py b/src/llamafactory/data/processors/feedback.py index 5fba452c..219ab353 100644 --- a/src/llamafactory/data/processors/feedback.py +++ b/src/llamafactory/data/processors/feedback.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py index db52c6a7..b2939348 100644 --- a/src/llamafactory/data/processors/pairwise.py +++ b/src/llamafactory/data/processors/pairwise.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index a10ccabd..fb4c840c 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from itertools import chain from typing import TYPE_CHECKING, Any, Dict, List diff --git a/src/llamafactory/data/processors/processor_utils.py b/src/llamafactory/data/processors/processor_utils.py index 9903a053..93df0cd5 100644 --- a/src/llamafactory/data/processors/processor_utils.py +++ b/src/llamafactory/data/processors/processor_utils.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import bisect from typing import TYPE_CHECKING, List, Sequence diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index f59f5371..eb5ffb1a 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections import defaultdict from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py index 38497a15..75ad4d51 100644 --- a/src/llamafactory/data/processors/unsupervised.py +++ b/src/llamafactory/data/processors/unsupervised.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.logging import get_logger diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index b600c567..786c679f 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union

diff --git a/src/llamafactory/eval/evaluator.py b/src/llamafactory/eval/evaluator.py
index 5c6fb104..bbd7a44b 100644
--- a/src/llamafactory/eval/evaluator.py
+++ b/src/llamafactory/eval/evaluator.py
@@ -1,4 +1,41 @@
-# Inspired by: https://github.com/hendrycks/test/blob/master/evaluate_flan.py
+# Copyright 2024 the LlamaFactory team.
+#
+# This code is inspired by Dan Hendrycks' test library.
+# https://github.com/hendrycks/test/blob/master/evaluate_flan.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License
+#
+# Copyright (c) 2020 Dan Hendrycks
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.

 import inspect
 import json
diff --git a/src/llamafactory/eval/template.py b/src/llamafactory/eval/template.py
index 2cbb5aaf..7d524e7c 100644
--- a/src/llamafactory/eval/template.py
+++ b/src/llamafactory/eval/template.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import dataclass
 from typing import Dict, List, Sequence, Tuple

diff --git a/src/llamafactory/extras/callbacks.py b/src/llamafactory/extras/callbacks.py
index 441ebbfd..0dff6a69 100644
--- a/src/llamafactory/extras/callbacks.py
+++ b/src/llamafactory/extras/callbacks.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import logging import os diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 7d96fb5f..e31e7419 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections import OrderedDict, defaultdict from enum import Enum from typing import Dict, Optional diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index a8cb799d..586c24c0 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import platform import accelerate diff --git a/src/llamafactory/extras/logging.py b/src/llamafactory/extras/logging.py index 430b8a48..67622212 100644 --- a/src/llamafactory/extras/logging.py +++ b/src/llamafactory/extras/logging.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging import os import sys diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index fc33f77e..3d969df1 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import gc import os from typing import TYPE_CHECKING, Dict, Tuple diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index 0746bb4f..35f546ab 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/utils/import_utils.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import importlib.metadata import importlib.util from functools import lru_cache diff --git a/src/llamafactory/extras/ploting.py b/src/llamafactory/extras/ploting.py index dea23bbe..596d55e7 100644 --- a/src/llamafactory/extras/ploting.py +++ b/src/llamafactory/extras/ploting.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import math import os diff --git a/src/llamafactory/hparams/__init__.py b/src/llamafactory/hparams/__init__.py index d1ee98dd..cfe448c1 100644 --- a/src/llamafactory/hparams/__init__.py +++ b/src/llamafactory/hparams/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .data_args import DataArguments from .evaluation_args import EvaluationArguments from .finetuning_args import FinetuningArguments diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 1e0cd08c..95284766 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass, field from typing import Literal, Optional diff --git a/src/llamafactory/hparams/evaluation_args.py b/src/llamafactory/hparams/evaluation_args.py index 5a05f6f6..a7f221ca 100644 --- a/src/llamafactory/hparams/evaluation_args.py +++ b/src/llamafactory/hparams/evaluation_args.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from dataclasses import dataclass, field from typing import Literal, Optional diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index facbe792..52dc299e 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass, field from typing import List, Literal, Optional diff --git a/src/llamafactory/hparams/generating_args.py b/src/llamafactory/hparams/generating_args.py index 0ee17d1a..7ebb4eed 100644 --- a/src/llamafactory/hparams/generating_args.py +++ b/src/llamafactory/hparams/generating_args.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import asdict, dataclass, field from typing import Any, Dict, Optional diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index bbac2e4b..0a91f0fa 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import asdict, dataclass, field from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 3476a726..1c57567c 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging import os import sys diff --git a/src/llamafactory/launcher.py b/src/llamafactory/launcher.py index de154db9..65e0b68f 100644 --- a/src/llamafactory/launcher.py +++ b/src/llamafactory/launcher.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from llamafactory.train.tuner import run_exp diff --git a/src/llamafactory/model/__init__.py b/src/llamafactory/model/__init__.py index 9d23d59f..4abbaa1b 100644 --- a/src/llamafactory/model/__init__.py +++ b/src/llamafactory/model/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .loader import load_config, load_model, load_tokenizer from .model_utils.misc import find_all_linear_modules from .model_utils.valuehead import load_valuehead_params diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index c37f6009..dfa71525 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import re from typing import TYPE_CHECKING diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 697a04e7..69cccd93 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, Optional, TypedDict from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index 2bd36fdc..8ff3807b 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py index e0657be8..e4e84b12 100644 --- a/src/llamafactory/model/model_utils/checkpointing.py +++ b/src/llamafactory/model/model_utils/checkpointing.py @@ -1,3 +1,21 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's Transformers and PEFT library. +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/modeling_utils.py +# https://github.com/huggingface/peft/blob/v0.10.0/src/peft/utils/other.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import inspect from functools import partial from types import MethodType @@ -68,7 +86,6 @@ def prepare_model_for_training( (1) cast the layernorm in fp32 (2) make output embedding layer require grads (3) add the upcasting of the lm_head in fp32 - Inspired by: https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/other.py#L72 """ if model_args.upcast_layernorm: logger.info("Upcasting layernorm weights in float32.") diff --git a/src/llamafactory/model/model_utils/embedding.py b/src/llamafactory/model/model_utils/embedding.py index 3d9278e3..3ff79828 100644 --- a/src/llamafactory/model/model_utils/embedding.py +++ b/src/llamafactory/model/model_utils/embedding.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math from contextlib import nullcontext from typing import TYPE_CHECKING diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py index 4a8c562a..7af43dcf 100644 --- a/src/llamafactory/model/model_utils/longlora.py +++ b/src/llamafactory/model/model_utils/longlora.py @@ -1,3 +1,20 @@ +# Copyright 2024 EleutherAI, HuggingFace Inc., and the LlamaFactory team. +# +# This code is based on the EleutherAI's GPT-NeoX and HuggingFace's Transformers libraries. 
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math from typing import TYPE_CHECKING, Optional, Tuple diff --git a/src/llamafactory/model/model_utils/misc.py b/src/llamafactory/model/model_utils/misc.py index 4851bd29..a2812228 100644 --- a/src/llamafactory/model/model_utils/misc.py +++ b/src/llamafactory/model/model_utils/misc.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, List from ...extras.logging import get_logger diff --git a/src/llamafactory/model/model_utils/mod.py b/src/llamafactory/model/model_utils/mod.py index 5708a1a8..ec73af00 100644 --- a/src/llamafactory/model/model_utils/mod.py +++ b/src/llamafactory/model/model_utils/mod.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING from ...extras.constants import MOD_SUPPORTED_MODELS diff --git a/src/llamafactory/model/model_utils/moe.py b/src/llamafactory/model/model_utils/moe.py index 8a73c844..5c7473aa 100644 --- a/src/llamafactory/model/model_utils/moe.py +++ b/src/llamafactory/model/model_utils/moe.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from typing import TYPE_CHECKING, Sequence import torch diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index 02a54f07..9e6b9da4 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's Optimum library. +# https://github.com/huggingface/optimum/blob/v1.20.0/optimum/gptq/data.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random from enum import Enum, unique @@ -41,7 +58,6 @@ class QuantizationMethod(str, Enum): def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]: r""" - Inspired by: https://github.com/huggingface/optimum/blob/v1.16.0/optimum/gptq/data.py#L133 TODO: remove tokenizer.decode() https://github.com/huggingface/optimum/pull/1600 """ if os.path.isfile(model_args.export_quantization_dataset): diff --git a/src/llamafactory/model/model_utils/rope.py b/src/llamafactory/model/model_utils/rope.py index 93ab8929..88303c4d 100644 --- a/src/llamafactory/model/model_utils/rope.py +++ b/src/llamafactory/model/model_utils/rope.py @@ -1,3 +1,21 @@ +# Copyright 2024 LMSYS and the LlamaFactory team. +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# This code is inspired by the LMSYS's FastChat library. +# https://github.com/lm-sys/FastChat/blob/v0.2.30/fastchat/train/train.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math from typing import TYPE_CHECKING diff --git a/src/llamafactory/model/model_utils/unsloth.py b/src/llamafactory/model/model_utils/unsloth.py index 8a16409d..9cfaec61 100644 --- a/src/llamafactory/model/model_utils/unsloth.py +++ b/src/llamafactory/model/model_utils/unsloth.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
 from typing import TYPE_CHECKING, Any, Dict, Optional

 from ...extras.logging import get_logger
diff --git a/src/llamafactory/model/model_utils/valuehead.py b/src/llamafactory/model/model_utils/valuehead.py
index 64333688..9ab3d45a 100644
--- a/src/llamafactory/model/model_utils/valuehead.py
+++ b/src/llamafactory/model/model_utils/valuehead.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import TYPE_CHECKING, Dict

 import torch
diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py
index c8260b7f..37237485 100644
--- a/src/llamafactory/model/model_utils/visual.py
+++ b/src/llamafactory/model/model_utils/visual.py
@@ -1,3 +1,20 @@
+# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by HuggingFace's Transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/modeling_llava.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import TYPE_CHECKING, Tuple

 import torch
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index b97ff433..053516e4 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict
diff --git a/src/llamafactory/train/dpo/__init__.py b/src/llamafactory/train/dpo/__init__.py
index 43fe9420..9ce0d089 100644
--- a/src/llamafactory/train/dpo/__init__.py
+++ b/src/llamafactory/train/dpo/__init__.py
@@ -1,3 +1,17 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .workflow import run_dpo diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index 5bdb9c43..475d08c3 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import warnings from collections import defaultdict from contextlib import nullcontext diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 992985b0..8c3c2eb1 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -1,4 +1,19 @@ -# Inspired by: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/dpo.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import TYPE_CHECKING, List, Optional diff --git a/src/llamafactory/train/kto/__init__.py b/src/llamafactory/train/kto/__init__.py index 34c7905a..a1900368 100644 --- a/src/llamafactory/train/kto/__init__.py +++ b/src/llamafactory/train/kto/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .workflow import run_kto diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 3b4488fc..6e96fc0c 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/kto_trainer.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import warnings from collections import defaultdict from contextlib import nullcontext diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py index c79b160b..8a7af6d4 100644 --- a/src/llamafactory/train/kto/workflow.py +++ b/src/llamafactory/train/kto/workflow.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/kto.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, List, Optional from ...data import KTODataCollatorWithPadding, get_dataset, split_dataset diff --git a/src/llamafactory/train/ppo/__init__.py b/src/llamafactory/train/ppo/__init__.py index d17336d5..161f6f5d 100644 --- a/src/llamafactory/train/ppo/__init__.py +++ b/src/llamafactory/train/ppo/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .workflow import run_ppo diff --git a/src/llamafactory/train/ppo/ppo_utils.py b/src/llamafactory/train/ppo/ppo_utils.py index fec3fc1e..05c40946 100644 --- a/src/llamafactory/train/ppo/ppo_utils.py +++ b/src/llamafactory/train/ppo/ppo_utils.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json from contextlib import nullcontext from typing import TYPE_CHECKING, Dict, List, Literal, Optional diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 45f47455..61420f3b 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/ppo_trainer.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math import os import sys diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 111704c6..891d539a 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -1,4 +1,19 @@ -# Inspired by: https://github.com/lvwerra/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's TRL library. +# https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/ppo.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import TYPE_CHECKING, List, Optional diff --git a/src/llamafactory/train/pt/__init__.py b/src/llamafactory/train/pt/__init__.py index bdf397f6..d80e6f22 100644 --- a/src/llamafactory/train/pt/__init__.py +++ b/src/llamafactory/train/pt/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from .workflow import run_pt diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index 1d96e82f..09729f2e 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from types import MethodType from typing import TYPE_CHECKING, Dict, Optional diff --git a/src/llamafactory/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py index 8a635567..f1df314e 100644 --- a/src/llamafactory/train/pt/workflow.py +++ b/src/llamafactory/train/pt/workflow.py @@ -1,4 +1,19 @@ -# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/language-modeling/run_clm.py +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import math from typing import TYPE_CHECKING, List, Optional diff --git a/src/llamafactory/train/rm/__init__.py b/src/llamafactory/train/rm/__init__.py index dedac35f..48278315 100644 --- a/src/llamafactory/train/rm/__init__.py +++ b/src/llamafactory/train/rm/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .workflow import run_rm diff --git a/src/llamafactory/train/rm/metric.py b/src/llamafactory/train/rm/metric.py index 99dc6ab8..fb880b1c 100644 --- a/src/llamafactory/train/rm/metric.py +++ b/src/llamafactory/train/rm/metric.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Dict, Sequence, Tuple, Union import numpy as np diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index bfb344dc..14695d7d 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -1,3 +1,42 @@ +# Copyright 2024 the LlamaFactory team. +# +# This code is inspired by CarperAI's trlx library. +# https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/reward_model.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# MIT License +# +# Copyright (c) 2022 CarperAI +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + import json import os from types import MethodType @@ -79,7 +118,6 @@ class PairwiseTrainer(Trainer): chosen_scores, rejected_scores = [], [] # Compute pairwise loss. Only backprop on the different tokens before padding - # Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py loss = 0 for i in range(batch_size): chosen_length = (chosen_input_ids[i] != self.tokenizer.pad_token_id).nonzero()[-1] + 1 diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 2e9e194b..75c0a2bf 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -1,4 +1,41 @@ -# Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py +# Copyright 2024 the LlamaFactory team. +# +# This code is inspired by CarperAI's trlx library. 
+# https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# MIT License +# +# Copyright (c) 2022 CarperAI +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. from typing import TYPE_CHECKING, List, Optional diff --git a/src/llamafactory/train/sft/__init__.py b/src/llamafactory/train/sft/__init__.py index f2f84e78..475dfe5f 100644 --- a/src/llamafactory/train/sft/__init__.py +++ b/src/llamafactory/train/sft/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .workflow import run_sft diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index 923238d6..d2147c22 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -1,3 +1,21 @@ +# Copyright 2024 HuggingFace Inc., THUDM, and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library and THUDM's ChatGLM implementation. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py +# https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 6bf5b7c0..6ab6914e 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -1,3 +1,20 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer_seq2seq.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from types import MethodType diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index a989b3f7..dfc71cfb 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -1,4 +1,19 @@ -# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from typing import TYPE_CHECKING, List, Optional diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 48944a63..5621d5df 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -1,3 +1,22 @@ +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by the GaLore's implementation: https://github.com/jiaweizzhao/GaLore +# and the LoRA+'s implementation: https://github.com/nikhil-ghosh-berkeley/loraplus +# and the BAdam's implementation: https://github.com/Ledzy/BAdam +# and the TRL's implementation: https://github.com/huggingface/trl +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py index eed875e9..788b4c4f 100644 --- a/src/llamafactory/train/tuner.py +++ b/src/llamafactory/train/tuner.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict, List, Optional import torch diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py index c82710d3..864c41c7 100644 --- a/src/llamafactory/webui/chatter.py +++ b/src/llamafactory/webui/chatter.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from typing import TYPE_CHECKING, Dict, Generator, List, Optional, Sequence, Tuple diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 3b8d5378..980428a4 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from collections import defaultdict diff --git a/src/llamafactory/webui/components/__init__.py b/src/llamafactory/webui/components/__init__.py index 5c1e21b8..715fb6e4 100644 --- a/src/llamafactory/webui/components/__init__.py +++ b/src/llamafactory/webui/components/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .chatbot import create_chat_box from .eval import create_eval_tab from .export import create_export_tab diff --git a/src/llamafactory/webui/components/chatbot.py b/src/llamafactory/webui/components/chatbot.py index f83694b1..ad74114b 100644 --- a/src/llamafactory/webui/components/chatbot.py +++ b/src/llamafactory/webui/components/chatbot.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict, Tuple from ...data import Role diff --git a/src/llamafactory/webui/components/data.py b/src/llamafactory/webui/components/data.py index 232b973d..88e500cf 100644 --- a/src/llamafactory/webui/components/data.py +++ b/src/llamafactory/webui/components/data.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os from typing import TYPE_CHECKING, Any, Dict, List, Tuple diff --git a/src/llamafactory/webui/components/eval.py b/src/llamafactory/webui/components/eval.py index 0a7a0f44..b522913e 100644 --- a/src/llamafactory/webui/components/eval.py +++ b/src/llamafactory/webui/components/eval.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from typing import TYPE_CHECKING, Dict from ...extras.packages import is_gradio_available diff --git a/src/llamafactory/webui/components/export.py b/src/llamafactory/webui/components/export.py index 9d756a38..14257949 100644 --- a/src/llamafactory/webui/components/export.py +++ b/src/llamafactory/webui/components/export.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict, Generator, List, Union from ...extras.constants import PEFT_METHODS diff --git a/src/llamafactory/webui/components/infer.py b/src/llamafactory/webui/components/infer.py index 970f4629..03bccd7f 100644 --- a/src/llamafactory/webui/components/infer.py +++ b/src/llamafactory/webui/components/infer.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict from ...extras.packages import is_gradio_available diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index fd0ead3d..2515a83d 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict from ...data import TEMPLATES diff --git a/src/llamafactory/webui/components/train.py b/src/llamafactory/webui/components/train.py index 72dfc858..673f6bf4 100644 --- a/src/llamafactory/webui/components/train.py +++ b/src/llamafactory/webui/components/train.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict from transformers.trainer_utils import SchedulerType diff --git a/src/llamafactory/webui/css.py b/src/llamafactory/webui/css.py index 36e3d4c2..53982119 100644 --- a/src/llamafactory/webui/css.py +++ b/src/llamafactory/webui/css.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + CSS = r""" .duplicate-button { margin: auto !important; diff --git a/src/llamafactory/webui/engine.py b/src/llamafactory/webui/engine.py index eb6142d3..04893215 100644 --- a/src/llamafactory/webui/engine.py +++ b/src/llamafactory/webui/engine.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Any, Dict from .chatter import WebChatModel diff --git a/src/llamafactory/webui/interface.py b/src/llamafactory/webui/interface.py index bae3ba76..d25f4d38 100644 --- a/src/llamafactory/webui/interface.py +++ b/src/llamafactory/webui/interface.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from ..extras.packages import is_gradio_available diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index e30feab2..427f01b8 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + LOCALES = { "lang": { "en": { diff --git a/src/llamafactory/webui/manager.py b/src/llamafactory/webui/manager.py index 326fdb8d..7e9b801a 100644 --- a/src/llamafactory/webui/manager.py +++ b/src/llamafactory/webui/manager.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Dict, Generator, List, Set, Tuple diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 35014628..76982934 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from copy import deepcopy from subprocess import Popen, TimeoutExpired diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index e39f2aa4..6ce2a8e7 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os import signal diff --git a/src/train.py b/src/train.py index b20aa9d2..6703ffdb 100644 --- a/src/train.py +++ b/src/train.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from llamafactory.train.tuner import run_exp diff --git a/src/webui.py b/src/webui.py index bbefb54e..99370af2 100644 --- a/src/webui.py +++ b/src/webui.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from llamafactory.webui.interface import create_ui diff --git a/tests/data/test_supervised.py b/tests/data/test_supervised.py index 63a3453f..a72800d2 100644 --- a/tests/data/test_supervised.py +++ b/tests/data/test_supervised.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import random diff --git a/tests/eval/test_eval_template.py b/tests/eval/test_eval_template.py index f6a91a67..f85d9d57 100644 --- a/tests/eval/test_eval_template.py +++ b/tests/eval/test_eval_template.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from llamafactory.eval.template import get_eval_template diff --git a/tests/model/model_utils/test_attention.py b/tests/model/model_utils/test_attention.py index 751adda4..97ac9dcc 100644 --- a/tests/model/model_utils/test_attention.py +++ b/tests/model/model_utils/test_attention.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available diff --git a/tests/model/test_base.py b/tests/model/test_base.py index 32a3918e..462e8cfa 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import torch diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py index a0618315..ac5a023c 100644 --- a/tests/model/test_freeze.py +++ b/tests/model/test_freeze.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import torch diff --git a/tests/model/test_full.py b/tests/model/test_full.py index 802b987c..bcd6480f 100644 --- a/tests/model/test_full.py +++ b/tests/model/test_full.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import torch diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index 3e2503f1..e49c026c 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -1,3 +1,17 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from typing import Sequence From 308abfec6c0cc60560ab53aec8b203d1be9f5e45 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 17:58:52 +0800 Subject: [PATCH 044/160] add minicpm #4227 Former-commit-id: e1bb18ce60be9a1b203989def30f1b9194286325 --- src/llamafactory/extras/constants.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index e31e7419..73a9969d 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -682,6 +682,21 @@ register_model_group( ) +register_model_group( + models={ + "MiniCPM-2B-SFT-Chat": { + DownloadSource.DEFAULT: "openbmb/MiniCPM-2B-sft-bf16", + DownloadSource.MODELSCOPE: "OpenBMB/miniCPM-bf16", + }, + "MiniCPM-2B-DPO-Chat": { + DownloadSource.DEFAULT: "openbmb/MiniCPM-2B-dpo-bf16", + DownloadSource.MODELSCOPE: "OpenBMB/MiniCPM-2B-dpo-bf16", + }, + }, + template="cpm", +) + + register_model_group( models={ "Mistral-7B-v0.1": { From 7f90b0cd209f7b943c7ca941e8238f0108f407ea Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 19:51:20 +0800 Subject: [PATCH 045/160] add tests Former-commit-id: 484634ee9c982e82e919ff67d507e0210345182d --- Makefile | 2 +- src/llamafactory/extras/misc.py | 9 ++- tests/data/test_supervised.py | 2 +- tests/model/model_utils/test_checkpointing.py | 74 +++++++++++++++++++ tests/model/test_base.py | 30 +++++++- tests/model/test_freeze.py | 3 + tests/model/test_full.py | 2 + tests/model/test_lora.py | 58 +++++++++++++-- 8 files changed, 166 insertions(+), 14 deletions(-) create mode 100644 tests/model/model_utils/test_checkpointing.py diff --git a/Makefile b/Makefile index 65be047b..3f13b215 100644 --- a/Makefile +++ b/Makefile @@ -11,4 +11,4 @@ style: ruff format $(check_dirs) test: - pytest tests/ + CUDA_VISIBLE_DEVICES= pytest tests/ diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 3d969df1..93153b3e 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -22,6 +22,7 @@ from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList, PreTr from transformers.utils import ( SAFE_WEIGHTS_NAME, WEIGHTS_NAME, + is_safetensors_available, is_torch_bf16_gpu_available, is_torch_cuda_available, is_torch_mps_available, @@ -34,6 +35,11 @@ from .constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from .logging import get_logger +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.torch import save_file + + _is_fp16_available = is_torch_npu_available() or is_torch_cuda_available() try: _is_bf16_available = is_torch_bf16_gpu_available() @@ -128,9 +134,6 @@ def fix_valuehead_checkpoint( return if safe_serialization: - from safetensors import safe_open - from safetensors.torch import save_file - path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME) with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f: state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()} diff --git 
a/tests/data/test_supervised.py b/tests/data/test_supervised.py index a72800d2..9f7b2dbf 100644 --- a/tests/data/test_supervised.py +++ b/tests/data/test_supervised.py @@ -41,7 +41,7 @@ TRAIN_ARGS = { } -@pytest.mark.parametrize("num_samples", [10]) +@pytest.mark.parametrize("num_samples", [16]) def test_supervised(num_samples: int): model_args, data_args, training_args, _, _ = get_train_args(TRAIN_ARGS) tokenizer_module = load_tokenizer(model_args) diff --git a/tests/model/model_utils/test_checkpointing.py b/tests/model/model_utils/test_checkpointing.py new file mode 100644 index 00000000..670e693d --- /dev/null +++ b/tests/model/model_utils/test_checkpointing.py @@ -0,0 +1,74 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch + +from llamafactory.extras.misc import get_current_device +from llamafactory.hparams import get_train_args +from llamafactory.model import load_model, load_tokenizer + + +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") + +TRAIN_ARGS = { + "model_name_or_path": TINY_LLAMA, + "stage": "sft", + "do_train": True, + "finetuning_type": "lora", + "lora_target": "all", + "dataset": "llamafactory/tiny-supervised-dataset", + "dataset_dir": "ONLINE", + "template": "llama3", + "cutoff_len": 1024, + "overwrite_cache": True, + "output_dir": "dummy_dir", + "overwrite_output_dir": True, + "fp16": True, +} + + +def test_checkpointing_enable(): + model_args, _, _, finetuning_args, _ = get_train_args({"disable_gradient_checkpointing": False, **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()): + assert getattr(module, "gradient_checkpointing") is True + + +def test_checkpointing_disable(): + model_args, _, _, finetuning_args, _ = get_train_args({"disable_gradient_checkpointing": True, **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()): + assert getattr(module, "gradient_checkpointing") is False + + +def test_upcast_layernorm(): + model_args, _, _, finetuning_args, _ = get_train_args({"upcast_layernorm": True, **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for name, param in model.named_parameters(): + if param.ndim == 1 and "norm" in name: + assert param.dtype == torch.float32 + + +def test_upcast_lmhead_output(): + model_args, _, _, finetuning_args, _ = get_train_args({"upcast_lmhead_output": True, **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + inputs = 
torch.randn((1, 16), dtype=torch.float16, device=get_current_device()) + outputs: "torch.Tensor" = model.lm_head(inputs) + assert outputs.dtype == torch.float32 diff --git a/tests/model/test_base.py b/tests/model/test_base.py index 462e8cfa..ee0b2886 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -13,16 +13,21 @@ # limitations under the License. import os +from typing import Dict import torch from transformers import AutoModelForCausalLM +from trl import AutoModelForCausalLMWithValueHead +from llamafactory.extras.misc import get_current_device from llamafactory.hparams import get_infer_args from llamafactory.model import load_model, load_tokenizer TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") +TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead") + INFER_ARGS = { "model_name_or_path": TINY_LLAMA, "template": "llama3", @@ -38,9 +43,32 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True +def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]): + state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")} + self.v_head.load_state_dict(state_dict, strict=False) + del state_dict + + def test_base(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) - ref_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + + ref_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) + compare_model(model, ref_model) + + +def test_valuehead(): + AutoModelForCausalLMWithValueHead.post_init = post_init # patch for CPU test + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model( + tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False, add_valuehead=True + ) + + ref_model = AutoModelForCausalLMWithValueHead.from_pretrained( + TINY_LLAMA_VALUEHEAD, torch_dtype=torch.float16, device_map=get_current_device() + ) compare_model(model, ref_model) diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py index ac5a023c..5f478af6 100644 --- a/tests/model/test_freeze.py +++ b/tests/model/test_freeze.py @@ -49,6 +49,7 @@ def test_freeze_train_all_modules(): model_args, _, _, finetuning_args, _ = get_train_args({"freeze_trainable_layers": 1, **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for name, param in model.named_parameters(): if name.startswith("model.layers.1."): assert param.requires_grad is True @@ -64,6 +65,7 @@ def test_freeze_train_extra_modules(): ) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for name, param in model.named_parameters(): if name.startswith("model.layers.1.") or any(module in name for module in ["embed_tokens", "lm_head"]): assert param.requires_grad is True @@ -77,6 +79,7 @@ def test_freeze_inference(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = 
load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + for param in model.parameters(): assert param.requires_grad is False assert param.dtype == torch.float16 diff --git a/tests/model/test_full.py b/tests/model/test_full.py index bcd6480f..0a6e0743 100644 --- a/tests/model/test_full.py +++ b/tests/model/test_full.py @@ -49,6 +49,7 @@ def test_full_train(): model_args, _, _, finetuning_args, _ = get_train_args(TRAIN_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + for param in model.parameters(): assert param.requires_grad is True assert param.dtype == torch.float32 @@ -58,6 +59,7 @@ def test_full_inference(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + for param in model.parameters(): assert param.requires_grad is False assert param.dtype == torch.float16 diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index e49c026c..4923c8ad 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -18,7 +18,9 @@ from typing import Sequence import torch from peft import LoraModel, PeftModel from transformers import AutoModelForCausalLM +from trl import AutoModelForCausalLMWithValueHead +from llamafactory.extras.misc import get_current_device from llamafactory.hparams import get_infer_args, get_train_args from llamafactory.model import load_model, load_tokenizer @@ -27,6 +29,8 @@ TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") TINY_LLAMA_ADAPTER = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora") +TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead") + TRAIN_ARGS = { "model_name_or_path": TINY_LLAMA, "stage": "sft", @@ -67,10 +71,29 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_k assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True +def test_lora_train_qv_modules(): + model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "q_proj,v_proj", **TRAIN_ARGS}) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + + linear_modules = set() + for name, param in model.named_parameters(): + if any(module in name for module in ["lora_A", "lora_B"]): + linear_modules.add(name.split(".lora_", maxsplit=1)[0].split(".")[-1]) + assert param.requires_grad is True + assert param.dtype == torch.float32 + else: + assert param.requires_grad is False + assert param.dtype == torch.float16 + + assert linear_modules == {"q_proj", "v_proj"} + + def test_lora_train_all_modules(): model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "all", **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + linear_modules = set() for name, param in model.named_parameters(): if any(module in name for module in ["lora_A", "lora_B"]): @@ -90,6 +113,7 @@ def test_lora_train_extra_modules(): ) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + extra_modules = set() for name, param in model.named_parameters(): if any(module in name for 
module in ["lora_A", "lora_B"]): @@ -113,7 +137,9 @@ def test_lora_train_old_adapters(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) - base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) for param in filter(lambda p: p.requires_grad, ref_model.parameters()): param.data = param.data.to(torch.float32) @@ -128,7 +154,9 @@ def test_lora_train_new_adapters(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) - base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) for param in filter(lambda p: p.requires_grad, ref_model.parameters()): param.data = param.data.to(torch.float32) @@ -138,17 +166,31 @@ def test_lora_train_new_adapters(): ) +def test_lora_train_valuehead(): + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model( + tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True, add_valuehead=True + ) + + ref_model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained( + TINY_LLAMA_VALUEHEAD, torch_dtype=torch.float16, device_map=get_current_device() + ) + state_dict = model.state_dict() + ref_state_dict = ref_model.state_dict() + + assert torch.allclose(state_dict["v_head.summary.weight"], ref_state_dict["v_head.summary.weight"]) + assert torch.allclose(state_dict["v_head.summary.bias"], ref_state_dict["v_head.summary.bias"]) + + def test_lora_inference(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) - base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device) + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) ref_model: "LoraModel" = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER) ref_model = ref_model.merge_and_unload() compare_model(model, ref_model) - - for name, param in model.named_parameters(): - assert param.requires_grad is False - assert param.dtype == torch.float16 - assert "lora" not in name From 14f7bfc545164f57ee2644d098a4cff44867d630 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 15 Jun 2024 20:06:17 +0800 Subject: [PATCH 046/160] use fixture Former-commit-id: 10761985691b9f934f7689c1f82aa6dd68febcca --- src/llamafactory/hparams/model_args.py | 2 +- tests/model/test_base.py | 15 ++++++++++----- tests/model/test_lora.py | 14 +++++++++++++- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 0a91f0fa..53bdbdf2 100644 --- a/src/llamafactory/hparams/model_args.py +++ 
b/src/llamafactory/hparams/model_args.py @@ -163,7 +163,7 @@ class ModelArguments: ) infer_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field( default="auto", - metadata={"help": "Data type for model weights and activations at inference."} + metadata={"help": "Data type for model weights and activations at inference."}, ) hf_hub_token: Optional[str] = field( default=None, diff --git a/tests/model/test_base.py b/tests/model/test_base.py index ee0b2886..2deedde2 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -15,6 +15,7 @@ import os from typing import Dict +import pytest import torch from transformers import AutoModelForCausalLM from trl import AutoModelForCausalLMWithValueHead @@ -43,10 +44,14 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True -def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]): - state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")} - self.v_head.load_state_dict(state_dict, strict=False) - del state_dict +@pytest.fixture +def fix_valuehead_cpu_loading(): + def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]): + state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")} + self.v_head.load_state_dict(state_dict, strict=False) + del state_dict + + AutoModelForCausalLMWithValueHead.post_init = post_init def test_base(): @@ -60,8 +65,8 @@ def test_base(): compare_model(model, ref_model) +@pytest.mark.usefixtures("fix_valuehead_cpu_loading") def test_valuehead(): - AutoModelForCausalLMWithValueHead.post_init = post_init # patch for CPU test model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) model = load_model( diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index 4923c8ad..fe032332 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -13,8 +13,9 @@ # limitations under the License. 
import os -from typing import Sequence +from typing import Dict, Sequence +import pytest import torch from peft import LoraModel, PeftModel from transformers import AutoModelForCausalLM @@ -71,6 +72,16 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_k assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True +@pytest.fixture +def fix_valuehead_cpu_loading(): + def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]): + state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")} + self.v_head.load_state_dict(state_dict, strict=False) + del state_dict + + AutoModelForCausalLMWithValueHead.post_init = post_init + + def test_lora_train_qv_modules(): model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "q_proj,v_proj", **TRAIN_ARGS}) tokenizer_module = load_tokenizer(model_args) @@ -166,6 +177,7 @@ def test_lora_train_new_adapters(): ) +@pytest.mark.usefixtures("fix_valuehead_cpu_loading") def test_lora_train_valuehead(): model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) tokenizer_module = load_tokenizer(model_args) From 05f3a3c944e161df7bbcd579c80bba14095f61a5 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:06:41 +0800 Subject: [PATCH 047/160] tiny fix Former-commit-id: f7f440986b0ae3b38ea9f2da80789629d4f79ea1 --- scripts/cal_flops.py | 2 +- scripts/cal_lr.py | 2 +- scripts/llama_pro.py | 2 +- src/llamafactory/data/processors/pretrain.py | 2 +- src/llamafactory/eval/evaluator.py | 2 +- src/llamafactory/extras/packages.py | 2 +- src/llamafactory/hparams/data_args.py | 2 +- src/llamafactory/model/model_utils/checkpointing.py | 2 +- src/llamafactory/model/model_utils/longlora.py | 6 ++++-- src/llamafactory/model/model_utils/quantization.py | 2 +- src/llamafactory/model/model_utils/visual.py | 2 +- src/llamafactory/train/dpo/workflow.py | 2 +- src/llamafactory/train/kto/trainer.py | 4 ++-- src/llamafactory/train/kto/workflow.py | 2 +- src/llamafactory/train/ppo/trainer.py | 2 +- src/llamafactory/train/ppo/workflow.py | 2 +- src/llamafactory/train/pt/workflow.py | 2 +- src/llamafactory/train/rm/trainer.py | 4 ++-- src/llamafactory/train/rm/workflow.py | 2 +- src/llamafactory/train/sft/metric.py | 2 +- src/llamafactory/train/sft/workflow.py | 2 +- tests/model/test_base.py | 2 +- 22 files changed, 27 insertions(+), 25 deletions(-) diff --git a/scripts/cal_flops.py b/scripts/cal_flops.py index 627b5534..32526d89 100644 --- a/scripts/cal_flops.py +++ b/scripts/cal_flops.py @@ -1,7 +1,7 @@ # coding=utf-8 # Copyright 2024 Microsoft Corporation and the LlamaFactory team. # -# This code is inspired by Microsoft's DeepSpeed library. +# This code is inspired by the Microsoft's DeepSpeed library. # https://www.deepspeed.ai/tutorials/flops-profiler/ # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py index ff21d27c..ad6992cb 100644 --- a/scripts/cal_lr.py +++ b/scripts/cal_lr.py @@ -1,7 +1,7 @@ # coding=utf-8 # Copyright 2024 imoneoi and the LlamaFactory team. # -# This code is inspired by imoneoi's OpenChat library. +# This code is inspired by the imoneoi's OpenChat library. 
# https://github.com/imoneoi/openchat/blob/3.6.0/ochat/training_deepspeed/train.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/scripts/llama_pro.py b/scripts/llama_pro.py index f315335a..395375ef 100644 --- a/scripts/llama_pro.py +++ b/scripts/llama_pro.py @@ -1,7 +1,7 @@ # coding=utf-8 # Copyright 2024 Tencent Inc. and the LlamaFactory team. # -# This code is inspired by Tencent's LLaMA-Pro library. +# This code is inspired by the Tencent's LLaMA-Pro library. # https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/data/processors/pretrain.py b/src/llamafactory/data/processors/pretrain.py index fb4c840c..67d6009b 100644 --- a/src/llamafactory/data/processors/pretrain.py +++ b/src/llamafactory/data/processors/pretrain.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/eval/evaluator.py b/src/llamafactory/eval/evaluator.py index bbd7a44b..d3140793 100644 --- a/src/llamafactory/eval/evaluator.py +++ b/src/llamafactory/eval/evaluator.py @@ -1,6 +1,6 @@ # Copyright 2024 the LlamaFactory team. # -# This code is inspired by Dan's test library. +# This code is inspired by the Dan's test library. # https://github.com/hendrycks/test/blob/master/evaluate_flan.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index 35f546ab..0a84a293 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/utils/import_utils.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 95284766..39290e21 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py index e4e84b12..f5314125 100644 --- a/src/llamafactory/model/model_utils/checkpointing.py +++ b/src/llamafactory/model/model_utils/checkpointing.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's Transformers and PEFT library. +# This code is inspired by the HuggingFace's Transformers and PEFT library. 
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/modeling_utils.py # https://github.com/huggingface/peft/blob/v0.10.0/src/peft/utils/other.py # diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py index 7af43dcf..af30bd50 100644 --- a/src/llamafactory/model/model_utils/longlora.py +++ b/src/llamafactory/model/model_utils/longlora.py @@ -1,7 +1,9 @@ -# Copyright 2024 EleutherAI, HuggingFace Inc., and the LlamaFactory team. +# Copyright 2024 EleutherAI, HuggingFace Inc., Yukang Chen, and the LlamaFactory team. # -# This code is based on the EleutherAI's GPT-NeoX and HuggingFace's Transformers libraries. +# This code is based on the EleutherAI's GPT-NeoX and the HuggingFace's Transformers libraries. # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py +# This code is also inspired by the original LongLoRA implementation. +# https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index 9e6b9da4..0a0fca34 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's Optimum library. +# This code is inspired by the HuggingFace's Optimum library. # https://github.com/huggingface/optimum/blob/v1.20.0/optimum/gptq/data.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index 37237485..700bf470 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's Transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/modeling_llava.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 8c3c2eb1..431b5285 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/dpo.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 6e96fc0c..91d68975 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. 
# https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/kto_trainer.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -114,8 +114,8 @@ class CustomKTOTrainer(KTOTrainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) def forward( diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py index 8a7af6d4..8182a184 100644 --- a/src/llamafactory/train/kto/workflow.py +++ b/src/llamafactory/train/kto/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/kto.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 61420f3b..df4a37be 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/ppo_trainer.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 891d539a..4f4d2820 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/examples/scripts/ppo.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py index f1df314e..b84a0e7d 100644 --- a/src/llamafactory/train/pt/workflow.py +++ b/src/llamafactory/train/pt/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 14695d7d..7f91e5f5 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 the LlamaFactory team. # -# This code is inspired by CarperAI's trlx library. +# This code is inspired by the CarperAI's trlx library. 
# https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/reward_model.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -89,8 +89,8 @@ class PairwiseTrainer(Trainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) def compute_loss( diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 75c0a2bf..6f24e964 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 the LlamaFactory team. # -# This code is inspired by CarperAI's trlx library. +# This code is inspired by the CarperAI's trlx library. # https://github.com/CarperAI/trlx/blob/v0.7.0/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index d2147c22..95bfcb69 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc., THUDM, and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library and THUDM's ChatGLM implementation. +# This code is inspired by the HuggingFace's transformers library and the THUDM's ChatGLM implementation. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py # https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py # diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index dfc71cfb..885bc7ac 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. 
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/model/test_base.py b/tests/model/test_base.py index 2deedde2..954492ef 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -41,7 +41,7 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): state_dict_b = model_b.state_dict() assert set(state_dict_a.keys()) == set(state_dict_b.keys()) for name in state_dict_a.keys(): - assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True + assert torch.allclose(state_dict_a[name], state_dict_b[name]) @pytest.fixture From 32f45c9e91c7ce12936ba4d8c6ea55a8cc3ba31d Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:08:12 +0800 Subject: [PATCH 048/160] support pissa Former-commit-id: ef8e45f2eaf466c54e9a671512a2974575677b08 --- README.md | 6 +- README_zh.md | 6 +- examples/README.md | 6 ++ examples/README_zh.md | 6 ++ examples/extras/pissa/llama3_lora_sft.yaml | 42 ++++++++++ scripts/loftq_init.py | 72 +++++++---------- scripts/pissa_init.py | 79 ++++++++++++++++++ src/llamafactory/hparams/finetuning_args.py | 20 ++++- src/llamafactory/hparams/model_args.py | 8 +- src/llamafactory/hparams/parser.py | 5 +- src/llamafactory/model/adapter.py | 25 ++++-- src/llamafactory/train/dpo/trainer.py | 13 ++- src/llamafactory/train/pt/trainer.py | 12 ++- src/llamafactory/train/sft/trainer.py | 13 ++- src/llamafactory/train/trainer_utils.py | 54 ++++++++++++- src/llamafactory/webui/components/train.py | 9 ++- src/llamafactory/webui/locales.py | 14 ++++ src/llamafactory/webui/runner.py | 2 + tests/model/test_pissa.py | 90 +++++++++++++++++++++ 19 files changed, 406 insertions(+), 76 deletions(-) create mode 100644 examples/extras/pissa/llama3_lora_sft.yaml create mode 100644 scripts/pissa_init.py create mode 100644 tests/model/test_pissa.py diff --git a/README.md b/README.md index cae79694..cb9a7222 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Choose your path: - **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc. - **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, etc. - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8. -- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and Agent tuning. +- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ, PiSSA and Agent tuning. - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA. - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc. - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker. @@ -71,9 +71,9 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Changelog -[24/06/07] We supported fine-tuning the **[Qwen-2](https://qwenlm.github.io/blog/qwen2/)** series models. +[24/06/16] We support **[PiSSA](https://arxiv.org/abs/2404.02948)** algorithm. See [examples](examples/README.md) for usage. -[24/06/05] We supported fine-tuning the **[GLM-4-9B/GLM-4-9B-Chat](https://github.com/THUDM/GLM-4)** models. 
+[24/06/07] We supported fine-tuning the **[Qwen2](https://qwenlm.github.io/blog/qwen2/)** and **[GLM-4](https://github.com/THUDM/GLM-4)** models. [24/05/26] We supported **[SimPO](https://arxiv.org/abs/2405.14734)** algorithm for preference learning. See [examples](examples/README.md) for usage. diff --git a/README_zh.md b/README_zh.md index af3ff8f0..5c005f30 100644 --- a/README_zh.md +++ b/README_zh.md @@ -49,7 +49,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - **多种模型**:LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。 - **集成方法**:(增量)预训练、(多模态)指令监督微调、奖励模型训练、PPO 训练、DPO 训练、KTO 训练、ORPO 训练等等。 - **多种精度**:32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。 -- **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ 和 Agent 微调。 +- **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ、PiSSA 和 Agent 微调。 - **实用技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。 - **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。 - **极速推理**:基于 vLLM 的 OpenAI 风格 API、浏览器界面和命令行接口。 @@ -71,9 +71,9 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 更新日志 -[24/06/07] 我们支持了 **[Qwen-2](https://qwenlm.github.io/blog/qwen2/)** 系列模型的微调。 +[24/06/16] 我们支持了 **[PiSSA](https://arxiv.org/abs/2404.02948)** 算法。详细用法请参照 [examples](examples/README_zh.md)。 -[24/06/05] 我们支持了 **[GLM-4-9B/GLM-4-9B-Chat](https://github.com/THUDM/GLM-4)** 模型的微调。 +[24/06/07] 我们支持了 **[Qwen2](https://qwenlm.github.io/blog/qwen2/)** 和 **[GLM-4](https://github.com/THUDM/GLM-4)** 模型的微调。 [24/05/26] 我们支持了 **[SimPO](https://arxiv.org/abs/2405.14734)** 偏好对齐算法。详细用法请参照 [examples](examples/README_zh.md)。 diff --git a/examples/README.md b/examples/README.md index a6d78936..902d26b1 100644 --- a/examples/README.md +++ b/examples/README.md @@ -213,3 +213,9 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml ```bash bash examples/extras/fsdp_qlora/single_node.sh ``` + +#### PiSSA Fine-Tuning + +```bash +llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +``` diff --git a/examples/README_zh.md b/examples/README_zh.md index b6168a95..586e498c 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -213,3 +213,9 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml ```bash bash examples/extras/fsdp_qlora/single_node.sh ``` + +#### PiSSA 微调 + +```bash +llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +``` diff --git a/examples/extras/pissa/llama3_lora_sft.yaml b/examples/extras/pissa/llama3_lora_sft.yaml new file mode 100644 index 00000000..fd4b9f1d --- /dev/null +++ b/examples/extras/pissa/llama3_lora_sft.yaml @@ -0,0 +1,42 @@ +### model +model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all +pissa_init: true +pissa_iter: 4 +pissa_convert: true + +### dataset +dataset: identity,alpaca_en_demo +template: llama3 +cutoff_len: 1024 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: saves/llama3-8b/lora/sft +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +fp16: true +ddp_timeout: 180000000 + +### eval +val_size: 0.1 +per_device_eval_batch_size: 1 +eval_strategy: steps +eval_steps: 500 diff --git 
a/scripts/loftq_init.py b/scripts/loftq_init.py index 159dea06..556f342c 100644 --- a/scripts/loftq_init.py +++ b/scripts/loftq_init.py @@ -1,7 +1,7 @@ # coding=utf-8 # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's PEFT library. +# This code is based on the HuggingFace's PEFT library. # https://github.com/huggingface/peft/blob/v0.10.0/examples/loftq_finetuning/quantize_save_load.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,11 +17,9 @@ # limitations under the License. import os -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING import fire -import torch -import torch.nn as nn from peft import LoftQConfig, LoraConfig, TaskType, get_peft_model from transformers import AutoModelForCausalLM, AutoTokenizer @@ -30,41 +28,20 @@ if TYPE_CHECKING: from transformers import PreTrainedModel -class Shell(nn.Module): - def __init__(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): - super().__init__() - self.weight = nn.Parameter(weight, requires_grad=False) - if bias is not None: - self.bias = nn.Parameter(bias, requires_grad=False) - - -def unwrap_model(model: nn.Module, pattern=".base_layer") -> None: - for name in {k.split(pattern)[0] for k, _ in model.named_modules() if pattern in k}: - parent_name = ".".join(name.split(".")[:-1]) - child_name = name.split(".")[-1] - parent_module = model.get_submodule(parent_name) - child_module = getattr(parent_module, child_name) - base_layer = getattr(child_module, "base_layer") - weight = getattr(base_layer, "weight", None) - bias = getattr(base_layer, "bias", None) - setattr(parent_module, child_name, Shell(weight, bias)) - - print("Model unwrapped.") - - def quantize_loftq( model_name_or_path: str, - save_dir: str, - loftq_bits: Optional[int] = 4, - loftq_iter: Optional[int] = 1, - lora_alpha: Optional[int] = None, - lora_rank: Optional[int] = 16, - lora_target: Optional[str] = "q_proj,v_proj", - save_safetensors: Optional[bool] = False, + output_dir: str, + loftq_bits: int = 4, + loftq_iter: int = 4, + lora_alpha: int = None, + lora_rank: int = 16, + lora_dropout: float = 0, + lora_target: str = "q_proj,v_proj", + save_safetensors: bool = True, ): r""" Initializes LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ) - Usage: python loftq_init.py --model_name_or_path path_to_model --save_dir output_dir + Usage: python loftq_init.py --model_name_or_path path_to_model --output_dir output_dir """ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") @@ -74,25 +51,34 @@ def quantize_loftq( inference_mode=True, r=lora_rank, lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, - lora_dropout=0.1, + lora_dropout=lora_dropout, target_modules=[name.strip() for name in lora_target.split(",")], init_lora_weights="loftq", loftq_config=loftq_config, ) # Init LoftQ model - lora_model = get_peft_model(model, lora_config) - base_model: "PreTrainedModel" = lora_model.get_base_model() + print("Initializing LoftQ weights, it may be take several minutes, wait patiently.") + peft_model = get_peft_model(model, lora_config) + loftq_dir = os.path.join(output_dir, "loftq_init") # Save LoftQ model - setattr(lora_model.base_model.peft_config["default"], "base_model_name_or_path", save_dir) - setattr(lora_model.base_model.peft_config["default"], "init_lora_weights", True) - 
lora_model.save_pretrained(os.path.join(save_dir, "adapters"), safe_serialization=save_safetensors) + setattr(peft_model.peft_config["default"], "base_model_name_or_path", output_dir) + setattr(peft_model.peft_config["default"], "init_lora_weights", True) # don't apply loftq again + peft_model.save_pretrained(loftq_dir, safe_serialization=save_safetensors) + print("Adapter weights saved in {}".format(loftq_dir)) # Save base model - unwrap_model(base_model) - base_model.save_pretrained(save_dir, safe_serialization=save_safetensors) - tokenizer.save_pretrained(save_dir) + base_model: "PreTrainedModel" = peft_model.unload() + base_model.save_pretrained(output_dir, safe_serialization=save_safetensors) + tokenizer.save_pretrained(output_dir) + print("Model weights saved in {}".format(output_dir)) + + print("Fine-tune this model with:") + print("model_name_or_path: {}".format(output_dir)) + print("adapter_name_or_path: {}".format(loftq_dir)) + print("finetuning_type: lora") + print("quantization_bit: {}".format(loftq_bits)) if __name__ == "__main__": diff --git a/scripts/pissa_init.py b/scripts/pissa_init.py new file mode 100644 index 00000000..1b673c45 --- /dev/null +++ b/scripts/pissa_init.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is based on the HuggingFace's PEFT library. +# https://github.com/huggingface/peft/blob/v0.11.0/examples/pissa_finetuning/preprocess.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import TYPE_CHECKING + +import fire +from peft import LoraConfig, TaskType, get_peft_model +from transformers import AutoModelForCausalLM, AutoTokenizer + + +if TYPE_CHECKING: + from transformers import PreTrainedModel + + +def quantize_pissa( + model_name_or_path: str, + output_dir: str, + pissa_iter: int = 4, + lora_alpha: int = None, + lora_rank: int = 16, + lora_dropout: float = 0, + lora_target: str = "q_proj,v_proj", + save_safetensors: bool = True, +): + r""" + Initializes LoRA weights with Principal Singular values and Singular vectors Adaptation (PiSSA) + Usage: python pissa_init.py --model_name_or_path path_to_model --output_dir output_dir + """ + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") + lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + r=lora_rank, + lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, + lora_dropout=lora_dropout, + target_modules=[name.strip() for name in lora_target.split(",")], + init_lora_weights="pissa" if pissa_iter == -1 else "pissa_niter_{}".format(pissa_iter) + ) + + # Init PiSSA model + peft_model = get_peft_model(model, lora_config) + pissa_dir = os.path.join(output_dir, "pissa_init") + + # Save PiSSA model + setattr(peft_model.peft_config["default"], "init_lora_weights", True) # don't apply pissa again + peft_model.save_pretrained(pissa_dir, safe_serialization=save_safetensors) + print("Adapter weights saved in {}".format(pissa_dir)) + + # Save base model + base_model: "PreTrainedModel" = peft_model.unload() + base_model.save_pretrained(output_dir, safe_serialization=save_safetensors) + tokenizer.save_pretrained(output_dir) + print("Model weights saved in {}".format(output_dir)) + + print("Fine-tune this model with:") + print("model_name_or_path: {}".format(output_dir)) + print("adapter_name_or_path: {}".format(pissa_dir)) + print("finetuning_type: lora") + print("pissa_convert: true") + + +if __name__ == "__main__": + fire.Fire(quantize_pissa) diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index 52dc299e..1ef46eca 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -108,6 +108,18 @@ class LoraArguments: default=False, metadata={"help": "Whether or not to use the weight-decomposed lora method (DoRA)."}, ) + pissa_init: bool = field( + default=False, + metadata={"help": "Whether or not to initialize a PiSSA adapter."}, + ) + pissa_iter: int = field( + default=4, + metadata={"help": "The number of iteration steps performed by FSVD in PiSSA. 
Use -1 to disable it."}, + ) + pissa_convert: bool = field( + default=False, + metadata={"help": "Whether or not to convert the PiSSA adapter to a normal LoRA adapter."}, + ) create_new_adapter: bool = field( default=False, metadata={"help": "Whether or not to create a new adapter with randomly initialized weight."}, @@ -340,7 +352,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA self.additional_target: Optional[List[str]] = split_arg(self.additional_target) self.galore_target: List[str] = split_arg(self.galore_target) self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only - self.use_ref_model = self.pref_loss not in ["orpo", "simpo"] + self.use_ref_model = (self.stage == "dpo" and self.pref_loss not in ["orpo", "simpo"]) assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." @@ -367,5 +379,11 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora": raise ValueError("`loraplus_lr_ratio` is only valid for LoRA training.") + if self.pissa_convert and self.finetuning_type != "lora": + raise ValueError("`pissa_convert` is only valid for LoRA training.") + + if self.pissa_convert and (self.stage in ["rm", "ppo", "kto"] or self.use_ref_model): + raise ValueError("Cannot use PiSSA for current training stage.") + if self.train_mm_proj_only and self.finetuning_type != "full": raise ValueError("`train_mm_proj_only` is only valid for full training.") diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 53bdbdf2..996e9130 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. # https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -45,6 +45,10 @@ class ModelArguments: ) }, ) + adapter_folder: Optional[str] = field( + default=None, + metadata={"help": "The folder containing the adapter weights to load."}, + ) cache_dir: Optional[str] = field( default=None, metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."}, @@ -150,7 +154,7 @@ class ModelArguments: metadata={"help": "Whether or not to disable CUDA graph in the vLLM engine."}, ) vllm_max_lora_rank: int = field( - default=8, + default=32, metadata={"help": "Maximum rank of all LoRAs in the vLLM engine."}, ) offload_folder: str = field( diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 1c57567c..31a805f6 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. 
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -90,6 +90,9 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin if finetuning_args.finetuning_type != "lora": raise ValueError("Quantization is only compatible with the LoRA method.") + if finetuning_args.use_pissa: + raise ValueError("Please use scripts/pissa_init.py for quantized PiSSA.") + if model_args.resize_vocab: raise ValueError("Cannot resize embedding layers of a quantized model.") diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index dfa71525..a8f3a256 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -179,8 +179,16 @@ def _setup_lora_tuning( else: adapter_to_merge = model_args.adapter_name_or_path + init_kwargs = { + "subfolder": model_args.adapter_folder, + "offload_folder": model_args.offload_folder, + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "token": model_args.hf_hub_token, + } + for adapter in adapter_to_merge: - model: "LoraModel" = PeftModel.from_pretrained(model, adapter, offload_folder=model_args.offload_folder) + model: "LoraModel" = PeftModel.from_pretrained(model, adapter, **init_kwargs) model = model.merge_and_unload() if len(adapter_to_merge) > 0: @@ -190,12 +198,7 @@ def _setup_lora_tuning( if model_args.use_unsloth: model = load_unsloth_peft_model(config, model_args, is_trainable=is_trainable) else: - model = PeftModel.from_pretrained( - model, - adapter_to_resume, - is_trainable=is_trainable, - offload_folder=model_args.offload_folder, - ) + model = PeftModel.from_pretrained(model, adapter_to_resume, is_trainable=is_trainable, **init_kwargs) logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) @@ -242,6 +245,14 @@ def _setup_lora_tuning( if model_args.use_unsloth: model = get_unsloth_peft_model(model, model_args, peft_kwargs) else: + if finetuning_args.pissa_init: + if finetuning_args.pissa_iter == -1: + logger.info("Using PiSSA initialization.") + peft_kwargs["init_lora_weights"] = "pissa" + else: + logger.info("Using PiSSA initialization with FSVD steps {}.".format(finetuning_args.pissa_iter)) + peft_kwargs["init_lora_weights"] = "pissa_niter_{}".format(finetuning_args.pissa_iter) + lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=False, diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index 475d08c3..9928d0bc 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's TRL library. +# This code is inspired by the HuggingFace's TRL library. # https://github.com/huggingface/trl/blob/v0.8.0/trl/trainer/dpo_trainer.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import warnings from collections import defaultdict from contextlib import nullcontext @@ -28,7 +29,7 @@ from trl import DPOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps +from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler, get_batch_logps if TYPE_CHECKING: @@ -91,6 +92,9 @@ class CustomDPOTrainer(DPOTrainer): self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) self.ref_model.eval() + if finetuning_args.pissa_convert: + self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor @@ -109,8 +113,11 @@ class CustomDPOTrainer(DPOTrainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir + if self.finetuning_args.pissa_convert: + convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) + if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor": diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index 09729f2e..f9e04cb5 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from types import MethodType from typing import TYPE_CHECKING, Dict, Optional from transformers import Trainer from ...extras.logging import get_logger -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler +from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: @@ -42,6 +43,10 @@ class CustomTrainer(Trainer): super().__init__(**kwargs) self.finetuning_args = finetuning_args self.processor = processor + + if finetuning_args.pissa_convert: + self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor @@ -60,6 +65,9 @@ class CustomTrainer(Trainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir + if self.finetuning_args.pissa_convert: + convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) + if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 6ab6914e..921e49ab 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -1,6 +1,6 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by HuggingFace's transformers library. +# This code is inspired by the HuggingFace's transformers library. 
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer_seq2seq.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,7 +26,7 @@ from transformers import Seq2SeqTrainer from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger -from ..trainer_utils import create_custom_optimzer, create_custom_scheduler +from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: @@ -51,6 +51,10 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): super().__init__(**kwargs) self.finetuning_args = finetuning_args self.processor = processor + + if finetuning_args.pissa_convert: + self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor @@ -69,8 +73,11 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: super()._save(output_dir, state_dict) + output_dir = output_dir if output_dir is not None else self.args.output_dir + if self.finetuning_args.pissa_convert: + convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) + if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) def prediction_step( diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 5621d5df..2d6bab24 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -1,9 +1,9 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by the GaLore's implementation: https://github.com/jiaweizzhao/GaLore -# and the LoRA+'s implementation: https://github.com/nikhil-ghosh-berkeley/loraplus -# and the BAdam's implementation: https://github.com/Ledzy/BAdam -# and the TRL's implementation: https://github.com/huggingface/trl +# This code is inspired by the original GaLore's implementation: https://github.com/jiaweizzhao/GaLore +# and the original LoRA+'s implementation: https://github.com/nikhil-ghosh-berkeley/loraplus +# and the original BAdam's implementation: https://github.com/Ledzy/BAdam +# and the HuggingFace's TRL library: https://github.com/huggingface/trl # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,9 +17,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch +from peft import PeftModel from transformers import Trainer from transformers.optimization import get_scheduler from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS @@ -37,6 +39,7 @@ if is_galore_available(): if TYPE_CHECKING: + from accelerate import Accelerator from transformers import PreTrainedModel, Seq2SeqTrainingArguments from trl import AutoModelForCausalLMWithValueHead @@ -171,6 +174,49 @@ def create_reward_model( return reward_model +def convert_pissa_adapter( + output_dir: str, + state_dict: Dict[str, "torch.Tensor"], + accelerator: "Accelerator", + model: "PreTrainedModel", + training_args: "Seq2SeqTrainingArguments", +) -> None: + r""" + Converts the PiSSA adapter to a LoRA adapter. 
+ """ + pissa_init_dir = os.path.join(training_args.output_dir, "pissa_init") + pissa_backup_dir = os.path.join(output_dir, "pissa_backup") + if output_dir == pissa_init_dir: + logger.info("Initial PiSSA adatper will be saved at: {}.".format(pissa_init_dir)) + unwrapped_model = accelerator.unwrap_model(model) + if isinstance(unwrapped_model, PeftModel): + init_lora_weights = getattr(unwrapped_model.peft_config["default"], "init_lora_weights") + setattr(unwrapped_model.peft_config["default"], "init_lora_weights", True) + unwrapped_model.save_pretrained( + output_dir, + state_dict=state_dict, + safe_serialization=training_args.save_safetensors, + ) + setattr(unwrapped_model.peft_config["default"], "init_lora_weights", init_lora_weights) + elif output_dir == training_args.output_dir: # at the end of training + logger.info("Converted PiSSA adapter will be saved at: {}.".format(output_dir)) + unwrapped_model = accelerator.unwrap_model(model) + if isinstance(unwrapped_model, PeftModel): # backup the pissa adapter for further use + unwrapped_model.save_pretrained( + pissa_backup_dir, + state_dict=state_dict, + safe_serialization=training_args.save_safetensors, + ) + unwrapped_model.save_pretrained( + output_dir, + state_dict=state_dict, + safe_serialization=training_args.save_safetensors, + convert_pissa_to_lora=pissa_init_dir, + ) + unwrapped_model.load_adapter(pissa_backup_dir, "default", is_trainable=True) + unwrapped_model.set_adapter("default") + + def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]: r""" Returns a list of names of parameters with weight decay. (weights in non-layernorm layers) diff --git a/src/llamafactory/webui/components/train.py b/src/llamafactory/webui/components/train.py index 673f6bf4..874f3c5e 100644 --- a/src/llamafactory/webui/components/train.py +++ b/src/llamafactory/webui/components/train.py @@ -163,10 +163,9 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: create_new_adapter = gr.Checkbox() with gr.Row(): - with gr.Column(scale=1): - use_rslora = gr.Checkbox() - use_dora = gr.Checkbox() - + use_rslora = gr.Checkbox() + use_dora = gr.Checkbox() + use_pissa = gr.Checkbox() lora_target = gr.Textbox(scale=2) additional_target = gr.Textbox(scale=2) @@ -179,6 +178,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: create_new_adapter, use_rslora, use_dora, + use_pissa, lora_target, additional_target, } @@ -193,6 +193,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: create_new_adapter=create_new_adapter, use_rslora=use_rslora, use_dora=use_dora, + use_pissa=use_pissa, lora_target=lora_target, additional_target=additional_target, ) diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index 427f01b8..8e8d6fce 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -732,6 +732,20 @@ LOCALES = { "info": "使用权重分解的 LoRA。", }, }, + "use_pissa": { + "en": { + "label": "Use PiSSA", + "info": "Use PiSSA method.", + }, + "ru": { + "label": "используйте PiSSA", + "info": "Используйте метод PiSSA.", + }, + "zh": { + "label": "使用 PiSSA", + "info": "使用 PiSSA 方法。", + }, + }, "lora_target": { "en": { "label": "LoRA modules (optional)", diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 76982934..13dbba03 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -173,6 +173,8 @@ class Runner: args["create_new_adapter"] = get("train.create_new_adapter") args["use_rslora"] = 
get("train.use_rslora") args["use_dora"] = get("train.use_dora") + args["pissa_init"] = get("train.use_pissa") + args["pissa_convert"] = get("train.use_pissa") args["lora_target"] = get("train.lora_target") or "all" args["additional_target"] = get("train.additional_target") or None diff --git a/tests/model/test_pissa.py b/tests/model/test_pissa.py new file mode 100644 index 00000000..70c424fd --- /dev/null +++ b/tests/model/test_pissa.py @@ -0,0 +1,90 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from peft import LoraModel, PeftModel +from transformers import AutoModelForCausalLM + +from llamafactory.extras.misc import get_current_device +from llamafactory.hparams import get_infer_args, get_train_args +from llamafactory.model import load_model, load_tokenizer + + +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") + +TINY_LLAMA_PISSA = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-pissa") + +TRAIN_ARGS = { + "model_name_or_path": TINY_LLAMA, + "stage": "sft", + "do_train": True, + "finetuning_type": "lora", + "pissa_init": True, + "pissa_iter": -1, + "dataset": "llamafactory/tiny-supervised-dataset", + "dataset_dir": "ONLINE", + "template": "llama3", + "cutoff_len": 1024, + "overwrite_cache": True, + "output_dir": "dummy_dir", + "overwrite_output_dir": True, + "fp16": True, +} + +INFER_ARGS = { + "model_name_or_path": TINY_LLAMA_PISSA, + "adapter_name_or_path": TINY_LLAMA_PISSA, + "adapter_folder": "pissa_init", + "finetuning_type": "lora", + "template": "llama3", + "infer_dtype": "float16", +} + + +def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): + state_dict_a = model_a.state_dict() + state_dict_b = model_b.state_dict() + assert set(state_dict_a.keys()) == set(state_dict_b.keys()) + for name in state_dict_a.keys(): + assert torch.allclose(state_dict_a[name], state_dict_b[name]) + + +def test_pissa_init(): + model_args, _, _, finetuning_args, _ = get_train_args(TRAIN_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) + + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA_PISSA, torch_dtype=torch.float16, device_map=get_current_device() + ) + ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_PISSA, subfolder="pissa_init", is_trainable=True) + for param in filter(lambda p: p.requires_grad, ref_model.parameters()): + param.data = param.data.to(torch.float32) + + compare_model(model, ref_model) + + +def test_pissa_inference(): + model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS) + tokenizer_module = load_tokenizer(model_args) + model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) + + base_model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA_PISSA, torch_dtype=torch.float16, device_map=get_current_device() + ) + ref_model: "LoraModel" = 
PeftModel.from_pretrained(base_model, TINY_LLAMA_PISSA, subfolder="pissa_init") + ref_model = ref_model.merge_and_unload() + compare_model(model, ref_model) From 9049f72d2f517a7ed4f7fc863354eea730d5f2cd Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:21:06 +0800 Subject: [PATCH 049/160] increase tol Former-commit-id: c29071445e34aed23123fdf883a4d877744a1b0e --- tests/model/test_pissa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model/test_pissa.py b/tests/model/test_pissa.py index 70c424fd..41d02752 100644 --- a/tests/model/test_pissa.py +++ b/tests/model/test_pissa.py @@ -59,7 +59,7 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): state_dict_b = model_b.state_dict() assert set(state_dict_a.keys()) == set(state_dict_b.keys()) for name in state_dict_a.keys(): - assert torch.allclose(state_dict_a[name], state_dict_b[name]) + assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-3, atol=1e-3) def test_pissa_init(): From 8393b0866659023e1829985f24f4685fdcae040e Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:22:23 +0800 Subject: [PATCH 050/160] Update tests.yml Former-commit-id: 82e83615a706293abbf266d11c57caedafdd4c5b --- .github/workflows/tests.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 96092662..98bd9455 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,8 +9,6 @@ on: - "requirements.txt" - ".github/workflows/*.yml" pull_request: - types: - - review_requested branches: - main paths: From 727943f078186df085b130d4acf2f448ac96d92b Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:38:44 +0800 Subject: [PATCH 051/160] fix tol Former-commit-id: bdb54bcb477126687db789bd89f2df84e424a2a3 --- src/llamafactory/train/trainer_utils.py | 1 + tests/model/test_base.py | 2 +- tests/model/test_lora.py | 4 ++-- tests/model/test_pissa.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 2d6bab24..9052c96d 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -213,6 +213,7 @@ def convert_pissa_adapter( safe_serialization=training_args.save_safetensors, convert_pissa_to_lora=pissa_init_dir, ) + # TODO: the model is applied pissa again unexpectedly unwrapped_model.load_adapter(pissa_backup_dir, "default", is_trainable=True) unwrapped_model.set_adapter("default") diff --git a/tests/model/test_base.py b/tests/model/test_base.py index 954492ef..e1991b20 100644 --- a/tests/model/test_base.py +++ b/tests/model/test_base.py @@ -41,7 +41,7 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): state_dict_b = model_b.state_dict() assert set(state_dict_a.keys()) == set(state_dict_b.keys()) for name in state_dict_a.keys(): - assert torch.allclose(state_dict_a[name], state_dict_b[name]) + assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-4, atol=1e-5) @pytest.fixture diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index fe032332..64566fe8 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -67,9 +67,9 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_k assert set(state_dict_a.keys()) == set(state_dict_b.keys()) for name in state_dict_a.keys(): if any(key in name for key in 
diff_keys): - assert torch.allclose(state_dict_a[name], state_dict_b[name]) is False + assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-4, atol=1e-5) is False else: - assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True + assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-4, atol=1e-5) is True @pytest.fixture diff --git a/tests/model/test_pissa.py b/tests/model/test_pissa.py index 41d02752..030310d0 100644 --- a/tests/model/test_pissa.py +++ b/tests/model/test_pissa.py @@ -59,7 +59,7 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"): state_dict_b = model_b.state_dict() assert set(state_dict_a.keys()) == set(state_dict_b.keys()) for name in state_dict_a.keys(): - assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-3, atol=1e-3) + assert torch.allclose(state_dict_a[name], state_dict_b[name], rtol=1e-4, atol=1e-5) def test_pissa_init(): From 76cd879c8421388c39d83595673ee4123ea5991a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 16 Jun 2024 01:43:43 +0800 Subject: [PATCH 052/160] update pr template Former-commit-id: 0b7c29674fda10c0ac87e0a0c75990feabb5a3de --- .github/PULL_REQUEST_TEMPLATE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index b31e9d19..d23d6be3 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -5,3 +5,4 @@ Fixes # (issue) ## Before submitting - [ ] Did you read the [contributor guideline](https://github.com/hiyouga/LLaMA-Factory/blob/main/.github/CONTRIBUTING.md)? +- [ ] Did you write any new necessary tests? From ca67b7a568592814ad577fba68cca666eafd94f0 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Sun, 16 Jun 2024 02:57:00 +0800 Subject: [PATCH 053/160] Update parser.py Former-commit-id: d10c97193d08bd368aca1a72f0d1d8a96c76765d --- src/llamafactory/hparams/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 31a805f6..467fc43d 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -90,8 +90,8 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin if finetuning_args.finetuning_type != "lora": raise ValueError("Quantization is only compatible with the LoRA method.") - if finetuning_args.use_pissa: - raise ValueError("Please use scripts/pissa_init.py for quantized PiSSA.") + if finetuning_args.pissa_init: + raise ValueError("Please use scripts/pissa_init.py to initialize PiSSA for a quantized model.") if model_args.resize_vocab: raise ValueError("Cannot resize embedding layers of a quantized model.") From 5e802b06452c2f38565b0e3295bc2eef1a206c35 Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sun, 16 Jun 2024 11:19:25 -0300 Subject: [PATCH 054/160] Update README.md Add Magpie and Webinstruct to README Former-commit-id: 2b32b9263f12605e48e11dce9b5fbb746d790745 --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index cb9a7222..30c90e9b 100644 --- a/README.md +++ b/README.md @@ -270,6 +270,8 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) - [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) - [Ultrachat 
(de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
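The tolerance bumps in the test patches above (PATCH 049 and 051) compare state dicts of models whose trainable parameters pass through float16, so bit-exact equality is not a reasonable expectation. The snippet below is a minimal, illustrative sketch of how torch.allclose applies rtol and atol; the tensor values are invented for illustration and are not taken from the test suite.

import torch

# Invented values: weights that agree to roughly four decimal places but not
# bit-exactly, the kind of drift a float16 round-trip introduces.
trained = torch.tensor([1.00008, 0.50004, -2.0001])
reference = torch.tensor([1.00000, 0.50000, -2.0000])

# PyTorch defaults (rtol=1e-5, atol=1e-8) reject the match ...
print(torch.allclose(trained, reference))                        # False
# ... while the relaxed tolerances adopted in the tests accept it,
# since |a - b| <= atol + rtol * |b| must hold element-wise.
print(torch.allclose(trained, reference, rtol=1e-4, atol=1e-5))  # True
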
From 9a0aca42a5d0252165820536148c02aca215c22f Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sun, 16 Jun 2024 11:22:06 -0300 Subject: [PATCH 055/160] Update README_zh.md Add Magpie and WebInstruct to README Former-commit-id: 6cf5323959fe9500ba06ab28980fcc8f62e1373f --- README_zh.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README_zh.md b/README_zh.md index 5c005f30..531d9b56 100644 --- a/README_zh.md +++ b/README_zh.md @@ -270,8 +270,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) - [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) - [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) - -
+- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
偏好数据集 From d3b0048d8c58c9f3d357f540dfc6d7e8f1197720 Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sun, 16 Jun 2024 11:34:31 -0300 Subject: [PATCH 056/160] Update README_zh.md Fix details tag in datasets menus Former-commit-id: d79c1bd4806e9ea13115fabebf9da2d19b0a52be --- README_zh.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README_zh.md b/README_zh.md index 531d9b56..711596f0 100644 --- a/README_zh.md +++ b/README_zh.md @@ -273,6 +273,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) - [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) +
+
偏好数据集 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k) From a720b82e63a4051ed5c933ede0570de69e07b10a Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Sun, 16 Jun 2024 19:16:23 -0300 Subject: [PATCH 057/160] Fix Dockerfile Adds the commands to correctly execute LLama-Factory servers Former-commit-id: 22af40f0895a6f88709a495febeca8507d41d989 --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 3932ff30..9c2f645c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,3 +42,5 @@ EXPOSE 7860 # Expose port 8000 for the API service EXPOSE 8000 + +CMD [ "llamafactory-cli", "webui" ] From 63bfe9967e48c9b2bab5ab3406a795015c6718cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=83=A1=E7=BF=80?= <1020327414@qq.com> Date: Mon, 17 Jun 2024 16:45:57 +0800 Subject: [PATCH 058/160] Update requirements.txt add pandas version requirements Former-commit-id: ed1cf559aa2d02588aacf55a17b439473651f626 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 9e00555e..5c7202a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ accelerate>=0.30.1 peft>=0.11.1 trl>=0.8.6 gradio>=4.0.0 +pandas>=2.2.2 scipy einops sentencepiece From 485a80d2947acfb6cdd635ab7c079b03629a5b47 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 17:47:25 +0800 Subject: [PATCH 059/160] tiny fix Former-commit-id: 2289436567a7860d25d9da0afb39e4a3e5e83839 --- examples/README.md | 14 ++++---- examples/README_zh.md | 14 ++++---- .../fsdp_qlora/{single_node.sh => train.sh} | 0 scripts/llama_pro.py | 2 +- scripts/loftq_init.py | 2 +- scripts/pissa_init.py | 5 ++- tests/model/test_lora.py | 34 +++++++------------ 7 files changed, 32 insertions(+), 39 deletions(-) rename examples/extras/fsdp_qlora/{single_node.sh => train.sh} (100%) diff --git a/examples/README.md b/examples/README.md index 902d26b1..007a81ab 100644 --- a/examples/README.md +++ b/examples/README.md @@ -195,6 +195,12 @@ llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml ``` +#### PiSSA Fine-Tuning + +```bash +llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +``` + #### Mixture-of-Depths Fine-Tuning ```bash @@ -211,11 +217,5 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml #### FSDP+QLoRA Fine-Tuning ```bash -bash examples/extras/fsdp_qlora/single_node.sh -``` - -#### PiSSA Fine-Tuning - -```bash -llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +bash examples/extras/fsdp_qlora/train.sh ``` diff --git a/examples/README_zh.md b/examples/README_zh.md index 586e498c..b9d90f25 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -195,6 +195,12 @@ llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml ``` +#### PiSSA 微调 + +```bash +llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +``` + #### 深度混合微调 ```bash @@ -211,11 +217,5 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml #### FSDP+QLoRA 微调 ```bash -bash examples/extras/fsdp_qlora/single_node.sh -``` - -#### PiSSA 微调 - -```bash -llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml +bash examples/extras/fsdp_qlora/train.sh ``` diff --git a/examples/extras/fsdp_qlora/single_node.sh b/examples/extras/fsdp_qlora/train.sh similarity index 100% rename 
from examples/extras/fsdp_qlora/single_node.sh rename to examples/extras/fsdp_qlora/train.sh diff --git a/scripts/llama_pro.py b/scripts/llama_pro.py index 395375ef..17bf6fc2 100644 --- a/scripts/llama_pro.py +++ b/scripts/llama_pro.py @@ -120,7 +120,7 @@ def block_expansion( json.dump(index, f, indent=2, sort_keys=True) print("Model weights saved in {}".format(output_dir)) - print("Fine-tune this model with:") + print("- Fine-tune this model with:") print("model_name_or_path: {}".format(output_dir)) print("finetuning_type: freeze") print("freeze_trainable_layers: {}".format(num_expand)) diff --git a/scripts/loftq_init.py b/scripts/loftq_init.py index 556f342c..b9506fa3 100644 --- a/scripts/loftq_init.py +++ b/scripts/loftq_init.py @@ -74,7 +74,7 @@ def quantize_loftq( tokenizer.save_pretrained(output_dir) print("Model weights saved in {}".format(output_dir)) - print("Fine-tune this model with:") + print("- Fine-tune this model with:") print("model_name_or_path: {}".format(output_dir)) print("adapter_name_or_path: {}".format(loftq_dir)) print("finetuning_type: lora") diff --git a/scripts/pissa_init.py b/scripts/pissa_init.py index 1b673c45..10b81efc 100644 --- a/scripts/pissa_init.py +++ b/scripts/pissa_init.py @@ -68,11 +68,14 @@ def quantize_pissa( tokenizer.save_pretrained(output_dir) print("Model weights saved in {}".format(output_dir)) - print("Fine-tune this model with:") + print("- Fine-tune this model with:") print("model_name_or_path: {}".format(output_dir)) print("adapter_name_or_path: {}".format(pissa_dir)) print("finetuning_type: lora") + print("pissa_init: false") print("pissa_convert: true") + print("- and optionally with:") + print("quantization_bit: 4") if __name__ == "__main__": diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py index 64566fe8..630e5f75 100644 --- a/tests/model/test_lora.py +++ b/tests/model/test_lora.py @@ -56,9 +56,15 @@ INFER_ARGS = { } -def load_reference_model() -> "torch.nn.Module": - model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA) - return PeftModel.from_pretrained(model, TINY_LLAMA_ADAPTER) +def load_reference_model(is_trainable: bool = False) -> "LoraModel": + model = AutoModelForCausalLM.from_pretrained( + TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() + ) + lora_model = PeftModel.from_pretrained(model, TINY_LLAMA_ADAPTER, is_trainable=is_trainable) + for param in filter(lambda p: p.requires_grad, lora_model.parameters()): + param.data = param.data.to(torch.float32) + + return lora_model def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_keys: Sequence[str] = []): @@ -148,13 +154,7 @@ def test_lora_train_old_adapters(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) - base_model = AutoModelForCausalLM.from_pretrained( - TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() - ) - ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) - for param in filter(lambda p: p.requires_grad, ref_model.parameters()): - param.data = param.data.to(torch.float32) - + ref_model = load_reference_model(is_trainable=True) compare_model(model, ref_model) @@ -165,13 +165,7 @@ def test_lora_train_new_adapters(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) - base_model = AutoModelForCausalLM.from_pretrained( - TINY_LLAMA, torch_dtype=torch.float16, 
device_map=get_current_device() - ) - ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True) - for param in filter(lambda p: p.requires_grad, ref_model.parameters()): - param.data = param.data.to(torch.float32) - + ref_model = load_reference_model(is_trainable=True) compare_model( model, ref_model, diff_keys=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "gate_proj", "down_proj"] ) @@ -200,9 +194,5 @@ def test_lora_inference(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False) - base_model = AutoModelForCausalLM.from_pretrained( - TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device() - ) - ref_model: "LoraModel" = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER) - ref_model = ref_model.merge_and_unload() + ref_model = load_reference_model().merge_and_unload() compare_model(model, ref_model) From 60d9896a70fd9ea5a77df8daeaa46117f16e9582 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 18:17:48 +0800 Subject: [PATCH 060/160] fix #4326 Former-commit-id: 3c2c45812a720d92f7f5b15b9f03370fe6bf069e --- src/llamafactory/model/adapter.py | 16 +++++++++++++--- .../model/model_utils/quantization.py | 18 ++++++++---------- src/llamafactory/model/patcher.py | 5 ++++- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index a8f3a256..34518878 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -281,12 +281,22 @@ def init_adapter( Note that the trainable parameters must be cast to float32. """ - if is_trainable and getattr(model, "quantization_method", None) and finetuning_args.finetuning_type != "lora": - raise ValueError("Quantized models can only be used for the LoRA tuning.") + if is_trainable and getattr(model, "quantization_method", None) is not None: + if finetuning_args.finetuning_type != "lora": + raise ValueError("Quantized models can only be used for the LoRA tuning.") + if finetuning_args.pissa_init: + raise ValueError("Cannot initialize PiSSA adapter on quantized models.") + + # cast trainable parameters to float32 if: + # 1. is_trainable and quantization_bit is not None (qlora) + # 2. is_trainable and not deepspeed zero3 and not fsdp (zero3 or fsdp already in float32) + # 3. is_trainable and not pure_bf16 and not badam if not is_trainable: cast_trainable_params_to_fp32 = False - elif is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam: + elif model_args.quantization_bit is None and ( + is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam + ): logger.info("ZeRO3/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.") cast_trainable_params_to_fp32 = False else: diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index 0a0fca34..5251f84f 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -1,6 +1,7 @@ # Copyright 2024 HuggingFace Inc. and the LlamaFactory team. # -# This code is inspired by the HuggingFace's Optimum library. +# This code is inspired by the HuggingFace's Transformers and Optimum library. 
+# https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/utils/quantization_config.py # https://github.com/huggingface/optimum/blob/v1.20.0/optimum/gptq/data.py # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -96,10 +97,7 @@ def configure_quantization( """ if getattr(config, "quantization_config", None): # ptq if is_deepspeed_zero3_enabled(): - raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantized models.") - - if model_args.quantization_device_map != "auto": - init_kwargs["device_map"] = {"": get_current_device()} + raise ValueError("DeepSpeed ZeRO-3 is incompatible with PTQ-quantized models.") quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None) quant_method = quantization_config.get("quant_method", "") @@ -152,15 +150,15 @@ def configure_quantization( bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora ) + # assign device map if: + # 1. not deepspeed zero3 and not fsdp + # 2. not auto quantization device map if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": if model_args.quantization_bit != 4: - raise ValueError("Only 4-bit quantized model can use auto device map.") + raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") - require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0") - require_version("accelerate>=0.28.0", "To fix: pip install accelerate>=0.28.0") require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0") - init_kwargs["torch_dtype"] = model_args.compute_dtype # fsdp+qlora requires same dtype else: - init_kwargs["device_map"] = {"": get_current_device()} + init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit)) diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 053516e4..8fa17d08 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -89,7 +89,10 @@ def patch_config( # deepspeed zero3 is not compatible with low_cpu_mem_usage init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage and (not is_deepspeed_zero3_enabled()) - if not is_deepspeed_zero3_enabled() and not is_fsdp_enabled(): # cast dtype and device if not use zero3 or fsdp + # cast data type of the model if: + # 1. not deepspeed zero3 and not fsdp (keep zero3 or fsdp in float32) + # 2. 
fsdp + qlora + if model_args.quantization_bit is not None or (not is_deepspeed_zero3_enabled() and not is_fsdp_enabled()): init_kwargs["torch_dtype"] = model_args.compute_dtype if init_kwargs["low_cpu_mem_usage"]: # device map requires low_cpu_mem_usage=True From ba303fd1aa7b558e7b86871bbac567fd405ea039 Mon Sep 17 00:00:00 2001 From: Jonery Date: Mon, 17 Jun 2024 18:18:10 +0800 Subject: [PATCH 061/160] adapt for badam with ds zero3 Former-commit-id: fff2a020ec8713022bd8145f4a7168168ea07ca4 --- src/llamafactory/hparams/parser.py | 12 ++++++------ src/llamafactory/train/sft/trainer.py | 15 +++++++++++++++ src/llamafactory/train/utils.py | 7 +++++++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 6311297e..fe108657 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -184,12 +184,12 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): raise ValueError("Distributed training does not support layer-wise GaLore.") - if ( - finetuning_args.use_badam - and finetuning_args.badam_mode == "layer" - and training_args.parallel_mode.value == "distributed" - ): - raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") + # if ( + # finetuning_args.use_badam + # and finetuning_args.badam_mode == "layer" + # and training_args.parallel_mode.value == "distributed" + # ): + # raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") if (finetuning_args.use_galore or finetuning_args.use_badam) and training_args.deepspeed is not None: raise ValueError("GaLore and BAdam are incompatible with DeepSpeed yet.") diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 35671e1b..cd73bf5c 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -55,6 +55,21 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) + def training_step(self, *args, **kwargs): + r""" + Update the reference to deepspeed optimizer + """ + if self.finetuning_args.use_badam and \ + self.args.deepspeed_plugin is not None and \ + self.args.deepspeed_plugin.zero_stage == 3: + + ds_optim = self.optimizer.optimizer + badam_optim = ds_optim.optimizer + badam_optim.ds_optimizer = ds_optim + + return super().training_step(*args, **kwargs) + + def prediction_step( self, model: "torch.nn.Module", diff --git a/src/llamafactory/train/utils.py b/src/llamafactory/train/utils.py index 23834f2d..b189922b 100644 --- a/src/llamafactory/train/utils.py +++ b/src/llamafactory/train/utils.py @@ -309,6 +309,12 @@ def _create_badam_optimizer( dict(params=decay_params, weight_decay=training_args.weight_decay), ] + ds_zero3_enabled = False + if hasattr(training_args, "deepspeed_plugin") and training_args.deepspeed_plugin is not None: + assert training_args.deepspeed_plugin.zero_stage == 3, f"BAdam only supports deepspeed ZeRO-3 stage, got {training_args.deepspeed_plugin.zero_stage}" + assert finetuning_args.badam_mode == "layer", "BAdam only supports layer-wise update in ZeRO-3 stage" + ds_zero3_enabled = True + if finetuning_args.badam_mode == "layer": from badam import BlockOptimizer @@ -321,6 +327,7 @@ def _create_badam_optimizer( start_block=finetuning_args.badam_start_block, 
switch_mode=finetuning_args.badam_switch_mode, verbose=finetuning_args.badam_verbose, + ds_zero3_enabled=ds_zero3_enabled ) logger.info( f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, " From 7408e778cade09a0c24a10815ae22d57b8eb74f0 Mon Sep 17 00:00:00 2001 From: Jonery Date: Mon, 17 Jun 2024 18:29:36 +0800 Subject: [PATCH 062/160] update gitigore Former-commit-id: 0068648aee07840cd2a08071e093436aee3f5cb6 --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 0355c666..2486e728 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,5 @@ cython_debug/ user.config saves/ cache/ +wandb +ds_badam_exp \ No newline at end of file From 77242f41692482de2b120e143b23e158da582ce6 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 18:47:24 +0800 Subject: [PATCH 063/160] update readme Former-commit-id: 07c629f77c3978f339402e578cde1aede3f37699 --- README.md | 2 +- README_zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cb9a7222..60045118 100644 --- a/README.md +++ b/README.md @@ -481,7 +481,7 @@ Train the model by specifying a model ID of the ModelScope Hub as the `model_nam ### Use W&B Logger -To use [Weights & Biases](https://wandb.ai) for logging experimental results, you need to add the following arguments. +To use [Weights & Biases](https://wandb.ai) for logging experimental results, you need to add the following arguments to yaml files. ```yaml report_to: wandb diff --git a/README_zh.md b/README_zh.md index 5c005f30..fc824561 100644 --- a/README_zh.md +++ b/README_zh.md @@ -481,7 +481,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1` ### 使用 W&B 面板 -若要使用 [Weights & Biases](https://wandb.ai) 记录实验数据,请添加下面的参数。 +若要使用 [Weights & Biases](https://wandb.ai) 记录实验数据,请在 yaml 文件中添加下面的参数。 ```yaml report_to: wandb From 43fab306b6932b424c5d0b03c2f138c9aee54c2b Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 19:07:17 +0800 Subject: [PATCH 064/160] update chat engine #4335 Former-commit-id: b163df7de48777e4319c9ccc736b0acdd5f473ed --- src/llamafactory/chat/chat_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py index fb800106..2a72f422 100644 --- a/src/llamafactory/chat/chat_model.py +++ b/src/llamafactory/chat/chat_model.py @@ -31,7 +31,7 @@ if TYPE_CHECKING: from .base_engine import BaseEngine, Response -def _start_background_loop(loop: asyncio.AbstractEventLoop) -> None: +def _start_background_loop(loop: "asyncio.AbstractEventLoop") -> None: asyncio.set_event_loop(loop) loop.run_forever() @@ -49,7 +49,8 @@ class ChatModel: self._loop = asyncio.new_event_loop() self._thread = Thread(target=_start_background_loop, args=(self._loop,), daemon=True) self._thread.start() - asyncio.run_coroutine_threadsafe(self.engine.start(), self._loop) + task = asyncio.run_coroutine_threadsafe(self.engine.start(), self._loop) + task.result() def chat( self, From 875270b851d6073bcbd90ab350c281bf77af1e01 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 17 Jun 2024 22:35:56 +0800 Subject: [PATCH 065/160] lint Former-commit-id: a19a7ac99af62b6715c96274f6350b124a784331 --- scripts/pissa_init.py | 2 +- src/llamafactory/hparams/finetuning_args.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pissa_init.py b/scripts/pissa_init.py index 10b81efc..50239727 
100644 --- a/scripts/pissa_init.py +++ b/scripts/pissa_init.py @@ -50,7 +50,7 @@ def quantize_pissa( lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, lora_dropout=lora_dropout, target_modules=[name.strip() for name in lora_target.split(",")], - init_lora_weights="pissa" if pissa_iter == -1 else "pissa_niter_{}".format(pissa_iter) + init_lora_weights="pissa" if pissa_iter == -1 else "pissa_niter_{}".format(pissa_iter), ) # Init PiSSA model diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index 1ef46eca..b676891e 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -352,7 +352,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA self.additional_target: Optional[List[str]] = split_arg(self.additional_target) self.galore_target: List[str] = split_arg(self.galore_target) self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only - self.use_ref_model = (self.stage == "dpo" and self.pref_loss not in ["orpo", "simpo"]) + self.use_ref_model = self.stage == "dpo" and self.pref_loss not in ["orpo", "simpo"] assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." From 12fcfc2b724341888dcdf797daaf1e670089e01b Mon Sep 17 00:00:00 2001 From: Jonery Date: Tue, 18 Jun 2024 12:27:47 +0800 Subject: [PATCH 066/160] Support distributed BAdam. Former-commit-id: bdcb986e37975911c190a74d3e60bb77aa2033bd --- src/llamafactory/hparams/parser.py | 24 ++++++++++-------------- src/llamafactory/train/dpo/trainer.py | 6 ++++++ src/llamafactory/train/kto/trainer.py | 6 ++++++ src/llamafactory/train/ppo/trainer.py | 6 ++++++ src/llamafactory/train/pt/trainer.py | 6 ++++++ src/llamafactory/train/rm/trainer.py | 6 ++++++ src/llamafactory/train/sft/trainer.py | 22 ++++++---------------- 7 files changed, 46 insertions(+), 30 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index f922bbfd..598a34e5 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -209,24 +209,20 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): raise ValueError("Distributed training does not support layer-wise GaLore.") -<<<<<<< HEAD - # if ( - # finetuning_args.use_badam - # and finetuning_args.badam_mode == "layer" - # and training_args.parallel_mode.value == "distributed" - # ): - # raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") -======= if ( finetuning_args.use_badam - and finetuning_args.badam_mode == "layer" - and training_args.parallel_mode == ParallelMode.DISTRIBUTED + and training_args.parallel_mode.value == "distributed" ): - raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") ->>>>>>> upstream/main + if finetuning_args.badam_mode == "ratio": + raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer") + if (finetuning_args.badam_mode == "layer" + and training_args.deepspeed_plugin is not None + and training_args.deepspeed_plugin.zero_stage < 3 + ): + raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage, got stage {self.args.deepspeed_plugin.zero_stage}") - if (finetuning_args.use_galore or finetuning_args.use_badam) and 
training_args.deepspeed is not None: - raise ValueError("GaLore and BAdam are incompatible with DeepSpeed yet.") + if (finetuning_args.use_galore) and training_args.deepspeed is not None: + raise ValueError("GaLore are incompatible with DeepSpeed yet.") if model_args.infer_backend == "vllm": raise ValueError("vLLM backend is only available for API, CLI and Web.") diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index 9928d0bc..284bf41a 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -100,6 +100,12 @@ class CustomDPOTrainer(DPOTrainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 91d68975..d8b609e0 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -95,6 +95,12 @@ class CustomKTOTrainer(KTOTrainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index df4a37be..5a18cbaa 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -170,6 +170,12 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None: r""" Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer. 
diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index f9e04cb5..1e5e9f6a 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -52,6 +52,12 @@ class CustomTrainer(Trainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 7f91e5f5..5d0e6263 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -76,6 +76,12 @@ class PairwiseTrainer(Trainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 0628ea59..9446d245 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -57,9 +57,14 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + if (self.args.deepspeed_plugin is not None + and self.args.deepspeed_plugin.zero_stage == 3 + ): + from badam.utils import BAdamZeRO3Callback + self.callback_handler.add_callback(BAdamZeRO3Callback) + def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) @@ -80,21 +85,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): if self.processor is not None: getattr(self.processor, "image_processor").save_pretrained(output_dir) - def training_step(self, *args, **kwargs): - r""" - Update the reference to deepspeed optimizer - """ - if self.finetuning_args.use_badam and \ - self.args.deepspeed_plugin is not None and \ - self.args.deepspeed_plugin.zero_stage == 3: - - ds_optim = self.optimizer.optimizer - badam_optim = ds_optim.optimizer - badam_optim.ds_optimizer = ds_optim - - return super().training_step(*args, **kwargs) - - def prediction_step( self, model: "torch.nn.Module", From 870a54ac84b874c9498944f2be56934a74638cb8 Mon Sep 17 00:00:00 2001 From: Jonery Date: Tue, 18 Jun 2024 12:39:26 +0800 Subject: [PATCH 067/160] fix typo Former-commit-id: d4bee3716dbf8a84564d5bcc2059172604819f3e --- src/llamafactory/hparams/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 598a34e5..680559ac 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -219,7 +219,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: and training_args.deepspeed_plugin is not None and 
training_args.deepspeed_plugin.zero_stage < 3 ): - raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage, got stage {self.args.deepspeed_plugin.zero_stage}") + raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage, got stage {training_args.deepspeed_plugin.zero_stage}") if (finetuning_args.use_galore) and training_args.deepspeed is not None: raise ValueError("GaLore are incompatible with DeepSpeed yet.") From c7479751e8e355d6b76744f0623ce66221b4eb03 Mon Sep 17 00:00:00 2001 From: Jonery Date: Tue, 18 Jun 2024 13:50:26 +0800 Subject: [PATCH 068/160] add example Former-commit-id: 75603db09b085e3f703286b87abe041af020e615 --- examples/extras/badam/llama3_badam_sft.yaml | 40 +++++++++++++++++++++ examples/extras/badam/train_single_gpu.sh | 37 +++++++++++++++++++ examples/extras/badam/train_zero3.sh | 39 ++++++++++++++++++++ 3 files changed, 116 insertions(+) create mode 100644 examples/extras/badam/llama3_badam_sft.yaml create mode 100644 examples/extras/badam/train_single_gpu.sh create mode 100644 examples/extras/badam/train_zero3.sh diff --git a/examples/extras/badam/llama3_badam_sft.yaml b/examples/extras/badam/llama3_badam_sft.yaml new file mode 100644 index 00000000..f5adb220 --- /dev/null +++ b/examples/extras/badam/llama3_badam_sft.yaml @@ -0,0 +1,40 @@ +### model +model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct + +### method +stage: sft +do_train: true +finetuning_type: full +use_badam: true +badam_switch_mode: ascending +badam_switch_interval: 50 +badam_verbose: 2 + +### dataset +dataset: identity,alpaca_en_demo +template: llama3 +cutoff_len: 1024 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: saves/llama3-8b/full/sft +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-6 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 + +### eval +val_size: 0.1 +per_device_eval_batch_size: 1 +eval_strategy: steps +eval_steps: 500 diff --git a/examples/extras/badam/train_single_gpu.sh b/examples/extras/badam/train_single_gpu.sh new file mode 100644 index 00000000..8af79007 --- /dev/null +++ b/examples/extras/badam/train_single_gpu.sh @@ -0,0 +1,37 @@ +#!/bin/bash +export CUDA_VISIBLE_DEVICES=0 + +cd ../../.. + +llamafactory-cli train \ + --stage sft \ + --do_train True \ + --model_name_or_path meta-llama/Llama-2-13b-hf \ + --preprocessing_num_workers 16 \ + --finetuning_type full \ + --template default \ + --flash_attn auto \ + --dataset_dir data \ + --dataset alpaca_en_demo \ + --cutoff_len 1024 \ + --learning_rate 1e-6 \ + --num_train_epochs 3.0 \ + --max_samples 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --max_grad_norm 1.0 \ + --logging_steps 5 \ + --save_steps 100 \ + --warmup_steps 0 \ + --optim adamw_torch \ + --packing False \ + --report_to none \ + --use_badam True \ + --output_dir saves/LLaMA2-13B/full/BAdam \ + --plot_loss True \ + --ddp_timeout 180000000 \ + --include_num_input_tokens_seen True \ + --badam_mode layer \ + --badam_switch_mode ascending \ + --badam_switch_interval 50 \ No newline at end of file diff --git a/examples/extras/badam/train_zero3.sh b/examples/extras/badam/train_zero3.sh new file mode 100644 index 00000000..3b182134 --- /dev/null +++ b/examples/extras/badam/train_zero3.sh @@ -0,0 +1,39 @@ +#!/bin/bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +cd ../../.. 
+ +llamafactory-cli train \ + --stage sft \ + --do_train True \ + --model_name_or_path meta-llama/Llama-2-13b-hf \ + --preprocessing_num_workers 16 \ + --finetuning_type full \ + --template default \ + --flash_attn auto \ + --dataset_dir data \ + --dataset alpaca_en_demo \ + --cutoff_len 1024 \ + --learning_rate 1e-6 \ + --num_train_epochs 3.0 \ + --max_samples 100000 \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 2 \ + --lr_scheduler_type cosine \ + --max_grad_norm 1.0 \ + --logging_steps 5 \ + --save_steps 100 \ + --warmup_steps 0 \ + --optim adamw_torch \ + --packing False \ + --report_to none \ + --use_badam True \ + --output_dir saves/LLaMA2-13B/full/BAdam \ + --fp16 True \ + --plot_loss True \ + --ddp_timeout 180000000 \ + --include_num_input_tokens_seen True \ + --badam_mode layer \ + --badam_switch_mode ascending \ + --badam_switch_interval 50 \ + --deepspeed cache/ds_z3_config.json \ No newline at end of file From 372da52d4ac7264454db23b09b579a4555947cda Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 18 Jun 2024 22:08:56 +0800 Subject: [PATCH 069/160] fix #4335 Former-commit-id: 2ab449adbb160f339a0586edeb846fa311ad8382 --- src/llamafactory/chat/base_engine.py | 5 ----- src/llamafactory/chat/chat_model.py | 2 -- src/llamafactory/chat/hf_engine.py | 10 ++++------ src/llamafactory/chat/vllm_engine.py | 3 --- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/llamafactory/chat/base_engine.py b/src/llamafactory/chat/base_engine.py index 92a51ebe..ccdf4c92 100644 --- a/src/llamafactory/chat/base_engine.py +++ b/src/llamafactory/chat/base_engine.py @@ -50,11 +50,6 @@ class BaseEngine(ABC): generating_args: "GeneratingArguments", ) -> None: ... - @abstractmethod - async def start( - self, - ) -> None: ... 
- @abstractmethod async def chat( self, diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py index 2a72f422..5c83fa67 100644 --- a/src/llamafactory/chat/chat_model.py +++ b/src/llamafactory/chat/chat_model.py @@ -49,8 +49,6 @@ class ChatModel: self._loop = asyncio.new_event_loop() self._thread = Thread(target=_start_background_loop, args=(self._loop,), daemon=True) self._thread.start() - task = asyncio.run_coroutine_threadsafe(self.engine.start(), self._loop) - task.result() def chat( self, diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index a7ff7015..30200456 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -59,6 +59,7 @@ class HuggingfaceEngine(BaseEngine): self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) ) # must after fixing tokenizer to resize vocab self.generating_args = generating_args.to_dict() + self.semaphore = asyncio.Semaphore(int(os.environ.get("MAX_CONCURRENT", "1"))) @staticmethod def _process_args( @@ -259,9 +260,6 @@ class HuggingfaceEngine(BaseEngine): return scores - async def start(self) -> None: - self._semaphore = asyncio.Semaphore(int(os.environ.get("MAX_CONCURRENT", 1))) - async def chat( self, messages: Sequence[Dict[str, str]], @@ -286,7 +284,7 @@ class HuggingfaceEngine(BaseEngine): image, input_kwargs, ) - async with self._semaphore: + async with self.semaphore: with concurrent.futures.ThreadPoolExecutor() as pool: return await loop.run_in_executor(pool, self._chat, *input_args) @@ -314,7 +312,7 @@ class HuggingfaceEngine(BaseEngine): image, input_kwargs, ) - async with self._semaphore: + async with self.semaphore: with concurrent.futures.ThreadPoolExecutor() as pool: stream = self._stream_chat(*input_args) while True: @@ -333,6 +331,6 @@ class HuggingfaceEngine(BaseEngine): loop = asyncio.get_running_loop() input_args = (self.model, self.tokenizer, batch_input, input_kwargs) - async with self._semaphore: + async with self.semaphore: with concurrent.futures.ThreadPoolExecutor() as pool: return await loop.run_in_executor(pool, self._get_scores, *input_args) diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index d488a039..2626d612 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -183,9 +183,6 @@ class VllmEngine(BaseEngine): ) return result_generator - async def start(self) -> None: - pass - async def chat( self, messages: Sequence[Dict[str, str]], From c106cc24e4631c3bc313c21f4fa85309f49cc8b7 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 18 Jun 2024 22:27:24 +0800 Subject: [PATCH 070/160] Update requirements.txt Former-commit-id: da8684f9f0b0103d4fa81279343a48ecd0fcc0cd --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5c7202a5..f76524d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ accelerate>=0.30.1 peft>=0.11.1 trl>=0.8.6 gradio>=4.0.0 -pandas>=2.2.2 +pandas>=2.0.0 scipy einops sentencepiece From 4bc0bea0e9b6c5b01555f8768480b51be95f49be Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 18 Jun 2024 22:42:45 +0800 Subject: [PATCH 071/160] fix #4357 Former-commit-id: a6741bba8cebd16a6a3f97a2dc81057d0e27eb39 --- src/llamafactory/chat/hf_engine.py | 7 +++++++ src/llamafactory/hparams/parser.py | 1 + src/llamafactory/model/model_utils/rope.py | 4 ++-- 3 files changed, 10 insertions(+), 
2 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 30200456..9e60175b 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -59,6 +59,13 @@ class HuggingfaceEngine(BaseEngine): self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) ) # must after fixing tokenizer to resize vocab self.generating_args = generating_args.to_dict() + try: + asyncio.get_event_loop() + except RuntimeError: + logger.warning("There is no current event loop, creating a new one.") + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + self.semaphore = asyncio.Semaphore(int(os.environ.get("MAX_CONCURRENT", "1"))) @staticmethod diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 467fc43d..9b305016 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -353,6 +353,7 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS: if model_args.export_dir is not None and model_args.export_device == "cpu": model_args.device_map = {"": torch.device("cpu")} + model_args.model_max_length = data_args.cutoff_len else: model_args.device_map = "auto" diff --git a/src/llamafactory/model/model_utils/rope.py b/src/llamafactory/model/model_utils/rope.py index 88303c4d..4373ee19 100644 --- a/src/llamafactory/model/model_utils/rope.py +++ b/src/llamafactory/model/model_utils/rope.py @@ -39,8 +39,8 @@ def configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_ logger.warning("Current model does not support RoPE scaling.") return - if is_trainable: - if model_args.rope_scaling == "dynamic": + if model_args.model_max_length is not None: + if is_trainable and model_args.rope_scaling == "dynamic": logger.warning( "Dynamic NTK scaling may not work well with fine-tuning. 
" "See: https://github.com/huggingface/transformers/pull/24653" From 665df5d7339e4979fdb4a5c4f6e4ee6a620e27bc Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 18 Jun 2024 22:53:54 +0800 Subject: [PATCH 072/160] add deepseek coder v2 #4346 Former-commit-id: d83d3846d8e3bf5c40d4b90c24e2c5909ec61864 --- README.md | 58 ++++++++++++++-------------- README_zh.md | 58 ++++++++++++++-------------- src/llamafactory/extras/constants.py | 12 ++++++ 3 files changed, 70 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 60045118..ca9e7e1d 100644 --- a/README.md +++ b/README.md @@ -151,35 +151,35 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models -| Model | Model size | Template | -| -------------------------------------------------------- | -------------------------------- | --------- | -| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | -| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | gemma | -| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | -| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | -| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| Model | Model size | Template | +| --------------------------------------------------------- | -------------------------------- | --------- | +| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | +| 
[GLM4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | gemma | +| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | +| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | +| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | +| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. diff --git a/README_zh.md b/README_zh.md index fc824561..2ede76ba 100644 --- a/README_zh.md +++ b/README_zh.md @@ -151,35 +151,35 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 模型 -| 模型名 | 模型大小 | Template | -| -------------------------------------------------------- | -------------------------------- | --------- | -| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | -| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | gemma | -| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | -| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | -| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 
7B/13B/65B | xverse | -| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| 模型名 | 模型大小 | Template | +| --------------------------------------------------------- | -------------------------------- | --------- | +| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | +| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | gemma | +| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | +| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | +| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | +| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 73a9969d..36265c8e 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -403,6 +403,18 @@ register_model_group( DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat", DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat", }, + "DeepSeek-MoE-Coder-16B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Lite-Base", + }, + "DeepSeek-MoE-Coder-236B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Base", + }, + "DeepSeek-MoE-Coder-16B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", + }, + "DeepSeek-MoE-Coder-236B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-Coder-V2-Instruct", + }, }, template="deepseek", ) From 85f3a09c83fc59cd8523ea3f9d47201a39e42e3d Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 18 Jun 2024 23:32:18 +0800 Subject: [PATCH 073/160] tiny fix Former-commit-id: bb750fa3dde03ec024ae75596ecd4b884cb126c6 --- Dockerfile | 1 + README.md | 4 ++-- README_zh.md | 4 ++-- 3 files changed, 5 insertions(+), 4 
deletions(-) diff --git a/Dockerfile b/Dockerfile index 9c2f645c..61d58005 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,4 +43,5 @@ EXPOSE 7860 # Expose port 8000 for the API service EXPOSE 8000 +# Launch LLaMA Board CMD [ "llamafactory-cli", "webui" ] diff --git a/README.md b/README.md index 2d95583b..0e8e55f7 100644 --- a/README.md +++ b/README.md @@ -260,6 +260,8 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) - [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) @@ -270,8 +272,6 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) - [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) - [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) -- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) -- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
diff --git a/README_zh.md b/README_zh.md index ed5475d3..152cd6fa 100644 --- a/README_zh.md +++ b/README_zh.md @@ -260,6 +260,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) - [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) - [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) @@ -270,8 +272,6 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) - [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) - [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) -- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) -- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
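The Dockerfile hunk in this patch documents that the container launches LLaMA Board by default, with port 7860 exposed for the web UI and port 8000 for the API service. Below is a minimal usage sketch, assuming the image is built from the repository root on a host with the NVIDIA Container Toolkit; the image tag and port mappings are illustrative, not part of the patch.

```bash
# Build the image from the repository root (the tag name is illustrative).
docker build -t llamafactory:latest .

# Start the default command from the Dockerfile ("llamafactory-cli webui"),
# publishing the web UI port (7860) and the API port (8000).
docker run -it --gpus all -p 7860:7860 -p 8000:8000 llamafactory:latest
```

Port 8000 is exposed for the API service, which can presumably be started by overriding the default command (for example with `llamafactory-cli api`) instead of the web UI.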
From e36a994fe67b23569845479be54d0cc709be63be Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 03:23:51 +0800 Subject: [PATCH 074/160] fix tool formatter, allow parallel function #4362 Former-commit-id: b8f16c976db4ecec1cc8558851c8cbfb6a5b7e9c --- src/llamafactory/api/chat.py | 19 ++--- src/llamafactory/data/formatter.py | 120 ++++++++++++--------------- src/llamafactory/data/template.py | 16 +++- src/llamafactory/webui/chatter.py | 13 ++- tests/data/test_formatter.py | 125 +++++++++++++++++++++++++++++ 5 files changed, 207 insertions(+), 86 deletions(-) create mode 100644 tests/data/test_formatter.py diff --git a/src/llamafactory/api/chat.py b/src/llamafactory/api/chat.py index 945856cb..2c7e11e2 100644 --- a/src/llamafactory/api/chat.py +++ b/src/llamafactory/api/chat.py @@ -92,9 +92,11 @@ def _process_request( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls): - name = message.tool_calls[0].function.name - arguments = message.tool_calls[0].function.arguments - content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False) + tool_calls = [ + {"name": tool_call.function.name, "argument": tool_call.function.arguments} + for tool_call in message.tool_calls + ] + content = json.dumps(tool_calls, ensure_ascii=False) input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content}) elif isinstance(message.content, list): for input_item in message.content: @@ -118,7 +120,7 @@ def _process_request( if isinstance(tool_list, list) and len(tool_list): try: tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False) - except Exception: + except json.JSONDecodeError: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools") else: tools = None @@ -160,17 +162,16 @@ async def create_chat_completion_response( choices = [] for i, response in enumerate(responses): if tools: - result = chat_model.engine.template.format_tools.extract(response.response_text) + result = chat_model.engine.template.extract_tool(response.response_text) else: result = response.response_text if isinstance(result, list): tool_calls = [] for tool in result: - name, arguments = tool - function = Function(name=name, arguments=arguments) - tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function) - tool_calls.append(tool_call) + function = Function(name=tool[0], arguments=tool[1]) + tool_calls.append(FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function)) + response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=tool_calls) finish_reason = Finish.TOOL else: diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index fa35df5b..70be6a5a 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -22,29 +22,20 @@ from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Tuple, Uni SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]] -JSON_FORMAT_PROMPT = ( - """, in a JSON format representing the kwargs (e.g. 
```{"input": "hello world", "num_beams": 5}```)""" -) - - -TOOL_SYSTEM_PROMPT = ( +DEFAULT_TOOL_PROMPT = ( "You have access to the following tools:\n{tool_text}" "Use the following format if using a tool:\n" "```\n" "Action: tool name (one of [{tool_names}]).\n" - "Action Input: the input to the tool{format_prompt}.\n" + "Action Input: the input to the tool, in a JSON format representing the kwargs " + """(e.g. ```{{"input": "hello world", "num_beams": 5}}```).\n""" "```\n" ) -GLM4_TOOL_SUFFIX_PROMPT = ( - "在调用上述函数时,请使用 Json 格式表示调用的参数。" -) - GLM4_TOOL_PROMPT = ( - "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持," - "{tool_text}" - + "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," + "你的任务是针对用户的问题和要求提供适当的答复和支持。{tool_text}" ) @@ -73,32 +64,19 @@ def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: ) tool_names.append(tool["name"]) - return TOOL_SYSTEM_PROMPT.format( - tool_text=tool_text, tool_names=", ".join(tool_names), format_prompt=JSON_FORMAT_PROMPT - ) + return DEFAULT_TOOL_PROMPT.format(tool_text=tool_text, tool_names=", ".join(tool_names)) -def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: - tool_text = "" - for tool in tools: - tool_name = tool["name"] - tool_text += f"\n\n## {tool_name}\n\n{json.dumps(tool, ensure_ascii=False, indent=4)}\n{GLM4_TOOL_SUFFIX_PROMPT}" - return GLM4_TOOL_PROMPT.format(tool_text=tool_text) - - def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*({.*?})(?=\nAction:|\Z)", re.DOTALL) - action_match = re.findall(regex, content) + regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|$)", re.DOTALL) + action_match: List[Tuple[str, str]] = re.findall(regex, content) if not action_match: return content results = [] - for match in action_match: - tool_name, tool_input = match - tool_name = tool_name.strip() - tool_input = tool_input.strip().strip('"').strip("```") - + tool_name = match[0].strip() + tool_input = match[1].strip().strip('"').strip("```") try: arguments = json.loads(tool_input) results.append((tool_name, json.dumps(arguments, ensure_ascii=False))) @@ -108,19 +86,28 @@ def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: return results +def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: + tool_text = "" + for tool in tools: + tool_text += "\n\n## {name}\n\n{body}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( + name=tool["name"], body=json.dumps(tool, indent=4, ensure_ascii=False) + ) + + return GLM4_TOOL_PROMPT.format(tool_text=tool_text) + + def glm4_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - lines = content.strip().split("\n") - if len(lines) != 2: + if "\n" not in content: return content - tool_name = lines[0].strip() - tool_input = lines[1].strip() + + tool_name, tool_input = content.split("\n", maxsplit=1) try: arguments = json.loads(tool_input) except json.JSONDecodeError: return content + return [(tool_name, json.dumps(arguments, ensure_ascii=False))] - @dataclass class Formatter(ABC): @@ -193,22 +180,28 @@ class FunctionFormatter(Formatter): def apply(self, **kwargs) -> SLOTS: content = kwargs.pop("content") + functions: List[Tuple[str, str]] = [] try: - function = json.loads(content) - name = function["name"] - arguments = json.dumps(function["arguments"], ensure_ascii=False) - except Exception: - name, arguments = "", "" + tool_calls = json.loads(content) + if not isinstance(tool_calls, list): # parallel 
function call + tool_calls = [tool_calls] + + for tool_call in tool_calls: + functions.append((tool_call["name"], json.dumps(tool_call["arguments"], ensure_ascii=False))) + + except json.JSONDecodeError: + functions = [] elements = [] - for slot in self.slots: - if isinstance(slot, str): - slot = slot.replace("{{name}}", name).replace("{{arguments}}", arguments) - elements.append(slot) - elif isinstance(slot, (dict, set)): - elements.append(slot) - else: - raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot))) + for name, arguments in functions: + for slot in self.slots: + if isinstance(slot, str): + slot = slot.replace("{{name}}", name).replace("{{arguments}}", arguments) + elements.append(slot) + elif isinstance(slot, (dict, set)): + elements.append(slot) + else: + raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot))) return elements @@ -216,29 +209,22 @@ class FunctionFormatter(Formatter): @dataclass class ToolFormatter(Formatter): def __post_init__(self): - if self.tool_format is None: + if self.tool_format == "default": + self._tool_formatter = default_tool_formatter + self._tool_extractor = default_tool_extractor + elif self.tool_format == "glm4": + self._tool_formatter = glm4_tool_formatter + self._tool_extractor = glm4_tool_extractor + else: raise ValueError("Tool format was not found.") def apply(self, **kwargs) -> SLOTS: content = kwargs.pop("content") try: tools = json.loads(content) - if not len(tools): - return [""] - - if self.tool_format == "default": - return [default_tool_formatter(tools)] - elif self.tool_format == "glm4": - return [glm4_tool_formatter(tools)] - else: - raise NotImplementedError - except Exception: + return [self._tool_formatter(tools) if len(tools) != 0 else ""] + except json.JSONDecodeError: return [""] def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]: - if self.tool_format == "default": - return default_tool_extractor(content) - elif self.tool_format == "glm4": - return glm4_tool_extractor(content) - else: - raise NotImplementedError + return self._tool_extractor(content) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index d97699b0..77694c59 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -79,6 +79,12 @@ class Template: """ return self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len) + def extract_tool(self, content: str) -> Union[str, List[Tuple[str, str]]]: + r""" + Extracts tool message. 
+ """ + return self.format_tools.extract(content) + def _encode( self, tokenizer: "PreTrainedTokenizer", @@ -100,7 +106,8 @@ class Template: if i == 0 and (system or tools or self.force_system): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" elements += self.format_system.apply(content=(system + tool_text)) - elif i > 0 and i % 2 == 0: + + if i > 0 and i % 2 == 0: elements += self.format_separator.apply() if message["role"] == Role.USER.value: @@ -191,7 +198,8 @@ class Llama2Template(Template): if i == 0 and (system or tools or self.force_system): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" system_text = self.format_system.apply(content=(system + tool_text))[0] - elif i > 0 and i % 2 == 0: + + if i > 0 and i % 2 == 0: elements += self.format_separator.apply() if message["role"] == Role.USER.value: @@ -259,7 +267,9 @@ def _register_template( template_class = Llama2Template if name.startswith("llama2") else Template default_user_formatter = StringFormatter(slots=["{{content}}"]) default_assistant_formatter = StringFormatter(slots=["{{content}}"] + eos_slots) - default_function_formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots) + default_function_formatter = FunctionFormatter( + slots=["Action: {{name}}\nAction Input: {{arguments}}\n"] + eos_slots + ) default_tool_formatter = ToolFormatter(tool_format="default") default_separator_formatter = EmptyFormatter() TEMPLATES[name] = template_class( diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py index 864c41c7..a2b54dce 100644 --- a/src/llamafactory/webui/chatter.py +++ b/src/llamafactory/webui/chatter.py @@ -140,16 +140,15 @@ class WebChatModel(ChatModel): ): response += new_text if tools: - result = self.engine.template.format_tools.extract(response) + result = self.engine.template.extract_tool(response) else: result = response - if isinstance(result, tuple): - name, arguments = result - arguments = json.loads(arguments) - tool_call = json.dumps({"name": name, "arguments": arguments}, ensure_ascii=False) - output_messages = messages + [{"role": Role.FUNCTION.value, "content": tool_call}] - bot_text = "```json\n" + tool_call + "\n```" + if isinstance(result, list): + tool_calls = [{"name": tool[0], "arguments": json.loads(tool[1])} for tool in result] + tool_calls = json.dumps(tool_calls, indent=4, ensure_ascii=False) + output_messages = messages + [{"role": Role.FUNCTION.value, "content": tool_calls}] + bot_text = "```json\n" + tool_calls + "\n```" else: output_messages = messages + [{"role": Role.ASSISTANT.value, "content": result}] bot_text = result diff --git a/tests/data/test_formatter.py b/tests/data/test_formatter.py new file mode 100644 index 00000000..430eb0e6 --- /dev/null +++ b/tests/data/test_formatter.py @@ -0,0 +1,125 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json + +from llamafactory.data.formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter + + +def test_empty_formatter(): + formatter = EmptyFormatter(slots=["\n"]) + assert formatter.apply() == ["\n"] + + +def test_string_formatter(): + formatter = StringFormatter(slots=["", "Human: {{content}}\nAssistant:"]) + assert formatter.apply(content="Hi") == ["", "Human: Hi\nAssistant:"] + + +def test_function_formatter(): + formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}\n"]) + tool_calls = json.dumps({"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}) + assert formatter.apply(content=tool_calls) == [ + """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""" + ] + + +def test_multi_function_formatter(): + formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}\n"]) + tool_calls = json.dumps([{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}] * 2) + assert formatter.apply(content=tool_calls) == [ + """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""", + """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""", + ] + + +def test_default_tool_formatter(): + formatter = ToolFormatter(tool_format="default") + tools = [ + { + "name": "test_tool", + "description": "tool_desc", + "parameters": { + "type": "object", + "properties": { + "foo": {"type": "string", "description": "foo_desc"}, + "bar": {"type": "number", "description": "bar_desc"}, + }, + "required": ["foo"], + }, + } + ] + assert formatter.apply(content=json.dumps(tools)) == [ + "You have access to the following tools:\n" + "> Tool Name: test_tool\n" + "Tool Description: tool_desc\n" + "Tool Args:\n" + " - foo (string, required): foo_desc\n" + " - bar (number): bar_desc\n\n" + "Use the following format if using a tool:\n" + "```\n" + "Action: tool name (one of [test_tool]).\n" + "Action Input: the input to the tool, in a JSON format representing the kwargs " + """(e.g. 
```{"input": "hello world", "num_beams": 5}```).\n""" + "```\n" + ] + + +def test_default_tool_extractor(): + formatter = ToolFormatter(tool_format="default") + result = """Action: test_tool\nAction Input: {"foo": "bar", "size": 10}\n""" + assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")] + + +def test_default_multi_tool_extractor(): + formatter = ToolFormatter(tool_format="default") + result = ( + """Action: test_tool\nAction Input: {"foo": "bar", "size": 10}\n""" + """Action: another_tool\nAction Input: {"foo": "job", "size": 2}\n""" + ) + assert formatter.extract(result) == [ + ("test_tool", """{"foo": "bar", "size": 10}"""), + ("another_tool", """{"foo": "job", "size": 2}"""), + ] + + +def test_glm4_tool_formatter(): + formatter = ToolFormatter(tool_format="glm4") + tools = [ + { + "name": "test_tool", + "description": "tool_desc", + "parameters": { + "type": "object", + "properties": { + "foo": {"type": "string", "description": "foo_desc"}, + "bar": {"type": "number", "description": "bar_desc"}, + }, + "required": ["foo"], + }, + } + ] + assert formatter.apply(content=json.dumps(tools)) == [ + "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," + "你的任务是针对用户的问题和要求提供适当的答复和支持。" + "\n\n## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( + json.dumps(tools[0], indent=4) + ) + ] + + +def test_glm4_tool_extractor(): + formatter = ToolFormatter(tool_format="glm4") + result = """test_tool\n{"foo": "bar", "size": 10}\n""" + assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")] From 1ca639a77748b842e279f2baf2c8f301aa52272e Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 03:39:52 +0800 Subject: [PATCH 075/160] use prefix to replace force system Former-commit-id: 731d9a964f1c3dbfb83825524d697831e691fb9d --- src/llamafactory/data/template.py | 75 +++++++++++++------------------ 1 file changed, 30 insertions(+), 45 deletions(-) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 77694c59..a12e9c88 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -38,12 +38,12 @@ class Template: format_observation: "Formatter" format_tools: "Formatter" format_separator: "Formatter" + format_prefix: "Formatter" default_system: str stop_words: List[str] image_token: str efficient_eos: bool replace_eos: bool - force_system: bool def encode_oneturn( self, @@ -102,8 +102,9 @@ class Template: system = system or self.default_system encoded_messages = [] for i, message in enumerate(messages): - elements = [] - if i == 0 and (system or tools or self.force_system): + elements = self.format_prefix.apply() + + if i == 0 and (system or tools): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" elements += self.format_system.apply(content=(system + tool_text)) @@ -193,9 +194,10 @@ class Llama2Template(Template): system = system or self.default_system encoded_messages = [] for i, message in enumerate(messages): - elements = [] + elements = self.format_prefix.apply() + system_text = "" - if i == 0 and (system or tools or self.force_system): + if i == 0 and (system or tools): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" system_text = self.format_system.apply(content=(system + tool_text))[0] @@ -230,12 +232,12 @@ def _register_template( format_observation: Optional["Formatter"] = None, format_tools: Optional["Formatter"] = None, format_separator: Optional["Formatter"] = None, + format_prefix: 
Optional["Formatter"] = None, default_system: str = "", stop_words: List[str] = [], image_token: str = "", efficient_eos: bool = False, replace_eos: bool = False, - force_system: bool = False, ) -> None: r""" Registers a chat template. @@ -272,6 +274,7 @@ def _register_template( ) default_tool_formatter = ToolFormatter(tool_format="default") default_separator_formatter = EmptyFormatter() + default_prefix_formatter = EmptyFormatter() TEMPLATES[name] = template_class( format_user=format_user or default_user_formatter, format_assistant=format_assistant or default_assistant_formatter, @@ -280,12 +283,12 @@ def _register_template( format_observation=format_observation or format_user or default_user_formatter, format_tools=format_tools or default_tool_formatter, format_separator=format_separator or default_separator_formatter, + format_prefix=format_prefix or default_prefix_formatter, default_system=default_system, stop_words=stop_words, image_token=image_token, efficient_eos=efficient_eos, replace_eos=replace_eos, - force_system=force_system, ) @@ -329,7 +332,7 @@ def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", pl def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") -> str: - jinja_template = "" + jinja_template = _convert_slots_to_jinja(template.format_prefix.apply(), tokenizer) if template.default_system: jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}" @@ -339,11 +342,7 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") ) system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message") - if isinstance(template, Llama2Template): - pass - elif template.force_system: - jinja_template += "{{ " + system_message + " }}" - else: + if not isinstance(template, Llama2Template): jinja_template += "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}" jinja_template += "{% for message in messages %}" @@ -459,9 +458,8 @@ _register_template( _register_template( name="belle", format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), format_separator=EmptyFormatter(slots=["\n\n"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -486,10 +484,9 @@ _register_template( _register_template( name="chatglm2", format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]), - format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), format_separator=EmptyFormatter(slots=["\n\n"]), + format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), efficient_eos=True, - force_system=True, ) @@ -497,14 +494,14 @@ _register_template( name="chatglm3", format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), - format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), + format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter( slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] ), + format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, - 
force_system=True, ) @@ -512,13 +509,12 @@ _register_template( name="chatglm3_system", format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), - format_system=StringFormatter( - slots=[{"token": "[gMASK]"}, {"token": "sop"}, {"token": "<|system|>"}, "\n", "{{content}}"] - ), + format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter( slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] ), + format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), default_system=( "You are ChatGLM3, a large language model trained by Zhipu.AI. " "Follow the user's instructions carefully. Respond using markdown." @@ -553,8 +549,7 @@ _register_template( _register_template( name="codegeex2", - format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), ) @@ -581,8 +576,7 @@ _register_template( _register_template( name="cpm", format_user=StringFormatter(slots=["<用户>{{content}}"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -615,8 +609,7 @@ _register_template( _register_template( name="deepseek", format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -648,9 +641,8 @@ _register_template( name="empty", format_user=StringFormatter(slots=["{{content}}"]), format_assistant=StringFormatter(slots=["{{content}}"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, - force_system=True, ) @@ -672,13 +664,12 @@ _register_template( _register_template( name="gemma", format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), format_observation=StringFormatter( slots=["tool\n{{content}}\nmodel\n"] ), format_separator=EmptyFormatter(slots=["\n"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, - force_system=True, ) @@ -686,13 +677,13 @@ _register_template( name="glm4", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), format_assistant=StringFormatter(slots=["\n{{content}}"]), - format_system=StringFormatter(slots=["[gMASK]<|system|>\n{{content}}"]), + format_system=StringFormatter(slots=["<|system|>\n{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), format_tools=ToolFormatter(tool_format="glm4"), + format_prefix=EmptyFormatter(slots=["[gMASK]"]), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, - force_system=True, ) @@ -768,24 +759,21 @@ _register_template( _register_template( name="mistral", format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) _register_template( name="olmo", 
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]), - format_system=StringFormatter(slots=[{"eos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"eos_token"}]), ) _register_template( name="openchat", format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -799,18 +787,16 @@ _register_template( ) ] ), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=["<|eot_id|>"], replace_eos=True, - force_system=True, ) _register_template( name="orion", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - force_system=True, + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -852,7 +838,6 @@ _register_template( format_separator=EmptyFormatter(slots=["\n"]), stop_words=["<|end|>"], replace_eos=True, - force_system=True, ) From 235ed85b0f2fff6867f3d3528b367d6478405106 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 03:49:23 +0800 Subject: [PATCH 076/160] fix bug Former-commit-id: 412139eaa2fde98ba19e1257d21144382a59f0d6 --- src/llamafactory/data/template.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index a12e9c88..c9af9605 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -102,7 +102,10 @@ class Template: system = system or self.default_system encoded_messages = [] for i, message in enumerate(messages): - elements = self.format_prefix.apply() + elements = [] + + if i == 0: + elements += self.format_prefix.apply() if i == 0 and (system or tools): tool_text = self.format_tools.apply(content=tools)[0] if tools else "" @@ -194,7 +197,10 @@ class Llama2Template(Template): system = system or self.default_system encoded_messages = [] for i, message in enumerate(messages): - elements = self.format_prefix.apply() + elements = [] + + if i == 0: + elements += self.format_prefix.apply() system_text = "" if i == 0 and (system or tools): From fa3150548ed1eb5b3071cc0715d2c0a473c0075f Mon Sep 17 00:00:00 2001 From: Jonery Date: Wed, 19 Jun 2024 12:29:40 +0800 Subject: [PATCH 077/160] Cleaner integration. 
Former-commit-id: 26d4b05d424bd71f570195dd433258caf6465d92 --- src/llamafactory/hparams/parser.py | 7 ++----- src/llamafactory/train/dpo/trainer.py | 12 +++--------- src/llamafactory/train/kto/trainer.py | 12 +++--------- src/llamafactory/train/ppo/trainer.py | 12 +++--------- src/llamafactory/train/pt/trainer.py | 12 +++--------- src/llamafactory/train/rm/trainer.py | 12 +++--------- src/llamafactory/train/sft/trainer.py | 11 +++-------- src/llamafactory/train/trainer_utils.py | 10 ++++------ 8 files changed, 24 insertions(+), 64 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 680559ac..c7c17ae0 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -215,11 +215,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): if finetuning_args.badam_mode == "ratio": raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer") - if (finetuning_args.badam_mode == "layer" - and training_args.deepspeed_plugin is not None - and training_args.deepspeed_plugin.zero_stage < 3 - ): - raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage, got stage {training_args.deepspeed_plugin.zero_stage}") + if finetuning_args.badam_mode == "layer" and (not is_deepspeed_zero3_enabled()): + raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage.") if (finetuning_args.use_galore) and training_args.deepspeed is not None: raise ValueError("GaLore are incompatible with DeepSpeed yet.") diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index 284bf41a..a3e0e961 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -96,15 +96,9 @@ class CustomDPOTrainer(DPOTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index d8b609e0..0d50987f 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -91,15 +91,9 @@ class CustomKTOTrainer(KTOTrainer): self.ref_model.eval() if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if 
self.optimizer is None: diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 5a18cbaa..f81831e6 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -166,15 +166,9 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True) if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None: r""" diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index 1e5e9f6a..d3516b41 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -48,15 +48,9 @@ class CustomTrainer(Trainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 5d0e6263..433251cf 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -72,15 +72,9 @@ class PairwiseTrainer(Trainer): self.processor = processor self.can_return_loss = True # override property to return eval_loss if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 9446d245..45799b96 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -56,14 +56,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - 
self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - if (self.args.deepspeed_plugin is not None - and self.args.deepspeed_plugin.zero_stage == 3 - ): - from badam.utils import BAdamZeRO3Callback - self.callback_handler.add_callback(BAdamZeRO3Callback) + from badam import clip_grad_norm_old_version, BAdamCallback + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) + self.callback_handler.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 98c38842..b57f5a6e 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -371,11 +371,8 @@ def _create_badam_optimizer( dict(params=decay_params, weight_decay=training_args.weight_decay), ] - ds_zero3_enabled = False - if hasattr(training_args, "deepspeed_plugin") and training_args.deepspeed_plugin is not None: - assert training_args.deepspeed_plugin.zero_stage == 3, f"BAdam only supports deepspeed ZeRO-3 stage, got {training_args.deepspeed_plugin.zero_stage}" - assert finetuning_args.badam_mode == "layer", "BAdam only supports layer-wise update in ZeRO-3 stage" - ds_zero3_enabled = True + from transformers.integrations import is_deepspeed_zero3_enabled + ds_zero3_enabled = is_deepspeed_zero3_enabled() if finetuning_args.badam_mode == "layer": from badam import BlockOptimizer @@ -400,6 +397,7 @@ def _create_badam_optimizer( elif finetuning_args.badam_mode == "ratio": from badam import BlockOptimizerRatio + assert not ds_zero3_enabled, "BAdam with ratio-based update does not support Deepspeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer." 
assert finetuning_args.badam_update_ratio > 1e-6 optimizer = BlockOptimizerRatio( param_groups=param_groups, @@ -411,7 +409,7 @@ def _create_badam_optimizer( **optim_kwargs, ) logger.info( - f"Using BAdam optimizer with ratio-wise update, update ratio is {finetuning_args.badam_update_ratio}, " + f"Using BAdam optimizer with ratio-based update, update ratio is {finetuning_args.badam_update_ratio}, " f"mask mode is {finetuning_args.badam_mask_mode}" ) From 3e0fa4a8dad19e0ead5c288d607dc6e9a60b6ff0 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 17:44:05 +0800 Subject: [PATCH 078/160] fix templates Former-commit-id: 6f357d59b73309c5955683008632e7f320e7dcb1 --- src/llamafactory/data/formatter.py | 2 +- src/llamafactory/data/template.py | 48 ++++++++----------------- src/llamafactory/train/trainer_utils.py | 3 +- 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 70be6a5a..ed9ba8b8 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -68,7 +68,7 @@ def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|$)", re.DOTALL) + regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|\s*$)", re.DOTALL) action_match: List[Tuple[str, str]] = re.findall(regex, content) if not action_match: return content diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index c9af9605..3c4bc5ec 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -478,11 +478,7 @@ _register_template( _register_template( name="breeze", format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]), - format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), - default_system=( - "You are a helpful AI assistant built by MediaTek Research. " - "The user you are helping speaks Traditional Chinese and comes from Taiwan." - ), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, ) @@ -569,13 +565,8 @@ _register_template( ) ] ), - format_system=StringFormatter( - slots=[{"bos_token"}, "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"] - ), - default_system=( - "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users " - "by providing thorough responses. You are trained by Cohere." 
- ), + format_system=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), ) @@ -645,8 +636,6 @@ _register_template( _register_template( name="empty", - format_user=StringFormatter(slots=["{{content}}"]), - format_assistant=StringFormatter(slots=["{{content}}"]), format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, ) @@ -695,25 +684,21 @@ _register_template( _register_template( name="intern", - format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]), - format_separator=EmptyFormatter(slots=[{"token": ""}, "\n"]), + format_user=StringFormatter(slots=["<|User|>:{{content}}\n<|Bot|>:"]), + format_system=StringFormatter(slots=["<|System|>:{{content}}\n"]), + format_separator=EmptyFormatter(slots=["\n"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=[""], - efficient_eos=True, + efficient_eos=True, # internlm tokenizer cannot set eos_token_id ) _register_template( name="intern2", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), - format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]), - format_separator=EmptyFormatter(slots=["\n"]), - default_system=( - "You are an AI assistant whose name is InternLM (书生·浦语).\n" - "- InternLM (书生·浦语) is a conversational language model that is developed " - "by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n" - "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen " - "by the user such as English and 中文." - ), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_separator=EmptyFormatter(slots=["<|im_end|>\n"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=["<|im_end|>"], efficient_eos=True, # internlm2 tokenizer cannot set eos_token_id ) @@ -722,7 +707,6 @@ _register_template( _register_template( name="llama2", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), - format_assistant=StringFormatter(slots=[" {{content}} ", {"eos_token"}]), format_system=StringFormatter(slots=["<>\n{{content}}\n<>\n\n"]), ) @@ -745,9 +729,7 @@ _register_template( ) ] ), - format_system=StringFormatter( - slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"] - ), + format_system=StringFormatter(slots=["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]), format_observation=StringFormatter( slots=[ ( @@ -756,7 +738,7 @@ _register_template( ) ] ), - default_system="You are a helpful assistant.", + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=["<|eot_id|>"], replace_eos=True, ) @@ -809,9 +791,9 @@ _register_template( _register_template( name="phi", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]), - format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]), + format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]), format_separator=EmptyFormatter(slots=["\n"]), - default_system="You are a helpful AI assistant.", + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=["<|end|>"], replace_eos=True, ) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 9052c96d..c1b90155 100644 --- a/src/llamafactory/train/trainer_utils.py +++ 
b/src/llamafactory/train/trainer_utils.py @@ -198,6 +198,7 @@ def convert_pissa_adapter( safe_serialization=training_args.save_safetensors, ) setattr(unwrapped_model.peft_config["default"], "init_lora_weights", init_lora_weights) + elif output_dir == training_args.output_dir: # at the end of training logger.info("Converted PiSSA adapter will be saved at: {}.".format(output_dir)) unwrapped_model = accelerator.unwrap_model(model) @@ -233,7 +234,7 @@ def _create_galore_optimizer( finetuning_args: "FinetuningArguments", ) -> "torch.optim.Optimizer": if len(finetuning_args.galore_target) == 1 and finetuning_args.galore_target[0] == "all": - galore_targets = find_all_linear_modules(model) + galore_targets = find_all_linear_modules(model, finetuning_args.freeze_vision_tower) else: galore_targets = finetuning_args.galore_target From c65f7e9bd5b204d143eb7768a87857d619c92017 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 20:03:50 +0800 Subject: [PATCH 079/160] fix jinja template Former-commit-id: 0ebf2e2ee23918d28b0cbb20ba456732d6eedfbb --- src/llamafactory/data/template.py | 6 +++++- tests/data/test_supervised.py | 9 +++++--- tests/data/test_template.py | 35 +++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 tests/data/test_template.py diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 3c4bc5ec..393ebfea 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -338,7 +338,11 @@ def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", pl def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") -> str: - jinja_template = _convert_slots_to_jinja(template.format_prefix.apply(), tokenizer) + jinja_template = "" + + prefix = _convert_slots_to_jinja(template.format_prefix.apply(), tokenizer) + if prefix: + jinja_template += "{{ " + prefix + " }}" if template.default_system: jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}" diff --git a/tests/data/test_supervised.py b/tests/data/test_supervised.py index 9f7b2dbf..9cb49615 100644 --- a/tests/data/test_supervised.py +++ b/tests/data/test_supervised.py @@ -17,6 +17,7 @@ import random import pytest from datasets import load_dataset +from transformers import AutoTokenizer from llamafactory.data import get_dataset from llamafactory.hparams import get_train_args @@ -48,10 +49,11 @@ def test_supervised(num_samples: int): tokenizer = tokenizer_module["tokenizer"] tokenized_data = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module) + ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) + original_data = load_dataset(TRAIN_ARGS["dataset"], split="train") indexes = random.choices(range(len(original_data)), k=num_samples) for index in indexes: - decoded_result = tokenizer.decode(tokenized_data["input_ids"][index]) prompt = original_data[index]["instruction"] if original_data[index]["input"]: prompt += "\n" + original_data[index]["input"] @@ -60,5 +62,6 @@ def test_supervised(num_samples: int): {"role": "user", "content": prompt}, {"role": "assistant", "content": original_data[index]["output"]}, ] - templated_result = tokenizer.apply_chat_template(messages, tokenize=False) - assert decoded_result == templated_result + templated_result = ref_tokenizer.apply_chat_template(messages, tokenize=False) + decoded_result = tokenizer.decode(tokenized_data["input_ids"][index]) + assert templated_result == 
decoded_result diff --git a/tests/data/test_template.py b/tests/data/test_template.py new file mode 100644 index 00000000..9d73c116 --- /dev/null +++ b/tests/data/test_template.py @@ -0,0 +1,35 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from transformers import AutoTokenizer + +from llamafactory.data import get_template_and_fix_tokenizer + + +TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") + + +def test_jinja_template(): + tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) + ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) + get_template_and_fix_tokenizer(tokenizer, name="llama3") + assert tokenizer.chat_template != ref_tokenizer.chat_template + + messages = [ + {"role": "user", "content": "hi!"}, + {"role": "assistant", "content": "hello there"}, + ] + assert tokenizer.apply_chat_template(messages) == ref_tokenizer.apply_chat_template(messages) From b631bdc5b7b50bab61c479ad6ec0383b04d1dc2e Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 20:42:09 +0800 Subject: [PATCH 080/160] release v0.8.2 Former-commit-id: 3050bbe51d46acd8473275d2713fc28932e4a3d3 --- .github/workflows/publish.yml | 39 ++++++++++++++++++++++++++++++++++ src/llamafactory/extras/env.py | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 00000000..bb39be3a --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,39 @@ +name: publish + +on: + release: + types: [published] + +jobs: + publish: + name: Upload release to PyPI + + runs-on: ubuntu-latest + + environment: + name: release + url: https://pypi.org/p/llamafactory + + permissions: + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.8" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install build + + - name: Build package + run: | + python -m build + + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index 586c24c0..ee224148 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -23,7 +23,7 @@ import trl from transformers.utils import is_torch_cuda_available, is_torch_npu_available -VERSION = "0.8.2.dev0" +VERSION = "0.8.2" def print_env() -> None: From f0bff183245ae93ac6caff5515f711f758bb1aec Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 20:46:33 +0800 Subject: [PATCH 081/160] Update publish.yml Former-commit-id: 60b0633e29c9e701aa3813bd1fdc0282bd07f7c8 --- .github/workflows/publish.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index bb39be3a..15c7153e 100644 --- 
a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -2,7 +2,8 @@ name: publish on: release: - types: [published] + types: + - published jobs: publish: From a7d7f79855b0f89557ff74463cbcced8bd43e9a3 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 21:08:16 +0800 Subject: [PATCH 082/160] set dev version Former-commit-id: 221665345d97f839ce4ba8d54643da30c71b6083 --- src/llamafactory/extras/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index ee224148..ab387231 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -23,7 +23,7 @@ import trl from transformers.utils import is_torch_cuda_available, is_torch_npu_available -VERSION = "0.8.2" +VERSION = "0.8.3.dev0" def print_env() -> None: From 5f5d4c1923b7477d7497c88cdf92f7649f6cf386 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 21:27:00 +0800 Subject: [PATCH 083/160] update patcher Former-commit-id: afb365e515d615dd62f791622450debab60ce5cc --- src/llamafactory/model/model_utils/checkpointing.py | 10 ++++------ src/llamafactory/model/patcher.py | 5 +++++ tests/model/model_utils/test_checkpointing.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py index f5314125..f4f3d8a5 100644 --- a/src/llamafactory/model/model_utils/checkpointing.py +++ b/src/llamafactory/model/model_utils/checkpointing.py @@ -78,9 +78,7 @@ def _fp32_forward_post_hook( return output.to(torch.float32) -def prepare_model_for_training( - model: "PreTrainedModel", model_args: "ModelArguments", output_layer_name: str = "lm_head" -) -> None: +def prepare_model_for_training(model: "PreTrainedModel", model_args: "ModelArguments") -> None: r""" Includes: (1) cast the layernorm in fp32 @@ -104,8 +102,8 @@ def prepare_model_for_training( setattr(model.config, "use_cache", False) # turn off when gradient checkpointing is enabled logger.info("Gradient checkpointing enabled.") - if hasattr(model, output_layer_name) and model_args.upcast_lmhead_output: - logger.info("Upcasting lm_head outputs in float32.") - output_layer = getattr(model, output_layer_name) + if model_args.upcast_lmhead_output: + output_layer = model.get_output_embeddings() if isinstance(output_layer, torch.nn.Linear) and output_layer.weight.dtype != torch.float32: + logger.info("Upcasting lm_head outputs in float32.") output_layer.register_forward_hook(_fp32_forward_post_hook) diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 8fa17d08..a53fde98 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -152,6 +152,10 @@ def patch_valuehead_model(model: "AutoModelForCausalLMWithValueHead") -> None: if isinstance(self.pretrained_model, PreTrainedModel): return self.pretrained_model.get_input_embeddings() + def get_output_embeddings(self: "AutoModelForCausalLMWithValueHead") -> torch.nn.Module: + if isinstance(self.pretrained_model, PreTrainedModel): + return self.pretrained_model.get_output_embeddings() + def create_or_update_model_card(self: "AutoModelForCausalLMWithValueHead", output_dir: str) -> None: if isinstance(self.pretrained_model, PeftModel): self.pretrained_model.create_or_update_model_card(output_dir) @@ -160,4 +164,5 @@ def patch_valuehead_model(model: "AutoModelForCausalLMWithValueHead") -> None: setattr(model, 
"_keys_to_ignore_on_save", ignore_modules) setattr(model, "tie_weights", MethodType(tie_weights, model)) setattr(model, "get_input_embeddings", MethodType(get_input_embeddings, model)) + setattr(model, "get_output_embeddings", MethodType(get_output_embeddings, model)) setattr(model, "create_or_update_model_card", MethodType(create_or_update_model_card, model)) diff --git a/tests/model/model_utils/test_checkpointing.py b/tests/model/model_utils/test_checkpointing.py index 670e693d..9b6dfc9e 100644 --- a/tests/model/model_utils/test_checkpointing.py +++ b/tests/model/model_utils/test_checkpointing.py @@ -70,5 +70,5 @@ def test_upcast_lmhead_output(): tokenizer_module = load_tokenizer(model_args) model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True) inputs = torch.randn((1, 16), dtype=torch.float16, device=get_current_device()) - outputs: "torch.Tensor" = model.lm_head(inputs) + outputs: "torch.Tensor" = model.get_output_embeddings()(inputs) assert outputs.dtype == torch.float32 From b2f5c0e0db51ae1814615c11aaaec4def9cd2d83 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 23:22:28 +0800 Subject: [PATCH 084/160] fix llamaboard abort Former-commit-id: 9ef609a2c0185040e531dea3829a6f481539cdea --- src/llamafactory/webui/runner.py | 4 ++-- src/llamafactory/webui/utils.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 13dbba03..6cd21b07 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -24,7 +24,7 @@ from ..extras.misc import is_gpu_or_npu_available, torch_gc from ..extras.packages import is_gradio_available from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir, load_config from .locales import ALERTS, LOCALES -from .utils import abort_leaf_process, gen_cmd, get_eval_results, get_trainer_info, load_args, save_args, save_cmd +from .utils import abort_process, gen_cmd, get_eval_results, get_trainer_info, load_args, save_args, save_cmd if is_gradio_available(): @@ -52,7 +52,7 @@ class Runner: def set_abort(self) -> None: self.aborted = True if self.trainer is not None: - abort_leaf_process(self.trainer.pid) + abort_process(self.trainer.pid) def _initialize(self, data: Dict["Component", Any], do_train: bool, from_preview: bool) -> str: get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)] diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index 6ce2a8e7..a616bcba 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -33,16 +33,16 @@ if is_gradio_available(): import gradio as gr -def abort_leaf_process(pid: int) -> None: +def abort_process(pid: int) -> None: r""" - Aborts the leaf processes. + Aborts the processes recursively in a bottom-up way. 
""" children = psutil.Process(pid).children() if children: for child in children: - abort_leaf_process(child.pid) - else: - os.kill(pid, signal.SIGABRT) + abort_process(child.pid) + + os.kill(pid, signal.SIGABRT) def can_quantize(finetuning_type: str) -> "gr.Dropdown": From 0edccc11a5856cdfc76a1952e6d72626b96cab48 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 19 Jun 2024 23:46:03 +0800 Subject: [PATCH 085/160] improve llamaboard Former-commit-id: e606ab35c0eced667dde7137c2d72848f264c96c --- src/llamafactory/webui/components/top.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 2515a83d..09d43ac8 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -50,9 +50,13 @@ def create_top() -> Dict[str, "Component"]: visual_inputs = gr.Checkbox(scale=1) model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False) - model_name.input(save_config, inputs=[lang, model_name], queue=False) + model_name.input(save_config, inputs=[lang, model_name], queue=False).then( + list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False + ) model_path.input(save_config, inputs=[lang, model_name, model_path], queue=False) - finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False) + finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False).then( + list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False + ) checkpoint_path.focus(list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False) return dict( From b232552d425a6c782f0c5310fbc1725b531fa7e1 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Thu, 20 Jun 2024 02:09:47 +0000 Subject: [PATCH 086/160] update dependencies Former-commit-id: 25164273d1ca7a8f6f99b41279e342906f6bc4d5 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f76524d8..7380add4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ matplotlib>=3.7.0 fire packaging pyyaml +numpy<2.0.0 From af2cb33bb219feb0a9fc2595cb41f054196481a7 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 20 Jun 2024 22:56:05 +0800 Subject: [PATCH 087/160] tiny fix Former-commit-id: 2d8d47f6126d68db1701ed18fc31310c6f14dd49 --- src/llamafactory/hparams/parser.py | 3 +++ src/llamafactory/model/adapter.py | 17 ++++++++--------- src/llamafactory/model/patcher.py | 4 ++-- src/llamafactory/webui/components/top.py | 4 ++-- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 9b305016..a593bf45 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -199,6 +199,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if not is_torch_bf16_gpu_available(): raise ValueError("This device does not support `pure_bf16`.") + if training_args.deepspeed: + raise ValueError("`pure_bf16` is incompatible with DeepSpeed.") + if training_args.fp16 or training_args.bf16: raise ValueError("Turn off mixed precision training when using `pure_bf16`.") diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index 34518878..7caef9cc 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -289,16 +289,15 @@ 
def init_adapter( raise ValueError("Cannot initialize PiSSA adapter on quantized models.") # cast trainable parameters to float32 if: - # 1. is_trainable and quantization_bit is not None (qlora) - # 2. is_trainable and not deepspeed zero3 and not fsdp (zero3 or fsdp already in float32) - # 3. is_trainable and not pure_bf16 and not badam + # 1. is_trainable and not pure_bf16 and not badam and quantization_bit is not None (qlora) + # 2. is_trainable and not pure_bf16 and not badam and not zero3 and not fsdp (zero3 or fsdp already in fp32) + cast_trainable_params_to_fp32 = False if not is_trainable: - cast_trainable_params_to_fp32 = False - elif model_args.quantization_bit is None and ( - is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam - ): - logger.info("ZeRO3/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.") - cast_trainable_params_to_fp32 = False + pass + elif finetuning_args.pure_bf16 or finetuning_args.use_badam: + logger.info("Pure bf16 / BAdam detected, remaining trainable params in half precision.") + elif model_args.quantization_bit is None and (is_deepspeed_zero3_enabled() or is_fsdp_enabled()): + logger.info("ZeRO3 / FSDP detected, remaining trainable params in float32.") else: logger.info("Upcasting trainable params to float32.") cast_trainable_params_to_fp32 = True diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index a53fde98..35153649 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -91,8 +91,8 @@ def patch_config( # cast data type of the model if: # 1. not deepspeed zero3 and not fsdp (keep zero3 or fsdp in float32) - # 2. fsdp + qlora - if model_args.quantization_bit is not None or (not is_deepspeed_zero3_enabled() and not is_fsdp_enabled()): + # 2. 
quantization_bit is not None (qlora) + if (not is_deepspeed_zero3_enabled() and not is_fsdp_enabled()) or model_args.quantization_bit is not None: init_kwargs["torch_dtype"] = model_args.compute_dtype if init_kwargs["low_cpu_mem_usage"]: # device map requires low_cpu_mem_usage=True diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 09d43ac8..18b9a7d2 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -49,10 +49,10 @@ def create_top() -> Dict[str, "Component"]: booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none", scale=3) visual_inputs = gr.Checkbox(scale=1) - model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False) - model_name.input(save_config, inputs=[lang, model_name], queue=False).then( + model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False).then( list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False ) + model_name.input(save_config, inputs=[lang, model_name], queue=False) model_path.input(save_config, inputs=[lang, model_name, model_path], queue=False) finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False).then( list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False From 05abe47c8bc2eb4af0b7431d7e6ef92a1ab43371 Mon Sep 17 00:00:00 2001 From: Erich Schubert Date: Fri, 21 Jun 2024 09:14:21 +0200 Subject: [PATCH 088/160] Print help if no arguments given Former-commit-id: 08dfb7ec636fd5bfbb30dac9d5fba6e32bfc6728 --- src/llamafactory/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py index c7f136b3..af9dd5f5 100644 --- a/src/llamafactory/cli.py +++ b/src/llamafactory/cli.py @@ -74,7 +74,7 @@ class Command(str, Enum): def main(): - command = sys.argv.pop(1) + command = sys.argv.pop(1) if len(sys.argv) > 0 else Command.HELP if command == Command.API: run_api() elif command == Command.CHAT: From f29c1ac6e54617ded23a29f6b0d98018e272cf92 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 22 Jun 2024 00:00:38 +0800 Subject: [PATCH 089/160] fix api Former-commit-id: dcbd6d86dfc49f12529b02ec331e3e5c05740061 --- src/llamafactory/api/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/api/chat.py b/src/llamafactory/api/chat.py index 2c7e11e2..72b2ae50 100644 --- a/src/llamafactory/api/chat.py +++ b/src/llamafactory/api/chat.py @@ -93,7 +93,7 @@ def _process_request( if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls): tool_calls = [ - {"name": tool_call.function.name, "argument": tool_call.function.arguments} + {"name": tool_call.function.name, "arguments": tool_call.function.arguments} for tool_call in message.tool_calls ] content = json.dumps(tool_calls, ensure_ascii=False) From 4513a2cc75f023ad6b5aaae3e98b584470bcbe1e Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 22 Jun 2024 01:31:32 +0800 Subject: [PATCH 090/160] remove dup template Former-commit-id: 5fec12203b24608af4d4993f44a657eb5a0348e5 --- src/llamafactory/data/template.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 393ebfea..b5bf688c 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -511,25 +511,6 @@ 
_register_template( ) -_register_template( - name="chatglm3_system", - format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), - format_assistant=StringFormatter(slots=["\n", "{{content}}"]), - format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]), - format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), - format_observation=StringFormatter( - slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] - ), - format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), - default_system=( - "You are ChatGLM3, a large language model trained by Zhipu.AI. " - "Follow the user's instructions carefully. Respond using markdown." - ), - stop_words=["<|user|>", "<|observation|>"], - efficient_eos=True, -) - - _register_template( name="chatml", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), From 43a065bb071217af77cbe8ecdd690a8168a79131 Mon Sep 17 00:00:00 2001 From: mMrBun <2015711377@qq.com> Date: Sat, 22 Jun 2024 02:00:13 +0800 Subject: [PATCH 091/160] Add tool_format to overwrite tool formatter template Former-commit-id: af08971ca50443fd5597e5e4412a3aa17214502f --- src/llamafactory/chat/hf_engine.py | 2 +- src/llamafactory/chat/vllm_engine.py | 2 +- src/llamafactory/data/template.py | 5 ++++- src/llamafactory/hparams/data_args.py | 4 ++++ 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 9e60175b..22a24339 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -54,7 +54,7 @@ class HuggingfaceEngine(BaseEngine): self.tokenizer = tokenizer_module["tokenizer"] self.processor = tokenizer_module["processor"] self.tokenizer.padding_side = "left" if self.can_generate else "right" - self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template) + self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template, data_args.tool_format) self.model = load_model( self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) ) # must after fixing tokenizer to resize vocab diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index 2626d612..f0d23676 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -59,7 +59,7 @@ class VllmEngine(BaseEngine): self.tokenizer = tokenizer_module["tokenizer"] self.processor = tokenizer_module["processor"] self.tokenizer.padding_side = "left" - self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template) + self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template, data_args.tool_format) self.generating_args = generating_args.to_dict() engine_args = { diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index b5bf688c..3d8ded3b 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -379,6 +379,7 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") def get_template_and_fix_tokenizer( tokenizer: "PreTrainedTokenizer", name: Optional[str] = None, + tool_format: Optional[str] = None, ) -> Template: if name is None: template = TEMPLATES["empty"] # placeholder @@ -386,6 +387,9 @@ def get_template_and_fix_tokenizer( template = TEMPLATES.get(name, None) if template is None: 
raise ValueError("Template {} does not exist.".format(name)) + + if tool_format: + template.format_tools = ToolFormatter(tool_format=tool_format) stop_words = template.stop_words if template.replace_eos: @@ -660,7 +664,6 @@ _register_template( format_system=StringFormatter(slots=["<|system|>\n{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), - format_tools=ToolFormatter(tool_format="glm4"), format_prefix=EmptyFormatter(slots=["[gMASK]"]), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 39290e21..959742e3 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -29,6 +29,10 @@ class DataArguments: default=None, metadata={"help": "Which template to use for constructing prompts in training and inference."}, ) + tool_format: Optional[str] = field( + default=None, + metadata={"help": "Specifies the tool format template for function calling ."}, + ) dataset: Optional[str] = field( default=None, metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."}, From 4c89aca243ffd5e2230437f0ac812c7048739a1a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 24 Jun 2024 18:22:12 +0800 Subject: [PATCH 092/160] update readme Former-commit-id: a1477208471039d3578980f929f1ca8c2a07aa96 --- CITATION.cff | 10 ++++-- README.md | 49 +++++++++++++++++++-------- README_zh.md | 49 +++++++++++++++++++-------- src/llamafactory/extras/constants.py | 2 +- src/llamafactory/train/ppo/trainer.py | 16 ++++----- 5 files changed, 85 insertions(+), 41 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 4caf3787..a572b5fa 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -12,12 +12,14 @@ authors: given-names: "Yanhan" - family-names: "Luo" given-names: "Zheyan" +- family-names: "Feng" + given-names: "Zhangchi" - family-names: "Ma" given-names: "Yongqiang" title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" url: "https://arxiv.org/abs/2403.13372" preferred-citation: - type: article + type: conference-paper authors: - family-names: "Zheng" given-names: "Yaowei" @@ -29,9 +31,13 @@ preferred-citation: given-names: "Yanhan" - family-names: "Luo" given-names: "Zheyan" + - family-names: "Feng" + given-names: "Zhangchi" - family-names: "Ma" given-names: "Yongqiang" - journal: "arXiv preprint arXiv:2403.13372" + booktitle: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)" title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" url: "https://arxiv.org/abs/2403.13372" year: 2024 + publisher: "Association for Computational Linguistics" + address: "Bangkok, Thailand" diff --git a/README.md b/README.md index 0e8e55f7..9462964c 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE) [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-44-green)](#projects-using-llama-factory) 
+[![Citation](https://img.shields.io/badge/citation-63-green)](#projects-using-llama-factory) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -15,7 +15,7 @@ [![GitHub Tread](https://trendshift.io/api/badge/repositories/4535)](https://trendshift.io/repositories/4535) -👋 Join our [WeChat](assets/wechat.jpg). +👋 Join our [WeChat](assets/wechat.jpg) or [NPU user group](assets/wechat_npu.jpg). \[ English | [中文](README_zh.md) \] @@ -360,8 +360,6 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
For Ascend NPU users -Join [NPU user group](assets/wechat_npu.jpg). - To install LLaMA Factory on Ascend NPU devices, please specify extra dependencies: `pip install -e '.[torch-npu,metrics]'`. Additionally, you need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**. Please follow the [installation tutorial](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html) or use the following commands: ```bash @@ -503,38 +501,55 @@ If you have a project that should be incorporated, please contact via email or c 1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526) 1. Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. [[arxiv]](https://arxiv.org/abs/2311.07816) 1. Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. [[arxiv]](https://arxiv.org/abs/2312.15710) -1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) -1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) +1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. KDD 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) +1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) 1. Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2402.05904) 1. Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. [[arxiv]](https://arxiv.org/abs/2402.07625) 1. Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11176) 1. Yang et al. LaCo: Large Language Model Pruning via Layer Collaps. 2024. [[arxiv]](https://arxiv.org/abs/2402.11187) 1. Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. [[arxiv]](https://arxiv.org/abs/2402.11746) 1. Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11801) -1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. 2024. [[arxiv]](https://arxiv.org/abs/2402.11809) +1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2402.11809) 1. Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11819) 1. Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. [[arxiv]](https://arxiv.org/abs/2402.12204) 1. Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. 
[[arxiv]](https://arxiv.org/abs/2402.14714) -1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.15043) +1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. ACL 2024. [[arxiv]](https://arxiv.org/abs/2402.15043) 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333) 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419) 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228) 1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073) 1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541) 1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246) -1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) +1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. COLING 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) 1. Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443) 1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604) 1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827) 1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167) -1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) +1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. ICML 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) 1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084) 1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836) 1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581) 1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215) 1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621) -1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2404.17140) -1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Zhang et al. 
Small Language Models Need Strong Verifiers to Self-Correct Reasoning. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2404.17140) +1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. NAACL 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Xu et al. Large Language Models for Cyber Security: A Systematic Literature Review. 2024. [[arxiv]](https://arxiv.org/abs/2405.04760) +1. Dammu et al. "They are uncultured": Unveiling Covert Harms and Social Threats in LLM Generated Conversations. 2024. [[arxiv]](https://arxiv.org/abs/2405.05378) +1. Yi et al. A safety realignment framework via subspace-oriented model fusion for large language models. 2024. [[arxiv]](https://arxiv.org/abs/2405.09055) +1. Lou et al. SPO: Multi-Dimensional Preference Sequential Alignment With Implicit Reward Modeling. 2024. [[arxiv]](https://arxiv.org/abs/2405.12739) +1. Zhang et al. Getting More from Less: Large Language Models are Good Spontaneous Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2405.13816) +1. Zhang et al. TS-Align: A Teacher-Student Collaborative Framework for Scalable Iterative Finetuning of Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2405.20215) +1. Zihong Chen. Sentence Segmentation and Sentence Punctuation Based on XunziALLM. 2024. [[paper]](https://aclanthology.org/2024.lt4hala-1.30) +1. Gao et al. The Best of Both Worlds: Toward an Honest and Helpful Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2406.00380) +1. Wang and Song. MARS: Benchmarking the Metaphysical Reasoning Abilities of Language Models with a Multi-task Evaluation Dataset. 2024. [[arxiv]](https://arxiv.org/abs/2406.02106) +1. Hu et al. Computational Limits of Low-Rank Adaptation (LoRA) for Transformer-Based Models. 2024. [[arxiv]](https://arxiv.org/abs/2406.03136) +1. Ge et al. Time Sensitive Knowledge Editing through Efficient Finetuning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2406.04496) +1. Tan et al. Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions. 2024. [[arxiv]](https://arxiv.org/abs/2406.05688) +1. Song et al. Turbo Sparse: Achieving LLM SOTA Performance with Minimal Activated Parameters. 2024. [[arxiv]](https://arxiv.org/abs/2406.05955) +1. Gu et al. RWKV-CLIP: A Robust Vision-Language Representation Learner. 2024. [[arxiv]](https://arxiv.org/abs/2406.06973) +1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. [[arxiv]](https://arxiv.org/abs/2406.07115) +1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) +1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B. 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge. 1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B. @@ -542,6 +557,8 @@ If you have a project that should be incorporated, please contact via email or c 1. 
**[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods. 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: A large language model specialized in generate metadata for stable diffusion. [[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) 1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**: A multimodal large language model specialized in Chinese medical domain, based on LLaVA-1.5-7B. +1. **[AutoRE](https://github.com/THUDM/AutoRE)**: A document-level relation extraction system based on large language models. +1. **[NVIDIA RTX AI Toolkit](https://github.com/NVIDIA/RTX-AI-Toolkit)**: SDKs for fine-tuning LLMs on Windows PC for NVIDIA RTX.
@@ -556,10 +573,12 @@ Please follow the model licenses to use the corresponding model weights: [Baichu If this work is helpful, please kindly cite as: ```bibtex -@article{zheng2024llamafactory, +@inproceedings{zheng2024llamafactory, title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, - author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Yongqiang Ma}, - journal={arXiv preprint arXiv:2403.13372}, + author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma}, + booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)}, + address={Bangkok, Thailand}, + publisher={Association for Computational Linguistics}, year={2024}, url={http://arxiv.org/abs/2403.13372} } diff --git a/README_zh.md b/README_zh.md index 152cd6fa..2073ce17 100644 --- a/README_zh.md +++ b/README_zh.md @@ -15,7 +15,7 @@ [![GitHub Tread](https://trendshift.io/api/badge/repositories/4535)](https://trendshift.io/repositories/4535) -👋 加入我们的[微信群](assets/wechat.jpg)。 +👋 加入我们的[微信群](assets/wechat.jpg)或 [NPU 用户群](assets/wechat_npu.jpg)。 \[ [English](README.md) | 中文 \] @@ -360,8 +360,6 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
昇腾 NPU 用户指南 -加入 [NPU 用户群](assets/wechat_npu.jpg)。 - 在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e '.[torch-npu,metrics]'` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: ```bash @@ -503,38 +501,55 @@ run_name: test_run # 可选 1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526) 1. Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. [[arxiv]](https://arxiv.org/abs/2311.07816) 1. Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. [[arxiv]](https://arxiv.org/abs/2312.15710) -1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) -1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) +1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. KDD 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) +1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) 1. Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2402.05904) 1. Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. [[arxiv]](https://arxiv.org/abs/2402.07625) 1. Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11176) 1. Yang et al. LaCo: Large Language Model Pruning via Layer Collaps. 2024. [[arxiv]](https://arxiv.org/abs/2402.11187) 1. Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. [[arxiv]](https://arxiv.org/abs/2402.11746) 1. Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11801) -1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. 2024. [[arxiv]](https://arxiv.org/abs/2402.11809) +1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2402.11809) 1. Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11819) 1. Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. [[arxiv]](https://arxiv.org/abs/2402.12204) 1. Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.14714) -1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. 2024. 
[[arxiv]](https://arxiv.org/abs/2402.15043) +1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. ACL 2024. [[arxiv]](https://arxiv.org/abs/2402.15043) 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333) 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419) 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228) 1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073) 1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541) 1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246) -1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) +1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. COLING 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) 1. Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443) 1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604) 1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827) 1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167) -1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) +1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. ICML 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) 1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084) 1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836) 1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581) 1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215) 1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621) -1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2404.17140) -1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2404.17140) +1. Zhou et al. 
FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. NAACL 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Xu et al. Large Language Models for Cyber Security: A Systematic Literature Review. 2024. [[arxiv]](https://arxiv.org/abs/2405.04760) +1. Dammu et al. "They are uncultured": Unveiling Covert Harms and Social Threats in LLM Generated Conversations. 2024. [[arxiv]](https://arxiv.org/abs/2405.05378) +1. Yi et al. A safety realignment framework via subspace-oriented model fusion for large language models. 2024. [[arxiv]](https://arxiv.org/abs/2405.09055) +1. Lou et al. SPO: Multi-Dimensional Preference Sequential Alignment With Implicit Reward Modeling. 2024. [[arxiv]](https://arxiv.org/abs/2405.12739) +1. Zhang et al. Getting More from Less: Large Language Models are Good Spontaneous Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2405.13816) +1. Zhang et al. TS-Align: A Teacher-Student Collaborative Framework for Scalable Iterative Finetuning of Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2405.20215) +1. Zihong Chen. Sentence Segmentation and Sentence Punctuation Based on XunziALLM. 2024. [[paper]](https://aclanthology.org/2024.lt4hala-1.30) +1. Gao et al. The Best of Both Worlds: Toward an Honest and Helpful Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2406.00380) +1. Wang and Song. MARS: Benchmarking the Metaphysical Reasoning Abilities of Language Models with a Multi-task Evaluation Dataset. 2024. [[arxiv]](https://arxiv.org/abs/2406.02106) +1. Hu et al. Computational Limits of Low-Rank Adaptation (LoRA) for Transformer-Based Models. 2024. [[arxiv]](https://arxiv.org/abs/2406.03136) +1. Ge et al. Time Sensitive Knowledge Editing through Efficient Finetuning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2406.04496) +1. Tan et al. Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions. 2024. [[arxiv]](https://arxiv.org/abs/2406.05688) +1. Song et al. Turbo Sparse: Achieving LLM SOTA Performance with Minimal Activated Parameters. 2024. [[arxiv]](https://arxiv.org/abs/2406.05955) +1. Gu et al. RWKV-CLIP: A Robust Vision-Language Representation Learner. 2024. [[arxiv]](https://arxiv.org/abs/2406.06973) +1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. [[arxiv]](https://arxiv.org/abs/2406.07115) +1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) +1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper,基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM,基于 Baichuan-13B 微调而得,具有法律推理和知识检索能力。 1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao,基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。 @@ -542,6 +557,8 @@ run_name: test_run # 可选 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**:MBTI性格大模型项目,根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**:一个用于生成 Stable Diffusion 提示词的大型语言模型。[[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) 1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**:中文多模态医学大模型,基于 LLaVA-1.5-7B 在中文多模态医疗数据上微调而得。 +1. 
**[AutoRE](https://github.com/THUDM/AutoRE)**:基于大语言模型的文档级关系抽取系统。 +1. **[NVIDIA RTX AI Toolkit](https://github.com/NVIDIA/RTX-AI-Toolkit)**: 在 Windows 主机上利用英伟达 RTX 设备进行大型语言模型微调的开发包。
@@ -556,10 +573,12 @@ run_name: test_run # 可选 如果您觉得此项目有帮助,请考虑以下列格式引用 ```bibtex -@article{zheng2024llamafactory, - title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, - author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Yongqiang Ma}, - journal={arXiv preprint arXiv:2403.13372}, +@inproceedings{zheng2024llamafactory, + title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, + author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma}, + booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)}, + address={Bangkok, Thailand}, + publisher={Association for Computational Linguistics}, year={2024}, url={http://arxiv.org/abs/2403.13372} } diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 36265c8e..866f39d4 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -579,7 +579,7 @@ register_model_group( register_model_group( models={ - "Jambda-v0.1": { + "Jamba-v0.1": { DownloadSource.DEFAULT: "ai21labs/Jamba-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Jamba-v0.1", } diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index df4a37be..38f4c6c8 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -202,18 +202,18 @@ class CustomPPOTrainer(PPOTrainer, Trainer): if self.is_world_process_zero(): logger.info("***** Running training *****") - logger.info(" Num examples = {}".format(num_examples)) - logger.info(" Num Epochs = {}".format(num_train_epochs)) - logger.info(" Instantaneous batch size per device = {}".format(self.args.per_device_train_batch_size)) + logger.info(" Num examples = {:,}".format(num_examples)) + logger.info(" Num Epochs = {:,}".format(num_train_epochs)) + logger.info(" Instantaneous batch size per device = {:,}".format(self.args.per_device_train_batch_size)) logger.info( - " Total train batch size (w. parallel, buffer, distributed & accumulation) = {}".format( + " Total train batch size (w. 
parallel, buffer, distributed & accumulation) = {:,}".format( total_train_batch_size ) ) - logger.info(" Gradient Accumulation steps = {}".format(self.args.gradient_accumulation_steps)) - logger.info(" Num optimization epochs per batch = {}".format(self.finetuning_args.ppo_epochs)) - logger.info(" Total training steps = {}".format(max_steps)) - logger.info(" Number of trainable parameters = {}".format(count_parameters(self.model)[0])) + logger.info(" Gradient Accumulation steps = {:,}".format(self.args.gradient_accumulation_steps)) + logger.info(" Num optimization epochs per batch = {:,}".format(self.finetuning_args.ppo_epochs)) + logger.info(" Total training steps = {:,}".format(max_steps)) + logger.info(" Number of trainable parameters = {:,}".format(count_parameters(self.model)[0])) dataiter = iter(self.dataloader) loss_meter = AverageMeter() From 826d7808b427fb118914720a85e932bb26b8a940 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 24 Jun 2024 18:29:04 +0800 Subject: [PATCH 093/160] update readme Former-commit-id: 0775d56ee3cfde34e28a48cbf4a583f4530def19 --- CITATION.cff | 3 ++- README_zh.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index a572b5fa..01b4c9fd 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,6 +20,8 @@ title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" url: "https://arxiv.org/abs/2403.13372" preferred-citation: type: conference-paper + conference: + name: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)" authors: - family-names: "Zheng" given-names: "Yaowei" @@ -35,7 +37,6 @@ preferred-citation: given-names: "Zhangchi" - family-names: "Ma" given-names: "Yongqiang" - booktitle: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)" title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" url: "https://arxiv.org/abs/2403.13372" year: 2024 diff --git a/README_zh.md b/README_zh.md index 2073ce17..8b77e91e 100644 --- a/README_zh.md +++ b/README_zh.md @@ -4,7 +4,7 @@ [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE) [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-44-green)](#使用了-llama-factory-的项目) +[![Citation](https://img.shields.io/badge/citation-63-green)](#使用了-llama-factory-的项目) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) From af2607de1a1b0259e250e77bd6a5f2c5c1b6f223 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Mon, 24 Jun 2024 10:57:36 +0000 Subject: [PATCH 094/160] update docker files 1. add docker-npu (Dockerfile and docker-compose.yml) 2. 
move cuda docker to docker-cuda and tiny changes to adapt to the new path Former-commit-id: 5431c1f18aadb072208efe7fd8e36fdcfbf807c2 --- README.md | 61 +++++++++++++++++-- README_zh.md | 58 ++++++++++++++++-- Dockerfile => docker/docker-cuda/Dockerfile | 10 +-- .../docker-cuda/docker-compose.yml | 6 +- docker/docker-npu/Dockerfile | 40 ++++++++++++ docker/docker-npu/docker-compose.yml | 31 ++++++++++ 6 files changed, 187 insertions(+), 19 deletions(-) rename Dockerfile => docker/docker-cuda/Dockerfile (87%) rename docker-compose.yml => docker/docker-cuda/docker-compose.yml (80%) create mode 100644 docker/docker-npu/Dockerfile create mode 100644 docker/docker-npu/docker-compose.yml diff --git a/README.md b/README.md index 9462964c..1107ae0b 100644 --- a/README.md +++ b/README.md @@ -383,10 +383,11 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh | torch-npu | 2.1.0 | 2.1.0.post3 | | deepspeed | 0.13.2 | 0.13.2 | -Docker image: +Docker users please refer to [Build Docker](#Build-Docker). -- 32GB: [Download page](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) -- 64GB: [Download page](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) +**NOTE** + +The default docker image is [cosdt/cann:8.0.rc1-910b-ubuntu22.04](https://hub.docker.com/layers/cosdt/cann/8.0.rc1-910b-ubuntu22.04/images/sha256-29ef8aacf6b2babd292f06f00b9190c212e7c79a947411e213135e4d41a178a9?context=explore). More options can be found at [cosdt/cann](https://hub.docker.com/r/cosdt/cann/tags). Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use. @@ -426,7 +427,10 @@ llamafactory-cli webui #### Use Docker +
For NVIDIA GPU users: + ```bash +cd ./docker/docker-cuda docker build -f ./Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ @@ -435,18 +439,63 @@ docker build -f ./Dockerfile \ -t llamafactory:latest . docker run -it --gpus=all \ - -v ./hf_cache:/root/.cache/huggingface/ \ - -v ./data:/app/data \ - -v ./output:/app/output \ + -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ + -v /$(dirname $(dirname "$PWD"))/data:/app/data \ + -v /$(dirname $(dirname "$PWD"))/output:/app/output \ -p 7860:7860 \ -p 8000:8000 \ --shm-size 16G \ --name llamafactory \ llamafactory:latest ``` +
+ +
For Ascend NPU users: + +```bash +cd ./docker/docker-npu +docker build -f ./Dockerfile \ + --build-arg INSTALL_DEEPSPEED=false \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + -t llamafactory:latest . + +# add --device for multi-npu usage +# or modify --device to change npu card +docker run -it \ + -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ + -v /$(dirname $(dirname "$PWD"))/data:/app/data \ + -v /$(dirname $(dirname "$PWD"))/output:/app/output \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -p 7860:7860 \ + -p 8000:8000 \ + --device /dev/davinci0 \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + --shm-size 16G \ + --name llamafactory \ + llamafactory:latest +``` +
#### Use Docker Compose +Firstly enter your docker path: + +```bash +# for NVIDIA GPU users +cd ./docker/docker-cuda + +# for Ascend NPU users +cd ./docker/docker-npu +``` + +Then run the following command to build docker image and start the container: + ```bash docker-compose up -d docker-compose exec llamafactory bash diff --git a/README_zh.md b/README_zh.md index 8b77e91e..6326c0b5 100644 --- a/README_zh.md +++ b/README_zh.md @@ -383,10 +383,11 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh | torch-npu | 2.1.0 | 2.1.0.post3 | | deepspeed | 0.13.2 | 0.13.2 | -Docker 镜像: +Docker用户请参考 [构建 Docker](#构建-Docker). -- 32GB:[下载地址](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) -- 64GB:[下载地址](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) +**NOTE** + +默认镜像为 [cosdt/cann:8.0.rc1-910b-ubuntu22.04](https://hub.docker.com/layers/cosdt/cann/8.0.rc1-910b-ubuntu22.04/images/sha256-29ef8aacf6b2babd292f06f00b9190c212e7c79a947411e213135e4d41a178a9?context=explore). 更多选择见 [cosdt/cann](https://hub.docker.com/r/cosdt/cann/tags). 请使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定运算设备。 @@ -426,7 +427,10 @@ llamafactory-cli webui #### 使用 Docker +
NVIDIA GPU 用户: + ```bash +cd ./docker/docker-cuda docker build -f ./Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ @@ -435,18 +439,60 @@ docker build -f ./Dockerfile \ -t llamafactory:latest . docker run -it --gpus=all \ - -v ./hf_cache:/root/.cache/huggingface/ \ - -v ./data:/app/data \ - -v ./output:/app/output \ + -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ + -v /$(dirname $(dirname "$PWD"))/data:/app/data \ + -v /$(dirname $(dirname "$PWD"))/output:/app/output \ -p 7860:7860 \ -p 8000:8000 \ --shm-size 16G \ --name llamafactory \ llamafactory:latest ``` +
+ +
Ascend NPU 用户: + +```bash +cd ./docker/docker-npu +docker build -f ./Dockerfile \ + --build-arg INSTALL_DEEPSPEED=false \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + -t llamafactory:latest . + +# 增加 --device 来使用多卡 NPU 或修改第一个 --device 来更改 NPU 卡 +docker run -it \ + -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ + -v /$(dirname $(dirname "$PWD"))/data:/app/data \ + -v /$(dirname $(dirname "$PWD"))/output:/app/output \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -p 7860:7860 \ + -p 8000:8000 \ + --device /dev/davinci0 \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + --shm-size 16G \ + --name llamafactory \ + llamafactory:latest +``` +
#### 使用 Docker Compose +首先进入 docker 目录: +```bash +# NVIDIA GPU 用户 +cd ./docker/docker-cuda + +# Ascend NPU 用户 +cd ./docker/docker-npu +``` +然后运行以下命令创建 docker 镜像并启动容器: + ```bash docker-compose up -d docker-compose exec llamafactory bash diff --git a/Dockerfile b/docker/docker-cuda/Dockerfile similarity index 87% rename from Dockerfile rename to docker/docker-cuda/Dockerfile index 61d58005..72797a26 100644 --- a/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -9,16 +9,18 @@ ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory -WORKDIR /app +WORKDIR /app/LLaMA-Factory + +RUN cd /app && \ + git config --global http.version HTTP/1.1 && \ + git clone https://github.com/hiyouga/LLaMA-Factory.git && \ + cd /app/LLaMA-Factory # Install the requirements -COPY requirements.txt /app/ RUN pip config set global.index-url $PIP_INDEX RUN python -m pip install --upgrade pip RUN python -m pip install -r requirements.txt -# Copy the rest of the application into the image -COPY . /app/ # Install the LLaMA Factory RUN EXTRA_PACKAGES="metrics"; \ diff --git a/docker-compose.yml b/docker/docker-cuda/docker-compose.yml similarity index 80% rename from docker-compose.yml rename to docker/docker-cuda/docker-compose.yml index c5dc34e9..a470aa60 100644 --- a/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -10,9 +10,9 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface/ - - ./data:/app/data - - ./output:/app/output + - ../../hf_cache:/root/.cache/huggingface/ + - ../../data:/app/LLaMA-Factory/data + - ../../output:/app/LLaMA-Factory/output ports: - "7860:7860" - "8000:8000" diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile new file mode 100644 index 00000000..9456bcbf --- /dev/null +++ b/docker/docker-npu/Dockerfile @@ -0,0 +1,40 @@ +# Using ubuntu 22.04 images with cann 8.0.rc1 +# More options can be found at https://hub.docker.com/r/cosdt/cann/tags +FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +# Define installation arguments +ARG INSTALL_DEEPSPEED=false +ARG PIP_INDEX=https://pypi.org/simple + +# Set the working directory +WORKDIR /app/LLaMA-Factory + +RUN cd /app && \ + git config --global http.version HTTP/1.1 && \ + git clone https://github.com/hiyouga/LLaMA-Factory.git && \ + cd /app/LLaMA-Factory + +RUN pip config set global.index-url $PIP_INDEX +RUN python3 -m pip install --upgrade pip + +# Install the LLaMA Factory +RUN EXTRA_PACKAGES="torch-npu,metrics"; \ + if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ + fi; \ + pip install -e .[$EXTRA_PACKAGES] && \ + pip uninstall -y transformer-engine flash-attn + +# Set up volumes +VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] + +# Expose port 7860 for the LLaMA Board +EXPOSE 7860 + +# Expose port 8000 for the API service +EXPOSE 8000 + +# Launch LLaMA Board +CMD [ "llamafactory-cli", "webui" ] diff --git a/docker/docker-npu/docker-compose.yml b/docker/docker-npu/docker-compose.yml new file mode 100644 index 00000000..93eb6718 --- /dev/null +++ b/docker/docker-npu/docker-compose.yml @@ -0,0 +1,31 @@ +services: + llamafactory: + build: + dockerfile: Dockerfile + context: . 
+ args: + INSTALL_DEEPSPEED: false + PIP_INDEX: https://pypi.org/simple + container_name: llamafactory + volumes: + - ../../hf_cache:/root/.cache/huggingface/ + - ../../data:/app/LLaMA-Factory/data + - ../../output:/app/LLaMA-Factory/output + - /usr/local/dcmi:/usr/local/dcmi + - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi + - /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 + - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info + - /etc/ascend_install.info:/etc/ascend_install.info + ports: + - "7860:7860" + - "8000:8000" + ipc: host + tty: true + stdin_open: true + command: bash + devices: + - /dev/davinci0 + - /dev/davinci_manager + - /dev/devmm_svm + - /dev/hisi_hdc + restart: unless-stopped From 2926265a140b0a446cb0968a2151e7ae60a66b0b Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Mon, 24 Jun 2024 12:27:00 +0000 Subject: [PATCH 095/160] auto-label npu issue Former-commit-id: d19c9eac783377151e58731723fb7cbb2dab3323 --- .github/workflows/label_issue.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml index b9a5543c..352b4b25 100644 --- a/.github/workflows/label_issue.yml +++ b/.github/workflows/label_issue.yml @@ -13,5 +13,18 @@ jobs: - env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} ISSUE_URL: ${{ github.event.issue.html_url }} + ISSUE_TITLE: "${{ github.event.issue.title }}" run: | gh issue edit $ISSUE_URL --add-label "pending" + + # auto-add label for npu + NPU_KEYWORDS=("npu" "ascend" "昇腾") + LABEL_NPU="npu" + ISSUE_TITLE_LOWER=$(echo "$ISSUE_TITLE" | tr '[:upper:]' '[:lower:]') + + for keyword in "${NPU_KEYWORDS[@]}"; do + if [[ "$ISSUE_TITLE_LOWER" == *"$keyword"* ]] && [[ "$ISSUE_TITLE_LOWER" != *"input"* ]]; then + gh issue edit "$ISSUE_URL" --add-label "$LABEL_NPU" + break + fi + done From 16e950454e06322cb69668c5971376ddcf64fc45 Mon Sep 17 00:00:00 2001 From: stceum <50257864+stceum@users.noreply.github.com> Date: Mon, 24 Jun 2024 20:39:20 +0800 Subject: [PATCH 096/160] Bug Fix: `off` is parsed as `False` in yaml file, changed to `disabled` to avoid this. Former-commit-id: 171289d8e4c111fdca2b100282b64c74a04a4726 --- src/llamafactory/hparams/model_args.py | 2 +- src/llamafactory/hparams/parser.py | 4 ++++ src/llamafactory/model/model_utils/attention.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 996e9130..9b51c064 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -97,7 +97,7 @@ class ModelArguments: default=None, metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."}, ) - flash_attn: Literal["off", "sdpa", "fa2", "auto"] = field( + flash_attn: Literal["disabled", "sdpa", "fa2", "auto"] = field( default="auto", metadata={"help": "Enable FlashAttention for faster training and inference."}, ) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index a593bf45..9ef2d607 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -102,6 +102,10 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1: raise ValueError("Quantized model only accepts a single adapter. Merge them first.") + # In case that `flash_attn` is set to `off` in the yaml file, and parsed as `False` afterwards. 
From 16e950454e06322cb69668c5971376ddcf64fc45 Mon Sep 17 00:00:00 2001
From: stceum <50257864+stceum@users.noreply.github.com>
Date: Mon, 24 Jun 2024 20:39:20 +0800
Subject: [PATCH 096/160] Bug Fix: `off` is parsed as `False` in yaml file, changed to `disabled` to avoid this.

Former-commit-id: 171289d8e4c111fdca2b100282b64c74a04a4726
---
 src/llamafactory/hparams/model_args.py          | 2 +-
 src/llamafactory/hparams/parser.py              | 4 ++++
 src/llamafactory/model/model_utils/attention.py | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index 996e9130..9b51c064 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -97,7 +97,7 @@ class ModelArguments:
         default=None,
         metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."},
     )
-    flash_attn: Literal["off", "sdpa", "fa2", "auto"] = field(
+    flash_attn: Literal["disabled", "sdpa", "fa2", "auto"] = field(
         default="auto",
         metadata={"help": "Enable FlashAttention for faster training and inference."},
     )
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index a593bf45..9ef2d607 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -102,6 +102,10 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin
     if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1:
         raise ValueError("Quantized model only accepts a single adapter. Merge them first.")
 
+    # In case that `flash_attn` is set to `off` in the yaml file, and parsed as `False` afterwards.
+    if model_args.flash_attn == False:
+        raise ValueError("flash_attn should be \"disabled\", \"sdpa\", \"fa2\" or \"auto\".")
+
 
 def _check_extra_dependencies(
     model_args: "ModelArguments",
diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py
index 8ff3807b..dfd90936 100644
--- a/src/llamafactory/model/model_utils/attention.py
+++ b/src/llamafactory/model/model_utils/attention.py
@@ -32,7 +32,7 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "Model
     if model_args.flash_attn == "auto":
         return
 
-    elif model_args.flash_attn == "off":
+    elif model_args.flash_attn == "disabled":
         requested_attn_implementation = "eager"
 
     elif model_args.flash_attn == "sdpa":

From 5675c47a014c7679370bfd38ad40666b64c9bd89 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Mon, 24 Jun 2024 21:35:34 +0800
Subject: [PATCH 097/160] Update test_attention.py

Former-commit-id: c2cc7a0f152aa14fc03ae413f4a9dc06742a29d7
---
 tests/model/model_utils/test_attention.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/model/model_utils/test_attention.py b/tests/model/model_utils/test_attention.py
index 97ac9dcc..4cae3d7c 100644
--- a/tests/model/model_utils/test_attention.py
+++ b/tests/model/model_utils/test_attention.py
@@ -29,7 +29,7 @@ INFER_ARGS = {
 
 
 def test_attention():
-    attention_available = ["off"]
+    attention_available = ["disabled"]
     if is_torch_sdpa_available():
         attention_available.append("sdpa")
 
@@ -37,7 +37,7 @@ def test_attention():
         attention_available.append("fa2")
 
     llama_attention_classes = {
-        "off": "LlamaAttention",
+        "disabled": "LlamaAttention",
         "sdpa": "LlamaSdpaAttention",
         "fa2": "LlamaFlashAttention2",
     }

From d475dd3809682acd9d871cf74edf604593664b73 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Mon, 24 Jun 2024 21:37:42 +0800
Subject: [PATCH 098/160] Update parser.py

Former-commit-id: 60e605cd9d399bd04432864ede9c84302890eac8
---
 src/llamafactory/hparams/parser.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index 9ef2d607..a593bf45 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -102,10 +102,6 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin
     if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1:
         raise ValueError("Quantized model only accepts a single adapter. Merge them first.")
 
-    # In case that `flash_attn` is set to `off` in the yaml file, and parsed as `False` afterwards.
-    if model_args.flash_attn == False:
-        raise ValueError("flash_attn should be \"disabled\", \"sdpa\", \"fa2\" or \"auto\".")
-
 
 def _check_extra_dependencies(
     model_args: "ModelArguments",
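With the option renamed, a config that wants to turn FlashAttention off has to spell the value out instead of relying on the YAML keyword `off` (which YAML parses as `False`). A minimal illustrative training snippet; the model, dataset and output paths are placeholders, not values taken from the patches:

```yaml
### model (placeholder)
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
flash_attn: disabled  # accepted values per the change above: auto / disabled / sdpa / fa2

### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all

### dataset (placeholder)
dataset: identity
template: llama3
output_dir: saves/llama3-8b/lora/sft
```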
From 6b738d1c89fc52b9a6696016f9981cdd723fc64a Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Mon, 24 Jun 2024 21:59:39 +0800
Subject: [PATCH 099/160] Update label_issue.yml

Former-commit-id: 90785a69c6210c3a02babb12c56fb7900095247c
---
 .github/workflows/label_issue.yml | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml
index 352b4b25..507b6be7 100644
--- a/.github/workflows/label_issue.yml
+++ b/.github/workflows/label_issue.yml
@@ -13,18 +13,15 @@ jobs:
     - env:
         GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         ISSUE_URL: ${{ github.event.issue.html_url }}
-        ISSUE_TITLE: "${{ github.event.issue.title }}"
+        ISSUE_TITLE: ${{ github.event.issue.title }}
       run: |
-        gh issue edit $ISSUE_URL --add-label "pending"
-
-        # auto-add label for npu
-        NPU_KEYWORDS=("npu" "ascend" "昇腾")
-        LABEL_NPU="npu"
-        ISSUE_TITLE_LOWER=$(echo "$ISSUE_TITLE" | tr '[:upper:]' '[:lower:]')
-
-        for keyword in "${NPU_KEYWORDS[@]}"; do
-          if [[ "$ISSUE_TITLE_LOWER" == *"$keyword"* ]] && [[ "$ISSUE_TITLE_LOWER" != *"input"* ]]; then
-            gh issue edit "$ISSUE_URL" --add-label "$LABEL_NPU"
-            break
-          fi
-        done
+        LABEL=pending
+        NPU_KEYWORDS=(npu ascend 昇腾)
+        ISSUE_TITLE_LOWER=$(echo $ISSUE_TITLE | tr '[:upper:]' '[:lower:]')
+        for KEYWORD in ${NPU_KEYWORDS[@]}; do
+          if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]] then
+            LABEL=pending,npu
+            break
+          fi
+        done
+        gh issue edit $ISSUE_URL --add-label $LABEL

From a4f2d5aa6f4b58bbe33e7a94c17582987109b97d Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Mon, 24 Jun 2024 22:01:23 +0800
Subject: [PATCH 100/160] Update label_issue.yml

Former-commit-id: dc2f7998b4ae9d7223c7c16732d835cea2a28713
---
 .github/workflows/label_issue.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml
index 507b6be7..0e10f0b9 100644
--- a/.github/workflows/label_issue.yml
+++ b/.github/workflows/label_issue.yml
@@ -19,7 +19,7 @@ jobs:
         NPU_KEYWORDS=(npu ascend 昇腾)
         ISSUE_TITLE_LOWER=$(echo $ISSUE_TITLE | tr '[:upper:]' '[:lower:]')
         for KEYWORD in ${NPU_KEYWORDS[@]}; do
-          if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]] then
+          if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]]; then
            LABEL=pending,npu
            break
          fi

From a79e93f3357b78d8576e516711cc721995ac23a4 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Mon, 24 Jun 2024 22:34:31 +0800
Subject: [PATCH 101/160] fix #4410

Former-commit-id: f49adc4ab5eade21d7a9e029212f17688ee9b0cf
---
 src/llamafactory/data/formatter.py         |  4 ++--
 src/llamafactory/hparams/model_args.py     |  2 +-
 src/llamafactory/model/patcher.py          |  6 +++---
 src/llamafactory/webui/chatter.py          |  1 +
 src/llamafactory/webui/components/infer.py | 17 ++++++++++++++---
 src/llamafactory/webui/locales.py          | 11 +++++++++++
 6 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py
index ed9ba8b8..b5dc57ff 100644
--- a/src/llamafactory/data/formatter.py
+++ b/src/llamafactory/data/formatter.py
@@ -34,8 +34,8 @@ DEFAULT_TOOL_PROMPT = (
 
 
 GLM4_TOOL_PROMPT = (
-    "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，"
-    "你的任务是针对用户的问题和要求提供适当的答复和支持。{tool_text}"
+    "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，"
+    "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具{tool_text}"
 )
diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index 9b51c064..3f21145d 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -97,7 +97,7 @@ class ModelArguments:
         default=None,
         metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."},
     )
-    flash_attn: Literal["disabled", "sdpa", "fa2", "auto"] = field(
+    flash_attn: Literal["auto", "disabled", "sdpa", "fa2"] = field(
         default="auto",
         metadata={"help": "Enable FlashAttention for faster training and inference."},
     )
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index 35153649..24cd2601 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -58,10 +58,10 @@ def patch_config(
     is_trainable: bool,
 ) -> None:
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
-        if model_args.infer_dtype == "auto":
-            model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
-        else:
+        if model_args.infer_dtype != "auto" and not is_trainable:
             model_args.compute_dtype = getattr(torch, model_args.infer_dtype)
+        else:
+            model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
     if is_torch_npu_available():
         use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]
diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py
index a2b54dce..652c341c 100644
--- a/src/llamafactory/webui/chatter.py
+++ b/src/llamafactory/webui/chatter.py
@@ -87,6 +87,7 @@ class WebChatModel(ChatModel):
             visual_inputs=get("top.visual_inputs"),
             rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None,
             infer_backend=get("infer.infer_backend"),
+            infer_dtype=get("infer.infer_dtype"),
         )
 
         if checkpoint_path:
diff --git a/src/llamafactory/webui/components/infer.py b/src/llamafactory/webui/components/infer.py
index 03bccd7f..a0064479 100644
--- a/src/llamafactory/webui/components/infer.py
+++ b/src/llamafactory/webui/components/infer.py
@@ -32,15 +32,26 @@ def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]:
     input_elems = engine.manager.get_base_elems()
     elem_dict = dict()
 
-    infer_backend = gr.Dropdown(choices=["huggingface", "vllm"], value="huggingface")
+    with gr.Row():
+        infer_backend = gr.Dropdown(choices=["huggingface", "vllm"], value="huggingface")
+        infer_dtype = gr.Dropdown(choices=["auto", "float16", "bfloat16", "float32"], value="auto")
+
     with gr.Row():
         load_btn = gr.Button()
         unload_btn = gr.Button()
 
     info_box = gr.Textbox(show_label=False, interactive=False)
 
-    input_elems.update({infer_backend})
-    elem_dict.update(dict(infer_backend=infer_backend, load_btn=load_btn, unload_btn=unload_btn, info_box=info_box))
+    input_elems.update({infer_backend, infer_dtype})
+    elem_dict.update(
+        dict(
+            infer_backend=infer_backend,
+            infer_dtype=infer_dtype,
+            load_btn=load_btn,
+            unload_btn=unload_btn,
+            info_box=info_box,
+        )
+    )
 
     chatbot, messages, chat_elems = create_chat_box(engine, visible=False)
     elem_dict.update(chat_elems)
diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py
index 8e8d6fce..cd166584 100644
--- a/src/llamafactory/webui/locales.py
+++ b/src/llamafactory/webui/locales.py
@@ -1206,6 +1206,17 @@ LOCALES = {
             "label": "推理引擎",
         },
     },
+    "infer_dtype": {
+        "en": {
+            "label": "Inference data type",
+        },
+        "ru": {
+            "label": "Тип данных для вывода",
+        },
+        "zh": {
+            "label": "推理数据类型",
+        },
+    },
     "load_btn": {
         "en": {
             "value": "Load model",
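The new WebUI dropdown feeds the same `infer_dtype` argument that the config files accept, so the equivalent non-WebUI setup is a small inference config. An illustrative sketch only; the model and template values are placeholders:

```yaml
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
template: llama3
infer_backend: huggingface
infer_dtype: bfloat16  # auto / float16 / bfloat16 / float32, matching the dropdown choices above
```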
From 9a1ec1984566d9c94c5377377956e14c87fc021a Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Mon, 24 Jun 2024 23:06:18 +0800
Subject: [PATCH 102/160] Update loader.py

Former-commit-id: afa59d61844595e6b615227e6bfdc0b16c8015dd
---
 src/llamafactory/data/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py
index f44ef5de..8e7062db 100644
--- a/src/llamafactory/data/loader.py
+++ b/src/llamafactory/data/loader.py
@@ -148,7 +148,7 @@ def get_dataset(
     tokenizer: "PreTrainedTokenizer",
     processor: Optional["ProcessorMixin"] = None,
 ) -> Union["Dataset", "IterableDataset"]:
-    template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
+    template = get_template_and_fix_tokenizer(tokenizer, data_args.template, data_args.tool_format)
     if data_args.train_on_prompt and template.efficient_eos:
         raise ValueError("Current template does not support `train_on_prompt`.")

From 5d6cf55208c59fb209cdb938a700e295e73b9496 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Mon, 24 Jun 2024 23:12:59 +0800
Subject: [PATCH 103/160] Update template.py

Former-commit-id: d53517bff6f8734221d7df9982f3bdd4d2eb2cab
---
 src/llamafactory/data/template.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py
index 3d8ded3b..3a72a858 100644
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -664,6 +664,7 @@ _register_template(
     format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
     format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]),
     format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+    format_tools=ToolFormatter(tool_format="glm4"),
     format_prefix=EmptyFormatter(slots=["[gMASK]"]),
     stop_words=["<|user|>", "<|observation|>"],
     efficient_eos=True,

From 0b331a318baecee4c4e2af1e50a66d3e2bc299e3 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Mon, 24 Jun 2024 23:14:36 +0800
Subject: [PATCH 104/160] Update test_formatter.py

Former-commit-id: d13ef043441734189b05e739dbbebb16077a6f0b
---
 tests/data/test_formatter.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/data/test_formatter.py b/tests/data/test_formatter.py
index 430eb0e6..a01e8a7e 100644
--- a/tests/data/test_formatter.py
+++ b/tests/data/test_formatter.py
@@ -111,9 +111,9 @@ def test_glm4_tool_formatter():
         }
     ]
     assert formatter.apply(content=json.dumps(tools)) == [
-        "你是一个名为 GLM-4 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，"
-        "你的任务是针对用户的问题和要求提供适当的答复和支持。"
-        "\n\n## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format(
+        "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，"
+        "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具\n\n"
+        "## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format(
             json.dumps(tools[0], indent=4)
         )
     ]

From 8bcdb6f52cda26b0b5580c1d3b26117c6745f1a3 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga
Date: Mon, 24 Jun 2024 23:21:10 +0800
Subject: [PATCH 105/160] Update cli.py

Former-commit-id: 9db6126496ec9e834541823715f700f92b3968c7
---
 src/llamafactory/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py
index af9dd5f5..e4846780 100644
--- a/src/llamafactory/cli.py
+++ b/src/llamafactory/cli.py
@@ -74,7 +74,7 @@ class Command(str, Enum):
 
 
 def main():
-    command = sys.argv.pop(1) if len(sys.argv) > 0 else Command.HELP
+
command = sys.argv.pop(1) if len(sys.argv) != 1 else Command.HELP if command == Command.API: run_api() elif command == Command.CHAT: From 2e5d521ed8fb012ec60222233cce1e8120e5f861 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Mon, 24 Jun 2024 23:41:35 +0800 Subject: [PATCH 106/160] Update Dockerfile Former-commit-id: 632681d8ece0eaac59bb364d971435a3bc6665a9 --- docker/docker-cuda/Dockerfile | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 72797a26..61d58005 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -9,18 +9,16 @@ ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory -WORKDIR /app/LLaMA-Factory - -RUN cd /app && \ - git config --global http.version HTTP/1.1 && \ - git clone https://github.com/hiyouga/LLaMA-Factory.git && \ - cd /app/LLaMA-Factory +WORKDIR /app # Install the requirements +COPY requirements.txt /app/ RUN pip config set global.index-url $PIP_INDEX RUN python -m pip install --upgrade pip RUN python -m pip install -r requirements.txt +# Copy the rest of the application into the image +COPY . /app/ # Install the LLaMA Factory RUN EXTRA_PACKAGES="metrics"; \ From b5cdef43a1e46d00fbaef65ebf7f8ed35e049167 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 00:46:08 +0800 Subject: [PATCH 107/160] Update Dockerfile Former-commit-id: c897a70501707c0f4c432bb8e9a9beeb4e8953a3 --- docker/docker-cuda/Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 61d58005..2d20bfe4 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -12,13 +12,14 @@ ARG PIP_INDEX=https://pypi.org/simple WORKDIR /app # Install the requirements -COPY requirements.txt /app/ +COPY requirements.txt /app RUN pip config set global.index-url $PIP_INDEX +RUN pip config set global.extra-index-url $PIP_INDEX RUN python -m pip install --upgrade pip RUN python -m pip install -r requirements.txt # Copy the rest of the application into the image -COPY . /app/ +COPY . /app # Install the LLaMA Factory RUN EXTRA_PACKAGES="metrics"; \ @@ -38,10 +39,9 @@ RUN EXTRA_PACKAGES="metrics"; \ VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] # Expose port 7860 for the LLaMA Board +ENV GRADIO_SERVER_PORT 7860 EXPOSE 7860 # Expose port 8000 for the API service +ENV API_PORT 8000 EXPOSE 8000 - -# Launch LLaMA Board -CMD [ "llamafactory-cli", "webui" ] From 8360544d6579a26a43422a6642545c2ba0db288c Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 00:46:47 +0800 Subject: [PATCH 108/160] Update docker-compose.yml Former-commit-id: 56af208074e6af5465183af85367e7edd89d5aa6 --- docker/docker-cuda/docker-compose.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index a470aa60..04d6531a 100644 --- a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -1,8 +1,8 @@ services: llamafactory: build: - dockerfile: Dockerfile - context: . + dockerfile: ./docker/docker-cuda/Dockerfile + context: ../.. 
args: INSTALL_BNB: false INSTALL_VLLM: false @@ -10,9 +10,9 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ../../hf_cache:/root/.cache/huggingface/ - - ../../data:/app/LLaMA-Factory/data - - ../../output:/app/LLaMA-Factory/output + - ./hf_cache:/root/.cache/huggingface/ + - ./data:/app/data + - ./output:/app/output ports: - "7860:7860" - "8000:8000" From 37013f80689d52c39aaccefd72b9e02be0feee20 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 00:50:34 +0800 Subject: [PATCH 109/160] Update Dockerfile Former-commit-id: cdcd9455c19311394e148476a28ca75849c845b2 --- docker/docker-npu/Dockerfile | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 9456bcbf..0fdd4472 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -1,5 +1,5 @@ -# Using ubuntu 22.04 images with cann 8.0.rc1 -# More options can be found at https://hub.docker.com/r/cosdt/cann/tags +# Use the Ubuntu 22.04 image with CANN 8.0.rc1 +# More versions can be found at https://hub.docker.com/r/cosdt/cann/tags FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 ENV DEBIAN_FRONTEND=noninteractive @@ -9,15 +9,17 @@ ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory -WORKDIR /app/LLaMA-Factory - -RUN cd /app && \ - git config --global http.version HTTP/1.1 && \ - git clone https://github.com/hiyouga/LLaMA-Factory.git && \ - cd /app/LLaMA-Factory +WORKDIR /app +# Install the requirements +COPY requirements.txt /app RUN pip config set global.index-url $PIP_INDEX -RUN python3 -m pip install --upgrade pip +RUN pip config set global.extra-index-url $PIP_INDEX +RUN python -m pip install --upgrade pip +RUN python -m pip install -r requirements.txt + +# Copy the rest of the application into the image +COPY . /app # Install the LLaMA Factory RUN EXTRA_PACKAGES="torch-npu,metrics"; \ @@ -31,10 +33,9 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \ VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] # Expose port 7860 for the LLaMA Board +ENV GRADIO_SERVER_PORT 7860 EXPOSE 7860 # Expose port 8000 for the API service +ENV API_PORT 8000 EXPOSE 8000 - -# Launch LLaMA Board -CMD [ "llamafactory-cli", "webui" ] From 8367ec03a7dc36b4bfcf6c1f0e9cc49166a6302c Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 00:54:28 +0800 Subject: [PATCH 110/160] Update docker-compose.yml Former-commit-id: e038daf8dfa5d948b70c18469cb5a0be9aec464a --- docker/docker-npu/docker-compose.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docker/docker-npu/docker-compose.yml b/docker/docker-npu/docker-compose.yml index 93eb6718..7fff6e73 100644 --- a/docker/docker-npu/docker-compose.yml +++ b/docker/docker-npu/docker-compose.yml @@ -1,20 +1,19 @@ services: llamafactory: build: - dockerfile: Dockerfile - context: . + dockerfile: ./docker/docker-npu/Dockerfile + context: ../.. 
args: INSTALL_DEEPSPEED: false PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ../../hf_cache:/root/.cache/huggingface/ - - ../../data:/app/LLaMA-Factory/data - - ../../output:/app/LLaMA-Factory/output + - ./hf_cache:/root/.cache/huggingface/ + - ./data:/app/data + - ./output:/app/output - /usr/local/dcmi:/usr/local/dcmi - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi - - /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 - - /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info + - /usr/local/Ascend/driver:/usr/local/Ascend/driver - /etc/ascend_install.info:/etc/ascend_install.info ports: - "7860:7860" From cec2f1fc00ae29f627c897343035752936344cd7 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 01:03:38 +0800 Subject: [PATCH 111/160] Update README.md Former-commit-id: abe7aca5e133960da9200e3a036d9a550f474171 --- README.md | 84 ++++++++++++++++++++++++------------------------------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 1107ae0b..a20b848b 100644 --- a/README.md +++ b/README.md @@ -383,12 +383,6 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh | torch-npu | 2.1.0 | 2.1.0.post3 | | deepspeed | 0.13.2 | 0.13.2 | -Docker users please refer to [Build Docker](#Build-Docker). - -**NOTE** - -The default docker image is [cosdt/cann:8.0.rc1-910b-ubuntu22.04](https://hub.docker.com/layers/cosdt/cann/8.0.rc1-910b-ubuntu22.04/images/sha256-29ef8aacf6b2babd292f06f00b9190c212e7c79a947411e213135e4d41a178a9?context=explore). More options can be found at [cosdt/cann](https://hub.docker.com/r/cosdt/cann/tags). - Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use. If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations. @@ -425,50 +419,62 @@ llamafactory-cli webui ### Build Docker -#### Use Docker - -
For NVIDIA GPU users: +For CUDA users: ```bash -cd ./docker/docker-cuda -docker build -f ./Dockerfile \ +docker-compose -f ./docker/docker-cuda/docker-compose.yml up -d +docker-compose exec llamafactory bash +``` + +For Ascend NPU users: + +```bash +docker-compose -f ./docker/docker-npu/docker-compose.yml up -d +docker-compose exec llamafactory bash +``` + +
Build without Docker Compose + +For CUDA users: + +```bash +docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . -docker run -it --gpus=all \ - -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ - -v /$(dirname $(dirname "$PWD"))/data:/app/data \ - -v /$(dirname $(dirname "$PWD"))/output:/app/output \ +docker run -dit --gpus=all \ + -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./data:/app/data \ + -v ./output:/app/output \ -p 7860:7860 \ -p 8000:8000 \ --shm-size 16G \ --name llamafactory \ llamafactory:latest -``` -
-
For Ascend NPU users: +docker exec -it llamafactory bash +``` + +For Ascend NPU users: ```bash -cd ./docker/docker-npu -docker build -f ./Dockerfile \ +# Change docker image upon your environment +docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . -# add --device for multi-npu usage -# or modify --device to change npu card -docker run -it \ - -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ - -v /$(dirname $(dirname "$PWD"))/data:/app/data \ - -v /$(dirname $(dirname "$PWD"))/output:/app/output \ +# Change `device` upon your resources +docker run -dit \ + -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./data:/app/data \ + -v ./output:/app/output \ -v /usr/local/dcmi:/usr/local/dcmi \ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ - -v /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 \ - -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ -v /etc/ascend_install.info:/etc/ascend_install.info \ -p 7860:7860 \ -p 8000:8000 \ @@ -479,28 +485,12 @@ docker run -it \ --shm-size 16G \ --name llamafactory \ llamafactory:latest + +docker exec -it llamafactory bash ``` +
-#### Use Docker Compose - -Firstly enter your docker path: - -```bash -# for NVIDIA GPU users -cd ./docker/docker-cuda - -# for Ascend NPU users -cd ./docker/docker-npu -``` - -Then run the following command to build docker image and start the container: - -```bash -docker-compose up -d -docker-compose exec llamafactory bash -``` -
Details about volume - hf_cache: Utilize Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory. From b55eb3047407ec3e06fbaeaa2fb0e526e9219384 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 25 Jun 2024 01:06:59 +0800 Subject: [PATCH 112/160] Update README_zh.md Former-commit-id: f0c95160fea48b8c6291f42beb79ac089177fbb2 --- README_zh.md | 83 ++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 45 deletions(-) diff --git a/README_zh.md b/README_zh.md index 6326c0b5..3bed0846 100644 --- a/README_zh.md +++ b/README_zh.md @@ -360,7 +360,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
昇腾 NPU 用户指南 -在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e '.[torch-npu,metrics]'` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: +在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e ".[torch-npu,metrics]"` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: ```bash # 请替换 URL 为 CANN 版本和设备型号对应的 URL @@ -383,12 +383,6 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh | torch-npu | 2.1.0 | 2.1.0.post3 | | deepspeed | 0.13.2 | 0.13.2 | -Docker用户请参考 [构建 Docker](#构建-Docker). - -**NOTE** - -默认镜像为 [cosdt/cann:8.0.rc1-910b-ubuntu22.04](https://hub.docker.com/layers/cosdt/cann/8.0.rc1-910b-ubuntu22.04/images/sha256-29ef8aacf6b2babd292f06f00b9190c212e7c79a947411e213135e4d41a178a9?context=explore). 更多选择见 [cosdt/cann](https://hub.docker.com/r/cosdt/cann/tags). - 请使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定运算设备。 如果遇到无法正常推理的情况,请尝试设置 `do_sample: false`。 @@ -425,49 +419,62 @@ llamafactory-cli webui ### 构建 Docker -#### 使用 Docker - -
NVIDIA GPU 用户: +CUDA 用户: ```bash -cd ./docker/docker-cuda -docker build -f ./Dockerfile \ +docker-compose -f ./docker/docker-cuda/docker-compose.yml up -d +docker-compose exec llamafactory bash +``` + +昇腾 NPU 用户: + +```bash +docker-compose -f ./docker/docker-npu/docker-compose.yml up -d +docker-compose exec llamafactory bash +``` + +
不使用 Docker Compose 构建 + +CUDA 用户: + +```bash +docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . -docker run -it --gpus=all \ - -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ - -v /$(dirname $(dirname "$PWD"))/data:/app/data \ - -v /$(dirname $(dirname "$PWD"))/output:/app/output \ +docker run -dit --gpus=all \ + -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./data:/app/data \ + -v ./output:/app/output \ -p 7860:7860 \ -p 8000:8000 \ --shm-size 16G \ --name llamafactory \ llamafactory:latest -``` -
-
Ascend NPU 用户: +docker exec -it llamafactory bash +``` + +昇腾 NPU 用户: ```bash -cd ./docker/docker-npu -docker build -f ./Dockerfile \ +# 根据您的环境选择镜像 +docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . -# 增加 --device 来使用多卡 NPU 或修改第一个 --device 来更改 NPU 卡 -docker run -it \ - -v /$(dirname $(dirname "$PWD"))/hf_cache:/root/.cache/huggingface/ \ - -v /$(dirname $(dirname "$PWD"))/data:/app/data \ - -v /$(dirname $(dirname "$PWD"))/output:/app/output \ +# 根据您的资源更改 `device` +docker run -dit \ + -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./data:/app/data \ + -v ./output:/app/output \ -v /usr/local/dcmi:/usr/local/dcmi \ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ - -v /usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64 \ - -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ -v /etc/ascend_install.info:/etc/ascend_install.info \ -p 7860:7860 \ -p 8000:8000 \ @@ -478,26 +485,12 @@ docker run -it \ --shm-size 16G \ --name llamafactory \ llamafactory:latest + +docker exec -it llamafactory bash ``` +
-#### 使用 Docker Compose - -首先进入 docker 目录: -```bash -# NVIDIA GPU 用户 -cd ./docker/docker-cuda - -# Ascend NPU 用户 -cd ./docker/docker-npu -``` -然后运行以下命令创建 docker 镜像并启动容器: - -```bash -docker-compose up -d -docker-compose exec llamafactory bash -``` -
数据卷详情 - hf_cache:使用宿主机的 Hugging Face 缓存文件夹,允许更改为新的目录。 From 135bfbf7c1eb7d0c214d6add293cbd18e865f900 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 01:15:19 +0800 Subject: [PATCH 113/160] tiny fix Former-commit-id: bb57478366a70a0871af30ab31c890f471e27ff4 --- .dockerignore | 5 +++-- README.md | 10 ++++++---- README_zh.md | 6 ++++-- src/llamafactory/data/formatter.py | 2 +- src/llamafactory/data/template.py | 6 +++--- src/llamafactory/hparams/data_args.py | 8 ++++---- src/llamafactory/webui/components/train.py | 2 +- src/llamafactory/webui/runner.py | 2 +- 8 files changed, 23 insertions(+), 18 deletions(-) diff --git a/.dockerignore b/.dockerignore index 2ac0e11d..75cd2209 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,10 +4,11 @@ .venv cache data +docker +examples +saves hf_cache output -examples .dockerignore .gitattributes .gitignore -Dockerfile diff --git a/README.md b/README.md index a20b848b..e1b45236 100644 --- a/README.md +++ b/README.md @@ -360,7 +360,7 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
For Ascend NPU users -To install LLaMA Factory on Ascend NPU devices, please specify extra dependencies: `pip install -e '.[torch-npu,metrics]'`. Additionally, you need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**. Please follow the [installation tutorial](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html) or use the following commands: +To install LLaMA Factory on Ascend NPU devices, please specify extra dependencies: `pip install -e ".[torch-npu,metrics]"`. Additionally, you need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**. Please follow the [installation tutorial](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html) or use the following commands: ```bash # replace the url according to your CANN version and devices @@ -422,14 +422,16 @@ llamafactory-cli webui For CUDA users: ```bash -docker-compose -f ./docker/docker-cuda/docker-compose.yml up -d +cd docker/docker-cuda/ +docker-compose up -d docker-compose exec llamafactory bash ``` For Ascend NPU users: ```bash -docker-compose -f ./docker/docker-npu/docker-compose.yml up -d +cd docker/docker-npu/ +docker-compose up -d docker-compose exec llamafactory bash ``` @@ -461,7 +463,7 @@ docker exec -it llamafactory bash For Ascend NPU users: ```bash -# Change docker image upon your environment +# Choose docker image upon your environment docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ diff --git a/README_zh.md b/README_zh.md index 3bed0846..32edb1f7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -422,14 +422,16 @@ llamafactory-cli webui CUDA 用户: ```bash -docker-compose -f ./docker/docker-cuda/docker-compose.yml up -d +cd docker/docker-cuda/ +docker-compose up -d docker-compose exec llamafactory bash ``` 昇腾 NPU 用户: ```bash -docker-compose -f ./docker/docker-npu/docker-compose.yml up -d +cd docker/docker-npu/ +docker-compose up -d docker-compose exec llamafactory bash ``` diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index b5dc57ff..88ebf682 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -216,7 +216,7 @@ class ToolFormatter(Formatter): self._tool_formatter = glm4_tool_formatter self._tool_extractor = glm4_tool_extractor else: - raise ValueError("Tool format was not found.") + raise NotImplementedError("Tool format {} was not found.".format(self.tool_format)) def apply(self, **kwargs) -> SLOTS: content = kwargs.pop("content") diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 3a72a858..193ff482 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -387,8 +387,9 @@ def get_template_and_fix_tokenizer( template = TEMPLATES.get(name, None) if template is None: raise ValueError("Template {} does not exist.".format(name)) - - if tool_format: + + if tool_format is not None: + logger.info("Using tool format: {}.".format(tool_format)) template.format_tools = ToolFormatter(tool_format=tool_format) stop_words = template.stop_words @@ -625,7 +626,6 @@ _register_template( _register_template( name="empty", - format_prefix=EmptyFormatter(slots=[{"bos_token"}]), efficient_eos=True, ) diff --git 
a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 959742e3..dad13820 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -29,10 +29,6 @@ class DataArguments: default=None, metadata={"help": "Which template to use for constructing prompts in training and inference."}, ) - tool_format: Optional[str] = field( - default=None, - metadata={"help": "Specifies the tool format template for function calling ."}, - ) dataset: Optional[str] = field( default=None, metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."}, @@ -105,6 +101,10 @@ class DataArguments: "help": "Whether or not to pack the sequences in training. Will automatically enable in pre-training." }, ) + tool_format: Optional[str] = field( + default=None, + metadata={"help": "Tool format to use for constructing function calling examples."}, + ) tokenized_path: Optional[str] = field( default=None, metadata={"help": "Path to save or load the tokenized datasets."}, diff --git a/src/llamafactory/webui/components/train.py b/src/llamafactory/webui/components/train.py index 874f3c5e..f33c37ee 100644 --- a/src/llamafactory/webui/components/train.py +++ b/src/llamafactory/webui/components/train.py @@ -291,7 +291,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: with gr.Column(scale=1): loss_viewer = gr.Plot() - input_elems.update({output_dir, config_path, device_count, ds_stage, ds_offload}) + input_elems.update({output_dir, config_path, ds_stage, ds_offload}) elem_dict.update( dict( cmd_preview_btn=cmd_preview_btn, diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 6cd21b07..549ec765 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -306,7 +306,7 @@ class Runner: def _form_config_dict(self, data: Dict["Component", Any]) -> Dict[str, Any]: config_dict = {} - skip_ids = ["top.lang", "top.model_path", "train.output_dir", "train.config_path", "train.device_count"] + skip_ids = ["top.lang", "top.model_path", "train.output_dir", "train.config_path"] for elem, value in data.items(): elem_id = self.manager.get_id_by_elem(elem) if elem_id not in skip_ids: From 98fb3d015a634122b84ea282d70e3dabdaa46dc1 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 01:51:29 +0800 Subject: [PATCH 114/160] fix #4419 Former-commit-id: 15069c3ca814d5ac9beec77d914b71cde7ea0f47 --- README.md | 6 ++++-- README_zh.md | 6 ++++-- docker/docker-cuda/Dockerfile | 2 +- docker/docker-cuda/docker-compose.yml | 3 ++- docker/docker-npu/Dockerfile | 2 +- docker/docker-npu/docker-compose.yml | 3 ++- 6 files changed, 14 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e1b45236..4b42edd7 100644 --- a/README.md +++ b/README.md @@ -448,7 +448,8 @@ docker build -f ./docker/docker-cuda/Dockerfile \ -t llamafactory:latest . 
docker run -dit --gpus=all \ - -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./hf_cache:/root/.cache/huggingface \ + -v ./ms_cache:/root/.cache/modelscope \ -v ./data:/app/data \ -v ./output:/app/output \ -p 7860:7860 \ @@ -471,7 +472,8 @@ docker build -f ./docker/docker-npu/Dockerfile \ # Change `device` upon your resources docker run -dit \ - -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./hf_cache:/root/.cache/huggingface \ + -v ./ms_cache:/root/.cache/modelscope \ -v ./data:/app/data \ -v ./output:/app/output \ -v /usr/local/dcmi:/usr/local/dcmi \ diff --git a/README_zh.md b/README_zh.md index 32edb1f7..3926c09d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -448,7 +448,8 @@ docker build -f ./docker/docker-cuda/Dockerfile \ -t llamafactory:latest . docker run -dit --gpus=all \ - -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./hf_cache:/root/.cache/huggingface \ + -v ./ms_cache:/root/.cache/modelscope \ -v ./data:/app/data \ -v ./output:/app/output \ -p 7860:7860 \ @@ -471,7 +472,8 @@ docker build -f ./docker/docker-npu/Dockerfile \ # 根据您的资源更改 `device` docker run -dit \ - -v ./hf_cache:/root/.cache/huggingface/ \ + -v ./hf_cache:/root/.cache/huggingface \ + -v ./ms_cache:/root/.cache/modelscope \ -v ./data:/app/data \ -v ./output:/app/output \ -v /usr/local/dcmi:/usr/local/dcmi \ diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 2d20bfe4..827b7b3c 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -36,7 +36,7 @@ RUN EXTRA_PACKAGES="metrics"; \ pip uninstall -y transformer-engine flash-attn # Set up volumes -VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] +VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] # Expose port 7860 for the LLaMA Board ENV GRADIO_SERVER_PORT 7860 diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index 04d6531a..1c0a3c75 100644 --- a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -10,7 +10,8 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface/ + - ./hf_cache:/root/.cache/huggingface + - ./ms_cache:/root/.cache/modelscope - ./data:/app/data - ./output:/app/output ports: diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 0fdd4472..08de626b 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -30,7 +30,7 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \ pip uninstall -y transformer-engine flash-attn # Set up volumes -VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ] +VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] # Expose port 7860 for the LLaMA Board ENV GRADIO_SERVER_PORT 7860 diff --git a/docker/docker-npu/docker-compose.yml b/docker/docker-npu/docker-compose.yml index 7fff6e73..a6b878fd 100644 --- a/docker/docker-npu/docker-compose.yml +++ b/docker/docker-npu/docker-compose.yml @@ -8,7 +8,8 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface/ + - ./hf_cache:/root/.cache/huggingface + - ./ms_cache:/root/.cache/modelscope - ./data:/app/data - ./output:/app/output - /usr/local/dcmi:/usr/local/dcmi From 9fd7a410bb899f6aa28f2007bc7573b151f3dfe4 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 01:54:53 +0800 Subject: [PATCH 115/160] tiny fix about badam Former-commit-id: 
03f49267c7406e36aee35639f86e6e0383897090 --- .github/workflows/tests.yml | 2 +- .gitignore | 8 ++-- ...ma3_lora_sft.yaml => llama3_full_sft.yaml} | 2 +- ...adam_sft.yaml => llama3_full_sft_ds3.yaml} | 4 +- examples/extras/badam/train_single_gpu.sh | 37 ------------------ examples/extras/badam/train_zero3.sh | 39 ------------------- setup.py | 2 +- src/llamafactory/hparams/parser.py | 14 +++---- src/llamafactory/train/dpo/trainer.py | 3 +- src/llamafactory/train/kto/trainer.py | 3 +- src/llamafactory/train/ppo/trainer.py | 3 +- src/llamafactory/train/pt/trainer.py | 3 +- src/llamafactory/train/rm/trainer.py | 3 +- src/llamafactory/train/sft/trainer.py | 3 +- src/llamafactory/train/trainer_utils.py | 7 +--- 15 files changed, 31 insertions(+), 102 deletions(-) rename examples/extras/badam/{llama3_lora_sft.yaml => llama3_full_sft.yaml} (97%) rename examples/extras/badam/{llama3_badam_sft.yaml => llama3_full_sft_ds3.yaml} (89%) delete mode 100644 examples/extras/badam/train_single_gpu.sh delete mode 100644 examples/extras/badam/train_zero3.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 98bd9455..73d77de5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install .[torch,dev] + python -m pip install ".[torch,dev]" - name: Check quality run: | diff --git a/.gitignore b/.gitignore index 2486e728..82e6e9e6 100644 --- a/.gitignore +++ b/.gitignore @@ -160,8 +160,8 @@ cython_debug/ .idea/ # custom .gitignore -user.config -saves/ cache/ -wandb -ds_badam_exp \ No newline at end of file +config/ +saves/ +output/ +wandb/ diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_full_sft.yaml similarity index 97% rename from examples/extras/badam/llama3_lora_sft.yaml rename to examples/extras/badam/llama3_full_sft.yaml index a78de2fa..31d61c33 100644 --- a/examples/extras/badam/llama3_lora_sft.yaml +++ b/examples/extras/badam/llama3_full_sft.yaml @@ -6,6 +6,7 @@ stage: sft do_train: true finetuning_type: full use_badam: true +badam_mode: layer badam_switch_mode: ascending badam_switch_interval: 50 badam_verbose: 2 @@ -32,7 +33,6 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -pure_bf16: true ### eval val_size: 0.1 diff --git a/examples/extras/badam/llama3_badam_sft.yaml b/examples/extras/badam/llama3_full_sft_ds3.yaml similarity index 89% rename from examples/extras/badam/llama3_badam_sft.yaml rename to examples/extras/badam/llama3_full_sft_ds3.yaml index f5adb220..f2d7309f 100644 --- a/examples/extras/badam/llama3_badam_sft.yaml +++ b/examples/extras/badam/llama3_full_sft_ds3.yaml @@ -6,9 +6,11 @@ stage: sft do_train: true finetuning_type: full use_badam: true +badam_mode: layer badam_switch_mode: ascending badam_switch_interval: 50 badam_verbose: 2 +deepspeed: examples/deepspeed/ds_z3_config.json ### dataset dataset: identity,alpaca_en_demo @@ -28,7 +30,7 @@ overwrite_output_dir: true ### train per_device_train_batch_size: 1 gradient_accumulation_steps: 8 -learning_rate: 1.0e-6 +learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 diff --git a/examples/extras/badam/train_single_gpu.sh b/examples/extras/badam/train_single_gpu.sh deleted file mode 100644 index 8af79007..00000000 --- a/examples/extras/badam/train_single_gpu.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -export CUDA_VISIBLE_DEVICES=0 - -cd ../../.. 
- -llamafactory-cli train \ - --stage sft \ - --do_train True \ - --model_name_or_path meta-llama/Llama-2-13b-hf \ - --preprocessing_num_workers 16 \ - --finetuning_type full \ - --template default \ - --flash_attn auto \ - --dataset_dir data \ - --dataset alpaca_en_demo \ - --cutoff_len 1024 \ - --learning_rate 1e-6 \ - --num_train_epochs 3.0 \ - --max_samples 100000 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 8 \ - --lr_scheduler_type cosine \ - --max_grad_norm 1.0 \ - --logging_steps 5 \ - --save_steps 100 \ - --warmup_steps 0 \ - --optim adamw_torch \ - --packing False \ - --report_to none \ - --use_badam True \ - --output_dir saves/LLaMA2-13B/full/BAdam \ - --plot_loss True \ - --ddp_timeout 180000000 \ - --include_num_input_tokens_seen True \ - --badam_mode layer \ - --badam_switch_mode ascending \ - --badam_switch_interval 50 \ No newline at end of file diff --git a/examples/extras/badam/train_zero3.sh b/examples/extras/badam/train_zero3.sh deleted file mode 100644 index 3b182134..00000000 --- a/examples/extras/badam/train_zero3.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -cd ../../.. - -llamafactory-cli train \ - --stage sft \ - --do_train True \ - --model_name_or_path meta-llama/Llama-2-13b-hf \ - --preprocessing_num_workers 16 \ - --finetuning_type full \ - --template default \ - --flash_attn auto \ - --dataset_dir data \ - --dataset alpaca_en_demo \ - --cutoff_len 1024 \ - --learning_rate 1e-6 \ - --num_train_epochs 3.0 \ - --max_samples 100000 \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 2 \ - --lr_scheduler_type cosine \ - --max_grad_norm 1.0 \ - --logging_steps 5 \ - --save_steps 100 \ - --warmup_steps 0 \ - --optim adamw_torch \ - --packing False \ - --report_to none \ - --use_badam True \ - --output_dir saves/LLaMA2-13B/full/BAdam \ - --fp16 True \ - --plot_loss True \ - --ddp_timeout 180000000 \ - --include_num_input_tokens_seen True \ - --badam_mode layer \ - --badam_switch_mode ascending \ - --badam_switch_interval 50 \ - --deepspeed cache/ds_z3_config.json \ No newline at end of file diff --git a/setup.py b/setup.py index 3d2ac921..64f50a87 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ extra_require = { "bitsandbytes": ["bitsandbytes>=0.39.0"], "vllm": ["vllm>=0.4.3"], "galore": ["galore-torch"], - "badam": ["badam"], + "badam": ["badam>=1.2.1"], "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"], "awq": ["autoawq"], "aqlm": ["aqlm[gpu]>=1.1.0"], diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index f2ccd5e6..a4b7f7a5 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -121,7 +121,7 @@ def _check_extra_dependencies( require_version("galore_torch", "To fix: pip install galore_torch") if finetuning_args.use_badam: - require_version("badam", "To fix: pip install badam") + require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1") if finetuning_args.plot_loss: require_version("matplotlib", "To fix: pip install matplotlib") @@ -214,15 +214,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if ( finetuning_args.use_badam - and training_args.parallel_mode.value == "distributed" + and training_args.parallel_mode == ParallelMode.DISTRIBUTED ): if finetuning_args.badam_mode == "ratio": - raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer") - if finetuning_args.badam_mode == "layer" and (not 
is_deepspeed_zero3_enabled()): - raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage.") + raise ValueError("Radio-based BAdam does not yet support distributed training, use layer-wise BAdam.") + elif not is_deepspeed_zero3_enabled(): + raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.") - if (finetuning_args.use_galore) and training_args.deepspeed is not None: - raise ValueError("GaLore are incompatible with DeepSpeed yet.") + if finetuning_args.use_galore and training_args.deepspeed is not None: + raise ValueError("GaLore is incompatible with DeepSpeed yet.") if model_args.infer_backend == "vllm": raise ValueError("vLLM backend is only available for API, CLI and Web.") diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index a3e0e961..ed4fd5d9 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -96,7 +96,8 @@ class CustomDPOTrainer(DPOTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index 0d50987f..c2edf95a 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -91,7 +91,8 @@ class CustomKTOTrainer(KTOTrainer): self.ref_model.eval() if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 2d5d7ffc..70d01919 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -166,7 +166,8 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index d3516b41..b6fb161d 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -48,7 +48,8 @@ class CustomTrainer(Trainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 433251cf..70c2e9a0 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -72,7 +72,8 @@ class PairwiseTrainer(Trainer): self.processor = processor self.can_return_loss = True # override 
property to return eval_loss if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 45799b96..8f18317f 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -56,7 +56,8 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: - from badam import clip_grad_norm_old_version, BAdamCallback + from badam import BAdamCallback, clip_grad_norm_old_version + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) self.callback_handler.add_callback(BAdamCallback) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 0206dcb6..21d41c36 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -23,6 +23,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch from peft import PeftModel from transformers import Trainer +from transformers.integrations import is_deepspeed_zero3_enabled from transformers.optimization import get_scheduler from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS from transformers.trainer_pt_utils import get_parameter_names @@ -372,9 +373,6 @@ def _create_badam_optimizer( dict(params=decay_params, weight_decay=training_args.weight_decay), ] - from transformers.integrations import is_deepspeed_zero3_enabled - ds_zero3_enabled = is_deepspeed_zero3_enabled() - if finetuning_args.badam_mode == "layer": from badam import BlockOptimizer @@ -387,7 +385,7 @@ def _create_badam_optimizer( start_block=finetuning_args.badam_start_block, switch_mode=finetuning_args.badam_switch_mode, verbose=finetuning_args.badam_verbose, - ds_zero3_enabled=ds_zero3_enabled + ds_zero3_enabled=is_deepspeed_zero3_enabled(), ) logger.info( f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, " @@ -398,7 +396,6 @@ def _create_badam_optimizer( elif finetuning_args.badam_mode == "ratio": from badam import BlockOptimizerRatio - assert not ds_zero3_enabled, "BAdam with ratio-based update does not support Deepspeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer." assert finetuning_args.badam_update_ratio > 1e-6 optimizer = BlockOptimizerRatio( param_groups=param_groups, From 0f82a55305edc6e18048c5ee40ec844d7a9c9f34 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 02:31:44 +0800 Subject: [PATCH 116/160] fix #4379 Former-commit-id: 96bedb4b6445a04ff8b97fb2aadace50b2f882df --- src/llamafactory/train/tuner.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py index 788b4c4f..a02fff22 100644 --- a/src/llamafactory/train/tuner.py +++ b/src/llamafactory/train/tuner.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os
+import shutil
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import torch
@@ -19,6 +21,7 @@ from transformers import PreTrainedModel
 
 from ..data import get_template_and_fix_tokenizer
 from ..extras.callbacks import LogCallback
+from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
 from ..extras.logging import get_logger
 from ..hparams import get_infer_args, get_train_args
 from ..model import load_model, load_tokenizer
@@ -98,6 +101,25 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
         safe_serialization=(not model_args.export_legacy_format),
     )
 
+    if finetuning_args.stage == "rm":
+        if model_args.adapter_name_or_path is not None:
+            vhead_path = model_args.adapter_name_or_path[-1]
+        else:
+            vhead_path = model_args.model_name_or_path
+
+        if os.path.exists(os.path.join(vhead_path, V_HEAD_SAFE_WEIGHTS_NAME)):
+            shutil.copy(
+                os.path.join(vhead_path, V_HEAD_SAFE_WEIGHTS_NAME),
+                os.path.join(model_args.export_dir, V_HEAD_SAFE_WEIGHTS_NAME),
+            )
+            logger.info("Copied valuehead to {}.".format(model_args.export_dir))
+        elif os.path.exists(os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME)):
+            shutil.copy(
+                os.path.join(vhead_path, V_HEAD_WEIGHTS_NAME),
+                os.path.join(model_args.export_dir, V_HEAD_WEIGHTS_NAME),
+            )
+            logger.info("Copied valuehead to {}.".format(model_args.export_dir))
+
     try:
         tokenizer.padding_side = "left"  # restore padding side
         tokenizer.init_kwargs["padding_side"] = "left"
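For the value-head copy above to trigger, the export has to be run with the reward-model stage. A hedged sketch of such an export config; the model, adapter and output paths are placeholders rather than values taken from the repository:

```yaml
### model (placeholders)
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
adapter_name_or_path: saves/llama3-8b/lora/reward
template: llama3
finetuning_type: lora
stage: rm  # triggers copying value_head.safetensors / value_head.bin into export_dir

### export
export_dir: models/llama3_reward
export_size: 2
export_legacy_format: false
```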
00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 02:55:50 +0800 Subject: [PATCH 119/160] lint Former-commit-id: c9e424d2198b5872ce118a6ab4c109bf73be2bee --- src/llamafactory/hparams/parser.py | 5 +---- tests/data/test_formatter.py | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index a4b7f7a5..d4bcfbc6 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -212,10 +212,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): raise ValueError("Distributed training does not support layer-wise GaLore.") - if ( - finetuning_args.use_badam - and training_args.parallel_mode == ParallelMode.DISTRIBUTED - ): + if finetuning_args.use_badam and training_args.parallel_mode == ParallelMode.DISTRIBUTED: if finetuning_args.badam_mode == "ratio": raise ValueError("Radio-based BAdam does not yet support distributed training, use layer-wise BAdam.") elif not is_deepspeed_zero3_enabled(): diff --git a/tests/data/test_formatter.py b/tests/data/test_formatter.py index a01e8a7e..37b21dc5 100644 --- a/tests/data/test_formatter.py +++ b/tests/data/test_formatter.py @@ -113,9 +113,7 @@ def test_glm4_tool_formatter(): assert formatter.apply(content=json.dumps(tools)) == [ "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具\n\n" - "## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( - json.dumps(tools[0], indent=4) - ) + "## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format(json.dumps(tools[0], indent=4)) ] From a4f69d8914ac9b73997a452242f40e23f29605c3 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Tue, 25 Jun 2024 14:34:13 +0800 Subject: [PATCH 120/160] fix #4456 Former-commit-id: 920f4fa4ca9e08bcf0d16450e085ee0fa8b4e1c5 --- .dockerignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 75cd2209..23ad75a8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,7 +5,6 @@ cache data docker -examples saves hf_cache output From daea86e0470b8b4c0cf31265798e670bac2f6a49 Mon Sep 17 00:00:00 2001 From: hzhaoy Date: Tue, 25 Jun 2024 15:13:07 +0800 Subject: [PATCH 121/160] support flash-attn in Dockerfile Former-commit-id: 0dba000aa178f915cea7d75bf0c9d47e671a21d2 --- docker/docker-cuda/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 827b7b3c..06a172f0 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -35,6 +35,11 @@ RUN EXTRA_PACKAGES="metrics"; \ pip install -e .[$EXTRA_PACKAGES] && \ pip uninstall -y transformer-engine flash-attn +# Rebuild flash-attn +RUN ninja --version || \ + (pip uninstall -y ninja && pip install ninja) && \ + MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation + # Set up volumes VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] From e115d55585380fddb4695ec6be1d25aa2463d351 Mon Sep 17 00:00:00 2001 From: MengqingCao Date: Wed, 26 Jun 2024 02:15:00 +0000 Subject: [PATCH 122/160] fix docker-compose path Former-commit-id: 9de3c24aa2a8268be06c8fef8e47f4fb6715c7ec --- docker/docker-cuda/docker-compose.yml | 8 ++++---- docker/docker-npu/docker-compose.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index 1c0a3c75..e2d1a5ad 100644 --- 
a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -10,10 +10,10 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface - - ./ms_cache:/root/.cache/modelscope - - ./data:/app/data - - ./output:/app/output + - ../../hf_cache:/root/.cache/huggingface + - ../../ms_cache:/root/.cache/modelscope + - ../../data:/app/data + - ../../output:/app/output ports: - "7860:7860" - "8000:8000" diff --git a/docker/docker-npu/docker-compose.yml b/docker/docker-npu/docker-compose.yml index a6b878fd..657cba9f 100644 --- a/docker/docker-npu/docker-compose.yml +++ b/docker/docker-npu/docker-compose.yml @@ -8,10 +8,10 @@ services: PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: - - ./hf_cache:/root/.cache/huggingface - - ./ms_cache:/root/.cache/modelscope - - ./data:/app/data - - ./output:/app/output + - ../../hf_cache:/root/.cache/huggingface + - ../../ms_cache:/root/.cache/modelscope + - ../../data:/app/data + - ../../output:/app/output - /usr/local/dcmi:/usr/local/dcmi - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi - /usr/local/Ascend/driver:/usr/local/Ascend/driver From d82d86e16dfbb9ec5e8cc48d3c79bf559b685fb3 Mon Sep 17 00:00:00 2001 From: fanjunliang Date: Wed, 26 Jun 2024 18:21:42 +0800 Subject: [PATCH 123/160] fix torch-npu dependency Former-commit-id: 7c8a8061d0cda6342f6c883748fb6bc6650df9f9 --- docker/docker-npu/Dockerfile | 9 +++++---- setup.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 08de626b..0ec16107 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -7,16 +7,17 @@ ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple +ARG EXTRA_INDEX=https://download.pytorch.org/whl/cpu # Set the working directory WORKDIR /app # Install the requirements COPY requirements.txt /app -RUN pip config set global.index-url $PIP_INDEX -RUN pip config set global.extra-index-url $PIP_INDEX -RUN python -m pip install --upgrade pip -RUN python -m pip install -r requirements.txt +RUN pip config set global.index-url $PIP_INDEX && \ + pip config set global.extra-index-url $EXTRA_INDEX && \ + pip install --upgrade pip && \ + pip install -r requirements.txt # Copy the rest of the application into the image COPY . 
/app diff --git a/setup.py b/setup.py index 64f50a87..89301d1b 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ def get_requires(): extra_require = { "torch": ["torch>=1.13.1"], - "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], From cf2dc4c44462dd917b3ddd234399f35d21eef08f Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 26 Jun 2024 19:43:16 +0800 Subject: [PATCH 124/160] fix #4556 Former-commit-id: 81faa9a985c14e83e38f42aedd228edb676b0695 --- scripts/loftq_init.py | 8 ++++++-- scripts/pissa_init.py | 8 ++++++-- src/llamafactory/data/template.py | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/loftq_init.py b/scripts/loftq_init.py index b9506fa3..4d2c01b9 100644 --- a/scripts/loftq_init.py +++ b/scripts/loftq_init.py @@ -36,15 +36,19 @@ def quantize_loftq( lora_alpha: int = None, lora_rank: int = 16, lora_dropout: float = 0, - lora_target: str = "q_proj,v_proj", + lora_target: tuple = ("q_proj", "v_proj"), save_safetensors: bool = True, ): r""" Initializes LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ) Usage: python loftq_init.py --model_name_or_path path_to_model --output_dir output_dir """ + if isinstance(lora_target, str): + lora_target = [name.strip() for name in lora_target.split(",")] + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") + loftq_config = LoftQConfig(loftq_bits=loftq_bits, loftq_iter=loftq_iter) lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, @@ -52,7 +56,7 @@ def quantize_loftq( r=lora_rank, lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, lora_dropout=lora_dropout, - target_modules=[name.strip() for name in lora_target.split(",")], + target_modules=lora_target, init_lora_weights="loftq", loftq_config=loftq_config, ) diff --git a/scripts/pissa_init.py b/scripts/pissa_init.py index 50239727..ad9d161c 100644 --- a/scripts/pissa_init.py +++ b/scripts/pissa_init.py @@ -35,21 +35,25 @@ def quantize_pissa( lora_alpha: int = None, lora_rank: int = 16, lora_dropout: float = 0, - lora_target: str = "q_proj,v_proj", + lora_target: tuple = ("q_proj", "v_proj"), save_safetensors: bool = True, ): r""" Initializes LoRA weights with Principal Singular values and Singular vectors Adaptation (PiSSA) Usage: python pissa_init.py --model_name_or_path path_to_model --output_dir output_dir """ + if isinstance(lora_target, str): + lora_target = [name.strip() for name in lora_target.split(",")] + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto") + lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, r=lora_rank, lora_alpha=lora_alpha if lora_alpha is not None else lora_rank * 2, lora_dropout=lora_dropout, - target_modules=[name.strip() for name in lora_target.split(",")], + target_modules=lora_target, init_lora_weights="pissa" if pissa_iter == -1 else "pissa_niter_{}".format(pissa_iter), ) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 193ff482..53f16df4 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -618,7 
+618,7 @@ _register_template( _register_template( name="default", - format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]), + format_user=StringFormatter(slots=["Human: {{content}}\nAssistant:"]), format_system=StringFormatter(slots=["{{content}}\n"]), format_separator=EmptyFormatter(slots=["\n"]), ) From 72ba29d81a498e7284ac79b3ee7b294b443521be Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 26 Jun 2024 19:52:35 +0800 Subject: [PATCH 125/160] fix #4458 Former-commit-id: aab14b15268dbe74ded22549dbd3677474868cbb --- src/llamafactory/train/ppo/trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index 70d01919..c5f6e175 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -99,10 +99,10 @@ class CustomPPOTrainer(PPOTrainer, Trainer): ) # Add deepspeed config - ppo_config.accelerator_kwargs["kwargs_handlers"] = [ - DistributedDataParallelKwargs(find_unused_parameters=training_args.ddp_find_unused_parameters) - ] if training_args.deepspeed_plugin is not None: + ppo_config.accelerator_kwargs["kwargs_handlers"] = [ + DistributedDataParallelKwargs(find_unused_parameters=training_args.ddp_find_unused_parameters) + ] ppo_config.accelerator_kwargs["deepspeed_plugin"] = training_args.deepspeed_plugin # Create optimizer and scheduler From 08fa707085d0c1587a47dfa27dce42b37cab17aa Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Wed, 26 Jun 2024 22:11:44 +0800 Subject: [PATCH 126/160] improve autogptq integration Former-commit-id: d68408c7b123b8ff92014db35cac0b24b414a6f4 --- setup.py | 2 +- .../model/model_utils/quantization.py | 41 ++++++++++++------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 64f50a87..8254b6d4 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ extra_require = { "vllm": ["vllm>=0.4.3"], "galore": ["galore-torch"], "badam": ["badam>=1.2.1"], - "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"], + "gptq": ["optimum>=1.17.0", "auto-gptq>=0.5.0"], "awq": ["autoawq"], "aqlm": ["aqlm[gpu]>=1.1.0"], "qwen": ["transformers_stream_generator"], diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index 5251f84f..fab61cb8 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -57,9 +57,9 @@ class QuantizationMethod(str, Enum): HQQ = "hqq" -def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]: +def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[Dict[str, Any]]: r""" - TODO: remove tokenizer.decode() https://github.com/huggingface/optimum/pull/1600 + Prepares the dataset to perform AutoGPTQ. 
""" if os.path.isfile(model_args.export_quantization_dataset): data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None) @@ -68,20 +68,32 @@ def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "Mod data_path = model_args.export_quantization_dataset data_files = None - dataset = load_dataset(path=data_path, data_files=data_files, split="train", cache_dir=model_args.cache_dir) - maxlen = model_args.export_quantization_maxlen + dataset = load_dataset( + path=data_path, + data_files=data_files, + split="train", + cache_dir=model_args.cache_dir, + token=model_args.hf_hub_token, + ) samples = [] + maxlen = model_args.export_quantization_maxlen for _ in range(model_args.export_quantization_nsamples): + n_try = 0 while True: + if n_try > 100: + raise ValueError("Cannot find satisfying example, considering decrease `export_quantization_maxlen`.") + sample_idx = random.randint(0, len(dataset) - 1) - sample: Dict[str, torch.Tensor] = tokenizer(dataset[sample_idx]["text"], return_tensors="pt") - if sample["input_ids"].size(1) >= maxlen: + sample: Dict[str, "torch.Tensor"] = tokenizer(dataset[sample_idx]["text"], return_tensors="pt") + n_try += 1 + if sample["input_ids"].size(1) > maxlen: break # TODO: fix large maxlen word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen - 1) input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen] - samples.append(tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)) + attention_mask = sample["attention_mask"][:, word_idx : word_idx + maxlen] + samples.append({"input_ids": input_ids, "attention_mask": attention_mask}) return samples @@ -119,21 +131,20 @@ def configure_quantization( logger.info("Loading {}-bit {}-quantized model.".format(quant_bits, quant_method.upper())) elif model_args.export_quantization_bit is not None: # auto-gptq - require_version("optimum>=1.16.0", "To fix: pip install optimum>=1.16.0") + require_version("optimum>=1.17.0", "To fix: pip install optimum>=1.17.0") require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") from accelerate.utils import get_max_memory if getattr(config, "model_type", None) == "chatglm": - raise ValueError("ChatGLM model is not supported.") + raise ValueError("ChatGLM model is not supported yet.") init_kwargs["quantization_config"] = GPTQConfig( bits=model_args.export_quantization_bit, - tokenizer=tokenizer, dataset=_get_quantization_dataset(tokenizer, model_args), ) init_kwargs["device_map"] = "auto" init_kwargs["max_memory"] = get_max_memory() - logger.info("Quantizing model to {} bit.".format(model_args.export_quantization_bit)) + logger.info("Quantizing model to {} bit with AutoGPTQ.".format(model_args.export_quantization_bit)) elif model_args.quantization_bit is not None: # bnb if model_args.quantization_bit == 8: @@ -150,9 +161,9 @@ def configure_quantization( bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora ) - # assign device map if: - # 1. not deepspeed zero3 and not fsdp - # 2. not auto quantization device map + # Do not assign device map if: + # 1. deepspeed zero3 or fsdp (train) + # 2. 
auto quantization device map (inference) if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": if model_args.quantization_bit != 4: raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") @@ -161,4 +172,4 @@ def configure_quantization( else: init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference - logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit)) + logger.info("Quantizing model to {} bit with bitsandbytes.".format(model_args.quantization_bit)) From b46bd07119a69111047c303c03db9b8c1bb94f39 Mon Sep 17 00:00:00 2001 From: hzhaoy Date: Thu, 27 Jun 2024 00:11:04 +0800 Subject: [PATCH 127/160] add flash-attn installation flag in Dockerfile Former-commit-id: 2535044e95f6df628bd1f01e0eecb02407105d79 --- README.md | 1 + README_zh.md | 1 + docker/docker-cuda/Dockerfile | 9 ++++++--- docker/docker-cuda/docker-compose.yml | 1 + 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4b42edd7..cdca8333 100644 --- a/README.md +++ b/README.md @@ -444,6 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ + --build-arg INSTALL_FLASH_ATTN=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . diff --git a/README_zh.md b/README_zh.md index 3926c09d..d26c8268 100644 --- a/README_zh.md +++ b/README_zh.md @@ -444,6 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ + --build-arg INSTALL_FLASH_ATTN=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . 
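
For context on the INSTALL_FLASH_ATTN build argument introduced in this patch: the flash-attn wheel compiled when the flag is true is consumed on the Python side by requesting FlashAttention-2 at model load time. A minimal sketch of that pattern with transformers (not the project's actual loader code; the model id is a placeholder):

from transformers import AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# Fall back to PyTorch SDPA when no flash-attn wheel is installed.
attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model id
    torch_dtype="auto",
    attn_implementation=attn_implementation,
)
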
diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 06a172f0..44aaf538 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -6,6 +6,7 @@ FROM nvcr.io/nvidia/pytorch:24.02-py3 ARG INSTALL_BNB=false ARG INSTALL_VLLM=false ARG INSTALL_DEEPSPEED=false +ARG INSTALL_FLASH_ATTN=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory @@ -36,9 +37,11 @@ RUN EXTRA_PACKAGES="metrics"; \ pip uninstall -y transformer-engine flash-attn # Rebuild flash-attn -RUN ninja --version || \ - (pip uninstall -y ninja && pip install ninja) && \ - MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation +RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \ + ninja --version || \ + (pip uninstall -y ninja && pip install ninja) && \ + MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation \ + fi; # Set up volumes VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index 1c0a3c75..ad269cb0 100644 --- a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -7,6 +7,7 @@ services: INSTALL_BNB: false INSTALL_VLLM: false INSTALL_DEEPSPEED: false + INSTALL_FLASH_ATTN: false PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: From 8aaf1185a53841110e3ff857a0d6cf5161ee72a6 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 00:29:42 +0800 Subject: [PATCH 128/160] support HQQ/EETQ #4113 Former-commit-id: b7cb51ddb394f04fe4646b2c297fc8d918c9979e --- README.md | 4 +- README_zh.md | 4 +- setup.py | 8 +- src/llamafactory/extras/env.py | 5 +- src/llamafactory/hparams/model_args.py | 7 +- src/llamafactory/model/__init__.py | 2 + src/llamafactory/model/loader.py | 4 +- .../model/model_utils/quantization.py | 75 ++++++++++++------- src/llamafactory/webui/chatter.py | 10 ++- src/llamafactory/webui/common.py | 2 + src/llamafactory/webui/components/export.py | 5 +- src/llamafactory/webui/components/top.py | 13 ++-- src/llamafactory/webui/locales.py | 20 ++++- src/llamafactory/webui/manager.py | 1 + src/llamafactory/webui/runner.py | 18 ++++- src/llamafactory/webui/utils.py | 13 ++++ 16 files changed, 134 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 4b42edd7..443c8cf7 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Choose your path: - **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc. - **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, etc. -- **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8. +- **Scalable resources**: 16-bit full-tuning, freeze-tuning, LoRA and 2/3/4/5/6/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ. - **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ, PiSSA and Agent tuning. - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA. - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc. 
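
The three on-the-fly backends named in the updated feature list correspond to transformers quantization configs. A brief sketch mirroring the configs constructed later in this patch (not a verbatim excerpt; it assumes a transformers release that ships HqqConfig/EetqConfig and that the hqq and eetq packages are installed):

import torch
from transformers import BitsAndBytesConfig, EetqConfig, HqqConfig

# bitsandbytes: 4- or 8-bit QLoRA-style quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

# HQQ: 1/2/3/4/5/6/8-bit; axis=0 selects the faster ATEN kernel
hqq_config = HqqConfig(nbits=4, quant_zero=False, quant_scale=False, axis=0)

# EETQ: 8-bit only
eetq_config = EetqConfig()

Any one of these can be passed as quantization_config to AutoModelForCausalLM.from_pretrained.
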
@@ -341,7 +341,7 @@ cd LLaMA-Factory pip install -e ".[torch,metrics]" ``` -Extra dependencies available: torch, torch_npu, metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality +Extra dependencies available: torch, torch-npu, metrics, deepspeed, bitsandbytes, hqq, eetq, gptq, awq, aqlm, vllm, galore, badam, qwen, modelscope, quality > [!TIP] > Use `pip install --no-deps -e .` to resolve package conflicts. diff --git a/README_zh.md b/README_zh.md index 3926c09d..d5172a7d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -48,7 +48,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - **多种模型**:LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。 - **集成方法**:(增量)预训练、(多模态)指令监督微调、奖励模型训练、PPO 训练、DPO 训练、KTO 训练、ORPO 训练等等。 -- **多种精度**:32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。 +- **多种精度**:16 比特全参数微调、冻结微调、LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ 的 2/3/4/5/6/8 比特 QLoRA 微调。 - **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ、PiSSA 和 Agent 微调。 - **实用技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。 - **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。 @@ -341,7 +341,7 @@ cd LLaMA-Factory pip install -e ".[torch,metrics]" ``` -可选的额外依赖项:torch、torch_npu、metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality +可选的额外依赖项:torch、torch-npu、metrics、deepspeed、bitsandbytes、hqq、eetq、gptq、awq、aqlm、vllm、galore、badam、qwen、modelscope、quality > [!TIP] > 遇到包冲突时,可使用 `pip install --no-deps -e .` 解决。 diff --git a/setup.py b/setup.py index 8254b6d4..d43c311c 100644 --- a/setup.py +++ b/setup.py @@ -39,12 +39,14 @@ extra_require = { "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], - "vllm": ["vllm>=0.4.3"], - "galore": ["galore-torch"], - "badam": ["badam>=1.2.1"], + "hqq": ["hqq"], + "eetq": ["eetq"], "gptq": ["optimum>=1.17.0", "auto-gptq>=0.5.0"], "awq": ["autoawq"], "aqlm": ["aqlm[gpu]>=1.1.0"], + "vllm": ["vllm>=0.4.3"], + "galore": ["galore-torch"], + "badam": ["badam>=1.2.1"], "qwen": ["transformers_stream_generator"], "modelscope": ["modelscope"], "dev": ["ruff", "pytest"], diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index ab387231..14876048 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -1,4 +1,7 @@ -# Copyright 2024 the LlamaFactory team. +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by the HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/commands/env.py # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
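
The hparams and quantization changes that follow add a quantization_method field and validate the requested bit width per backend. Condensed as a sketch (the mapping is taken from the checks introduced in this patch; the helper itself is illustrative and not part of the codebase):

# Bit widths accepted by each on-the-fly backend, per the checks added below.
SUPPORTED_BITS = {
    "bitsandbytes": [8, 4],
    "hqq": [8, 6, 5, 4, 3, 2, 1],
    "eetq": [8],
}

def validate_quantization(method: str, bits: int) -> None:
    """Illustrative helper: reject unsupported method/bit combinations."""
    if bits not in SUPPORTED_BITS.get(method, []):
        raise ValueError(f"{method} does not support {bits}-bit quantization.")

The same mapping is surfaced in the web UI through the can_quantize_to helper further down in this patch.
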
diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 3f21145d..087c8c38 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -77,6 +77,10 @@ class ModelArguments: default=True, metadata={"help": "Whether or not to use memory-efficient model loading."}, ) + quantization_method: Literal["bitsandbytes", "hqq", "eetq"] = field( + default="bitsandbytes", + metadata={"help": "Quantization method to use for on-the-fly quantization."}, + ) quantization_bit: Optional[int] = field( default=None, metadata={"help": "The number of bits to quantize the model using bitsandbytes."}, @@ -235,9 +239,6 @@ class ModelArguments: if self.new_special_tokens is not None: # support multiple special tokens self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] - assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." - assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization." - if self.export_quantization_bit is not None and self.export_quantization_dataset is None: raise ValueError("Quantization dataset is necessary for exporting.") diff --git a/src/llamafactory/model/__init__.py b/src/llamafactory/model/__init__.py index 4abbaa1b..48cfe76c 100644 --- a/src/llamafactory/model/__init__.py +++ b/src/llamafactory/model/__init__.py @@ -14,10 +14,12 @@ from .loader import load_config, load_model, load_tokenizer from .model_utils.misc import find_all_linear_modules +from .model_utils.quantization import QuantizationMethod from .model_utils.valuehead import load_valuehead_params __all__ = [ + "QuantizationMethod", "load_config", "load_model", "load_tokenizer", diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index e1015821..1261d17a 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -186,11 +186,11 @@ def load_model( trainable_params, all_param = count_parameters(model) if is_trainable: - param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( + param_stats = "trainable params: {:,} || all params: {:,} || trainable%: {:.4f}".format( trainable_params, all_param, 100 * trainable_params / all_param ) else: - param_stats = "all params: {:d}".format(all_param) + param_stats = "all params: {:,}".format(all_param) logger.info(param_stats) diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index fab61cb8..3203b4aa 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -23,7 +23,7 @@ from typing import TYPE_CHECKING, Any, Dict, List import torch from datasets import load_dataset -from transformers import BitsAndBytesConfig, GPTQConfig +from transformers import BitsAndBytesConfig, EetqConfig, GPTQConfig, HqqConfig from transformers.integrations import is_deepspeed_zero3_enabled from transformers.modeling_utils import is_fsdp_enabled from transformers.utils.versions import require_version @@ -59,7 +59,7 @@ class QuantizationMethod(str, Enum): def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[Dict[str, Any]]: r""" - Prepares the dataset to perform AutoGPTQ. + Prepares the tokenized dataset to perform AutoGPTQ. Do not use tensor output for JSON serialization. 
""" if os.path.isfile(model_args.export_quantization_dataset): data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None) @@ -93,7 +93,7 @@ def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "Mod word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen - 1) input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen] attention_mask = sample["attention_mask"][:, word_idx : word_idx + maxlen] - samples.append({"input_ids": input_ids, "attention_mask": attention_mask}) + samples.append({"input_ids": input_ids.tolist(), "attention_mask": attention_mask.tolist()}) return samples @@ -105,7 +105,7 @@ def configure_quantization( init_kwargs: Dict[str, Any], ) -> None: r""" - Priority: PTQ-quantized (training) > AutoGPTQ (export) > Bitsandbytes (training) + Priority: PTQ-quantized (train/infer) > AutoGPTQ (export) > On-the-fly quantization (train/infer) """ if getattr(config, "quantization_config", None): # ptq if is_deepspeed_zero3_enabled(): @@ -131,6 +131,9 @@ def configure_quantization( logger.info("Loading {}-bit {}-quantized model.".format(quant_bits, quant_method.upper())) elif model_args.export_quantization_bit is not None: # auto-gptq + if model_args.export_quantization_bit not in [8, 4, 3, 2]: + raise ValueError("AutoGPTQ only accepts 2/3/4/8-bit quantization.") + require_version("optimum>=1.17.0", "To fix: pip install optimum>=1.17.0") require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") from accelerate.utils import get_max_memory @@ -146,30 +149,48 @@ def configure_quantization( init_kwargs["max_memory"] = get_max_memory() logger.info("Quantizing model to {} bit with AutoGPTQ.".format(model_args.export_quantization_bit)) - elif model_args.quantization_bit is not None: # bnb - if model_args.quantization_bit == 8: - require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0") - init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) + elif model_args.quantization_bit is not None: # on-the-fly + if model_args.quantization_method == QuantizationMethod.BITS_AND_BYTES.value: + if model_args.quantization_bit == 8: + require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0") + init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) + elif model_args.quantization_bit == 4: + require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0") + init_kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=model_args.compute_dtype, + bnb_4bit_use_double_quant=model_args.double_quantization, + bnb_4bit_quant_type=model_args.quantization_type, + bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora + ) + else: + raise ValueError("Bitsandbytes only accepts 4-bit or 8-bit quantization.") - elif model_args.quantization_bit == 4: - require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0") - init_kwargs["quantization_config"] = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=model_args.compute_dtype, - bnb_4bit_use_double_quant=model_args.double_quantization, - bnb_4bit_quant_type=model_args.quantization_type, - bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora - ) + # Do not assign device map if: + # 1. deepspeed zero3 or fsdp (train) + # 2. 
auto quantization device map (inference) + if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": + if model_args.quantization_bit != 4: + raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") - # Do not assign device map if: - # 1. deepspeed zero3 or fsdp (train) - # 2. auto quantization device map (inference) - if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": - if model_args.quantization_bit != 4: - raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") + require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0") + else: + init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference - require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0") - else: - init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference + logger.info("Quantizing model to {} bit with bitsandbytes.".format(model_args.quantization_bit)) + elif model_args.quantization_method == QuantizationMethod.HQQ.value: + if model_args.quantization_bit not in [8, 6, 5, 4, 3, 2, 1]: + raise ValueError("HQQ only accepts 1/2/3/4/5/6/8-bit quantization.") - logger.info("Quantizing model to {} bit with bitsandbytes.".format(model_args.quantization_bit)) + require_version("hqq", "To fix: pip install hqq") + init_kwargs["quantization_config"] = HqqConfig( + nbits=model_args.quantization_bit, quant_zero=False, quant_scale=False, axis=0 + ) # use ATEN kernel (axis=0) for performance + logger.info("Quantizing model to {} bit with HQQ.".format(model_args.quantization_bit)) + elif model_args.quantization_method == QuantizationMethod.EETQ.value: + if model_args.quantization_bit != 8: + raise ValueError("EETQ only accepts 8-bit quantization.") + + require_version("eetq", "To fix: pip install eetq") + init_kwargs["quantization_config"] = EetqConfig() + logger.info("Quantizing model to {} bit with EETQ.".format(model_args.quantization_bit)) diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py index 652c341c..8abef920 100644 --- a/src/llamafactory/webui/chatter.py +++ b/src/llamafactory/webui/chatter.py @@ -23,7 +23,7 @@ from ..data import Role from ..extras.constants import PEFT_METHODS from ..extras.misc import torch_gc from ..extras.packages import is_gradio_available -from .common import get_save_dir +from .common import QUANTIZATION_BITS, get_save_dir from .locales import ALERTS @@ -76,11 +76,17 @@ class WebChatModel(ChatModel): yield error return + if get("top.quantization_bit") in QUANTIZATION_BITS: + quantization_bit = int(get("top.quantization_bit")) + else: + quantization_bit = None + yield ALERTS["info_loading"][lang] args = dict( model_name_or_path=model_path, finetuning_type=finetuning_type, - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + quantization_bit=quantization_bit, + quantization_method=get("top.quantization_method"), template=get("top.template"), flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", use_unsloth=(get("top.booster") == "unsloth"), diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 980428a4..bced18f0 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -47,6 +47,8 @@ DEFAULT_CONFIG_DIR = "config" DEFAULT_DATA_DIR = "data" DEFAULT_SAVE_DIR = "saves" USER_CONFIG = 
"user_config.yaml" +QUANTIZATION_BITS = ["8", "6", "5", "4", "3", "2", "1"] +GPTQ_BITS = ["8", "4", "3", "2"] def get_save_dir(*paths: str) -> os.PathLike: diff --git a/src/llamafactory/webui/components/export.py b/src/llamafactory/webui/components/export.py index 14257949..0a938f02 100644 --- a/src/llamafactory/webui/components/export.py +++ b/src/llamafactory/webui/components/export.py @@ -18,7 +18,7 @@ from ...extras.constants import PEFT_METHODS from ...extras.misc import torch_gc from ...extras.packages import is_gradio_available from ...train.tuner import export_model -from ..common import get_save_dir +from ..common import GPTQ_BITS, get_save_dir from ..locales import ALERTS @@ -32,9 +32,6 @@ if TYPE_CHECKING: from ..engine import Engine -GPTQ_BITS = ["8", "4", "3", "2"] - - def can_quantize(checkpoint_path: Union[str, List[str]]) -> "gr.Dropdown": if isinstance(checkpoint_path, list) and len(checkpoint_path) != 0: return gr.Dropdown(value="none", interactive=False) diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 18b9a7d2..e331d5e4 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -18,7 +18,7 @@ from ...data import TEMPLATES from ...extras.constants import METHODS, SUPPORTED_MODELS from ...extras.packages import is_gradio_available from ..common import get_model_info, list_checkpoints, save_config -from ..utils import can_quantize +from ..utils import can_quantize, can_quantize_to if is_gradio_available(): @@ -43,10 +43,11 @@ def create_top() -> Dict[str, "Component"]: with gr.Accordion(open=False) as advanced_tab: with gr.Row(): - quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=2) - template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default", scale=2) - rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=3) - booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none", scale=3) + quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=1) + quantization_method = gr.Dropdown(choices=["bitsandbytes", "hqq", "eetq"], value="bitsandbytes", scale=1) + template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default", scale=1) + rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=2) + booster = gr.Radio(choices=["auto", "flashattn2", "unsloth"], value="auto", scale=2) visual_inputs = gr.Checkbox(scale=1) model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False).then( @@ -58,6 +59,7 @@ def create_top() -> Dict[str, "Component"]: list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False ) checkpoint_path.focus(list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False) + quantization_method.change(can_quantize_to, [quantization_method], [quantization_bit], queue=False) return dict( lang=lang, @@ -67,6 +69,7 @@ def create_top() -> Dict[str, "Component"]: checkpoint_path=checkpoint_path, advanced_tab=advanced_tab, quantization_bit=quantization_bit, + quantization_method=quantization_method, template=template, rope_scaling=rope_scaling, booster=booster, diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index cd166584..435876e7 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -85,15 +85,29 @@ LOCALES = { "quantization_bit": { "en": { "label": "Quantization bit", - "info": "Enable 4/8-bit 
model quantization (QLoRA).", + "info": "Enable quantization (QLoRA).", }, "ru": { "label": "Уровень квантования", - "info": "Включить 4/8-битное квантование модели (QLoRA).", + "info": "Включить квантование (QLoRA).", }, "zh": { "label": "量化等级", - "info": "启用 4/8 比特模型量化(QLoRA)。", + "info": "启用量化(QLoRA)。", + }, + }, + "quantization_method": { + "en": { + "label": "Quantization method", + "info": "Quantization algorithm to use.", + }, + "ru": { + "label": "Метод квантования", + "info": "Алгоритм квантования, который следует использовать.", + }, + "zh": { + "label": "量化方法", + "info": "使用的量化算法。", }, }, "template": { diff --git a/src/llamafactory/webui/manager.py b/src/llamafactory/webui/manager.py index 7e9b801a..ebe9f1b9 100644 --- a/src/llamafactory/webui/manager.py +++ b/src/llamafactory/webui/manager.py @@ -71,6 +71,7 @@ class Manager: self._id_to_elem["top.finetuning_type"], self._id_to_elem["top.checkpoint_path"], self._id_to_elem["top.quantization_bit"], + self._id_to_elem["top.quantization_method"], self._id_to_elem["top.template"], self._id_to_elem["top.rope_scaling"], self._id_to_elem["top.booster"], diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 549ec765..f7fbac30 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -22,7 +22,7 @@ from transformers.trainer import TRAINING_ARGS_NAME from ..extras.constants import LLAMABOARD_CONFIG, PEFT_METHODS, TRAINING_STAGES from ..extras.misc import is_gpu_or_npu_available, torch_gc from ..extras.packages import is_gradio_available -from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir, load_config +from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, QUANTIZATION_BITS, get_save_dir, load_config from .locales import ALERTS, LOCALES from .utils import abort_process, gen_cmd, get_eval_results, get_trainer_info, load_args, save_args, save_cmd @@ -104,6 +104,11 @@ class Runner: model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type") user_config = load_config() + if get("top.quantization_bit") in QUANTIZATION_BITS: + quantization_bit = int(get("top.quantization_bit")) + else: + quantization_bit = None + args = dict( stage=TRAINING_STAGES[get("train.training_stage")], do_train=True, @@ -111,7 +116,8 @@ class Runner: cache_dir=user_config.get("cache_dir", None), preprocessing_num_workers=16, finetuning_type=finetuning_type, - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + quantization_bit=quantization_bit, + quantization_method=get("top.quantization_method"), template=get("top.template"), rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", @@ -234,13 +240,19 @@ class Runner: model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type") user_config = load_config() + if get("top.quantization_bit") in QUANTIZATION_BITS: + quantization_bit = int(get("top.quantization_bit")) + else: + quantization_bit = None + args = dict( stage="sft", model_name_or_path=get("top.model_path"), cache_dir=user_config.get("cache_dir", None), preprocessing_num_workers=16, finetuning_type=finetuning_type, - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + quantization_bit=quantization_bit, + quantization_method=get("top.quantization_method"), template=get("top.template"), rope_scaling=get("top.rope_scaling") 
if get("top.rope_scaling") in ["linear", "dynamic"] else None, flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index a616bcba..4f313e4e 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -25,6 +25,7 @@ from yaml import safe_dump, safe_load from ..extras.constants import PEFT_METHODS, RUNNING_LOG, TRAINER_LOG, TRAINING_ARGS, TRAINING_STAGES from ..extras.packages import is_gradio_available, is_matplotlib_available from ..extras.ploting import gen_loss_plot +from ..model import QuantizationMethod from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir from .locales import ALERTS @@ -55,6 +56,18 @@ def can_quantize(finetuning_type: str) -> "gr.Dropdown": return gr.Dropdown(interactive=True) +def can_quantize_to(quantization_method: str) -> "gr.Dropdown": + r""" + Returns the available quantization bits. + """ + if quantization_method == QuantizationMethod.BITS_AND_BYTES.value: + return gr.Dropdown(choices=["none", "8", "4"]) + elif quantization_method == QuantizationMethod.HQQ.value: + return gr.Dropdown(choices=["none", "8", "6", "5", "4", "3", "2", "1"]) + elif quantization_method == QuantizationMethod.EETQ.value: + return gr.Dropdown(choices=["none", "8"]) + + def change_stage(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Tuple[List[str], bool]: r""" Modifys states after changing the training stage. From d1cda4ec68026ab91d4be5b5b77f46ad5669bec7 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 00:36:04 +0800 Subject: [PATCH 129/160] tiny fix Former-commit-id: 69dac21ed9f07977b4540eb838a0ef93f3d3abc4 --- src/llamafactory/webui/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index 4f313e4e..069a2a2a 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -61,11 +61,11 @@ def can_quantize_to(quantization_method: str) -> "gr.Dropdown": Returns the available quantization bits. 
""" if quantization_method == QuantizationMethod.BITS_AND_BYTES.value: - return gr.Dropdown(choices=["none", "8", "4"]) + return gr.Dropdown(choices=["none", "8", "4"], value="none") elif quantization_method == QuantizationMethod.HQQ.value: - return gr.Dropdown(choices=["none", "8", "6", "5", "4", "3", "2", "1"]) + return gr.Dropdown(choices=["none", "8", "6", "5", "4", "3", "2", "1"], value="none") elif quantization_method == QuantizationMethod.EETQ.value: - return gr.Dropdown(choices=["none", "8"]) + return gr.Dropdown(choices=["none", "8"], value="none") def change_stage(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Tuple[List[str], bool]: From 5a7cb9af4ec2540adf233d6543a51fbd54d9d6b4 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 00:46:41 +0800 Subject: [PATCH 130/160] tiny fix Former-commit-id: c6747a39dbbdda8decaa104499918bc7ac5f02e4 --- src/llamafactory/webui/components/top.py | 2 +- src/llamafactory/webui/utils.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index e331d5e4..9df3f062 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -43,7 +43,7 @@ def create_top() -> Dict[str, "Component"]: with gr.Accordion(open=False) as advanced_tab: with gr.Row(): - quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=1) + quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", allow_custom_value=True, scale=1) quantization_method = gr.Dropdown(choices=["bitsandbytes", "hqq", "eetq"], value="bitsandbytes", scale=1) template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default", scale=1) rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=2) diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index 069a2a2a..14616ac4 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -61,11 +61,13 @@ def can_quantize_to(quantization_method: str) -> "gr.Dropdown": Returns the available quantization bits. 
""" if quantization_method == QuantizationMethod.BITS_AND_BYTES.value: - return gr.Dropdown(choices=["none", "8", "4"], value="none") + available_bits = ["none", "8", "4"] elif quantization_method == QuantizationMethod.HQQ.value: - return gr.Dropdown(choices=["none", "8", "6", "5", "4", "3", "2", "1"], value="none") + available_bits = ["none", "8", "6", "5", "4", "3", "2", "1"] elif quantization_method == QuantizationMethod.EETQ.value: - return gr.Dropdown(choices=["none", "8"], value="none") + available_bits = ["none", "8"] + + return gr.Dropdown(choices=available_bits) def change_stage(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Tuple[List[str], bool]: From 8e5b4bddf4528e0a795de3663eb4d9696a81b551 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 00:53:33 +0800 Subject: [PATCH 131/160] update examples Former-commit-id: cce238f7d07919b79237bc9ab39265766c20f020 --- examples/README.md | 4 ++-- examples/README_zh.md | 4 ++-- examples/train_lora/llama3_lora_dpo.yaml | 2 +- ...3_lora_sft_bitsandbytes.yaml => llama3_lora_sft_otfq.yaml} | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) rename examples/train_qlora/{llama3_lora_sft_bitsandbytes.yaml => llama3_lora_sft_otfq.yaml} (88%) diff --git a/examples/README.md b/examples/README.md index 007a81ab..d5aca5ad 100644 --- a/examples/README.md +++ b/examples/README.md @@ -94,10 +94,10 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3. ### QLoRA Fine-Tuning -#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended) +#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes/HQQ/EETQ Quantization (Recommended) ```bash -llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml +llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml ``` #### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization diff --git a/examples/README_zh.md b/examples/README_zh.md index b9d90f25..d96bf882 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -94,10 +94,10 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3. 
### QLoRA 微调 -#### 基于 4/8 比特 Bitsandbytes 量化进行指令监督微调(推荐) +#### 基于 4/8 比特 Bitsandbytes/HQQ/EETQ 量化进行指令监督微调(推荐) ```bash -llamafactory-cli train examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml +llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml ``` #### 基于 4/8 比特 GPTQ 量化进行指令监督微调 diff --git a/examples/train_lora/llama3_lora_dpo.yaml b/examples/train_lora/llama3_lora_dpo.yaml index db25fb51..188e5078 100644 --- a/examples/train_lora/llama3_lora_dpo.yaml +++ b/examples/train_lora/llama3_lora_dpo.yaml @@ -7,7 +7,7 @@ do_train: true finetuning_type: lora lora_target: all pref_beta: 0.1 -pref_loss: sigmoid # [sigmoid (dpo), orpo, simpo] +pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo] ### dataset dataset: dpo_en_demo diff --git a/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml b/examples/train_qlora/llama3_lora_sft_otfq.yaml similarity index 88% rename from examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml rename to examples/train_qlora/llama3_lora_sft_otfq.yaml index cc773991..9c73b439 100644 --- a/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml +++ b/examples/train_qlora/llama3_lora_sft_otfq.yaml @@ -1,6 +1,7 @@ ### model model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct quantization_bit: 4 +quantization_method: bitsandbytes # choices: [bitsandbytes (4/8), hqq (2/3/4/5/6/8), eetq (8)] ### method stage: sft From d1aad72826f33333b768e8184a20f38405fb78a6 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 01:12:25 +0800 Subject: [PATCH 132/160] add quant checks Former-commit-id: 15bb053e3549739b1a2134640a659b0f35df7de7 --- src/llamafactory/model/model_utils/quantization.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index 3203b4aa..317646e0 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -108,8 +108,11 @@ def configure_quantization( Priority: PTQ-quantized (train/infer) > AutoGPTQ (export) > On-the-fly quantization (train/infer) """ if getattr(config, "quantization_config", None): # ptq - if is_deepspeed_zero3_enabled(): - raise ValueError("DeepSpeed ZeRO-3 is incompatible with PTQ-quantized models.") + if model_args.quantization_bit is not None: + logger.warning("`quantization_bit` will not affect on the PTQ-quantized models.") + + if is_deepspeed_zero3_enabled() or is_fsdp_enabled(): + raise ValueError("DeepSpeed ZeRO-3 or FSDP is incompatible with PTQ-quantized models.") quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None) quant_method = quantization_config.get("quant_method", "") @@ -182,6 +185,9 @@ def configure_quantization( if model_args.quantization_bit not in [8, 6, 5, 4, 3, 2, 1]: raise ValueError("HQQ only accepts 1/2/3/4/5/6/8-bit quantization.") + if is_deepspeed_zero3_enabled() or is_fsdp_enabled(): + raise ValueError("HQQ quantization is incompatible with DeepSpeed ZeRO-3 or FSDP.") + require_version("hqq", "To fix: pip install hqq") init_kwargs["quantization_config"] = HqqConfig( nbits=model_args.quantization_bit, quant_zero=False, quant_scale=False, axis=0 @@ -191,6 +197,9 @@ def configure_quantization( if model_args.quantization_bit != 8: raise ValueError("EETQ only accepts 8-bit quantization.") + if is_deepspeed_zero3_enabled() or is_fsdp_enabled(): + raise ValueError("EETQ quantization is incompatible with DeepSpeed ZeRO-3 or FSDP.") + 
require_version("eetq", "To fix: pip install eetq") init_kwargs["quantization_config"] = EetqConfig() logger.info("Quantizing model to {} bit with EETQ.".format(model_args.quantization_bit)) From 89d9dd5aa558a78d189462dc65549e6cddb16143 Mon Sep 17 00:00:00 2001 From: hzhaoy Date: Thu, 27 Jun 2024 13:49:57 +0800 Subject: [PATCH 133/160] fix #4579 Former-commit-id: 0fa298ff6a4febea36ea9f11c7594277a77e6e9b --- src/llamafactory/train/sft/trainer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 8f18317f..f0a86dff 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -53,6 +53,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): self.processor = processor if finetuning_args.pissa_convert: + if self.is_deepspeed_enabled: + self.accelerator.deepspeed_config = self.accelerator.state.deepspeed_plugin.deepspeed_config + self.deepspeed = self._wrap_model(self.model_wrapped) self.save_model(os.path.join(self.args.output_dir, "pissa_init")) if finetuning_args.use_badam: From 31fcd03f3ca5186b9ab120d2d692a04de1785640 Mon Sep 17 00:00:00 2001 From: fanjunliang Date: Thu, 27 Jun 2024 15:21:55 +0800 Subject: [PATCH 134/160] support docker-npu-[amd64|arm64] build Former-commit-id: 25f16f5e299c94175e62bac9f0da5b47a2bb31b7 --- README.md | 2 +- README_zh.md | 2 +- docker/docker-npu/Dockerfile | 12 +++++++++--- setup.py | 3 ++- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4b42edd7..9c509ff0 100644 --- a/README.md +++ b/README.md @@ -465,7 +465,7 @@ For Ascend NPU users: ```bash # Choose docker image upon your environment -docker build -f ./docker/docker-npu/Dockerfile \ +docker build --platform linux/arm64 -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . diff --git a/README_zh.md b/README_zh.md index 3926c09d..c3fb6ecf 100644 --- a/README_zh.md +++ b/README_zh.md @@ -465,7 +465,7 @@ docker exec -it llamafactory bash ```bash # 根据您的环境选择镜像 -docker build -f ./docker/docker-npu/Dockerfile \ +docker build --platform linux/arm64 -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 0ec16107..8d80397e 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -1,10 +1,11 @@ # Use the Ubuntu 22.04 image with CANN 8.0.rc1 # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags -FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +FROM --platform=$TARGETPLATFORM cosdt/cann:8.0.rc1-910b-ubuntu22.04 ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments +ARG TARGETPLATFORM ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple ARG EXTRA_INDEX=https://download.pytorch.org/whl/cpu @@ -15,7 +16,6 @@ WORKDIR /app # Install the requirements COPY requirements.txt /app RUN pip config set global.index-url $PIP_INDEX && \ - pip config set global.extra-index-url $EXTRA_INDEX && \ pip install --upgrade pip && \ pip install -r requirements.txt @@ -23,7 +23,13 @@ RUN pip config set global.index-url $PIP_INDEX && \ COPY . 
/app # Install the LLaMA Factory -RUN EXTRA_PACKAGES="torch-npu,metrics"; \ +RUN EXTRA_PACKAGES="metrics"; \ + if [ "$TARGETPLATFORM" == "linux/arm64" ]; then \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},torch-npu-arm64"; \ + else \ + pip config set global.extra-index-url $EXTRA_INDEX; \ + EXTRA_PACKAGES="${EXTRA_PACKAGES},torch-npu-amd64"; \ + fi; \ if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ diff --git a/setup.py b/setup.py index 89301d1b..594070cd 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,8 @@ def get_requires(): extra_require = { "torch": ["torch>=1.13.1"], - "torch-npu": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu-arm64": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu-amd64": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], From 556eca918d025669a45804cad96a542dc1d663e0 Mon Sep 17 00:00:00 2001 From: faddddeout <39449491+injet-zhou@users.noreply.github.com> Date: Thu, 27 Jun 2024 09:58:00 +0000 Subject: [PATCH 135/160] Exit the process with the subprocess's return code when utilizing the CLI Former-commit-id: ab42a4e2501a80fba1704a506bd1209a441570fa --- src/llamafactory/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py index e4846780..c92f556b 100644 --- a/src/llamafactory/cli.py +++ b/src/llamafactory/cli.py @@ -91,7 +91,7 @@ def main(): master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1") master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999))) logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port)) - subprocess.run( + subproc = subprocess.run( ( "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} " "--master_addr {master_addr} --master_port {master_port} {file_name} {args}" @@ -106,6 +106,7 @@ def main(): ), shell=True, ) + sys.exit(subproc.returncode) else: run_exp() elif command == Command.WEBDEMO: From 4b123f49cbd6ac6bf507865374fcff8ed98d14da Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:16:46 +0800 Subject: [PATCH 136/160] Update setup.py Former-commit-id: 42293ab26f7fd7ffb77b308655ccd47b7c2ffa84 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 594070cd..dd333c48 100644 --- a/setup.py +++ b/setup.py @@ -35,8 +35,8 @@ def get_requires(): extra_require = { "torch": ["torch>=1.13.1"], - "torch-npu-arm64": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], - "torch-npu-amd64": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], + "torch-npu-amd": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], From e930a42083d9e47fde89519345458a416ada4c85 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:17:35 +0800 Subject: [PATCH 137/160] Update README.md Former-commit-id: 01869ccbb5af2704c9d5bfdd4f2ff30978fb466d --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9c509ff0..4b42edd7 100644 --- a/README.md +++ b/README.md @@ -465,7 +465,7 @@ For Ascend NPU users: ```bash # Choose docker image upon your 
environment -docker build --platform linux/arm64 -f ./docker/docker-npu/Dockerfile \ +docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . From a2ebdbc112dbdf640be9d7624c068994c57cd377 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:17:52 +0800 Subject: [PATCH 138/160] Update README_zh.md Former-commit-id: 62f2e27f4355aa35c26e1146dbe90fac3b380118 --- README_zh.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_zh.md b/README_zh.md index c3fb6ecf..3926c09d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -465,7 +465,7 @@ docker exec -it llamafactory bash ```bash # 根据您的环境选择镜像 -docker build --platform linux/arm64 -f ./docker/docker-npu/Dockerfile \ +docker build -f ./docker/docker-npu/Dockerfile \ --build-arg INSTALL_DEEPSPEED=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . From f6eda1c35df85b70b1aaa9b877cf4fd0333b0113 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:38:15 +0800 Subject: [PATCH 139/160] Update setup.py Former-commit-id: 544e1844fb237eed3eb621f4e6e355eac2ff7b85 --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index dd333c48..64f50a87 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,6 @@ def get_requires(): extra_require = { "torch": ["torch>=1.13.1"], "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"], - "torch-npu-amd": ["torch==2.1.0+cpu", "torch-npu==2.1.0.post3", "decorator"], "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], From 88018000ac7f2a042d424ad68f13ef9bdedd84ad Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:51:25 +0800 Subject: [PATCH 140/160] Update Dockerfile Former-commit-id: 7dea6840256472f8aa2c642f11d9e30bfa0fb96f --- docker/docker-npu/Dockerfile | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 8d80397e..d1d176e9 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -1,39 +1,38 @@ # Use the Ubuntu 22.04 image with CANN 8.0.rc1 # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags -FROM --platform=$TARGETPLATFORM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +# Set env ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments -ARG TARGETPLATFORM ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple -ARG EXTRA_INDEX=https://download.pytorch.org/whl/cpu +# x86 torch cpu index +ARG TORCH_INDEX=https://download.pytorch.org/whl/cpu # Set the working directory WORKDIR /app # Install the requirements COPY requirements.txt /app -RUN pip config set global.index-url $PIP_INDEX && \ - pip install --upgrade pip && \ - pip install -r requirements.txt +RUN pip config set global.index-url "$PIP_INDEX" && \ + pip config set global.extra-index-url "$PIP_INDEX" && \ + python -m pip install --upgrade pip && \ + python -m pip install -r requirements.txt # Copy the rest of the application into the image COPY . 
/app # Install the LLaMA Factory -RUN EXTRA_PACKAGES="metrics"; \ - if [ "$TARGETPLATFORM" == "linux/arm64" ]; then \ - EXTRA_PACKAGES="${EXTRA_PACKAGES},torch-npu-arm64"; \ - else \ - pip config set global.extra-index-url $EXTRA_INDEX; \ - EXTRA_PACKAGES="${EXTRA_PACKAGES},torch-npu-amd64"; \ - fi; \ - if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ +RUN EXTRA_PACKAGES="torch-npu,metrics"; \ + if [ "$INSTALL_DEEPSPEED" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ - pip install -e .[$EXTRA_PACKAGES] && \ + if [ "$(uname -i)" != "aarch64" ]; then \ + pip config set global.extra-index-url "$TORCH_INDEX" \ + fi; \ + pip install -e ".[$EXTRA_PACKAGES]" && \ pip uninstall -y transformer-engine flash-attn # Set up volumes From 12a91774b00f4bfd8869112ed726ed3d09d1b4fa Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Thu, 27 Jun 2024 19:57:40 +0800 Subject: [PATCH 141/160] Update Dockerfile Former-commit-id: a239f535a64378b74ef34799cd8e2e4a78f00f4c --- docker/docker-npu/Dockerfile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index d1d176e9..71ab3daf 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -2,13 +2,11 @@ # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 -# Set env ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments ARG INSTALL_DEEPSPEED=false ARG PIP_INDEX=https://pypi.org/simple -# x86 torch cpu index ARG TORCH_INDEX=https://download.pytorch.org/whl/cpu # Set the working directory @@ -17,7 +15,7 @@ WORKDIR /app # Install the requirements COPY requirements.txt /app RUN pip config set global.index-url "$PIP_INDEX" && \ - pip config set global.extra-index-url "$PIP_INDEX" && \ + pip config set global.extra-index-url "$TORCH_INDEX" && \ python -m pip install --upgrade pip && \ python -m pip install -r requirements.txt @@ -29,9 +27,6 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \ if [ "$INSTALL_DEEPSPEED" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ - if [ "$(uname -i)" != "aarch64" ]; then \ - pip config set global.extra-index-url "$TORCH_INDEX" \ - fi; \ pip install -e ".[$EXTRA_PACKAGES]" && \ pip uninstall -y transformer-engine flash-attn From bf99223a80e7ba5d3f3513456207f4776e63e00e Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 20:14:48 +0800 Subject: [PATCH 142/160] tiny fix Former-commit-id: c1a78a3a9f8ab9d57577cee37f9c457d60863ba2 --- README.md | 2 +- README_zh.md | 2 +- docker/docker-cuda/Dockerfile | 36 ++++++++++++++------------- docker/docker-cuda/docker-compose.yml | 2 +- docker/docker-npu/Dockerfile | 4 +-- src/llamafactory/cli.py | 4 +-- src/llamafactory/hparams/parser.py | 4 +-- 7 files changed, 28 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 45ac23d8..44aed7e8 100644 --- a/README.md +++ b/README.md @@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ - --build-arg INSTALL_FLASH_ATTN=false \ + --build-arg INSTALL_FLASHATTN=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . 
diff --git a/README_zh.md b/README_zh.md index c5fd4f69..7e3d51ad 100644 --- a/README_zh.md +++ b/README_zh.md @@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \ --build-arg INSTALL_BNB=false \ --build-arg INSTALL_VLLM=false \ --build-arg INSTALL_DEEPSPEED=false \ - --build-arg INSTALL_FLASH_ATTN=false \ + --build-arg INSTALL_FLASHATTN=false \ --build-arg PIP_INDEX=https://pypi.org/simple \ -t llamafactory:latest . diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 44aaf538..718390a8 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -2,11 +2,14 @@ # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html FROM nvcr.io/nvidia/pytorch:24.02-py3 +# Define environments +ENV MAX_JOBS=4 + # Define installation arguments ARG INSTALL_BNB=false ARG INSTALL_VLLM=false ARG INSTALL_DEEPSPEED=false -ARG INSTALL_FLASH_ATTN=false +ARG INSTALL_FLASHATTN=false ARG PIP_INDEX=https://pypi.org/simple # Set the working directory @@ -14,34 +17,33 @@ WORKDIR /app # Install the requirements COPY requirements.txt /app -RUN pip config set global.index-url $PIP_INDEX -RUN pip config set global.extra-index-url $PIP_INDEX -RUN python -m pip install --upgrade pip -RUN python -m pip install -r requirements.txt +RUN pip config set global.index-url "$PIP_INDEX" && \ + pip config set global.extra-index-url "$PIP_INDEX" && \ + python -m pip install --upgrade pip && \ + python -m pip install -r requirements.txt + +# Rebuild flash attention +RUN pip uninstall -y transformer-engine flash-attn && \ + if [ "$INSTALL_FLASHATTN" == "true" ]; then \ + pip uninstall -y ninja && pip install ninja && \ + pip install --no-cache-dir flash-attn --no-build-isolation \ + fi; # Copy the rest of the application into the image COPY . 
/app # Install the LLaMA Factory RUN EXTRA_PACKAGES="metrics"; \ - if [ "$INSTALL_BNB" = "true" ]; then \ + if [ "$INSTALL_BNB" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \ fi; \ - if [ "$INSTALL_VLLM" = "true" ]; then \ + if [ "$INSTALL_VLLM" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \ fi; \ - if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ + if [ "$INSTALL_DEEPSPEED" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ - pip install -e .[$EXTRA_PACKAGES] && \ - pip uninstall -y transformer-engine flash-attn - -# Rebuild flash-attn -RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \ - ninja --version || \ - (pip uninstall -y ninja && pip install ninja) && \ - MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation \ - fi; + pip install -e ".[$EXTRA_PACKAGES]" # Set up volumes VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] diff --git a/docker/docker-cuda/docker-compose.yml b/docker/docker-cuda/docker-compose.yml index 4ccb0c04..16267dc3 100644 --- a/docker/docker-cuda/docker-compose.yml +++ b/docker/docker-cuda/docker-compose.yml @@ -7,7 +7,7 @@ services: INSTALL_BNB: false INSTALL_VLLM: false INSTALL_DEEPSPEED: false - INSTALL_FLASH_ATTN: false + INSTALL_FLASHATTN: false PIP_INDEX: https://pypi.org/simple container_name: llamafactory volumes: diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index 71ab3daf..e413d4e3 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -2,6 +2,7 @@ # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +# Define environments ENV DEBIAN_FRONTEND=noninteractive # Define installation arguments @@ -27,8 +28,7 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \ if [ "$INSTALL_DEEPSPEED" == "true" ]; then \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ fi; \ - pip install -e ".[$EXTRA_PACKAGES]" && \ - pip uninstall -y transformer-engine flash-attn + pip install -e ".[$EXTRA_PACKAGES]" # Set up volumes VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py index c92f556b..48eb2898 100644 --- a/src/llamafactory/cli.py +++ b/src/llamafactory/cli.py @@ -91,7 +91,7 @@ def main(): master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1") master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999))) logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port)) - subproc = subprocess.run( + process = subprocess.run( ( "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} " "--master_addr {master_addr} --master_port {master_port} {file_name} {args}" @@ -106,7 +106,7 @@ def main(): ), shell=True, ) - sys.exit(subproc.returncode) + sys.exit(process.returncode) else: run_exp() elif command == Command.WEBDEMO: diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index d4bcfbc6..6017907c 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -199,8 +199,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if not is_torch_bf16_gpu_available(): raise ValueError("This device does not support `pure_bf16`.") - if training_args.deepspeed: - raise ValueError("`pure_bf16` is incompatible with DeepSpeed.") + if is_deepspeed_zero3_enabled(): + raise ValueError("`pure_bf16` is incompatible with 
DeepSpeed ZeRO-3.") if training_args.fp16 or training_args.bf16: raise ValueError("Turn off mixed precision training when using `pure_bf16`.") From 95bf795de4fb5507fddf3c6807dab3662b034b17 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 20:29:16 +0800 Subject: [PATCH 143/160] fix docker file Former-commit-id: 688f02decb1185deb74b26444f7643cab7d355c1 --- docker/docker-cuda/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index 718390a8..e4fac987 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -26,8 +26,8 @@ RUN pip config set global.index-url "$PIP_INDEX" && \ RUN pip uninstall -y transformer-engine flash-attn && \ if [ "$INSTALL_FLASHATTN" == "true" ]; then \ pip uninstall -y ninja && pip install ninja && \ - pip install --no-cache-dir flash-attn --no-build-isolation \ - fi; + pip install --no-cache-dir flash-attn --no-build-isolation; \ + fi # Copy the rest of the application into the image COPY . /app From 9103fdf8666ffc511af60f577a8d07945cc0d0af Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 00:41:58 +0800 Subject: [PATCH 144/160] fix #4549 Former-commit-id: c9fdef10de737d1f433209812ef73e29cb60490a --- src/llamafactory/hparams/parser.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 6017907c..5c59fbe0 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -202,9 +202,6 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if is_deepspeed_zero3_enabled(): raise ValueError("`pure_bf16` is incompatible with DeepSpeed ZeRO-3.") - if training_args.fp16 or training_args.bf16: - raise ValueError("Turn off mixed precision training when using `pure_bf16`.") - if ( finetuning_args.use_galore and finetuning_args.galore_layerwise From 46f0189e88ec7fdfa13a8bb01446b4ec63c3fab5 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 01:04:24 +0800 Subject: [PATCH 145/160] refactor pissa, improve llamaboard Former-commit-id: 619556e46c19718f702c97df5d570a2a4c5fb13a --- src/llamafactory/extras/misc.py | 72 ++-------- src/llamafactory/hparams/finetuning_args.py | 6 +- src/llamafactory/hparams/parser.py | 12 +- .../{extras => train}/callbacks.py | 135 +++++++++++++++++- src/llamafactory/train/dpo/trainer.py | 21 +-- src/llamafactory/train/kto/trainer.py | 13 +- src/llamafactory/train/ppo/trainer.py | 35 +++-- src/llamafactory/train/ppo/workflow.py | 4 +- src/llamafactory/train/pt/trainer.py | 23 ++- src/llamafactory/train/rm/trainer.py | 19 +-- src/llamafactory/train/rm/workflow.py | 6 +- src/llamafactory/train/sft/trainer.py | 23 +-- src/llamafactory/train/trainer_utils.py | 48 ------- src/llamafactory/train/tuner.py | 4 +- src/llamafactory/webui/runner.py | 1 + src/llamafactory/webui/utils.py | 13 +- 16 files changed, 219 insertions(+), 216 deletions(-) rename src/llamafactory/{extras => train}/callbacks.py (59%) diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 93153b3e..30c287bd 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -1,4 +1,7 @@ -# Copyright 2024 the LlamaFactory team. +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by the HuggingFace's PEFT library. 
+# https://github.com/huggingface/peft/blob/v0.10.0/src/peft/peft_model.py # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,15 +17,11 @@ import gc import os -from typing import TYPE_CHECKING, Dict, Tuple +from typing import TYPE_CHECKING, Tuple import torch -from peft import PeftModel -from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList, PreTrainedModel +from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList from transformers.utils import ( - SAFE_WEIGHTS_NAME, - WEIGHTS_NAME, - is_safetensors_available, is_torch_bf16_gpu_available, is_torch_cuda_available, is_torch_mps_available, @@ -31,15 +30,9 @@ from transformers.utils import ( ) from transformers.utils.versions import require_version -from .constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from .logging import get_logger -if is_safetensors_available(): - from safetensors import safe_open - from safetensors.torch import save_file - - _is_fp16_available = is_torch_npu_available() or is_torch_cuda_available() try: _is_bf16_available = is_torch_bf16_gpu_available() @@ -48,8 +41,6 @@ except Exception: if TYPE_CHECKING: - from trl import AutoModelForCausalLMWithValueHead - from ..hparams import ModelArguments @@ -99,7 +90,7 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: if num_params == 0 and hasattr(param, "ds_numel"): num_params = param.ds_numel - # Due to the design of 4bit linear layers from bitsandbytes, multiply the number of parameters by 2 + # Due to the design of 4bit linear layers from bitsandbytes, multiply the number of parameters by itemsize if param.__class__.__name__ == "Params4bit": if hasattr(param, "quant_storage") and hasattr(param.quant_storage, "itemsize"): num_bytes = param.quant_storage.itemsize @@ -117,51 +108,6 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: return trainable_params, all_param -def fix_valuehead_checkpoint( - model: "AutoModelForCausalLMWithValueHead", output_dir: str, safe_serialization: bool -) -> None: - r""" - The model is already unwrapped. - - There are three cases: - 1. full tuning without ds_zero3: state_dict = {"model.layers.*": ..., "v_head.summary.*": ...} - 2. lora tuning without ds_zero3: state_dict = {"v_head.summary.*": ...} - 3. under deepspeed zero3: state_dict = {"pretrained_model.model.layers.*": ..., "v_head.summary.*": ...} - - We assume `stage3_gather_16bit_weights_on_model_save=true`. 
- """ - if not isinstance(model.pretrained_model, (PreTrainedModel, PeftModel)): - return - - if safe_serialization: - path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME) - with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f: - state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()} - else: - path_to_checkpoint = os.path.join(output_dir, WEIGHTS_NAME) - state_dict: Dict[str, torch.Tensor] = torch.load(path_to_checkpoint, map_location="cpu") - - decoder_state_dict = {} - v_head_state_dict = {} - for name, param in state_dict.items(): - if name.startswith("v_head."): - v_head_state_dict[name] = param - else: - decoder_state_dict[name.replace("pretrained_model.", "")] = param - - os.remove(path_to_checkpoint) - model.pretrained_model.save_pretrained( - output_dir, state_dict=decoder_state_dict or None, safe_serialization=safe_serialization - ) - - if safe_serialization: - save_file(v_head_state_dict, os.path.join(output_dir, V_HEAD_SAFE_WEIGHTS_NAME), metadata={"format": "pt"}) - else: - torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME)) - - logger.info("Value head model saved at: {}".format(output_dir)) - - def get_current_device() -> torch.device: r""" Gets the current available device. @@ -201,7 +147,7 @@ def get_logits_processor() -> "LogitsProcessorList": return logits_processor -def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype: +def infer_optim_dtype(model_dtype: "torch.dtype") -> "torch.dtype": r""" Infers the optimal dtype according to the model_dtype and device compatibility. """ @@ -220,7 +166,7 @@ def is_gpu_or_npu_available() -> bool: return is_torch_npu_available() or is_torch_cuda_available() -def has_tokenized_data(path: os.PathLike) -> bool: +def has_tokenized_data(path: "os.PathLike") -> bool: r""" Checks if the path has a tokenized dataset. 
""" diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index b676891e..28da95ad 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -379,10 +379,10 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora": raise ValueError("`loraplus_lr_ratio` is only valid for LoRA training.") - if self.pissa_convert and self.finetuning_type != "lora": - raise ValueError("`pissa_convert` is only valid for LoRA training.") + if self.pissa_init and self.finetuning_type != "lora": + raise ValueError("`pissa_init` is only valid for LoRA training.") - if self.pissa_convert and (self.stage in ["rm", "ppo", "kto"] or self.use_ref_model): + if self.pissa_init and (self.stage in ["ppo", "kto"] or self.use_ref_model): raise ValueError("Cannot use PiSSA for current training stage.") if self.train_mm_proj_only and self.finetuning_type != "full": diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index 5c59fbe0..8b2ea4c1 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -83,9 +83,6 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin if model_args.adapter_name_or_path is not None and finetuning_args.finetuning_type != "lora": raise ValueError("Adapter is only valid for the LoRA method.") - if model_args.use_unsloth and is_deepspeed_zero3_enabled(): - raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.") - if model_args.quantization_bit is not None: if finetuning_args.finetuning_type != "lora": raise ValueError("Quantization is only compatible with the LoRA method.") @@ -186,6 +183,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED: raise ValueError("Please launch distributed training with `llamafactory-cli` or `torchrun`.") + if training_args.deepspeed and training_args.parallel_mode != ParallelMode.DISTRIBUTED: + raise ValueError("Please use `FORCE_TORCHRUN=1` to launch DeepSpeed training.") + if training_args.max_steps == -1 and data_args.streaming: raise ValueError("Please specify `max_steps` in streaming mode.") @@ -195,6 +195,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if training_args.do_train and model_args.quantization_device_map == "auto": raise ValueError("Cannot use device map for quantized models in training.") + if finetuning_args.pissa_init and is_deepspeed_zero3_enabled(): + raise ValueError("PiSSA is incompatible with DeepSpeed ZeRO-3.") + if finetuning_args.pure_bf16: if not is_torch_bf16_gpu_available(): raise ValueError("This device does not support `pure_bf16`.") @@ -224,6 +227,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if model_args.visual_inputs and data_args.packing: raise ValueError("Cannot use packing in MLLM fine-tuning.") + if model_args.use_unsloth and is_deepspeed_zero3_enabled(): + raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.") + _verify_model_args(model_args, finetuning_args) _check_extra_dependencies(model_args, finetuning_args, training_args) diff --git a/src/llamafactory/extras/callbacks.py b/src/llamafactory/train/callbacks.py similarity index 59% rename from src/llamafactory/extras/callbacks.py rename to src/llamafactory/train/callbacks.py index 0dff6a69..dc9c981e 100644 --- 
a/src/llamafactory/extras/callbacks.py +++ b/src/llamafactory/train/callbacks.py @@ -1,4 +1,7 @@ -# Copyright 2024 the LlamaFactory team. +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by the HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer.py # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,22 +25,78 @@ from concurrent.futures import ThreadPoolExecutor from datetime import timedelta from typing import TYPE_CHECKING, Any, Dict, Optional +import torch import transformers -from transformers import TrainerCallback +from peft import PeftModel +from transformers import PreTrainedModel, ProcessorMixin, TrainerCallback from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length +from transformers.utils import ( + SAFE_WEIGHTS_NAME, + WEIGHTS_NAME, + is_safetensors_available, +) -from .constants import TRAINER_LOG -from .logging import LoggerHandler, get_logger -from .misc import fix_valuehead_checkpoint +from ..extras.constants import TRAINER_LOG, V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME +from ..extras.logging import LoggerHandler, get_logger +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.torch import save_file + if TYPE_CHECKING: from transformers import TrainerControl, TrainerState, TrainingArguments + from trl import AutoModelForCausalLMWithValueHead logger = get_logger(__name__) +def fix_valuehead_checkpoint( + model: "AutoModelForCausalLMWithValueHead", output_dir: str, safe_serialization: bool +) -> None: + r""" + The model is already unwrapped. + + There are three cases: + 1. full tuning without ds_zero3: state_dict = {"model.layers.*": ..., "v_head.summary.*": ...} + 2. lora tuning without ds_zero3: state_dict = {"v_head.summary.*": ...} + 3. under deepspeed zero3: state_dict = {"pretrained_model.model.layers.*": ..., "v_head.summary.*": ...} + + We assume `stage3_gather_16bit_weights_on_model_save=true`. 
+ """ + if not isinstance(model.pretrained_model, (PreTrainedModel, PeftModel)): + return + + if safe_serialization: + path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME) + with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f: + state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()} + else: + path_to_checkpoint = os.path.join(output_dir, WEIGHTS_NAME) + state_dict: Dict[str, torch.Tensor] = torch.load(path_to_checkpoint, map_location="cpu") + + decoder_state_dict = {} + v_head_state_dict = {} + for name, param in state_dict.items(): + if name.startswith("v_head."): + v_head_state_dict[name] = param + else: + decoder_state_dict[name.replace("pretrained_model.", "")] = param + + os.remove(path_to_checkpoint) + model.pretrained_model.save_pretrained( + output_dir, state_dict=decoder_state_dict or None, safe_serialization=safe_serialization + ) + + if safe_serialization: + save_file(v_head_state_dict, os.path.join(output_dir, V_HEAD_SAFE_WEIGHTS_NAME), metadata={"format": "pt"}) + else: + torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME)) + + logger.info("Value head model saved at: {}".format(output_dir)) + + class FixValueHeadModelCallback(TrainerCallback): def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): r""" @@ -51,8 +110,70 @@ class FixValueHeadModelCallback(TrainerCallback): ) +class SaveProcessorCallback(TrainerCallback): + def __init__(self, processor: "ProcessorMixin") -> None: + r""" + Initializes a callback for saving the processor. + """ + self.processor = processor + + def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of training. + """ + if args.should_save: + getattr(self.processor, "image_processor").save_pretrained(args.output_dir) + + +class PissaConvertCallback(TrainerCallback): + r""" + Initializes a callback for converting the PiSSA adapter to a normal one. + """ + + def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the beginning of training. + """ + if args.should_save: + model = kwargs.pop("model") + pissa_init_dir = os.path.join(args.output_dir, "pissa_init") + logger.info("Initial PiSSA adatper will be saved at: {}.".format(pissa_init_dir)) + if isinstance(model, PeftModel): + init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights") + setattr(model.peft_config["default"], "init_lora_weights", True) + model.save_pretrained(pissa_init_dir, safe_serialization=args.save_safetensors) + setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights) + + def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of training. + """ + if args.should_save: + model = kwargs.pop("model") + pissa_init_dir = os.path.join(args.output_dir, "pissa_init") + pissa_backup_dir = os.path.join(args.output_dir, "pissa_backup") + pissa_convert_dir = os.path.join(args.output_dir, "pissa_converted") + logger.info("Converted PiSSA adapter will be saved at: {}.".format(pissa_convert_dir)) + # 1. save a pissa backup with init_lora_weights: True + # 2. save a converted lora with init_lora_weights: pissa + # 3. load the pissa backup with init_lora_weights: True + # 4. 
delete the initial adapter and change init_lora_weights to pissa + if isinstance(model, PeftModel): + init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights") + setattr(model.peft_config["default"], "init_lora_weights", True) + model.save_pretrained(pissa_backup_dir, safe_serialization=args.save_safetensors) + setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights) + model.save_pretrained( + pissa_convert_dir, safe_serialization=args.save_safetensors, convert_pissa_to_lora=pissa_init_dir + ) + model.load_adapter(pissa_backup_dir, "default", is_trainable=True) + model.set_adapter("default") + model.delete_adapter("pissa_init") + setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights) + + class LogCallback(TrainerCallback): - def __init__(self, output_dir: str) -> None: + def __init__(self) -> None: r""" Initializes a callback for logging training and evaluation status. """ @@ -70,7 +191,7 @@ class LogCallback(TrainerCallback): self.webui_mode = os.environ.get("LLAMABOARD_ENABLED", "0").lower() in ["true", "1"] if self.webui_mode: signal.signal(signal.SIGABRT, self._set_abort) - self.logger_handler = LoggerHandler(output_dir) + self.logger_handler = LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR")) logging.root.addHandler(self.logger_handler) transformers.logging.add_handler(self.logger_handler) diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index ed4fd5d9..e45467d6 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import warnings from collections import defaultdict from contextlib import nullcontext @@ -29,7 +28,8 @@ from trl import DPOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX -from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler, get_batch_logps +from ..callbacks import PissaConvertCallback, SaveProcessorCallback +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps if TYPE_CHECKING: @@ -54,7 +54,6 @@ class CustomDPOTrainer(DPOTrainer): disable_dropout_in_model(ref_model) self.finetuning_args = finetuning_args - self.processor = processor self.reference_free = False self.use_dpo_data_collator = True # hack to avoid warning self.generate_during_eval = False # disable at evaluation @@ -92,14 +91,17 @@ class CustomDPOTrainer(DPOTrainer): self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) self.ref_model.eval() + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) + if finetuning_args.pissa_convert: - self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + self.callback_handler.add_callback(PissaConvertCallback) if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -112,15 +114,6 @@ class CustomDPOTrainer(DPOTrainer): create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) - def _save(self, output_dir: Optional[str] = None, 
state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if output_dir is not None else self.args.output_dir - if self.finetuning_args.pissa_convert: - convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) - - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) - def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor": r""" Computes ORPO's odds ratio (OR) loss for batched log probabilities of the policy model. diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index c2edf95a..460311e4 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -27,6 +27,7 @@ from trl import KTOTrainer from trl.trainer import disable_dropout_in_model from ...extras.constants import IGNORE_INDEX +from ..callbacks import SaveProcessorCallback from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps @@ -53,7 +54,6 @@ class CustomKTOTrainer(KTOTrainer): disable_dropout_in_model(ref_model) self.finetuning_args = finetuning_args - self.processor = processor self.reference_free = False self.use_dpo_data_collator = True # hack to avoid warning self.generate_during_eval = False # disable at evaluation @@ -90,11 +90,14 @@ class CustomKTOTrainer(KTOTrainer): self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) self.ref_model.eval() + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) + if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -113,12 +116,6 @@ class CustomKTOTrainer(KTOTrainer): """ return Trainer._get_train_sampler(self) - def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if output_dir is not None else self.args.output_dir - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) - def forward( self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"], prefix: Literal["", "kl_"] = "" ) -> Tuple["torch.Tensor", "torch.Tensor"]: diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py index c5f6e175..57f0b848 100644 --- a/src/llamafactory/train/ppo/trainer.py +++ b/src/llamafactory/train/ppo/trainer.py @@ -27,6 +27,7 @@ from accelerate.utils import DistributedDataParallelKwargs from tqdm import tqdm from transformers import GenerationConfig, Trainer, TrainerControl, TrainerState from transformers.optimization import get_scheduler +from transformers.trainer_callback import CallbackHandler from transformers.trainer_pt_utils import remove_dummy_checkpoint from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME @@ -34,9 +35,9 @@ from trl import PPOConfig, PPOTrainer from trl.core import PPODecorators, logprobs_from_logits from trl.models.utils import unwrap_model_for_generation -from ...extras.callbacks import FixValueHeadModelCallback, LogCallback from 
...extras.logging import get_logger from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor +from ..callbacks import FixValueHeadModelCallback, SaveProcessorCallback from ..trainer_utils import create_custom_optimzer, create_custom_scheduler from .ppo_utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm @@ -131,7 +132,6 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.finetuning_args = finetuning_args self.reward_model = reward_model self.current_device = get_current_device() # patch for deepspeed training - self.processor = processor self.generation_config = GenerationConfig( pad_token_id=self.tokenizer.pad_token_id, @@ -143,8 +143,9 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.control = TrainerControl() self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None - self.log_callback, self.save_callback = callbacks[0], callbacks[1] - assert isinstance(self.log_callback, LogCallback) and isinstance(self.save_callback, FixValueHeadModelCallback) + self.callback_handler = CallbackHandler( + [callbacks], self.accelerator.unwrap_model(self.model), self.tokenizer, self.optimizer, self.lr_scheduler + ) if self.args.max_steps > 0: logger.info("max_steps is given, it will override any value given in num_train_epochs") @@ -165,11 +166,16 @@ class CustomPPOTrainer(PPOTrainer, Trainer): else: self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True) + self.add_callback(FixValueHeadModelCallback) + + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) + if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None: r""" @@ -219,7 +225,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): dataiter = iter(self.dataloader) loss_meter = AverageMeter() reward_meter = AverageMeter() - self.log_callback.on_train_begin(self.args, self.state, self.control) + self.callback_handler.on_train_begin(self.args, self.state, self.control) for step in tqdm(range(max_steps), disable=not self.is_local_process_zero()): try: @@ -257,7 +263,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): logger.warning("Failed to save stats due to unknown errors.") self.state.global_step += 1 - self.log_callback.on_step_end(self.args, self.state, self.control) + self.callback_handler.on_step_end(self.args, self.state, self.control) if self.is_local_process_zero() and (step + 1) % self.args.logging_steps == 0: logs = dict( @@ -269,7 +275,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer): tqdm.write(str(logs)) logs["step"] = step self.state.log_history.append(logs) - self.log_callback.on_log(self.args, self.state, self.control) + self.callback_handler.on_log(self.args, self.state, self.control, logs) loss_meter.reset() reward_meter.reset() @@ -277,17 +283,12 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.save_model( os.path.join(self.args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, self.state.global_step)) ) - self.save_callback.on_save( - self.args, self.state, self.control, model=self.accelerator.unwrap_model(self.model) - ) + self.callback_handler.on_save(self.args, self.state, 
self.control) if self.control.should_epoch_stop or self.control.should_training_stop: break - self.log_callback.on_train_end(self.args, self.state, self.control) - self.save_callback.on_train_end( - self.args, self.state, self.control, model=self.accelerator.unwrap_model(self.model) - ) + self.callback_handler.on_train_end(self.args, self.state, self.control) def create_optimizer( self, @@ -505,7 +506,3 @@ class CustomPPOTrainer(PPOTrainer, Trainer): elif self.args.should_save: self._save(output_dir) - - if self.processor is not None and self.args.should_save: - output_dir = output_dir if output_dir is not None else self.args.output_dir - getattr(self.processor, "image_processor").save_pretrained(output_dir) diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 4f4d2820..651296f3 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -20,10 +20,9 @@ from typing import TYPE_CHECKING, List, Optional from transformers import DataCollatorWithPadding from ...data import get_dataset -from ...extras.callbacks import FixValueHeadModelCallback -from ...extras.misc import fix_valuehead_checkpoint from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer +from ..callbacks import FixValueHeadModelCallback, fix_valuehead_checkpoint from ..trainer_utils import create_ref_model, create_reward_model from .trainer import CustomPPOTrainer @@ -75,6 +74,7 @@ def run_ppo( ppo_trainer.save_model() if training_args.should_save: fix_valuehead_checkpoint(model, training_args.output_dir, training_args.save_safetensors) + ppo_trainer.save_state() # must be called after save_model to have a folder if ppo_trainer.is_world_process_zero() and finetuning_args.plot_loss: plot_loss(training_args.output_dir, keys=["loss", "reward"]) diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py index b6fb161d..e8f180a6 100644 --- a/src/llamafactory/train/pt/trainer.py +++ b/src/llamafactory/train/pt/trainer.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from types import MethodType -from typing import TYPE_CHECKING, Dict, Optional +from typing import TYPE_CHECKING, Optional from transformers import Trainer from ...extras.logging import get_logger -from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler +from ..callbacks import PissaConvertCallback, SaveProcessorCallback +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: @@ -42,16 +42,18 @@ class CustomTrainer(Trainer): ) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args - self.processor = processor + + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) if finetuning_args.pissa_convert: - self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + self.add_callback(PissaConvertCallback) if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -63,12 +65,3 @@ class CustomTrainer(Trainer): ) -> "torch.optim.lr_scheduler.LRScheduler": create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) - - def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if output_dir is not None else self.args.output_dir - if self.finetuning_args.pissa_convert: - convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) - - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 70c2e9a0..5eceead8 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -46,6 +46,7 @@ import torch from transformers import Trainer from ...extras.logging import get_logger +from ..callbacks import FixValueHeadModelCallback, PissaConvertCallback, SaveProcessorCallback from ..trainer_utils import create_custom_optimzer, create_custom_scheduler @@ -69,13 +70,20 @@ class PairwiseTrainer(Trainer): ) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args - self.processor = processor self.can_return_loss = True # override property to return eval_loss + self.add_callback(FixValueHeadModelCallback) + + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) + + if finetuning_args.pissa_convert: + self.add_callback(PissaConvertCallback) + if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -88,12 +96,6 @@ class PairwiseTrainer(Trainer): create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) - def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if 
output_dir is not None else self.args.output_dir - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) - def compute_loss( self, model: "PreTrainedModel", inputs: Dict[str, torch.Tensor], return_outputs: bool = False ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]: @@ -164,4 +166,5 @@ class PairwiseTrainer(Trainer): res: List[str] = [] for c_score, r_score in zip(chosen_scores, rejected_scores): res.append(json.dumps({"chosen": round(float(c_score), 2), "rejected": round(float(r_score), 2)})) + writer.write("\n".join(res)) diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 6f24e964..e0b32b77 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -40,10 +40,9 @@ from typing import TYPE_CHECKING, List, Optional from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset -from ...extras.callbacks import FixValueHeadModelCallback -from ...extras.misc import fix_valuehead_checkpoint from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer +from ..callbacks import fix_valuehead_checkpoint from ..trainer_utils import create_modelcard_and_push from .metric import compute_accuracy from .trainer import PairwiseTrainer @@ -77,7 +76,7 @@ def run_rm( args=training_args, finetuning_args=finetuning_args, data_collator=data_collator, - callbacks=callbacks + [FixValueHeadModelCallback()], + callbacks=callbacks, compute_metrics=compute_accuracy, **tokenizer_module, **split_dataset(dataset, data_args, training_args), @@ -89,6 +88,7 @@ def run_rm( trainer.save_model() if training_args.should_save: fix_valuehead_checkpoint(model, training_args.output_dir, training_args.save_safetensors) + trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) trainer.save_state() diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index f0a86dff..06bd2b6b 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -26,7 +26,8 @@ from transformers import Seq2SeqTrainer from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger -from ..trainer_utils import convert_pissa_adapter, create_custom_optimzer, create_custom_scheduler +from ..callbacks import PissaConvertCallback, SaveProcessorCallback +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: @@ -50,19 +51,18 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): ) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args - self.processor = processor + + if processor is not None: + self.add_callback(SaveProcessorCallback(processor)) if finetuning_args.pissa_convert: - if self.is_deepspeed_enabled: - self.accelerator.deepspeed_config = self.accelerator.state.deepspeed_plugin.deepspeed_config - self.deepspeed = self._wrap_model(self.model_wrapped) - self.save_model(os.path.join(self.args.output_dir, "pissa_init")) + self.add_callback(PissaConvertCallback) if finetuning_args.use_badam: from badam import BAdamCallback, clip_grad_norm_old_version self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator) - self.callback_handler.add_callback(BAdamCallback) + self.add_callback(BAdamCallback) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: @@ -75,15 +75,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): 
create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) - def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - output_dir = output_dir if output_dir is not None else self.args.output_dir - if self.finetuning_args.pissa_convert: - convert_pissa_adapter(output_dir, state_dict, self.accelerator, self.model, self.args) - - if self.processor is not None: - getattr(self.processor, "image_processor").save_pretrained(output_dir) - def prediction_step( self, model: "torch.nn.Module", diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 21d41c36..4b581691 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -17,11 +17,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch -from peft import PeftModel from transformers import Trainer from transformers.integrations import is_deepspeed_zero3_enabled from transformers.optimization import get_scheduler @@ -40,7 +38,6 @@ if is_galore_available(): if TYPE_CHECKING: - from accelerate import Accelerator from transformers import PreTrainedModel, Seq2SeqTrainingArguments from trl import AutoModelForCausalLMWithValueHead @@ -175,51 +172,6 @@ def create_reward_model( return reward_model -def convert_pissa_adapter( - output_dir: str, - state_dict: Dict[str, "torch.Tensor"], - accelerator: "Accelerator", - model: "PreTrainedModel", - training_args: "Seq2SeqTrainingArguments", -) -> None: - r""" - Converts the PiSSA adapter to a LoRA adapter. - """ - pissa_init_dir = os.path.join(training_args.output_dir, "pissa_init") - pissa_backup_dir = os.path.join(output_dir, "pissa_backup") - if output_dir == pissa_init_dir: - logger.info("Initial PiSSA adatper will be saved at: {}.".format(pissa_init_dir)) - unwrapped_model = accelerator.unwrap_model(model) - if isinstance(unwrapped_model, PeftModel): - init_lora_weights = getattr(unwrapped_model.peft_config["default"], "init_lora_weights") - setattr(unwrapped_model.peft_config["default"], "init_lora_weights", True) - unwrapped_model.save_pretrained( - output_dir, - state_dict=state_dict, - safe_serialization=training_args.save_safetensors, - ) - setattr(unwrapped_model.peft_config["default"], "init_lora_weights", init_lora_weights) - - elif output_dir == training_args.output_dir: # at the end of training - logger.info("Converted PiSSA adapter will be saved at: {}.".format(output_dir)) - unwrapped_model = accelerator.unwrap_model(model) - if isinstance(unwrapped_model, PeftModel): # backup the pissa adapter for further use - unwrapped_model.save_pretrained( - pissa_backup_dir, - state_dict=state_dict, - safe_serialization=training_args.save_safetensors, - ) - unwrapped_model.save_pretrained( - output_dir, - state_dict=state_dict, - safe_serialization=training_args.save_safetensors, - convert_pissa_to_lora=pissa_init_dir, - ) - # TODO: the model is applied pissa again unexpectedly - unwrapped_model.load_adapter(pissa_backup_dir, "default", is_trainable=True) - unwrapped_model.set_adapter("default") - - def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]: r""" Returns a list of names of parameters with weight decay. 
(weights in non-layernorm layers) diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py index a02fff22..dc982e07 100644 --- a/src/llamafactory/train/tuner.py +++ b/src/llamafactory/train/tuner.py @@ -20,11 +20,11 @@ import torch from transformers import PreTrainedModel from ..data import get_template_and_fix_tokenizer -from ..extras.callbacks import LogCallback from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from ..extras.logging import get_logger from ..hparams import get_infer_args, get_train_args from ..model import load_model, load_tokenizer +from .callbacks import LogCallback from .dpo import run_dpo from .kto import run_kto from .ppo import run_ppo @@ -41,8 +41,8 @@ logger = get_logger(__name__) def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallback"] = []) -> None: + callbacks.append(LogCallback()) model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args) - callbacks.append(LogCallback(training_args.output_dir)) if finetuning_args.stage == "pt": run_pt(model_args, data_args, training_args, finetuning_args, callbacks) diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index f7fbac30..131d180d 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -310,6 +310,7 @@ class Runner: env = deepcopy(os.environ) env["LLAMABOARD_ENABLED"] = "1" + env["LLAMABOARD_WORKDIR"] = args["output_dir"] if args.get("deepspeed", None) is not None: env["FORCE_TORCHRUN"] = "1" diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index 14616ac4..6e5fdbe4 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -38,12 +38,15 @@ def abort_process(pid: int) -> None: r""" Aborts the processes recursively in a bottom-up way. 
""" - children = psutil.Process(pid).children() - if children: - for child in children: - abort_process(child.pid) + try: + children = psutil.Process(pid).children() + if children: + for child in children: + abort_process(child.pid) - os.kill(pid, signal.SIGABRT) + os.kill(pid, signal.SIGABRT) + except Exception: + pass def can_quantize(finetuning_type: str) -> "gr.Dropdown": From 024760f8664f6ff307937bc96a23d51f5b36ae10 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 01:17:07 +0800 Subject: [PATCH 146/160] update examples Former-commit-id: 66f248b90cfa2b29c73060459b2337b78154c47b --- examples/extras/fsdp_qlora/llama3_lora_sft.yaml | 2 +- examples/extras/llama_pro/llama3_freeze_sft.yaml | 2 +- examples/extras/loraplus/llama3_lora_sft.yaml | 2 +- examples/extras/pissa/llama3_lora_sft.yaml | 2 +- examples/train_full/llama3_full_sft_ds3.yaml | 2 +- examples/train_lora/llama3_lora_dpo.yaml | 2 +- examples/train_lora/llama3_lora_kto.yaml | 2 +- examples/train_lora/llama3_lora_ppo.yaml | 2 +- examples/train_lora/llama3_lora_pretrain.yaml | 2 +- examples/train_lora/llama3_lora_reward.yaml | 4 ++-- examples/train_lora/llama3_lora_sft.yaml | 2 +- examples/train_lora/llama3_lora_sft_ds0.yaml | 2 +- examples/train_lora/llama3_lora_sft_ds3.yaml | 2 +- examples/train_lora/llava1_5_lora_sft.yaml | 2 +- examples/train_qlora/llama3_lora_sft_aqlm.yaml | 2 +- examples/train_qlora/llama3_lora_sft_awq.yaml | 2 +- examples/train_qlora/llama3_lora_sft_gptq.yaml | 2 +- examples/train_qlora/llama3_lora_sft_otfq.yaml | 2 +- 18 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml index cc773991..6c80ef58 100644 --- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml +++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml index f92d6945..5e7e90bb 100644 --- a/examples/extras/llama_pro/llama3_freeze_sft.yaml +++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml @@ -31,7 +31,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml index 57383ae0..062a312b 100644 --- a/examples/extras/loraplus/llama3_lora_sft.yaml +++ b/examples/extras/loraplus/llama3_lora_sft.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/extras/pissa/llama3_lora_sft.yaml b/examples/extras/pissa/llama3_lora_sft.yaml index fd4b9f1d..05077b6c 100644 --- a/examples/extras/pissa/llama3_lora_sft.yaml +++ b/examples/extras/pissa/llama3_lora_sft.yaml @@ -32,7 +32,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_full/llama3_full_sft_ds3.yaml b/examples/train_full/llama3_full_sft_ds3.yaml index 40afd2ee..c983ad5c 100644 --- a/examples/train_full/llama3_full_sft_ds3.yaml +++ b/examples/train_full/llama3_full_sft_ds3.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 
lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_dpo.yaml b/examples/train_lora/llama3_lora_dpo.yaml index 188e5078..d87c0669 100644 --- a/examples/train_lora/llama3_lora_dpo.yaml +++ b/examples/train_lora/llama3_lora_dpo.yaml @@ -31,7 +31,7 @@ learning_rate: 5.0e-6 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_kto.yaml b/examples/train_lora/llama3_lora_kto.yaml index f730c82e..08208c25 100644 --- a/examples/train_lora/llama3_lora_kto.yaml +++ b/examples/train_lora/llama3_lora_kto.yaml @@ -30,7 +30,7 @@ learning_rate: 5.0e-6 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_ppo.yaml b/examples/train_lora/llama3_lora_ppo.yaml index e574014e..512e90ea 100644 --- a/examples/train_lora/llama3_lora_ppo.yaml +++ b/examples/train_lora/llama3_lora_ppo.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-5 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### generate diff --git a/examples/train_lora/llama3_lora_pretrain.yaml b/examples/train_lora/llama3_lora_pretrain.yaml index 839b3e51..5e8aaaef 100644 --- a/examples/train_lora/llama3_lora_pretrain.yaml +++ b/examples/train_lora/llama3_lora_pretrain.yaml @@ -28,7 +28,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_reward.yaml b/examples/train_lora/llama3_lora_reward.yaml index 79559d19..96c32238 100644 --- a/examples/train_lora/llama3_lora_reward.yaml +++ b/examples/train_lora/llama3_lora_reward.yaml @@ -25,11 +25,11 @@ overwrite_output_dir: true ### train per_device_train_batch_size: 1 gradient_accumulation_steps: 8 -learning_rate: 1.0e-5 +learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_sft.yaml b/examples/train_lora/llama3_lora_sft.yaml index fe30c575..55a8077e 100644 --- a/examples/train_lora/llama3_lora_sft.yaml +++ b/examples/train_lora/llama3_lora_sft.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_sft_ds0.yaml b/examples/train_lora/llama3_lora_sft_ds0.yaml index 08b638e6..f1442faa 100644 --- a/examples/train_lora/llama3_lora_sft_ds0.yaml +++ b/examples/train_lora/llama3_lora_sft_ds0.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llama3_lora_sft_ds3.yaml b/examples/train_lora/llama3_lora_sft_ds3.yaml index b7266d61..66e7007e 100644 --- a/examples/train_lora/llama3_lora_sft_ds3.yaml +++ b/examples/train_lora/llama3_lora_sft_ds3.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_lora/llava1_5_lora_sft.yaml b/examples/train_lora/llava1_5_lora_sft.yaml index 55ac31fa..ec03f82c 100644 --- a/examples/train_lora/llava1_5_lora_sft.yaml 
+++ b/examples/train_lora/llava1_5_lora_sft.yaml @@ -30,7 +30,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_qlora/llama3_lora_sft_aqlm.yaml b/examples/train_qlora/llama3_lora_sft_aqlm.yaml index 7b6767d5..3519d46b 100644 --- a/examples/train_qlora/llama3_lora_sft_aqlm.yaml +++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_qlora/llama3_lora_sft_awq.yaml b/examples/train_qlora/llama3_lora_sft_awq.yaml index a2a26e4b..df48669b 100644 --- a/examples/train_qlora/llama3_lora_sft_awq.yaml +++ b/examples/train_qlora/llama3_lora_sft_awq.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_qlora/llama3_lora_sft_gptq.yaml b/examples/train_qlora/llama3_lora_sft_gptq.yaml index ad3d854c..61fa9bb4 100644 --- a/examples/train_qlora/llama3_lora_sft_gptq.yaml +++ b/examples/train_qlora/llama3_lora_sft_gptq.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval diff --git a/examples/train_qlora/llama3_lora_sft_otfq.yaml b/examples/train_qlora/llama3_lora_sft_otfq.yaml index 9c73b439..80a05768 100644 --- a/examples/train_qlora/llama3_lora_sft_otfq.yaml +++ b/examples/train_qlora/llama3_lora_sft_otfq.yaml @@ -31,7 +31,7 @@ learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -fp16: true +bf16: true ddp_timeout: 180000000 ### eval From 42e7489713ac0f464695283e05d0b86a450d8a37 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 01:26:50 +0800 Subject: [PATCH 147/160] add Gemma2 models Former-commit-id: 8fc5a248ecfd6861cb90dac6c14fe89cdeaf8921 --- README.md | 2 +- README_zh.md | 2 +- src/llamafactory/extras/constants.py | 12 ++++++++++++ src/llamafactory/train/rm/trainer.py | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 44aed7e8..27d1e98b 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | +| [Gemma/Gemma2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | | [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | diff --git a/README_zh.md b/README_zh.md index 7e3d51ad..9fb56a4e 100644 --- a/README_zh.md +++ b/README_zh.md @@ -160,7 +160,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| 
[Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | gemma | +| [Gemma/Gemma2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | | [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 866f39d4..3ea21f76 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -507,6 +507,18 @@ register_model_group( "Gemma-1.1-7B-Chat": { DownloadSource.DEFAULT: "google/gemma-1.1-7b-it", }, + "Gemma-2-9B": { + DownloadSource.DEFAULT: "google/gemma-2-9b", + }, + "Gemma-2-27B": { + DownloadSource.DEFAULT: "google/gemma-2-27b", + }, + "Gemma-2-9B-Chat": { + DownloadSource.DEFAULT: "google/gemma-2-9b-it", + }, + "Gemma-2-27B-Chat": { + DownloadSource.DEFAULT: "google/gemma-2-27b-it", + }, }, template="gemma", ) diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py index 5eceead8..accc877d 100644 --- a/src/llamafactory/train/rm/trainer.py +++ b/src/llamafactory/train/rm/trainer.py @@ -105,7 +105,7 @@ class PairwiseTrainer(Trainer): Subclass and override to inject custom behavior. Note that the first element will be removed from the output tuple. - See: https://github.com/huggingface/transformers/blob/v4.39.1/src/transformers/trainer.py#L3777 + See: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer.py#L3842 """ # Compute rewards _, _, values = model(**inputs, output_hidden_states=True, return_dict=True) From a1437c15f76aa8b14a8fd287e22e5aa163775541 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 01:28:59 +0800 Subject: [PATCH 148/160] fix docker flashattn Former-commit-id: 0966f5d4616a3877a6b921976dc39e8799831d36 --- docker/docker-cuda/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/docker-cuda/Dockerfile b/docker/docker-cuda/Dockerfile index e4fac987..d94aa970 100644 --- a/docker/docker-cuda/Dockerfile +++ b/docker/docker-cuda/Dockerfile @@ -4,6 +4,7 @@ FROM nvcr.io/nvidia/pytorch:24.02-py3 # Define environments ENV MAX_JOBS=4 +ENV FLASH_ATTENTION_FORCE_BUILD=TRUE # Define installation arguments ARG INSTALL_BNB=false From cfdf5a5a787191624c711b2a8adddb4aba0f6edc Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 03:18:54 +0800 Subject: [PATCH 149/160] increase pissa_iter for stability Former-commit-id: 03f8d9b0fb10ae58e7f68508197330d616957899 --- src/llamafactory/hparams/finetuning_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index 28da95ad..3867c0ec 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -113,7 +113,7 @@ class LoraArguments: metadata={"help": "Whether or not to initialize a PiSSA adapter."}, ) pissa_iter: int = field( - default=4, + default=16, metadata={"help": "The number of iteration steps performed by FSVD in PiSSA. 
Use -1 to disable it."}, ) pissa_convert: bool = field( From fda2cf677b366816fb2088b969834dd09d71b57a Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 06:00:26 +0800 Subject: [PATCH 150/160] bf16 by default, gemma2 attns Gemma2 finetuning cannot work until merging https://github.com/huggingface/transformers/pull/31674 Former-commit-id: da66c32c7be0adc28d2185b23e9f62d56acb961c --- src/llamafactory/model/model_utils/attention.py | 8 +++++++- src/llamafactory/model/patcher.py | 2 +- src/llamafactory/webui/components/train.py | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index dfd90936..9021d277 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -28,7 +28,13 @@ if TYPE_CHECKING: logger = get_logger(__name__) -def configure_attn_implementation(config: "PretrainedConfig", model_args: "ModelArguments") -> None: +def configure_attn_implementation( + config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool +) -> None: + if getattr(config, "model_type", None) == "gemma2" and is_trainable: # gemma2 adopts soft-cap attention + logger.warning("Gemma-2 models should use eager attention in training, change `flash_attn` to disabled.") + model_args.flash_attn = "disabled" + if model_args.flash_attn == "auto": return diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 24cd2601..4eae0bb4 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -67,7 +67,7 @@ def patch_config( use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"] torch.npu.set_compile_mode(jit_compile=use_jit_compile) - configure_attn_implementation(config, model_args) + configure_attn_implementation(config, model_args, is_trainable) configure_rope(config, model_args, is_trainable) configure_longlora(config, model_args, is_trainable) configure_quantization(config, tokenizer, model_args, init_kwargs) diff --git a/src/llamafactory/webui/components/train.py b/src/llamafactory/webui/components/train.py index f33c37ee..d4832dd3 100644 --- a/src/llamafactory/webui/components/train.py +++ b/src/llamafactory/webui/components/train.py @@ -54,7 +54,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: num_train_epochs = gr.Textbox(value="3.0") max_grad_norm = gr.Textbox(value="1.0") max_samples = gr.Textbox(value="100000") - compute_type = gr.Dropdown(choices=["fp16", "bf16", "fp32", "pure_bf16"], value="fp16") + compute_type = gr.Dropdown(choices=["bf16", "fp16", "fp32", "pure_bf16"], value="bf16") input_elems.update({learning_rate, num_train_epochs, max_grad_norm, max_samples, compute_type}) elem_dict.update( From 6a75d5706030f7eadf5756e81eb1f1a702d06b79 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Fri, 28 Jun 2024 06:55:19 +0800 Subject: [PATCH 151/160] update readme Former-commit-id: 9f809c311af373508cb51b204ae54b047729a9dc --- README.md | 57 +++++++++++++++++++++++++--------------------------- README_zh.md | 57 +++++++++++++++++++++++++--------------------------- 2 files changed, 54 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 27d1e98b..6c6a48d9 100644 --- a/README.md +++ b/README.md @@ -151,35 +151,32 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models -| Model | Model size | Template | -| 
--------------------------------------------------------- | -------------------------------- | --------- | -| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | gemma | -| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | -| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | -| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| Model | Model size | Template | +| ------------------------------------------------------------ | -------------------------------- | --------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | gemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Qwen/Qwen1.5/Qwen2 (Code/MoE)](https://huggingface.co/Qwen) | 
0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5](https://huggingface.co/01-ai) | 6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. @@ -610,7 +607,7 @@ If you have a project that should be incorporated, please contact via email or c This repository is licensed under the [Apache-2.0 License](LICENSE). -Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +Please follow the model licenses to use the corresponding model weights: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 
2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## Citation diff --git a/README_zh.md b/README_zh.md index 9fb56a4e..f695646a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -151,35 +151,32 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 模型 -| 模型名 | 模型大小 | Template | -| --------------------------------------------------------- | -------------------------------- | --------- | -| [Baichuan2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | -| [BLOOM](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | -| [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Gemma/Gemma2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | -| [GLM4](https://huggingface.co/THUDM) | 9B | glm4 | -| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | -| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | -| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | -| [LLaMA-3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | -| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | -| [PaliGemma](https://huggingface.co/google) | 3B | gemma | -| [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | -| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | -| [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | qwen | -| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen2 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/7B/57B/72B | qwen | -| [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | -| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | -| [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | +| 模型名 | 模型大小 | Template | +| ------------------------------------------------------------ | -------------------------------- | --------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 
7B/13B/70B | llama2 | +| [Llama 3](https://huggingface.co/meta-llama) | 8B/70B | llama3 | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma](https://huggingface.co/google) | 3B | gemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Qwen/Qwen1.5/Qwen2 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5](https://huggingface.co/01-ai) | 6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] > 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 @@ -610,7 +607,7 @@ run_name: test_run # 可选 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。 -使用模型权重时,请遵循对应的模型协议:[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +使用模型权重时,请遵循对应的模型协议:[Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [InternLM2](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / 
[Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## 引用 From 3c4f8eaa55be246546e7c4ed6aedfe9dd08f6785 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sat, 29 Jun 2024 01:42:14 +0800 Subject: [PATCH 152/160] loose gemma2 attention Former-commit-id: a0b645017a2de3d58b6cbc71bd91ec96fc7a818b --- src/llamafactory/model/model_utils/attention.py | 10 ++++++++-- src/llamafactory/train/callbacks.py | 5 +---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index 9021d277..80d9d4b8 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -32,8 +32,14 @@ def configure_attn_implementation( config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool ) -> None: if getattr(config, "model_type", None) == "gemma2" and is_trainable: # gemma2 adopts soft-cap attention - logger.warning("Gemma-2 models should use eager attention in training, change `flash_attn` to disabled.") - model_args.flash_attn = "disabled" + if model_args.flash_attn == "auto": + logger.warning("Gemma-2 models should use eager attention in training, change `flash_attn` to disabled.") + model_args.flash_attn = "disabled" + else: + logger.warning( + "Gemma-2 models should use eager attention in training, but you set `flash_attn: {}`. " + "Will proceed at your own risk.".format(model_args.flash_attn) + ) if model_args.flash_attn == "auto": return diff --git a/src/llamafactory/train/callbacks.py b/src/llamafactory/train/callbacks.py index dc9c981e..4d024278 100644 --- a/src/llamafactory/train/callbacks.py +++ b/src/llamafactory/train/callbacks.py @@ -1,7 +1,4 @@ -# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. -# -# This code is inspired by the HuggingFace's transformers library. -# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/trainer.py +# Copyright 2024 the LlamaFactory team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
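The Gemma-2 handling added in the two patches above boils down to a small decision rule: during training, `flash_attn: auto` is downgraded to eager attention because Gemma-2's attention logit soft-capping was not yet supported by FlashAttention, while an explicit user choice is kept and only warned about. A minimal standalone sketch of that rule follows; it is an illustration, not code from the repository, and the `Args` dataclass plus the final value mapping are simplified stand-ins for the project's `ModelArguments` and its `flash_attn` → `attn_implementation` translation.

```python
# Sketch of the Gemma-2 attention fallback described in PATCH 150/152 (illustrative only).
from dataclasses import dataclass


@dataclass
class Args:
    flash_attn: str = "auto"  # assumed choices: auto, disabled, sdpa, fa2


def configure_attn(model_type: str, args: Args, is_trainable: bool) -> str:
    """Return the attention implementation to request from Transformers."""
    if model_type == "gemma2" and is_trainable:
        # Gemma-2 uses logit soft-capping, which FlashAttention did not support
        # at the time of this patch, so training falls back to eager attention.
        if args.flash_attn == "auto":
            args.flash_attn = "disabled"
        else:
            print(f"Warning: keeping flash_attn={args.flash_attn} for Gemma-2 training at your own risk.")

    # Assumed mapping from the user-facing flag to the Transformers argument.
    return {"disabled": "eager", "sdpa": "sdpa", "fa2": "flash_attention_2"}.get(args.flash_attn, "auto")


# With the default "auto" setting, a Gemma-2 training run ends up requesting eager attention.
assert configure_attn("gemma2", Args(), is_trainable=True) == "eager"
```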
From 889c042ecd3a8d2c05dbba80056ef807fcf89619 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 30 Jun 2024 21:05:31 +0800 Subject: [PATCH 153/160] update npu docker Former-commit-id: 2f4d5174205605b8821d4fb626283e07694ecf80 --- docker/docker-npu/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/docker-npu/Dockerfile b/docker/docker-npu/Dockerfile index e413d4e3..34cf9616 100644 --- a/docker/docker-npu/Dockerfile +++ b/docker/docker-npu/Dockerfile @@ -1,6 +1,9 @@ # Use the Ubuntu 22.04 image with CANN 8.0.rc1 # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags +# FROM cosdt/cann:8.0.rc1-910-ubuntu22.04 FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 +# FROM cosdt/cann:8.0.rc1-910-openeuler22.03 +# FROM cosdt/cann:8.0.rc1-910b-openeuler22.03 # Define environments ENV DEBIAN_FRONTEND=noninteractive From 188b4be64d6a57479657d47adb0039ce8b6cc7c5 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 30 Jun 2024 21:28:51 +0800 Subject: [PATCH 154/160] fix #4398 #4592 Former-commit-id: 8c92d268903c00392c8bd75a731daa1f107d6202 --- README.md | 2 ++ README_zh.md | 4 +++- src/llamafactory/extras/misc.py | 24 +++++++++++++++++------- src/llamafactory/model/loader.py | 3 ++- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 6c6a48d9..6299650b 100644 --- a/README.md +++ b/README.md @@ -384,6 +384,8 @@ Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations. +Download the pre-built Docker images: [32GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) | [64GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) +
### Data Preparation diff --git a/README_zh.md b/README_zh.md index f695646a..efc647a7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -357,7 +357,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
昇腾 NPU 用户指南 -在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e ".[torch-npu,metrics]"` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: +在昇腾 NPU 设备上安装 LLaMA Factory 时,需要指定额外依赖项,使用 `pip install -e ".[torch-npu,metrics]"` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit 与 Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: ```bash # 请替换 URL 为 CANN 版本和设备型号对应的 URL @@ -384,6 +384,8 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh 如果遇到无法正常推理的情况,请尝试设置 `do_sample: false`。 +下载预构建 Docker 镜像:[32GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) | [64GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) +
### 数据准备 diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 30c287bd..20c752c5 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -20,7 +20,9 @@ import os from typing import TYPE_CHECKING, Tuple import torch +import transformers.dynamic_module_utils from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList +from transformers.dynamic_module_utils import get_relative_imports from transformers.utils import ( is_torch_bf16_gpu_available, is_torch_cuda_available, @@ -69,6 +71,9 @@ class AverageMeter: def check_dependencies() -> None: + r""" + Checks the version of the required packages. + """ if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]: logger.warning("Version checking has been disabled, may lead to unexpected behaviors.") else: @@ -79,7 +84,7 @@ def check_dependencies() -> None: require_version("trl>=0.8.6", "To fix: pip install trl>=0.8.6") -def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: +def count_parameters(model: "torch.nn.Module") -> Tuple[int, int]: r""" Returns the number of trainable parameters and number of all parameters in the model. """ @@ -108,7 +113,7 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: return trainable_params, all_param -def get_current_device() -> torch.device: +def get_current_device() -> "torch.device": r""" Gets the current available device. """ @@ -147,6 +152,13 @@ def get_logits_processor() -> "LogitsProcessorList": return logits_processor +def has_tokenized_data(path: "os.PathLike") -> bool: + r""" + Checks if the path has a tokenized dataset. + """ + return os.path.isdir(path) and len(os.listdir(path)) > 0 + + def infer_optim_dtype(model_dtype: "torch.dtype") -> "torch.dtype": r""" Infers the optimal dtype according to the model_dtype and device compatibility. @@ -166,11 +178,9 @@ def is_gpu_or_npu_available() -> bool: return is_torch_npu_available() or is_torch_cuda_available() -def has_tokenized_data(path: "os.PathLike") -> bool: - r""" - Checks if the path has a tokenized dataset. - """ - return os.path.isdir(path) and len(os.listdir(path)) > 0 +def skip_check_imports() -> None: + if os.environ.get("FORCE_CHECK_IMPORTS", "0").lower() not in ["true", "1"]: + transformers.dynamic_module_utils.check_imports = get_relative_imports def torch_gc() -> None: diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 1261d17a..fe700d53 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -19,7 +19,7 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Se from trl import AutoModelForCausalLMWithValueHead from ..extras.logging import get_logger -from ..extras.misc import count_parameters, try_download_model_from_ms +from ..extras.misc import count_parameters, skip_check_imports, try_download_model_from_ms from .adapter import init_adapter from .model_utils.misc import register_autoclass from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model @@ -48,6 +48,7 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]: Note: including inplace operation of model_args. 
""" + skip_check_imports() model_args.model_name_or_path = try_download_model_from_ms(model_args) return { "trust_remote_code": True, From b670fb57db89aa5ac66da1e6288c69a9c213dac2 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 1 Jul 2024 00:22:52 +0800 Subject: [PATCH 155/160] update readme Former-commit-id: 7998d969bf942c91cf41a189e3941f6e04c81c6f --- README.md | 10 +++++++++- README_zh.md | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6299650b..3d3feae5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE) [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-63-green)](#projects-using-llama-factory) +[![Citation](https://img.shields.io/badge/citation-71-green)](#projects-using-llama-factory) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -593,6 +593,14 @@ If you have a project that should be incorporated, please contact via email or c 1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. [[arxiv]](https://arxiv.org/abs/2406.07115) 1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) 1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) +1. Ding et al. IntentionQA: A Benchmark for Evaluating Purchase Intention Comprehension Abilities of Language Models in E-commerce. 2024. [[arxiv]](https://arxiv.org/abs/2406.10173) +1. He et al. COMMUNITY-CROSS-INSTRUCT: Unsupervised Instruction Generation for Aligning Large Language Models to Online Communities. 2024. [[arxiv]](https://arxiv.org/abs/2406.12074) +1. Lin et al. FVEL: Interactive Formal Verification Environment with Large Language Models via Theorem Proving. 2024. [[arxiv]](https://arxiv.org/abs/2406.14408) +1. Treutlein et al. Connecting the Dots: LLMs can Infer and Verbalize Latent Structure from Disparate Training Data. 2024. [[arxiv]](https://arxiv.org/abs/2406.14546) +1. Feng et al. SS-Bench: A Benchmark for Social Story Generation and Evaluation. 2024. [[arxiv]](https://arxiv.org/abs/2406.15695) +1. Feng et al. Self-Constructed Context Decompilation with Fined-grained Alignment Enhancement. 2024. [[arxiv]](https://arxiv.org/abs/2406.17233) +1. Liu et al. Large Language Models for Cuffless Blood Pressure Measurement From Wearable Biosignals. 2024. [[arxiv]](https://arxiv.org/abs/2406.18069) +1. Iyer et al. Exploring Very Low-Resource Translation with LLMs: The University of Edinburgh’s Submission to AmericasNLP 2024 Translation Task. AmericasNLP 2024. [[paper]](https://aclanthology.org/2024.americasnlp-1.25) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B. 1. 
**[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge. 1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B. diff --git a/README_zh.md b/README_zh.md index efc647a7..cb5a42e4 100644 --- a/README_zh.md +++ b/README_zh.md @@ -4,7 +4,7 @@ [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE) [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-63-green)](#使用了-llama-factory-的项目) +[![Citation](https://img.shields.io/badge/citation-71-green)](#使用了-llama-factory-的项目) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -593,6 +593,14 @@ run_name: test_run # 可选 1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. [[arxiv]](https://arxiv.org/abs/2406.07115) 1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) 1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) +1. Ding et al. IntentionQA: A Benchmark for Evaluating Purchase Intention Comprehension Abilities of Language Models in E-commerce. 2024. [[arxiv]](https://arxiv.org/abs/2406.10173) +1. He et al. COMMUNITY-CROSS-INSTRUCT: Unsupervised Instruction Generation for Aligning Large Language Models to Online Communities. 2024. [[arxiv]](https://arxiv.org/abs/2406.12074) +1. Lin et al. FVEL: Interactive Formal Verification Environment with Large Language Models via Theorem Proving. 2024. [[arxiv]](https://arxiv.org/abs/2406.14408) +1. Treutlein et al. Connecting the Dots: LLMs can Infer and Verbalize Latent Structure from Disparate Training Data. 2024. [[arxiv]](https://arxiv.org/abs/2406.14546) +1. Feng et al. SS-Bench: A Benchmark for Social Story Generation and Evaluation. 2024. [[arxiv]](https://arxiv.org/abs/2406.15695) +1. Feng et al. Self-Constructed Context Decompilation with Fined-grained Alignment Enhancement. 2024. [[arxiv]](https://arxiv.org/abs/2406.17233) +1. Liu et al. Large Language Models for Cuffless Blood Pressure Measurement From Wearable Biosignals. 2024. [[arxiv]](https://arxiv.org/abs/2406.18069) +1. Iyer et al. Exploring Very Low-Resource Translation with LLMs: The University of Edinburgh’s Submission to AmericasNLP 2024 Translation Task. AmericasNLP 2024. [[paper]](https://aclanthology.org/2024.americasnlp-1.25) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper,基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM,基于 Baichuan-13B 微调而得,具有法律推理和知识检索能力。 1. 
**[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao,基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。 From 67d2eb6b2a1169c69a49e6511e4a6ac7d0b0566b Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 1 Jul 2024 01:19:27 +0800 Subject: [PATCH 156/160] fix #4402 #4617 Deprecate reserved_label_len arg Former-commit-id: 4b6568984c0be4b31e7aa91b7c0d52b7f7b12b0b --- src/llamafactory/data/data_utils.py | 12 +- src/llamafactory/data/formatter.py | 116 ++------------- src/llamafactory/data/processors/feedback.py | 16 +- src/llamafactory/data/processors/pairwise.py | 17 ++- .../data/processors/processor_utils.py | 15 +- .../data/processors/supervised.py | 15 +- .../data/processors/unsupervised.py | 9 +- src/llamafactory/data/template.py | 114 +++++--------- src/llamafactory/data/tool_utils.py | 140 ++++++++++++++++++ src/llamafactory/hparams/data_args.py | 7 - tests/data/test_formatter.py | 4 +- tests/data/test_processor.py | 32 ++++ tests/data/test_template.py | 55 ++++++- 13 files changed, 329 insertions(+), 223 deletions(-) create mode 100644 src/llamafactory/data/tool_utils.py create mode 100644 tests/data/test_processor.py diff --git a/src/llamafactory/data/data_utils.py b/src/llamafactory/data/data_utils.py index cc9761b1..76ded47e 100644 --- a/src/llamafactory/data/data_utils.py +++ b/src/llamafactory/data/data_utils.py @@ -13,7 +13,7 @@ # limitations under the License. from enum import Enum, unique -from typing import TYPE_CHECKING, Dict, List, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Sequence, Set, Union from datasets import concatenate_datasets, interleave_datasets @@ -30,6 +30,9 @@ if TYPE_CHECKING: logger = get_logger(__name__) +SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]] + + @unique class Role(str, Enum): USER = "user" @@ -39,13 +42,6 @@ class Role(str, Enum): OBSERVATION = "observation" -def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]: - max_target_len = int(max_len * (target_len / (source_len + target_len))) - max_target_len = max(max_target_len, reserved_label_len) - max_source_len = max_len - min(max_target_len, target_len) - return max_source_len, max_target_len - - def merge_dataset( all_datasets: List[Union["Dataset", "IterableDataset"]], data_args: "DataArguments", diff --git a/src/llamafactory/data/formatter.py b/src/llamafactory/data/formatter.py index 88ebf682..c1653a76 100644 --- a/src/llamafactory/data/formatter.py +++ b/src/llamafactory/data/formatter.py @@ -16,97 +16,10 @@ import json import re from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Tuple, Union +from typing import List, Literal, Optional, Tuple, Union - -SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]] - - -DEFAULT_TOOL_PROMPT = ( - "You have access to the following tools:\n{tool_text}" - "Use the following format if using a tool:\n" - "```\n" - "Action: tool name (one of [{tool_names}]).\n" - "Action Input: the input to the tool, in a JSON format representing the kwargs " - """(e.g. 
```{{"input": "hello world", "num_beams": 5}}```).\n""" - "```\n" -) - - -GLM4_TOOL_PROMPT = ( - "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," - "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具{tool_text}" -) - - -def default_tool_formatter(tools: List[Dict[str, Any]]) -> str: - tool_text = "" - tool_names = [] - for tool in tools: - param_text = "" - for name, param in tool["parameters"]["properties"].items(): - required = ", required" if name in tool["parameters"].get("required", []) else "" - enum = ", should be one of [{}]".format(", ".join(param["enum"])) if param.get("enum", None) else "" - items = ( - ", where each item should be {}".format(param["items"].get("type", "")) if param.get("items") else "" - ) - param_text += " - {name} ({type}{required}): {desc}{enum}{items}\n".format( - name=name, - type=param.get("type", ""), - required=required, - desc=param.get("description", ""), - enum=enum, - items=items, - ) - - tool_text += "> Tool Name: {name}\nTool Description: {desc}\nTool Args:\n{args}\n".format( - name=tool["name"], desc=tool.get("description", ""), args=param_text - ) - tool_names.append(tool["name"]) - - return DEFAULT_TOOL_PROMPT.format(tool_text=tool_text, tool_names=", ".join(tool_names)) - - -def default_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|\s*$)", re.DOTALL) - action_match: List[Tuple[str, str]] = re.findall(regex, content) - if not action_match: - return content - - results = [] - for match in action_match: - tool_name = match[0].strip() - tool_input = match[1].strip().strip('"').strip("```") - try: - arguments = json.loads(tool_input) - results.append((tool_name, json.dumps(arguments, ensure_ascii=False))) - except json.JSONDecodeError: - return content - - return results - - -def glm4_tool_formatter(tools: List[Dict[str, Any]]) -> str: - tool_text = "" - for tool in tools: - tool_text += "\n\n## {name}\n\n{body}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( - name=tool["name"], body=json.dumps(tool, indent=4, ensure_ascii=False) - ) - - return GLM4_TOOL_PROMPT.format(tool_text=tool_text) - - -def glm4_tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: - if "\n" not in content: - return content - - tool_name, tool_input = content.split("\n", maxsplit=1) - try: - arguments = json.loads(tool_input) - except json.JSONDecodeError: - return content - - return [(tool_name, json.dumps(arguments, ensure_ascii=False))] +from .data_utils import SLOTS +from .tool_utils import DefaultToolUtils, GLM4ToolUtils @dataclass @@ -168,15 +81,12 @@ class StringFormatter(Formatter): @dataclass class FunctionFormatter(Formatter): def __post_init__(self): - has_name, has_args = False, False - for slot in filter(lambda s: isinstance(s, str), self.slots): - if "{{name}}" in slot: - has_name = True - if "{{arguments}}" in slot: - has_args = True - - if not has_name or not has_args: - raise ValueError("Name and arguments placeholders are required in the function formatter.") + if self.tool_format == "default": + self.slots = DefaultToolUtils.get_function_slots() + self.slots + elif self.tool_format == "glm4": + self.slots = GLM4ToolUtils.get_function_slots() + self.slots + else: + raise NotImplementedError("Tool format {} was not found.".format(self.tool_format)) def apply(self, **kwargs) -> SLOTS: content = kwargs.pop("content") @@ -210,11 +120,11 @@ class FunctionFormatter(Formatter): class ToolFormatter(Formatter): def __post_init__(self): if self.tool_format == "default": 
- self._tool_formatter = default_tool_formatter - self._tool_extractor = default_tool_extractor + self._tool_formatter = DefaultToolUtils.tool_formatter + self._tool_extractor = DefaultToolUtils.tool_extractor elif self.tool_format == "glm4": - self._tool_formatter = glm4_tool_formatter - self._tool_extractor = glm4_tool_extractor + self._tool_formatter = GLM4ToolUtils.tool_formatter + self._tool_extractor = GLM4ToolUtils.tool_extractor else: raise NotImplementedError("Tool format {} was not found.".format(self.tool_format)) diff --git a/src/llamafactory/data/processors/feedback.py b/src/llamafactory/data/processors/feedback.py index 219ab353..7ba05e23 100644 --- a/src/llamafactory/data/processors/feedback.py +++ b/src/llamafactory/data/processors/feedback.py @@ -16,7 +16,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger -from .processor_utils import get_paligemma_token_type_ids, get_pixel_values +from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, infer_seqlen if TYPE_CHECKING: @@ -55,12 +55,8 @@ def _encode_feedback_example( else: kl_messages = prompt + [kl_response[1]] - prompt_ids, response_ids = template.encode_oneturn( - tokenizer, messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len - ) - _, kl_response_ids = template.encode_oneturn( - tokenizer, kl_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len - ) + prompt_ids, response_ids = template.encode_oneturn(tokenizer, messages, system, tools) + _, kl_response_ids = template.encode_oneturn(tokenizer, kl_messages, system, tools) if template.efficient_eos: response_ids += [tokenizer.eos_token_id] @@ -70,6 +66,12 @@ def _encode_feedback_example( image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids + # do not consider the kl_response + source_len, target_len = infer_seqlen(len(prompt_ids), len(response_ids), data_args.cutoff_len) + prompt_ids = prompt_ids[:source_len] + response_ids = response_ids[:target_len] + kl_response_ids = kl_response_ids[:target_len] + input_ids = prompt_ids + response_ids labels = [IGNORE_INDEX] * len(prompt_ids) + response_ids kl_input_ids = prompt_ids + kl_response_ids diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py index b2939348..c6001e6e 100644 --- a/src/llamafactory/data/processors/pairwise.py +++ b/src/llamafactory/data/processors/pairwise.py @@ -16,7 +16,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger -from .processor_utils import get_paligemma_token_type_ids, get_pixel_values +from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, infer_seqlen if TYPE_CHECKING: @@ -44,12 +44,8 @@ def _encode_pairwise_example( chosen_messages = prompt + [response[0]] rejected_messages = prompt + [response[1]] - prompt_ids, chosen_ids = template.encode_oneturn( - tokenizer, chosen_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len - ) - _, rejected_ids = template.encode_oneturn( - tokenizer, rejected_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len - ) + prompt_ids, chosen_ids = template.encode_oneturn(tokenizer, chosen_messages, system, tools) + _, rejected_ids = 
template.encode_oneturn(tokenizer, rejected_messages, system, tools) if template.efficient_eos: chosen_ids += [tokenizer.eos_token_id] @@ -59,6 +55,13 @@ def _encode_pairwise_example( image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids + source_len, target_len = infer_seqlen( + len(prompt_ids), max(len(chosen_ids), len(rejected_ids)), data_args.cutoff_len + ) # consider the response is more important + prompt_ids = prompt_ids[:source_len] + chosen_ids = chosen_ids[:target_len] + rejected_ids = rejected_ids[:target_len] + chosen_input_ids = prompt_ids + chosen_ids chosen_labels = [IGNORE_INDEX] * len(prompt_ids) + chosen_ids rejected_input_ids = prompt_ids + rejected_ids diff --git a/src/llamafactory/data/processors/processor_utils.py b/src/llamafactory/data/processors/processor_utils.py index 93df0cd5..455908ae 100644 --- a/src/llamafactory/data/processors/processor_utils.py +++ b/src/llamafactory/data/processors/processor_utils.py @@ -13,7 +13,7 @@ # limitations under the License. import bisect -from typing import TYPE_CHECKING, List, Sequence +from typing import TYPE_CHECKING, List, Sequence, Tuple from ...extras.packages import is_pillow_available @@ -76,3 +76,16 @@ def get_paligemma_token_type_ids(input_len: int, processor: "ProcessorMixin") -> """ image_seq_length = getattr(processor, "image_seq_length") return [0] * image_seq_length + [1] * (input_len - image_seq_length) + + +def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> Tuple[int, int]: + if target_len * 2 < cutoff_len: # truncate source + max_target_len = cutoff_len + elif source_len * 2 < cutoff_len: # truncate target + max_target_len = cutoff_len - source_len + else: # truncate both + max_target_len = int(cutoff_len * (target_len / (source_len + target_len))) + + new_target_len = min(max_target_len, target_len) + new_source_len = max(cutoff_len - new_target_len, 0) + return new_source_len, new_target_len diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index eb5ffb1a..b283542d 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger -from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, greedy_knapsack +from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, greedy_knapsack, infer_seqlen if TYPE_CHECKING: @@ -51,10 +51,17 @@ def _encode_supervised_example( input_ids += [image_token_id] * getattr(processor, "image_seq_length") labels += [IGNORE_INDEX] * getattr(processor, "image_seq_length") - encoded_pairs = template.encode_multiturn( - tokenizer, messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len - ) + encoded_pairs = template.encode_multiturn(tokenizer, messages, system, tools) + total_length = 1 if template.efficient_eos else 0 for turn_idx, (source_ids, target_ids) in enumerate(encoded_pairs): + if total_length >= data_args.cutoff_len: + break + + source_len, target_len = infer_seqlen(len(source_ids), len(target_ids), data_args.cutoff_len - total_length) + source_ids = source_ids[:source_len] + target_ids = target_ids[:target_len] + total_length += source_len + target_len + if data_args.train_on_prompt: source_mask = source_ids elif turn_idx 
!= 0 and template.efficient_eos: diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py index 75ad4d51..b3fc85c9 100644 --- a/src/llamafactory/data/processors/unsupervised.py +++ b/src/llamafactory/data/processors/unsupervised.py @@ -16,7 +16,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.logging import get_logger from ..data_utils import Role -from .processor_utils import get_paligemma_token_type_ids, get_pixel_values +from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, infer_seqlen if TYPE_CHECKING: @@ -47,9 +47,7 @@ def _encode_unsupervised_example( else: messages = prompt + [{"role": Role.ASSISTANT.value, "content": ""}] - input_ids, labels = template.encode_oneturn( - tokenizer, messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len - ) + input_ids, labels = template.encode_oneturn(tokenizer, messages, system, tools) if template.efficient_eos: labels += [tokenizer.eos_token_id] @@ -57,6 +55,9 @@ def _encode_unsupervised_example( image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) input_ids = [image_token_id] * getattr(processor, "image_seq_length") + input_ids + source_len, target_len = infer_seqlen(len(input_ids), len(labels), data_args.cutoff_len) + input_ids = input_ids[:source_len] + labels = labels[:target_len] return input_ids, labels diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 53f16df4..aefd5195 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -16,7 +16,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union from ..extras.logging import get_logger -from .data_utils import Role, infer_max_len +from .data_utils import Role from .formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter @@ -48,36 +48,33 @@ class Template: def encode_oneturn( self, tokenizer: "PreTrainedTokenizer", - messages: List[Dict[str, str]], + messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, - cutoff_len: int = 1_000_000, - reserved_label_len: int = 1, ) -> Tuple[List[int], List[int]]: r""" Returns a single pair of token ids representing prompt and response respectively. """ - encoded_pairs = self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len) + encoded_messages = self._encode(tokenizer, messages, system, tools) prompt_ids = [] - for query_ids, resp_ids in encoded_pairs[:-1]: - prompt_ids += query_ids + resp_ids - prompt_ids = prompt_ids + encoded_pairs[-1][0] - answer_ids = encoded_pairs[-1][1] + for encoded_ids in encoded_messages[:-1]: + prompt_ids += encoded_ids + + answer_ids = encoded_messages[-1] return prompt_ids, answer_ids def encode_multiturn( self, tokenizer: "PreTrainedTokenizer", - messages: List[Dict[str, str]], + messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, - cutoff_len: int = 1_000_000, - reserved_label_len: int = 1, - ) -> Sequence[Tuple[List[int], List[int]]]: + ) -> List[Tuple[List[int], List[int]]]: r""" Returns multiple pairs of token ids representing prompts and responses respectively. 
""" - return self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len) + encoded_messages = self._encode(tokenizer, messages, system, tools) + return [(encoded_messages[i], encoded_messages[i + 1]) for i in range(0, len(encoded_messages), 2)] def extract_tool(self, content: str) -> Union[str, List[Tuple[str, str]]]: r""" @@ -88,16 +85,14 @@ class Template: def _encode( self, tokenizer: "PreTrainedTokenizer", - messages: List[Dict[str, str]], + messages: Sequence[Dict[str, str]], system: Optional[str], tools: Optional[str], - cutoff_len: int, - reserved_label_len: int, - ) -> Sequence[Tuple[List[int], List[int]]]: + ) -> List[List[int]]: r""" Encodes formatted inputs to pairs of token ids. - Turn 0: system + query resp - Turn t: sep + query resp + Turn 0: prefix + system + query resp + Turn t: sep + query resp """ system = system or self.default_system encoded_messages = [] @@ -106,10 +101,9 @@ class Template: if i == 0: elements += self.format_prefix.apply() - - if i == 0 and (system or tools): - tool_text = self.format_tools.apply(content=tools)[0] if tools else "" - elements += self.format_system.apply(content=(system + tool_text)) + if system or tools: + tool_text = self.format_tools.apply(content=tools)[0] if tools else "" + elements += self.format_system.apply(content=(system + tool_text)) if i > 0 and i % 2 == 0: elements += self.format_separator.apply() @@ -127,11 +121,9 @@ class Template: encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements)) - return self._make_pairs(encoded_messages, cutoff_len, reserved_label_len) + return encoded_messages - def _convert_elements_to_ids( - self, tokenizer: "PreTrainedTokenizer", elements: List[Union[str, Dict[str, str]]] - ) -> List[int]: + def _convert_elements_to_ids(self, tokenizer: "PreTrainedTokenizer", elements: "SLOTS") -> List[int]: r""" Converts elements to token ids. """ @@ -152,60 +144,32 @@ class Template: return token_ids - def _make_pairs( - self, - encoded_messages: Sequence[List[int]], - cutoff_len: int, - reserved_label_len: int, - ) -> Sequence[Tuple[List[int], List[int]]]: - encoded_pairs = [] - total_length = 0 - for i in range(0, len(encoded_messages), 2): - if total_length >= cutoff_len: - break - - max_source_len, max_target_len = infer_max_len( - source_len=len(encoded_messages[i]), - target_len=len(encoded_messages[i + 1]), - max_len=(cutoff_len - total_length), - reserved_label_len=reserved_label_len, - ) - source_ids = encoded_messages[i][:max_source_len] - target_ids = encoded_messages[i + 1][:max_target_len] - total_length += len(source_ids) + len(target_ids) - encoded_pairs.append((source_ids, target_ids)) - - return encoded_pairs - @dataclass class Llama2Template(Template): def _encode( self, tokenizer: "PreTrainedTokenizer", - messages: List[Dict[str, str]], + messages: Sequence[Dict[str, str]], system: str, tools: str, - cutoff_len: int, - reserved_label_len: int, - ) -> Sequence[Tuple[List[int], List[int]]]: + ) -> List[List[int]]: r""" Encodes formatted inputs to pairs of token ids. 
- Turn 0: system + query resp - Turn t: sep + query resp + Turn 0: prefix + system + query resp + Turn t: sep + query resp """ system = system or self.default_system encoded_messages = [] for i, message in enumerate(messages): elements = [] + system_text = "" if i == 0: elements += self.format_prefix.apply() - - system_text = "" - if i == 0 and (system or tools): - tool_text = self.format_tools.apply(content=tools)[0] if tools else "" - system_text = self.format_system.apply(content=(system + tool_text))[0] + if system or tools: + tool_text = self.format_tools.apply(content=tools)[0] if tools else "" + system_text = self.format_system.apply(content=(system + tool_text))[0] if i > 0 and i % 2 == 0: elements += self.format_separator.apply() @@ -223,7 +187,7 @@ class Llama2Template(Template): encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements)) - return self._make_pairs(encoded_messages, cutoff_len, reserved_label_len) + return encoded_messages TEMPLATES: Dict[str, Template] = {} @@ -240,7 +204,7 @@ def _register_template( format_separator: Optional["Formatter"] = None, format_prefix: Optional["Formatter"] = None, default_system: str = "", - stop_words: List[str] = [], + stop_words: Sequence[str] = [], image_token: str = "", efficient_eos: bool = False, replace_eos: bool = False, @@ -275,9 +239,7 @@ def _register_template( template_class = Llama2Template if name.startswith("llama2") else Template default_user_formatter = StringFormatter(slots=["{{content}}"]) default_assistant_formatter = StringFormatter(slots=["{{content}}"] + eos_slots) - default_function_formatter = FunctionFormatter( - slots=["Action: {{name}}\nAction Input: {{arguments}}\n"] + eos_slots - ) + default_function_formatter = FunctionFormatter(slots=eos_slots, tool_format="default") default_tool_formatter = ToolFormatter(tool_format="default") default_separator_formatter = EmptyFormatter() default_prefix_formatter = EmptyFormatter() @@ -390,7 +352,9 @@ def get_template_and_fix_tokenizer( if tool_format is not None: logger.info("Using tool format: {}.".format(tool_format)) + eos_slots = [] if template.efficient_eos else [{"eos_token"}] template.format_tools = ToolFormatter(tool_format=tool_format) + template.format_function = FunctionFormatter(slots=eos_slots, tool_format=tool_format) stop_words = template.stop_words if template.replace_eos: @@ -506,10 +470,11 @@ _register_template( format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]), - format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), + format_function=FunctionFormatter(slots=[], tool_format="glm4"), format_observation=StringFormatter( slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] ), + format_tools=ToolFormatter(tool_format="glm4"), format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, @@ -603,16 +568,15 @@ _register_template( _register_template( name="deepseekcoder", format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]), - format_assistant=StringFormatter(slots=["\n", "{{content}}"]), - format_separator=EmptyFormatter(slots=["\n<|EOT|>\n"]), + format_assistant=StringFormatter(slots=["\n{{content}}\n"]), + format_separator=EmptyFormatter(slots=["\n"]), + 
format_prefix=EmptyFormatter(slots=[{"bos_token"}]), default_system=( "You are an AI programming assistant, utilizing the Deepseek Coder model, " "developed by Deepseek Company, and you only answer questions related to computer science. " "For politically sensitive questions, security and privacy issues, " "and other non-computer science questions, you will refuse to answer\n" ), - stop_words=["<|EOT|>"], - efficient_eos=True, ) @@ -662,7 +626,7 @@ _register_template( format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), format_assistant=StringFormatter(slots=["\n{{content}}"]), format_system=StringFormatter(slots=["<|system|>\n{{content}}"]), - format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), + format_function=FunctionFormatter(slots=[], tool_format="glm4"), format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), format_tools=ToolFormatter(tool_format="glm4"), format_prefix=EmptyFormatter(slots=["[gMASK]"]), diff --git a/src/llamafactory/data/tool_utils.py b/src/llamafactory/data/tool_utils.py new file mode 100644 index 00000000..ac5565d5 --- /dev/null +++ b/src/llamafactory/data/tool_utils.py @@ -0,0 +1,140 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import re +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple, Union + +from .data_utils import SLOTS + + +DEFAULT_TOOL_PROMPT = ( + "You have access to the following tools:\n{tool_text}" + "Use the following format if using a tool:\n" + "```\n" + "Action: tool name (one of [{tool_names}]).\n" + "Action Input: the input to the tool, in a JSON format representing the kwargs " + """(e.g. ```{{"input": "hello world", "num_beams": 5}}```).\n""" + "```\n" +) + + +GLM4_TOOL_PROMPT = ( + "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的," + "你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具{tool_text}" +) + + +@dataclass +class ToolUtils(ABC): + @staticmethod + @abstractmethod + def get_function_slots() -> SLOTS: ... + + @staticmethod + @abstractmethod + def tool_formatter(tools: List[Dict[str, Any]]) -> str: ... + + @staticmethod + @abstractmethod + def tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: ... 
+ + +class DefaultToolUtils(ToolUtils): + @staticmethod + def get_function_slots() -> SLOTS: + return ["Action: {{name}}\nAction Input: {{arguments}}\n"] + + @staticmethod + def tool_formatter(tools: List[Dict[str, Any]]) -> str: + tool_text = "" + tool_names = [] + for tool in tools: + param_text = "" + for name, param in tool["parameters"]["properties"].items(): + required, enum, items = "", "", "" + if name in tool["parameters"].get("required", []): + required = ", required" + + if param.get("enum", None): + enum = ", should be one of [{}]".format(", ".join(param["enum"])) + + if param.get("items", None): + items = ", where each item should be {}".format(param["items"].get("type", "")) + + param_text += " - {name} ({type}{required}): {desc}{enum}{items}\n".format( + name=name, + type=param.get("type", ""), + required=required, + desc=param.get("description", ""), + enum=enum, + items=items, + ) + + tool_text += "> Tool Name: {name}\nTool Description: {desc}\nTool Args:\n{args}\n".format( + name=tool["name"], desc=tool.get("description", ""), args=param_text + ) + tool_names.append(tool["name"]) + + return DEFAULT_TOOL_PROMPT.format(tool_text=tool_text, tool_names=", ".join(tool_names)) + + @staticmethod + def tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: + regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|\s*$)", re.DOTALL) + action_match: List[Tuple[str, str]] = re.findall(regex, content) + if not action_match: + return content + + results = [] + for match in action_match: + tool_name = match[0].strip() + tool_input = match[1].strip().strip('"').strip("```") + try: + arguments = json.loads(tool_input) + results.append((tool_name, json.dumps(arguments, ensure_ascii=False))) + except json.JSONDecodeError: + return content + + return results + + +class GLM4ToolUtils(ToolUtils): + @staticmethod + def get_function_slots() -> SLOTS: + return ["{{name}}\n{{arguments}}"] + + @staticmethod + def tool_formatter(tools: List[Dict[str, Any]]) -> str: + tool_text = "" + for tool in tools: + tool_text += "\n\n## {name}\n\n{body}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format( + name=tool["name"], body=json.dumps(tool, indent=4, ensure_ascii=False) + ) + + return GLM4_TOOL_PROMPT.format(tool_text=tool_text) + + @staticmethod + def tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: + if "\n" not in content: + return content + + tool_name, tool_input = content.split("\n", maxsplit=1) + try: + arguments = json.loads(tool_input) + except json.JSONDecodeError: + return content + + return [(tool_name, json.dumps(arguments, ensure_ascii=False))] diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index dad13820..880be84a 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -45,10 +45,6 @@ class DataArguments: default=1024, metadata={"help": "The cutoff length of the tokenized inputs in the dataset."}, ) - reserved_label_len: int = field( - default=1, - metadata={"help": "The minimum cutoff length reserved for the tokenized labels in the dataset."}, - ) train_on_prompt: bool = field( default=False, metadata={"help": "Whether to disable the mask on the prompt or not."}, @@ -111,9 +107,6 @@ class DataArguments: ) def __post_init__(self): - if self.reserved_label_len >= self.cutoff_len: - raise ValueError("`reserved_label_len` must be smaller than `cutoff_len`.") - if self.streaming and self.val_size > 1e-6 and self.val_size < 1: raise ValueError("Streaming mode should 
have an integer val size.") diff --git a/tests/data/test_formatter.py b/tests/data/test_formatter.py index 37b21dc5..1845df24 100644 --- a/tests/data/test_formatter.py +++ b/tests/data/test_formatter.py @@ -28,7 +28,7 @@ def test_string_formatter(): def test_function_formatter(): - formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}\n"]) + formatter = FunctionFormatter(slots=[], tool_format="default") tool_calls = json.dumps({"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}) assert formatter.apply(content=tool_calls) == [ """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""" @@ -36,7 +36,7 @@ def test_function_formatter(): def test_multi_function_formatter(): - formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}\n"]) + formatter = FunctionFormatter(slots=[], tool_format="default") tool_calls = json.dumps([{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}] * 2) assert formatter.apply(content=tool_calls) == [ """Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""", diff --git a/tests/data/test_processor.py b/tests/data/test_processor.py new file mode 100644 index 00000000..fa8f7172 --- /dev/null +++ b/tests/data/test_processor.py @@ -0,0 +1,32 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Tuple + +import pytest + +from llamafactory.data.processors.processor_utils import infer_seqlen + + +@pytest.mark.parametrize( + "test_input,test_output", + [ + ((3000, 2000, 1000), (600, 400)), + ((2000, 3000, 1000), (400, 600)), + ((1000, 100, 1000), (900, 100)), + ((100, 1000, 1000), (100, 900)), + ], +) +def test_infer_seqlen(test_input: Tuple[int, int, int], test_output: Tuple[int, int]): + assert test_output == infer_seqlen(*test_input) diff --git a/tests/data/test_template.py b/tests/data/test_template.py index 9d73c116..e4728a84 100644 --- a/tests/data/test_template.py +++ b/tests/data/test_template.py @@ -21,15 +21,60 @@ from llamafactory.data import get_template_and_fix_tokenizer TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3") +MESSAGES = [ + {"role": "user", "content": "How are you"}, + {"role": "assistant", "content": "I am fine!"}, + {"role": "user", "content": "你好"}, + {"role": "assistant", "content": "很高兴认识你!"}, +] + + +def test_encode_oneturn(): + tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) + template = get_template_and_fix_tokenizer(tokenizer, name="llama3") + prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES) + assert tokenizer.decode(prompt_ids) == ( + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\nI am fine!<|eot_id|>" + "<|start_header_id|>user<|end_header_id|>\n\n你好<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + ) + assert tokenizer.decode(answer_ids) == "很高兴认识你!<|eot_id|>" + + +def test_encode_multiturn(): + tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) + template = get_template_and_fix_tokenizer(tokenizer, name="llama3") + encoded_pairs = template.encode_multiturn(tokenizer, MESSAGES) + assert tokenizer.decode(encoded_pairs[0][0]) == ( + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + ) + assert tokenizer.decode(encoded_pairs[0][1]) == "I am fine!<|eot_id|>" + assert tokenizer.decode(encoded_pairs[1][0]) == ( + "<|start_header_id|>user<|end_header_id|>\n\n你好<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + ) + assert tokenizer.decode(encoded_pairs[1][1]) == "很高兴认识你!<|eot_id|>" + def test_jinja_template(): tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA) get_template_and_fix_tokenizer(tokenizer, name="llama3") assert tokenizer.chat_template != ref_tokenizer.chat_template + assert tokenizer.apply_chat_template(MESSAGES) == ref_tokenizer.apply_chat_template(MESSAGES) - messages = [ - {"role": "user", "content": "hi!"}, - {"role": "assistant", "content": "hello there"}, - ] - assert tokenizer.apply_chat_template(messages) == ref_tokenizer.apply_chat_template(messages) + +def test_qwen_template(): + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") + template = get_template_and_fix_tokenizer(tokenizer, name="qwen") + prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES) + assert tokenizer.decode(prompt_ids) == ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\nHow are you<|im_end|>\n" + "<|im_start|>assistant\nI am fine!<|im_end|>\n" + "<|im_start|>user\n你好<|im_end|>\n" + "<|im_start|>assistant\n" + ) + assert tokenizer.decode(answer_ids) == "很高兴认识你!<|im_end|>" From 38c94d2e9c656902b86e0b76d913024772e9c7f1 Mon Sep 17 00:00:00 2001 
From: hiyouga <467089858@qq.com> Date: Mon, 1 Jul 2024 01:29:09 +0800 Subject: [PATCH 157/160] Update label_issue.yml Former-commit-id: fffa3defdda02ad579cb703c0704f94bad94f21a --- .github/workflows/label_issue.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml index 0e10f0b9..ffd644a7 100644 --- a/.github/workflows/label_issue.yml +++ b/.github/workflows/label_issue.yml @@ -16,7 +16,7 @@ jobs: ISSUE_TITLE: ${{ github.event.issue.title }} run: | LABEL=pending - NPU_KEYWORDS=(npu ascend 昇腾) + NPU_KEYWORDS=(npu ascend huawei 华为 昇腾) ISSUE_TITLE_LOWER=$(echo $ISSUE_TITLE | tr '[:upper:]' '[:lower:]') for KEYWORD in ${NPU_KEYWORDS[@]}; do if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]]; then From 884b49e662ed23fbe9be63062a1f3a0385bff0a4 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 1 Jul 2024 03:51:20 +0800 Subject: [PATCH 158/160] add eval acc Former-commit-id: 7ffde76fbfb6192e3aac31ccc098f31ce89181ae --- src/llamafactory/train/sft/metric.py | 24 +++++++++++++++++++++--- src/llamafactory/train/sft/trainer.py | 17 ++++++----------- src/llamafactory/train/sft/workflow.py | 7 ++++--- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index 95bfcb69..72faef0a 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -17,9 +17,11 @@ # limitations under the License. from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Dict import numpy as np +import torch +from transformers import EvalPrediction from transformers.utils import is_jieba_available, is_nltk_available from ...extras.constants import IGNORE_INDEX @@ -42,6 +44,22 @@ if is_rouge_available(): from rouge_chinese import Rouge +def compute_accuracy(eval_preds: "EvalPrediction") -> Dict[str, float]: + preds, labels = eval_preds.predictions, eval_preds.label_ids + accuracies = [] + for i in range(len(preds)): + pred, label = preds[i, 1:], labels[i, :-1] + label_mask = label != IGNORE_INDEX + accuracies.append(np.mean(pred[label_mask] == label[label_mask])) + + return {"accuracy": float(np.mean(accuracies))} + + +def eval_logit_processor(logits: "torch.Tensor", labels: "torch.Tensor") -> "torch.Tensor": + logits = logits[0] if isinstance(logits, (list, tuple)) else logits + return torch.argmax(logits, dim=-1) + + @dataclass class ComputeMetrics: r""" @@ -50,11 +68,11 @@ class ComputeMetrics: tokenizer: "PreTrainedTokenizer" - def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]: + def __call__(self, eval_preds: "EvalPrediction") -> Dict[str, float]: r""" Uses the model predictions to compute metrics. 
""" - preds, labels = eval_preds + preds, labels = eval_preds.predictions, eval_preds.label_ids score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []} preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 06bd2b6b..954bb69f 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -135,21 +135,16 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): for i in range(len(preds)): pad_len = np.nonzero(preds[i] != self.tokenizer.pad_token_id)[0] - if len(pad_len): - preds[i] = np.concatenate( - (preds[i][pad_len[0] :], preds[i][: pad_len[0]]), axis=-1 - ) # move pad token to last + if len(pad_len): # move pad token to last + preds[i] = np.concatenate((preds[i][pad_len[0] :], preds[i][: pad_len[0]]), axis=-1) - decoded_inputs = self.tokenizer.batch_decode( - dataset["input_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=False - ) - decoded_labels = self.tokenizer.batch_decode( - labels, skip_special_tokens=True, clean_up_tokenization_spaces=False - ) - decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True) + decoded_inputs = self.tokenizer.batch_decode(dataset["input_ids"], skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) with open(output_prediction_file, "w", encoding="utf-8") as writer: res: List[str] = [] for text, label, pred in zip(decoded_inputs, decoded_labels, decoded_preds): res.append(json.dumps({"prompt": text, "label": label, "predict": pred}, ensure_ascii=False)) + writer.write("\n".join(res)) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 885bc7ac..0c3f9b11 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -25,7 +25,7 @@ from ...extras.misc import get_logits_processor from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..trainer_utils import create_modelcard_and_push -from .metric import ComputeMetrics +from .metric import ComputeMetrics, compute_accuracy, eval_logit_processor from .trainer import CustomSeq2SeqTrainer @@ -72,7 +72,8 @@ def run_sft( finetuning_args=finetuning_args, data_collator=data_collator, callbacks=callbacks, - compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else compute_accuracy, + preprocess_logits_for_metrics=None if training_args.predict_with_generate else eval_logit_processor, **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) @@ -91,7 +92,7 @@ def run_sft( trainer.save_metrics("train", train_result.metrics) trainer.save_state() if trainer.is_world_process_zero() and finetuning_args.plot_loss: - plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "eval_accuracy"]) # Evaluation if training_args.do_eval: From 4357e4239171ad69f95dcf2cf12b6cf894a48450 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 1 Jul 2024 03:55:20 +0800 Subject: [PATCH 159/160] tiny fix Former-commit-id: 19e43c3a9ed771e991cb273d394ab28fb923f868 --- src/llamafactory/model/model_utils/attention.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index 80d9d4b8..4bed7e21 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -35,7 +35,7 @@ def configure_attn_implementation( if model_args.flash_attn == "auto": logger.warning("Gemma-2 models should use eager attention in training, change `flash_attn` to disabled.") model_args.flash_attn = "disabled" - else: + elif model_args.flash_attn != "disabled": logger.warning( "Gemma-2 models should use eager attention in training, but you set `flash_attn: {}`. " "Will proceed at your own risk.".format(model_args.flash_attn) From 973cf8e980ab8154939a41b963bb782385458563 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 1 Jul 2024 05:43:17 +0800 Subject: [PATCH 160/160] tiny fix Former-commit-id: 5dd2e5c3323f56420b5845a5ed28bcd9d4da5e41 --- src/llamafactory/train/sft/metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/train/sft/metric.py b/src/llamafactory/train/sft/metric.py index 72faef0a..c69608c0 100644 --- a/src/llamafactory/train/sft/metric.py +++ b/src/llamafactory/train/sft/metric.py @@ -48,7 +48,7 @@ def compute_accuracy(eval_preds: "EvalPrediction") -> Dict[str, float]: preds, labels = eval_preds.predictions, eval_preds.label_ids accuracies = [] for i in range(len(preds)): - pred, label = preds[i, 1:], labels[i, :-1] + pred, label = preds[i, :-1], labels[i, 1:] label_mask = label != IGNORE_INDEX accuracies.append(np.mean(pred[label_mask] == label[label_mask]))
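
The final hunk above corrects the alignment used by compute_accuracy: eval_logit_processor takes the argmax of the causal-LM logits, so the prediction stored at position t is the model's guess for the token at position t + 1. Pairing preds[i, :-1] with labels[i, 1:] therefore compares each prediction with the token it was actually predicting, whereas the earlier preds[i, 1:] / labels[i, :-1] pairing was off by two positions. A minimal NumPy sketch of the two alignments, assuming the usual -100 sentinel for IGNORE_INDEX and a hand-made toy batch (not taken from the repository):

import numpy as np

IGNORE_INDEX = -100  # value used for masked label positions

# Toy batch: a perfect model's argmax at position t equals the token at position t + 1.
labels = np.array([[IGNORE_INDEX, 11, 12, 13]])
preds  = np.array([[11, 12, 13, 99]])  # the last position predicts past the end of the labels

pred, label = preds[0, :-1], labels[0, 1:]   # corrected alignment
mask = label != IGNORE_INDEX
print(np.mean(pred[mask] == label[mask]))    # 1.0: each prediction matches its next token

pred, label = preds[0, 1:], labels[0, :-1]   # pre-fix alignment, shifted two positions apart
mask = label != IGNORE_INDEX
print(np.mean(pred[mask] == label[mask]))    # 0.0: nothing lines up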
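
For reference, the infer_seqlen helper introduced in processor_utils.py splits the cutoff_len budget between prompt and response: a short response keeps its full length and the prompt absorbs the truncation, a short prompt does the opposite, and when both are long the budget is divided proportionally. A standalone sketch of that logic, checked against the worked cases from tests/data/test_processor.py:

from typing import Tuple

def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> Tuple[int, int]:
    """Split a cutoff budget between the prompt (source) and the response (target)."""
    if target_len * 2 < cutoff_len:      # short response: keep it whole, truncate the prompt
        max_target_len = cutoff_len
    elif source_len * 2 < cutoff_len:    # short prompt: keep it whole, truncate the response
        max_target_len = cutoff_len - source_len
    else:                                # both long: split the budget proportionally
        max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))

    new_target_len = min(max_target_len, target_len)
    new_source_len = max(cutoff_len - new_target_len, 0)
    return new_source_len, new_target_len

# Worked cases mirroring tests/data/test_processor.py:
assert infer_seqlen(3000, 2000, 1000) == (600, 400)  # both long: 1000 * 2000/5000 = 400 for the response
assert infer_seqlen(1000, 100, 1000) == (900, 100)   # short response kept whole, prompt cut to 900
assert infer_seqlen(100, 1000, 1000) == (100, 900)   # short prompt kept whole, response cut to 900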