From de509fa081707007cc45f016364977bdf6a8b57d Mon Sep 17 00:00:00 2001 From: cocktailpeanut Date: Thu, 9 May 2024 01:26:15 -0400 Subject: [PATCH 01/14] remove unnecessary environment variable usage Former-commit-id: b783673e0aa064963d9cce712378f73c9b5ff51d --- src/webui.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/webui.py b/src/webui.py index b9385259..000098d1 100644 --- a/src/webui.py +++ b/src/webui.py @@ -5,9 +5,7 @@ from llmtuner.webui.interface import create_ui def main(): server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") - server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860")) - gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0"))) - create_ui().queue().launch(share=gradio_share, server_name=server_name, server_port=server_port) + create_ui().queue().launch(server_name=server_name) if __name__ == "__main__": From 58c5a5afafde79bd854e5f66110aa6a7a2203cd1 Mon Sep 17 00:00:00 2001 From: cocktailpeanut Date: Thu, 9 May 2024 01:32:00 -0400 Subject: [PATCH 02/14] more removal of unnecessary environment variables Former-commit-id: 425b9d616622467487339d5055b5d513c0fe7814 --- src/llmtuner/webui/interface.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py index 969ce6bd..6cfce8aa 100644 --- a/src/llmtuner/webui/interface.py +++ b/src/llmtuner/webui/interface.py @@ -79,6 +79,4 @@ def run_web_ui() -> None: def run_web_demo() -> None: server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") - server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860")) - gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0"))) - create_web_demo().queue().launch(share=gradio_share, server_name=server_name, server_port=server_port) + create_web_demo().queue().launch(server_name=server_name) From 2370e7403f0fba55f179b8ddfb8353f06c9929ac Mon Sep 17 00:00:00 2001 From: cocktailpeanut Date: Thu, 9 May 2024 01:33:20 -0400 Subject: [PATCH 03/14] yet another removal of unnecessary environment variables Former-commit-id: 3c11157a496ae5a1e786870e4642475e53aad123 --- src/llmtuner/webui/interface.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py index 6cfce8aa..bbd91bb7 100644 --- a/src/llmtuner/webui/interface.py +++ b/src/llmtuner/webui/interface.py @@ -72,9 +72,7 @@ def create_web_demo() -> gr.Blocks: def run_web_ui() -> None: server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") - server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860")) - gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0"))) - create_ui().queue().launch(share=gradio_share, server_name=server_name, server_port=server_port) + create_ui().queue().launch(server_name=server_name) def run_web_demo() -> None: From dd42439b038cea2fbcdb854e838850e009a9efbd Mon Sep 17 00:00:00 2001 From: Tendo33 Date: Thu, 9 May 2024 14:28:01 +0800 Subject: [PATCH 04/14] 1.Change the name of is_fastapi_available function 2. 
Added the log of printing requests when deploying using vllm Former-commit-id: fd2e6dec589f4ebe55d4c203991c47bf5b728ef8 --- src/llmtuner/api/app.py | 16 +++++--- src/llmtuner/api/chat.py | 73 ++++++++++++++++++++++----------- src/llmtuner/api/common.py | 6 +-- src/llmtuner/extras/packages.py | 2 +- 4 files changed, 64 insertions(+), 33 deletions(-) diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py index 375ee61f..2d93312d 100644 --- a/src/llmtuner/api/app.py +++ b/src/llmtuner/api/app.py @@ -4,7 +4,7 @@ from typing import Annotated, Optional from ..chat import ChatModel from ..extras.misc import torch_gc -from ..extras.packages import is_fastapi_availble, is_starlette_available, is_uvicorn_available +from ..extras.packages import is_fastapi_available, is_starlette_available, is_uvicorn_available from .chat import ( create_chat_completion_response, create_score_evaluation_response, @@ -20,7 +20,7 @@ from .protocol import ( ) -if is_fastapi_availble(): +if is_fastapi_available(): from fastapi import Depends, FastAPI, HTTPException, status from fastapi.middleware.cors import CORSMiddleware from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer @@ -54,7 +54,8 @@ def create_app(chat_model: "ChatModel") -> "FastAPI": async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]): if api_key and (auth is None or auth.credentials != api_key): - raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.") + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.") @app.get( "/v1/models", @@ -74,10 +75,12 @@ def create_app(chat_model: "ChatModel") -> "FastAPI": ) async def create_chat_completion(request: ChatCompletionRequest): if not chat_model.engine.can_generate: - raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") + raise HTTPException( + status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") if request.stream: - generate = create_stream_chat_completion_response(request, chat_model) + generate = create_stream_chat_completion_response( + request, chat_model) return EventSourceResponse(generate, media_type="text/event-stream") else: return await create_chat_completion_response(request, chat_model) @@ -90,7 +93,8 @@ def create_app(chat_model: "ChatModel") -> "FastAPI": ) async def create_score_evaluation(request: ScoreEvaluationRequest): if chat_model.engine.can_generate: - raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") + raise HTTPException( + status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") return await create_score_evaluation_response(request, chat_model) diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py index 2a703877..3ab473d1 100644 --- a/src/llmtuner/api/chat.py +++ b/src/llmtuner/api/chat.py @@ -3,7 +3,8 @@ import uuid from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Optional, Tuple from ..data import Role as DataRole -from ..extras.packages import is_fastapi_availble +from ..extras.packages import is_fastapi_available +from ..extras.logging import get_logger from .common import dictify, jsonify from .protocol import ( ChatCompletionMessage, @@ -19,8 +20,9 @@ from .protocol import ( ScoreEvaluationResponse, ) +logger = get_logger(__name__) -if is_fastapi_availble(): +if is_fastapi_available(): from fastapi import HTTPException, status @@ -39,8 +41,13 @@ ROLE_MAPPING = { def _process_request(request: 
"ChatCompletionRequest") -> Tuple[List[Dict[str, str]], str, str]: + + params = dictify(request) + logger.info(f"==== request ====\n{params}") + if len(request.messages) == 0: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length") if request.messages[0].role == Role.SYSTEM: system = request.messages.pop(0).content @@ -48,29 +55,37 @@ def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, s system = "" if len(request.messages) % 2 == 0: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, + detail="Only supports u/a/u/a/u...") input_messages = [] for i, message in enumerate(request.messages): if i % 2 == 0 and message.role not in [Role.USER, Role.TOOL]: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls): name = message.tool_calls[0].function.name arguments = message.tool_calls[0].function.arguments - content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False) - input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content}) + content = json.dumps( + {"name": name, "argument": arguments}, ensure_ascii=False) + input_messages.append( + {"role": ROLE_MAPPING[Role.FUNCTION], "content": content}) else: - input_messages.append({"role": ROLE_MAPPING[message.role], "content": message.content}) + input_messages.append( + {"role": ROLE_MAPPING[message.role], "content": message.content}) tool_list = request.tools if isinstance(tool_list, list) and len(tool_list): try: - tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False) + tools = json.dumps([dictify(tool.function) + for tool in tool_list], ensure_ascii=False) except Exception: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools") else: tools = "" @@ -84,8 +99,10 @@ def _create_stream_chat_completion_chunk( index: Optional[int] = 0, finish_reason: Optional["Finish"] = None, ) -> str: - choice_data = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason) - chunk = ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice_data]) + choice_data = ChatCompletionStreamResponseChoice( + index=index, delta=delta, finish_reason=finish_reason) + chunk = ChatCompletionStreamResponse( + id=completion_id, model=model, choices=[choice_data]) return jsonify(chunk) @@ -110,21 +127,26 @@ async def create_chat_completion_response( choices = [] for i, response in enumerate(responses): if tools: - result = chat_model.engine.template.format_tools.extract(response.response_text) + result = chat_model.engine.template.format_tools.extract( + response.response_text) else: result = response.response_text if isinstance(result, tuple): name, arguments = result function = Function(name=name, arguments=arguments) - 
tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function) - response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=[tool_call]) + tool_call = FunctionCall(id="call_{}".format( + uuid.uuid4().hex), function=function) + response_message = ChatCompletionMessage( + role=Role.ASSISTANT, tool_calls=[tool_call]) finish_reason = Finish.TOOL else: - response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result) + response_message = ChatCompletionMessage( + role=Role.ASSISTANT, content=result) finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH - choices.append(ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason)) + choices.append(ChatCompletionResponseChoice( + index=i, message=response_message, finish_reason=finish_reason)) prompt_length = response.prompt_length response_length += response.response_length @@ -143,13 +165,16 @@ async def create_stream_chat_completion_response( completion_id = "chatcmpl-{}".format(uuid.uuid4().hex) input_messages, system, tools = _process_request(request) if tools: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, + detail="Cannot stream function calls.") if request.n > 1: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream multiple responses.") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, + detail="Cannot stream multiple responses.") yield _create_stream_chat_completion_chunk( - completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(role=Role.ASSISTANT, content="") + completion_id=completion_id, model=request.model, delta=ChatCompletionMessage( + role=Role.ASSISTANT, content="") ) async for new_token in chat_model.astream_chat( input_messages, @@ -163,7 +188,8 @@ async def create_stream_chat_completion_response( ): if len(new_token) != 0: yield _create_stream_chat_completion_chunk( - completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(content=new_token) + completion_id=completion_id, model=request.model, delta=ChatCompletionMessage( + content=new_token) ) yield _create_stream_chat_completion_chunk( @@ -176,7 +202,8 @@ async def create_score_evaluation_response( request: "ScoreEvaluationRequest", chat_model: "ChatModel" ) -> "ScoreEvaluationResponse": if len(request.messages) == 0: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request") scores = await chat_model.aget_scores(request.messages, max_length=request.max_length) return ScoreEvaluationResponse(model=request.model, scores=scores) diff --git a/src/llmtuner/api/common.py b/src/llmtuner/api/common.py index 5ad9a071..3e95d211 100644 --- a/src/llmtuner/api/common.py +++ b/src/llmtuner/api/common.py @@ -6,11 +6,11 @@ if TYPE_CHECKING: from pydantic import BaseModel -def dictify(data: "BaseModel") -> Dict[str, Any]: +def dictify(data: "BaseModel", **kwargs) -> Dict[str, Any]: try: # pydantic v2 - return data.model_dump(exclude_unset=True) + return data.model_dump(**kwargs) except AttributeError: # pydantic v1 - return data.dict(exclude_unset=True) + return data.dict(**kwargs) def jsonify(data: "BaseModel") -> str: diff --git a/src/llmtuner/extras/packages.py b/src/llmtuner/extras/packages.py index a7317eec..4c9e6492 100644 --- 
a/src/llmtuner/extras/packages.py +++ b/src/llmtuner/extras/packages.py @@ -20,7 +20,7 @@ def _get_package_version(name: str) -> "Version": return version.parse("0.0.0") -def is_fastapi_availble(): +def is_fastapi_available(): return _is_package_available("fastapi") From 10e65f004228b7f64b36e83c70a6e9ed5c0ce058 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Sun, 12 May 2024 00:03:59 +0800 Subject: [PATCH 05/14] fix #3674 Former-commit-id: 56857770f8e66d667e3fe3ad7a11ab321c7fe020 --- src/llmtuner/train/tuner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py index ffdc3e60..8f103ca1 100644 --- a/src/llmtuner/train/tuner.py +++ b/src/llmtuner/train/tuner.py @@ -68,6 +68,8 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None: output_dtype = getattr(model.config, "torch_dtype", torch.float16) setattr(model.config, "torch_dtype", output_dtype) model = model.to(output_dtype) + else: + setattr(model.config, "torch_dtype", torch.float16) model.save_pretrained( save_directory=model_args.export_dir, From 0f941f30f7ee06720903ce6e13796e7431aab382 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Sun, 12 May 2024 00:33:49 +0800 Subject: [PATCH 06/14] update readme Former-commit-id: 638043ced426c392014c5f42ce00f378f92f905d --- README.md | 12 +++++++++--- README_zh.md | 13 +++++++++---- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 90c66caf..80154cae 100644 --- a/README.md +++ b/README.md @@ -366,17 +366,23 @@ See [examples/README.md](examples/README.md) for advanced usage (including distr #### Use local environment ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui +CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui ``` -
For Alibaba Cloud users +
For Alibaba Cloud PAI or AutoDL users -If you encountered display problems in LLaMA Board on Alibaba Cloud, try using the following command to set environment variables before starting LLaMA Board: +If you encountered display problems in LLaMA Board on Alibaba Cloud PAI, try using the following command to set environment variables before starting LLaMA Board: ```bash export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ ``` +If you are using AutoDL, please install a specific version of Gradio: + +```bash +pip install gradio==4.10.0 +``` +
#### Use Docker diff --git a/README_zh.md b/README_zh.md index 0aba9043..5656fb4a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -366,17 +366,23 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_s #### 使用本地环境 ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui +CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui ``` -
阿里云用户指南 +
阿里云 PAI 和 AutoDL 用户指南 -如果您在阿里云上使用 LLaMA Board 时遇到显示问题,请尝试在启动前使用以下命令设置环境变量: +如果您在阿里云 PAI 上使用 LLaMA Board 时遇到显示问题,请尝试在启动前使用以下命令设置环境变量: ```bash export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ ``` +如果您正在使用 AutoDL,请安装下述 Gradio 版本: + +```bash +pip install gradio==4.10.0 +``` +
#### 使用 Docker @@ -475,7 +481,6 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1` 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**:一个用于生成 Stable Diffusion 提示词的大型语言模型。[[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) 1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**:中文多模态医学大模型,基于 LLaVA-1.5-7B 在中文多模态医疗数据上微调而得。 -
## 协议 From 51e0f095a94852342397d468d5c17dc5e2789a6d Mon Sep 17 00:00:00 2001 From: hiyouga Date: Sun, 12 May 2024 01:10:30 +0800 Subject: [PATCH 07/14] remove checksum and fix ui args Former-commit-id: 58c522cd5cc4498a3fa8ed99424b5d63c9e56ccb --- README.md | 4 ++-- README_zh.md | 4 ++-- data/dataset_info.json | 25 +++++-------------------- src/llmtuner/data/loader.py | 4 +--- src/llmtuner/data/parser.py | 2 -- src/llmtuner/data/utils.py | 15 --------------- src/llmtuner/webui/interface.py | 6 ++++-- src/webui.py | 3 ++- 8 files changed, 16 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 80154cae..57a34dab 100644 --- a/README.md +++ b/README.md @@ -366,7 +366,7 @@ See [examples/README.md](examples/README.md) for advanced usage (including distr #### Use local environment ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui +CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui ```
For Alibaba Cloud PAI or AutoDL users @@ -374,7 +374,7 @@ CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli w If you encountered display problems in LLaMA Board on Alibaba Cloud PAI, try using the following command to set environment variables before starting LLaMA Board: ```bash -export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ +export GRADIO_SERVER_PORT=7860 GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ ``` If you are using AutoDL, please install a specific version of Gradio: diff --git a/README_zh.md b/README_zh.md index 5656fb4a..047b1645 100644 --- a/README_zh.md +++ b/README_zh.md @@ -366,7 +366,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_s #### 使用本地环境 ```bash -CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui +CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui ```
阿里云 PAI 和 AutoDL 用户指南 @@ -374,7 +374,7 @@ CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli w 如果您在阿里云 PAI 上使用 LLaMA Board 时遇到显示问题,请尝试在启动前使用以下命令设置环境变量: ```bash -export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ +export GRADIO_SERVER_PORT=7860 GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/ ``` 如果您正在使用 AutoDL,请安装下述 Gradio 版本: diff --git a/data/dataset_info.json b/data/dataset_info.json index d5b7208f..032a5c49 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -1,27 +1,21 @@ { "alpaca_en": { - "file_name": "alpaca_data_en_52k.json", - "file_sha1": "607f94a7f581341e59685aef32f531095232cf23" + "file_name": "alpaca_data_en_52k.json" }, "alpaca_zh": { - "file_name": "alpaca_data_zh_51k.json", - "file_sha1": "2ba9827122c158dc256668d42bd1bcb8bc6b786e" + "file_name": "alpaca_data_zh_51k.json" }, "alpaca_gpt4_en": { - "file_name": "alpaca_gpt4_data_en.json", - "file_sha1": "647f4ad447bd993e4b6b6223d1be15208bab694a" + "file_name": "alpaca_gpt4_data_en.json" }, "alpaca_gpt4_zh": { - "file_name": "alpaca_gpt4_data_zh.json", - "file_sha1": "3eaa3bda364ccdd59925d7448a698256c31ef845" + "file_name": "alpaca_gpt4_data_zh.json" }, "identity": { - "file_name": "identity.json", - "file_sha1": "0f67e97fd01612006ab3536cdaf6cfb0d1e7f279" + "file_name": "identity.json" }, "oaast_sft_zh": { "file_name": "oaast_sft_zh.json", - "file_sha1": "a6a91f18f80f37b10ded9cf633fb50c033bf7b9f", "columns": { "prompt": "instruction", "query": "input", @@ -31,7 +25,6 @@ }, "lima": { "file_name": "lima.json", - "file_sha1": "9db59f6b7007dc4b17529fc63379b9cd61640f37", "columns": { "prompt": "instruction", "query": "input", @@ -41,7 +34,6 @@ }, "glaive_toolcall": { "file_name": "glaive_toolcall_10k.json", - "file_sha1": "36aea64548fbf6aa300bef411b9221092ed84902", "formatting": "sharegpt", "columns": { "messages": "conversations", @@ -50,7 +42,6 @@ }, "mllm_demo": { "file_name": "mllm_demo.json", - "file_sha1": "d626cc0ad88a26d0dc9fcb47336821cf486d8bcc", "formatting": "sharegpt", "columns": { "messages": "messages", @@ -308,7 +299,6 @@ }, "oaast_rm_zh": { "file_name": "oaast_rm_zh.json", - "file_sha1": "1065af1f3784dd61be5e79713a35f427b713a232", "columns": { "prompt": "instruction", "query": "input", @@ -319,17 +309,14 @@ }, "comparison_gpt4_en": { "file_name": "comparison_gpt4_data_en.json", - "file_sha1": "96fa18313544e22444fe20eead7754b17da452ae", "ranking": true }, "comparison_gpt4_zh": { "file_name": "comparison_gpt4_data_zh.json", - "file_sha1": "515b18ed497199131ddcc1af950345c11dc5c7fd", "ranking": true }, "orca_rlhf": { "file_name": "orca_rlhf.json", - "file_sha1": "acc8f74d16fd1fc4f68e7d86eaa781c2c3f5ba8e", "ranking": true, "columns": { "prompt": "question", @@ -370,14 +357,12 @@ }, "wiki_demo": { "file_name": "wiki_demo.txt", - "file_sha1": "e70375e28eda542a90c68213640cc371898ce181", "columns": { "prompt": "text" } }, "c4_demo": { "file_name": "c4_demo.json", - "file_sha1": "a5a0c86759732f9a5238e447fecd74f28a66cca8", "columns": { "prompt": "text" } diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index ca0d5407..3cc01b0d 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -11,7 +11,7 @@ from .aligner import align_dataset from .parser import get_dataset_list from .preprocess import get_preprocess_and_print_func from .template import get_template_and_fix_tokenizer -from .utils import checksum, merge_dataset +from .utils import merge_dataset if TYPE_CHECKING: @@ -61,8 +61,6 @@ def load_single_dataset( if data_path is None: raise 
ValueError("File extension must be txt, csv, json or jsonl.") - - checksum(data_files, dataset_attr.file_sha1) else: raise NotImplementedError diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py index 01a417a9..3170fd8a 100644 --- a/src/llmtuner/data/parser.py +++ b/src/llmtuner/data/parser.py @@ -21,7 +21,6 @@ class DatasetAttr: load_from: Literal["hf_hub", "ms_hub", "script", "file"] dataset_name: str """ extra configs """ - file_sha1: Optional[str] = None subset: Optional[str] = None folder: Optional[str] = None ranking: bool = False @@ -99,7 +98,6 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: else: dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"]) - dataset_attr.set_attr("file_sha1", dataset_info[name]) dataset_attr.set_attr("subset", dataset_info[name]) dataset_attr.set_attr("folder", dataset_info[name]) dataset_attr.set_attr("ranking", dataset_info[name], default=False) diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py index dc189609..29fd4ad4 100644 --- a/src/llmtuner/data/utils.py +++ b/src/llmtuner/data/utils.py @@ -26,21 +26,6 @@ class Role(str, Enum): OBSERVATION = "observation" -def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None: - if file_sha1 is None: - logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.") - return - - if len(data_files) != 1: - logger.warning("Checksum failed: too many files.") - return - - with open(data_files[0], "rb") as f: - sha1 = hashlib.sha1(f.read()).hexdigest() - if sha1 != file_sha1: - logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0])) - - def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]: max_target_len = int(max_len * (target_len / (source_len + target_len))) max_target_len = max(max_target_len, reserved_label_len) diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py index bbd91bb7..91709d40 100644 --- a/src/llmtuner/webui/interface.py +++ b/src/llmtuner/webui/interface.py @@ -71,10 +71,12 @@ def create_web_demo() -> gr.Blocks: def run_web_ui() -> None: + gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0"))) server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") - create_ui().queue().launch(server_name=server_name) + create_ui().queue().launch(share=gradio_share, server_name=server_name) def run_web_demo() -> None: + gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0"))) server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") - create_web_demo().queue().launch(server_name=server_name) + create_web_demo().queue().launch(share=gradio_share, server_name=server_name) diff --git a/src/webui.py b/src/webui.py index 000098d1..3f8690d0 100644 --- a/src/webui.py +++ b/src/webui.py @@ -4,8 +4,9 @@ from llmtuner.webui.interface import create_ui def main(): + gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0"))) server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") - create_ui().queue().launch(server_name=server_name) + create_ui().queue().launch(share=gradio_share, server_name=server_name) if __name__ == "__main__": From 5f61ae6a5d2703a73f53ca1ece380198d798d19d Mon Sep 17 00:00:00 2001 From: hiyouga Date: Sun, 12 May 2024 01:25:16 +0800 Subject: [PATCH 08/14] fix #3658 Former-commit-id: 4777efe517c05a599f0ccdf9ccf760f3986d126e --- src/llmtuner/extras/callbacks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/src/llmtuner/extras/callbacks.py b/src/llmtuner/extras/callbacks.py index a142928a..6d24b244 100644 --- a/src/llmtuner/extras/callbacks.py +++ b/src/llmtuner/extras/callbacks.py @@ -139,13 +139,15 @@ class LogCallback(TrainerCallback): r""" Event called after an evaluation phase. """ - self._close_thread_pool() + if not self.do_train: + self._close_thread_pool() def on_predict(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): r""" Event called after a successful prediction. """ - self._close_thread_pool() + if not self.do_train: + self._close_thread_pool() def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): r""" From c627d358a9c6817c5cda6b9a39d22f08781c198f Mon Sep 17 00:00:00 2001 From: hiyouga Date: Sun, 12 May 2024 01:28:51 +0800 Subject: [PATCH 09/14] lint Former-commit-id: 482d412dd961896b362ef574a3df5b2d58003327 --- src/llmtuner/data/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py index 29fd4ad4..aaa5bdc0 100644 --- a/src/llmtuner/data/utils.py +++ b/src/llmtuner/data/utils.py @@ -1,6 +1,5 @@ -import hashlib from enum import Enum, unique -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Tuple, Union from datasets import concatenate_datasets, interleave_datasets From 93cbb0e2fc01c85063ccf80536b77891b0706889 Mon Sep 17 00:00:00 2001 From: Tendo33 Date: Mon, 13 May 2024 09:40:33 +0800 Subject: [PATCH 10/14] ruff check scripts src tests --fix Former-commit-id: b2bf7f5724f7962fef6b6d9d82c7a5bea9cbae47 --- src/llmtuner/api/app.py | 12 +++---- src/llmtuner/api/chat.py | 67 ++++++++++++++-------------------------- 2 files changed, 27 insertions(+), 52 deletions(-) diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py index 5936955b..6d06d1d0 100644 --- a/src/llmtuner/api/app.py +++ b/src/llmtuner/api/app.py @@ -56,8 +56,7 @@ def create_app(chat_model: "ChatModel") -> "FastAPI": async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]): if api_key and (auth is None or auth.credentials != api_key): - raise HTTPException( - status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.") + raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.") @app.get( "/v1/models", @@ -77,12 +76,10 @@ def create_app(chat_model: "ChatModel") -> "FastAPI": ) async def create_chat_completion(request: ChatCompletionRequest): if not chat_model.engine.can_generate: - raise HTTPException( - status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") + raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") if request.stream: - generate = create_stream_chat_completion_response( - request, chat_model) + generate = create_stream_chat_completion_response(request, chat_model) return EventSourceResponse(generate, media_type="text/event-stream") else: return await create_chat_completion_response(request, chat_model) @@ -95,8 +92,7 @@ def create_app(chat_model: "ChatModel") -> "FastAPI": ) async def create_score_evaluation(request: ScoreEvaluationRequest): if chat_model.engine.can_generate: - raise HTTPException( - status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") + raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") return await create_score_evaluation_response(request, chat_model) diff 
--git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py index 3ab473d1..76ddc88d 100644 --- a/src/llmtuner/api/chat.py +++ b/src/llmtuner/api/chat.py @@ -3,8 +3,8 @@ import uuid from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Optional, Tuple from ..data import Role as DataRole -from ..extras.packages import is_fastapi_available from ..extras.logging import get_logger +from ..extras.packages import is_fastapi_available from .common import dictify, jsonify from .protocol import ( ChatCompletionMessage, @@ -20,6 +20,7 @@ from .protocol import ( ScoreEvaluationResponse, ) + logger = get_logger(__name__) if is_fastapi_available(): @@ -41,13 +42,11 @@ ROLE_MAPPING = { def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, str]], str, str]: - params = dictify(request) logger.info(f"==== request ====\n{params}") if len(request.messages) == 0: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length") if request.messages[0].role == Role.SYSTEM: system = request.messages.pop(0).content @@ -55,37 +54,29 @@ def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, s system = "" if len(request.messages) % 2 == 0: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, - detail="Only supports u/a/u/a/u...") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...") input_messages = [] for i, message in enumerate(request.messages): if i % 2 == 0 and message.role not in [Role.USER, Role.TOOL]: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls): name = message.tool_calls[0].function.name arguments = message.tool_calls[0].function.arguments - content = json.dumps( - {"name": name, "argument": arguments}, ensure_ascii=False) - input_messages.append( - {"role": ROLE_MAPPING[Role.FUNCTION], "content": content}) + content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False) + input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content}) else: - input_messages.append( - {"role": ROLE_MAPPING[message.role], "content": message.content}) + input_messages.append({"role": ROLE_MAPPING[message.role], "content": message.content}) tool_list = request.tools if isinstance(tool_list, list) and len(tool_list): try: - tools = json.dumps([dictify(tool.function) - for tool in tool_list], ensure_ascii=False) + tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False) except Exception: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools") else: tools = "" @@ -99,10 +90,8 @@ def _create_stream_chat_completion_chunk( index: Optional[int] = 0, finish_reason: Optional["Finish"] = None, ) -> str: - choice_data = ChatCompletionStreamResponseChoice( - index=index, delta=delta, finish_reason=finish_reason) - chunk = ChatCompletionStreamResponse( - 
id=completion_id, model=model, choices=[choice_data]) + choice_data = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason) + chunk = ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice_data]) return jsonify(chunk) @@ -127,26 +116,21 @@ async def create_chat_completion_response( choices = [] for i, response in enumerate(responses): if tools: - result = chat_model.engine.template.format_tools.extract( - response.response_text) + result = chat_model.engine.template.format_tools.extract(response.response_text) else: result = response.response_text if isinstance(result, tuple): name, arguments = result function = Function(name=name, arguments=arguments) - tool_call = FunctionCall(id="call_{}".format( - uuid.uuid4().hex), function=function) - response_message = ChatCompletionMessage( - role=Role.ASSISTANT, tool_calls=[tool_call]) + tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function) + response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=[tool_call]) finish_reason = Finish.TOOL else: - response_message = ChatCompletionMessage( - role=Role.ASSISTANT, content=result) + response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result) finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH - choices.append(ChatCompletionResponseChoice( - index=i, message=response_message, finish_reason=finish_reason)) + choices.append(ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason)) prompt_length = response.prompt_length response_length += response.response_length @@ -165,16 +149,13 @@ async def create_stream_chat_completion_response( completion_id = "chatcmpl-{}".format(uuid.uuid4().hex) input_messages, system, tools = _process_request(request) if tools: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, - detail="Cannot stream function calls.") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.") if request.n > 1: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, - detail="Cannot stream multiple responses.") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream multiple responses.") yield _create_stream_chat_completion_chunk( - completion_id=completion_id, model=request.model, delta=ChatCompletionMessage( - role=Role.ASSISTANT, content="") + completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(role=Role.ASSISTANT, content="") ) async for new_token in chat_model.astream_chat( input_messages, @@ -188,8 +169,7 @@ async def create_stream_chat_completion_response( ): if len(new_token) != 0: yield _create_stream_chat_completion_chunk( - completion_id=completion_id, model=request.model, delta=ChatCompletionMessage( - content=new_token) + completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(content=new_token) ) yield _create_stream_chat_completion_chunk( @@ -202,8 +182,7 @@ async def create_score_evaluation_response( request: "ScoreEvaluationRequest", chat_model: "ChatModel" ) -> "ScoreEvaluationResponse": if len(request.messages) == 0: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request") + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request") scores = await chat_model.aget_scores(request.messages, max_length=request.max_length) return ScoreEvaluationResponse(model=request.model, scores=scores) From 
3d6a80660e6b8fe0f54bf132a08e6df57584847a Mon Sep 17 00:00:00 2001 From: hiyouga Date: Mon, 13 May 2024 16:51:20 +0800 Subject: [PATCH 11/14] support Yi 1.5 Former-commit-id: d12b8f866aa51e5e22d2b3d29704a13308de3e5b --- README.md | 4 +-- README_zh.md | 4 +-- src/llmtuner/extras/constants.py | 39 +++++++++++++++++++++++----- src/llmtuner/model/utils/longlora.py | 2 +- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 57a34dab..d260ad36 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | q_proj,v_proj | qwen | | [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | q_proj,v_proj | - | | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | q_proj,v_proj | xverse | -| [Yi](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi | +| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi | | [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | q_proj,v_proj | yuan | > [!NOTE] @@ -487,7 +487,7 @@ If you have a project that should be incorporated, please contact via email or c This repository is licensed under the [Apache-2.0 License](LICENSE). -Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2/LLaVA-1.5](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / 
[LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## Citation diff --git a/README_zh.md b/README_zh.md index 047b1645..8912d5e1 100644 --- a/README_zh.md +++ b/README_zh.md @@ -161,7 +161,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | q_proj,v_proj | qwen | | [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | q_proj,v_proj | - | | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | q_proj,v_proj | xverse | -| [Yi](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi | +| [Yi (1/1.5)](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi | | [Yuan](https://huggingface.co/IEITYuan) | 2B/51B/102B | q_proj,v_proj | yuan | > [!NOTE] @@ -487,7 +487,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1` 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。 -使用模型权重时,请遵循对应的模型协议:[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2/LLaVA-1.5](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) +使用模型权重时,请遵循对应的模型协议:[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / 
[LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) ## 引用 diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py index 50c78b3f..ff52f29a 100644 --- a/src/llmtuner/extras/constants.py +++ b/src/llmtuner/extras/constants.py @@ -320,14 +320,14 @@ register_model_group( DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-base", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-base", }, + "DeepSeek-MoE-236B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2", + DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2", + }, "DeepSeek-MoE-16B-Chat": { DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat", }, - "DeepSeek-MoE-236B": { - DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2", - DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2", - }, "DeepSeek-MoE-236B-Chat": { DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat", DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat", @@ -424,13 +424,13 @@ register_model_group( register_model_group( models={ "CodeGemma-2B": { - DownloadSource.DEFAULT: "google/codegemma-2b", + DownloadSource.DEFAULT: "google/codegemma-1.1-2b", }, "CodeGemma-7B": { DownloadSource.DEFAULT: "google/codegemma-7b", }, "CodeGemma-7B-Chat": { - DownloadSource.DEFAULT: "google/codegemma-7b-it", + DownloadSource.DEFAULT: "google/codegemma-1.1-7b-it", DownloadSource.MODELSCOPE: "AI-ModelScope/codegemma-7b-it", }, }, @@ -581,6 +581,9 @@ register_model_group( DownloadSource.DEFAULT: "shenzhi-wang/Llama3-8B-Chinese-Chat", DownloadSource.MODELSCOPE: "LLM-Research/Llama3-8B-Chinese-Chat", }, + "LLaMA3-70B-Chinese-Chat": { + DownloadSource.DEFAULT: "shenzhi-wang/Llama3-70B-Chinese-Chat", + }, }, template="llama3", ) @@ -1174,6 +1177,30 @@ register_model_group( DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-4bits", DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-4bits", }, + "Yi-1.5-6B": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B", + }, + "Yi-1.5-9B": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B", + }, + "Yi-1.5-34B": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B", + }, + "Yi-1.5-6B-Chat": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B-Chat", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B-Chat", + }, + "Yi-1.5-9B-Chat": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B-Chat", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B-Chat", + }, + "Yi-1.5-34B-Chat": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B-Chat", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B-Chat", + }, }, template="yi", ) diff --git a/src/llmtuner/model/utils/longlora.py b/src/llmtuner/model/utils/longlora.py index c91febdd..a11351f1 100644 
--- a/src/llmtuner/model/utils/longlora.py +++ b/src/llmtuner/model/utils/longlora.py @@ -302,7 +302,7 @@ def llama_sdpa_attention_forward( def _apply_llama_patch() -> None: - require_version("transformers==4.40.1", "To fix: pip install transformers==4.40.1") + require_version("transformers==4.40.2", "To fix: pip install transformers==4.40.2") LlamaAttention.forward = llama_attention_forward LlamaFlashAttention2.forward = llama_flash_attention_2_forward LlamaSdpaAttention.forward = llama_sdpa_attention_forward From 414049ba20f0916d7814973f66b125e70f508799 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Mon, 13 May 2024 18:24:35 +0800 Subject: [PATCH 12/14] fix #3702 Former-commit-id: c27afa296bc8ae2b0611b3248200e88e4c185aef --- README.md | 48 ++++++++++++++++++----------------- README_zh.md | 48 ++++++++++++++++++----------------- src/llmtuner/api/chat.py | 6 ++--- src/llmtuner/api/common.py | 6 ++--- src/llmtuner/data/template.py | 2 +- 5 files changed, 56 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index d260ad36..90fcb295 100644 --- a/README.md +++ b/README.md @@ -70,57 +70,59 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Changelog -[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See `examples/lora_single_gpu/sft_mllm.sh` for usage. +[24/05/13] We supported fine-tuning the **Yi-1.5** series models. + +[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See [examples](examples/README.md) for usage. [24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details. -[24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See `examples/extras/mod` for usage. +
Full Changelog -[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See `examples/extras/badam` for usage. +[24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See [examples](examples/README.md) for usage. + +[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See [examples](examples/README.md) for usage. [24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison). -
Full Changelog - -[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage. +[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See [examples](examples/README.md) for usage. [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv! -[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage. +[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See [examples](examples/README.md) for usage. -[24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See `examples/extras/loraplus` for usage. +[24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See [examples](examples/README.md) for usage. -[24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. See `examples/extras/galore` for usage. +[24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. See [examples](examples/README.md) for usage. -[24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `--infer_backend vllm` to enjoy **270%** inference speed. (LoRA is not yet supported, merge it first.) +[24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `infer_backend: vllm` to enjoy **270%** inference speed. -[24/02/28] We supported weight-decomposed LoRA (**[DoRA](https://arxiv.org/abs/2402.09353)**). Try `--use_dora` to activate DoRA training. +[24/02/28] We supported weight-decomposed LoRA (**[DoRA](https://arxiv.org/abs/2402.09353)**). Try `use_dora: true` to activate DoRA training. -[24/02/15] We supported **block expansion** proposed by [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro). See `examples/extras/llama_pro` for usage. +[24/02/15] We supported **block expansion** proposed by [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro). See [examples](examples/README.md) for usage. [24/02/05] Qwen1.5 (Qwen2 beta version) series models are supported in LLaMA-Factory. Check this [blog post](https://qwenlm.github.io/blog/qwen1.5/) for details. -[24/01/18] We supported **agent tuning** for most models, equipping model with tool using abilities by fine-tuning with `--dataset glaive_toolcall`. +[24/01/18] We supported **agent tuning** for most models, equipping model with tool using abilities by fine-tuning with `dataset: glaive_toolcall`. -[23/12/23] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s implementation to boost LoRA tuning for the LLaMA, Mistral and Yi models. Try `--use_unsloth` argument to activate unsloth patch. It achieves **170%** speed in our benchmark, check [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison) for details. +[23/12/23] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s implementation to boost LoRA tuning for the LLaMA, Mistral and Yi models. Try `use_unsloth: true` argument to activate unsloth patch. It achieves **170%** speed in our benchmark, check [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison) for details. [23/12/12] We supported fine-tuning the latest MoE model **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)** in our framework. 
See hardware requirement [here](#hardware-requirement). -[23/12/01] We supported downloading pre-trained models and datasets from the **[ModelScope Hub](https://modelscope.cn/models)** for Chinese mainland users. See [this tutorial](#use-modelscope-hub-optional) for usage. +[23/12/01] We supported downloading pre-trained models and datasets from the **[ModelScope Hub](https://modelscope.cn/models)** for Chinese mainland users. See [this tutorial](#download-from-modelscope-hub) for usage. -[23/10/21] We supported **[NEFTune](https://arxiv.org/abs/2310.05914)** trick for fine-tuning. Try `--neftune_noise_alpha` argument to activate NEFTune, e.g., `--neftune_noise_alpha 5`. +[23/10/21] We supported **[NEFTune](https://arxiv.org/abs/2310.05914)** trick for fine-tuning. Try `neftune_noise_alpha: 5` argument to activate NEFTune. -[23/09/27] We supported **$S^2$-Attn** proposed by [LongLoRA](https://github.com/dvlab-research/LongLoRA) for the LLaMA models. Try `--shift_attn` argument to enable shift short attention. +[23/09/27] We supported **$S^2$-Attn** proposed by [LongLoRA](https://github.com/dvlab-research/LongLoRA) for the LLaMA models. Try `shift_attn: true` argument to enable shift short attention. -[23/09/23] We integrated MMLU, C-Eval and CMMLU benchmarks in this repo. See [this example](#evaluation) to evaluate your models. +[23/09/23] We integrated MMLU, C-Eval and CMMLU benchmarks in this repo. See [examples](examples/README.md) for usage. -[23/09/10] We supported **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**. Try `--flash_attn fa2` argument to enable FlashAttention-2 if you are using RTX4090, A100 or H100 GPUs. +[23/09/10] We supported **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**. Try `flash_attn: fa2` argument to enable FlashAttention-2 if you are using RTX4090, A100 or H100 GPUs. -[23/08/12] We supported **RoPE scaling** to extend the context length of the LLaMA models. Try `--rope_scaling linear` argument in training and `--rope_scaling dynamic` argument at inference to extrapolate the position embeddings. +[23/08/12] We supported **RoPE scaling** to extend the context length of the LLaMA models. Try `rope_scaling: linear` argument in training and `rope_scaling: dynamic` argument at inference to extrapolate the position embeddings. -[23/08/11] We supported **[DPO training](https://arxiv.org/abs/2305.18290)** for instruction-tuned models. See [this example](#dpo-training) to train your models. +[23/08/11] We supported **[DPO training](https://arxiv.org/abs/2305.18290)** for instruction-tuned models. See [examples](examples/README.md) for usage. -[23/07/31] We supported **dataset streaming**. Try `--streaming` and `--max_steps 10000` arguments to load your dataset in streaming mode. +[23/07/31] We supported **dataset streaming**. Try `streaming: true` and `max_steps: 10000` arguments to load your dataset in streaming mode. [23/07/29] We released two instruction-tuned 13B models at Hugging Face. See these Hugging Face Repos ([LLaMA-2](https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat) / [Baichuan](https://huggingface.co/hiyouga/Baichuan-13B-sft)) for details. @@ -132,7 +134,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ [23/06/22] We aligned the [demo API](src/api_demo.py) with the [OpenAI's](https://platform.openai.com/docs/api-reference/chat) format where you can insert the fine-tuned model in **arbitrary ChatGPT-based applications**. 
-[23/06/03] We supported quantized training and inference (aka **[QLoRA](https://github.com/artidoro/qlora)**). Try `--quantization_bit 4/8` argument to work with quantized models. +[23/06/03] We supported quantized training and inference (aka **[QLoRA](https://github.com/artidoro/qlora)**). See [examples](examples/README.md) for usage.
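Editor's note: the README changelog above swaps the legacy `--flag value` CLI arguments for YAML-style keys (`flash_attn: fa2`, `use_dora: true`, `neftune_noise_alpha: 5`, `streaming: true`, and so on) that live in a config file passed to `llamafactory-cli train`. As a minimal sketch only, the snippet below composes such a config in Python and writes it to disk; the combination of keys is purely illustrative, and `model_name_or_path` / `output_dir` are placeholders rather than values taken from this patch series.

```python
# Illustrative sketch: assemble a config using the key-value argument style
# referenced in the changelog and dump it to YAML for `llamafactory-cli train`.
# Keys such as flash_attn, use_dora, neftune_noise_alpha and streaming come from
# the changelog entries above; model_name_or_path and output_dir are placeholders.
import yaml  # PyYAML

config = {
    "model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model id
    "do_train": True,
    "finetuning_type": "lora",
    "lora_target": "q_proj,v_proj",
    "dataset": "identity,alpaca_gpt4_en",
    "template": "llama3",
    "flash_attn": "fa2",            # FlashAttention-2 (23/09/10)
    "use_dora": True,               # DoRA (24/02/28)
    "neftune_noise_alpha": 5,       # NEFTune (23/10/21)
    "streaming": True,              # dataset streaming (23/07/31)
    "max_steps": 10000,
    "output_dir": "saves/llama3-8b/lora/sft",  # placeholder output path
}

with open("llama3_lora_sft_custom.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(config, f, sort_keys=False)

# Then, following examples/README.md in this series:
#   CUDA_VISIBLE_DEVICES=0 llamafactory-cli train llama3_lora_sft_custom.yaml
```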
diff --git a/README_zh.md b/README_zh.md index 8912d5e1..1d15515e 100644 --- a/README_zh.md +++ b/README_zh.md @@ -70,57 +70,59 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 更新日志 -[24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 `examples/lora_single_gpu/sft_mllm.sh`。 +[24/05/13] 我们支持了 Yi-1.5 系列模型的微调。 + +[24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 [examples](examples/README_zh.md)。 [24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型,详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。 -[24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 `examples/extras/mod`。 +
展开日志 -[24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 `examples/extras/badam`。 +[24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 [examples](examples/README_zh.md)。 [24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练(24GB 可训练 Llama-2-7B-56k)。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。 -
展开日志 - -[24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。 +[24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 [examples](examples/README_zh.md)。 [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看! -[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。 +[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 [examples](examples/README_zh.md)。 -[24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 `examples/extras/loraplus`。 +[24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 [examples](examples/README_zh.md)。 -[24/03/07] 我们支持了梯度低秩投影(**[GaLore](https://arxiv.org/abs/2403.03507)**)算法。详细用法请参照 `examples/extras/galore`。 +[24/03/07] 我们支持了梯度低秩投影(**[GaLore](https://arxiv.org/abs/2403.03507)**)算法。详细用法请参照 [examples](examples/README_zh.md)。 -[24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `--infer_backend vllm` 来获得 **270%** 的推理速度。(尚不支持 LoRA,请先合并权重。) +[24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `infer_backend: vllm` 来获得 **270%** 的推理速度。 -[24/02/28] 我们支持了 **[DoRA](https://arxiv.org/abs/2402.09353)** 微调。请使用 `--use_dora` 参数进行 DoRA 微调。 +[24/02/28] 我们支持了 **[DoRA](https://arxiv.org/abs/2402.09353)** 微调。请使用 `use_dora: true` 参数进行 DoRA 微调。 -[24/02/15] 我们支持了 [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro) 提出的**块扩展**方法。详细用法请参照 `examples/extras/llama_pro`。 +[24/02/15] 我们支持了 [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro) 提出的**块扩展**方法。详细用法请参照 [examples](examples/README_zh.md)。 [24/02/05] Qwen1.5(Qwen2 测试版)系列模型已在 LLaMA-Factory 中实现微调支持。详情请查阅该[博客页面](https://qwenlm.github.io/zh/blog/qwen1.5/)。 -[24/01/18] 我们针对绝大多数模型实现了 **Agent 微调**,微调时指定 `--dataset glaive_toolcall` 即可使模型获得工具调用能力。 +[24/01/18] 我们针对绝大多数模型实现了 **Agent 微调**,微调时指定 `dataset: glaive_toolcall` 即可使模型获得工具调用能力。 -[23/12/23] 我们针对 LLaMA, Mistral 和 Yi 模型支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的 LoRA 训练加速。请使用 `--use_unsloth` 参数启用 unsloth 优化。该方法可提供 **170%** 的训练速度,详情请查阅[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。 +[23/12/23] 我们针对 LLaMA, Mistral 和 Yi 模型支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的 LoRA 训练加速。请使用 `use_unsloth: true` 参数启用 unsloth 优化。该方法可提供 **170%** 的训练速度,详情请查阅[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。 [23/12/12] 我们支持了微调最新的混合专家模型 **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)**。硬件需求请查阅[此处](#硬件依赖)。 -[23/12/01] 我们支持了从 **[魔搭社区](https://modelscope.cn/models)** 下载预训练模型和数据集。详细用法请参照 [此教程](#使用魔搭社区可跳过)。 +[23/12/01] 我们支持了从 **[魔搭社区](https://modelscope.cn/models)** 下载预训练模型和数据集。详细用法请参照 [此教程](#从魔搭社区下载)。 -[23/10/21] 我们支持了 **[NEFTune](https://arxiv.org/abs/2310.05914)** 训练技巧。请使用 `--neftune_noise_alpha` 参数启用 NEFTune,例如 `--neftune_noise_alpha 5`。 +[23/10/21] 我们支持了 **[NEFTune](https://arxiv.org/abs/2310.05914)** 训练技巧。请使用 `neftune_noise_alpha: 5` 参数启用 NEFTune。 -[23/09/27] 我们针对 LLaMA 模型支持了 [LongLoRA](https://github.com/dvlab-research/LongLoRA) 提出的 **$S^2$-Attn**。请使用 `--shift_attn` 参数以启用该功能。 +[23/09/27] 我们针对 LLaMA 模型支持了 [LongLoRA](https://github.com/dvlab-research/LongLoRA) 提出的 **$S^2$-Attn**。请使用 `shift_attn: true` 参数以启用该功能。 -[23/09/23] 我们在项目中集成了 MMLU、C-Eval 和 CMMLU 评估集。使用方法请参阅[此示例](#模型评估)。 +[23/09/23] 我们在项目中集成了 MMLU、C-Eval 和 CMMLU 评估集。详细用法请参照 [examples](examples/README_zh.md)。 -[23/09/10] 我们支持了 
**[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**。如果您使用的是 RTX4090、A100 或 H100 GPU,请使用 `--flash_attn fa2` 参数以启用 FlashAttention-2。 +[23/09/10] 我们支持了 **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**。如果您使用的是 RTX4090、A100 或 H100 GPU,请使用 `flash_attn: fa2` 参数以启用 FlashAttention-2。 -[23/08/12] 我们支持了 **RoPE 插值**来扩展 LLaMA 模型的上下文长度。请使用 `--rope_scaling linear` 参数训练模型或使用 `--rope_scaling dynamic` 参数评估模型。 +[23/08/12] 我们支持了 **RoPE 插值**来扩展 LLaMA 模型的上下文长度。请使用 `rope_scaling: linear` 参数训练模型或使用 `rope_scaling: dynamic` 参数评估模型。 -[23/08/11] 我们支持了指令模型的 **[DPO 训练](https://arxiv.org/abs/2305.18290)**。使用方法请参阅[此示例](#dpo-训练)。 +[23/08/11] 我们支持了指令模型的 **[DPO 训练](https://arxiv.org/abs/2305.18290)**。详细用法请参照 [examples](examples/README_zh.md)。 -[23/07/31] 我们支持了**数据流式加载**。请使用 `--streaming` 和 `--max_steps 10000` 参数来流式加载数据集。 +[23/07/31] 我们支持了**数据流式加载**。请使用 `streaming: true` 和 `max_steps: 10000` 参数来流式加载数据集。 [23/07/29] 我们在 Hugging Face 发布了两个 13B 指令微调模型。详细内容请查阅我们的 Hugging Face 项目([LLaMA-2](https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat) / [Baichuan](https://huggingface.co/hiyouga/Baichuan-13B-sft))。 @@ -132,7 +134,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd [23/06/22] 我们对齐了[示例 API](src/api_demo.py) 与 [OpenAI API](https://platform.openai.com/docs/api-reference/chat) 的格式,您可以将微调模型接入**任意基于 ChatGPT 的应用**中。 -[23/06/03] 我们实现了 4 比特的 LoRA 训练(也称 **[QLoRA](https://github.com/artidoro/qlora)**)。请使用 `--quantization_bit 4` 参数进行 4 比特量化微调。 +[23/06/03] 我们实现了 4 比特的 LoRA 训练(也称 **[QLoRA](https://github.com/artidoro/qlora)**)。详细用法请参照 [examples](examples/README_zh.md)。
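Editor's note: both changelogs state that the demo API (src/api_demo.py) follows the OpenAI chat-completions format, so a fine-tuned model can be plugged into ChatGPT-style applications. Below is a minimal client sketch; it assumes the API server is already running locally on port 8000 with the `/v1` path prefix, and the model name and API key are placeholders, not values taken from this patch series. With the request logging added later in this series, the server side should also print the incoming payload.

```python
# Minimal sketch of calling the OpenAI-compatible demo API with the official
# `openai` Python client (v1+). Assumptions: the server is already running on
# localhost:8000 and exposes /v1/chat/completions; "test" and "dummy-key" are
# placeholders for whatever model name / API key the deployment uses.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy-key")

response = client.chat.completions.create(
    model="test",
    messages=[{"role": "user", "content": "Briefly introduce LLaMA Factory."}],
    temperature=0.7,
)
print(response.choices[0].message.content)
```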
diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py index 76ddc88d..b7a08f0b 100644 --- a/src/llmtuner/api/chat.py +++ b/src/llmtuner/api/chat.py @@ -21,8 +21,6 @@ from .protocol import ( ) -logger = get_logger(__name__) - if is_fastapi_available(): from fastapi import HTTPException, status @@ -32,6 +30,7 @@ if TYPE_CHECKING: from .protocol import ChatCompletionRequest, ScoreEvaluationRequest +logger = get_logger(__name__) ROLE_MAPPING = { Role.USER: DataRole.USER.value, Role.ASSISTANT: DataRole.ASSISTANT.value, @@ -42,8 +41,7 @@ ROLE_MAPPING = { def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, str]], str, str]: - params = dictify(request) - logger.info(f"==== request ====\n{params}") + logger.info("==== request ====\n{}".format(json.dumps(dictify(request), indent=2, ensure_ascii=False))) if len(request.messages) == 0: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length") diff --git a/src/llmtuner/api/common.py b/src/llmtuner/api/common.py index 3e95d211..5ad9a071 100644 --- a/src/llmtuner/api/common.py +++ b/src/llmtuner/api/common.py @@ -6,11 +6,11 @@ if TYPE_CHECKING: from pydantic import BaseModel -def dictify(data: "BaseModel", **kwargs) -> Dict[str, Any]: +def dictify(data: "BaseModel") -> Dict[str, Any]: try: # pydantic v2 - return data.model_dump(**kwargs) + return data.model_dump(exclude_unset=True) except AttributeError: # pydantic v1 - return data.dict(**kwargs) + return data.dict(exclude_unset=True) def jsonify(data: "BaseModel") -> str: diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index ada6cfcd..f716102f 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -308,7 +308,7 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}" jinja_template += ( - "{% if messages[0]['role'] == 'system' %}" "{% set system_message = messages[0]['content'] %}" "{% endif %}" + "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}" ) system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message") From 68540734fb4af9e11963bb6a6fe8cade92cde020 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Mon, 13 May 2024 20:09:09 +0800 Subject: [PATCH 13/14] fix #3724 Former-commit-id: 93a02454746bd39e8f284e94a18bead80f545dae --- src/llmtuner/model/utils/longlora.py | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/llmtuner/model/utils/longlora.py b/src/llmtuner/model/utils/longlora.py index a11351f1..c8dc52f5 100644 --- a/src/llmtuner/model/utils/longlora.py +++ b/src/llmtuner/model/utils/longlora.py @@ -41,9 +41,9 @@ def llama_attention_forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + query_states: "torch.Tensor" = self.q_proj(hidden_states) + key_states: "torch.Tensor" = self.k_proj(hidden_states) + value_states: "torch.Tensor" = self.v_proj(hidden_states) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -87,7 +87,7 @@ def llama_attention_forward( # upcast attention to 
fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) # (bsz, :, seq_len, :) or (bsz*n_group, :, groupsz, :) + attn_output = torch.matmul(attn_weights, value_states) # (bsz, :, seq_len, :) or (bsz * n_group, :, groupsz, :) attn_output = attn_output.transpose(1, 2).contiguous() if getattr(self.config, "group_size_ratio", None) and self.training: # shift back @@ -125,9 +125,9 @@ def llama_flash_attention_2_forward( bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + query_states: "torch.Tensor" = self.q_proj(hidden_states) + key_states: "torch.Tensor" = self.k_proj(hidden_states) + value_states: "torch.Tensor" = self.v_proj(hidden_states) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -233,9 +233,9 @@ def llama_sdpa_attention_forward( bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + query_states: "torch.Tensor" = self.q_proj(hidden_states) + key_states: "torch.Tensor" = self.k_proj(hidden_states) + value_states: "torch.Tensor" = self.v_proj(hidden_states) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -270,11 +270,12 @@ def llama_sdpa_attention_forward( causal_mask = attention_mask if attention_mask is not None: - causal_mask = causal_mask[:, :, :, :groupsz] + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, From 3318b6e188fea0c9484a43f1747338f1fd032156 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Mon, 13 May 2024 20:39:36 +0800 Subject: [PATCH 14/14] update examples Former-commit-id: dae83f419919305cb23bb2b9da1277a1616179c5 --- examples/README.md | 12 ++++++------ examples/README_zh.md | 12 ++++++------ examples/extras/badam/llama3_lora_sft.yaml | 2 +- examples/extras/fsdp_qlora/llama3_lora_sft.yaml | 5 ++++- examples/extras/galore/llama3_full_sft.yaml | 2 +- examples/extras/llama_pro/llama3_freeze_sft.yaml | 4 ++-- examples/extras/loraplus/llama3_lora_sft.yaml | 4 ++-- examples/extras/mod/llama3_full_sft.yaml | 2 +- examples/full_multi_gpu/llama3_full_sft.yaml | 2 +- examples/lora_multi_gpu/llama3_lora_sft.yaml | 2 +- examples/lora_multi_gpu/llama3_lora_sft_ds.yaml | 2 +- examples/lora_single_gpu/llama3_lora_dpo.yaml | 2 +- examples/lora_single_gpu/llama3_lora_orpo.yaml | 2 +- examples/lora_single_gpu/llama3_lora_pretrain.yaml | 2 +- examples/lora_single_gpu/llama3_lora_reward.yaml | 2 +- examples/lora_single_gpu/llama3_lora_sft.yaml | 2 +- examples/lora_single_gpu/llama3_preprocess.yaml | 1 - examples/lora_single_gpu/llava1_5_lora_sft.yaml | 2 +- 
examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml | 2 +- examples/qlora_single_gpu/llama3_lora_sft_awq.yaml | 2 +- .../llama3_lora_sft_bitsandbytes.yaml | 5 +---- examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml | 2 +- 22 files changed, 36 insertions(+), 37 deletions(-) diff --git a/examples/README.md b/examples/README.md index ce19f9d1..0838314a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -28,6 +28,12 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml ``` +#### Multimodal Supervised Fine-Tuning + +```bash +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml +``` + #### Reward Modeling ```bash @@ -52,12 +58,6 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml ``` -#### Multimodal Supervised Fine-Tuning - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml -``` - #### Preprocess Dataset It is useful for large dataset, use `tokenized_path` in config to load the preprocessed dataset. diff --git a/examples/README_zh.md b/examples/README_zh.md index 91bdcda9..7fe43954 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -28,6 +28,12 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml ``` +#### 多模态指令监督微调 + +```bash +CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml +``` + #### 奖励模型训练 ```bash @@ -52,12 +58,6 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml ``` -#### 多模态指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml -``` - #### 预处理数据集 对于大数据集有帮助,在配置中使用 `tokenized_path` 以加载预处理后的数据集。 diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_lora_sft.yaml index 9f1f1976..5e8994bc 100644 --- a/examples/extras/badam/llama3_lora_sft.yaml +++ b/examples/extras/badam/llama3_lora_sft.yaml @@ -15,7 +15,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -36,6 +35,7 @@ warmup_steps: 0.1 pure_bf16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml index 64bf1356..1fd8f16a 100644 --- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml +++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml @@ -8,12 +8,14 @@ do_train: true finetuning_type: lora lora_target: q_proj,v_proj +# ddp +ddp_timeout: 180000000 + # dataset dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -34,6 +36,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/extras/galore/llama3_full_sft.yaml b/examples/extras/galore/llama3_full_sft.yaml index 5aec8af9..3bc074c5 100644 --- a/examples/extras/galore/llama3_full_sft.yaml +++ 
b/examples/extras/galore/llama3_full_sft.yaml @@ -16,7 +16,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -37,6 +36,7 @@ warmup_steps: 0.1 pure_bf16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml index a54be8b8..4d92cdad 100644 --- a/examples/extras/llama_pro/llama3_freeze_sft.yaml +++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml @@ -14,7 +14,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -32,9 +31,10 @@ learning_rate: 0.0001 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_steps: 0.1 -pure_bf16: true +fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml index dfb7058b..0956aa71 100644 --- a/examples/extras/loraplus/llama3_lora_sft.yaml +++ b/examples/extras/loraplus/llama3_lora_sft.yaml @@ -13,7 +13,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -31,9 +30,10 @@ learning_rate: 0.0001 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_steps: 0.1 -pure_bf16: true +fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml index 5f80521d..5dc8c061 100644 --- a/examples/extras/mod/llama3_full_sft.yaml +++ b/examples/extras/mod/llama3_full_sft.yaml @@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -34,6 +33,7 @@ warmup_steps: 0.1 pure_bf16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/full_multi_gpu/llama3_full_sft.yaml index ef35e441..2d8031f1 100644 --- a/examples/full_multi_gpu/llama3_full_sft.yaml +++ b/examples/full_multi_gpu/llama3_full_sft.yaml @@ -15,7 +15,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -36,6 +35,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml index d9690679..6cc06f8a 100644 --- a/examples/lora_multi_gpu/llama3_lora_sft.yaml +++ b/examples/lora_multi_gpu/llama3_lora_sft.yaml @@ -15,7 +15,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -36,6 +35,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml index 26955167..5a7348c1 100644 --- a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml +++ 
b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml @@ -16,7 +16,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -37,6 +36,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/lora_single_gpu/llama3_lora_dpo.yaml index f71f752d..16c6d0c9 100644 --- a/examples/lora_single_gpu/llama3_lora_dpo.yaml +++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml @@ -13,7 +13,6 @@ dataset: orca_rlhf template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -34,6 +33,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/lora_single_gpu/llama3_lora_orpo.yaml b/examples/lora_single_gpu/llama3_lora_orpo.yaml index 5d78d260..bc42bdd4 100644 --- a/examples/lora_single_gpu/llama3_lora_orpo.yaml +++ b/examples/lora_single_gpu/llama3_lora_orpo.yaml @@ -12,7 +12,6 @@ dataset: orca_rlhf template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -33,6 +32,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/lora_single_gpu/llama3_lora_pretrain.yaml b/examples/lora_single_gpu/llama3_lora_pretrain.yaml index 64245b71..48425b15 100644 --- a/examples/lora_single_gpu/llama3_lora_pretrain.yaml +++ b/examples/lora_single_gpu/llama3_lora_pretrain.yaml @@ -11,7 +11,6 @@ lora_target: q_proj,v_proj dataset: c4_demo cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -32,6 +31,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/lora_single_gpu/llama3_lora_reward.yaml index f190f4ac..ecaf8d72 100644 --- a/examples/lora_single_gpu/llama3_lora_reward.yaml +++ b/examples/lora_single_gpu/llama3_lora_reward.yaml @@ -12,7 +12,6 @@ dataset: orca_rlhf template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -33,6 +32,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/lora_single_gpu/llama3_lora_sft.yaml index f99df305..0e5e30b3 100644 --- a/examples/lora_single_gpu/llama3_lora_sft.yaml +++ b/examples/lora_single_gpu/llama3_lora_sft.yaml @@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -33,6 +32,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/lora_single_gpu/llama3_preprocess.yaml index 0b3dc599..4c45c1cd 100644 --- a/examples/lora_single_gpu/llama3_preprocess.yaml +++ b/examples/lora_single_gpu/llama3_preprocess.yaml @@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 
tokenized_path: saves/llama3-8b/dataset/sft diff --git a/examples/lora_single_gpu/llava1_5_lora_sft.yaml b/examples/lora_single_gpu/llava1_5_lora_sft.yaml index 96c2701a..84d2a672 100644 --- a/examples/lora_single_gpu/llava1_5_lora_sft.yaml +++ b/examples/lora_single_gpu/llava1_5_lora_sft.yaml @@ -13,7 +13,6 @@ dataset: mllm_demo template: vicuna cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -34,6 +33,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml index 11f1d277..a1d5f95d 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml +++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml @@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -33,6 +32,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml index 4b070d45..8941d6b2 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml +++ b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml @@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -33,6 +32,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml index 7bc31bde..885fcd83 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml +++ b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml @@ -8,15 +8,11 @@ do_train: true finetuning_type: lora lora_target: q_proj,v_proj -# ddp -ddp_timeout: 180000000 - # dataset dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -37,6 +33,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500 diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml index 2f8cfe45..87a404a0 100644 --- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml +++ b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml @@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en template: llama3 cutoff_len: 1024 max_samples: 1000 -val_size: 0.1 overwrite_cache: true preprocessing_num_workers: 16 @@ -33,6 +32,7 @@ warmup_steps: 0.1 fp16: true # eval +val_size: 0.1 per_device_eval_batch_size: 1 evaluation_strategy: steps eval_steps: 500
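Editor's note on the `llama_sdpa_attention_forward` fix in patch 13 above: after the LongLoRA group-size shift, the attention mask handed to `scaled_dot_product_attention` must be sliced to the actual key length rather than a precomputed `groupsz`, and the inputs are only made contiguous on CUDA when a mask is present. The sketch below illustrates just the mask-slicing part with arbitrary shapes; it is a standalone toy assuming PyTorch 2.x, not the patched LLaMA attention itself.

```python
# Toy illustration (assumes PyTorch >= 2.0) of the mask-slicing pattern from the
# patch-13 fix: the 4D additive mask passed to scaled_dot_product_attention must
# match the key sequence length, so it is sliced with key_states.shape[-2]
# instead of a fixed group size. Shapes below are arbitrary.
import torch
import torch.nn.functional as F

bsz, n_heads, head_dim = 2, 4, 8
q_len, kv_len = 16, 16

query = torch.randn(bsz, n_heads, q_len, head_dim)
key = torch.randn(bsz, n_heads, kv_len, head_dim)
value = torch.randn(bsz, n_heads, kv_len, head_dim)

# A mask built for a longer context than is actually attended over.
full_mask = torch.zeros(bsz, 1, q_len, 4096)

# Slice the mask to the real key length, mirroring
# `causal_mask[:, :, :, : key_states.shape[-2]]` in the patch.
attn_mask = full_mask[:, :, :, : key.shape[-2]]

output = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask)
print(output.shape)  # torch.Size([2, 4, 16, 8])
```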