From 59e6ebf0396890b522c9460a4ec4a1da5430b395 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 28 Mar 2024 18:16:27 +0800
Subject: [PATCH 001/341] update trainers

Former-commit-id: d0dd6eefed0b86895ed00a7cafb331e5193db645
---
 README.md                                     |  4 +-
 README_zh.md                                  |  4 +-
 examples/extras/galore/adamw.sh               | 31 ----------
 examples/extras/galore/adamw_8bit_bf16.sh     | 32 -----------
 .../extras/galore/galore_adamw_8bit_bf16.sh   | 36 ------------
 .../extras/galore/{galore_adamw.sh => sft.sh} |  2 +-
 src/llmtuner/model/utils.py                   |  2 +
 src/llmtuner/train/dpo/trainer.py             | 14 +++--
 src/llmtuner/train/ppo/workflow.py            |  5 +-
 src/llmtuner/train/pt/trainer.py              | 18 ++++--
 src/llmtuner/train/rm/trainer.py              | 16 ++++--
 src/llmtuner/train/sft/trainer.py             | 14 +++--
 src/llmtuner/train/utils.py                   | 56 +++++++++++++------
 13 files changed, 89 insertions(+), 145 deletions(-)
 delete mode 100644 examples/extras/galore/adamw.sh
 delete mode 100644 examples/extras/galore/adamw_8bit_bf16.sh
 delete mode 100644 examples/extras/galore/galore_adamw_8bit_bf16.sh
 rename examples/extras/galore/{galore_adamw.sh => sft.sh} (98%)

diff --git a/README.md b/README.md
index 605e6ad9..b9a7f7dc 100644
--- a/README.md
+++ b/README.md
@@ -72,9 +72,9 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 [24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/fsdp_qlora` for usage.
 
-[24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. Try `loraplus_lr_ratio=16.0` to enable LoRA+ algorithm.
+[24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See `examples/extras/loraplus` for usage.
 
-[24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. Try `--use_galore` to use the memory-efficient optimizer.
+[24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. See `examples/extras/galore` for usage.
 
 [24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `--infer_backend vllm` to enjoy **270%** inference speed. (LoRA is not yet supported, merge it first.)
 
diff --git a/README_zh.md b/README_zh.md
index 242c1ff7..beb88e7d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -72,9 +72,9 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 [24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/fsdp_qlora`。
 
-[24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。请使用 `loraplus_lr_ratio=16.0` 参数开启 LoRA+ 方法。
+[24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 `examples/extras/loraplus`。
 
-[24/03/07] 我们支持了梯度低秩投影（**[GaLore](https://arxiv.org/abs/2403.03507)**）算法。请使用 `--use_galore` 参数切换显存高效的优化器。
+[24/03/07] 我们支持了梯度低秩投影（**[GaLore](https://arxiv.org/abs/2403.03507)**）算法。详细用法请参照 `examples/extras/galore`。
 
 [24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `--infer_backend vllm` 来获得 **270%** 的推理速度。（尚不支持 LoRA，请先合并权重。）
 
diff --git a/examples/extras/galore/adamw.sh b/examples/extras/galore/adamw.sh
deleted file mode 100644
index d4f5afb4..00000000
--- a/examples/extras/galore/adamw.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/extras/galore/adamw_8bit_bf16.sh b/examples/extras/galore/adamw_8bit_bf16.sh
deleted file mode 100644
index ecb4fa96..00000000
--- a/examples/extras/galore/adamw_8bit_bf16.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --optim adamw_8bit \
-    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/galore/galore_adamw_8bit_bf16.sh b/examples/extras/galore/galore_adamw_8bit_bf16.sh
deleted file mode 100644
index cedc8bee..00000000
--- a/examples/extras/galore/galore_adamw_8bit_bf16.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --optim adamw_8bit \
-    --use_galore \
-    --galore_layerwise \
-    --galore_target mlp,self_attn \
-    --galore_rank 128 \
-    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/galore/galore_adamw.sh b/examples/extras/galore/sft.sh
similarity index 98%
rename from examples/extras/galore/galore_adamw.sh
rename to examples/extras/galore/sft.sh
index 063bb6df..1ffeb5ca 100644
--- a/examples/extras/galore/galore_adamw.sh
+++ b/examples/extras/galore/sft.sh
@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
     --max_samples 3000 \
     --val_size 0.1 \
     --plot_loss \
-    --fp16
+    --pure_bf16
diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py
index 5a437491..1b96a9dd 100644
--- a/src/llmtuner/model/utils.py
+++ b/src/llmtuner/model/utils.py
@@ -47,6 +47,8 @@ def find_all_linear_modules(model: "PreTrainedModel") -> List[str]:
     output_layer_names = ["lm_head"]
     if model.config.model_type == "chatglm":
         output_layer_names.append("output_layer")
+    elif model.config.model_type == "internlm2":
+        output_layer_names.append("output")
 
     module_names = set()
     for name, module in model.named_modules():
diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index ed0fe5f1..39e84679 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -8,7 +8,7 @@ from trl import DPOTrainer
 from trl.trainer.utils import disable_dropout_in_model
 
 from ...extras.constants import IGNORE_INDEX
-from ..utils import create_custom_optimzer
+from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
@@ -63,12 +63,16 @@ class CustomDPOTrainer(DPOTrainer):
             else:
                 self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
 
-    def create_optimizer_and_scheduler(self, num_training_steps: int) -> None:
+    def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
-            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args, num_training_steps)
+            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
 
-        self.create_optimizer()
-        self.create_scheduler(num_training_steps=num_training_steps, optimizer=self.optimizer)
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)  # only acts on a DummyOptimizer
+        return super().create_scheduler(num_training_steps, optimizer)  # the parent's scheduler is what is returned
 
     def sft_loss(self, chosen_logits: torch.FloatTensor, chosen_labels: torch.LongTensor) -> torch.Tensor:
         r"""
diff --git a/src/llmtuner/train/ppo/workflow.py b/src/llmtuner/train/ppo/workflow.py
index dff135d2..658b244d 100644
--- a/src/llmtuner/train/ppo/workflow.py
+++ b/src/llmtuner/train/ppo/workflow.py
@@ -13,7 +13,7 @@ from ...extras.callbacks import FixValueHeadModelCallback
 from ...extras.misc import fix_valuehead_checkpoint
 from ...extras.ploting import plot_loss
 from ...model import load_model, load_tokenizer
-from ..utils import create_custom_optimzer, create_ref_model, create_reward_model
+from ..utils import create_custom_optimzer, create_custom_scheduler, create_ref_model, create_reward_model
 from .trainer import CustomPPOTrainer
 
 
@@ -70,7 +70,8 @@ def run_ppo(
         total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size
         num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size)
 
-    optimizer = create_custom_optimzer(model, training_args, finetuning_args, num_training_steps)
+    optimizer = create_custom_optimzer(model, training_args, finetuning_args)
+    create_custom_scheduler(training_args, num_training_steps, optimizer)
     if optimizer is None:
         optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=training_args.learning_rate)
 
diff --git a/src/llmtuner/train/pt/trainer.py b/src/llmtuner/train/pt/trainer.py
index 16e3f5f0..af2848fb 100644
--- a/src/llmtuner/train/pt/trainer.py
+++ b/src/llmtuner/train/pt/trainer.py
@@ -1,12 +1,14 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 from transformers import Trainer
 
 from ...extras.logging import get_logger
-from ..utils import create_custom_optimzer
+from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
+    import torch
+
     from ...hparams import FinetuningArguments
 
 
@@ -22,9 +24,13 @@ class CustomTrainer(Trainer):
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
 
-    def create_optimizer_and_scheduler(self, num_training_steps: int) -> None:
+    def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
-            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args, num_training_steps)
+            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
 
-        self.create_optimizer()
-        self.create_scheduler(num_training_steps=num_training_steps, optimizer=self.optimizer)
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)  # only acts on a DummyOptimizer
+        return super().create_scheduler(num_training_steps, optimizer)  # the parent's scheduler is what is returned
diff --git a/src/llmtuner/train/rm/trainer.py b/src/llmtuner/train/rm/trainer.py
index 4f5d7190..8d0f2763 100644
--- a/src/llmtuner/train/rm/trainer.py
+++ b/src/llmtuner/train/rm/trainer.py
@@ -1,12 +1,12 @@
 import json
 import os
-from typing import TYPE_CHECKING, Dict, List, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 from transformers import Trainer
 
 from ...extras.logging import get_logger
-from ..utils import create_custom_optimzer
+from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
@@ -29,12 +29,16 @@ class PairwiseTrainer(Trainer):
         self.finetuning_args = finetuning_args
         self.can_return_loss = True  # override property to return eval_loss
 
-    def create_optimizer_and_scheduler(self, num_training_steps: int) -> None:
+    def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
-            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args, num_training_steps)
+            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
 
-        self.create_optimizer()
-        self.create_scheduler(num_training_steps=num_training_steps, optimizer=self.optimizer)
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)  # only acts on a DummyOptimizer
+        return super().create_scheduler(num_training_steps, optimizer)  # the parent's scheduler is what is returned
 
     def compute_loss(
         self, model: "PreTrainedModel", inputs: Dict[str, torch.Tensor], return_outputs: bool = False
diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py
index 4a49bb27..8d2f9fa0 100644
--- a/src/llmtuner/train/sft/trainer.py
+++ b/src/llmtuner/train/sft/trainer.py
@@ -8,7 +8,7 @@ from transformers import Seq2SeqTrainer
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
-from ..utils import create_custom_optimzer
+from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
@@ -29,12 +29,16 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
 
-    def create_optimizer_and_scheduler(self, num_training_steps: int) -> None:
+    def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
-            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args, num_training_steps)
+            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
 
-        self.create_optimizer()
-        self.create_scheduler(num_training_steps=num_training_steps, optimizer=self.optimizer)
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)  # only acts on a DummyOptimizer
+        return super().create_scheduler(num_training_steps, optimizer)  # the parent's scheduler is what is returned
 
     def prediction_step(
         self,
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index 49c42d4e..73854a5e 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -5,6 +5,7 @@ from transformers import Trainer
 from transformers.optimization import get_scheduler
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.trainer_pt_utils import get_parameter_names
+from transformers.utils.versions import require_version
 
 from ..extras.logging import get_logger
 from ..extras.packages import is_galore_available
@@ -28,7 +29,13 @@ logger = get_logger(__name__)
 
 
 class DummyOptimizer(torch.optim.Optimizer):
-    def __init__(self, lr: float = 1e-3, optimizer_dict: Optional[dict] = None, *args, **kwargs) -> None:
+    r"""
+    A dummy optimizer used for the GaLore algorithm.
+    """
+
+    def __init__(
+        self, lr: float = 1e-3, optimizer_dict: Optional[Dict["torch.nn.Parameter", "torch.optim.Optimizer"]] = None
+    ) -> None:
         dummy_tensor = torch.randn(1, 1)
         self.optimizer_dict = optimizer_dict
         super().__init__([dummy_tensor], {"lr": lr})
@@ -155,8 +162,9 @@ def _create_galore_optimizer(
     model: "PreTrainedModel",
     training_args: "Seq2SeqTrainingArguments",
     finetuning_args: "FinetuningArguments",
-    max_steps: int,
 ) -> "torch.optim.Optimizer":
+    require_version("galore_torch", "To fix: pip install galore_torch")
+
     if len(finetuning_args.galore_target) == 1 and finetuning_args.galore_target[0] == "all":
         galore_targets = find_all_linear_modules(model)
     else:
@@ -211,29 +219,19 @@ def _create_galore_optimizer(
         for param in decay_params:
             param_groups = [dict(params=[param], weight_decay=training_args.weight_decay)]
             optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
-        for param in galore_params:
+        for param in galore_params:  # galore params have weight decay
             param_groups = [dict(params=[param], weight_decay=training_args.weight_decay, **galore_kwargs)]
             optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
 
-        scheduler_dict: Dict["torch.Tensor", "torch.optim.lr_scheduler.LRScheduler"] = {}
-        for param in trainable_params:
-            scheduler_dict[param] = get_scheduler(
-                training_args.lr_scheduler_type,
-                optimizer=optimizer_dict[param],
-                num_warmup_steps=training_args.get_warmup_steps(max_steps) * 2,
-                num_training_steps=max_steps * 2,
-            )
-
-        def optimizer_hook(param: "torch.Tensor"):
+        def optimizer_hook(param: "torch.nn.Parameter"):
             if param.grad is not None:
                 optimizer_dict[param].step()
                 optimizer_dict[param].zero_grad()
-                scheduler_dict[param].step()
 
         for param in trainable_params:
             param.register_post_accumulate_grad_hook(optimizer_hook)
 
-        optimizer = DummyOptimizer(lr=training_args.learning_rate)  # display scheduler result
+        optimizer = DummyOptimizer(lr=training_args.learning_rate, optimizer_dict=optimizer_dict)
     else:
         param_groups = [
             dict(params=nodecay_params),
@@ -292,10 +290,34 @@ def create_custom_optimzer(
     model: "PreTrainedModel",
     training_args: "Seq2SeqTrainingArguments",
     finetuning_args: "FinetuningArguments",
-    max_steps: int,
 ) -> Optional["torch.optim.Optimizer"]:
     if finetuning_args.use_galore:
-        return _create_galore_optimizer(model, training_args, finetuning_args, max_steps)
+        return _create_galore_optimizer(model, training_args, finetuning_args)
 
     if finetuning_args.loraplus_lr_ratio is not None:
         return _create_loraplus_optimizer(model, training_args, finetuning_args)
+
+
+def create_custom_scheduler(
+    training_args: "Seq2SeqTrainingArguments",
+    num_training_steps: int,
+    optimizer: Optional["torch.optim.Optimizer"] = None,
+) -> None:
+    r"""Creates and hooks one LR scheduler per optimizer held by a DummyOptimizer; otherwise does nothing."""
+    if not isinstance(optimizer, DummyOptimizer):  # also false when optimizer is None
+        return
+    schedulers: Dict["torch.nn.Parameter", "torch.optim.lr_scheduler.LRScheduler"] = {}
+    for weight, inner_optimizer in optimizer.optimizer_dict.items():
+        schedulers[weight] = get_scheduler(
+            training_args.lr_scheduler_type,
+            optimizer=inner_optimizer,
+            num_warmup_steps=training_args.get_warmup_steps(num_training_steps) * 2,
+            num_training_steps=num_training_steps * 2,
+        )
+
+    def scheduler_hook(param: "torch.nn.Parameter"):
+        if param.grad is not None:  # step only for parameters that received a gradient
+            schedulers[param].step()
+
+    for weight in schedulers:
+        weight.register_post_accumulate_grad_hook(scheduler_hook)

From 14b75a0b93c485d6c966f2422029ac0043d498a8 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 28 Mar 2024 18:31:17 +0800
Subject: [PATCH 002/341] fix #3010

Former-commit-id: a5e823ae75556eaa3b52ce7a887a6e7838a1eb5f
---
 src/llmtuner/extras/callbacks.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/llmtuner/extras/callbacks.py b/src/llmtuner/extras/callbacks.py
index 086dea6d..985b0292 100644
--- a/src/llmtuner/extras/callbacks.py
+++ b/src/llmtuner/extras/callbacks.py
@@ -58,9 +58,17 @@ class LogCallback(TrainerCallback):
             self.in_training = True
             self.start_time = time.time()
             self.max_steps = state.max_steps
-            if os.path.exists(os.path.join(args.output_dir, LOG_FILE_NAME)) and args.overwrite_output_dir:
-                logger.warning("Previous log file in this folder will be deleted.")
-                os.remove(os.path.join(args.output_dir, LOG_FILE_NAME))
+
+        if args.save_on_each_node:
+            if not state.is_local_process_zero:
+                return
+        else:
+            if not state.is_world_process_zero:
+                return
+
+        if os.path.exists(os.path.join(args.output_dir, LOG_FILE_NAME)) and args.overwrite_output_dir:
+            logger.warning("Previous log file in this folder will be deleted.")
+            os.remove(os.path.join(args.output_dir, LOG_FILE_NAME))
 
     def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
@@ -112,8 +120,12 @@ class LogCallback(TrainerCallback):
         r"""
         Event called after logging the last logs.
         """
-        if not state.is_local_process_zero:
-            return
+        if args.save_on_each_node:
+            if not state.is_local_process_zero:
+                return
+        else:
+            if not state.is_world_process_zero:
+                return
 
         logs = dict(
             current_steps=self.cur_steps,

From f0e564beaa87a412c9a103c6c3bec125493f702e Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 28 Mar 2024 18:35:11 +0800
Subject: [PATCH 003/341] update readme

Former-commit-id: 6b634b5c2dbad827e8cc9850b8d7697c2056532a
---
 README.md    | 6 +++---
 README_zh.md | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index b9a7f7dc..9ad1e291 100644
--- a/README.md
+++ b/README.md
@@ -451,7 +451,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 ```
 
 > [!TIP]
-> Use `--adapter_name_or_path path_to_sft_checkpoint,path_to_ppo_checkpoint` to infer the fine-tuned model.
+> Use `--adapter_name_or_path path_to_sft_checkpoint,path_to_ppo_checkpoint` to infer the fine-tuned model if `--create_new_adapter` was enabled.
 
 > [!WARNING]
 > Use `--per_device_train_batch_size=1` for LLaMA-2 models in fp16 PPO training.
@@ -482,7 +482,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 ```
 
 > [!TIP]
-> Use `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` to infer the fine-tuned model.
+> Use `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` to infer the fine-tuned model if `--create_new_adapter` was enabled.
 
 ### Distributed Training
 
@@ -570,7 +570,7 @@ deepspeed --num_gpus 8 src/train_bash.py \
 ### Merge LoRA weights and export model
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 python src/export_model.py \
+CUDA_VISIBLE_DEVICES= python src/export_model.py \
     --model_name_or_path path_to_llama_model \
     --adapter_name_or_path path_to_checkpoint \
     --template default \
diff --git a/README_zh.md b/README_zh.md
index beb88e7d..b207ae73 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -450,7 +450,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 ```
 
 > [!TIP]
-> 使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_ppo_checkpoint` 来进行微调模型的推理。
+> 如果开启了 `--create_new_adapter`，则使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_ppo_checkpoint` 来进行微调模型的推理。
 
 > [!WARNING]
 > 如果使用 fp16 精度进行 LLaMA-2 模型的 PPO 训练，请使用 `--per_device_train_batch_size=1`。
@@ -481,7 +481,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 ```
 
 > [!TIP]
-> 使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` 来进行微调模型的推理。
+> 如果开启了 `--create_new_adapter`，则使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` 来进行微调模型的推理。
 
 ### 多 GPU 分布式训练
 
@@ -569,7 +569,7 @@ deepspeed --num_gpus 8 src/train_bash.py \
 ### 合并 LoRA 权重并导出模型
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 python src/export_model.py \
+CUDA_VISIBLE_DEVICES= python src/export_model.py \
     --model_name_or_path path_to_llama_model \
     --adapter_name_or_path path_to_checkpoint \
     --template default \

From 9408366a36a43c07b88d1eade9dc56967cc09d12 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 28 Mar 2024 20:22:31 +0800
Subject: [PATCH 004/341] fix #2982

Former-commit-id: e5e6a0c50c7a1c0052ed6b459450b9735ff2c9a1
---
 src/llmtuner/extras/misc.py             |  2 +-
 src/llmtuner/hparams/finetuning_args.py |  4 ----
 src/llmtuner/hparams/parser.py          | 16 +++++++++-------
 src/llmtuner/train/ppo/workflow.py      |  4 ++--
 4 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index cd2ff5bc..85761f1d 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -64,7 +64,7 @@ def check_dependencies() -> None:
         require_version("transformers>=4.37.2", "To fix: pip install transformers>=4.37.2")
         require_version("datasets>=2.14.3", "To fix: pip install datasets>=2.14.3")
         require_version("accelerate>=0.27.2", "To fix: pip install accelerate>=0.27.2")
-        require_version("peft>=0.9.0", "To fix: pip install peft>=0.9.0")
+        require_version("peft>=0.10.0", "To fix: pip install peft>=0.10.0")
         require_version("trl>=0.8.1", "To fix: pip install trl>=0.8.1")
 
 
diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index 0dd28a8f..c1f08334 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -118,10 +118,6 @@ class RLHFArguments:
         default=4,
         metadata={"help": "The number of epochs to perform in a PPO optimization step."},
     )
-    ppo_logger: Optional[str] = field(
-        default=None,
-        metadata={"help": 'Log with either "wandb" or "tensorboard" in PPO training.'},
-    )
     ppo_score_norm: bool = field(
         default=False,
         metadata={"help": "Use score normalization in PPO training."},
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 8e27f379..4fbc3db9 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -8,7 +8,6 @@ import transformers
 from transformers import HfArgumentParser, Seq2SeqTrainingArguments
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import is_torch_bf16_gpu_available
-from transformers.utils.versions import require_version
 
 from ..extras.logging import get_logger
 from ..extras.misc import check_dependencies
@@ -119,6 +118,13 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if finetuning_args.stage == "ppo" and finetuning_args.reward_model_type == "lora" and model_args.use_unsloth:
         raise ValueError("Unsloth does not support lora reward model.")
 
+    if (
+        finetuning_args.stage == "ppo"
+        and training_args.report_to  # truthiness guards both None and an empty list before indexing [0]
+        and training_args.report_to[0] not in ["wandb", "tensorboard"]
+    ):
+        raise ValueError("PPO only accepts wandb or tensorboard logger.")
+
     if training_args.max_steps == -1 and data_args.streaming:
         raise ValueError("Please specify `max_steps` in streaming mode.")
 
@@ -128,12 +134,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if training_args.do_train and model_args.use_unsloth and not is_unsloth_available():
         raise ValueError("Unsloth was not installed: https://github.com/unslothai/unsloth")
 
-    if finetuning_args.use_dora:
-        if model_args.quantization_bit is not None:
-            require_version("peft>=0.10.0", "To fix: pip install peft>=0.10.0")
-
-        if model_args.use_unsloth:
-            raise ValueError("Unsloth does not support DoRA.")
+    if finetuning_args.use_dora and model_args.use_unsloth:
+        raise ValueError("Unsloth does not support DoRA.")
 
     if finetuning_args.pure_bf16:
         if not is_torch_bf16_gpu_available():
diff --git a/src/llmtuner/train/ppo/workflow.py b/src/llmtuner/train/ppo/workflow.py
index 658b244d..0e03086b 100644
--- a/src/llmtuner/train/ppo/workflow.py
+++ b/src/llmtuner/train/ppo/workflow.py
@@ -55,11 +55,11 @@ def run_ppo(
         seed=training_args.seed,
         optimize_device_cache=True,
         target=finetuning_args.ppo_target,
-        log_with=finetuning_args.ppo_logger,
         use_score_scaling=finetuning_args.ppo_score_norm,
         use_score_norm=finetuning_args.ppo_score_norm,
         whiten_rewards=finetuning_args.ppo_whiten_rewards,
         accelerator_kwargs={"step_scheduler_with_optimizer": False},
+        log_with=training_args.report_to[0] if training_args.report_to else None,  # None or [] -> no logger
         project_kwargs={"logging_dir": training_args.logging_dir},
     )
 
@@ -71,10 +71,10 @@ def run_ppo(
         num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size)
 
     optimizer = create_custom_optimzer(model, training_args, finetuning_args)
-    create_custom_scheduler(training_args, num_training_steps, optimizer)
     if optimizer is None:
         optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=training_args.learning_rate)
 
+    create_custom_scheduler(training_args, num_training_steps, optimizer)
     lr_scheduler = get_scheduler(
         training_args.lr_scheduler_type,
         optimizer=optimizer,

From 32dcc5a491cd40d3e1fd6d53e0102300a94e43cb Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 28 Mar 2024 20:24:27 +0800
Subject: [PATCH 005/341] add project

Former-commit-id: 0418e9fecb2337b5d1b72e8358adb8aa10803c4b
---
 README.md        | 3 ++-
 README_zh.md     | 3 ++-
 requirements.txt | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 9ad1e291..baa85076 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-26-green)](#projects-using-llama-factory)
+[![Citation](https://img.shields.io/badge/citation-27-green)](#projects-using-llama-factory)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -709,6 +709,7 @@ docker compose -f ./docker-compose.yml up -d
 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
+1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B.
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge.
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.
diff --git a/README_zh.md b/README_zh.md
index b207ae73..5ef49549 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-26-green)](#使用了-llama-factory-的项目)
+[![Citation](https://img.shields.io/badge/citation-27-green)](#使用了-llama-factory-的项目)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -682,6 +682,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
+1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper，基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM，基于 Baichuan-13B 微调而得，具有法律推理和知识检索能力。
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao，基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。
diff --git a/requirements.txt b/requirements.txt
index 186a3030..1ba2acb4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ torch>=1.13.1
 transformers>=4.37.2
 datasets>=2.14.3
 accelerate>=0.27.2
-peft>=0.9.0
+peft>=0.10.0
 trl>=0.8.1
 gradio>=3.38.0,<4.0.0
 scipy

From 50224b09ccd41dc91a4e60ea039b3bbea77f2556 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 28 Mar 2024 22:02:32 +0800
Subject: [PATCH 006/341] update readme

Former-commit-id: 312d4f90784800dc8db4eaa7d908e6761115bc51
---
 README.md          | 26 +++++++++++++++-----------
 README_zh.md       | 36 +++++++++++++++++++++++++++++++++---
 docker-compose.yml |  2 ++
 3 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index baa85076..af6ef66f 100644
--- a/README.md
+++ b/README.md
@@ -76,10 +76,10 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 [24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. See `examples/extras/galore` for usage.
 
-[24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `--infer_backend vllm` to enjoy **270%** inference speed. (LoRA is not yet supported, merge it first.)
-
 <details><summary>Full Changelog</summary>
 
+[24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `--infer_backend vllm` to enjoy **270%** inference speed. (LoRA is not yet supported, merge it first.)
+
 [24/02/28] We supported weight-decomposed LoRA (**[DoRA](https://arxiv.org/abs/2402.09353)**). Try `--use_dora` to activate DoRA training.
 
 [24/02/15] We supported **block expansion** proposed by [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro). See `examples/extras/llama_pro` for usage.
@@ -586,7 +586,7 @@ CUDA_VISIBLE_DEVICES= python src/export_model.py \
 > [!TIP]
 > Use `--model_name_or_path path_to_export` solely to use the exported model.
 > 
-> Use `--export_quantization_bit 4` and `--export_quantization_dataset data/c4_demo.json` to quantize the model with AutoGPTQ after merging the LoRA weights.
+> Use `CUDA_VISIBLE_DEVICES=0`, `--export_quantization_bit 4` and `--export_quantization_dataset data/c4_demo.json` to quantize the model with AutoGPTQ after merging the LoRA weights.
 
 ### Inference with OpenAI-style API
 
@@ -662,19 +662,23 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 
 ### Dockerize Training
 
-#### Get ready
-
-Necessary dockerized environment is needed, such as Docker or Docker Compose.
-
-#### Docker support
+#### Use Docker
 
 ```bash
 docker build -f ./Dockerfile -t llama-factory:latest .
 
-docker run --gpus=all -v ./hf_cache:/root/.cache/huggingface/ -v ./data:/app/data -v ./output:/app/output -p 7860:7860 --shm-size 16G --name llama_factory -d llama-factory:latest
+docker run --gpus=all \
+    -v ./hf_cache:/root/.cache/huggingface/ \
+    -v ./data:/app/data \
+    -v ./output:/app/output \
+    -e CUDA_VISIBLE_DEVICES=0 \
+    -p 7860:7860 \
+    --shm-size 16G \
+    --name llama_factory \
+    -d llama-factory:latest
 ```
 
-#### Docker Compose support
+#### Use Docker Compose
 
 ```bash
 docker compose -f ./docker-compose.yml up -d
@@ -682,7 +686,7 @@ docker compose -f ./docker-compose.yml up -d
 
 > [!TIP]
 > Details about volume:
-> * hf_cache: Utilize Huggingface cache on the host machine. Reassignable if a cache already exists in a different directory.
+> * hf_cache: Utilize Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory.
 > * data: Place datasets on this dir of the host machine so that they can be selected on LLaMA Board GUI.
 > * output: Set export dir to this location so that the merged result can be accessed directly on the host machine.
 
diff --git a/README_zh.md b/README_zh.md
index 5ef49549..d018ee32 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -76,10 +76,10 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 [24/03/07] 我们支持了梯度低秩投影（**[GaLore](https://arxiv.org/abs/2403.03507)**）算法。详细用法请参照 `examples/extras/galore`。
 
-[24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `--infer_backend vllm` 来获得 **270%** 的推理速度。（尚不支持 LoRA，请先合并权重。）
-
 <details><summary>展开日志</summary>
 
+[24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `--infer_backend vllm` 来获得 **270%** 的推理速度。（尚不支持 LoRA，请先合并权重。）
+
 [24/02/28] 我们支持了 **[DoRA](https://arxiv.org/abs/2402.09353)** 微调。请使用 `--use_dora` 参数进行 DoRA 微调。
 
 [24/02/15] 我们支持了 [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro) 提出的**块扩展**方法。详细用法请参照 `examples/extras/llama_pro`。
@@ -585,7 +585,7 @@ CUDA_VISIBLE_DEVICES= python src/export_model.py \
 > [!TIP]
 > 仅使用 `--model_name_or_path path_to_export` 来加载导出后的模型。
 > 
-> 合并 LoRA 权重之后可再次使用 `--export_quantization_bit 4` 和 `--export_quantization_dataset data/c4_demo.json` 基于 AutoGPTQ 量化模型。
+> 合并 LoRA 权重之后可再次使用 `CUDA_VISIBLE_DEVICES=0`、`--export_quantization_bit 4` 和 `--export_quantization_dataset data/c4_demo.json` 基于 AutoGPTQ 量化模型。
 
 ### 使用 OpenAI 风格 API 推理
 
@@ -659,6 +659,36 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 > [!TIP]
 > 我们建议在量化模型的预测中使用 `--per_device_eval_batch_size=1` 和 `--max_target_length 128`。
 
+### 使用容器
+
+#### 使用 Docker
+
+```bash
+docker build -f ./Dockerfile -t llama-factory:latest .
+
+docker run --gpus=all \
+    -v ./hf_cache:/root/.cache/huggingface/ \
+    -v ./data:/app/data \
+    -v ./output:/app/output \
+    -e CUDA_VISIBLE_DEVICES=0 \
+    -p 7860:7860 \
+    --shm-size 16G \
+    --name llama_factory \
+    -d llama-factory:latest
+```
+
+#### 使用 Docker Compose
+
+```bash
+docker compose -f ./docker-compose.yml up -d
+```
+
+> [!TIP]
+> 数据卷详情：
+> * hf_cache：使用宿主机的 Hugging Face 缓存文件夹，允许更改为新的目录。
+> * data：宿主机中存放数据集的文件夹路径。
+> * output：将导出目录设置为该路径后，即可在宿主机中访问导出后的模型。
+
 ## 使用了 LLaMA Factory 的项目
 
 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
diff --git a/docker-compose.yml b/docker-compose.yml
index 9602a3e3..333dc51e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -10,6 +10,8 @@ services:
       - ./hf_cache:/root/.cache/huggingface/
       - ./data:/app/data
       - ./output:/app/output
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
     ports:
       - "7860:7860"
     ipc: host

From fbd0584391f2549a2097a3f119b0eaae006dd424 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 29 Mar 2024 11:36:08 +0800
Subject: [PATCH 007/341] release v0.6.1

Former-commit-id: a59d823f554505b2e649e6e111b9dee8306d3ad8
---
 src/llmtuner/__init__.py           |  2 +-
 src/llmtuner/train/dpo/workflow.py |  1 +
 src/llmtuner/train/ppo/trainer.py  | 94 ++++++++++++++++++++++++++++--
 src/llmtuner/train/ppo/workflow.py | 51 +---------------
 src/llmtuner/train/utils.py        |  4 +-
 5 files changed, 95 insertions(+), 57 deletions(-)

diff --git a/src/llmtuner/__init__.py b/src/llmtuner/__init__.py
index 6852ae2f..903e82ad 100644
--- a/src/llmtuner/__init__.py
+++ b/src/llmtuner/__init__.py
@@ -7,5 +7,5 @@ from .train import export_model, run_exp
 from .webui import create_ui, create_web_demo
 
 
-__version__ = "0.6.0"
+__version__ = "0.6.1"
 __all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"]
diff --git a/src/llmtuner/train/dpo/workflow.py b/src/llmtuner/train/dpo/workflow.py
index 7014177a..851de982 100644
--- a/src/llmtuner/train/dpo/workflow.py
+++ b/src/llmtuner/train/dpo/workflow.py
@@ -28,6 +28,7 @@ def run_dpo(
     tokenizer = load_tokenizer(model_args)
     dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="rm")
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+
     data_collator = DPODataCollatorWithPadding(
         tokenizer=tokenizer,
         pad_to_multiple_of=8,
diff --git a/src/llmtuner/train/ppo/trainer.py b/src/llmtuner/train/ppo/trainer.py
index a06d7ef1..de87532a 100644
--- a/src/llmtuner/train/ppo/trainer.py
+++ b/src/llmtuner/train/ppo/trainer.py
@@ -6,20 +6,23 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 import torch
 from tqdm import tqdm
 from transformers import GenerationConfig, Trainer, TrainerControl, TrainerState
+from transformers.optimization import get_scheduler
 from transformers.trainer_pt_utils import remove_dummy_checkpoint
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
 from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME
-from trl import PPOTrainer
+from trl import PPOConfig, PPOTrainer
 from trl.core import PPODecorators, logprobs_from_logits
 
 from ...extras.callbacks import FixValueHeadModelCallback, LogCallback
 from ...extras.logging import get_logger
 from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor
+from ..utils import create_custom_optimzer, create_custom_scheduler
 from .utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm
 
 
 if TYPE_CHECKING:
-    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+    from datasets import Dataset
+    from transformers import DataCollatorWithPadding, PreTrainedTokenizer, Seq2SeqTrainingArguments, TrainerCallback
     from trl import AutoModelForCausalLMWithValueHead
 
     from ...hparams import FinetuningArguments, GeneratingArguments, ModelArguments
@@ -40,10 +43,53 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         finetuning_args: "FinetuningArguments",
         generating_args: "GeneratingArguments",
         callbacks: List["TrainerCallback"],
-        reward_model: "AutoModelForCausalLMWithValueHead",
-        **kwargs,
+        model: "AutoModelForCausalLMWithValueHead",
+        reward_model: Optional["AutoModelForCausalLMWithValueHead"],
+        ref_model: Optional["AutoModelForCausalLMWithValueHead"],
+        tokenizer: "PreTrainedTokenizer",
+        dataset: "Dataset",
+        data_collator: "DataCollatorWithPadding",
     ):
-        PPOTrainer.__init__(self, **kwargs)
+        backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
+        ppo_config = PPOConfig(
+            model_name=model_args.model_name_or_path,
+            learning_rate=training_args.learning_rate,
+            mini_batch_size=training_args.per_device_train_batch_size,
+            batch_size=backward_batch_size * finetuning_args.ppo_buffer_size,
+            gradient_accumulation_steps=training_args.gradient_accumulation_steps,
+            ppo_epochs=finetuning_args.ppo_epochs,
+            max_grad_norm=training_args.max_grad_norm,
+            seed=training_args.seed,
+            optimize_device_cache=True,
+            target=finetuning_args.ppo_target,
+            use_score_scaling=finetuning_args.ppo_score_norm,
+            use_score_norm=finetuning_args.ppo_score_norm,
+            whiten_rewards=finetuning_args.ppo_whiten_rewards,
+            accelerator_kwargs={"step_scheduler_with_optimizer": False},
+            log_with=training_args.report_to[0] if training_args.report_to is not None else None,
+            project_kwargs={"logging_dir": training_args.logging_dir},
+        )
+
+        # Create optimizer and scheduler
+        if training_args.max_steps > 0:
+            num_training_steps = training_args.max_steps
+        else:
+            total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size
+            num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size)
+
+        optimizer = self.create_optimizer(model, training_args, finetuning_args)
+        scheduler = self.create_scheduler(training_args, num_training_steps, optimizer)
+
+        PPOTrainer.__init__(
+            self,
+            config=ppo_config,
+            model=model,
+            ref_model=ref_model,
+            tokenizer=tokenizer,
+            dataset=dataset,
+            data_collator=data_collator,
+            lr_scheduler=scheduler,
+        )
 
         self.args = training_args
         self.model_args = model_args
@@ -205,6 +251,44 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             self.args, self.state, self.control, model=self.accelerator.unwrap_model(self.model)
         )
 
+    def create_optimizer(
+        self,
+        model: "AutoModelForCausalLMWithValueHead",
+        training_args: "Seq2SeqTrainingArguments",
+        finetuning_args: "FinetuningArguments",
+    ) -> "torch.optim.Optimizer":
+        optimizer = create_custom_optimzer(model, training_args, finetuning_args)
+        if optimizer is None:
+            decay_params, nodecay_params = [], []
+            decay_param_names = self.get_decay_parameter_names(model)
+            for name, param in model.named_parameters():
+                if param.requires_grad:
+                    if name in decay_param_names:
+                        decay_params.append(param)
+                    else:
+                        nodecay_params.append(param)
+
+            optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+            param_groups = [
+                dict(params=nodecay_params),
+                dict(params=decay_params, weight_decay=training_args.weight_decay),
+            ]
+            optimizer = optim_class(param_groups, **optim_kwargs)
+
+        return optimizer
+
+    def create_scheduler(
+        self, training_args: "Seq2SeqTrainingArguments", num_training_steps: int, optimizer: "torch.optim.Optimizer"
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(training_args, num_training_steps, optimizer)
+        lr_scheduler = get_scheduler(
+            training_args.lr_scheduler_type,
+            optimizer=optimizer,
+            num_warmup_steps=training_args.get_warmup_steps(num_training_steps),
+            num_training_steps=num_training_steps,
+        )
+        return lr_scheduler
+
     @torch.no_grad()
     def get_inputs(self, batch: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
         r"""
diff --git a/src/llmtuner/train/ppo/workflow.py b/src/llmtuner/train/ppo/workflow.py
index 0e03086b..d5854073 100644
--- a/src/llmtuner/train/ppo/workflow.py
+++ b/src/llmtuner/train/ppo/workflow.py
@@ -1,19 +1,15 @@
 # Inspired by: https://github.com/lvwerra/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py
 
-import math
 from typing import TYPE_CHECKING, List, Optional
 
-from torch.optim import AdamW
 from transformers import DataCollatorWithPadding
-from transformers.optimization import get_scheduler
-from trl import PPOConfig
 
 from ...data import get_dataset
 from ...extras.callbacks import FixValueHeadModelCallback
 from ...extras.misc import fix_valuehead_checkpoint
 from ...extras.ploting import plot_loss
 from ...model import load_model, load_tokenizer
-from ..utils import create_custom_optimzer, create_custom_scheduler, create_ref_model, create_reward_model
+from ..utils import create_ref_model, create_reward_model
 from .trainer import CustomPPOTrainer
 
 
@@ -42,46 +38,6 @@ def run_ppo(
     ref_model = create_ref_model(model_args, finetuning_args, add_valuehead=True)
     reward_model = create_reward_model(model, model_args, finetuning_args)
 
-    # Create ppo config
-    backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
-    ppo_config = PPOConfig(
-        model_name=model_args.model_name_or_path,
-        learning_rate=training_args.learning_rate,
-        mini_batch_size=training_args.per_device_train_batch_size,
-        batch_size=backward_batch_size * finetuning_args.ppo_buffer_size,
-        gradient_accumulation_steps=training_args.gradient_accumulation_steps,
-        ppo_epochs=finetuning_args.ppo_epochs,
-        max_grad_norm=training_args.max_grad_norm,
-        seed=training_args.seed,
-        optimize_device_cache=True,
-        target=finetuning_args.ppo_target,
-        use_score_scaling=finetuning_args.ppo_score_norm,
-        use_score_norm=finetuning_args.ppo_score_norm,
-        whiten_rewards=finetuning_args.ppo_whiten_rewards,
-        accelerator_kwargs={"step_scheduler_with_optimizer": False},
-        log_with=training_args.report_to[0] if training_args.report_to is not None else None,
-        project_kwargs={"logging_dir": training_args.logging_dir},
-    )
-
-    # Create optimizer and scheduler
-    if training_args.max_steps > 0:
-        num_training_steps = training_args.max_steps
-    else:
-        total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size
-        num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size)
-
-    optimizer = create_custom_optimzer(model, training_args, finetuning_args)
-    if optimizer is None:
-        optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=training_args.learning_rate)
-
-    create_custom_scheduler(training_args, num_training_steps, optimizer)
-    lr_scheduler = get_scheduler(
-        training_args.lr_scheduler_type,
-        optimizer=optimizer,
-        num_warmup_steps=training_args.get_warmup_steps(num_training_steps),
-        num_training_steps=num_training_steps,
-    )
-
     # Initialize our Trainer
     ppo_trainer = CustomPPOTrainer(
         model_args=model_args,
@@ -89,15 +45,12 @@ def run_ppo(
         finetuning_args=finetuning_args,
         generating_args=generating_args,
         callbacks=callbacks + [FixValueHeadModelCallback()],
-        reward_model=reward_model,
-        config=ppo_config,
         model=model,
+        reward_model=reward_model,
         ref_model=ref_model,
         tokenizer=tokenizer,
         dataset=dataset,
         data_collator=data_collator,
-        optimizer=optimizer,
-        lr_scheduler=lr_scheduler,
     )
 
     # Training
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index 73854a5e..8f218a78 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -70,7 +70,7 @@ def create_modelcard_and_push(
 
 def create_ref_model(
     model_args: "ModelArguments", finetuning_args: "FinetuningArguments", add_valuehead: bool = False
-) -> Union["PreTrainedModel", "AutoModelForCausalLMWithValueHead"]:
+) -> Optional[Union["PreTrainedModel", "AutoModelForCausalLMWithValueHead"]]:
     r"""
     Creates reference model for PPO/DPO training. Evaluation mode is not supported.
 
@@ -105,7 +105,7 @@ def create_ref_model(
 
 def create_reward_model(
     model: "AutoModelForCausalLMWithValueHead", model_args: "ModelArguments", finetuning_args: "FinetuningArguments"
-) -> "AutoModelForCausalLMWithValueHead":
+) -> Optional["AutoModelForCausalLMWithValueHead"]:
     r"""
     Creates reward model for PPO training.
     """

From b0efebf8532704de9f85a93ed386f2f72f109a1d Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 30 Mar 2024 20:37:08 +0800
Subject: [PATCH 008/341] upgrade gradio to 4.21.0

Former-commit-id: 63eecbeb967d849e1d03d8d03fb6421c0ee89257
---
 requirements.txt                         |  2 +-
 src/evaluate.py                          |  3 +-
 src/llmtuner/webui/chatter.py            | 17 +++----
 src/llmtuner/webui/common.py             | 14 +++---
 src/llmtuner/webui/components/chatbot.py | 11 +++--
 src/llmtuner/webui/components/data.py    | 18 ++++----
 src/llmtuner/webui/components/eval.py    |  6 +--
 src/llmtuner/webui/components/export.py  | 14 +++---
 src/llmtuner/webui/components/infer.py   |  4 +-
 src/llmtuner/webui/components/top.py     |  8 ++--
 src/llmtuner/webui/components/train.py   | 22 +++++-----
 src/llmtuner/webui/engine.py             | 50 ++++++++++-----------
 src/llmtuner/webui/interface.py          | 29 ++++++------
 src/llmtuner/webui/manager.py            | 56 +++++++++++++++++-------
 src/llmtuner/webui/runner.py             | 46 +++++++++----------
 src/llmtuner/webui/utils.py              | 18 ++++----
 src/train_web.py                         |  4 +-
 src/web_demo.py                          |  4 +-
 18 files changed, 167 insertions(+), 159 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1ba2acb4..88b88ee4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ datasets>=2.14.3
 accelerate>=0.27.2
 peft>=0.10.0
 trl>=0.8.1
-gradio>=3.38.0,<4.0.0
+gradio>4.0.0,<=4.21.0
 scipy
 einops
 sentencepiece
diff --git a/src/evaluate.py b/src/evaluate.py
index 13796c0c..705a6e42 100644
--- a/src/evaluate.py
+++ b/src/evaluate.py
@@ -2,8 +2,7 @@ from llmtuner import Evaluator
 
 
 def main():
-    evaluator = Evaluator()
-    evaluator.eval()
+    Evaluator().eval()
 
 
 if __name__ == "__main__":
diff --git a/src/llmtuner/webui/chatter.py b/src/llmtuner/webui/chatter.py
index d149ca26..2621bd5e 100644
--- a/src/llmtuner/webui/chatter.py
+++ b/src/llmtuner/webui/chatter.py
@@ -36,7 +36,7 @@ class WebChatModel(ChatModel):
         return self.engine is not None
 
     def load_model(self, data: Dict[Component, Any]) -> Generator[str, None, None]:
-        get = lambda name: data[self.manager.get_elem_by_name(name)]
+        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
         lang = get("top.lang")
         error = ""
         if self.loaded:
@@ -80,7 +80,7 @@ class WebChatModel(ChatModel):
         yield ALERTS["info_loaded"][lang]
 
     def unload_model(self, data: Dict[Component, Any]) -> Generator[str, None, None]:
-        lang = data[self.manager.get_elem_by_name("top.lang")]
+        lang = data[self.manager.get_elem_by_id("top.lang")]
 
         if self.demo_mode:
             gr.Warning(ALERTS["err_demo"][lang])
@@ -97,13 +97,13 @@ class WebChatModel(ChatModel):
         chatbot: List[Tuple[str, str]],
         role: str,
         query: str,
-        messages: Sequence[Tuple[str, str]],
+        messages: Sequence[Dict[str, str]],
         system: str,
         tools: str,
         max_new_tokens: int,
         top_p: float,
         temperature: float,
-    ) -> Generator[Tuple[Sequence[Tuple[str, str]], Sequence[Tuple[str, str]]], None, None]:
+    ) -> Generator[Tuple[List[Tuple[str, str]], List[Dict[str, str]]], None, None]:
         chatbot.append([query, ""])
         query_messages = messages + [{"role": role, "content": query}]
         response = ""
@@ -126,12 +126,5 @@ class WebChatModel(ChatModel):
                 output_messages = query_messages + [{"role": Role.ASSISTANT.value, "content": result}]
                 bot_text = result
 
-            chatbot[-1] = [query, self.postprocess(bot_text)]
+            chatbot[-1] = [query, bot_text]
             yield chatbot, output_messages
-
-    def postprocess(self, response: str) -> str:
-        blocks = response.split("```")
-        for i, block in enumerate(blocks):
-            if i % 2 == 0:
-                blocks[i] = block.replace("<", "&lt;").replace(">", "&gt;")
-        return "```".join(blocks)
diff --git a/src/llmtuner/webui/common.py b/src/llmtuner/webui/common.py
index 961d6f0d..67e6ff2a 100644
--- a/src/llmtuner/webui/common.py
+++ b/src/llmtuner/webui/common.py
@@ -79,9 +79,9 @@ def get_template(model_name: str) -> str:
     return "default"
 
 
-def list_adapters(model_name: str, finetuning_type: str) -> Dict[str, Any]:
+def list_adapters(model_name: str, finetuning_type: str) -> "gr.Dropdown":
     if finetuning_type not in PEFT_METHODS:
-        return gr.update(value=[], choices=[], interactive=False)
+        return gr.Dropdown(value=[], choices=[], interactive=False)
 
     adapters = []
     if model_name and finetuning_type == "lora":
@@ -92,7 +92,7 @@ def list_adapters(model_name: str, finetuning_type: str) -> Dict[str, Any]:
                     os.path.isfile(os.path.join(save_dir, adapter, name)) for name in ADAPTER_NAMES
                 ):
                     adapters.append(adapter)
-    return gr.update(value=[], choices=adapters, interactive=True)
+    return gr.Dropdown(value=[], choices=adapters, interactive=True)
 
 
 def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]:
@@ -104,12 +104,12 @@ def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]:
         return {}
 
 
-def list_dataset(dataset_dir: str = None, training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Dict[str, Any]:
+def list_dataset(dataset_dir: str = None, training_stage: str = list(TRAINING_STAGES.keys())[0]) -> "gr.Dropdown":
     dataset_info = load_dataset_info(dataset_dir if dataset_dir is not None else DEFAULT_DATA_DIR)
     ranking = TRAINING_STAGES[training_stage] in ["rm", "dpo"]
     datasets = [k for k, v in dataset_info.items() if v.get("ranking", False) == ranking]
-    return gr.update(value=[], choices=datasets)
+    return gr.Dropdown(value=[], choices=datasets)
 
 
-def autoset_packing(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Dict[str, Any]:
-    return gr.update(value=(TRAINING_STAGES[training_stage] == "pt"))
+def autoset_packing(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> "gr.Button":
+    return gr.Button(value=(TRAINING_STAGES[training_stage] == "pt"))
diff --git a/src/llmtuner/webui/components/chatbot.py b/src/llmtuner/webui/components/chatbot.py
index bf5bb66a..d7d5bd66 100644
--- a/src/llmtuner/webui/components/chatbot.py
+++ b/src/llmtuner/webui/components/chatbot.py
@@ -7,7 +7,6 @@ from ..utils import check_json_schema
 
 
 if TYPE_CHECKING:
-    from gradio.blocks import Block
     from gradio.components import Component
 
     from ..engine import Engine
@@ -15,9 +14,9 @@ if TYPE_CHECKING:
 
 def create_chat_box(
     engine: "Engine", visible: bool = False
-) -> Tuple["Block", "Component", "Component", Dict[str, "Component"]]:
-    with gr.Box(visible=visible) as chat_box:
-        chatbot = gr.Chatbot()
+) -> Tuple["gr.Column", "Component", "Component", Dict[str, "Component"]]:
+    with gr.Column(visible=visible) as chat_box:
+        chatbot = gr.Chatbot(show_copy_button=True)
         messages = gr.State([])
         with gr.Row():
             with gr.Column(scale=4):
@@ -33,14 +32,14 @@ def create_chat_box(
                 temperature = gr.Slider(0.01, 1.5, value=0.95, step=0.01)
                 clear_btn = gr.Button()
 
-    tools.input(check_json_schema, [tools, engine.manager.get_elem_by_name("top.lang")])
+    tools.input(check_json_schema, inputs=[tools, engine.manager.get_elem_by_id("top.lang")])
 
     submit_btn.click(
         engine.chatter.predict,
         [chatbot, role, query, messages, system, tools, max_new_tokens, top_p, temperature],
         [chatbot, messages],
         show_progress=True,
-    ).then(lambda: gr.update(value=""), outputs=[query])
+    ).then(lambda: "", outputs=[query])
 
     clear_btn.click(lambda: ([], []), outputs=[chatbot, messages], show_progress=True)
 
diff --git a/src/llmtuner/webui/components/data.py b/src/llmtuner/webui/components/data.py
index c63b6ea5..46274417 100644
--- a/src/llmtuner/webui/components/data.py
+++ b/src/llmtuner/webui/components/data.py
@@ -1,6 +1,6 @@
 import json
 import os
-from typing import TYPE_CHECKING, Any, Dict, Tuple
+from typing import TYPE_CHECKING, Dict, Tuple
 
 import gradio as gr
 
@@ -22,24 +22,24 @@ def next_page(page_index: int, total_num: int) -> int:
     return page_index + 1 if (page_index + 1) * PAGE_SIZE < total_num else page_index
 
 
-def can_preview(dataset_dir: str, dataset: list) -> Dict[str, Any]:
+def can_preview(dataset_dir: str, dataset: list) -> "gr.Button":
     try:
         with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f:
             dataset_info = json.load(f)
     except Exception:
-        return gr.update(interactive=False)
+        return gr.Button(interactive=False)
 
     if (
         len(dataset) > 0
         and "file_name" in dataset_info[dataset[0]]
         and os.path.isfile(os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"]))
     ):
-        return gr.update(interactive=True)
+        return gr.Button(interactive=True)
     else:
-        return gr.update(interactive=False)
+        return gr.Button(interactive=False)
 
 
-def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, Dict[str, Any]]:
+def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, "gr.Column"]:
     with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f:
         dataset_info = json.load(f)
 
@@ -51,7 +51,7 @@ def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int,
             data = [json.loads(line) for line in f]
         else:
             data = [line for line in f]  # noqa: C416
-    return len(data), data[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)], gr.update(visible=True)
+    return len(data), data[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)], gr.Column(visible=True)
 
 
 def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dict[str, "Component"]:
@@ -67,7 +67,7 @@ def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dic
             close_btn = gr.Button()
 
         with gr.Row():
-            preview_samples = gr.JSON(interactive=False)
+            preview_samples = gr.JSON()
 
     dataset.change(can_preview, [dataset_dir, dataset], [data_preview_btn], queue=False).then(
         lambda: 0, outputs=[page_index], queue=False
@@ -81,7 +81,7 @@ def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dic
     next_btn.click(next_page, [page_index, preview_count], [page_index], queue=False).then(
         get_preview, [dataset_dir, dataset, page_index], [preview_count, preview_samples, preview_box], queue=False
     )
-    close_btn.click(lambda: gr.update(visible=False), outputs=[preview_box], queue=False)
+    close_btn.click(lambda: gr.Column(visible=False), outputs=[preview_box], queue=False)
     return dict(
         data_preview_btn=data_preview_btn,
         preview_count=preview_count,
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index 4c35ad8f..452b06fb 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -53,7 +53,7 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
         resume_btn = gr.Checkbox(visible=False, interactive=False, value=False)
         process_bar = gr.Slider(visible=False, interactive=False)
 
-    with gr.Box():
+    with gr.Row():
         output_box = gr.Markdown()
 
     output_elems = [output_box, process_bar]
@@ -68,9 +68,9 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
         )
     )
 
-    cmd_preview_btn.click(engine.runner.preview_eval, input_elems, output_elems)
+    cmd_preview_btn.click(engine.runner.preview_eval, input_elems, output_elems, concurrency_limit=None)
     start_btn.click(engine.runner.run_eval, input_elems, output_elems)
     stop_btn.click(engine.runner.set_abort, queue=False)
-    resume_btn.change(engine.runner.monitor, outputs=output_elems)
+    resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
 
     return elem_dict
diff --git a/src/llmtuner/webui/components/export.py b/src/llmtuner/webui/components/export.py
index a40590ca..b394d75c 100644
--- a/src/llmtuner/webui/components/export.py
+++ b/src/llmtuner/webui/components/export.py
@@ -74,7 +74,7 @@ def save_model(
 
 def create_export_tab(engine: "Engine") -> Dict[str, "Component"]:
     with gr.Row():
-        max_shard_size = gr.Slider(value=1, minimum=1, maximum=100)
+        max_shard_size = gr.Slider(value=1, minimum=1, maximum=100, step=1)
         export_quantization_bit = gr.Dropdown(choices=["none", "8", "4", "3", "2"], value="none")
         export_quantization_dataset = gr.Textbox(value="data/c4_demo.json")
         export_legacy_format = gr.Checkbox()
@@ -89,12 +89,12 @@ def create_export_tab(engine: "Engine") -> Dict[str, "Component"]:
     export_btn.click(
         save_model,
         [
-            engine.manager.get_elem_by_name("top.lang"),
-            engine.manager.get_elem_by_name("top.model_name"),
-            engine.manager.get_elem_by_name("top.model_path"),
-            engine.manager.get_elem_by_name("top.adapter_path"),
-            engine.manager.get_elem_by_name("top.finetuning_type"),
-            engine.manager.get_elem_by_name("top.template"),
+            engine.manager.get_elem_by_id("top.lang"),
+            engine.manager.get_elem_by_id("top.model_name"),
+            engine.manager.get_elem_by_id("top.model_path"),
+            engine.manager.get_elem_by_id("top.adapter_path"),
+            engine.manager.get_elem_by_id("top.finetuning_type"),
+            engine.manager.get_elem_by_id("top.template"),
             max_shard_size,
             export_quantization_bit,
             export_quantization_dataset,
diff --git a/src/llmtuner/webui/components/infer.py b/src/llmtuner/webui/components/infer.py
index 135535a4..097ded25 100644
--- a/src/llmtuner/webui/components/infer.py
+++ b/src/llmtuner/webui/components/infer.py
@@ -29,11 +29,11 @@ def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]:
     elem_dict.update(dict(chat_box=chat_box, **chat_elems))
 
     load_btn.click(engine.chatter.load_model, input_elems, [info_box]).then(
-        lambda: gr.update(visible=engine.chatter.loaded), outputs=[chat_box]
+        lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_box]
     )
 
     unload_btn.click(engine.chatter.unload_model, input_elems, [info_box]).then(
         lambda: ([], []), outputs=[chatbot, history]
-    ).then(lambda: gr.update(visible=engine.chatter.loaded), outputs=[chat_box])
+    ).then(lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_box])
 
     return elem_dict
diff --git a/src/llmtuner/webui/components/top.py b/src/llmtuner/webui/components/top.py
index d8b49588..6c5030cd 100644
--- a/src/llmtuner/webui/components/top.py
+++ b/src/llmtuner/webui/components/top.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Dict, Tuple
+from typing import TYPE_CHECKING, Dict
 
 import gradio as gr
 
@@ -12,7 +12,7 @@ if TYPE_CHECKING:
     from gradio.components import Component
 
 
-def create_top() -> Tuple["gr.Dropdown", Dict[str, "Component"]]:
+def create_top() -> Dict[str, "Component"]:
     available_models = list(SUPPORTED_MODELS.keys()) + ["Custom"]
 
     with gr.Row():
@@ -25,7 +25,7 @@ def create_top() -> Tuple["gr.Dropdown", Dict[str, "Component"]]:
         adapter_path = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=5)
         refresh_btn = gr.Button(scale=1)
 
-    with gr.Accordion(label="Advanced config", open=False) as advanced_tab:
+    with gr.Accordion(open=False) as advanced_tab:
         with gr.Row():
             quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none")
             template = gr.Dropdown(choices=list(templates.keys()), value="default")
@@ -44,7 +44,7 @@ def create_top() -> Tuple["gr.Dropdown", Dict[str, "Component"]]:
 
     refresh_btn.click(list_adapters, [model_name, finetuning_type], [adapter_path], queue=False)
 
-    return lang, dict(
+    return dict(
         lang=lang,
         model_name=model_name,
         model_path=model_path,
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 0725f5eb..c7220b08 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -68,7 +68,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         )
     )
 
-    with gr.Accordion(label="Extra config", open=False) as extra_tab:
+    with gr.Accordion(open=False) as extra_tab:
         with gr.Row():
             logging_steps = gr.Slider(value=5, minimum=5, maximum=1000, step=5)
             save_steps = gr.Slider(value=100, minimum=10, maximum=5000, step=10)
@@ -113,7 +113,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         )
     )
 
-    with gr.Accordion(label="Freeze config", open=False) as freeze_tab:
+    with gr.Accordion(open=False) as freeze_tab:
         with gr.Row():
             num_layer_trainable = gr.Slider(value=3, minimum=1, maximum=128, step=1, scale=2)
             name_module_trainable = gr.Textbox(value="all", scale=3)
@@ -125,7 +125,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         )
     )
 
-    with gr.Accordion(label="LoRA config", open=False) as lora_tab:
+    with gr.Accordion(open=False) as lora_tab:
         with gr.Row():
             lora_rank = gr.Slider(value=8, minimum=1, maximum=1024, step=1, scale=1)
             lora_alpha = gr.Slider(value=16, minimum=1, maximum=2048, step=1, scale=1)
@@ -155,7 +155,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         )
     )
 
-    with gr.Accordion(label="RLHF config", open=False) as rlhf_tab:
+    with gr.Accordion(open=False) as rlhf_tab:
         with gr.Row():
             dpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1)
             dpo_ftx = gr.Slider(value=0, minimum=0, maximum=10, step=0.01, scale=1)
@@ -163,7 +163,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     training_stage.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False).then(
         list_adapters,
-        [engine.manager.get_elem_by_name("top.model_name"), engine.manager.get_elem_by_name("top.finetuning_type")],
+        [engine.manager.get_elem_by_id("top.model_name"), engine.manager.get_elem_by_id("top.finetuning_type")],
         [reward_model],
         queue=False,
     ).then(autoset_packing, [training_stage], [packing], queue=False)
@@ -171,7 +171,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
     input_elems.update({dpo_beta, dpo_ftx, reward_model})
     elem_dict.update(dict(rlhf_tab=rlhf_tab, dpo_beta=dpo_beta, dpo_ftx=dpo_ftx, reward_model=reward_model))
 
-    with gr.Accordion(label="GaLore config", open=False) as galore_tab:
+    with gr.Accordion(open=False) as galore_tab:
         with gr.Row():
             use_galore = gr.Checkbox(scale=1)
             galore_rank = gr.Slider(value=16, minimum=1, maximum=1024, step=1, scale=2)
@@ -205,7 +205,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
                 resume_btn = gr.Checkbox(visible=False, interactive=False)
                 process_bar = gr.Slider(visible=False, interactive=False)
 
-            with gr.Box():
+            with gr.Row():
                 output_box = gr.Markdown()
 
         with gr.Column(scale=1):
@@ -214,10 +214,10 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
     input_elems.add(output_dir)
     output_elems = [output_box, process_bar]
 
-    cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems)
+    cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems, concurrency_limit=None)
     start_btn.click(engine.runner.run_train, input_elems, output_elems)
     stop_btn.click(engine.runner.set_abort, queue=False)
-    resume_btn.change(engine.runner.monitor, outputs=output_elems)
+    resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
 
     elem_dict.update(
         dict(
@@ -235,8 +235,8 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
     output_box.change(
         gen_plot,
         [
-            engine.manager.get_elem_by_name("top.model_name"),
-            engine.manager.get_elem_by_name("top.finetuning_type"),
+            engine.manager.get_elem_by_id("top.model_name"),
+            engine.manager.get_elem_by_id("top.finetuning_type"),
             output_dir,
         ],
         loss_viewer,
diff --git a/src/llmtuner/webui/engine.py b/src/llmtuner/webui/engine.py
index fb04ca05..7f76f799 100644
--- a/src/llmtuner/webui/engine.py
+++ b/src/llmtuner/webui/engine.py
@@ -1,6 +1,5 @@
 from typing import Any, Dict, Generator
 
-import gradio as gr
 from gradio.components import Component  # cannot use TYPE_CHECKING here
 
 from .chatter import WebChatModel
@@ -19,44 +18,45 @@ class Engine:
         self.runner = Runner(self.manager, demo_mode)
         self.chatter = WebChatModel(self.manager, demo_mode, lazy_init=(not pure_chat))
 
-    def _form_dict(self, resume_dict: Dict[str, Dict[str, Any]]):
-        return {self.manager.get_elem_by_name(k): gr.update(**v) for k, v in resume_dict.items()}
+    def _update_component(self, input_dict: Dict[str, Dict[str, Any]]) -> Dict["Component", "Component"]:
+        r"""
+        Gets the dict to update the components.
+        """
+        output_dict: Dict["Component", "Component"] = {}
+        for elem_id, elem_attr in input_dict.items():
+            elem = self.manager.get_elem_by_id(elem_id)
+            output_dict[elem] = elem.__class__(**elem_attr)
 
-    def resume(self) -> Generator[Dict[Component, Dict[str, Any]], None, None]:
+        return output_dict
+
+    def resume(self) -> Generator[Dict[Component, Component], None, None]:
         user_config = load_config() if not self.demo_mode else {}
         lang = user_config.get("lang", None) or "en"
 
         init_dict = {"top.lang": {"value": lang}, "infer.chat_box": {"visible": self.chatter.loaded}}
 
         if not self.pure_chat:
-            init_dict["train.dataset"] = {"choices": list_dataset()["choices"]}
-            init_dict["eval.dataset"] = {"choices": list_dataset()["choices"]}
+            init_dict["train.dataset"] = {"choices": list_dataset().choices}
+            init_dict["eval.dataset"] = {"choices": list_dataset().choices}
+            init_dict["train.output_dir"] = {"value": "train_" + get_time()}
+            init_dict["eval.output_dir"] = {"value": "eval_" + get_time()}
 
             if user_config.get("last_model", None):
                 init_dict["top.model_name"] = {"value": user_config["last_model"]}
                 init_dict["top.model_path"] = {"value": get_model_path(user_config["last_model"])}
 
-        yield self._form_dict(init_dict)
+        yield self._update_component(init_dict)
 
-        if not self.pure_chat:
-            if self.runner.alive and not self.demo_mode:
-                yield {elem: gr.update(value=value) for elem, value in self.runner.running_data.items()}
-                if self.runner.do_train:
-                    yield self._form_dict({"train.resume_btn": {"value": True}})
-                else:
-                    yield self._form_dict({"eval.resume_btn": {"value": True}})
+        if self.runner.alive and not self.demo_mode and not self.pure_chat:
+            yield {elem: elem.__class__(value=value) for elem, value in self.runner.running_data.items()}
+            if self.runner.do_train:
+                yield self._update_component({"train.resume_btn": {"value": True}})
             else:
-                yield self._form_dict(
-                    {
-                        "train.output_dir": {"value": "train_" + get_time()},
-                        "eval.output_dir": {"value": "eval_" + get_time()},
-                    }
-                )
+                yield self._update_component({"eval.resume_btn": {"value": True}})
 
-    def change_lang(self, lang: str) -> Dict[Component, Dict[str, Any]]:
+    def change_lang(self, lang: str) -> Dict[Component, Component]:
         return {
-            component: gr.update(**LOCALES[name][lang])
-            for elems in self.manager.all_elems.values()
-            for name, component in elems.items()
-            if name in LOCALES
+            elem: elem.__class__(**LOCALES[elem_name][lang])
+            for elem_name, elem in self.manager.get_elem_iter()
+            if elem_name in LOCALES
         }
diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index a1f4d53f..d943594d 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -14,7 +14,7 @@ from .css import CSS
 from .engine import Engine
 
 
-require_version("gradio>=3.38.0,<4.0.0", 'To fix: pip install "gradio>=3.38.0,<4.0.0"')
+require_version("gradio>4.0.0,<=4.21.0", "To fix: pip install gradio==4.21.0")
 
 
 def create_ui(demo_mode: bool = False) -> gr.Blocks:
@@ -29,23 +29,24 @@ def create_ui(demo_mode: bool = False) -> gr.Blocks:
             )
             gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
 
-        lang, engine.manager.all_elems["top"] = create_top()
+        engine.manager.add_elem_dict("top", create_top())
+        lang: "gr.Dropdown" = engine.manager.get_elem_by_id("top.lang")
 
         with gr.Tab("Train"):
-            engine.manager.all_elems["train"] = create_train_tab(engine)
+            engine.manager.add_elem_dict("train", create_train_tab(engine))
 
         with gr.Tab("Evaluate & Predict"):
-            engine.manager.all_elems["eval"] = create_eval_tab(engine)
+            engine.manager.add_elem_dict("eval", create_eval_tab(engine))
 
         with gr.Tab("Chat"):
-            engine.manager.all_elems["infer"] = create_infer_tab(engine)
+            engine.manager.add_elem_dict("infer", create_infer_tab(engine))
 
         if not demo_mode:
             with gr.Tab("Export"):
-                engine.manager.all_elems["export"] = create_export_tab(engine)
+                engine.manager.add_elem_dict("export", create_export_tab(engine))
 
-        demo.load(engine.resume, outputs=engine.manager.list_elems())
-        lang.change(engine.change_lang, [lang], engine.manager.list_elems(), queue=False)
+        demo.load(engine.resume, outputs=engine.manager.get_elem_list(), concurrency_limit=None)
+        lang.change(engine.change_lang, [lang], engine.manager.get_elem_list(), queue=False)
         lang.input(save_config, inputs=[lang], queue=False)
 
     return demo
@@ -56,19 +57,17 @@ def create_web_demo() -> gr.Blocks:
 
     with gr.Blocks(title="Web Demo", css=CSS) as demo:
         lang = gr.Dropdown(choices=["en", "zh"])
-        engine.manager.all_elems["top"] = dict(lang=lang)
+        engine.manager.add_elem_dict("top", dict(lang=lang))
 
         chat_box, _, _, chat_elems = create_chat_box(engine, visible=True)
-        engine.manager.all_elems["infer"] = dict(chat_box=chat_box, **chat_elems)
+        engine.manager.add_elem_dict("infer", dict(chat_box=chat_box, **chat_elems))
 
-        demo.load(engine.resume, outputs=engine.manager.list_elems())
-        lang.change(engine.change_lang, [lang], engine.manager.list_elems(), queue=False)
+        demo.load(engine.resume, outputs=engine.manager.get_elem_list(), concurrency_limit=None)
+        lang.change(engine.change_lang, [lang], engine.manager.get_elem_list(), queue=False)
         lang.input(save_config, inputs=[lang], queue=False)
 
     return demo
 
 
 if __name__ == "__main__":
-    demo = create_ui()
-    demo.queue()
-    demo.launch(server_name="0.0.0.0", share=False, inbrowser=True)
+    create_ui().queue().launch(server_name="0.0.0.0", server_port=None, share=False, inbrowser=True)
diff --git a/src/llmtuner/webui/manager.py b/src/llmtuner/webui/manager.py
index 51ddf491..266a1f3a 100644
--- a/src/llmtuner/webui/manager.py
+++ b/src/llmtuner/webui/manager.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Dict, List, Set
+from typing import TYPE_CHECKING, Dict, Generator, List, Set, Tuple
 
 
 if TYPE_CHECKING:
@@ -7,27 +7,49 @@ if TYPE_CHECKING:
 
 class Manager:
     def __init__(self) -> None:
-        self.all_elems: Dict[str, Dict[str, "Component"]] = {}
+        self._elem_dicts: Dict[str, Dict[str, "Component"]] = {}
 
-    def get_elem_by_name(self, name: str) -> "Component":
+    def add_elem_dict(self, tab_name: str, elem_dict: Dict[str, "Component"]) -> None:
         r"""
+        Adds a elem dict.
+        """
+        self._elem_dicts[tab_name] = elem_dict
+
+    def get_elem_list(self) -> List["Component"]:
+        r"""
+        Returns the list of all elements.
+        """
+        return [elem for elem_dict in self._elem_dicts.values() for elem in elem_dict.values()]
+
+    def get_elem_iter(self) -> Generator[Tuple[str, "Component"], None, None]:
+        r"""
+        Returns an iterator over all elements with their names.
+        """
+        for elem_dict in self._elem_dicts.values():
+            for elem_name, elem in elem_dict.items():
+                yield elem_name, elem
+
+    def get_elem_by_id(self, elem_id: str) -> "Component":
+        r"""
+        Gets element by id.
+
         Example: top.lang, train.dataset
         """
-        tab_name, elem_name = name.split(".")
-        return self.all_elems[tab_name][elem_name]
+        tab_name, elem_name = elem_id.split(".")
+        return self._elem_dicts[tab_name][elem_name]
 
     def get_base_elems(self) -> Set["Component"]:
+        r"""
+        Gets the base elements that are commonly used.
+        """
         return {
-            self.all_elems["top"]["lang"],
-            self.all_elems["top"]["model_name"],
-            self.all_elems["top"]["model_path"],
-            self.all_elems["top"]["adapter_path"],
-            self.all_elems["top"]["finetuning_type"],
-            self.all_elems["top"]["quantization_bit"],
-            self.all_elems["top"]["template"],
-            self.all_elems["top"]["rope_scaling"],
-            self.all_elems["top"]["booster"],
+            self._elem_dicts["top"]["lang"],
+            self._elem_dicts["top"]["model_name"],
+            self._elem_dicts["top"]["model_path"],
+            self._elem_dicts["top"]["finetuning_type"],
+            self._elem_dicts["top"]["adapter_path"],
+            self._elem_dicts["top"]["quantization_bit"],
+            self._elem_dicts["top"]["template"],
+            self._elem_dicts["top"]["rope_scaling"],
+            self._elem_dicts["top"]["booster"],
         }
-
-    def list_elems(self) -> List["Component"]:
-        return [elem for elems in self.all_elems.values() for elem in elems.values()]
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 0cf50f6a..753225af 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -48,8 +48,8 @@ class Runner:
     def set_abort(self) -> None:
         self.aborted = True
 
-    def _initialize(self, data: Dict[Component, Any], do_train: bool, from_preview: bool) -> str:
-        get = lambda name: data[self.manager.get_elem_by_name(name)]
+    def _initialize(self, data: Dict["Component", Any], do_train: bool, from_preview: bool) -> str:
+        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
         lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path")
         dataset = get("train.dataset") if do_train else get("eval.dataset")
 
@@ -95,8 +95,8 @@ class Runner:
         else:
             return finish_info
 
-    def _parse_train_args(self, data: Dict[Component, Any]) -> Dict[str, Any]:
-        get = lambda name: data[self.manager.get_elem_by_name(name)]
+    def _parse_train_args(self, data: Dict["Component", Any]) -> Dict[str, Any]:
+        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
         user_config = load_config()
 
         if get("top.adapter_path"):
@@ -196,8 +196,8 @@ class Runner:
 
         return args
 
-    def _parse_eval_args(self, data: Dict[Component, Any]) -> Dict[str, Any]:
-        get = lambda name: data[self.manager.get_elem_by_name(name)]
+    def _parse_eval_args(self, data: Dict["Component", Any]) -> Dict[str, Any]:
+        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
         user_config = load_config()
 
         if get("top.adapter_path"):
@@ -232,6 +232,7 @@ class Runner:
             temperature=get("eval.temperature"),
             output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("eval.output_dir")),
         )
+        args["disable_tqdm"] = True
 
         if get("eval.predict"):
             args["do_predict"] = True
@@ -240,22 +241,20 @@ class Runner:
 
         return args
 
-    def _preview(
-        self, data: Dict[Component, Any], do_train: bool
-    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+    def _preview(self, data: Dict["Component", Any], do_train: bool) -> Generator[Tuple[str, "gr.Slider"], None, None]:
         error = self._initialize(data, do_train, from_preview=True)
         if error:
             gr.Warning(error)
-            yield error, gr.update(visible=False)
+            yield error, gr.Slider(visible=False)
         else:
             args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
-            yield gen_cmd(args), gr.update(visible=False)
+            yield gen_cmd(args), gr.Slider(visible=False)
 
-    def _launch(self, data: Dict[Component, Any], do_train: bool) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+    def _launch(self, data: Dict["Component", Any], do_train: bool) -> Generator[Tuple[str, "gr.Slider"], None, None]:
         error = self._initialize(data, do_train, from_preview=False)
         if error:
             gr.Warning(error)
-            yield error, gr.update(visible=False)
+            yield error, gr.Slider(visible=False)
         else:
             args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
             run_kwargs = dict(args=args, callbacks=[self.trainer_callback])
@@ -264,20 +263,20 @@ class Runner:
             self.thread.start()
             yield from self.monitor()
 
-    def preview_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+    def preview_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, gr.Slider], None, None]:
         yield from self._preview(data, do_train=True)
 
-    def preview_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+    def preview_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, gr.Slider], None, None]:
         yield from self._preview(data, do_train=False)
 
-    def run_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+    def run_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, gr.Slider], None, None]:
         yield from self._launch(data, do_train=True)
 
-    def run_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
+    def run_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, gr.Slider], None, None]:
         yield from self._launch(data, do_train=False)
 
-    def monitor(self) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
-        get = lambda name: self.running_data[self.manager.get_elem_by_name(name)]
+    def monitor(self) -> Generator[Tuple[str, "gr.Slider"], None, None]:
+        get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)]
         self.running = True
         lang = get("top.lang")
         output_dir = get_save_dir(
@@ -286,13 +285,14 @@ class Runner:
             get("{}.output_dir".format("train" if self.do_train else "eval")),
         )
 
-        while self.thread.is_alive():
-            time.sleep(2)
+        while self.thread is not None and self.thread.is_alive():
             if self.aborted:
-                yield ALERTS["info_aborting"][lang], gr.update(visible=False)
+                yield ALERTS["info_aborting"][lang], gr.Slider(visible=False)
             else:
                 yield self.logger_handler.log, update_process_bar(self.trainer_callback)
 
+            time.sleep(2)
+
         if self.do_train:
             if os.path.exists(os.path.join(output_dir, TRAINING_ARGS_NAME)):
                 finish_info = ALERTS["info_finished"][lang]
@@ -304,4 +304,4 @@ class Runner:
             else:
                 finish_info = ALERTS["err_failed"][lang]
 
-        yield self._finalize(lang, finish_info), gr.update(visible=False)
+        yield self._finalize(lang, finish_info), gr.Slider(visible=False)
diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py
index 05cdd7f6..275b6573 100644
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -19,26 +19,26 @@ if is_matplotlib_available():
     import matplotlib.pyplot as plt
 
 
-def update_process_bar(callback: "LogCallback") -> Dict[str, Any]:
+def update_process_bar(callback: "LogCallback") -> "gr.Slider":
     if not callback.max_steps:
-        return gr.update(visible=False)
+        return gr.Slider(visible=False)
 
     percentage = round(100 * callback.cur_steps / callback.max_steps, 0) if callback.max_steps != 0 else 100.0
     label = "Running {:d}/{:d}: {} < {}".format(
         callback.cur_steps, callback.max_steps, callback.elapsed_time, callback.remaining_time
     )
-    return gr.update(label=label, value=percentage, visible=True)
+    return gr.Slider(label=label, value=percentage, visible=True)
 
 
 def get_time() -> str:
-    return datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    return datetime.now().strftime(r"%Y-%m-%d-%H-%M-%S")
 
 
-def can_quantize(finetuning_type: str) -> Dict[str, Any]:
+def can_quantize(finetuning_type: str) -> "gr.Dropdown":
     if finetuning_type != "lora":
-        return gr.update(value="None", interactive=False)
+        return gr.Dropdown(value="None", interactive=False)
     else:
-        return gr.update(interactive=True)
+        return gr.Dropdown(interactive=True)
 
 
 def check_json_schema(text: str, lang: str) -> None:
@@ -48,8 +48,8 @@ def check_json_schema(text: str, lang: str) -> None:
             assert isinstance(tools, list)
             for tool in tools:
                 if "name" not in tool:
-                    raise ValueError("Name not found.")
-    except ValueError:
+                    raise NotImplementedError("Name not found.")
+    except NotImplementedError:
         gr.Warning(ALERTS["err_tool_name"][lang])
     except Exception:
         gr.Warning(ALERTS["err_json_schema"][lang])
diff --git a/src/train_web.py b/src/train_web.py
index 3f7855c0..8327f4dd 100644
--- a/src/train_web.py
+++ b/src/train_web.py
@@ -2,9 +2,7 @@ from llmtuner import create_ui
 
 
 def main():
-    demo = create_ui()
-    demo.queue()
-    demo.launch(server_name="0.0.0.0", share=False, inbrowser=True)
+    create_ui().queue().launch(server_name="0.0.0.0", server_port=None, share=False, inbrowser=True)
 
 
 if __name__ == "__main__":
diff --git a/src/web_demo.py b/src/web_demo.py
index 17d21968..3b57ee73 100644
--- a/src/web_demo.py
+++ b/src/web_demo.py
@@ -2,9 +2,7 @@ from llmtuner import create_web_demo
 
 
 def main():
-    demo = create_web_demo()
-    demo.queue()
-    demo.launch(server_name="0.0.0.0", share=False, inbrowser=True)
+    create_web_demo().queue().launch(server_name="0.0.0.0", server_port=None, share=False, inbrowser=True)
 
 
 if __name__ == "__main__":

From 6198121923db142195f302a0e11e223e0854b932 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 30 Mar 2024 23:09:12 +0800
Subject: [PATCH 009/341] support save args in webui #2807 #3046

some ideas are borrowed from @marko1616


Former-commit-id: b5a062aa2d4a37670007e8b3dae5b6f5b7ffb15c
---
 src/llmtuner/extras/misc.py            |  1 +
 src/llmtuner/webui/common.py           | 21 ++++++
 src/llmtuner/webui/components/eval.py  |  4 +-
 src/llmtuner/webui/components/train.py | 66 +++++++++++++------
 src/llmtuner/webui/engine.py           |  5 +-
 src/llmtuner/webui/interface.py        | 18 ++----
 src/llmtuner/webui/locales.py          | 88 ++++++++++++++++++++++----
 src/llmtuner/webui/manager.py          | 46 ++++++++------
 src/llmtuner/webui/runner.py           | 50 +++++++++++----
 9 files changed, 219 insertions(+), 80 deletions(-)

diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index 85761f1d..c7b687e9 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -66,6 +66,7 @@ def check_dependencies() -> None:
         require_version("accelerate>=0.27.2", "To fix: pip install accelerate>=0.27.2")
         require_version("peft>=0.10.0", "To fix: pip install peft>=0.10.0")
         require_version("trl>=0.8.1", "To fix: pip install trl>=0.8.1")
+        require_version("gradio>4.0.0,<=4.21.0", "To fix: pip install gradio==4.21.0")
 
 
 def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
diff --git a/src/llmtuner/webui/common.py b/src/llmtuner/webui/common.py
index 67e6ff2a..798e6408 100644
--- a/src/llmtuner/webui/common.py
+++ b/src/llmtuner/webui/common.py
@@ -20,6 +20,7 @@ from ..extras.misc import use_modelscope
 
 ADAPTER_NAMES = {WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME}
 DEFAULT_CACHE_DIR = "cache"
+DEFAULT_CONFIG_DIR = "config"
 DEFAULT_DATA_DIR = "data"
 DEFAULT_SAVE_DIR = "saves"
 USER_CONFIG = "user.config"
@@ -33,6 +34,10 @@ def get_config_path() -> os.PathLike:
     return os.path.join(DEFAULT_CACHE_DIR, USER_CONFIG)
 
 
+def get_save_path(config_path: str) -> os.PathLike:
+    return os.path.join(DEFAULT_CONFIG_DIR, config_path)
+
+
 def load_config() -> Dict[str, Any]:
     try:
         with open(get_config_path(), "r", encoding="utf-8") as f:
@@ -52,6 +57,22 @@ def save_config(lang: str, model_name: Optional[str] = None, model_path: Optiona
         json.dump(user_config, f, indent=2, ensure_ascii=False)
 
 
+def load_args(config_path: str) -> Optional[Dict[str, Any]]:
+    try:
+        with open(get_save_path(config_path), "r", encoding="utf-8") as f:
+            return json.load(f)
+    except Exception:
+        return None
+
+
+def save_args(config_path: str, config_dict: Dict[str, Any]) -> str:
+    os.makedirs(DEFAULT_CONFIG_DIR, exist_ok=True)
+    with open(get_save_path(config_path), "w", encoding="utf-8") as f:
+        json.dump(config_dict, f, indent=2, ensure_ascii=False)
+
+    return str(get_save_path(config_path))
+
+
 def get_model_path(model_name: str) -> str:
     user_config = load_config()
     path_dict: Dict[DownloadSource, str] = SUPPORTED_MODELS.get(model_name, defaultdict(str))
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index 452b06fb..4d2fe5c0 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -46,8 +46,8 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Row():
         cmd_preview_btn = gr.Button()
-        start_btn = gr.Button()
-        stop_btn = gr.Button()
+        start_btn = gr.Button(variant="primary")
+        stop_btn = gr.Button(variant="stop")
 
     with gr.Row():
         resume_btn = gr.Checkbox(visible=False, interactive=False, value=False)
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index c7220b08..52c8fdb6 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -27,8 +27,6 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         dataset = gr.Dropdown(multiselect=True, scale=4)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
-    dataset_dir.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False)
-
     input_elems.update({training_stage, dataset_dir, dataset})
     elem_dict.update(dict(training_stage=training_stage, dataset_dir=dataset_dir, dataset=dataset, **preview_elems))
 
@@ -127,19 +125,30 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as lora_tab:
         with gr.Row():
-            lora_rank = gr.Slider(value=8, minimum=1, maximum=1024, step=1, scale=1)
-            lora_alpha = gr.Slider(value=16, minimum=1, maximum=2048, step=1, scale=1)
-            lora_dropout = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1)
-            lora_target = gr.Textbox(scale=2)
+            lora_rank = gr.Slider(value=8, minimum=1, maximum=1024, step=1)
+            lora_alpha = gr.Slider(value=16, minimum=1, maximum=2048, step=1)
+            lora_dropout = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01)
+            loraplus_lr_ratio = gr.Slider(value=0, minimum=0, maximum=64, step=0.01)
+            create_new_adapter = gr.Checkbox()
 
         with gr.Row():
             use_rslora = gr.Checkbox(scale=1)
             use_dora = gr.Checkbox(scale=1)
-            create_new_adapter = gr.Checkbox(scale=1)
+            lora_target = gr.Textbox(scale=2)
             additional_target = gr.Textbox(scale=2)
 
     input_elems.update(
-        {lora_rank, lora_alpha, lora_dropout, lora_target, use_rslora, use_dora, create_new_adapter, additional_target}
+        {
+            lora_rank,
+            lora_alpha,
+            lora_dropout,
+            loraplus_lr_ratio,
+            create_new_adapter,
+            use_rslora,
+            use_dora,
+            lora_target,
+            additional_target,
+        }
     )
     elem_dict.update(
         dict(
@@ -147,10 +156,11 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             lora_rank=lora_rank,
             lora_alpha=lora_alpha,
             lora_dropout=lora_dropout,
-            lora_target=lora_target,
+            loraplus_lr_ratio=loraplus_lr_ratio,
+            create_new_adapter=create_new_adapter,
             use_rslora=use_rslora,
             use_dora=use_dora,
-            create_new_adapter=create_new_adapter,
+            lora_target=lora_target,
             additional_target=additional_target,
         )
     )
@@ -161,13 +171,6 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             dpo_ftx = gr.Slider(value=0, minimum=0, maximum=10, step=0.01, scale=1)
             reward_model = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=2)
 
-    training_stage.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False).then(
-        list_adapters,
-        [engine.manager.get_elem_by_id("top.model_name"), engine.manager.get_elem_by_id("top.finetuning_type")],
-        [reward_model],
-        queue=False,
-    ).then(autoset_packing, [training_stage], [packing], queue=False)
-
     input_elems.update({dpo_beta, dpo_ftx, reward_model})
     elem_dict.update(dict(rlhf_tab=rlhf_tab, dpo_beta=dpo_beta, dpo_ftx=dpo_ftx, reward_model=reward_model))
 
@@ -177,7 +180,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             galore_rank = gr.Slider(value=16, minimum=1, maximum=1024, step=1, scale=2)
             galore_update_interval = gr.Slider(value=200, minimum=1, maximum=1024, step=1, scale=2)
             galore_scale = gr.Slider(value=0.25, minimum=0, maximum=1, step=0.01, scale=2)
-            galore_target = gr.Textbox(value="mlp,attn", scale=3)
+            galore_target = gr.Textbox(value="all", scale=3)
 
     input_elems.update({use_galore, galore_rank, galore_update_interval, galore_scale, galore_target})
     elem_dict.update(
@@ -193,13 +196,16 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Row():
         cmd_preview_btn = gr.Button()
-        start_btn = gr.Button()
-        stop_btn = gr.Button()
+        arg_save_btn = gr.Button()
+        arg_load_btn = gr.Button()
+        start_btn = gr.Button(variant="primary")
+        stop_btn = gr.Button(variant="stop")
 
     with gr.Row():
         with gr.Column(scale=3):
             with gr.Row():
                 output_dir = gr.Textbox()
+                config_path = gr.Textbox()
 
             with gr.Row():
                 resume_btn = gr.Checkbox(visible=False, interactive=False)
@@ -211,20 +217,38 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         with gr.Column(scale=1):
             loss_viewer = gr.Plot()
 
-    input_elems.add(output_dir)
+    input_elems.update({output_dir, config_path})
     output_elems = [output_box, process_bar]
 
     cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems, concurrency_limit=None)
+    arg_save_btn.click(engine.runner.save_args, input_elems, output_elems, concurrency_limit=None)
+    arg_load_btn.click(
+        engine.runner.load_args,
+        [engine.manager.get_elem_by_id("top.lang"), config_path],
+        list(input_elems),
+        concurrency_limit=None,
+    )
     start_btn.click(engine.runner.run_train, input_elems, output_elems)
     stop_btn.click(engine.runner.set_abort, queue=False)
     resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
 
+    dataset_dir.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False)
+    training_stage.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False).then(
+        list_adapters,
+        [engine.manager.get_elem_by_id("top.model_name"), engine.manager.get_elem_by_id("top.finetuning_type")],
+        [reward_model],
+        queue=False,
+    ).then(autoset_packing, [training_stage], [packing], queue=False)
+
     elem_dict.update(
         dict(
             cmd_preview_btn=cmd_preview_btn,
+            arg_save_btn=arg_save_btn,
+            arg_load_btn=arg_load_btn,
             start_btn=start_btn,
             stop_btn=stop_btn,
             output_dir=output_dir,
+            config_path=config_path,
             resume_btn=resume_btn,
             process_bar=process_bar,
             output_box=output_box,
diff --git a/src/llmtuner/webui/engine.py b/src/llmtuner/webui/engine.py
index 7f76f799..0ee7f047 100644
--- a/src/llmtuner/webui/engine.py
+++ b/src/llmtuner/webui/engine.py
@@ -38,8 +38,9 @@ class Engine:
         if not self.pure_chat:
             init_dict["train.dataset"] = {"choices": list_dataset().choices}
             init_dict["eval.dataset"] = {"choices": list_dataset().choices}
-            init_dict["train.output_dir"] = {"value": "train_" + get_time()}
-            init_dict["eval.output_dir"] = {"value": "eval_" + get_time()}
+            init_dict["train.output_dir"] = {"value": "train_{}".format(get_time())}
+            init_dict["train.config_path"] = {"value": "{}.json".format(get_time())}
+            init_dict["eval.output_dir"] = {"value": "eval_{}".format(get_time())}
 
             if user_config.get("last_model", None):
                 init_dict["top.model_name"] = {"value": user_config["last_model"]}
diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index d943594d..f89d3ca5 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -1,5 +1,4 @@
 import gradio as gr
-from transformers.utils.versions import require_version
 
 from .common import save_config
 from .components import (
@@ -14,9 +13,6 @@ from .css import CSS
 from .engine import Engine
 
 
-require_version("gradio>4.0.0,<=4.21.0", "To fix: pip install gradio==4.21.0")
-
-
 def create_ui(demo_mode: bool = False) -> gr.Blocks:
     engine = Engine(demo_mode=demo_mode, pure_chat=False)
 
@@ -29,21 +25,21 @@ def create_ui(demo_mode: bool = False) -> gr.Blocks:
             )
             gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
 
-        engine.manager.add_elem_dict("top", create_top())
+        engine.manager.add_elems("top", create_top())
         lang: "gr.Dropdown" = engine.manager.get_elem_by_id("top.lang")
 
         with gr.Tab("Train"):
-            engine.manager.add_elem_dict("train", create_train_tab(engine))
+            engine.manager.add_elems("train", create_train_tab(engine))
 
         with gr.Tab("Evaluate & Predict"):
-            engine.manager.add_elem_dict("eval", create_eval_tab(engine))
+            engine.manager.add_elems("eval", create_eval_tab(engine))
 
         with gr.Tab("Chat"):
-            engine.manager.add_elem_dict("infer", create_infer_tab(engine))
+            engine.manager.add_elems("infer", create_infer_tab(engine))
 
         if not demo_mode:
             with gr.Tab("Export"):
-                engine.manager.add_elem_dict("export", create_export_tab(engine))
+                engine.manager.add_elems("export", create_export_tab(engine))
 
         demo.load(engine.resume, outputs=engine.manager.get_elem_list(), concurrency_limit=None)
         lang.change(engine.change_lang, [lang], engine.manager.get_elem_list(), queue=False)
@@ -57,10 +53,10 @@ def create_web_demo() -> gr.Blocks:
 
     with gr.Blocks(title="Web Demo", css=CSS) as demo:
         lang = gr.Dropdown(choices=["en", "zh"])
-        engine.manager.add_elem_dict("top", dict(lang=lang))
+        engine.manager.add_elems("top", dict(lang=lang))
 
         chat_box, _, _, chat_elems = create_chat_box(engine, visible=True)
-        engine.manager.add_elem_dict("infer", dict(chat_box=chat_box, **chat_elems))
+        engine.manager.add_elems("infer", dict(chat_box=chat_box, **chat_elems))
 
         demo.load(engine.resume, outputs=engine.manager.get_elem_list(), concurrency_limit=None)
         lang.change(engine.change_lang, [lang], engine.manager.get_elem_list(), queue=False)
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index 4f329e8e..f6d6d421 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -628,18 +628,32 @@ LOCALES = {
             "info": "LoRA 权重随机丢弃的概率。",
         },
     },
-    "lora_target": {
+    "loraplus_lr_ratio": {
         "en": {
-            "label": "LoRA modules (optional)",
-            "info": "Name(s) of modules to apply LoRA. Use commas to separate multiple modules.",
+            "label": "LoRA+ LR ratio",
+            "info": "The LR ratio of the B matrices in LoRA.",
         },
         "ru": {
-            "label": "Модули LoRA (опционально)",
-            "info": "Имена модулей для применения LoRA. Используйте запятые для разделения нескольких модулей.",
+            "label": "LoRA+ LR коэффициент",
+            "info": "Коэффициент LR матриц B в LoRA.",
         },
         "zh": {
-            "label": "LoRA 作用模块（非必填）",
-            "info": "应用 LoRA 的模块名称。使用英文逗号分隔多个名称。",
+            "label": "LoRA+ 学习率比例",
+            "info": "LoRA+ 中 B 矩阵的学习率倍数。",
+        },
+    },
+    "create_new_adapter": {
+        "en": {
+            "label": "Create new adapter",
+            "info": "Create a new adapter with randomly initialized weight upon the existing one.",
+        },
+        "ru": {
+            "label": "Создать новый адаптер",
+            "info": "Создать новый адаптер с случайной инициализацией веса на основе существующего.",
+        },
+        "zh": {
+            "label": "新建适配器",
+            "info": "在现有的适配器上创建一个随机初始化后的新适配器。",
         },
     },
     "use_rslora": {
@@ -670,18 +684,18 @@ LOCALES = {
             "info": "使用权重分解的 LoRA。",
         },
     },
-    "create_new_adapter": {
+    "lora_target": {
         "en": {
-            "label": "Create new adapter",
-            "info": "Create a new adapter with randomly initialized weight upon the existing one.",
+            "label": "LoRA modules (optional)",
+            "info": "Name(s) of modules to apply LoRA. Use commas to separate multiple modules.",
         },
         "ru": {
-            "label": "Создать новый адаптер",
-            "info": "Создать новый адаптер с случайной инициализацией веса на основе существующего.",
+            "label": "Модули LoRA (опционально)",
+            "info": "Имена модулей для применения LoRA. Используйте запятые для разделения нескольких модулей.",
         },
         "zh": {
-            "label": "新建适配器",
-            "info": "在现有的适配器上创建一个随机初始化后的新适配器。",
+            "label": "LoRA 作用模块（非必填）",
+            "info": "应用 LoRA 的模块名称。使用英文逗号分隔多个名称。",
         },
     },
     "additional_target": {
@@ -849,6 +863,28 @@ LOCALES = {
             "value": "预览命令",
         },
     },
+    "arg_save_btn": {
+        "en": {
+            "value": "Save arguments",
+        },
+        "ru": {
+            "value": "Сохранить аргументы",
+        },
+        "zh": {
+            "value": "保存训练参数",
+        },
+    },
+    "arg_load_btn": {
+        "en": {
+            "value": "Load arguments",
+        },
+        "ru": {
+            "value": "Загрузить аргументы",
+        },
+        "zh": {
+            "value": "载入训练参数",
+        },
+    },
     "start_btn": {
         "en": {
             "value": "Start",
@@ -885,6 +921,20 @@ LOCALES = {
             "info": "保存结果的路径。",
         },
     },
+    "config_path": {
+        "en": {
+            "label": "Config path",
+            "info": "Path to config saving arguments.",
+        },
+        "ru": {
+            "label": "Путь к конфигурации",
+            "info": "Путь для сохранения аргументов конфигурации.",
+        },
+        "zh": {
+            "label": "配置路径",
+            "info": "保存训练参数的配置文件路径。",
+        },
+    },
     "output_box": {
         "en": {
             "value": "Ready.",
@@ -1236,6 +1286,11 @@ ALERTS = {
         "ru": "Неверная схема JSON.",
         "zh": "Json 格式错误。",
     },
+    "err_config_not_found": {
+        "en": "Config file is not found.",
+        "ru": "Файл конфигурации не найден.",
+        "zh": "未找到配置文件。",
+    },
     "warn_no_cuda": {
         "en": "CUDA environment was not detected.",
         "ru": "Среда CUDA не обнаружена.",
@@ -1256,6 +1311,11 @@ ALERTS = {
         "ru": "Завершено.",
         "zh": "训练完毕。",
     },
+    "info_config_saved": {
+        "en": "Arguments have been saved at: ",
+        "ru": "Аргументы были сохранены по адресу: ",
+        "zh": "训练参数已保存至：",
+    },
     "info_loading": {
         "en": "Loading model...",
         "ru": "Загрузка модели...",
diff --git a/src/llmtuner/webui/manager.py b/src/llmtuner/webui/manager.py
index 266a1f3a..a67c0995 100644
--- a/src/llmtuner/webui/manager.py
+++ b/src/llmtuner/webui/manager.py
@@ -7,27 +7,30 @@ if TYPE_CHECKING:
 
 class Manager:
     def __init__(self) -> None:
-        self._elem_dicts: Dict[str, Dict[str, "Component"]] = {}
+        self._id_to_elem: Dict[str, "Component"] = {}
+        self._elem_to_id: Dict["Component", str] = {}
 
-    def add_elem_dict(self, tab_name: str, elem_dict: Dict[str, "Component"]) -> None:
+    def add_elems(self, tab_name: str, elem_dict: Dict[str, "Component"]) -> None:
         r"""
-        Adds a elem dict.
+        Adds elements to manager.
         """
-        self._elem_dicts[tab_name] = elem_dict
+        for elem_name, elem in elem_dict.items():
+            elem_id = "{}.{}".format(tab_name, elem_name)
+            self._id_to_elem[elem_id] = elem
+            self._elem_to_id[elem] = elem_id
 
     def get_elem_list(self) -> List["Component"]:
         r"""
         Returns the list of all elements.
         """
-        return [elem for elem_dict in self._elem_dicts.values() for elem in elem_dict.values()]
+        return list(self._id_to_elem.values())
 
     def get_elem_iter(self) -> Generator[Tuple[str, "Component"], None, None]:
         r"""
         Returns an iterator over all elements with their names.
         """
-        for elem_dict in self._elem_dicts.values():
-            for elem_name, elem in elem_dict.items():
-                yield elem_name, elem
+        for elem_id, elem in self._id_to_elem.items():
+            yield elem_id.split(".")[-1], elem
 
     def get_elem_by_id(self, elem_id: str) -> "Component":
         r"""
@@ -35,21 +38,26 @@ class Manager:
 
         Example: top.lang, train.dataset
         """
-        tab_name, elem_name = elem_id.split(".")
-        return self._elem_dicts[tab_name][elem_name]
+        return self._id_to_elem[elem_id]
+
+    def get_id_by_elem(self, elem: "Component") -> str:
+        r"""
+        Gets id by element.
+        """
+        return self._elem_to_id[elem]
 
     def get_base_elems(self) -> Set["Component"]:
         r"""
         Gets the base elements that are commonly used.
         """
         return {
-            self._elem_dicts["top"]["lang"],
-            self._elem_dicts["top"]["model_name"],
-            self._elem_dicts["top"]["model_path"],
-            self._elem_dicts["top"]["finetuning_type"],
-            self._elem_dicts["top"]["adapter_path"],
-            self._elem_dicts["top"]["quantization_bit"],
-            self._elem_dicts["top"]["template"],
-            self._elem_dicts["top"]["rope_scaling"],
-            self._elem_dicts["top"]["booster"],
+            self._id_to_elem["top.lang"],
+            self._id_to_elem["top.model_name"],
+            self._id_to_elem["top.model_path"],
+            self._id_to_elem["top.finetuning_type"],
+            self._id_to_elem["top.adapter_path"],
+            self._id_to_elem["top.quantization_bit"],
+            self._id_to_elem["top.template"],
+            self._id_to_elem["top.rope_scaling"],
+            self._id_to_elem["top.booster"],
         }
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 753225af..ab646051 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -15,7 +15,7 @@ from ..extras.constants import TRAINING_STAGES
 from ..extras.logging import LoggerHandler
 from ..extras.misc import get_device_count, torch_gc
 from ..train import run_exp
-from .common import get_module, get_save_dir, load_config
+from .common import get_module, get_save_dir, load_args, load_config, save_args
 from .locales import ALERTS
 from .utils import gen_cmd, get_eval_results, update_process_bar
 
@@ -150,23 +150,21 @@ class Runner:
         args["disable_tqdm"] = True
 
         if args["finetuning_type"] == "freeze":
-            args["num_layer_trainable"] = int(get("train.num_layer_trainable"))
+            args["num_layer_trainable"] = get("train.num_layer_trainable")
             args["name_module_trainable"] = get("train.name_module_trainable")
         elif args["finetuning_type"] == "lora":
-            args["lora_rank"] = int(get("train.lora_rank"))
-            args["lora_alpha"] = int(get("train.lora_alpha"))
-            args["lora_dropout"] = float(get("train.lora_dropout"))
-            args["lora_target"] = get("train.lora_target") or get_module(get("top.model_name"))
+            args["lora_rank"] = get("train.lora_rank")
+            args["lora_alpha"] = get("train.lora_alpha")
+            args["lora_dropout"] = get("train.lora_dropout")
+            args["loraplus_lr_ratio"] = get("train.loraplus_lr_ratio") or None
+            args["create_new_adapter"] = get("train.create_new_adapter")
             args["use_rslora"] = get("train.use_rslora")
             args["use_dora"] = get("train.use_dora")
+            args["lora_target"] = get("train.lora_target") or get_module(get("top.model_name"))
             args["additional_target"] = get("train.additional_target") or None
-            if args["stage"] in ["rm", "ppo", "dpo"]:
-                args["create_new_adapter"] = args["quantization_bit"] is None
-            else:
-                args["create_new_adapter"] = get("train.create_new_adapter")
 
             if args["use_llama_pro"]:
-                args["num_layer_trainable"] = int(get("train.num_layer_trainable"))
+                args["num_layer_trainable"] = get("train.num_layer_trainable")
 
         if args["stage"] == "ppo":
             args["reward_model"] = ",".join(
@@ -305,3 +303,33 @@ class Runner:
                 finish_info = ALERTS["err_failed"][lang]
 
         yield self._finalize(lang, finish_info), gr.Slider(visible=False)
+
+    def save_args(self, data: Dict[Component, Any]) -> Tuple[str, "gr.Slider"]:
+        error = self._initialize(data, do_train=True, from_preview=True)
+        if error:
+            gr.Warning(error)
+            return error, gr.Slider(visible=False)
+
+        config_dict: Dict[str, Any] = {}
+        lang = data[self.manager.get_elem_by_id("top.lang")]
+        config_path = data[self.manager.get_elem_by_id("train.config_path")]
+        skip_ids = ["top.lang", "top.model_path", "train.output_dir", "train.config_path"]
+        for elem, value in data.items():
+            elem_id = self.manager.get_id_by_elem(elem)
+            if elem_id not in skip_ids:
+                config_dict[elem_id] = value
+
+        save_path = save_args(config_path, config_dict)
+        return ALERTS["info_config_saved"][lang] + save_path, gr.Slider(visible=False)
+
+    def load_args(self, lang: str, config_path: str) -> Dict[Component, Any]:
+        config_dict = load_args(config_path)
+        if config_dict is None:
+            gr.Warning(ALERTS["err_config_not_found"][lang])
+            return {self.manager.get_elem_by_id("top.lang"): lang}
+
+        output_dict: Dict["Component", Any] = {}
+        for elem_id, value in config_dict.items():
+            output_dict[self.manager.get_elem_by_id(elem_id)] = value
+
+        return output_dict

From a6858a36c0aae77ac21c550ec2d7708371652ba1 Mon Sep 17 00:00:00 2001
From: marko1616 <marko1616@outlook.com>
Date: Sat, 30 Mar 2024 23:45:04 +0800
Subject: [PATCH 010/341] Fix Llama model save for full param train

Former-commit-id: ca17b5db4f97c3ec9fe2004877f150e8f51ab4b5
---
 src/llmtuner/model/patcher.py | 9 +++++++++
 src/llmtuner/train/tuner.py   | 8 --------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index cb55f5ed..e3d7539f 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -312,6 +312,15 @@ def patch_config(
 def patch_model(
     model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments", is_trainable: bool
 ) -> None:
+    #Config check and fix
+    gen_config = model.generation_config
+    if not gen_config.do_sample and (
+        (gen_config.temperature is not None and gen_config.temperature != 1.0)
+        or (gen_config.top_p is not None and gen_config.top_p != 1.0)
+        or (gen_config.typical_p is not None and gen_config.typical_p != 1.0)
+    ):
+        gen_config.do_sample = True
+    
     if "GenerationMixin" not in str(model.generate.__func__):
         model.generate = MethodType(PreTrainedModel.generate, model)
 
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index a03955d5..1b8e3cb7 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -64,14 +64,6 @@ def export_model(args: Optional[Dict[str, Any]] = None):
         for param in model.parameters():
             param.data = param.data.to(output_dtype)
 
-    gen_config = model.generation_config  # check and fix generation config
-    if not gen_config.do_sample and (
-        (gen_config.temperature is not None and gen_config.temperature != 1.0)
-        or (gen_config.top_p is not None and gen_config.top_p != 1.0)
-        or (gen_config.typical_p is not None and gen_config.typical_p != 1.0)
-    ):
-        gen_config.do_sample = True
-
     model.save_pretrained(
         save_directory=model_args.export_dir,
         max_shard_size="{}GB".format(model_args.export_size),

From 1f617c6e08027a84e977dc3f1756e05c146d3b2c Mon Sep 17 00:00:00 2001
From: marko1616 <marko1616@outlook.com>
Date: Sat, 30 Mar 2024 23:46:55 +0800
Subject: [PATCH 011/341] fix blank line containing whitespace

Former-commit-id: 7bc3bcc64353d5a1d4870c6a9509b64cff710492
---
 src/llmtuner/model/patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index e3d7539f..03ca0096 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -320,7 +320,7 @@ def patch_model(
         or (gen_config.typical_p is not None and gen_config.typical_p != 1.0)
     ):
         gen_config.do_sample = True
-    
+
     if "GenerationMixin" not in str(model.generate.__func__):
         model.generate = MethodType(PreTrainedModel.generate, model)
 

From 526111a30394074ef3a6c68387c8ae98153fbff1 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 31 Mar 2024 00:10:29 +0800
Subject: [PATCH 012/341] tiny fix

Former-commit-id: ba4a9b3c01e2f7467fbc5be268f47c0d003caa65
---
 src/llmtuner/model/patcher.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 03ca0096..3aa5c3e9 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -312,15 +312,6 @@ def patch_config(
 def patch_model(
     model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments", is_trainable: bool
 ) -> None:
-    #Config check and fix
-    gen_config = model.generation_config
-    if not gen_config.do_sample and (
-        (gen_config.temperature is not None and gen_config.temperature != 1.0)
-        or (gen_config.top_p is not None and gen_config.top_p != 1.0)
-        or (gen_config.typical_p is not None and gen_config.typical_p != 1.0)
-    ):
-        gen_config.do_sample = True
-
     if "GenerationMixin" not in str(model.generate.__func__):
         model.generate = MethodType(PreTrainedModel.generate, model)
 
@@ -328,6 +319,14 @@ def patch_model(
         setattr(model, "lm_head", model.transformer.output_layer)
         setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
 
+    gen_config = model.generation_config  # check and fix generation config
+    if not gen_config.do_sample and (
+        (gen_config.temperature is not None and gen_config.temperature != 1.0)
+        or (gen_config.top_p is not None and gen_config.top_p != 1.0)
+        or (gen_config.typical_p is not None and gen_config.typical_p != 1.0)
+    ):
+        gen_config.do_sample = True
+
     if model_args.resize_vocab:
         _resize_embedding_layer(model, tokenizer)
 

From d764cd87368d8bfb8c174bf5e3b2a5c663ce2081 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 31 Mar 2024 18:29:50 +0800
Subject: [PATCH 013/341] support ORPO

Former-commit-id: f44a4c27e2461cdaa1b16865f597a31033c0e6d9
---
 README.md                               |   7 +-
 README_zh.md                            |   7 +-
 data/README.md                          |   6 +
 data/README_zh.md                       |   6 +
 examples/lora_single_gpu/README.md      |   3 +-
 examples/lora_single_gpu/orpo.sh        |  32 +++++
 src/llmtuner/data/__init__.py           |  11 +-
 src/llmtuner/data/collator.py           |  51 ++++++++
 src/llmtuner/data/loader.py             |   4 +-
 src/llmtuner/data/preprocess.py         |  34 +++---
 src/llmtuner/data/utils.py              |   2 +-
 src/llmtuner/extras/callbacks.py        |   1 +
 src/llmtuner/extras/constants.py        |   3 +
 src/llmtuner/hparams/finetuning_args.py |   6 +-
 src/llmtuner/train/dpo/trainer.py       |  31 ++---
 src/llmtuner/train/dpo/workflow.py      |   7 +-
 src/llmtuner/train/orpo/__init__.py     |   4 +
 src/llmtuner/train/orpo/trainer.py      | 150 ++++++++++++++++++++++++
 src/llmtuner/train/orpo/workflow.py     |  68 +++++++++++
 src/llmtuner/train/rm/workflow.py       |   3 +-
 src/llmtuner/train/tuner.py             |   3 +
 src/llmtuner/webui/common.py            |   3 +-
 22 files changed, 395 insertions(+), 47 deletions(-)
 create mode 100644 examples/lora_single_gpu/orpo.sh
 create mode 100644 src/llmtuner/data/collator.py
 create mode 100644 src/llmtuner/train/orpo/__init__.py
 create mode 100644 src/llmtuner/train/orpo/trainer.py
 create mode 100644 src/llmtuner/train/orpo/workflow.py

diff --git a/README.md b/README.md
index af6ef66f..b9059426 100644
--- a/README.md
+++ b/README.md
@@ -68,16 +68,18 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
+
 [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
 
 [24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/fsdp_qlora` for usage.
 
+<details><summary>Full Changelog</summary>
+
 [24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See `examples/extras/loraplus` for usage.
 
 [24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. See `examples/extras/galore` for usage.
 
-<details><summary>Full Changelog</summary>
-
 [24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `--infer_backend vllm` to enjoy **270%** inference speed. (LoRA is not yet supported, merge it first.)
 
 [24/02/28] We supported weight-decomposed LoRA (**[DoRA](https://arxiv.org/abs/2402.09353)**). Try `--use_dora` to activate DoRA training.
@@ -165,6 +167,7 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ
 | Reward Modeling        | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | PPO Training           | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | DPO Training           | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| ORPO Training          | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 
 > [!NOTE]
 > Use `--quantization_bit 4` argument to enable QLoRA.
diff --git a/README_zh.md b/README_zh.md
index d018ee32..5c81be44 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -68,16 +68,18 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
+
 [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看！
 
 [24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/fsdp_qlora`。
 
+<details><summary>展开日志</summary>
+
 [24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 `examples/extras/loraplus`。
 
 [24/03/07] 我们支持了梯度低秩投影（**[GaLore](https://arxiv.org/abs/2403.03507)**）算法。详细用法请参照 `examples/extras/galore`。
 
-<details><summary>展开日志</summary>
-
 [24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `--infer_backend vllm` 来获得 **270%** 的推理速度。（尚不支持 LoRA，请先合并权重。）
 
 [24/02/28] 我们支持了 **[DoRA](https://arxiv.org/abs/2402.09353)** 微调。请使用 `--use_dora` 参数进行 DoRA 微调。
@@ -165,6 +167,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | 奖励模型训练            | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | PPO 训练               | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | DPO 训练               | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| ORPO 训练              | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 
 > [!NOTE]
 > 请使用 `--quantization_bit 4` 参数来启用 QLoRA 训练。
diff --git a/data/README.md b/data/README.md
index fa2c9ee0..2ea0c117 100644
--- a/data/README.md
+++ b/data/README.md
@@ -34,6 +34,8 @@ If you are using a custom dataset, please provide your dataset definition in the
 
 Given above, you can use the custom dataset via specifying `--dataset dataset_name`.
 
+----
+
 Currently we support dataset in **alpaca** or **sharegpt** format, the dataset in alpaca format should follow the below format:
 
 ```json
@@ -84,6 +86,10 @@ For the preference datasets, the `response` column should be a string list whose
 }
 ```
 
+Remember to set `"ranking": true` for the preference datasets.
+
+----
+
 The dataset in sharegpt format should follow the below format:
 
 ```json
diff --git a/data/README_zh.md b/data/README_zh.md
index e0004f4a..b00f81d9 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -34,6 +34,8 @@
 
 添加后可通过指定 `--dataset 数据集名称` 参数使用自定义数据集。
 
+----
+
 该项目目前支持两种格式的数据集：**alpaca** 和 **sharegpt**，其中 alpaca 格式的数据集按照以下方式组织：
 
 ```json
@@ -84,6 +86,10 @@
 }
 ```
 
+添加偏好数据集需要额外指定 `"ranking": true`。
+
+----
+
 而 sharegpt 格式的数据集按照以下方式组织：
 
 ```json
diff --git a/examples/lora_single_gpu/README.md b/examples/lora_single_gpu/README.md
index ae0f4722..151d0784 100644
--- a/examples/lora_single_gpu/README.md
+++ b/examples/lora_single_gpu/README.md
@@ -1,8 +1,9 @@
 Usage:
 
 - `pretrain.sh`: do pre-train (optional)
-- `sft.sh`: do supervised fine-tune
+- `sft.sh`: do supervised fine-tuning
 - `reward.sh`: do reward modeling (must after sft.sh)
 - `ppo.sh`: do PPO training (must after sft.sh and reward.sh)
 - `dpo.sh`: do DPO training (must after sft.sh)
+- `orpo.sh`: do ORPO training
 - `predict.sh`: do predict (must after sft.sh and dpo.sh)
diff --git a/examples/lora_single_gpu/orpo.sh b/examples/lora_single_gpu/orpo.sh
new file mode 100644
index 00000000..77662ecf
--- /dev/null
+++ b/examples/lora_single_gpu/orpo.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+    --stage orpo \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset comparison_gpt4_en \
+    --dataset_dir ../../data \
+    --template default \
+    --finetuning_type lora \
+    --lora_target q_proj,v_proj \
+    --output_dir ../../saves/LLaMA2-7B/lora/orpo \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 1e-5 \
+    --num_train_epochs 1.0 \
+    --max_samples 1000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --fp16
diff --git a/src/llmtuner/data/__init__.py b/src/llmtuner/data/__init__.py
index 80dbf5ff..792e89d9 100644
--- a/src/llmtuner/data/__init__.py
+++ b/src/llmtuner/data/__init__.py
@@ -1,6 +1,15 @@
+from .collator import PairwiseDataCollatorWithPadding
 from .loader import get_dataset
 from .template import Template, get_template_and_fix_tokenizer, templates
 from .utils import Role, split_dataset
 
 
-__all__ = ["get_dataset", "Template", "get_template_and_fix_tokenizer", "templates", "Role", "split_dataset"]
+__all__ = [
+    "PairwiseDataCollatorWithPadding",
+    "get_dataset",
+    "Template",
+    "get_template_and_fix_tokenizer",
+    "templates",
+    "Role",
+    "split_dataset",
+]
diff --git a/src/llmtuner/data/collator.py b/src/llmtuner/data/collator.py
new file mode 100644
index 00000000..5e506546
--- /dev/null
+++ b/src/llmtuner/data/collator.py
@@ -0,0 +1,51 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List, Sequence, Tuple
+
+import torch
+from transformers import DataCollatorForSeq2Seq
+
+
+@dataclass
+class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
+    r"""
+    Data collator for pairwise data.
+    """
+
+    def _pad_labels(self, batch: torch.Tensor, positions: List[Tuple[int, int]]) -> torch.Tensor:
+        r"""
+        Masks out the input ids except for the responses.
+        """
+        padded_labels = []
+        for feature, (prompt_len, answer_len) in zip(batch, positions):
+            if self.tokenizer.padding_side == "left":
+                start, end = feature.size(0) - answer_len, feature.size(0)
+            else:
+                start, end = prompt_len, prompt_len + answer_len
+            padded_tensor = self.label_pad_token_id * torch.ones_like(feature)
+            padded_tensor[start:end] = feature[start:end]
+            padded_labels.append(padded_tensor)
+        return torch.stack(padded_labels, dim=0).contiguous()  # in contiguous memory
+
+    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
+        r"""
+        Pads batched data to the longest sequence in the batch.
+
+        We generate 2 * n examples where the first n examples represent chosen examples and
+        the last n examples represent rejected examples.
+        """
+        concatenated_features = []
+        label_positions = []
+        for key in ("chosen_ids", "rejected_ids"):
+            for feature in features:
+                prompt_len, answer_len = len(feature["prompt_ids"]), len(feature[key])
+                concatenated_features.append(
+                    {
+                        "input_ids": feature["prompt_ids"] + feature[key],
+                        "attention_mask": [1] * (prompt_len + answer_len),
+                    }
+                )
+                label_positions.append((prompt_len, answer_len))
+
+        batch = super().__call__(concatenated_features)
+        batch["labels"] = self._pad_labels(batch["input_ids"], label_positions)
+        return batch
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 935695ad..0ab734e0 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -117,7 +117,6 @@ def get_dataset(
     data_args: "DataArguments",
     training_args: "Seq2SeqTrainingArguments",
     stage: Literal["pt", "sft", "rm", "ppo"],
-    # split: Optional[str] = "train", # TODO: add split
 ) -> Union["Dataset", "IterableDataset"]:
     template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
     if data_args.train_on_prompt and template.efficient_eos:
@@ -138,6 +137,9 @@ def get_dataset(
     with training_args.main_process_first(desc="load dataset"):
         all_datasets = []
         for dataset_attr in get_dataset_list(data_args):
+            if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True):
+                raise ValueError("The dataset is not applicable in the current training stage.")
+
             all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args))
         dataset = merge_dataset(all_datasets, data_args, training_args)
 
diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 7fb0a9b6..b8edfa10 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -23,23 +23,25 @@ def preprocess_pretrain_dataset(
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
     text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
-    if not data_args.packing:
-        return tokenizer(text_examples, add_special_tokens=False, max_length=data_args.cutoff_len)
 
-    tokenized_examples = tokenizer(text_examples, add_special_tokens=False)
-    concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
-    total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
-    block_size = data_args.cutoff_len
-    # we drop the small remainder, and if the total_length < block_size, we exclude this batch
-    total_length = (total_length // block_size) * block_size
-    # split by chunks of cutoff_len
-    result = {
-        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
-        for k, t in concatenated_examples.items()
-    }
-    if data_args.template == "gemma":
-        for i in range(len(result["input_ids"])):
-            result["input_ids"][i][0] = tokenizer.bos_token_id
+    if not data_args.packing:
+        if data_args.template == "gemma":
+            text_examples = [tokenizer.bos_token + example for example in text_examples]
+
+        result = tokenizer(text_examples, add_special_tokens=False, max_length=data_args.cutoff_len)
+    else:
+        tokenized_examples = tokenizer(text_examples, add_special_tokens=False)
+        concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
+        total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
+        block_size = data_args.cutoff_len
+        total_length = (total_length // block_size) * block_size
+        result = {
+            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+            for k, t in concatenated_examples.items()
+        }
+        if data_args.template == "gemma":
+            for i in range(len(result["input_ids"])):
+                result["input_ids"][i][0] = tokenizer.bos_token_id
 
     return result
 
diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py
index c0b6d6c2..83ee0610 100644
--- a/src/llmtuner/data/utils.py
+++ b/src/llmtuner/data/utils.py
@@ -44,7 +44,7 @@ def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
 def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]:
     max_target_len = int(max_len * (target_len / (source_len + target_len)))
     max_target_len = max(max_target_len, reserved_label_len)
-    max_source_len = max_len - max_target_len
+    max_source_len = max_len - min(max_target_len, target_len)
     return max_source_len, max_target_len
 
 
diff --git a/src/llmtuner/extras/callbacks.py b/src/llmtuner/extras/callbacks.py
index 985b0292..6e347c3c 100644
--- a/src/llmtuner/extras/callbacks.py
+++ b/src/llmtuner/extras/callbacks.py
@@ -134,6 +134,7 @@ class LogCallback(TrainerCallback):
             eval_loss=state.log_history[-1].get("eval_loss", None),
             predict_loss=state.log_history[-1].get("predict_loss", None),
             reward=state.log_history[-1].get("reward", None),
+            accuracy=state.log_history[-1].get("rewards/accuracies", None),
             learning_rate=state.log_history[-1].get("learning_rate", None),
             epoch=state.log_history[-1].get("epoch", None),
             percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 12ba8b23..8af8d8e8 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -39,9 +39,12 @@ TRAINING_STAGES = {
     "Reward Modeling": "rm",
     "PPO": "ppo",
     "DPO": "dpo",
+    "ORPO": "orpo",
     "Pre-Training": "pt",
 }
 
+STAGES_USE_PAIR_DATA = ["rm", "dpo", "orpo"]
+
 V_HEAD_WEIGHTS_NAME = "value_head.bin"
 
 V_HEAD_SAFE_WEIGHTS_NAME = "value_head.safetensors"
diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index c1f08334..177a9f8a 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -110,6 +110,10 @@ class RLHFArguments:
         default=0.0,
         metadata={"help": "The supervised fine-tuning loss coefficient in DPO training."},
     )
+    orpo_beta: float = field(
+        default=0.1,
+        metadata={"help": "The beta (lambda) parameter in ORPO loss representing the weight of the SFT loss."},
+    )
     ppo_buffer_size: int = field(
         default=1,
         metadata={"help": "The number of mini-batches to make experience buffer in a PPO optimization step."},
@@ -209,7 +213,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
         default=False,
         metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."},
     )
-    stage: Literal["pt", "sft", "rm", "ppo", "dpo"] = field(
+    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo"] = field(
         default="sft",
         metadata={"help": "Which stage will be performed in training."},
     )
diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index 39e84679..c7e385da 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -74,7 +74,7 @@ class CustomDPOTrainer(DPOTrainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
 
-    def sft_loss(self, chosen_logits: torch.FloatTensor, chosen_labels: torch.LongTensor) -> torch.Tensor:
+    def sft_loss(self, chosen_logits: "torch.FloatTensor", chosen_labels: "torch.LongTensor") -> "torch.Tensor":
         r"""
         Computes supervised cross-entropy loss of given labels under the given logits.
 
@@ -85,8 +85,8 @@ class CustomDPOTrainer(DPOTrainer):
         return -all_logps
 
     def concatenated_forward(
-        self, model: "PreTrainedModel", batch: Dict[str, torch.Tensor]
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
+        self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
+    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
         batch_copied = BatchEncoding({k: v.detach().clone() for k, v in batch.items()})  # avoid error
 
         all_logits = model(
@@ -107,9 +107,9 @@ class CustomDPOTrainer(DPOTrainer):
     def get_batch_loss_metrics(
         self,
         model: "PreTrainedModel",
-        batch: Dict[str, torch.Tensor],
+        batch: Dict[str, "torch.Tensor"],
         train_eval: Literal["train", "eval"] = "train",
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+    ) -> Tuple["torch.Tensor", Dict[str, "torch.Tensor"]]:
         r"""
         Computes the DPO loss and other metrics for the given batch of inputs for train or test.
         """
@@ -142,21 +142,22 @@ class CustomDPOTrainer(DPOTrainer):
             reference_chosen_logps,
             reference_rejected_logps,
         )
+        batch_loss = losses.mean()
         if self.ftx_gamma > 1e-6:
             batch_size = batch["input_ids"].size(0) // 2
             chosen_labels, _ = batch["labels"].split(batch_size, dim=0)
-            losses += self.ftx_gamma * self.sft_loss(policy_chosen_logits, chosen_labels)
+            batch_loss += self.ftx_gamma * self.sft_loss(policy_chosen_logits, chosen_labels).mean()
 
         reward_accuracies = (chosen_rewards > rejected_rewards).float()
 
         prefix = "eval_" if train_eval == "eval" else ""
-        metrics[f"{prefix}rewards/chosen"] = chosen_rewards.cpu().mean()
-        metrics[f"{prefix}rewards/rejected"] = rejected_rewards.cpu().mean()
-        metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.cpu().mean()
-        metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).cpu().mean()
-        metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.detach().cpu().mean()
-        metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.detach().cpu().mean()
-        metrics[f"{prefix}logits/rejected"] = policy_rejected_logits.detach().cpu().mean()
-        metrics[f"{prefix}logits/chosen"] = policy_chosen_logits.detach().cpu().mean()
+        metrics["{}rewards/chosen".format(prefix)] = chosen_rewards.cpu().mean()
+        metrics["{}rewards/rejected".format(prefix)] = rejected_rewards.cpu().mean()
+        metrics["{}rewards/accuracies".format(prefix)] = reward_accuracies.cpu().mean()
+        metrics["{}rewards/margins".format(prefix)] = (chosen_rewards - rejected_rewards).cpu().mean()
+        metrics["{}logps/rejected".format(prefix)] = policy_rejected_logps.detach().cpu().mean()
+        metrics["{}logps/chosen".format(prefix)] = policy_chosen_logps.detach().cpu().mean()
+        metrics["{}logits/rejected".format(prefix)] = policy_rejected_logits.detach().cpu().mean()
+        metrics["{}logits/chosen".format(prefix)] = policy_chosen_logits.detach().cpu().mean()
 
-        return losses.mean(), metrics
+        return batch_loss, metrics
diff --git a/src/llmtuner/train/dpo/workflow.py b/src/llmtuner/train/dpo/workflow.py
index 851de982..4a1e867e 100644
--- a/src/llmtuner/train/dpo/workflow.py
+++ b/src/llmtuner/train/dpo/workflow.py
@@ -2,13 +2,12 @@
 
 from typing import TYPE_CHECKING, List, Optional
 
-from ...data import get_dataset, split_dataset
+from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset
 from ...extras.constants import IGNORE_INDEX
 from ...extras.ploting import plot_loss
 from ...hparams import ModelArguments
 from ...model import load_model, load_tokenizer
 from ..utils import create_modelcard_and_push, create_ref_model
-from .collator import DPODataCollatorWithPadding
 from .trainer import CustomDPOTrainer
 
 
@@ -29,7 +28,7 @@ def run_dpo(
     dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="rm")
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
-    data_collator = DPODataCollatorWithPadding(
+    data_collator = PairwiseDataCollatorWithPadding(
         tokenizer=tokenizer,
         pad_to_multiple_of=8,
         label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
@@ -64,7 +63,7 @@ def run_dpo(
         trainer.save_metrics("train", train_result.metrics)
         trainer.save_state()
         if trainer.is_world_process_zero() and finetuning_args.plot_loss:
-            plot_loss(training_args.output_dir, keys=["loss", "eval_loss"])
+            plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "accuracy"])
 
     # Evaluation
     if training_args.do_eval:
diff --git a/src/llmtuner/train/orpo/__init__.py b/src/llmtuner/train/orpo/__init__.py
new file mode 100644
index 00000000..e79d5ea3
--- /dev/null
+++ b/src/llmtuner/train/orpo/__init__.py
@@ -0,0 +1,4 @@
+from .workflow import run_orpo
+
+
+__all__ = ["run_orpo"]
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
new file mode 100644
index 00000000..291351e4
--- /dev/null
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -0,0 +1,150 @@
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from transformers import Trainer
+from trl import DPOTrainer
+from trl.trainer.utils import disable_dropout_in_model
+
+from ...extras.constants import IGNORE_INDEX
+from ..utils import create_custom_optimzer, create_custom_scheduler
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+    from ...hparams import FinetuningArguments
+
+
+class CustomORPOTrainer(DPOTrainer):
+    def __init__(
+        self,
+        model: Union["PreTrainedModel", "torch.nn.Module"],
+        finetuning_args: "FinetuningArguments",
+        disable_dropout: bool = True,
+        **kwargs,
+    ):
+        if disable_dropout:
+            disable_dropout_in_model(model)
+
+        self.finetuning_args = finetuning_args
+        self.reference_free = False
+        self.use_dpo_data_collator = True  # hack to avoid warning
+        self.generate_during_eval = False  # disable at evaluation
+        self.label_pad_token_id = IGNORE_INDEX
+        self.padding_value = 0
+        self.is_encoder_decoder = model.config.is_encoder_decoder
+        self.precompute_ref_log_probs = False
+        self._precomputed_train_ref_log_probs = False
+        self._precomputed_eval_ref_log_probs = False
+        self._peft_has_been_casted_to_bf16 = False
+
+        self.beta = finetuning_args.orpo_beta
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        Trainer.__init__(self, model=model, **kwargs)
+
+    def create_optimizer(self) -> "torch.optim.Optimizer":
+        if self.optimizer is None:
+            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
+
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)
+        return super().create_scheduler(num_training_steps, optimizer)
+
+    def sft_loss(self, chosen_logits: "torch.FloatTensor", chosen_labels: "torch.LongTensor") -> "torch.Tensor":
+        r"""
+        Computes supervised cross-entropy loss of given labels under the given logits.
+
+        Returns:
+            A tensor of shape (batch_size,) containing the cross-entropy loss of each sample.
+        """
+        all_logps = self.get_batch_logps(chosen_logits, chosen_labels, average_log_prob=True)
+        return -all_logps
+
+    # Borrowed from:
+    # https://github.com/huggingface/trl/blob/0ee349dcd43b0f4b3169449f16751c38ac4a609f/trl/trainer/orpo_trainer.py#L592
+    def odds_ratio_loss(
+        self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor"
+    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        r"""
+        Computes ORPO's odds ratio (OR) loss.
+
+        Args:
+            chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
+            rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
+
+        Returns:
+            A tuple of five tensors: (losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen).
+        """
+
+        # Derived from Eqs. (4) and (7) from https://arxiv.org/abs/2403.07691 by using log identities and exp(log(P(y|x))) = P(y|x)
+        log_odds = (chosen_logps - rejected_logps) - (
+            torch.log(1 - torch.exp(chosen_logps)) - torch.log(1 - torch.exp(rejected_logps))
+        )
+        ratio = F.logsigmoid(log_odds)
+        losses = self.beta * ratio
+
+        chosen_rewards = self.beta * chosen_logps.detach()
+        rejected_rewards = self.beta * rejected_logps.detach()
+
+        return losses, chosen_rewards, rejected_rewards, ratio, log_odds
+
+    def concatenated_forward(
+        self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
+    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        all_logits = model(
+            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], return_dict=True
+        ).logits.to(torch.float32)
+
+        all_logps = self.get_batch_logps(
+            all_logits,
+            batch["labels"],
+            average_log_prob=False,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+        batch_size = batch["input_ids"].size(0) // 2
+        chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0)
+        chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0)
+        return chosen_logps, rejected_logps, chosen_logits, rejected_logits
+
+    def get_batch_loss_metrics(
+        self,
+        model: "PreTrainedModel",
+        batch: Dict[str, "torch.Tensor"],
+        train_eval: Literal["train", "eval"] = "train",
+    ) -> Tuple["torch.Tensor", Dict[str, "torch.Tensor"]]:
+        r"""
+        Computes the ORPO loss and other metrics for the given batch of inputs for train or test.
+        """
+        metrics = {}
+        chosen_logps, rejected_logps, chosen_logits, rejected_logits = self.concatenated_forward(model, batch)
+
+        losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = self.odds_ratio_loss(
+            chosen_logps, rejected_logps
+        )
+        batch_size = batch["input_ids"].size(0) // 2
+        chosen_labels, _ = batch["labels"].split(batch_size, dim=0)
+        sft_loss = self.sft_loss(chosen_logits, chosen_labels)
+        batch_loss = (sft_loss - losses).mean()
+
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics["{}rewards/chosen".format(prefix)] = chosen_rewards.cpu().mean()
+        metrics["{}rewards/rejected".format(prefix)] = rejected_rewards.cpu().mean()
+        metrics["{}rewards/accuracies".format(prefix)] = reward_accuracies.cpu().mean()
+        metrics["{}rewards/margins".format(prefix)] = (chosen_rewards - rejected_rewards).cpu().mean()
+        metrics["{}logps/rejected".format(prefix)] = rejected_logps.detach().cpu().mean()
+        metrics["{}logps/chosen".format(prefix)] = chosen_logps.detach().cpu().mean()
+        metrics["{}logits/rejected".format(prefix)] = rejected_logits.detach().cpu().mean()
+        metrics["{}logits/chosen".format(prefix)] = chosen_logits.detach().cpu().mean()
+        metrics["{}sft_loss".format(prefix)] = sft_loss.detach().cpu().mean()
+        metrics["{}log_odds_ratio".format(prefix)] = log_odds_ratio.detach().cpu().mean()
+        metrics["{}log_odds_chosen".format(prefix)] = log_odds_chosen.detach().cpu().mean()
+
+        return batch_loss, metrics
diff --git a/src/llmtuner/train/orpo/workflow.py b/src/llmtuner/train/orpo/workflow.py
new file mode 100644
index 00000000..1d549d28
--- /dev/null
+++ b/src/llmtuner/train/orpo/workflow.py
@@ -0,0 +1,68 @@
+# Inspired by: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py
+
+from typing import TYPE_CHECKING, List, Optional
+
+from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset
+from ...extras.constants import IGNORE_INDEX
+from ...extras.ploting import plot_loss
+from ...hparams import ModelArguments
+from ...model import load_model, load_tokenizer
+from ..utils import create_modelcard_and_push
+from .trainer import CustomORPOTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments
+
+
+def run_orpo(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    callbacks: Optional[List["TrainerCallback"]] = None,
+):
+    tokenizer = load_tokenizer(model_args)
+    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="rm")
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+
+    data_collator = PairwiseDataCollatorWithPadding(
+        tokenizer=tokenizer,
+        pad_to_multiple_of=8,
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
+    )
+
+    # Update arguments
+    training_args.remove_unused_columns = False  # important for pairwise dataset
+
+    # Initialize our Trainer
+    trainer = CustomORPOTrainer(
+        model=model,
+        args=training_args,
+        finetuning_args=finetuning_args,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        **split_dataset(dataset, data_args, training_args),
+    )
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        trainer.save_model()
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "accuracy"])
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate(metric_key_prefix="eval")
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Create model card
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/src/llmtuner/train/rm/workflow.py b/src/llmtuner/train/rm/workflow.py
index dd4b8467..f260f82e 100644
--- a/src/llmtuner/train/rm/workflow.py
+++ b/src/llmtuner/train/rm/workflow.py
@@ -2,13 +2,12 @@
 
 from typing import TYPE_CHECKING, List, Optional
 
-from ...data import get_dataset, split_dataset
+from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset
 from ...extras.callbacks import FixValueHeadModelCallback
 from ...extras.misc import fix_valuehead_checkpoint
 from ...extras.ploting import plot_loss
 from ...model import load_model, load_tokenizer
 from ..utils import create_modelcard_and_push
-from .collator import PairwiseDataCollatorWithPadding
 from .metric import compute_accuracy
 from .trainer import PairwiseTrainer
 
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index 1b8e3cb7..299e4f2a 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -9,6 +9,7 @@ from ..extras.logging import get_logger
 from ..hparams import get_infer_args, get_train_args
 from ..model import load_model_and_tokenizer
 from .dpo import run_dpo
+from .orpo import run_orpo
 from .ppo import run_ppo
 from .pt import run_pt
 from .rm import run_rm
@@ -36,6 +37,8 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["Tra
         run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
     elif finetuning_args.stage == "dpo":
         run_dpo(model_args, data_args, training_args, finetuning_args, callbacks)
+    elif finetuning_args.stage == "orpo":
+        run_orpo(model_args, data_args, training_args, finetuning_args, callbacks)
     else:
         raise ValueError("Unknown task.")
 
diff --git a/src/llmtuner/webui/common.py b/src/llmtuner/webui/common.py
index 798e6408..96ef2737 100644
--- a/src/llmtuner/webui/common.py
+++ b/src/llmtuner/webui/common.py
@@ -11,6 +11,7 @@ from ..extras.constants import (
     DEFAULT_MODULE,
     DEFAULT_TEMPLATE,
     PEFT_METHODS,
+    STAGES_USE_PAIR_DATA,
     SUPPORTED_MODELS,
     TRAINING_STAGES,
     DownloadSource,
@@ -127,7 +128,7 @@ def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]:
 
 def list_dataset(dataset_dir: str = None, training_stage: str = list(TRAINING_STAGES.keys())[0]) -> "gr.Dropdown":
     dataset_info = load_dataset_info(dataset_dir if dataset_dir is not None else DEFAULT_DATA_DIR)
-    ranking = TRAINING_STAGES[training_stage] in ["rm", "dpo"]
+    ranking = TRAINING_STAGES[training_stage] in STAGES_USE_PAIR_DATA
     datasets = [k for k, v in dataset_info.items() if v.get("ranking", False) == ranking]
     return gr.Dropdown(value=[], choices=datasets)
 

From 1aba442bcd099a4c83840a23e88d072227a51e56 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 31 Mar 2024 18:34:59 +0800
Subject: [PATCH 014/341] support orpo in webui

Former-commit-id: dd5cc78d4fb18dd0a2e9d57f0f046cfe9f0dc2c9
---
 src/llmtuner/webui/components/train.py |  7 +++++--
 src/llmtuner/webui/locales.py          | 14 ++++++++++++++
 src/llmtuner/webui/runner.py           |  5 +++--
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 52c8fdb6..9c9f143e 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -169,10 +169,13 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         with gr.Row():
             dpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1)
             dpo_ftx = gr.Slider(value=0, minimum=0, maximum=10, step=0.01, scale=1)
+            orpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1)
             reward_model = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=2)
 
-    input_elems.update({dpo_beta, dpo_ftx, reward_model})
-    elem_dict.update(dict(rlhf_tab=rlhf_tab, dpo_beta=dpo_beta, dpo_ftx=dpo_ftx, reward_model=reward_model))
+    input_elems.update({dpo_beta, dpo_ftx, orpo_beta, reward_model})
+    elem_dict.update(
+        dict(rlhf_tab=rlhf_tab, dpo_beta=dpo_beta, dpo_ftx=dpo_ftx, orpo_beta=orpo_beta, reward_model=reward_model)
+    )
 
     with gr.Accordion(open=False) as galore_tab:
         with gr.Row():
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index f6d6d421..be2841e8 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -757,6 +757,20 @@ LOCALES = {
             "info": "DPO-ftx 中 SFT 损失的权重大小。",
         },
     },
+    "orpo_beta": {
+        "en": {
+            "label": "ORPO beta",
+            "info": "Value of the beta parameter in the ORPO loss.",
+        },
+        "ru": {
+            "label": "ORPO бета",
+            "info": "Значение параметра бета в функции потерь ORPO.",
+        },
+        "zh": {
+            "label": "ORPO beta 参数",
+            "info": "ORPO 损失函数中 beta 超参数大小。",
+        },
+    },
     "reward_model": {
         "en": {
             "label": "Reward model",
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index ab646051..891a2e4b 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -174,10 +174,11 @@ class Runner:
                 ]
             )
             args["reward_model_type"] = "lora" if args["finetuning_type"] == "lora" else "full"
-
-        if args["stage"] == "dpo":
+        elif args["stage"] == "dpo":
             args["dpo_beta"] = get("train.dpo_beta")
             args["dpo_ftx"] = get("train.dpo_ftx")
+        elif args["stage"] == "orpo":
+            args["orpo_beta"] = get("train.orpo_beta")
 
         if get("train.val_size") > 1e-6 and args["stage"] != "ppo":
             args["val_size"] = get("train.val_size")

From 9abd83adb1282effa551b09deda1a401a716d41d Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 31 Mar 2024 18:46:34 +0800
Subject: [PATCH 015/341] update readme

Former-commit-id: 297b01f16ac78cde15a5d85a9a5b82ea20bfaf23
---
 README.md                              | 2 +-
 README_zh.md                           | 2 +-
 src/llmtuner/webui/components/eval.py  | 2 +-
 src/llmtuner/webui/components/train.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index b9059426..d0738eaa 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ Choose your path:
 ## Features
 
 - **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
-- **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO and DPO.
+- **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
 - **Advanced algorithms**: GaLore, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
diff --git a/README_zh.md b/README_zh.md
index 5c81be44..460784b9 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -44,7 +44,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 ## 项目特色
 
 - **多种模型**：LLaMA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
-- **集成方法**：（增量）预训练、指令监督微调、奖励模型训练、PPO 训练和 DPO 训练。
+- **集成方法**：（增量）预训练、指令监督微调、奖励模型训练、PPO 训练、DPO 训练和 ORPO 训练。
 - **多种精度**：32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
 - **先进算法**：GaLore、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
 - **实用技巧**：FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index 4d2fe5c0..a1dae98c 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -70,7 +70,7 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     cmd_preview_btn.click(engine.runner.preview_eval, input_elems, output_elems, concurrency_limit=None)
     start_btn.click(engine.runner.run_eval, input_elems, output_elems)
-    stop_btn.click(engine.runner.set_abort, queue=False)
+    stop_btn.click(engine.runner.set_abort)
     resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
 
     return elem_dict
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 9c9f143e..9b2be6b2 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -232,7 +232,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         concurrency_limit=None,
     )
     start_btn.click(engine.runner.run_train, input_elems, output_elems)
-    stop_btn.click(engine.runner.set_abort, queue=False)
+    stop_btn.click(engine.runner.set_abort)
     resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
 
     dataset_dir.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False)

From 00e17a377c6dfa0b62aa4b30e4c4f02c7148b7b6 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 31 Mar 2024 19:27:08 +0800
Subject: [PATCH 016/341] use log1p in orpo loss

https://github.com/huggingface/trl/pull/1491

Former-commit-id: 3b15d495264b00a4f8716bafea334778874963d7
---
 src/llmtuner/train/orpo/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index 291351e4..af34b55e 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -84,7 +84,7 @@ class CustomORPOTrainer(DPOTrainer):
 
         # Derived from Eqs. (4) and (7) from https://arxiv.org/abs/2403.07691 by using log identities and exp(log(P(y|x)) = P(y|x)
         log_odds = (chosen_logps - rejected_logps) - (
-            torch.log(1 - torch.exp(chosen_logps)) - torch.log(1 - torch.exp(rejected_logps))
+            torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps))
         )
         ratio = F.logsigmoid(log_odds)
         losses = self.beta * ratio

From c5a46f911394b48e7e1f708e2d97ad7594eec439 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 31 Mar 2024 19:43:48 +0800
Subject: [PATCH 017/341] fix plots

Former-commit-id: 81355671296b84d438967463bb2a92934ff31aae
---
 src/llmtuner/train/dpo/trainer.py   | 5 ++---
 src/llmtuner/train/dpo/workflow.py  | 2 +-
 src/llmtuner/train/orpo/workflow.py | 2 +-
 src/llmtuner/train/rm/workflow.py   | 2 +-
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index c7e385da..7582e16f 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -142,11 +142,10 @@ class CustomDPOTrainer(DPOTrainer):
             reference_chosen_logps,
             reference_rejected_logps,
         )
-        batch_loss = losses.mean()
         if self.ftx_gamma > 1e-6:
             batch_size = batch["input_ids"].size(0) // 2
             chosen_labels, _ = batch["labels"].split(batch_size, dim=0)
-            batch_loss += self.ftx_gamma * self.sft_loss(policy_chosen_logits, chosen_labels).mean()
+            losses += self.ftx_gamma * self.sft_loss(policy_chosen_logits, chosen_labels)
 
         reward_accuracies = (chosen_rewards > rejected_rewards).float()
 
@@ -160,4 +159,4 @@ class CustomDPOTrainer(DPOTrainer):
         metrics["{}logits/rejected".format(prefix)] = policy_rejected_logits.detach().cpu().mean()
         metrics["{}logits/chosen".format(prefix)] = policy_chosen_logits.detach().cpu().mean()
 
-        return batch_loss, metrics
+        return losses.mean(), metrics
diff --git a/src/llmtuner/train/dpo/workflow.py b/src/llmtuner/train/dpo/workflow.py
index 4a1e867e..929dd029 100644
--- a/src/llmtuner/train/dpo/workflow.py
+++ b/src/llmtuner/train/dpo/workflow.py
@@ -63,7 +63,7 @@ def run_dpo(
         trainer.save_metrics("train", train_result.metrics)
         trainer.save_state()
         if trainer.is_world_process_zero() and finetuning_args.plot_loss:
-            plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "accuracy"])
+            plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "rewards/accuracies"])
 
     # Evaluation
     if training_args.do_eval:
diff --git a/src/llmtuner/train/orpo/workflow.py b/src/llmtuner/train/orpo/workflow.py
index 1d549d28..5a2fd36c 100644
--- a/src/llmtuner/train/orpo/workflow.py
+++ b/src/llmtuner/train/orpo/workflow.py
@@ -56,7 +56,7 @@ def run_orpo(
         trainer.save_metrics("train", train_result.metrics)
         trainer.save_state()
         if trainer.is_world_process_zero() and finetuning_args.plot_loss:
-            plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "accuracy"])
+            plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "rewards/accuracies", "sft_loss"])
 
     # Evaluation
     if training_args.do_eval:
diff --git a/src/llmtuner/train/rm/workflow.py b/src/llmtuner/train/rm/workflow.py
index f260f82e..42bf1ce6 100644
--- a/src/llmtuner/train/rm/workflow.py
+++ b/src/llmtuner/train/rm/workflow.py
@@ -55,7 +55,7 @@ def run_rm(
         trainer.save_metrics("train", train_result.metrics)
         trainer.save_state()
         if trainer.is_world_process_zero() and finetuning_args.plot_loss:
-            plot_loss(training_args.output_dir, keys=["loss", "eval_loss"])
+            plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "eval_accuracy"])
 
     # Evaluation
     if training_args.do_eval:

From 52d402e2a95e1fada3bc37506621bed8d148c341 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 1 Apr 2024 14:37:53 +0800
Subject: [PATCH 018/341] fix IPO and ORPO loss

Former-commit-id: fc27955732aedbb12003faf19b760e2768b228f2
---
 src/llmtuner/train/dpo/trainer.py  | 14 +++++--
 src/llmtuner/train/orpo/trainer.py | 62 ++++++++----------------------
 2 files changed, 27 insertions(+), 49 deletions(-)

diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index 7582e16f..11727420 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -87,16 +87,22 @@ class CustomDPOTrainer(DPOTrainer):
     def concatenated_forward(
         self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
     ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        r"""
+        Computes the sum log probabilities of the labels under the given logits if loss_type != IPO.
+
+        Otherwise the average log probabilities.
+        """
         batch_copied = BatchEncoding({k: v.detach().clone() for k, v in batch.items()})  # avoid error
 
-        all_logits = model(
+        all_logits: "torch.Tensor" = model(
             input_ids=batch_copied["input_ids"], attention_mask=batch_copied["attention_mask"], return_dict=True
         ).logits.to(torch.float32)
 
         all_logps = self.get_batch_logps(
-            all_logits,
-            batch["labels"],
-            average_log_prob=False,
+            logits=all_logits,
+            labels=batch_copied["labels"],
+            average_log_prob=(self.loss_type == "ipo"),
+            is_encoder_decoder=self.is_encoder_decoder,
             label_pad_token_id=self.label_pad_token_id,
         )
         batch_size = batch["input_ids"].size(0) // 2
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index af34b55e..50b999f8 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -56,55 +56,31 @@ class CustomORPOTrainer(DPOTrainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
 
-    def sft_loss(self, chosen_logits: "torch.FloatTensor", chosen_labels: "torch.LongTensor") -> "torch.Tensor":
-        r"""
-        Computes supervised cross-entropy loss of given labels under the given logits.
-
-        Returns:
-            A tensor of shape (batch_size,) containing the cross-entropy loss of each samples.
-        """
-        all_logps = self.get_batch_logps(chosen_logits, chosen_labels, average_log_prob=True)
-        return -all_logps
-
-    # Borrowed from:
-    # https://github.com/huggingface/trl/blob/0ee349dcd43b0f4b3169449f16751c38ac4a609f/trl/trainer/orpo_trainer.py#L592
-    def odds_ratio_loss(
-        self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor"
-    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+    def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor":
         r"""
         Computes ORPO's odds ratio (OR) loss.
-
-        Args:
-            policy_chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
-            policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
-
-        Returns:
-            A tuple of five tensors: (losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen).
         """
-
-        # Derived from Eqs. (4) and (7) from https://arxiv.org/abs/2403.07691 by using log identities and exp(log(P(y|x)) = P(y|x)
         log_odds = (chosen_logps - rejected_logps) - (
             torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps))
         )
-        ratio = F.logsigmoid(log_odds)
-        losses = self.beta * ratio
-
-        chosen_rewards = self.beta * chosen_logps.detach()
-        rejected_rewards = self.beta * rejected_logps.detach()
-
-        return losses, chosen_rewards, rejected_rewards, ratio, log_odds
+        odds_ratio_loss = -F.logsigmoid(log_odds)
+        return odds_ratio_loss
 
     def concatenated_forward(
         self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
     ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
-        all_logits = model(
+        r"""
+        Computes the average log probabilities of the labels under the given logits.
+        """
+        all_logits: "torch.Tensor" = model(
             input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], return_dict=True
         ).logits.to(torch.float32)
 
         all_logps = self.get_batch_logps(
-            all_logits,
-            batch["labels"],
-            average_log_prob=False,
+            logits=all_logits,
+            labels=batch["labels"],
+            average_log_prob=True,
+            is_encoder_decoder=self.is_encoder_decoder,
             label_pad_token_id=self.label_pad_token_id,
         )
         batch_size = batch["input_ids"].size(0) // 2
@@ -123,15 +99,12 @@ class CustomORPOTrainer(DPOTrainer):
         """
         metrics = {}
         chosen_logps, rejected_logps, chosen_logits, rejected_logits = self.concatenated_forward(model, batch)
+        sft_loss = chosen_logps
+        odds_ratio_loss = self.odds_ratio_loss(chosen_logps, rejected_logps)
+        batch_loss = (sft_loss + self.beta * odds_ratio_loss).mean()
 
-        losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = self.odds_ratio_loss(
-            chosen_logps, rejected_logps
-        )
-        batch_size = batch["input_ids"].size(0) // 2
-        chosen_labels, _ = batch["labels"].split(batch_size, dim=0)
-        sft_loss = self.sft_loss(chosen_logits, chosen_labels)
-        batch_loss = (sft_loss - losses).mean()
-
+        chosen_rewards = self.beta * chosen_logps.detach()
+        rejected_rewards = self.beta * rejected_logps.detach()
         reward_accuracies = (chosen_rewards > rejected_rewards).float()
 
         prefix = "eval_" if train_eval == "eval" else ""
@@ -144,7 +117,6 @@ class CustomORPOTrainer(DPOTrainer):
         metrics["{}logits/rejected".format(prefix)] = rejected_logits.detach().cpu().mean()
         metrics["{}logits/chosen".format(prefix)] = chosen_logits.detach().cpu().mean()
         metrics["{}sft_loss".format(prefix)] = sft_loss.detach().cpu().mean()
-        metrics["{}log_odds_ratio".format(prefix)] = log_odds_ratio.detach().cpu().mean()
-        metrics["{}log_odds_chosen".format(prefix)] = log_odds_chosen.detach().cpu().mean()
+        metrics["{}odds_ratio_loss".format(prefix)] = odds_ratio_loss.detach().cpu().mean()
 
         return batch_loss, metrics

From be0a807e8c5895a3db20b53cf85004e1fb1bb9ee Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 1 Apr 2024 14:42:41 +0800
Subject: [PATCH 019/341] fix ORPO loss

Former-commit-id: 5544ddde9087f00f9e20b78d0079f20c2f5d1604
---
 src/llmtuner/train/orpo/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index 50b999f8..f5b7ff42 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -99,7 +99,7 @@ class CustomORPOTrainer(DPOTrainer):
         """
         metrics = {}
         chosen_logps, rejected_logps, chosen_logits, rejected_logits = self.concatenated_forward(model, batch)
-        sft_loss = chosen_logps
+        sft_loss = -chosen_logps
         odds_ratio_loss = self.odds_ratio_loss(chosen_logps, rejected_logps)
         batch_loss = (sft_loss + self.beta * odds_ratio_loss).mean()
 

From 61eb3a3d466cdedbe6346a7ee12818fcce023414 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 1 Apr 2024 16:23:28 +0800
Subject: [PATCH 020/341] update webui

Former-commit-id: e96d260917a35ad2068f7b28b4f0b334b808ccc2
---
 examples/lora_single_gpu/orpo.sh       |  2 +-
 src/llmtuner/webui/components/train.py | 24 ++++++++++++++++--------
 src/llmtuner/webui/locales.py          | 14 ++++++++++++++
 src/llmtuner/webui/runner.py           | 11 +++++------
 4 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/examples/lora_single_gpu/orpo.sh b/examples/lora_single_gpu/orpo.sh
index 77662ecf..407907b1 100644
--- a/examples/lora_single_gpu/orpo.sh
+++ b/examples/lora_single_gpu/orpo.sh
@@ -4,7 +4,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --stage orpo \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset comparison_gpt4_en \
+    --dataset orca_rlhf \
     --dataset_dir ../../data \
     --template default \
     --finetuning_type lora \
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 9b2be6b2..1c425d51 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -21,10 +21,10 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Row():
         training_stage = gr.Dropdown(
-            choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=2
+            choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=1
         )
-        dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=2)
-        dataset = gr.Dropdown(multiselect=True, scale=4)
+        dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=1)
+        dataset = gr.Dropdown(multiselect=True, scale=2, allow_custom_value=True)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
     input_elems.update({training_stage, dataset_dir, dataset})
@@ -75,11 +75,17 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             optim = gr.Textbox(value="adamw_torch")
 
         with gr.Row():
-            resize_vocab = gr.Checkbox()
-            packing = gr.Checkbox()
-            upcast_layernorm = gr.Checkbox()
-            use_llama_pro = gr.Checkbox()
-            shift_attn = gr.Checkbox()
+            with gr.Column():
+                resize_vocab = gr.Checkbox()
+                packing = gr.Checkbox()
+
+            with gr.Column():
+                upcast_layernorm = gr.Checkbox()
+                use_llama_pro = gr.Checkbox()
+
+            with gr.Column():
+                shift_attn = gr.Checkbox()
+                report_to = gr.Checkbox()
 
     input_elems.update(
         {
@@ -93,6 +99,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             upcast_layernorm,
             use_llama_pro,
             shift_attn,
+            report_to,
         }
     )
     elem_dict.update(
@@ -108,6 +115,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             upcast_layernorm=upcast_layernorm,
             use_llama_pro=use_llama_pro,
             shift_attn=shift_attn,
+            report_to=report_to,
         )
     )
 
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index be2841e8..b7319fd4 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -536,6 +536,20 @@ LOCALES = {
             "info": "使用 LongLoRA 提出的 shift short attention。",
         },
     },
+    "report_to": {
+        "en": {
+            "label": "Enable external logger",
+            "info": "Use TensorBoard or wandb to log experiment.",
+        },
+        "ru": {
+            "label": "Включить внешний регистратор",
+            "info": "Использовать TensorBoard или wandb для ведения журнала экспериментов.",
+        },
+        "zh": {
+            "label": "启用外部记录面板",
+            "info": "使用 TensorBoard 或 wandb 记录实验。",
+        },
+    },
     "freeze_tab": {
         "en": {
             "label": "Freeze tuning configurations",
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 891a2e4b..dae7daf8 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -80,20 +80,18 @@ class Runner:
         if not from_preview and not is_torch_cuda_available():
             gr.Warning(ALERTS["warn_no_cuda"][lang])
 
-        self.aborted = False
         self.logger_handler.reset()
         self.trainer_callback = LogCallback(self)
         return ""
 
     def _finalize(self, lang: str, finish_info: str) -> str:
+        finish_info = ALERTS["info_aborted"][lang] if self.aborted else finish_info
         self.thread = None
-        self.running_data = None
+        self.aborted = False
         self.running = False
+        self.running_data = None
         torch_gc()
-        if self.aborted:
-            return ALERTS["info_aborted"][lang]
-        else:
-            return finish_info
+        return finish_info
 
     def _parse_train_args(self, data: Dict["Component", Any]) -> Dict[str, Any]:
         get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
@@ -141,6 +139,7 @@ class Runner:
             upcast_layernorm=get("train.upcast_layernorm"),
             use_llama_pro=get("train.use_llama_pro"),
             shift_attn=get("train.shift_attn"),
+            report_to="all" if get("train.report_to") else "none",
             use_galore=get("train.use_galore"),
             output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("train.output_dir")),
             fp16=(get("train.compute_type") == "fp16"),

From e7f13098c67af33e0893931d43c16121c9058186 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 1 Apr 2024 17:34:04 +0800
Subject: [PATCH 021/341] support infer 4bit model on GPUs #3023

Former-commit-id: 950a9dab9055839990656b2b40956792b253573d
---
 src/llmtuner/hparams/model_args.py |  4 ++++
 src/llmtuner/model/patcher.py      | 16 ++++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index a3719586..f96fb636 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -53,6 +53,10 @@ class ModelArguments:
         default=True,
         metadata={"help": "Whether or not to use double quantization in int4 training."},
     )
+    quantization_device_map: Optional[Literal["auto"]] = field(
+        default=None,
+        metadata={"help": "Device map used for loading the 4-bit quantized model, needs bitsandbytes>=0.43.0."},
+    )
     rope_scaling: Optional[Literal["linear", "dynamic"]] = field(
         default=None,
         metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."},
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 3aa5c3e9..1a6da78a 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -208,11 +208,6 @@ def _configure_quantization(
         logger.info("Quantizing model to {} bit.".format(model_args.export_quantization_bit))
 
     elif model_args.quantization_bit is not None:  # bnb
-        if is_deepspeed_zero3_enabled():
-            require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0")
-            require_version("accelerate>=0.28.0", "To fix: pip install accelerate>=0.28.0")
-            require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0")
-
         if model_args.quantization_bit == 8:
             require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0")
             init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
@@ -227,7 +222,16 @@ def _configure_quantization(
                 bnb_4bit_quant_storage=model_args.compute_dtype,  # crucial for fsdp qlora
             )
 
-        init_kwargs["device_map"] = {"": get_current_device()}
+        if is_deepspeed_zero3_enabled() or model_args.quantization_device_map == "auto":
+            if model_args.quantization_bit != 4:
+                raise ValueError("Only 4-bit quantized model can use auto device map.")
+
+            require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0")
+            require_version("accelerate>=0.28.0", "To fix: pip install accelerate>=0.28.0")
+            require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0")
+        else:
+            init_kwargs["device_map"] = {"": get_current_device()}
+
         logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit))
 
 
From 40211db27598be4e369cb146c7d3a1e40b4dfb1f Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 1 Apr 2024 21:35:18 +0800
Subject: [PATCH 022/341] fix #3077

Former-commit-id: d0340391e8075cff0d84b3ef879c2101b66ca1dc
---
 README.md                                  |  4 ++--
 README_zh.md                               |  4 ++--
 src/llmtuner/api/app.py                    | 10 ++++++++--
 src/llmtuner/api/protocol.py               | 18 +++++++++++++++---
 src/llmtuner/extras/patches/llama_patch.py |  2 +-
 src/llmtuner/model/patcher.py              |  2 +-
 tests/test_toolcall.py                     | 13 ++++++++-----
 7 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index d0738eaa..e4716873 100644
--- a/README.md
+++ b/README.md
@@ -264,8 +264,8 @@ huggingface-cli login
 | ------------ | ------- | --------- |
 | python       | 3.8     | 3.10      |
 | torch        | 1.13.1  | 2.2.0     |
-| transformers | 4.37.2  | 4.39.1    |
-| datasets     | 2.14.3  | 2.17.1    |
+| transformers | 4.37.2  | 4.39.2    |
+| datasets     | 2.14.3  | 2.18.0    |
 | accelerate   | 0.27.2  | 0.28.0    |
 | peft         | 0.9.0   | 0.10.0    |
 | trl          | 0.8.1   | 0.8.1     |
diff --git a/README_zh.md b/README_zh.md
index 460784b9..b13c0f19 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -264,8 +264,8 @@ huggingface-cli login
 | ------------ | ------- | --------- |
 | python       | 3.8     | 3.10      |
 | torch        | 1.13.1  | 2.2.0     |
-| transformers | 4.37.2  | 4.39.1    |
-| datasets     | 2.14.3  | 2.17.1    |
+| transformers | 4.37.2  | 4.39.2    |
+| datasets     | 2.14.3  | 2.18.0    |
 | accelerate   | 0.27.2  | 0.28.0    |
 | peft         | 0.9.0   | 0.10.0    |
 | trl          | 0.8.1   | 0.8.1     |
diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py
index c5a18bc7..3f06fef1 100644
--- a/src/llmtuner/api/app.py
+++ b/src/llmtuner/api/app.py
@@ -108,12 +108,18 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
             elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]:
                 raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
 
-            input_messages.append({"role": role_mapping[message.role], "content": message.content})
+            if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls):
+                name = message.tool_calls[0].function.name
+                arguments = message.tool_calls[0].function.arguments
+                content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False)
+                input_messages.append({"role": role_mapping[Role.FUNCTION], "content": content})
+            else:
+                input_messages.append({"role": role_mapping[message.role], "content": message.content})
 
         tool_list = request.tools
         if isinstance(tool_list, list) and len(tool_list):
             try:
-                tools = json.dumps([tool["function"] for tool in tool_list], ensure_ascii=False)
+                tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False)
             except Exception:
                 raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools")
         else:
diff --git a/src/llmtuner/api/protocol.py b/src/llmtuner/api/protocol.py
index 3e39fe0b..ece2132b 100644
--- a/src/llmtuner/api/protocol.py
+++ b/src/llmtuner/api/protocol.py
@@ -1,6 +1,6 @@
 import time
 from enum import Enum, unique
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 
 from pydantic import BaseModel, Field
 from typing_extensions import Literal
@@ -39,6 +39,17 @@ class Function(BaseModel):
     arguments: str
 
 
+class FunctionDefinition(BaseModel):
+    name: str
+    description: str
+    parameters: Dict[str, Any]
+
+
+class FunctionAvailable(BaseModel):
+    type: Literal["function", "code_interpreter"] = "function"
+    function: Optional[FunctionDefinition] = None
+
+
 class FunctionCall(BaseModel):
     id: Literal["call_default"] = "call_default"
     type: Literal["function"] = "function"
@@ -47,7 +58,8 @@ class FunctionCall(BaseModel):
 
 class ChatMessage(BaseModel):
     role: Role
-    content: str
+    content: Optional[str] = None
+    tool_calls: Optional[List[FunctionCall]] = None
 
 
 class ChatCompletionMessage(BaseModel):
@@ -59,7 +71,7 @@ class ChatCompletionMessage(BaseModel):
 class ChatCompletionRequest(BaseModel):
     model: str
     messages: List[ChatMessage]
-    tools: list = []
+    tools: Optional[List[FunctionAvailable]] = None
     do_sample: bool = True
     temperature: Optional[float] = None
     top_p: Optional[float] = None
diff --git a/src/llmtuner/extras/patches/llama_patch.py b/src/llmtuner/extras/patches/llama_patch.py
index 7fb9f9d6..f0b65d65 100644
--- a/src/llmtuner/extras/patches/llama_patch.py
+++ b/src/llmtuner/extras/patches/llama_patch.py
@@ -193,6 +193,6 @@ def llama_flash_attn_forward(
 
 
 def apply_llama_patch() -> None:
-    require_version("transformers==4.39.1", "To fix: pip install transformers==4.39.1")
+    require_version("transformers==4.39.2", "To fix: pip install transformers==4.39.2")
     LlamaAttention.forward = llama_torch_attn_forward
     LlamaFlashAttention2.forward = llama_flash_attn_forward
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 1a6da78a..97399a2c 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -331,7 +331,7 @@ def patch_model(
     ):
         gen_config.do_sample = True
 
-    if model_args.resize_vocab:
+    if is_trainable and model_args.resize_vocab:
         _resize_embedding_layer(model, tokenizer)
 
     if is_trainable:
diff --git a/tests/test_toolcall.py b/tests/test_toolcall.py
index a54a0053..d36e7fec 100644
--- a/tests/test_toolcall.py
+++ b/tests/test_toolcall.py
@@ -15,7 +15,7 @@ def calculate_gpa(grades: Sequence[str], hours: Sequence[int]) -> float:
     for grade, hour in zip(grades, hours):
         total_score += grade_to_score[grade] * hour
         total_hour += hour
-    return total_score / total_hour
+    return round(total_score / total_hour, 2)
 
 
 def main():
@@ -45,16 +45,19 @@ def main():
     messages = []
     messages.append({"role": "user", "content": "My grades are A, A, B, and C. The credit hours are 3, 4, 3, and 2."})
     result = client.chat.completions.create(messages=messages, model="test", tools=tools)
+    if result.choices[0].message.tool_calls is None:
+        raise ValueError("Cannot retrieve function call from the response.")
+
+    messages.append(result.choices[0].message)
     tool_call = result.choices[0].message.tool_calls[0].function
+    print(tool_call)
+    # Function(arguments='{"grades": ["A", "A", "B", "C"], "hours": [3, 4, 3, 2]}', name='calculate_gpa')
     name, arguments = tool_call.name, json.loads(tool_call.arguments)
-    messages.append(
-        {"role": "function", "content": json.dumps({"name": name, "argument": arguments}, ensure_ascii=False)}
-    )
     tool_result = tool_map[name](**arguments)
     messages.append({"role": "tool", "content": json.dumps({"gpa": tool_result}, ensure_ascii=False)})
     result = client.chat.completions.create(messages=messages, model="test", tools=tools)
     print(result.choices[0].message.content)
-    # Based on your grades and credit hours, your calculated Grade Point Average (GPA) is 3.4166666666666665.
+    # Based on the grades and credit hours you provided, your Grade Point Average (GPA) is 3.42.
 
 
 if __name__ == "__main__":

From 85726c91cef98f5e0db0c0c3135535aaeb8bbb46 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 1 Apr 2024 21:49:40 +0800
Subject: [PATCH 023/341] add qwen1.5 moe

Former-commit-id: 3ea94f0d12cec25ac694a2c4ae8971c356990b61
---
 README.md                        |  8 ++++----
 README_zh.md                     |  8 ++++----
 src/llmtuner/extras/constants.py | 20 ++++++++++++++++++--
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index e4716873..c90392c8 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-27-green)](#projects-using-llama-factory)
+[![Citation](https://img.shields.io/badge/citation-28-green)](#projects-using-llama-factory)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -138,12 +138,11 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
-| [Mistral](https://huggingface.co/mistralai)              | 7B                          | q_proj,v_proj     | mistral   |
-| [Mixtral](https://huggingface.co/mistralai)              | 8x7B                        | q_proj,v_proj     | mistral   |
+| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B                     | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
-| [Qwen1.5](https://huggingface.co/Qwen)                   | 0.5B/1.8B/4B/7B/14B/72B     | q_proj,v_proj     | qwen      |
+| [Qwen1.5 (MoE)](https://huggingface.co/Qwen)             | 0.5B/1.8B/4B/7B/14B/72B     | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                  | q_proj,v_proj     | xverse    |
 | [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                   | q_proj,v_proj     | yi        |
@@ -716,6 +715,7 @@ docker compose -f ./docker-compose.yml up -d
 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
+1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246)
 1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B.
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge.
diff --git a/README_zh.md b/README_zh.md
index b13c0f19..d591e852 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-27-green)](#使用了-llama-factory-的项目)
+[![Citation](https://img.shields.io/badge/citation-28-green)](#使用了-llama-factory-的项目)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -138,12 +138,11 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
-| [Mistral](https://huggingface.co/mistralai)              | 7B                          | q_proj,v_proj     | mistral   |
-| [Mixtral](https://huggingface.co/mistralai)              | 8x7B                        | q_proj,v_proj     | mistral   |
+| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B                     | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
-| [Qwen1.5](https://huggingface.co/Qwen)                   | 0.5B/1.8B/4B/7B/14B/72B     | q_proj,v_proj     | qwen      |
+| [Qwen1.5 (MoE)](https://huggingface.co/Qwen)             | 0.5B/1.8B/4B/7B/14B/72B     | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                  | q_proj,v_proj     | xverse    |
 | [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                   | q_proj,v_proj     | yi        |
@@ -715,6 +714,7 @@ docker compose -f ./docker-compose.yml up -d
 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
+1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246)
 1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper，基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM，基于 Baichuan-13B 微调而得，具有法律推理和知识检索能力。
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 8af8d8e8..6e46218b 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -463,14 +463,18 @@ register_model_group(
 
 register_model_group(
     models={
-        "Mistral-7B": {
+        "Mistral-7B-v0.1": {
             DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.1",
             DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.1",
         },
-        "Mistral-7B-Chat": {
+        "Mistral-7B-v0.1-Chat": {
             DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.1",
             DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.1",
         },
+        "Mistral-7B-v0.2": {
+            DownloadSource.DEFAULT: "alpindale/Mistral-7B-v0.2-hf",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.2-hf",
+        },
         "Mistral-7B-v0.2-Chat": {
             DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.2",
             DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.2",
@@ -663,6 +667,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B",
         },
+        "Qwen1.5-MoE-A2.7B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B",
+        },
         "Qwen1.5-0.5B-Chat": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat",
@@ -687,6 +695,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat",
         },
+        "Qwen1.5-MoE-A2.7B-Chat": {
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat",
+        },
         "Qwen1.5-0.5B-int8-Chat": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8",
@@ -735,6 +747,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-AWQ",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-AWQ",
         },
+        "Qwen1.5-MoE-A2.7B-int4-Chat": {
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
+        },
     },
     template="qwen",
 )

From 1dc963caa6aced8feb0adbd2e674a421966ab3cd Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 1 Apr 2024 22:53:52 +0800
Subject: [PATCH 024/341] fix #3083

Former-commit-id: ff9a3f73961a362d0ddc22079f80a85465fffda8
---
 src/llmtuner/model/patcher.py      | 29 +++++++++++++++++------------
 src/llmtuner/train/dpo/trainer.py  |  5 ++++-
 src/llmtuner/train/orpo/trainer.py |  2 +-
 src/llmtuner/train/ppo/trainer.py  |  2 +-
 4 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 97399a2c..db9849cf 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -235,6 +235,12 @@ def _configure_quantization(
         logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit))
 
 
+def _fp32_forward_post_hook(
+    module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor"
+) -> "torch.Tensor":
+    return output.to(torch.float32)
+
+
 def _prepare_model_for_training(
     model: "PreTrainedModel", model_args: "ModelArguments", output_layer_name: str = "lm_head"
 ) -> None:
@@ -263,14 +269,10 @@ def _prepare_model_for_training(
             logger.info("Gradient checkpointing enabled.")
 
     if hasattr(model, output_layer_name) and model_args.upcast_lmhead_output:
-
-        def fp32_forward_post_hook(module: torch.nn.Module, args: Tuple[torch.Tensor], output: torch.Tensor):
-            return output.to(torch.float32)
-
         logger.info("Upcasting lm_head outputs in float32.")
         output_layer = getattr(model, output_layer_name)
         if isinstance(output_layer, torch.nn.Linear) and output_layer.weight.dtype != torch.float32:
-            output_layer.register_forward_hook(fp32_forward_post_hook)
+            output_layer.register_forward_hook(_fp32_forward_post_hook)
 
 
 def patch_tokenizer(tokenizer: "PreTrainedTokenizer") -> None:
@@ -316,13 +318,6 @@ def patch_config(
 def patch_model(
     model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments", is_trainable: bool
 ) -> None:
-    if "GenerationMixin" not in str(model.generate.__func__):
-        model.generate = MethodType(PreTrainedModel.generate, model)
-
-    if getattr(model.config, "model_type", None) == "chatglm":
-        setattr(model, "lm_head", model.transformer.output_layer)
-        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
-
     gen_config = model.generation_config  # check and fix generation config
     if not gen_config.do_sample and (
         (gen_config.temperature is not None and gen_config.temperature != 1.0)
@@ -331,6 +326,16 @@ def patch_model(
     ):
         gen_config.do_sample = True
 
+    if "GenerationMixin" not in str(model.generate.__func__):
+        model.generate = MethodType(PreTrainedModel.generate, model)
+
+    if is_trainable and getattr(model.config, "model_type", None) == "chatglm":
+        setattr(model, "lm_head", model.transformer.output_layer)
+        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
+
+    if is_trainable and getattr(model.config, "model_type", None) == "qwen2" and model_args.flash_attn:
+        setattr(model.config, "use_cache", False)  # qwen2 does not support use_cache when using flashattn
+
     if is_trainable and model_args.resize_vocab:
         _resize_embedding_layer(model, tokenizer)
 
diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index 11727420..0b316c62 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -95,7 +95,10 @@ class CustomDPOTrainer(DPOTrainer):
         batch_copied = BatchEncoding({k: v.detach().clone() for k, v in batch.items()})  # avoid error
 
         all_logits: "torch.Tensor" = model(
-            input_ids=batch_copied["input_ids"], attention_mask=batch_copied["attention_mask"], return_dict=True
+            input_ids=batch_copied["input_ids"],
+            attention_mask=batch_copied["attention_mask"],
+            return_dict=True,
+            use_cache=False,
         ).logits.to(torch.float32)
 
         all_logps = self.get_batch_logps(
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index f5b7ff42..d84e0199 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -73,7 +73,7 @@ class CustomORPOTrainer(DPOTrainer):
         Computes the average log probabilities of the labels under the given logits.
         """
         all_logits: "torch.Tensor" = model(
-            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], return_dict=True
+            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], return_dict=True, use_cache=False
         ).logits.to(torch.float32)
 
         all_logps = self.get_batch_logps(
diff --git a/src/llmtuner/train/ppo/trainer.py b/src/llmtuner/train/ppo/trainer.py
index de87532a..6be45958 100644
--- a/src/llmtuner/train/ppo/trainer.py
+++ b/src/llmtuner/train/ppo/trainer.py
@@ -353,7 +353,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         batch = self.prepare_model_inputs(queries, responses)
 
         with torch.cuda.amp.autocast(dtype=self.model_args.compute_dtype):  # support bf16
-            _, _, values = reward_model(**batch, output_hidden_states=True, return_dict=True)
+            _, _, values = reward_model(**batch, output_hidden_states=True, return_dict=True, use_cache=False)
 
         if getattr(unwrapped_model.config, "model_type", None) == "chatglm":  # assume same architecture
             values = torch.transpose(values, 0, 1)

From 9e14501edbee6e45ba4ba33fbf6ff0c09656c883 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 1 Apr 2024 23:24:08 +0800
Subject: [PATCH 025/341] set dev version

Former-commit-id: 922ecae89210e5b8d62d78774f123a6d75c525ba
---
 src/llmtuner/__init__.py      | 2 +-
 src/llmtuner/data/template.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/__init__.py b/src/llmtuner/__init__.py
index 903e82ad..1a0189bb 100644
--- a/src/llmtuner/__init__.py
+++ b/src/llmtuner/__init__.py
@@ -7,5 +7,5 @@ from .train import export_model, run_exp
 from .webui import create_ui, create_web_demo
 
 
-__version__ = "0.6.1"
+__version__ = "0.6.2.dev0"
 __all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"]
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index cd645cf7..6cc12c56 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -414,7 +414,7 @@ _register_template(
 
 _register_template(
     name="baichuan",
-    format_user=StringFormatter(slots=["<reserved_102>{{content}}<reserved_103>"]),
+    format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
     efficient_eos=True,
 )
 

From 0c4a1381a4cddfe5bce0477eb93413af18454dbe Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 1 Apr 2024 23:30:03 +0800
Subject: [PATCH 026/341] Update SECURITY.md

Former-commit-id: e22217c75421a89fd7e2ada62ce0e08245dd05e7
---
 .github/SECURITY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/SECURITY.md b/.github/SECURITY.md
index f6ffaeb9..d34728eb 100644
--- a/.github/SECURITY.md
+++ b/.github/SECURITY.md
@@ -1,6 +1,6 @@
 # Reporting Security Issues
 
-To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/electron/electron/security/advisories/new) tab.
+To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/hiyouga/LLaMA-Factory/security/advisories/new) tab.
 
 We will send a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.
 

From 03e20bb5c6bd5bf714ec29e82b183fd067f2729f Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 13:58:39 +0800
Subject: [PATCH 027/341] fix #3022

Former-commit-id: dac2f617bda9470ac8d85c7e9def09cc04970506
---
 src/llmtuner/model/patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index db9849cf..379b0c48 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -336,7 +336,7 @@ def patch_model(
     if is_trainable and getattr(model.config, "model_type", None) == "qwen2" and model_args.flash_attn:
         setattr(model.config, "use_cache", False)  # qwen2 does not support use_cache when using flashattn
 
-    if is_trainable and model_args.resize_vocab:
+    if model_args.resize_vocab:
         _resize_embedding_layer(model, tokenizer)
 
     if is_trainable:

From 117b67ea30e528cf36dd6707bd14b465319e8968 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 14:26:31 +0800
Subject: [PATCH 028/341] add moe aux loss control #3085

Former-commit-id: c9187ebc944e2de454ace3304b7d28eabb1b1a81
---
 src/llmtuner/extras/misc.py        |  8 +++-----
 src/llmtuner/hparams/model_args.py |  4 ++++
 src/llmtuner/model/loader.py       |  3 +--
 src/llmtuner/model/patcher.py      | 24 +++++++++++++++---------
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index c7b687e9..60cf153b 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -203,17 +203,15 @@ def torch_gc() -> None:
         torch.cuda.ipc_collect()
 
 
-def try_download_model_from_ms(model_args: "ModelArguments") -> None:
+def try_download_model_from_ms(model_args: "ModelArguments") -> str:
     if not use_modelscope() or os.path.exists(model_args.model_name_or_path):
-        return
+        return model_args.model_name_or_path
 
     try:
         from modelscope import snapshot_download
 
         revision = "master" if model_args.model_revision == "main" else model_args.model_revision
-        model_args.model_name_or_path = snapshot_download(
-            model_args.model_name_or_path, revision=revision, cache_dir=model_args.cache_dir
-        )
+        return snapshot_download(model_args.model_name_or_path, revision=revision, cache_dir=model_args.cache_dir)
     except ImportError:
         raise ImportError("Please install modelscope via `pip install modelscope -U`")
 
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index f96fb636..be71d32f 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -73,6 +73,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."},
     )
+    moe_aux_loss_coef: Optional[float] = field(
+        default=None,
+        metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."},
+    )
     disable_gradient_checkpointing: bool = field(
         default=False,
         metadata={"help": "Whether or not to disable gradient checkpointing."},
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index b1816aa7..d05c0886 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -20,6 +20,7 @@ logger = get_logger(__name__)
 
 
 def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
+    model_args.model_name_or_path = try_download_model_from_ms(model_args)
     return {
         "trust_remote_code": True,
         "cache_dir": model_args.cache_dir,
@@ -34,9 +35,7 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
 
     Note: including inplace operation of model_args.
     """
-    try_download_model_from_ms(model_args)
     init_kwargs = _get_init_kwargs(model_args)
-
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.model_name_or_path,
         use_fast=model_args.use_fast_tokenizer,
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 379b0c48..7132470a 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -290,11 +290,6 @@ def patch_config(
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
-    if getattr(config, "model_type", None) == "qwen":
-        setattr(config, "use_flash_attn", model_args.flash_attn)
-        for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
-            setattr(config, dtype_name, model_args.compute_dtype == dtype)
-
     _configure_attn_implementation(config, model_args, init_kwargs)
     _configure_rope(config, model_args, is_trainable)
     _configure_longlora(config, model_args, is_trainable)
@@ -304,11 +299,25 @@ def patch_config(
         setattr(config, "use_cache", True)
         logger.info("Using KV cache for faster generation.")
 
+    if model_args.moe_aux_loss_coef is not None:
+        if getattr(config, "model_type", None) in ["mixtral", "qwen2_moe"]:
+            setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)
+        elif getattr(config, "model_type", None) == "deepseek":
+            setattr(config, "aux_loss_alpha", model_args.moe_aux_loss_coef)
+
+    if getattr(config, "model_type", None) == "qwen":
+        setattr(config, "use_flash_attn", model_args.flash_attn)
+        for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
+            setattr(config, dtype_name, model_args.compute_dtype == dtype)
+
+    if getattr(config, "model_type", None) == "qwen2" and is_trainable and model_args.flash_attn:
+        setattr(config, "use_cache", False)  # qwen2 does not support use_cache when using flashattn
+
     init_kwargs["torch_dtype"] = model_args.compute_dtype
     if not is_deepspeed_zero3_enabled():
         init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage
         if init_kwargs["low_cpu_mem_usage"]:
-            if "device_map" not in init_kwargs:  # quant models cannot use auto device map
+            if "device_map" not in init_kwargs:
                 init_kwargs["device_map"] = model_args.device_map or {"": get_current_device()}
 
             if init_kwargs["device_map"] == "auto":
@@ -333,9 +342,6 @@ def patch_model(
         setattr(model, "lm_head", model.transformer.output_layer)
         setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
 
-    if is_trainable and getattr(model.config, "model_type", None) == "qwen2" and model_args.flash_attn:
-        setattr(model.config, "use_cache", False)  # qwen2 does not support use_cache when using flashattn
-
     if model_args.resize_vocab:
         _resize_embedding_layer(model, tokenizer)
 

From b12176d818d70b46ae33409eee16e073c7f5d713 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 20:07:43 +0800
Subject: [PATCH 029/341] simplify readme

Former-commit-id: 0da6ec2d516326fe9c7583ba71cd1778eb838178
---
 README.md                                    | 394 ++-----------------
 README_zh.md                                 | 390 ++----------------
 examples/{ => extras}/fsdp_qlora/README.md   |   0
 examples/{ => extras}/fsdp_qlora/fsdp.sh     |   5 +-
 examples/full_multi_gpu/multi_node.sh        |   2 +-
 examples/full_multi_gpu/single_node.sh       |   2 +-
 examples/inference/api_demo.sh               |   7 +
 examples/inference/cli_demo.sh               |   7 +
 examples/inference/evaluate.sh               |  12 +
 examples/inference/web_demo.sh               |   7 +
 examples/lora_multi_gpu/multi_node.sh        |   2 +-
 examples/lora_multi_gpu/single_node.sh       |   2 +-
 examples/lora_single_gpu/prepare.sh          |  18 +
 examples/merge_lora/README.md                |   9 +
 examples/merge_lora/merge.sh                 |   2 +-
 src/llmtuner/data/loader.py                  |  19 +-
 src/llmtuner/extras/misc.py                  |  12 +
 src/llmtuner/extras/patches/llama_patch.py   |   2 +-
 src/llmtuner/extras/patches/mixtral_patch.py |  38 --
 src/llmtuner/hparams/data_args.py            |   4 +-
 src/llmtuner/model/patcher.py                | 101 ++---
 src/llmtuner/model/utils.py                  |  16 +-
 src/llmtuner/train/dpo/collator.py           |  54 ---
 src/llmtuner/train/rm/collator.py            |  29 --
 24 files changed, 244 insertions(+), 890 deletions(-)
 rename examples/{ => extras}/fsdp_qlora/README.md (100%)
 rename examples/{ => extras}/fsdp_qlora/fsdp.sh (87%)
 create mode 100644 examples/inference/api_demo.sh
 create mode 100644 examples/inference/cli_demo.sh
 create mode 100644 examples/inference/evaluate.sh
 create mode 100644 examples/inference/web_demo.sh
 create mode 100644 examples/lora_single_gpu/prepare.sh
 delete mode 100644 src/llmtuner/extras/patches/mixtral_patch.py
 delete mode 100644 src/llmtuner/train/dpo/collator.py
 delete mode 100644 src/llmtuner/train/rm/collator.py

diff --git a/README.md b/README.md
index c90392c8..6450e61e 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ Choose your path:
 
 ## Benchmark
 
-Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning), LLaMA-Factory's LoRA tuning offers up to **3.7 times faster** training speed with a better Rouge score on the advertising text generation task. By leveraging 4-bit quantization technique, LLaMA-Factory's QLoRA further improves the efficiency regarding the GPU memory.
+Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning), LLaMA Factory's LoRA tuning offers up to **3.7 times faster** training speed with a better Rouge score on the advertising text generation task. By leveraging 4-bit quantization technique, LLaMA Factory's QLoRA further improves the efficiency regarding the GPU memory.
 
 ![benchmark](assets/benchmark.svg)
 
@@ -62,7 +62,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 - **Training Speed**: the number of training samples processed per second during the training. (bs=4, cutoff_len=1024)
 - **Rouge Score**: Rouge-2 score on the development set of the [advertising text generation](https://aclanthology.org/D19-1321.pdf) task. (bs=4, cutoff_len=1024)
 - **GPU Memory**: Peak GPU memory usage in 4-bit quantized training. (bs=1, cutoff_len=1024)
-- We adopt `pre_seq_len=128` for ChatGLM's P-Tuning and `lora_rank=32` for LLaMA-Factory's LoRA tuning.
+- We adopt `pre_seq_len=128` for ChatGLM's P-Tuning and `lora_rank=32` for LLaMA Factory's LoRA tuning.
 
 </details>
 
@@ -72,7 +72,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
 
-[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/fsdp_qlora` for usage.
+[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
 
 <details><summary>Full Changelog</summary>
 
@@ -168,9 +168,6 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ
 | DPO Training           | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | ORPO Training          | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 
-> [!NOTE]
-> Use `--quantization_bit 4` argument to enable QLoRA.
-
 ## Provided Datasets
 
 <details><summary>Pre-training datasets</summary>
@@ -263,7 +260,7 @@ huggingface-cli login
 | ------------ | ------- | --------- |
 | python       | 3.8     | 3.10      |
 | torch        | 1.13.1  | 2.2.0     |
-| transformers | 4.37.2  | 4.39.2    |
+| transformers | 4.37.2  | 4.39.3    |
 | datasets     | 2.14.3  | 2.18.0    |
 | accelerate   | 0.27.2  | 0.28.0    |
 | peft         | 0.9.0   | 0.10.0    |
@@ -293,23 +290,28 @@ huggingface-cli login
 
 ## Getting Started
 
-### Data Preparation (optional)
+### Data Preparation
 
-Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files to create a custom dataset.
+Please refer to [data/README.md](data/README.md) for details about the format of dataset files. You can either use datasets from the HuggingFace / ModelScope hub or load a dataset from local disk.
 
 > [!NOTE]
-> Please update `data/dataset_info.json` to use your custom dataset. About the format of this file, please refer to `data/README.md`.
+> Please update `data/dataset_info.json` to use your custom dataset.
 
-### Dependence Installation (optional)
+### Dependence Installation
 
 ```bash
 git clone https://github.com/hiyouga/LLaMA-Factory.git
 conda create -n llama_factory python=3.10
 conda activate llama_factory
 cd LLaMA-Factory
-pip install -r requirements.txt
+pip install -e .[metrics]
 ```
 
+> [!TIP]
+> Extra dependencies available: deepspeed, metrics, unsloth, vllm, bitsandbytes, gptq, awq, aqlm, qwen, quality
+
+<details><summary>For Windows users</summary>
+
 If you want to enable the quantized LoRA (QLoRA) on the Windows platform, you will be required to install a pre-built version of `bitsandbytes` library, which supports CUDA 11.1 to 12.2, please select the appropriate [release version](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) based on your CUDA version.
 
 ```bash
@@ -318,352 +320,17 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 To enable FlashAttention-2 on the Windows platform, you need to install the precompiled `flash-attn` library, which supports CUDA 12.1 to 12.2. Please download the corresponding version from [flash-attention](https://github.com/bdashore3/flash-attention/releases) based on your requirements.
 
-### Use ModelScope Hub (optional)
+</details>
 
-If you have trouble with downloading models and datasets from Hugging Face, you can use LLaMA-Factory together with ModelScope in the following manner.
+### LLaMA Board GUI
 
-```bash
-export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows
-```
-
-Then you can train the corresponding model by specifying a model ID of the ModelScope Hub. (find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models))
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --model_name_or_path modelscope/Llama-2-7b-ms \
-    ... # arguments (same as below)
-```
-
-LLaMA Board also supports using the models and datasets on the ModelScope Hub.
-
-```bash
-CUDA_VISIBLE_DEVICES=0 USE_MODELSCOPE_HUB=1 python src/train_web.py
-```
-
-### Train on a single GPU
-
-> [!IMPORTANT]
-> If you want to train models on multiple GPUs, please refer to [Distributed Training](#distributed-training).
-
-
-#### LLaMA Board GUI
+#### Use local environment
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 python src/train_web.py
+# or CUDA_VISIBLE_DEVICES=0 python -m llmtuner.webui.interface
 ```
 
-#### Pre-Training
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage pt \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --dataset wiki_demo \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir path_to_pt_checkpoint \
-    --overwrite_cache \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --plot_loss \
-    --fp16
-```
-
-#### Supervised Fine-Tuning
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --dataset alpaca_gpt4_en \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir path_to_sft_checkpoint \
-    --overwrite_cache \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --plot_loss \
-    --fp16
-```
-
-#### Reward Modeling
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage rm \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_sft_checkpoint \
-    --create_new_adapter \
-    --dataset comparison_gpt4_en \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir path_to_rm_checkpoint \
-    --per_device_train_batch_size 2 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --plot_loss \
-    --fp16
-```
-
-#### PPO Training
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage ppo \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_sft_checkpoint \
-    --create_new_adapter \
-    --dataset alpaca_gpt4_en \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --reward_model path_to_rm_checkpoint \
-    --output_dir path_to_ppo_checkpoint \
-    --per_device_train_batch_size 2 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --top_k 0 \
-    --top_p 0.9 \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --plot_loss \
-    --fp16
-```
-
-> [!TIP]
-> Use `--adapter_name_or_path path_to_sft_checkpoint,path_to_ppo_checkpoint` to infer the fine-tuned model if `--create_new_adapter` was enabled.
-
-> [!WARNING]
-> Use `--per_device_train_batch_size=1` for LLaMA-2 models in fp16 PPO training.
-
-#### DPO Training
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage dpo \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_sft_checkpoint \
-    --create_new_adapter \
-    --dataset comparison_gpt4_en \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir path_to_dpo_checkpoint \
-    --per_device_train_batch_size 2 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --plot_loss \
-    --fp16
-```
-
-> [!TIP]
-> Use `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` to infer the fine-tuned model if `--create_new_adapter` was enabled.
-
-### Distributed Training
-
-#### Use Huggingface Accelerate
-
-```bash
-accelerate launch --config_file config.yaml src/train_bash.py \
-    --ddp_timeout 180000000 \
-    ... # arguments (same as above)
-```
-
-<details><summary>Example config.yaml for LoRA training</summary>
-
-```yaml
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: MULTI_GPU
-downcast_bf16: 'no'
-gpu_ids: all
-machine_rank: 0
-main_training_function: main
-mixed_precision: fp16
-num_machines: 1
-num_processes: 4
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
-```
-
-</details>
-
-> [!TIP]
-> We commend using Accelerate for LoRA tuning.
-
-#### Use DeepSpeed
-
-```bash
-deepspeed --num_gpus 8 src/train_bash.py \
-    --deepspeed ds_config.json \
-    --ddp_timeout 180000000 \
-    ... # arguments (same as above)
-```
-
-<details><summary>Example ds_config.json for full-parameter training with DeepSpeed ZeRO-2</summary>
-
-```json
-{
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "zero_allow_untested_optimizer": true,
-  "fp16": {
-    "enabled": "auto",
-    "loss_scale": 0,
-    "loss_scale_window": 1000,
-    "initial_scale_power": 16,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "bf16": {
-    "enabled": "auto"
-  },
-  "zero_optimization": {
-    "stage": 2,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5e8,
-    "overlap_comm": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5e8,
-    "contiguous_gradients": true,
-    "round_robin_gradients": true
-  }
-}
-```
-
-</details>
-
-> [!TIP]
-> Refer to [examples](examples) for more training scripts.
-
-### Merge LoRA weights and export model
-
-```bash
-CUDA_VISIBLE_DEVICES= python src/export_model.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template default \
-    --finetuning_type lora \
-    --export_dir path_to_export \
-    --export_size 2 \
-    --export_legacy_format False
-```
-
-> [!WARNING]
-> Merging LoRA weights into a quantized model is not supported.
-
-> [!TIP]
-> Use `--model_name_or_path path_to_export` solely to use the exported model.
-> 
-> Use `CUDA_VISIBLE_DEVICES=0`, `--export_quantization_bit 4` and `--export_quantization_dataset data/c4_demo.json` to quantize the model with AutoGPTQ after merging the LoRA weights.
-
-### Inference with OpenAI-style API
-
-```bash
-CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template default \
-    --finetuning_type lora
-```
-
-> [!TIP]
-> Visit `http://localhost:8000/docs` for API documentation.
-
-### Inference with command line
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/cli_demo.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template default \
-    --finetuning_type lora
-```
-
-### Inference with web browser
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/web_demo.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template default \
-    --finetuning_type lora
-```
-
-### Evaluation
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/evaluate.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template vanilla \
-    --finetuning_type lora \
-    --task mmlu \
-    --split test \
-    --lang en \
-    --n_shot 5 \
-    --batch_size 4
-```
-
-### Predict
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage sft \
-    --do_predict \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --dataset alpaca_gpt4_en \
-    --template default \
-    --finetuning_type lora \
-    --output_dir path_to_predict_result \
-    --per_device_eval_batch_size 1 \
-    --max_samples 100 \
-    --predict_with_generate \
-    --fp16
-```
-
-> [!WARNING]
-> Use `--per_device_train_batch_size=1` for LLaMA-2 models in fp16 predict.
-
-> [!TIP]
-> We recommend using `--per_device_eval_batch_size=1` and `--max_target_length 128` at 4/8-bit predict.
-
-### Dockerize Training
-
 #### Use Docker
 
 ```bash
@@ -692,6 +359,27 @@ docker compose -f ./docker-compose.yml up -d
 > * data: Place datasets on this dir of the host machine so that they can be selected on LLaMA Board GUI.
 > * output: Set export dir to this location so that the merged result can be accessed directly on the host machine.
 
+> [!WARNING]
+> LLaMA Board GUI does not yet support multi-GPU training.
+
+### Command Line Interface
+
+See [examples](examples) for usage.
+
+> [!TIP]
+> Use `python src/train_bash.py -h` to display argument descriptions.
+
+### Use ModelScope Hub
+
+If you have trouble with downloading models and datasets from Hugging Face, you can use ModelScope.
+
+```bash
+export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows
+```
+
+> [!TIP]
+> Train the model by specifying a model ID of the ModelScope Hub as the `--model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `modelscope/Llama-2-7b-ms`.
+
 ## Projects using LLaMA Factory
 
 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
@@ -738,7 +426,7 @@ If this work is helpful, please kindly cite as:
 
 ```bibtex
 @article{zheng2024llamafactory,
-  title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, 
+  title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models},
   author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Yongqiang Ma},
   journal={arXiv preprint arXiv:2403.13372},
   year={2024},
@@ -748,7 +436,7 @@ If this work is helpful, please kindly cite as:
 
 ## Acknowledgement
 
-This repo benefits from [PEFT](https://github.com/huggingface/peft), [QLoRA](https://github.com/artidoro/qlora) and [FastChat](https://github.com/lm-sys/FastChat). Thanks for their wonderful works.
+This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https://github.com/huggingface/trl), [QLoRA](https://github.com/artidoro/qlora) and [FastChat](https://github.com/lm-sys/FastChat). Thanks for their wonderful work.
 
 ## Star History
 
diff --git a/README_zh.md b/README_zh.md
index d591e852..8b19f17f 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -53,7 +53,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 性能指标
 
-与 ChatGLM 官方的 [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning) 微调相比，LLaMA-Factory 的 LoRA 微调提供了 **3.7 倍**的加速比，同时在广告文案生成任务上取得了更高的 Rouge 分数。结合 4 比特量化技术，LLaMA-Factory 的 QLoRA 微调进一步降低了 GPU 显存消耗。
+与 ChatGLM 官方的 [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning) 微调相比，LLaMA Factory 的 LoRA 微调提供了 **3.7 倍**的加速比，同时在广告文案生成任务上取得了更高的 Rouge 分数。结合 4 比特量化技术，LLaMA Factory 的 QLoRA 微调进一步降低了 GPU 显存消耗。
 
 ![benchmark](assets/benchmark.svg)
 
@@ -62,7 +62,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - **Training Speed**: 训练阶段每秒处理的样本数量。（批处理大小=4，截断长度=1024）
 - **Rouge Score**: [广告文案生成](https://aclanthology.org/D19-1321.pdf)任务验证集上的 Rouge-2 分数。（批处理大小=4，截断长度=1024）
 - **GPU Memory**: 4 比特量化训练的 GPU 显存峰值。（批处理大小=1，截断长度=1024）
-- 我们在 ChatGLM 的 P-Tuning 中采用 `pre_seq_len=128`，在 LLaMA-Factory 的 LoRA 微调中采用 `lora_rank=32`。
+- 我们在 ChatGLM 的 P-Tuning 中采用 `pre_seq_len=128`，在 LLaMA Factory 的 LoRA 微调中采用 `lora_rank=32`。
 
 </details>
 
@@ -72,7 +72,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看！
 
-[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/fsdp_qlora`。
+[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
 
 <details><summary>展开日志</summary>
 
@@ -168,9 +168,6 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | DPO 训练               | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | ORPO 训练              | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 
-> [!NOTE]
-> 请使用 `--quantization_bit 4` 参数来启用 QLoRA 训练。
-
 ## 数据集
 
 <details><summary>预训练数据集</summary>
@@ -263,7 +260,7 @@ huggingface-cli login
 | ------------ | ------- | --------- |
 | python       | 3.8     | 3.10      |
 | torch        | 1.13.1  | 2.2.0     |
-| transformers | 4.37.2  | 4.39.2    |
+| transformers | 4.37.2  | 4.39.3    |
 | datasets     | 2.14.3  | 2.18.0    |
 | accelerate   | 0.27.2  | 0.28.0    |
 | peft         | 0.9.0   | 0.10.0    |
@@ -293,23 +290,28 @@ huggingface-cli login
 
 ## 如何使用
 
-### 数据准备（可跳过）
+### 数据准备
 
-关于数据集文件的格式，请参考 [data/README_zh.md](data/README_zh.md) 的内容。构建自定义数据集时，既可以使用单个 `.json` 文件，也可以使用一个[数据加载脚本](https://huggingface.co/docs/datasets/dataset_script)和多个文件。
+关于数据集文件的格式，请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope 上的数据集或加载本地数据集。
 
 > [!NOTE]
-> 使用自定义数据集时，请更新 `data/dataset_info.json` 文件，该文件的格式请参考 `data/README_zh.md`。
+> 使用自定义数据集时，请更新 `data/dataset_info.json` 文件。
 
-### 环境搭建（可跳过）
+### 安装依赖
 
 ```bash
 git clone https://github.com/hiyouga/LLaMA-Factory.git
 conda create -n llama_factory python=3.10
 conda activate llama_factory
 cd LLaMA-Factory
-pip install -r requirements.txt
+pip install -e .[metrics]
 ```
 
+> [!TIP]
+> 可选的额外依赖项：deepspeed、metrics、unsloth、vllm、bitsandbytes、gptq、awq、aqlm、qwen、quality
+
+<details><summary>Windows 用户指南</summary>
+
 如果要在 Windows 平台上开启量化 LoRA（QLoRA），需要安装预编译的 `bitsandbytes` 库, 支持 CUDA 11.1 到 12.2, 请根据您的 CUDA 版本情况选择适合的[发布版本](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels)。
 
 ```bash
@@ -318,350 +320,17 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 如果要在 Windows 平台上开启 FlashAttention-2，需要安装预编译的 `flash-attn` 库，支持 CUDA 12.1 到 12.2，请根据需求到 [flash-attention](https://github.com/bdashore3/flash-attention/releases) 下载对应版本安装。
 
-### 使用魔搭社区（可跳过）
+</details>
 
-如果您在 Hugging Face 模型和数据集的下载中遇到了问题，可以通过下述方法使用魔搭社区。
+### LLaMA Board 可视化界面
 
-```bash
-export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
-```
-
-接着即可通过指定模型名称来训练对应的模型。（在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型）
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --model_name_or_path modelscope/Llama-2-7b-ms \
-    ... # 参数同下
-```
-
-LLaMA Board 同样支持魔搭社区的模型和数据集下载。
-
-```bash
-CUDA_VISIBLE_DEVICES=0 USE_MODELSCOPE_HUB=1 python src/train_web.py
-```
-
-### 单 GPU 训练
-
-> [!IMPORTANT]
-> 如果您使用多张 GPU 训练模型，请移步[多 GPU 分布式训练](#多-gpu-分布式训练)部分。
-
-#### LLaMA Board GUI
+#### 使用本地环境
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 python src/train_web.py
+# 或 CUDA_VISIBLE_DEVICES=0 python -m llmtuner.webui.interface
 ```
 
-#### 预训练
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage pt \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --dataset wiki_demo \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir path_to_pt_checkpoint \
-    --overwrite_cache \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --plot_loss \
-    --fp16
-```
-
-#### 指令监督微调
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --dataset alpaca_gpt4_zh \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir path_to_sft_checkpoint \
-    --overwrite_cache \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --plot_loss \
-    --fp16
-```
-
-#### 奖励模型训练
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage rm \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_sft_checkpoint \
-    --create_new_adapter \
-    --dataset comparison_gpt4_zh \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir path_to_rm_checkpoint \
-    --per_device_train_batch_size 2 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --plot_loss \
-    --fp16
-```
-
-#### PPO 训练
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage ppo \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_sft_checkpoint \
-    --create_new_adapter \
-    --dataset alpaca_gpt4_zh \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --reward_model path_to_rm_checkpoint \
-    --output_dir path_to_ppo_checkpoint \
-    --per_device_train_batch_size 2 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --top_k 0 \
-    --top_p 0.9 \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --plot_loss \
-    --fp16
-```
-
-> [!TIP]
-> 如果开启了 `--create_new_adapter`，则使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_ppo_checkpoint` 来进行微调模型的推理。
-
-> [!WARNING]
-> 如果使用 fp16 精度进行 LLaMA-2 模型的 PPO 训练，请使用 `--per_device_train_batch_size=1`。
-
-#### DPO 训练
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage dpo \
-    --do_train \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_sft_checkpoint \
-    --create_new_adapter \
-    --dataset comparison_gpt4_zh \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir path_to_dpo_checkpoint \
-    --per_device_train_batch_size 2 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 1000 \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --plot_loss \
-    --fp16
-```
-
-> [!TIP]
-> 如果开启了 `--create_new_adapter`，则使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` 来进行微调模型的推理。
-
-### 多 GPU 分布式训练
-
-#### 使用 Huggingface Accelerate
-
-```bash
-accelerate launch --config_file config.yaml src/train_bash.py \
-    --ddp_timeout 180000000 \
-    ... # 参数同上
-```
-
-<details><summary>使用 Accelerate 进行 LoRA 训练的 config.yaml 示例</summary>
-
-```yaml
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: MULTI_GPU
-downcast_bf16: 'no'
-gpu_ids: all
-machine_rank: 0
-main_training_function: main
-mixed_precision: fp16
-num_machines: 1
-num_processes: 4
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
-```
-
-</details>
-
-> [!TIP]
-> 我们推荐使用 Accelerate 进行 LoRA 训练。
-
-#### 使用 DeepSpeed
-
-```bash
-deepspeed --num_gpus 8 src/train_bash.py \
-    --deepspeed ds_config.json \
-    --ddp_timeout 180000000 \
-    ... # 参数同上
-```
-
-<details><summary>使用 DeepSpeed ZeRO-2 进行全参数训练的 ds_config.json 示例</summary>
-
-```json
-{
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "zero_allow_untested_optimizer": true,
-  "fp16": {
-    "enabled": "auto",
-    "loss_scale": 0,
-    "loss_scale_window": 1000,
-    "initial_scale_power": 16,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "bf16": {
-    "enabled": "auto"
-  },
-  "zero_optimization": {
-    "stage": 2,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5e8,
-    "overlap_comm": true,
-    "reduce_scatter": true,
-    "reduce_bucket_size": 5e8,
-    "contiguous_gradients": true,
-    "round_robin_gradients": true
-  }
-}
-```
-
-</details>
-
-> [!TIP]
-> 更多训练脚本请查看 [examples](examples)。
-
-### 合并 LoRA 权重并导出模型
-
-```bash
-CUDA_VISIBLE_DEVICES= python src/export_model.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template default \
-    --finetuning_type lora \
-    --export_dir path_to_export \
-    --export_size 2 \
-    --export_legacy_format False
-```
-
-> [!WARNING]
-> 尚不支持量化模型的 LoRA 权重合并及导出。
-
-> [!TIP]
-> 仅使用 `--model_name_or_path path_to_export` 来加载导出后的模型。
-> 
-> 合并 LoRA 权重之后可再次使用 `CUDA_VISIBLE_DEVICES=0`、`--export_quantization_bit 4` 和 `--export_quantization_dataset data/c4_demo.json` 基于 AutoGPTQ 量化模型。
-
-### 使用 OpenAI 风格 API 推理
-
-```bash
-CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template default \
-    --finetuning_type lora
-```
-
-> [!TIP]
-> 关于 API 文档请见 `http://localhost:8000/docs`。
-
-### 使用命令行推理
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/cli_demo.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template default \
-    --finetuning_type lora
-```
-
-### 使用浏览器推理
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/web_demo.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template default \
-    --finetuning_type lora
-```
-
-### 模型评估
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/evaluate.py \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --template vanilla \
-    --finetuning_type lora \
-    --task ceval \
-    --split validation \
-    --lang zh \
-    --n_shot 5 \
-    --batch_size 4
-```
-
-### 模型预测
-
-```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage sft \
-    --do_predict \
-    --model_name_or_path path_to_llama_model \
-    --adapter_name_or_path path_to_checkpoint \
-    --dataset alpaca_gpt4_zh \
-    --template default \
-    --finetuning_type lora \
-    --output_dir path_to_predict_result \
-    --per_device_eval_batch_size 1 \
-    --max_samples 100 \
-    --predict_with_generate \
-    --fp16
-```
-
-> [!WARNING]
-> 如果使用 fp16 精度进行 LLaMA-2 模型的预测，请使用 `--per_device_eval_batch_size=1`。
-
-> [!TIP]
-> 我们建议在量化模型的预测中使用 `--per_device_eval_batch_size=1` 和 `--max_target_length 128`。
-
-### 使用容器
 
 #### 使用 Docker
 
@@ -691,6 +360,27 @@ docker compose -f ./docker-compose.yml up -d
 > * data：宿主机中存放数据集的文件夹路径。
 > * output：将导出目录设置为该路径后，即可在宿主机中访问导出后的模型。
 
+> [!WARNING]
+> LLaMA Board 可视化界面尚不支持多 GPU 训练。
+
+### 命令行接口
+
+使用方法请参考 [examples](examples) 文件夹。
+
+> [!TIP]
+> 使用 `python src/train_bash.py -h` 查看参数文档。
+
+### 使用魔搭社区
+
+如果您在 Hugging Face 模型和数据集的下载中遇到了问题，可以通过下述方法使用魔搭社区。
+
+```bash
+export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
+```
+
+> [!TIP]
+> 将 `--model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型，例如 `modelscope/Llama-2-7b-ms`。
+
 ## 使用了 LLaMA Factory 的项目
 
 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
@@ -747,7 +437,7 @@ docker compose -f ./docker-compose.yml up -d
 
 ## 致谢
 
-本项目受益于 [PEFT](https://github.com/huggingface/peft)、[QLoRA](https://github.com/artidoro/qlora) 和 [FastChat](https://github.com/lm-sys/FastChat)，感谢以上诸位作者的付出。
+本项目受益于 [PEFT](https://github.com/huggingface/peft)、[TRL](https://github.com/huggingface/trl)、[QLoRA](https://github.com/artidoro/qlora) 和 [FastChat](https://github.com/lm-sys/FastChat)，感谢以上诸位作者的付出。
 
 ## Star History
 
diff --git a/examples/fsdp_qlora/README.md b/examples/extras/fsdp_qlora/README.md
similarity index 100%
rename from examples/fsdp_qlora/README.md
rename to examples/extras/fsdp_qlora/README.md
diff --git a/examples/fsdp_qlora/fsdp.sh b/examples/extras/fsdp_qlora/fsdp.sh
similarity index 87%
rename from examples/fsdp_qlora/fsdp.sh
rename to examples/extras/fsdp_qlora/fsdp.sh
index 2f7772a4..0fce3ecc 100644
--- a/examples/fsdp_qlora/fsdp.sh
+++ b/examples/extras/fsdp_qlora/fsdp.sh
@@ -15,11 +15,13 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
+    --gradient_accumulation_steps 4 \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
+    --warmup_steps 20 \
     --save_steps 100 \
     --eval_steps 100 \
     --evaluation_strategy steps \
@@ -28,6 +30,7 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
     --num_train_epochs 3.0 \
     --max_samples 3000 \
     --val_size 0.1 \
+    --ddp_timeout 180000000 \
     --quantization_bit 4 \
     --plot_loss \
     --fp16
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index b463cc10..d1382bc2 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -33,6 +33,6 @@ python -m torch.distributed.run \
     --num_train_epochs 3.0 \
     --max_samples 3000 \
     --val_size 0.1 \
-    --ddp_timeout 1800000 \
+    --ddp_timeout 180000000 \
     --plot_loss \
     --fp16
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index a54b81b3..ea4acf90 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -27,6 +27,6 @@ deepspeed --num_gpus 4 ../../src/train_bash.py \
     --num_train_epochs 3.0 \
     --max_samples 3000 \
     --val_size 0.1 \
-    --ddp_timeout 1800000 \
+    --ddp_timeout 180000000 \
     --plot_loss \
     --fp16
diff --git a/examples/inference/api_demo.sh b/examples/inference/api_demo.sh
new file mode 100644
index 00000000..4a601bb6
--- /dev/null
+++ b/examples/inference/api_demo.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Run from this script's directory (examples/inference); all paths below are relative to it.
+CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python ../../src/api_demo.py \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
+    --template default \
+    --finetuning_type lora
diff --git a/examples/inference/cli_demo.sh b/examples/inference/cli_demo.sh
new file mode 100644
index 00000000..fdeb01e6
--- /dev/null
+++ b/examples/inference/cli_demo.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Run from this script's directory (examples/inference); all paths below are relative to it.
+CUDA_VISIBLE_DEVICES=0 python ../../src/cli_demo.py \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
+    --template default \
+    --finetuning_type lora
diff --git a/examples/inference/evaluate.sh b/examples/inference/evaluate.sh
new file mode 100644
index 00000000..b3053662
--- /dev/null
+++ b/examples/inference/evaluate.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Run from this script's directory (examples/inference); all paths below are relative to it.
+CUDA_VISIBLE_DEVICES=0 python ../../src/evaluate.py \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
+    --template vanilla \
+    --finetuning_type lora \
+    --task mmlu \
+    --split test \
+    --lang en \
+    --n_shot 5 \
+    --batch_size 4
diff --git a/examples/inference/web_demo.sh b/examples/inference/web_demo.sh
new file mode 100644
index 00000000..0f8307fb
--- /dev/null
+++ b/examples/inference/web_demo.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Run from this script's directory (examples/inference); all paths below are relative to it.
+CUDA_VISIBLE_DEVICES=0 python ../../src/web_demo.py \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
+    --template default \
+    --finetuning_type lora
diff --git a/examples/lora_multi_gpu/multi_node.sh b/examples/lora_multi_gpu/multi_node.sh
index 562364a7..5172b9a6 100644
--- a/examples/lora_multi_gpu/multi_node.sh
+++ b/examples/lora_multi_gpu/multi_node.sh
@@ -30,6 +30,6 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
     --num_train_epochs 3.0 \
     --max_samples 3000 \
     --val_size 0.1 \
-    --ddp_timeout 1800000 \
+    --ddp_timeout 180000000 \
     --plot_loss \
     --fp16
diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh
index ddb9459d..269d76d7 100644
--- a/examples/lora_multi_gpu/single_node.sh
+++ b/examples/lora_multi_gpu/single_node.sh
@@ -30,6 +30,6 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch \
     --num_train_epochs 3.0 \
     --max_samples 3000 \
     --val_size 0.1 \
-    --ddp_timeout 1800000 \
+    --ddp_timeout 180000000 \
     --plot_loss \
     --fp16
diff --git a/examples/lora_single_gpu/prepare.sh b/examples/lora_single_gpu/prepare.sh
new file mode 100644
index 00000000..3652cea4
--- /dev/null
+++ b/examples/lora_single_gpu/prepare.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Pre-tokenizes the datasets without GPUs (empty CUDA_VISIBLE_DEVICES) and saves them to --tokenized_path.
+CUDA_VISIBLE_DEVICES= python ../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../data \
+    --template default \
+    --finetuning_type lora \
+    --lora_target q_proj,v_proj \
+    --output_dir ../../saves/LLaMA2-7B/lora/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --max_samples 3000 \
+    --tokenized_path ../../saves/datasets/sft
diff --git a/examples/merge_lora/README.md b/examples/merge_lora/README.md
index c6c16071..a095f288 100644
--- a/examples/merge_lora/README.md
+++ b/examples/merge_lora/README.md
@@ -1,3 +1,12 @@
+> [!WARNING]
+> Merging LoRA weights into a quantized model is not supported.
+
+> [!TIP]
+> Use only `--model_name_or_path path_to_model` to load an exported model or a model fine-tuned in full/freeze mode.
+>
+> Use `CUDA_VISIBLE_DEVICES=0`, `--export_quantization_bit 4` and `--export_quantization_dataset data/c4_demo.json` to quantize the model with AutoGPTQ after merging the LoRA weights.
+
+
 Usage:
 
 - `merge.sh`: merge the lora weights
diff --git a/examples/merge_lora/merge.sh b/examples/merge_lora/merge.sh
index 42b9fcdd..bd2babb8 100644
--- a/examples/merge_lora/merge.sh
+++ b/examples/merge_lora/merge.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/export_model.py \
+CUDA_VISIBLE_DEVICES= python ../../src/export_model.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template default \
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 0ab734e0..c22f9a77 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -6,6 +6,7 @@ from datasets import load_dataset, load_from_disk
 
 from ..extras.constants import FILEEXT2TYPE
 from ..extras.logging import get_logger
+from ..extras.misc import is_path_available
 from .aligner import align_dataset
 from .parser import get_dataset_list
 from .preprocess import get_preprocess_and_print_func
@@ -122,11 +123,12 @@ def get_dataset(
     if data_args.train_on_prompt and template.efficient_eos:
         raise ValueError("Current template does not support `train_on_prompt`.")
 
-    # Load from cache
-    if data_args.cache_path is not None:
-        if os.path.exists(data_args.cache_path):
+    # Load tokenized dataset
+    if data_args.tokenized_path is not None:
+        if not is_path_available(data_args.tokenized_path):
             logger.warning("Loading dataset from disk will ignore other data arguments.")
-            dataset = load_from_disk(data_args.cache_path)
+            dataset = load_from_disk(data_args.tokenized_path)
+            logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path))
             if data_args.streaming:
                 dataset = dataset.to_iterable_dataset()
             return dataset
@@ -158,10 +160,13 @@ def get_dataset(
 
         dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs)
 
-        if data_args.cache_path is not None and not os.path.exists(data_args.cache_path):
+        if data_args.tokenized_path is not None:
             if training_args.should_save:
-                dataset.save_to_disk(data_args.cache_path)
-                logger.info("Dataset cache saved at {}.".format(data_args.cache_path))
+                dataset.save_to_disk(data_args.tokenized_path)
+                logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path))
+                logger.info("Please restart the training with `--tokenized_path {}`.".format(data_args.tokenized_path))
+
+            exit(0)
 
         if training_args.should_log:
             try:
diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index 60cf153b..a696b315 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -193,6 +193,18 @@ def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype:
         return torch.float32
 
 
+def is_path_available(path: os.PathLike) -> bool:
+    r"""
+    Checks if the path does not exist or is an empty directory.
+    """
+    if not os.path.exists(path):
+        return True
+    elif os.path.isdir(path) and not os.listdir(path):
+        return True
+    else:
+        return False
+
+
 def torch_gc() -> None:
     r"""
     Collects GPU memory.
diff --git a/src/llmtuner/extras/patches/llama_patch.py b/src/llmtuner/extras/patches/llama_patch.py
index f0b65d65..6a90c41a 100644
--- a/src/llmtuner/extras/patches/llama_patch.py
+++ b/src/llmtuner/extras/patches/llama_patch.py
@@ -193,6 +193,6 @@ def llama_flash_attn_forward(
 
 
 def apply_llama_patch() -> None:
-    require_version("transformers==4.39.2", "To fix: pip install transformers==4.39.2")
+    require_version("transformers==4.39.3", "To fix: pip install transformers==4.39.3")
     LlamaAttention.forward = llama_torch_attn_forward
     LlamaFlashAttention2.forward = llama_flash_attn_forward
diff --git a/src/llmtuner/extras/patches/mixtral_patch.py b/src/llmtuner/extras/patches/mixtral_patch.py
deleted file mode 100644
index 382492e0..00000000
--- a/src/llmtuner/extras/patches/mixtral_patch.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import torch
-import torch.nn.functional as F
-from transformers.models.mixtral.modeling_mixtral import MixtralBLockSparseTop2MLP, MixtralSparseMoeBlock
-
-
-def mlp_forward(self: "MixtralBLockSparseTop2MLP", hidden_states: torch.Tensor) -> torch.Tensor:
-    current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
-    current_hidden_states = self.w2(current_hidden_states)
-    return current_hidden_states
-
-
-# Modified from: https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py
-def moe_forward(self: "MixtralSparseMoeBlock", hidden_states: torch.Tensor) -> torch.Tensor:
-    batch_size, sequence_length, hidden_dim = hidden_states.shape
-    hidden_states = hidden_states.view(-1, hidden_dim)
-    # router_logits: (batch * sequence_length, n_experts)
-    router_logits = self.gate(hidden_states)
-
-    routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-    topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
-    topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
-    # we cast back to the input dtype
-    topk_weight = topk_weight.to(hidden_states.dtype)
-
-    hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
-    y = torch.empty_like(hidden_states)
-    flat_topk_idx = topk_idx.view(-1)
-    for i in range(self.num_experts):
-        expert = self.experts[i]
-        y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
-    y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
-    final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
-    return final_hidden_states, router_logits
-
-
-def patch_mixtral_replace_moe_impl() -> None:
-    MixtralBLockSparseTop2MLP.forward = mlp_forward
-    MixtralSparseMoeBlock.forward = moe_forward
diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py
index 76e6d6da..f5f75c77 100644
--- a/src/llmtuner/hparams/data_args.py
+++ b/src/llmtuner/hparams/data_args.py
@@ -84,9 +84,9 @@ class DataArguments:
             "help": "Whether or not to pack the sequences in training. Will automatically enable in pre-training."
         },
     )
-    cache_path: Optional[str] = field(
+    tokenized_path: Optional[str] = field(
         default=None,
-        metadata={"help": "Path to save or load the pre-processed datasets."},
+        metadata={"help": "Path to save or load the tokenized datasets."},
     )
 
     def __post_init__(self):
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 7132470a..434a3a84 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -17,8 +17,7 @@ from ..extras.logging import get_logger
 from ..extras.misc import get_current_device, infer_optim_dtype
 from ..extras.packages import is_flash_attn2_available
 from ..extras.patches.llama_patch import apply_llama_patch
-from ..extras.patches.mixtral_patch import patch_mixtral_replace_moe_impl
-from .utils import QuantizationMethod
+from .utils import QuantizationMethod, add_z3_leaf_module
 
 
 if TYPE_CHECKING:
@@ -32,47 +31,6 @@ logger = get_logger(__name__)
 SUPPORTED_CLASS_FOR_S2ATTN = ["llama"]
 
 
-def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int):
-    embedding_dim = embed_weight.size(1)
-    avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)
-    noise_weight = torch.empty_like(embed_weight[-num_new_tokens:])
-    noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim)))
-    embed_weight[-num_new_tokens:] = avg_weight + noise_weight
-
-
-def _resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None:
-    r"""
-    Resize token embeddings.
-    """
-    if is_deepspeed_zero3_enabled():
-        import deepspeed  # type: ignore
-
-        params = [model.get_input_embeddings().weight]
-        if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings:
-            params.append(model.get_output_embeddings().weight)
-
-        context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0)
-    else:
-        context_maybe_zero3 = nullcontext()
-
-    with context_maybe_zero3:
-        current_embedding_size = model.get_input_embeddings().weight.size(0)
-
-    if len(tokenizer) > current_embedding_size:
-        if not isinstance(model.get_output_embeddings(), torch.nn.Linear):
-            logger.warning("Current model does not support resizing token embeddings.")
-            return
-
-        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
-        with context_maybe_zero3:
-            new_embedding_size = model.get_input_embeddings().weight.size(0)
-            num_new_tokens = new_embedding_size - current_embedding_size
-            _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens)
-            _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens)
-
-        logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size))
-
-
 def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]:
     r"""
     Inspired by: https://github.com/huggingface/optimum/blob/v1.16.0/optimum/gptq/data.py#L133
@@ -180,8 +138,12 @@ def _configure_quantization(
         quant_method = quantization_config.get("quant_method", "")
 
         if quant_method == QuantizationMethod.GPTQ:
+            require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0")
             quantization_config["use_exllama"] = False  # disable exllama
 
+        if quant_method == QuantizationMethod.AWQ:
+            require_version("autoawq", "To fix: pip install autoawq")
+
         if quant_method == QuantizationMethod.AQLM:
             require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0")
             require_version("aqlm>=1.1.0", "To fix: pip install aqlm[gpu]>=1.1.0")
@@ -235,6 +197,47 @@ def _configure_quantization(
         logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit))
 
 
+def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int):
+    embedding_dim = embed_weight.size(1)
+    avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)
+    noise_weight = torch.empty_like(embed_weight[-num_new_tokens:])
+    noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim)))
+    embed_weight[-num_new_tokens:] = avg_weight + noise_weight
+
+
+def _resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None:
+    r"""
+    Resize token embeddings.
+    """
+    if is_deepspeed_zero3_enabled():
+        import deepspeed  # type: ignore
+
+        params = [model.get_input_embeddings().weight]
+        if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings:
+            params.append(model.get_output_embeddings().weight)
+
+        context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0)
+    else:
+        context_maybe_zero3 = nullcontext()
+
+    with context_maybe_zero3:
+        current_embedding_size = model.get_input_embeddings().weight.size(0)
+
+    if len(tokenizer) > current_embedding_size:
+        if not isinstance(model.get_output_embeddings(), torch.nn.Linear):
+            logger.warning("Current model does not support resizing token embeddings.")
+            return
+
+        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
+        with context_maybe_zero3:
+            new_embedding_size = model.get_input_embeddings().weight.size(0)
+            num_new_tokens = new_embedding_size - current_embedding_size
+            _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens)
+            _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens)
+
+        logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size))
+
+
 def _fp32_forward_post_hook(
     module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor"
 ) -> "torch.Tensor":
@@ -348,15 +351,15 @@ def patch_model(
     if is_trainable:
         _prepare_model_for_training(model, model_args)
 
-    if getattr(model.config, "model_type", None) == "mixtral" and is_deepspeed_zero3_enabled():
-        require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0")
-        from deepspeed.utils import set_z3_leaf_modules  # type: ignore
+    if getattr(model.config, "model_type", None) == "mixtral":
         from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
 
-        set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
+        add_z3_leaf_module(model, MixtralSparseMoeBlock)
 
-        if is_trainable:
-            patch_mixtral_replace_moe_impl()
+    if getattr(model.config, "model_type", None) == "qwen2_moe":  # transformers names this type "qwen2_moe"
+        from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
+
+        add_z3_leaf_module(model, Qwen2MoeSparseMoeBlock)
 
     try:
         model.add_model_tags(["llama-factory"])
diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py
index 1b96a9dd..771e6112 100644
--- a/src/llmtuner/model/utils.py
+++ b/src/llmtuner/model/utils.py
@@ -3,7 +3,9 @@ from typing import TYPE_CHECKING, Dict, List
 
 import torch
 from transformers import PreTrainedModel
+from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.utils import cached_file
+from transformers.utils.versions import require_version
 
 from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
 from ..extras.logging import get_logger
@@ -28,11 +30,23 @@ class QuantizationMethod(str, Enum):
     GPTQ = "gptq"
     AWQ = "awq"
     AQLM = "aqlm"
+    QUANTO = "quanto"
+
+
+def add_z3_leaf_module(model: "PreTrainedModel", module: "torch.nn.Module") -> None:
+    r"""
+    Sets module as a leaf module to skip partitioning in deepspeed zero3.
+    """
+    if is_deepspeed_zero3_enabled():
+        require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0")
+        from deepspeed.utils import set_z3_leaf_modules  # type: ignore
+
+        set_z3_leaf_modules(model, [module])
 
 
 def find_all_linear_modules(model: "PreTrainedModel") -> List[str]:
     r"""
-    Finds all available modules to apply lora.
+    Finds all available modules to apply lora or galore.
     """
     quantization_method = getattr(model, "quantization_method", None)
     if quantization_method is None:
diff --git a/src/llmtuner/train/dpo/collator.py b/src/llmtuner/train/dpo/collator.py
deleted file mode 100644
index 7e8ba1c5..00000000
--- a/src/llmtuner/train/dpo/collator.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from dataclasses import dataclass
-from typing import Any, Dict, List, Sequence, Tuple
-
-import torch
-from transformers import DataCollatorForSeq2Seq
-
-
-@dataclass
-class DPODataCollatorWithPadding(DataCollatorForSeq2Seq):
-    r"""
-    Data collator for pairwise data.
-    """
-
-    def _pad_labels(self, batch: torch.Tensor, positions: List[Tuple[int, int]]) -> torch.Tensor:
-        padded_labels = []
-        for feature, (prompt_len, answer_len) in zip(batch, positions):
-            if self.tokenizer.padding_side == "left":
-                start, end = feature.size(0) - answer_len, feature.size(0)
-            else:
-                start, end = prompt_len, prompt_len + answer_len
-            padded_tensor = self.label_pad_token_id * torch.ones_like(feature)
-            padded_tensor[start:end] = feature[start:end]
-            padded_labels.append(padded_tensor)
-        return torch.stack(padded_labels, dim=0).contiguous()  # in contiguous memory
-
-    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
-        r"""
-        Pads batched data to the longest sequence in the batch.
-
-        We generate 2 * n examples where the first n examples represent chosen examples and
-        the last n examples represent rejected examples.
-        """
-        concatenated_features = []
-        label_positions = []
-        for key in ("chosen_ids", "rejected_ids"):
-            for feature in features:
-                prompt_len, answer_len = len(feature["prompt_ids"]), len(feature[key])
-                concatenated_features.append(
-                    {
-                        "input_ids": feature["prompt_ids"] + feature[key],
-                        "attention_mask": [1] * (prompt_len + answer_len),
-                    }
-                )
-                label_positions.append((prompt_len, answer_len))
-
-        batch = self.tokenizer.pad(
-            concatenated_features,
-            padding=self.padding,
-            max_length=self.max_length,
-            pad_to_multiple_of=self.pad_to_multiple_of,
-            return_tensors=self.return_tensors,
-        )
-        batch["labels"] = self._pad_labels(batch["input_ids"], label_positions)
-        return batch
diff --git a/src/llmtuner/train/rm/collator.py b/src/llmtuner/train/rm/collator.py
deleted file mode 100644
index 8d5d4ada..00000000
--- a/src/llmtuner/train/rm/collator.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from dataclasses import dataclass
-from typing import Any, Dict, Sequence
-
-import torch
-from transformers import DataCollatorWithPadding
-
-
-@dataclass
-class PairwiseDataCollatorWithPadding(DataCollatorWithPadding):
-    r"""
-    Data collator for pairwise data.
-    """
-
-    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
-        r"""
-        Pads batched data to the longest sequence in the batch.
-
-        We generate 2 * n examples where the first n examples represent chosen examples and
-        the last n examples represent rejected examples.
-        """
-        features = [
-            {
-                "input_ids": feature["prompt_ids"] + feature[key],
-                "attention_mask": [1] * (len(feature["prompt_ids"]) + len(feature[key])),
-            }
-            for key in ("chosen_ids", "rejected_ids")
-            for feature in features
-        ]
-        return super().__call__(features)

From 2074cf99fb4202696f20b470c79985898363b70c Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 20:22:11 +0800
Subject: [PATCH 030/341] update readme

Former-commit-id: 0c73d3c8a5762a8f119b27322ffd52a61de6fe38
---
 README.md    | 27 +++++++++++++++------------
 README_zh.md | 27 +++++++++++++++------------
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 6450e61e..99278fbc 100644
--- a/README.md
+++ b/README.md
@@ -307,8 +307,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-> [!TIP]
-> Extra dependencies available: deepspeed, metrics, unsloth, vllm, bitsandbytes, gptq, awq, aqlm, qwen, quality
+Extra dependencies available: deepspeed, metrics, unsloth, vllm, bitsandbytes, gptq, awq, aqlm, qwen, quality
 
 <details><summary>For Windows users</summary>
 
@@ -324,6 +323,9 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 
 ### LLaMA Board GUI
 
+> [!IMPORTANT]
+> LLaMA Board GUI only supports training on a single GPU. Please use the [CLI](#command-line-interface) for distributed training.
+
 #### Use local environment
 
 ```bash
@@ -335,7 +337,6 @@ CUDA_VISIBLE_DEVICES=0 python src/train_web.py
 
 ```bash
 docker build -f ./Dockerfile -t llama-factory:latest .
-
 docker run --gpus=all \
     -v ./hf_cache:/root/.cache/huggingface/ \
     -v ./data:/app/data \
@@ -353,14 +354,13 @@ docker run --gpus=all \
 docker compose -f ./docker-compose.yml up -d
 ```
 
-> [!TIP]
-> Details about volume:
-> * hf_cache: Utilize Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory.
-> * data: Place datasets on this dir of the host machine so that they can be selected on LLaMA Board GUI.
-> * output: Set export dir to this location so that the merged result can be accessed directly on the host machine.
+<details><summary>Details about volume</summary>
 
-> [!WARNING]
-> LLaMA Board GUI does not yet support multi-GPUs training.
+- hf_cache: Utilize Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory.
+- data: Place datasets on this dir of the host machine so that they can be selected on LLaMA Board GUI.
+- output: Set export dir to this location so that the merged result can be accessed directly on the host machine.
+
+</details>
 
 ### Command Line Interface
 
@@ -377,11 +377,12 @@ If you have trouble with downloading models and datasets from Hugging Face, you
 export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows
 ```
 
-> [!TIP]
-> Train the model by specifying a model ID of the ModelScope Hub as the `--model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `modelscope/Llama-2-7b-ms`.
+Train the model by specifying a model ID of the ModelScope Hub as the `--model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `modelscope/Llama-2-7b-ms`.
 
 ## Projects using LLaMA Factory
 
+<details><summary>Click to show</summary>
+
 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
 1. Yu et al. Open, Closed, or Small Language Models for Text Classification? 2023. [[arxiv]](https://arxiv.org/abs/2308.10092)
 1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526)
@@ -411,6 +412,8 @@ export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: A series of large language models for Chinese medical domain, based on LLaMA2-7B and Baichuan-13B.
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods.
 
+</details>
+
 > [!TIP]
 > If you have a project that should be incorporated, please contact via email or create a pull request.
 
diff --git a/README_zh.md b/README_zh.md
index 8b19f17f..d6dea04d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -307,8 +307,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-> [!TIP]
-> 可选的额外依赖项：deepspeed、metrics、unsloth、vllm、bitsandbytes、gptq、awq、aqlm、qwen、quality
+可选的额外依赖项：deepspeed、metrics、unsloth、vllm、bitsandbytes、gptq、awq、aqlm、qwen、quality
 
 <details><summary>Windows 用户指南</summary>
 
@@ -324,6 +323,9 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 ### LLaMA Board 可视化界面
 
+> [!IMPORTANT]
+> LLaMA Board 可视化界面目前仅支持单 GPU 训练，请使用[命令行接口](#命令行接口)来进行分布式训练。
+
 #### 使用本地环境
 
 ```bash
@@ -331,7 +333,6 @@ CUDA_VISIBLE_DEVICES=0 python src/train_web.py
 # 或 CUDA_VISIBLE_DEVICES=0 python -m llmtuner.webui.interface
 ```
 
-
 #### 使用 Docker
 
 ```bash
@@ -354,14 +355,13 @@ docker run --gpus=all \
 docker compose -f ./docker-compose.yml up -d
 ```
 
-> [!TIP]
-> 数据卷详情：
-> * hf_cache：使用宿主机的 Hugging Face 缓存文件夹，允许更改为新的目录。
-> * data：宿主机中存放数据集的文件夹路径。
-> * output：将导出目录设置为该路径后，即可在宿主机中访问导出后的模型。
+<details><summary>数据卷详情</summary>
 
-> [!WARNING]
-> LLaMA Board 可视化界面尚不支持多 GPU 训练。
+- hf_cache：使用宿主机的 Hugging Face 缓存文件夹，允许更改为新的目录。
+- data：宿主机中存放数据集的文件夹路径。
+- output：将导出目录设置为该路径后，即可在宿主机中访问导出后的模型。
+
+</details>
 
 ### 命令行接口
 
@@ -378,11 +378,12 @@ docker compose -f ./docker-compose.yml up -d
 export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 ```
 
-> [!TIP]
-> 将 `--model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型，例如 `modelscope/Llama-2-7b-ms`。
+将 `--model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型，例如 `modelscope/Llama-2-7b-ms`。
 
 ## 使用了 LLaMA Factory 的项目
 
+<details><summary>点击显示</summary>
+
 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
 1. Yu et al. Open, Closed, or Small Language Models for Text Classification? 2023. [[arxiv]](https://arxiv.org/abs/2308.10092)
 1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526)
@@ -412,6 +413,8 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: 医疗大模型项目 CareGPT，基于 LLaMA2-7B 和 Baichuan-13B 在中文医疗数据上微调而得。
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**：MBTI性格大模型项目，根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。
 
+</details>
+
 > [!TIP]
 > 如果您有项目希望添加至上述列表，请通过邮件联系或者创建一个 PR。
 

From c1510d19c79aff38805dec69951be52257b96604 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 20:37:37 +0800
Subject: [PATCH 031/341] update readme

Former-commit-id: 9b8e7ccdab167f53fb897e1940562682324e8ff0
---
 README.md                                     | 21 ++++++---
 README_zh.md                                  | 22 ++++++----
 examples/README.md                            | 43 +++++++++++++++++++
 .../extras/fsdp_qlora/{fsdp.sh => sft.sh}     |  0
 4 files changed, 71 insertions(+), 15 deletions(-)
 create mode 100644 examples/README.md
 rename examples/extras/fsdp_qlora/{fsdp.sh => sft.sh} (100%)

diff --git a/README.md b/README.md
index 99278fbc..c374ab38 100644
--- a/README.md
+++ b/README.md
@@ -245,8 +245,6 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ
 
 </details>
 
-Please refer to [data/README.md](data/README.md) for details.
-
 Some datasets require confirmation before using them, so we recommend logging in with your Hugging Face account using these commands.
 
 ```bash
@@ -366,8 +364,18 @@ docker compose -f ./docker-compose.yml up -d
 
 See [examples](examples) for usage.
 
-> [!TIP]
-> Use `python src/train_bash.py -h` to display arguments description.
+Use `python src/train_bash.py -h` to display arguments description.
+
+### Deploy with OpenAI-style API and vLLM
+
+```bash
+CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \
+    --model_name_or_path path_to_model \
+    --adapter_name_or_path path_to_lora_adapter \
+    --template default \
+    --finetuning_type lora \
+    --infer_backend vllm
+```
 
 ### Use ModelScope Hub
 
@@ -381,6 +389,8 @@ Train the model by specifying a model ID of the ModelScope Hub as the `--model_n
 
 ## Projects using LLaMA Factory
 
+If you have a project that should be incorporated, please contact via email or create a pull request.
+
 <details><summary>Click to show</summary>
 
 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
@@ -414,9 +424,6 @@ Train the model by specifying a model ID of the ModelScope Hub as the `--model_n
 
 </details>
 
-> [!TIP]
-> If you have a project that should be incorporated, please contact via email or create a pull request.
-
 ## License
 
 This repository is licensed under the [Apache-2.0 License](LICENSE).
diff --git a/README_zh.md b/README_zh.md
index d6dea04d..6f060460 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -245,8 +245,6 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 </details>
 
-使用方法请参考 [data/README_zh.md](data/README_zh.md) 文件。
-
 部分数据集的使用需要确认，我们推荐使用下述命令登录您的 Hugging Face 账户。
 
 ```bash
@@ -337,7 +335,6 @@ CUDA_VISIBLE_DEVICES=0 python src/train_web.py
 
 ```bash
 docker build -f ./Dockerfile -t llama-factory:latest .
-
 docker run --gpus=all \
     -v ./hf_cache:/root/.cache/huggingface/ \
     -v ./data:/app/data \
@@ -367,8 +364,18 @@ docker compose -f ./docker-compose.yml up -d
 
 使用方法请参考 [examples](examples) 文件夹。
 
-> [!TIP]
-> 使用 `python src/train_bash.py -h` 查看参数文档。
+使用 `python src/train_bash.py -h` 查看参数文档。
+
+### 使用 OpenAI 风格 API 和 vLLM 部署
+
+```bash
+CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \
+    --model_name_or_path path_to_model \
+    --adapter_name_or_path path_to_lora_adapter \
+    --template default \
+    --finetuning_type lora \
+    --infer_backend vllm
+```
 
 ### 使用魔搭社区
 
@@ -382,6 +389,8 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 
 ## 使用了 LLaMA Factory 的项目
 
+如果您有项目希望添加至下述列表，请通过邮件联系或者创建一个 PR。
+
 <details><summary>点击显示</summary>
 
 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
@@ -415,9 +424,6 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 
 </details>
 
-> [!TIP]
-> 如果您有项目希望添加至上述列表，请通过邮件联系或者创建一个 PR。
-
 ## 协议
 
 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 00000000..71c4b742
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,43 @@
+We provide diverse examples about fine-tuning LLMs.
+
+```
+examples/
+├── lora_single_gpu/
+│   ├── pt.sh: Pre-training
+│   ├── sft.sh: Supervised fine-tuning
+│   ├── reward.sh: Reward modeling
+│   ├── ppo.sh: PPO training
+│   ├── dpo.sh: DPO training
+│   ├── orpo.sh: ORPO training
+│   ├── prepare.sh: Save tokenized dataset
+│   └── predict.sh: Batch prediction
+├── qlora_single_gpu/
+│   ├── bitsandbytes.sh
+│   ├── gptq.sh
+│   ├── awq.sh
+│   └── aqlm.sh
+├── lora_multi_gpu/
+│   ├── single_node.sh
+│   └── multi_node.sh
+├── full_multi_gpu/
+│   ├── single_node.sh
+│   └── multi_node.sh
+├── merge_lora/
+│   ├── merge.sh
+│   └── quantize.sh
+├── inference/
+│   ├── cli_demo.sh
+│   ├── api_demo.sh
+│   ├── web_demo.sh
+│   └── evaluate.sh
+└── extras/
+    ├── galore/
+    │   └── sft.sh
+    ├── loraplus/
+    │   └── sft.sh
+    ├── llama_pro/
+    │   ├── expand.sh
+    │   └── sft.sh
+    └── fsdp_qlora/
+        └── sft.sh
+```
diff --git a/examples/extras/fsdp_qlora/fsdp.sh b/examples/extras/fsdp_qlora/sft.sh
similarity index 100%
rename from examples/extras/fsdp_qlora/fsdp.sh
rename to examples/extras/fsdp_qlora/sft.sh

From 933a084999b534aaaf3358f4458096184e46c1ae Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 20:41:49 +0800
Subject: [PATCH 032/341] update examples

Former-commit-id: bf36b16e48d6438de6d0b2f2bfe33f7895699b9d
---
 examples/README.md                   |  4 ++--
 examples/extras/fsdp_qlora/README.md |  5 -----
 examples/extras/fsdp_qlora/sft.sh    |  4 ++++
 examples/lora_single_gpu/README.md   |  9 ---------
 examples/merge_lora/README.md        | 13 -------------
 5 files changed, 6 insertions(+), 29 deletions(-)
 delete mode 100644 examples/extras/fsdp_qlora/README.md
 delete mode 100644 examples/lora_single_gpu/README.md
 delete mode 100644 examples/merge_lora/README.md

diff --git a/examples/README.md b/examples/README.md
index 71c4b742..6aeaaab1 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -23,8 +23,8 @@ examples/
 │   ├── single_node.sh
 │   └── multi_node.sh
 ├── merge_lora/
-│   ├── merge.sh
-│   └── quantize.sh
+│   ├── merge.sh: Merge LoRA weights
+│   └── quantize.sh: Quantize with AutoGPTQ
 ├── inference/
 │   ├── cli_demo.sh
 │   ├── api_demo.sh
diff --git a/examples/extras/fsdp_qlora/README.md b/examples/extras/fsdp_qlora/README.md
deleted file mode 100644
index 39cd2dbe..00000000
--- a/examples/extras/fsdp_qlora/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-```bash
-pip install "transformers>=4.39.1"
-pip install "accelerate>=0.28.0"
-pip install "bitsandbytes>=0.43.0"
-```
diff --git a/examples/extras/fsdp_qlora/sft.sh b/examples/extras/fsdp_qlora/sft.sh
index 0fce3ecc..8ffb5f2e 100644
--- a/examples/extras/fsdp_qlora/sft.sh
+++ b/examples/extras/fsdp_qlora/sft.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+pip install "transformers>=4.39.1"
+pip install "accelerate>=0.28.0"
+pip install "bitsandbytes>=0.43.0"
+
 CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
     --config_file ../accelerate/fsdp_config.yaml \
     ../../src/train_bash.py \
diff --git a/examples/lora_single_gpu/README.md b/examples/lora_single_gpu/README.md
deleted file mode 100644
index 151d0784..00000000
--- a/examples/lora_single_gpu/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-Usage:
-
-- `pretrain.sh`: do pre-train (optional)
-- `sft.sh`: do supervised fine-tuning
-- `reward.sh`: do reward modeling (must after sft.sh)
-- `ppo.sh`: do PPO training (must after sft.sh and reward.sh)
-- `dpo.sh`: do DPO training (must after sft.sh)
-- `orpo.sh`: do ORPO training
-- `predict.sh`: do predict (must after sft.sh and dpo.sh)
diff --git a/examples/merge_lora/README.md b/examples/merge_lora/README.md
deleted file mode 100644
index a095f288..00000000
--- a/examples/merge_lora/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-> [!WARNING]
-> Merging LoRA weights into a quantized model is not supported.
-
-> [!TIP]
-> Use `--model_name_or_path path_to_model` solely to use the exported model or model fine-tuned in full/freeze mode.
->
-> Use `CUDA_VISIBLE_DEVICES=0`, `--export_quantization_bit 4` and `--export_quantization_dataset data/c4_demo.json` to quantize the model with AutoGPTQ after merging the LoRA weights.
-
-
-Usage:
-
-- `merge.sh`: merge the lora weights
-- `quantize.sh`: quantize the model with AutoGPTQ (must after merge.sh, optional)

From 38b59664e6a25034354fe0472d699b327269c0c1 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 20:51:21 +0800
Subject: [PATCH 033/341] update examples

Former-commit-id: c078582a759f6bce6e760cd39a05883f7eb194fe
---
 examples/README.md                | 52 +++++++++++++++----------------
 examples/extras/fsdp_qlora/sft.sh |  8 ++---
 examples/inference/api_demo.sh    |  2 +-
 examples/inference/cli_demo.sh    |  2 +-
 examples/inference/evaluate.sh    |  2 +-
 examples/inference/web_demo.sh    |  2 +-
 6 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 6aeaaab1..6732faaf 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,41 +3,41 @@ We provide diverse examples about fine-tuning LLMs.
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pt.sh: Pre-training
-│   ├── sft.sh: Supervised fine-tuning
-│   ├── reward.sh: Reward modeling
-│   ├── ppo.sh: PPO training
-│   ├── dpo.sh: DPO training
-│   ├── orpo.sh: ORPO training
+│   ├── pt.sh: Do pre-training
+│   ├── sft.sh: Do supervised fine-tuning
+│   ├── reward.sh: Do reward modeling
+│   ├── ppo.sh: Do PPO training
+│   ├── dpo.sh: Do DPO training
+│   ├── orpo.sh: Do ORPO training
 │   ├── prepare.sh: Save tokenized dataset
-│   └── predict.sh: Batch prediction
+│   └── predict.sh: Do batch predict
 ├── qlora_single_gpu/
-│   ├── bitsandbytes.sh
-│   ├── gptq.sh
-│   ├── awq.sh
-│   └── aqlm.sh
+│   ├── bitsandbytes.sh: Fine-tune 4/8-bit BNB models
+│   ├── gptq.sh: Fine-tune 4/8-bit GPTQ models
+│   ├── awq.sh: Fine-tune 4-bit AWQ models
+│   └── aqlm.sh: Fine-tune 2-bit AQLM models
 ├── lora_multi_gpu/
-│   ├── single_node.sh
-│   └── multi_node.sh
+│   ├── single_node.sh: Fine-tune model with Accelerate on single node
+│   └── multi_node.sh: Fine-tune model with Accelerate on multiple nodes
 ├── full_multi_gpu/
-│   ├── single_node.sh
-│   └── multi_node.sh
+│   ├── single_node.sh: Fine-tune model with DeepSpeed on single node
+│   └── multi_node.sh: Fine-tune model with DeepSpeed on multiple nodes
 ├── merge_lora/
-│   ├── merge.sh: Merge LoRA weights
-│   └── quantize.sh: Quantize with AutoGPTQ
+│   ├── merge.sh: Merge LoRA weights into the pre-trained models
+│   └── quantize.sh: Quantize fine-tuned model with AutoGPTQ
 ├── inference/
-│   ├── cli_demo.sh
-│   ├── api_demo.sh
-│   ├── web_demo.sh
-│   └── evaluate.sh
+│   ├── cli_demo.sh: Launch a command line interface
+│   ├── api_demo.sh: Launch an OpenAI-style API
+│   ├── web_demo.sh: Launch a web interface
+│   └── evaluate.sh: Evaluate model on the MMLU benchmark
 └── extras/
     ├── galore/
-    │   └── sft.sh
+    │   └── sft.sh: Fine-tune model with GaLore
     ├── loraplus/
-    │   └── sft.sh
+    │   └── sft.sh: Fine-tune model with LoRA+
     ├── llama_pro/
-    │   ├── expand.sh
-    │   └── sft.sh
+    │   ├── expand.sh: Expand layers in the model
+    │   └── sft.sh: Fine-tune expanded model
     └── fsdp_qlora/
-        └── sft.sh
+        └── sft.sh: Fine-tune quantized model with FSDP
 ```
diff --git a/examples/extras/fsdp_qlora/sft.sh b/examples/extras/fsdp_qlora/sft.sh
index 8ffb5f2e..614245d3 100644
--- a/examples/extras/fsdp_qlora/sft.sh
+++ b/examples/extras/fsdp_qlora/sft.sh
@@ -5,17 +5,17 @@ pip install "accelerate>=0.28.0"
 pip install "bitsandbytes>=0.43.0"
 
 CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
-    --config_file ../accelerate/fsdp_config.yaml \
-    ../../src/train_bash.py \
+    --config_file ../../accelerate/fsdp_config.yaml \
+    ../../../src/train_bash.py \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-70b-hf \
     --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
+    --dataset_dir ../../../data \
     --template default \
     --finetuning_type lora \
     --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-70B/lora/sft \
+    --output_dir ../../../saves/LLaMA2-70B/lora/sft \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
diff --git a/examples/inference/api_demo.sh b/examples/inference/api_demo.sh
index 4a601bb6..aee86595 100644
--- a/examples/inference/api_demo.sh
+++ b/examples/inference/api_demo.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \
+CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python ../../src/api_demo.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template default \
diff --git a/examples/inference/cli_demo.sh b/examples/inference/cli_demo.sh
index fdeb01e6..3e4a1e4e 100644
--- a/examples/inference/cli_demo.sh
+++ b/examples/inference/cli_demo.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python src/cli_demo.py \
+CUDA_VISIBLE_DEVICES=0 python ../../src/cli_demo.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template default \
diff --git a/examples/inference/evaluate.sh b/examples/inference/evaluate.sh
index b3053662..b54c2a60 100644
--- a/examples/inference/evaluate.sh
+++ b/examples/inference/evaluate.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python src/evaluate.py \
+CUDA_VISIBLE_DEVICES=0 python ../../src/evaluate.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template vanilla \
diff --git a/examples/inference/web_demo.sh b/examples/inference/web_demo.sh
index 0f8307fb..201be2b4 100644
--- a/examples/inference/web_demo.sh
+++ b/examples/inference/web_demo.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python src/web_demo.py \
+CUDA_VISIBLE_DEVICES=0 python ../../src/web_demo.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template default \

From 35621c6089b51f81bc7c56d161761cc9c173fe3e Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 20:58:45 +0800
Subject: [PATCH 034/341] add zh readme

Former-commit-id: 389a170a4d42c56c71c0e17bbe018c4cb1983b5a
---
 README.md             |  2 +-
 README_zh.md          |  2 +-
 examples/README_zh.md | 43 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 examples/README_zh.md

diff --git a/README.md b/README.md
index c374ab38..6dab2c93 100644
--- a/README.md
+++ b/README.md
@@ -362,7 +362,7 @@ docker compose -f ./docker-compose.yml up -d
 
 ### Command Line Interface
 
-See [examples](examples) for usage.
+See [examples/README.md](examples/README.md) for usage.
 
 Use `python src/train_bash.py -h` to display arguments description.
 
diff --git a/README_zh.md b/README_zh.md
index 6f060460..c62c212c 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -362,7 +362,7 @@ docker compose -f ./docker-compose.yml up -d
 
 ### 命令行接口
 
-使用方法请参考 [examples](examples) 文件夹。
+使用方法请参考 [examples/README_zh.md](examples/README_zh.md)。
 
 使用 `python src/train_bash.py -h` 查看参数文档。
 
diff --git a/examples/README_zh.md b/examples/README_zh.md
new file mode 100644
index 00000000..50b5203f
--- /dev/null
+++ b/examples/README_zh.md
@@ -0,0 +1,43 @@
+我们提供了多样化的示例脚本。
+
+```
+examples/
+├── lora_single_gpu/
+│   ├── pt.sh: 进行预训练
+│   ├── sft.sh: 进行指令监督微调
+│   ├── reward.sh: 进行奖励模型训练
+│   ├── ppo.sh: 进行 PPO 训练
+│   ├── dpo.sh: 进行 DPO 训练
+│   ├── orpo.sh: 进行 ORPO 训练
+│   ├── prepare.sh: 保存预处理后的数据集
+│   └── predict.sh: 进行批量预测
+├── qlora_single_gpu/
+│   ├── bitsandbytes.sh: 微调 4/8 比特 BNB 模型
+│   ├── gptq.sh: 微调 4/8 比特 GPTQ 模型
+│   ├── awq.sh: 微调 4 比特 AWQ 模型
+│   └── aqlm.sh: 微调 2 比特 AQLM 模型
+├── lora_multi_gpu/
+│   ├── single_node.sh: 使用 Accelerate 进行单节点训练
+│   └── multi_node.sh: 使用 Accelerate 进行多节点训练
+├── full_multi_gpu/
+│   ├── single_node.sh: 使用 DeepSpeed 进行单节点训练
+│   └── multi_node.sh: 使用 DeepSpeed 进行多节点训练
+├── merge_lora/
+│   ├── merge.sh: 将 LoRA 权重合并到预训练模型中
+│   └── quantize.sh: 使用 AutoGPTQ 量化模型
+├── inference/
+│   ├── cli_demo.sh: 启动命令行推理接口
+│   ├── api_demo.sh: 启动 OpenAI 风格 API
+│   ├── web_demo.sh: 启动浏览器推理接口
+│   └── evaluate.sh: 在 MMLU 数据集上评测模型
+└── extras/
+    ├── galore/
+    │   └── sft.sh: 使用 GaLore 训练模型
+    ├── loraplus/
+    │   └── sft.sh: 使用 LoRA+ 训练模型
+    ├── llama_pro/
+    │   ├── expand.sh: 扩展模型中的层
+    │   └── sft.sh: 训练扩展后的模型
+    └── fsdp_qlora/
+        └── sft.sh: 使用 FSDP 微调量化模型
+```

From 755b6511ff4872ab8912c096890f4ba21970384b Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 21:09:25 +0800
Subject: [PATCH 035/341] update examples

Former-commit-id: 2715cfe20f6f4532bebaa47b80ccd5df43d6a490
---
 examples/README.md                 | 2 +-
 examples/README_zh.md              | 2 +-
 examples/lora_single_gpu/dpo.sh    | 2 +-
 examples/lora_single_gpu/reward.sh | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 6732faaf..4f34be52 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,7 +3,7 @@ We provide diverse examples about fine-tuning LLMs.
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pt.sh: Do pre-training
+│   ├── pretrain.sh: Do pre-training
 │   ├── sft.sh: Do supervised fine-tuning
 │   ├── reward.sh: Do reward modeling
 │   ├── ppo.sh: Do PPO training
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 50b5203f..a77209b3 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -3,7 +3,7 @@
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pt.sh: 进行预训练
+│   ├── pretrain.sh: 进行预训练
 │   ├── sft.sh: 进行指令监督微调
 │   ├── reward.sh: 进行奖励模型训练
 │   ├── ppo.sh: 进行 PPO 训练
diff --git a/examples/lora_single_gpu/dpo.sh b/examples/lora_single_gpu/dpo.sh
index daa8ac85..56a2dfc3 100644
--- a/examples/lora_single_gpu/dpo.sh
+++ b/examples/lora_single_gpu/dpo.sh
@@ -6,7 +6,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --create_new_adapter \
-    --dataset comparison_gpt4_en \
+    --dataset orca_rlhf \
     --dataset_dir ../../data \
     --template default \
     --finetuning_type lora \
diff --git a/examples/lora_single_gpu/reward.sh b/examples/lora_single_gpu/reward.sh
index 0f775926..1212d082 100644
--- a/examples/lora_single_gpu/reward.sh
+++ b/examples/lora_single_gpu/reward.sh
@@ -6,7 +6,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --create_new_adapter \
-    --dataset comparison_gpt4_en \
+    --dataset orca_rlhf \
     --dataset_dir ../../data \
     --template default \
     --finetuning_type lora \

From 5bf0cca2b8020685dc6c17ae2c91aaacc844ec8f Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 22:17:48 +0800
Subject: [PATCH 036/341] update readme

Former-commit-id: 7ea7333b51be6b1120fc0b13675f5a0ac3c5a12b
---
 README.md    | 10 ++++------
 README_zh.md | 10 ++++------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 6dab2c93..1b218418 100644
--- a/README.md
+++ b/README.md
@@ -327,8 +327,8 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 #### Use local environment
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_web.py
-# or CUDA_VISIBLE_DEVICES=0 python -m llmtuner.webui.interface
+export CUDA_VISIBLE_DEVICES=0 # `set CUDA_VISIBLE_DEVICES=0` for Windows
+python src/train_web.py # or python -m llmtuner.webui.interface
 ```
 
 #### Use Docker
@@ -370,10 +370,8 @@ Use `python src/train_bash.py -h` to display arguments description.
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \
-    --model_name_or_path path_to_model \
-    --adapter_name_or_path path_to_lora_adapter \
-    --template default \
-    --finetuning_type lora \
+    --model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 \
+    --template mistral \
     --infer_backend vllm
 ```
 
diff --git a/README_zh.md b/README_zh.md
index c62c212c..a8390f21 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -327,8 +327,8 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 #### 使用本地环境
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 python src/train_web.py
-# 或 CUDA_VISIBLE_DEVICES=0 python -m llmtuner.webui.interface
+export CUDA_VISIBLE_DEVICES=0 # Windows 使用 `set CUDA_VISIBLE_DEVICES=0`
+python src/train_web.py # 或 python -m llmtuner.webui.interface
 ```
 
 #### 使用 Docker
@@ -370,10 +370,8 @@ docker compose -f ./docker-compose.yml up -d
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \
-    --model_name_or_path path_to_model \
-    --adapter_name_or_path path_to_lora_adapter \
-    --template default \
-    --finetuning_type lora \
+    --model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 \
+    --template mistral \
     --infer_backend vllm
 ```
 

From a74a7585e045abaf5066e8af4d9e18ef7c009706 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 2 Apr 2024 22:45:20 +0800
Subject: [PATCH 037/341] update vllm example

Former-commit-id: 2df6d2eacfa27ebc69455696b93649624c1facbe
---
 README.md    | 5 +++--
 README_zh.md | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 1b218418..778d90b7 100644
--- a/README.md
+++ b/README.md
@@ -369,10 +369,11 @@ Use `python src/train_bash.py -h` to display arguments description.
 ### Deploy with OpenAI-style API and vLLM
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \
+CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
     --model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 \
     --template mistral \
-    --infer_backend vllm
+    --infer_backend vllm \
+    --vllm_enforce_eager
 ```
 
 ### Use ModelScope Hub
diff --git a/README_zh.md b/README_zh.md
index a8390f21..7b02c55d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -369,10 +369,11 @@ docker compose -f ./docker-compose.yml up -d
 ### 使用 OpenAI 风格 API 和 vLLM 部署
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \
+CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
     --model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 \
     --template mistral \
-    --infer_backend vllm
+    --infer_backend vllm \
+    --vllm_enforce_eager
 ```
 
 ### 使用魔搭社区

From f6530222f73b1634337bd02486bb7b7edae5c191 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 3 Apr 2024 14:47:59 +0800
Subject: [PATCH 038/341] fix #3116

Former-commit-id: b7256aa33d761280751518c20f29f9b8ea3fb025
---
 src/llmtuner/hparams/parser.py    | 2 +-
 src/llmtuner/train/ppo/trainer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 4fbc3db9..9264d1ee 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -120,7 +120,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
 
     if (
         finetuning_args.stage == "ppo"
-        and training_args.report_to is not None
+        and training_args.report_to
         and training_args.report_to[0] not in ["wandb", "tensorboard"]
     ):
         raise ValueError("PPO only accepts wandb or tensorboard logger.")
diff --git a/src/llmtuner/train/ppo/trainer.py b/src/llmtuner/train/ppo/trainer.py
index 6be45958..020d54cf 100644
--- a/src/llmtuner/train/ppo/trainer.py
+++ b/src/llmtuner/train/ppo/trainer.py
@@ -66,7 +66,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             use_score_norm=finetuning_args.ppo_score_norm,
             whiten_rewards=finetuning_args.ppo_whiten_rewards,
             accelerator_kwargs={"step_scheduler_with_optimizer": False},
-            log_with=training_args.report_to[0] if training_args.report_to is not None else None,
+            log_with=training_args.report_to[0] if training_args.report_to else None,
             project_kwargs={"logging_dir": training_args.logging_dir},
         )
 

From 1348f7d8605a1b5be533eb4f83073f27927e562b Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 3 Apr 2024 18:14:24 +0800
Subject: [PATCH 039/341] fix resize vocab at inference #3022

Former-commit-id: c243720b89eec0af2872fa3c7980a0026d893f4d
---
 scripts/cal_lr.py              |  8 ++++----
 scripts/length_cdf.py          |  6 +++---
 setup.py                       |  2 +-
 src/llmtuner/chat/hf_engine.py |  9 +++++----
 src/llmtuner/eval/evaluator.py |  5 +++--
 src/llmtuner/model/__init__.py |  3 +--
 src/llmtuner/model/loader.py   | 16 +---------------
 src/llmtuner/train/tuner.py    |  5 +++--
 src/llmtuner/train/utils.py    | 17 ++++++++++-------
 9 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py
index 6decf0c2..ffe47f28 100644
--- a/scripts/cal_lr.py
+++ b/scripts/cal_lr.py
@@ -15,7 +15,7 @@ from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
 from llmtuner.data import get_dataset
 from llmtuner.extras.constants import IGNORE_INDEX
 from llmtuner.hparams import get_train_args
-from llmtuner.model import load_model_and_tokenizer
+from llmtuner.model import load_tokenizer
 
 
 BASE_LR = 3e-4  # 1.5e-4 for 30B-70B models
@@ -32,7 +32,7 @@ def calculate_lr(
     cutoff_len: Optional[int] = 1024,  # i.e. maximum input length during training
     is_mistral: Optional[bool] = False,  # mistral model uses a smaller learning rate,
 ):
-    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
+    model_args, data_args, training_args, _, _ = get_train_args(
         dict(
             stage=stage,
             model_name_or_path=model_name_or_path,
@@ -44,8 +44,8 @@ def calculate_lr(
             overwrite_cache=True,
         )
     )
-    _, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, is_trainable=False, add_valuehead=False)
-    trainset = get_dataset(tokenizer, model_args, data_args, training_args, stage=stage)
+    tokenizer = load_tokenizer(model_args)
+    trainset = get_dataset(tokenizer, model_args, data_args, training_args, stage)
     if stage == "pt":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif stage == "sft":
diff --git a/scripts/length_cdf.py b/scripts/length_cdf.py
index d9cb06f5..cf0698de 100644
--- a/scripts/length_cdf.py
+++ b/scripts/length_cdf.py
@@ -10,7 +10,7 @@ from tqdm import tqdm
 
 from llmtuner.data import get_dataset
 from llmtuner.hparams import get_train_args
-from llmtuner.model import load_model_and_tokenizer
+from llmtuner.model import load_tokenizer
 
 
 def length_cdf(
@@ -20,7 +20,7 @@ def length_cdf(
     template: Optional[str] = "default",
     interval: Optional[int] = 1000,
 ):
-    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
+    model_args, data_args, training_args, _, _ = get_train_args(
         dict(
             stage="sft",
             model_name_or_path=model_name_or_path,
@@ -32,7 +32,7 @@ def length_cdf(
             overwrite_cache=True,
         )
     )
-    _, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, is_trainable=False, add_valuehead=False)
+    tokenizer = load_tokenizer(model_args)
     trainset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft")
     total_num = len(trainset)
     length_dict = defaultdict(int)
diff --git a/setup.py b/setup.py
index 8d6c2031..2caee7a8 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def get_requires():
 
 
 extra_require = {
-    "deepspeed": ["deepspeed"],
+    "deepspeed": ["deepspeed>=0.10.0"],
     "metrics": ["nltk", "jieba", "rouge-chinese"],
     "unsloth": ["torch==2.2.0", "unsloth[cu121-ampere-torch220]"],
     "vllm": ["vllm>=0.3.3"],
diff --git a/src/llmtuner/chat/hf_engine.py b/src/llmtuner/chat/hf_engine.py
index c634ba16..bcdbd15a 100644
--- a/src/llmtuner/chat/hf_engine.py
+++ b/src/llmtuner/chat/hf_engine.py
@@ -9,7 +9,7 @@ from transformers import GenerationConfig, TextIteratorStreamer
 
 from ..data import get_template_and_fix_tokenizer
 from ..extras.misc import get_logits_processor
-from ..model import load_model_and_tokenizer
+from ..model import load_model, load_tokenizer
 from .base_engine import BaseEngine, Response
 
 
@@ -30,11 +30,12 @@ class HuggingfaceEngine(BaseEngine):
         generating_args: "GeneratingArguments",
     ) -> None:
         self.can_generate = finetuning_args.stage == "sft"
-        self.model, self.tokenizer = load_model_and_tokenizer(
-            model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
-        )
+        self.tokenizer = load_tokenizer(model_args)
         self.tokenizer.padding_side = "left" if self.can_generate else "right"
         self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)
+        self.model = load_model(
+            self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
+        )
         self.generating_args = generating_args.to_dict()
 
     @staticmethod
diff --git a/src/llmtuner/eval/evaluator.py b/src/llmtuner/eval/evaluator.py
index 4969561f..2c039928 100644
--- a/src/llmtuner/eval/evaluator.py
+++ b/src/llmtuner/eval/evaluator.py
@@ -14,16 +14,17 @@ from transformers.utils import cached_file
 from ..data import get_template_and_fix_tokenizer
 from ..extras.constants import CHOICES, SUBJECTS
 from ..hparams import get_eval_args
-from ..model import load_model_and_tokenizer
+from ..model import load_model, load_tokenizer
 from .template import get_eval_template
 
 
 class Evaluator:
     def __init__(self, args: Optional[Dict[str, Any]] = None) -> None:
         self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args)
-        self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args)
+        self.tokenizer = load_tokenizer(self.model_args)
         self.tokenizer.padding_side = "right"  # avoid overflow issue in batched inference for llama2
         self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template)
+        self.model = load_model(self.tokenizer, self.model_args, finetuning_args)
         self.eval_template = get_eval_template(self.eval_args.lang)
         self.choice_inputs = [
             self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES
diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index 4b1b26fc..1eaf4271 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,10 +1,9 @@
-from .loader import load_model, load_model_and_tokenizer, load_tokenizer
+from .loader import load_model, load_tokenizer
 from .utils import find_all_linear_modules, load_valuehead_params
 
 
 __all__ = [
     "load_model",
-    "load_model_and_tokenizer",
     "load_tokenizer",
     "load_valuehead_params",
     "find_all_linear_modules",
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index d05c0886..e91a7b68 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Dict, Tuple
+from typing import TYPE_CHECKING, Any, Dict
 
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from trl import AutoModelForCausalLMWithValueHead
@@ -133,17 +133,3 @@ def load_model(
             )
 
     return model
-
-
-def load_model_and_tokenizer(
-    model_args: "ModelArguments",
-    finetuning_args: "FinetuningArguments",
-    is_trainable: bool = False,
-    add_valuehead: bool = False,
-) -> Tuple["PreTrainedModel", "PreTrainedTokenizer"]:
-    r"""
-    Loads pretrained model and tokenizer.
-    """
-    tokenizer = load_tokenizer(model_args)
-    model = load_model(tokenizer, model_args, finetuning_args, is_trainable, add_valuehead)
-    return model, tokenizer
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index 299e4f2a..f6c2e16b 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -7,7 +7,7 @@ from ..data import get_template_and_fix_tokenizer
 from ..extras.callbacks import LogCallback
 from ..extras.logging import get_logger
 from ..hparams import get_infer_args, get_train_args
-from ..model import load_model_and_tokenizer
+from ..model import load_model, load_tokenizer
 from .dpo import run_dpo
 from .orpo import run_orpo
 from .ppo import run_ppo
@@ -52,8 +52,9 @@ def export_model(args: Optional[Dict[str, Any]] = None):
     if model_args.adapter_name_or_path is not None and model_args.export_quantization_bit is not None:
         raise ValueError("Please merge adapters before quantizing the model.")
 
-    model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args)
+    tokenizer = load_tokenizer(model_args)
     get_template_and_fix_tokenizer(tokenizer, data_args.template)
+    model = load_model(tokenizer, model_args, finetuning_args)  # must after fixing tokenizer to resize vocab
 
     if getattr(model, "quantization_method", None) and model_args.adapter_name_or_path is not None:
         raise ValueError("Cannot merge adapters to a quantized model.")
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index 8f218a78..cf199633 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -10,7 +10,7 @@ from transformers.utils.versions import require_version
 from ..extras.logging import get_logger
 from ..extras.packages import is_galore_available
 from ..hparams import FinetuningArguments, ModelArguments
-from ..model import find_all_linear_modules, load_model_and_tokenizer, load_valuehead_params
+from ..model import find_all_linear_modules, load_model, load_tokenizer, load_valuehead_params
 
 
 if is_galore_available():
@@ -87,16 +87,18 @@ def create_ref_model(
         )
         ref_model_args = ModelArguments(**ref_model_args_dict)
         ref_finetuning_args = FinetuningArguments(finetuning_type="lora")
-        ref_model, _ = load_model_and_tokenizer(
-            ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
+        tokenizer = load_tokenizer(ref_model_args)
+        ref_model = load_model(
+            tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
         )
         logger.info("Created reference model from {}".format(finetuning_args.ref_model))
     else:
         if finetuning_args.finetuning_type == "lora":
             ref_model = None
         else:
-            ref_model, _ = load_model_and_tokenizer(
-                model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead
+            tokenizer = load_tokenizer(model_args)
+            ref_model = load_model(
+                tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead
             )
             logger.info("Created reference model from the model itself.")
 
@@ -141,8 +143,9 @@ def create_reward_model(
         )
         reward_model_args = ModelArguments(**reward_model_args_dict)
         reward_finetuning_args = FinetuningArguments(finetuning_type="lora")
-        reward_model, _ = load_model_and_tokenizer(
-            reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True
+        tokenizer = load_tokenizer(reward_model_args)
+        reward_model = load_model(
+            tokenizer, reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True
         )
         logger.info("Loaded full weights of reward model from {}".format(finetuning_args.reward_model))
         logger.warning("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.")

From 43d134ba2968e15b77cf883505765d43ac381538 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 3 Apr 2024 21:56:43 +0800
Subject: [PATCH 040/341] fix requires for windows

Former-commit-id: 5e25fae40b7ea9cfa72717efbe3677199ca9608f
---
 README.md                      | 2 +-
 README_zh.md                   | 2 +-
 requirements.txt               | 1 -
 setup.py                       | 1 +
 src/llmtuner/chat/hf_engine.py | 2 +-
 5 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 778d90b7..edeedebd 100644
--- a/README.md
+++ b/README.md
@@ -305,7 +305,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-Extra dependencies available: deepspeed, metrics, unsloth, vllm, bitsandbytes, gptq, awq, aqlm, qwen, quality
+Extra dependencies available: deepspeed, metrics, unsloth, galore, vllm, bitsandbytes, gptq, awq, aqlm, qwen, quality
 
 <details><summary>For Windows users</summary>
 
diff --git a/README_zh.md b/README_zh.md
index 7b02c55d..09f02922 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -305,7 +305,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-可选的额外依赖项：deepspeed、metrics、unsloth、vllm、bitsandbytes、gptq、awq、aqlm、qwen、quality
+可选的额外依赖项：deepspeed、metrics、unsloth、galore、vllm、bitsandbytes、gptq、awq、aqlm、qwen、quality
 
 <details><summary>Windows 用户指南</summary>
 
diff --git a/requirements.txt b/requirements.txt
index 88b88ee4..fe8ab35c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,3 @@ fastapi
 sse-starlette
 matplotlib
 fire
-galore-torch
diff --git a/setup.py b/setup.py
index 2caee7a8..67b6f98d 100644
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,7 @@ extra_require = {
     "deepspeed": ["deepspeed>=0.10.0"],
     "metrics": ["nltk", "jieba", "rouge-chinese"],
     "unsloth": ["torch==2.2.0", "unsloth[cu121-ampere-torch220]"],
+    "galore": ["galore-torch"],
     "vllm": ["vllm>=0.3.3"],
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],
diff --git a/src/llmtuner/chat/hf_engine.py b/src/llmtuner/chat/hf_engine.py
index bcdbd15a..ddb48e47 100644
--- a/src/llmtuner/chat/hf_engine.py
+++ b/src/llmtuner/chat/hf_engine.py
@@ -35,7 +35,7 @@ class HuggingfaceEngine(BaseEngine):
         self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)
         self.model = load_model(
             self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
-        )
+        )  # must after fixing tokenizer to resize vocab
         self.generating_args = generating_args.to_dict()
 
     @staticmethod

From b1986a06b99f48ef3dad39954be830e1b7db8095 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 4 Apr 2024 00:55:31 +0800
Subject: [PATCH 041/341] fix bug in latest gradio

Former-commit-id: 44a962862b4a74e50ef5786c8d5719faaa65f63f
---
 requirements.txt                       |   2 +-
 scripts/llamafy_internlm2.py           | 114 -------------------------
 src/llmtuner/extras/misc.py            |   2 +-
 src/llmtuner/webui/components/eval.py  |   6 +-
 src/llmtuner/webui/components/train.py |  84 ++++++++----------
 src/llmtuner/webui/locales.py          |   5 ++
 src/llmtuner/webui/runner.py           |  85 ++++++++++++------
 src/llmtuner/webui/utils.py            |  17 ++--
 8 files changed, 111 insertions(+), 204 deletions(-)
 delete mode 100644 scripts/llamafy_internlm2.py

diff --git a/requirements.txt b/requirements.txt
index fe8ab35c..3928d28d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ datasets>=2.14.3
 accelerate>=0.27.2
 peft>=0.10.0
 trl>=0.8.1
-gradio>4.0.0,<=4.21.0
+gradio>=4.0.0
 scipy
 einops
 sentencepiece
diff --git a/scripts/llamafy_internlm2.py b/scripts/llamafy_internlm2.py
deleted file mode 100644
index b6b03e7d..00000000
--- a/scripts/llamafy_internlm2.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# coding=utf-8
-# Converts the InternLM2 model in the same format as LLaMA2.
-# Usage: python llamafy_internlm2.py --input_dir input --output_dir output
-# Warning: We have found that the converted model cannot infer correctly. It will be fixed later.
-
-import json
-import os
-from collections import OrderedDict
-from typing import Any, Dict, Optional
-
-import fire
-import torch
-from safetensors.torch import save_file
-from tqdm import tqdm
-from transformers.modeling_utils import (
-    SAFE_WEIGHTS_INDEX_NAME,
-    SAFE_WEIGHTS_NAME,
-    WEIGHTS_INDEX_NAME,
-    WEIGHTS_NAME,
-    shard_checkpoint,
-)
-
-
-CONFIG_NAME = "config.json"
-
-
-def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool):
-    with open(os.path.join(input_dir, CONFIG_NAME), "r", encoding="utf-8") as f:
-        internlm2_config_dict: Dict[str, Any] = json.load(f)
-
-    internlm2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
-    for filepath in tqdm(os.listdir(input_dir), desc="Load weights"):
-        if os.path.isfile(os.path.join(input_dir, filepath)) and filepath.endswith(".bin"):
-            shard_weight = torch.load(os.path.join(input_dir, filepath), map_location="cpu")
-            internlm2_state_dict.update(shard_weight)
-
-    llama2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
-    for key, value in tqdm(internlm2_state_dict.items(), desc="Convert format"):
-        if "output" in key:
-            llama2_state_dict[key.replace("output", "lm_head")] = value
-        elif "tok_embeddings" in key:
-            llama2_state_dict[key.replace("tok_embeddings", "embed_tokens")] = value
-        elif "wqkv" in key:
-            num_q_heads = internlm2_config_dict["num_attention_heads"]
-            num_kv_heads = internlm2_config_dict["num_key_value_heads"]
-            q_size = value.size(0) // (num_q_heads + 2 * num_kv_heads) * num_q_heads
-            kv_size = value.size(0) // (num_q_heads + 2 * num_kv_heads) * num_kv_heads
-            llama2_state_dict[key.replace("attention.wqkv", "self_attn.q_proj")] = value[:q_size, ...]
-            llama2_state_dict[key.replace("attention.wqkv", "self_attn.k_proj")] = value[
-                q_size : q_size + kv_size, ...
-            ]
-            llama2_state_dict[key.replace("attention.wqkv", "self_attn.v_proj")] = value[q_size + kv_size :, ...]
-        elif "wo" in key:
-            llama2_state_dict[key.replace("attention.wo", "self_attn.o_proj")] = value
-        elif "attention_norm" in key:
-            llama2_state_dict[key.replace("attention_norm", "input_layernorm")] = value
-        elif "ffn_norm" in key:
-            llama2_state_dict[key.replace("ffn_norm", "post_attention_layernorm")] = value
-        elif "w1" in key:
-            llama2_state_dict[key.replace("feed_forward.w1", "mlp.gate_proj")] = value
-        elif "w2" in key:
-            llama2_state_dict[key.replace("feed_forward.w2", "mlp.down_proj")] = value
-        elif "w3" in key:
-            llama2_state_dict[key.replace("feed_forward.w3", "mlp.up_proj")] = value
-        else:
-            llama2_state_dict[key] = value
-
-    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
-    shards, index = shard_checkpoint(llama2_state_dict, max_shard_size=shard_size, weights_name=weights_name)
-
-    for shard_file, shard in tqdm(shards.items(), desc="Save weights"):
-        if save_safetensors:
-            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
-        else:
-            torch.save(shard, os.path.join(output_dir, shard_file))
-
-    if index is None:
-        print("Model weights saved in {}".format(os.path.join(output_dir, WEIGHTS_NAME)))
-    else:
-        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
-        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
-            json.dump(index, f, indent=2, sort_keys=True)
-        print("Model weights saved in {}".format(output_dir))
-
-
-def save_config(input_dir: str, output_dir: str):
-    with open(os.path.join(input_dir, CONFIG_NAME), "r", encoding="utf-8") as f:
-        llama2_config_dict: Dict[str, Any] = json.load(f)
-
-    llama2_config_dict["architectures"] = ["LlamaForCausalLM"]
-    llama2_config_dict.pop("auto_map", None)
-    llama2_config_dict.pop("bias", None)
-    llama2_config_dict.pop("rope_scaling", None)
-    llama2_config_dict["model_type"] = "llama"
-
-    with open(os.path.join(output_dir, CONFIG_NAME), "w", encoding="utf-8") as f:
-        json.dump(llama2_config_dict, f, indent=2)
-    print("Model config saved in {}".format(os.path.join(output_dir, CONFIG_NAME)))
-
-
-def llamafy_internlm2(
-    input_dir: str, output_dir: str, shard_size: Optional[str] = "2GB", save_safetensors: Optional[bool] = False
-):
-    try:
-        os.makedirs(output_dir, exist_ok=False)
-    except Exception as e:
-        raise print("Output dir already exists", e)
-
-    save_weight(input_dir, output_dir, shard_size, save_safetensors)
-    save_config(input_dir, output_dir)
-
-
-if __name__ == "__main__":
-    fire.Fire(llamafy_internlm2)
diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index a696b315..2093d7ea 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -66,7 +66,7 @@ def check_dependencies() -> None:
         require_version("accelerate>=0.27.2", "To fix: pip install accelerate>=0.27.2")
         require_version("peft>=0.10.0", "To fix: pip install peft>=0.10.0")
         require_version("trl>=0.8.1", "To fix: pip install trl>=0.8.1")
-        require_version("gradio>4.0.0,<=4.21.0", "To fix: pip install gradio==4.21.0")
+        require_version("gradio>=4.0.0", "To fix: pip install gradio>=4.0.0")
 
 
 def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index a1dae98c..87611da5 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -21,8 +21,6 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
         dataset = gr.Dropdown(multiselect=True, scale=4)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
-    dataset_dir.change(list_dataset, [dataset_dir], [dataset], queue=False)
-
     input_elems.update({dataset_dir, dataset})
     elem_dict.update(dict(dataset_dir=dataset_dir, dataset=dataset, **preview_elems))
 
@@ -50,7 +48,7 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
         stop_btn = gr.Button(variant="stop")
 
     with gr.Row():
-        resume_btn = gr.Checkbox(visible=False, interactive=False, value=False)
+        resume_btn = gr.Checkbox(visible=False, interactive=False)
         process_bar = gr.Slider(visible=False, interactive=False)
 
     with gr.Row():
@@ -73,4 +71,6 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
     stop_btn.click(engine.runner.set_abort)
     resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
 
+    dataset_dir.change(list_dataset, [dataset_dir], [dataset], queue=False)
+
     return elem_dict
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 1c425d51..4f108db0 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -6,7 +6,6 @@ from transformers.trainer_utils import SchedulerType
 from ...extras.constants import TRAINING_STAGES
 from ..common import DEFAULT_DATA_DIR, autoset_packing, list_adapters, list_dataset
 from ..components.data import create_preview_box
-from ..utils import gen_plot
 
 
 if TYPE_CHECKING:
@@ -24,7 +23,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=1
         )
         dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=1)
-        dataset = gr.Dropdown(multiselect=True, scale=2, allow_custom_value=True)
+        dataset = gr.Dropdown(multiselect=True, scale=4, allow_custom_value=True)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
     input_elems.update({training_stage, dataset_dir, dataset})
@@ -121,8 +120,8 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as freeze_tab:
         with gr.Row():
-            num_layer_trainable = gr.Slider(value=3, minimum=1, maximum=128, step=1, scale=2)
-            name_module_trainable = gr.Textbox(value="all", scale=3)
+            num_layer_trainable = gr.Slider(value=3, minimum=1, maximum=128, step=1)
+            name_module_trainable = gr.Textbox(value="all")
 
     input_elems.update({num_layer_trainable, name_module_trainable})
     elem_dict.update(
@@ -140,8 +139,10 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             create_new_adapter = gr.Checkbox()
 
         with gr.Row():
-            use_rslora = gr.Checkbox(scale=1)
-            use_dora = gr.Checkbox(scale=1)
+            with gr.Column(scale=1):
+                use_rslora = gr.Checkbox()
+                use_dora = gr.Checkbox()
+
             lora_target = gr.Textbox(scale=2)
             additional_target = gr.Textbox(scale=2)
 
@@ -175,10 +176,10 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as rlhf_tab:
         with gr.Row():
-            dpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1)
-            dpo_ftx = gr.Slider(value=0, minimum=0, maximum=10, step=0.01, scale=1)
-            orpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1)
-            reward_model = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=2)
+            dpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01)
+            dpo_ftx = gr.Slider(value=0, minimum=0, maximum=10, step=0.01)
+            orpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01)
+            reward_model = gr.Dropdown(multiselect=True, allow_custom_value=True)
 
     input_elems.update({dpo_beta, dpo_ftx, orpo_beta, reward_model})
     elem_dict.update(
@@ -187,11 +188,11 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as galore_tab:
         with gr.Row():
-            use_galore = gr.Checkbox(scale=1)
-            galore_rank = gr.Slider(value=16, minimum=1, maximum=1024, step=1, scale=2)
-            galore_update_interval = gr.Slider(value=200, minimum=1, maximum=1024, step=1, scale=2)
-            galore_scale = gr.Slider(value=0.25, minimum=0, maximum=1, step=0.01, scale=2)
-            galore_target = gr.Textbox(value="all", scale=3)
+            use_galore = gr.Checkbox()
+            galore_rank = gr.Slider(value=16, minimum=1, maximum=1024, step=1)
+            galore_update_interval = gr.Slider(value=200, minimum=1, maximum=1024, step=1)
+            galore_scale = gr.Slider(value=0.25, minimum=0, maximum=1, step=0.01)
+            galore_target = gr.Textbox(value="all")
 
     input_elems.update({use_galore, galore_rank, galore_update_interval, galore_scale, galore_target})
     elem_dict.update(
@@ -228,29 +229,6 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         with gr.Column(scale=1):
             loss_viewer = gr.Plot()
 
-    input_elems.update({output_dir, config_path})
-    output_elems = [output_box, process_bar]
-
-    cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems, concurrency_limit=None)
-    arg_save_btn.click(engine.runner.save_args, input_elems, output_elems, concurrency_limit=None)
-    arg_load_btn.click(
-        engine.runner.load_args,
-        [engine.manager.get_elem_by_id("top.lang"), config_path],
-        list(input_elems),
-        concurrency_limit=None,
-    )
-    start_btn.click(engine.runner.run_train, input_elems, output_elems)
-    stop_btn.click(engine.runner.set_abort)
-    resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
-
-    dataset_dir.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False)
-    training_stage.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False).then(
-        list_adapters,
-        [engine.manager.get_elem_by_id("top.model_name"), engine.manager.get_elem_by_id("top.finetuning_type")],
-        [reward_model],
-        queue=False,
-    ).then(autoset_packing, [training_stage], [packing], queue=False)
-
     elem_dict.update(
         dict(
             cmd_preview_btn=cmd_preview_btn,
@@ -267,15 +245,27 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         )
     )
 
-    output_box.change(
-        gen_plot,
-        [
-            engine.manager.get_elem_by_id("top.model_name"),
-            engine.manager.get_elem_by_id("top.finetuning_type"),
-            output_dir,
-        ],
-        loss_viewer,
-        queue=False,
+    input_elems.update({output_dir, config_path})
+    output_elems = [output_box, process_bar, loss_viewer]
+
+    cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems, concurrency_limit=None)
+    arg_save_btn.click(engine.runner.save_args, input_elems, output_elems, concurrency_limit=None)
+    arg_load_btn.click(
+        engine.runner.load_args,
+        [engine.manager.get_elem_by_id("top.lang"), config_path],
+        list(input_elems) + [output_box],
+        concurrency_limit=None,
     )
+    start_btn.click(engine.runner.run_train, input_elems, output_elems)
+    stop_btn.click(engine.runner.set_abort)
+    resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
+
+    dataset_dir.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False)
+    training_stage.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False).then(
+        list_adapters,
+        [engine.manager.get_elem_by_id("top.model_name"), engine.manager.get_elem_by_id("top.finetuning_type")],
+        [reward_model],
+        queue=False,
+    ).then(autoset_packing, [training_stage], [packing], queue=False)
 
     return elem_dict
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index b7319fd4..c3111e8f 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -1344,6 +1344,11 @@ ALERTS = {
         "ru": "Аргументы были сохранены по адресу: ",
         "zh": "训练参数已保存至：",
     },
+    "info_config_loaded": {
+        "en": "Arguments have been restored.",
+        "ru": "Аргументы были восстановлены.",
+        "zh": "训练参数已载入。",
+    },
     "info_loading": {
         "en": "Loading model...",
         "ru": "Загрузка модели...",
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index dae7daf8..2d3ef80f 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -2,7 +2,7 @@ import logging
 import os
 import time
 from threading import Thread
-from typing import TYPE_CHECKING, Any, Dict, Generator, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Generator
 
 import gradio as gr
 import transformers
@@ -17,7 +17,7 @@ from ..extras.misc import get_device_count, torch_gc
 from ..train import run_exp
 from .common import get_module, get_save_dir, load_args, load_config, save_args
 from .locales import ALERTS
-from .utils import gen_cmd, get_eval_results, update_process_bar
+from .utils import gen_cmd, gen_plot, get_eval_results, update_process_bar
 
 
 if TYPE_CHECKING:
@@ -239,20 +239,22 @@ class Runner:
 
         return args
 
-    def _preview(self, data: Dict["Component", Any], do_train: bool) -> Generator[Tuple[str, "gr.Slider"], None, None]:
+    def _preview(self, data: Dict["Component", Any], do_train: bool) -> Generator[Dict[Component, str], None, None]:
+        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval"))
         error = self._initialize(data, do_train, from_preview=True)
         if error:
             gr.Warning(error)
-            yield error, gr.Slider(visible=False)
+            yield {output_box: error}
         else:
             args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
-            yield gen_cmd(args), gr.Slider(visible=False)
+            yield {output_box: gen_cmd(args)}
 
-    def _launch(self, data: Dict["Component", Any], do_train: bool) -> Generator[Tuple[str, "gr.Slider"], None, None]:
+    def _launch(self, data: Dict["Component", Any], do_train: bool) -> Generator[Dict[Component, Any], None, None]:
+        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval"))
         error = self._initialize(data, do_train, from_preview=False)
         if error:
             gr.Warning(error)
-            yield error, gr.Slider(visible=False)
+            yield {output_box: error}
         else:
             args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
             run_kwargs = dict(args=args, callbacks=[self.trainer_callback])
@@ -261,54 +263,80 @@ class Runner:
             self.thread.start()
             yield from self.monitor()
 
-    def preview_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, gr.Slider], None, None]:
+    def preview_train(self, data: Dict[Component, Any]) -> Generator[Dict[Component, str], None, None]:
         yield from self._preview(data, do_train=True)
 
-    def preview_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, gr.Slider], None, None]:
+    def preview_eval(self, data: Dict[Component, Any]) -> Generator[Dict[Component, str], None, None]:
         yield from self._preview(data, do_train=False)
 
-    def run_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, gr.Slider], None, None]:
+    def run_train(self, data: Dict[Component, Any]) -> Generator[Dict[Component, Any], None, None]:
         yield from self._launch(data, do_train=True)
 
-    def run_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, gr.Slider], None, None]:
+    def run_eval(self, data: Dict[Component, Any]) -> Generator[Dict[Component, Any], None, None]:
         yield from self._launch(data, do_train=False)
 
-    def monitor(self) -> Generator[Tuple[str, "gr.Slider"], None, None]:
+    def monitor(self) -> Generator[Dict[Component, Any], None, None]:
         get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)]
         self.running = True
+
         lang = get("top.lang")
-        output_dir = get_save_dir(
-            get("top.model_name"),
-            get("top.finetuning_type"),
-            get("{}.output_dir".format("train" if self.do_train else "eval")),
-        )
+        model_name = get("top.model_name")
+        finetuning_type = get("top.finetuning_type")
+        output_dir = get("{}.output_dir".format("train" if self.do_train else "eval"))
+        output_path = get_save_dir(model_name, finetuning_type, output_dir)
+
+        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if self.do_train else "eval"))
+        process_bar = self.manager.get_elem_by_id("{}.process_bar".format("train" if self.do_train else "eval"))
+        loss_viewer = self.manager.get_elem_by_id("train.loss_viewer") if self.do_train else None
 
         while self.thread is not None and self.thread.is_alive():
             if self.aborted:
-                yield ALERTS["info_aborting"][lang], gr.Slider(visible=False)
+                yield {
+                    output_box: ALERTS["info_aborting"][lang],
+                    process_bar: gr.Slider(visible=False),
+                }
             else:
-                yield self.logger_handler.log, update_process_bar(self.trainer_callback)
+                return_dict = {
+                    output_box: self.logger_handler.log,
+                    process_bar: update_process_bar(self.trainer_callback),
+                }
+                if self.do_train:
+                    plot = gen_plot(output_path)
+                    if plot is not None:
+                        return_dict[loss_viewer] = plot
+
+                yield return_dict
 
             time.sleep(2)
 
         if self.do_train:
-            if os.path.exists(os.path.join(output_dir, TRAINING_ARGS_NAME)):
+            if os.path.exists(os.path.join(output_path, TRAINING_ARGS_NAME)):
                 finish_info = ALERTS["info_finished"][lang]
             else:
                 finish_info = ALERTS["err_failed"][lang]
         else:
-            if os.path.exists(os.path.join(output_dir, "all_results.json")):
-                finish_info = get_eval_results(os.path.join(output_dir, "all_results.json"))
+            if os.path.exists(os.path.join(output_path, "all_results.json")):
+                finish_info = get_eval_results(os.path.join(output_path, "all_results.json"))
             else:
                 finish_info = ALERTS["err_failed"][lang]
 
-        yield self._finalize(lang, finish_info), gr.Slider(visible=False)
+        return_dict = {
+            output_box: self._finalize(lang, finish_info),
+            process_bar: gr.Slider(visible=False),
+        }
+        if self.do_train:
+            plot = gen_plot(output_path)
+            if plot is not None:
+                return_dict[loss_viewer] = plot
 
-    def save_args(self, data: Dict[Component, Any]) -> Tuple[str, "gr.Slider"]:
+        yield return_dict
+
+    def save_args(self, data: Dict[Component, Any]) -> Dict[Component, str]:
+        output_box = self.manager.get_elem_by_id("train.output_box")
         error = self._initialize(data, do_train=True, from_preview=True)
         if error:
             gr.Warning(error)
-            return error, gr.Slider(visible=False)
+            return {output_box: error}
 
         config_dict: Dict[str, Any] = {}
         lang = data[self.manager.get_elem_by_id("top.lang")]
@@ -320,15 +348,16 @@ class Runner:
                 config_dict[elem_id] = value
 
         save_path = save_args(config_path, config_dict)
-        return ALERTS["info_config_saved"][lang] + save_path, gr.Slider(visible=False)
+        return {output_box: ALERTS["info_config_saved"][lang] + save_path}
 
     def load_args(self, lang: str, config_path: str) -> Dict[Component, Any]:
+        output_box = self.manager.get_elem_by_id("train.output_box")
         config_dict = load_args(config_path)
         if config_dict is None:
             gr.Warning(ALERTS["err_config_not_found"][lang])
-            return {self.manager.get_elem_by_id("top.lang"): lang}
+            return {output_box: ALERTS["err_config_not_found"][lang]}
 
-        output_dict: Dict["Component", Any] = {}
+        output_dict: Dict["Component", Any] = {output_box: ALERTS["info_config_loaded"][lang]}
         for elem_id, value in config_dict.items():
             output_dict[self.manager.get_elem_by_id(elem_id)] = value
 
diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py
index 275b6573..d96b1f6b 100644
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -1,13 +1,12 @@
 import json
 import os
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Dict
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import gradio as gr
 
 from ..extras.packages import is_matplotlib_available
 from ..extras.ploting import smooth
-from .common import get_save_dir
 from .locales import ALERTS
 
 
@@ -36,7 +35,7 @@ def get_time() -> str:
 
 def can_quantize(finetuning_type: str) -> "gr.Dropdown":
     if finetuning_type != "lora":
-        return gr.Dropdown(value="None", interactive=False)
+        return gr.Dropdown(value="none", interactive=False)
     else:
         return gr.Dropdown(interactive=True)
 
@@ -74,11 +73,9 @@ def get_eval_results(path: os.PathLike) -> str:
     return "```json\n{}\n```\n".format(result)
 
 
-def gen_plot(base_model: str, finetuning_type: str, output_dir: str) -> "matplotlib.figure.Figure":
-    if not base_model:
-        return
-    log_file = get_save_dir(base_model, finetuning_type, output_dir, "trainer_log.jsonl")
-    if not os.path.isfile(log_file):
+def gen_plot(output_path: str) -> Optional["matplotlib.figure.Figure"]:
+    log_file = os.path.join(output_path, "trainer_log.jsonl")
+    if not os.path.isfile(log_file) or not is_matplotlib_available():
         return
 
     plt.close("all")
@@ -88,13 +85,13 @@ def gen_plot(base_model: str, finetuning_type: str, output_dir: str) -> "matplot
     steps, losses = [], []
     with open(log_file, "r", encoding="utf-8") as f:
         for line in f:
-            log_info = json.loads(line)
+            log_info: Dict[str, Any] = json.loads(line)
             if log_info.get("loss", None):
                 steps.append(log_info["current_steps"])
                 losses.append(log_info["loss"])
 
     if len(losses) == 0:
-        return None
+        return
 
     ax.plot(steps, losses, color="#1f77b4", alpha=0.4, label="original")
     ax.plot(steps, smooth(losses), color="#1f77b4", label="smoothed")

From 48ceac845c83c6c8d06ad3f0c31805d7e9bd6bf3 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 4 Apr 2024 02:07:20 +0800
Subject: [PATCH 042/341] back to gradio 4.21 and fix chat

Former-commit-id: 695734a40a702ea059d855da54080cc8d161e41a
---
 requirements.txt                         |  2 +-
 src/llmtuner/extras/misc.py              |  2 +-
 src/llmtuner/webui/chatter.py            | 24 +++++++++++++++---------
 src/llmtuner/webui/components/chatbot.py | 14 ++++++++------
 src/llmtuner/webui/components/infer.py   |  4 ++--
 5 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 3928d28d..1fa5a142 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ datasets>=2.14.3
 accelerate>=0.27.2
 peft>=0.10.0
 trl>=0.8.1
-gradio>=4.0.0
+gradio>=4.0.0,<=4.21.0
 scipy
 einops
 sentencepiece
diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index 2093d7ea..49b99eee 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -66,7 +66,7 @@ def check_dependencies() -> None:
         require_version("accelerate>=0.27.2", "To fix: pip install accelerate>=0.27.2")
         require_version("peft>=0.10.0", "To fix: pip install peft>=0.10.0")
         require_version("trl>=0.8.1", "To fix: pip install trl>=0.8.1")
-        require_version("gradio>=4.0.0", "To fix: pip install gradio>=4.0.0")
+        require_version("gradio>=4.0.0,<=4.21.0", "To fix: pip install gradio==4.21.0")
 
 
 def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
diff --git a/src/llmtuner/webui/chatter.py b/src/llmtuner/webui/chatter.py
index 2621bd5e..8c744153 100644
--- a/src/llmtuner/webui/chatter.py
+++ b/src/llmtuner/webui/chatter.py
@@ -92,23 +92,29 @@ class WebChatModel(ChatModel):
         torch_gc()
         yield ALERTS["info_unloaded"][lang]
 
-    def predict(
+    def append(
         self,
-        chatbot: List[Tuple[str, str]],
+        chatbot: List[List[Optional[str]]],
+        messages: Sequence[Dict[str, str]],
         role: str,
         query: str,
+    ) -> Tuple[List[List[Optional[str]]], List[Dict[str, str]], str]:
+        return chatbot + [[query, None]], messages + [{"role": role, "content": query}], ""
+
+    def stream(
+        self,
+        chatbot: List[List[Optional[str]]],
         messages: Sequence[Dict[str, str]],
         system: str,
         tools: str,
         max_new_tokens: int,
         top_p: float,
         temperature: float,
-    ) -> Generator[Tuple[List[Tuple[str, str]], List[Dict[str, str]]], None, None]:
-        chatbot.append([query, ""])
-        query_messages = messages + [{"role": role, "content": query}]
+    ) -> Generator[Tuple[List[List[Optional[str]]], List[Dict[str, str]]], None, None]:
+        chatbot[-1][1] = ""
         response = ""
         for new_text in self.stream_chat(
-            query_messages, system, tools, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature
+            messages, system, tools, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature
         ):
             response += new_text
             if tools:
@@ -120,11 +126,11 @@ class WebChatModel(ChatModel):
                 name, arguments = result
                 arguments = json.loads(arguments)
                 tool_call = json.dumps({"name": name, "arguments": arguments}, ensure_ascii=False)
-                output_messages = query_messages + [{"role": Role.FUNCTION.value, "content": tool_call}]
+                output_messages = messages + [{"role": Role.FUNCTION.value, "content": tool_call}]
                 bot_text = "```json\n" + tool_call + "\n```"
             else:
-                output_messages = query_messages + [{"role": Role.ASSISTANT.value, "content": result}]
+                output_messages = messages + [{"role": Role.ASSISTANT.value, "content": result}]
                 bot_text = result
 
-            chatbot[-1] = [query, bot_text]
+            chatbot[-1][1] = bot_text
             yield chatbot, output_messages
diff --git a/src/llmtuner/webui/components/chatbot.py b/src/llmtuner/webui/components/chatbot.py
index d7d5bd66..8efd333c 100644
--- a/src/llmtuner/webui/components/chatbot.py
+++ b/src/llmtuner/webui/components/chatbot.py
@@ -35,13 +35,15 @@ def create_chat_box(
     tools.input(check_json_schema, inputs=[tools, engine.manager.get_elem_by_id("top.lang")])
 
     submit_btn.click(
-        engine.chatter.predict,
-        [chatbot, role, query, messages, system, tools, max_new_tokens, top_p, temperature],
+        engine.chatter.append,
+        [chatbot, messages, role, query],
+        [chatbot, messages, query],
+    ).then(
+        engine.chatter.stream,
+        [chatbot, messages, system, tools, max_new_tokens, top_p, temperature],
         [chatbot, messages],
-        show_progress=True,
-    ).then(lambda: "", outputs=[query])
-
-    clear_btn.click(lambda: ([], []), outputs=[chatbot, messages], show_progress=True)
+    )
+    clear_btn.click(lambda: ([], []), outputs=[chatbot, messages])
 
     return (
         chat_box,
diff --git a/src/llmtuner/webui/components/infer.py b/src/llmtuner/webui/components/infer.py
index 097ded25..1e56d432 100644
--- a/src/llmtuner/webui/components/infer.py
+++ b/src/llmtuner/webui/components/infer.py
@@ -25,7 +25,7 @@ def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]:
     input_elems.update({infer_backend})
     elem_dict.update(dict(infer_backend=infer_backend, load_btn=load_btn, unload_btn=unload_btn, info_box=info_box))
 
-    chat_box, chatbot, history, chat_elems = create_chat_box(engine, visible=False)
+    chat_box, chatbot, messages, chat_elems = create_chat_box(engine, visible=False)
     elem_dict.update(dict(chat_box=chat_box, **chat_elems))
 
     load_btn.click(engine.chatter.load_model, input_elems, [info_box]).then(
@@ -33,7 +33,7 @@ def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]:
     )
 
     unload_btn.click(engine.chatter.unload_model, input_elems, [info_box]).then(
-        lambda: ([], []), outputs=[chatbot, history]
+        lambda: ([], []), outputs=[chatbot, messages]
     ).then(lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_box])
 
     return elem_dict

From a246ac19148965d0fb342b58868cc80ea6f1e45f Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 4 Apr 2024 02:19:03 +0800
Subject: [PATCH 043/341] tiny fix

Former-commit-id: 70aceecb27e72095c05462d01f956061669b267e
---
 src/llmtuner/webui/runner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 2d3ef80f..ef5379cd 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -277,6 +277,7 @@ class Runner:
 
     def monitor(self) -> Generator[Dict[Component, Any], None, None]:
         get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)]
+        self.aborted = False
         self.running = True
 
         lang = get("top.lang")

From 3ac333fc6a134e2961f88fbbf4aca590c1438586 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 4 Apr 2024 14:48:21 +0800
Subject: [PATCH 044/341] update examples

Former-commit-id: de40ad62ba3d4c74c69de97b39cc79786ac28f0f
---
 examples/accelerate/fsdp_config.yaml   | 4 ++--
 examples/accelerate/master_config.yaml | 4 ++--
 examples/accelerate/single_config.yaml | 4 ++--
 examples/accelerate/slave_config.yaml  | 4 ++--
 examples/merge_lora/merge.sh           | 1 +
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/accelerate/fsdp_config.yaml b/examples/accelerate/fsdp_config.yaml
index abfbf8f6..60025597 100644
--- a/examples/accelerate/fsdp_config.yaml
+++ b/examples/accelerate/fsdp_config.yaml
@@ -15,8 +15,8 @@ fsdp_config:
 machine_rank: 0
 main_training_function: main
 mixed_precision: fp16
-num_machines: 1
-num_processes: 2
+num_machines: 1 # the number of nodes
+num_processes: 2 # the number of GPUs in all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
diff --git a/examples/accelerate/master_config.yaml b/examples/accelerate/master_config.yaml
index aa41f7e1..9c8fc275 100644
--- a/examples/accelerate/master_config.yaml
+++ b/examples/accelerate/master_config.yaml
@@ -8,8 +8,8 @@ main_process_ip: 192.168.0.1
 main_process_port: 29555
 main_training_function: main
 mixed_precision: fp16
-num_machines: 2
-num_processes: 16
+num_machines: 2 # the number of nodes
+num_processes: 16 # the number of GPUs in all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
diff --git a/examples/accelerate/single_config.yaml b/examples/accelerate/single_config.yaml
index ddb5c910..97f8c633 100644
--- a/examples/accelerate/single_config.yaml
+++ b/examples/accelerate/single_config.yaml
@@ -6,8 +6,8 @@ gpu_ids: all
 machine_rank: 0
 main_training_function: main
 mixed_precision: fp16
-num_machines: 1
-num_processes: 4
+num_machines: 1 # the number of nodes
+num_processes: 4 # the number of GPUs in all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
diff --git a/examples/accelerate/slave_config.yaml b/examples/accelerate/slave_config.yaml
index fcb4bb93..e4a63e82 100644
--- a/examples/accelerate/slave_config.yaml
+++ b/examples/accelerate/slave_config.yaml
@@ -8,8 +8,8 @@ main_process_ip: 192.168.0.1
 main_process_port: 29555
 main_training_function: main
 mixed_precision: fp16
-num_machines: 2
-num_processes: 16
+num_machines: 2 # the number of nodes
+num_processes: 16 # the number of GPUs in all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
diff --git a/examples/merge_lora/merge.sh b/examples/merge_lora/merge.sh
index bd2babb8..c1f15fce 100644
--- a/examples/merge_lora/merge.sh
+++ b/examples/merge_lora/merge.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# DO NOT use quantized model or quantization_bit when merging lora weights
 
 CUDA_VISIBLE_DEVICES= python ../../src/export_model.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \

From 04fc2f78bf7cd62991fa755f77c25ec85e7b844f Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 7 Apr 2024 00:48:24 +0800
Subject: [PATCH 045/341] update readme

Former-commit-id: 1cf15547e2420a3e5f7a969c21c10c7fbdfc71fe
---
 README.md    | 2 +-
 README_zh.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index edeedebd..33c358b0 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
-| [Qwen1.5 (MoE)](https://huggingface.co/Qwen)             | 0.5B/1.8B/4B/7B/14B/72B     | q_proj,v_proj     | qwen      |
+| [Qwen1.5 (MoE)](https://huggingface.co/Qwen)             | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                  | q_proj,v_proj     | xverse    |
 | [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                   | q_proj,v_proj     | yi        |
diff --git a/README_zh.md b/README_zh.md
index 09f02922..206af6f5 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -142,7 +142,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
-| [Qwen1.5 (MoE)](https://huggingface.co/Qwen)             | 0.5B/1.8B/4B/7B/14B/72B     | q_proj,v_proj     | qwen      |
+| [Qwen1.5 (MoE)](https://huggingface.co/Qwen)             | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                  | q_proj,v_proj     | xverse    |
 | [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                   | q_proj,v_proj     | yi        |

From 0cc03d3f05751d0fe9c38617a1dea8731c1de3ec Mon Sep 17 00:00:00 2001
From: sliderSun <291952004@qq.com>
Date: Sun, 7 Apr 2024 10:26:13 +0800
Subject: [PATCH 046/341] support Qwen1.5-32B

Former-commit-id: 8f2c67b95a8e177eb4096382417a70cacba38e90
---
 src/llmtuner/extras/constants.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 6e46218b..19bff11b 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -663,6 +663,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B",
         },
+        "Qwen1.5-32B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-32B",
+        },
         "Qwen1.5-72B": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B",
@@ -691,6 +695,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat",
         },
+        "Qwen1.5-32B-Chat": {
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B-Chat",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-32B-Chat",
+        },
         "Qwen1.5-72B-Chat": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat",

From 9f437b5c43f5834dc1012ce145e2bf9b2556f003 Mon Sep 17 00:00:00 2001
From: sliderSun <291952004@qq.com>
Date: Sun, 7 Apr 2024 10:56:03 +0800
Subject: [PATCH 047/341] support Qwen1.5-32B

Former-commit-id: c419adf1697b92520342f4ffa697c84bf19ca37d
---
 src/llmtuner/extras/constants.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 19bff11b..ec5a8fcf 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -747,6 +747,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat-AWQ",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat-AWQ",
         },
+        "Qwen1.5-32B-int4-Chat": {
+            DownloadSource.DEFAULT: "qwen/Qwen1.5-32B-Chat-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-32B-Chat-GPTQ-Int4",
+        },
         "Qwen1.5-72B-int8-Chat": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-GPTQ-Int8",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-GPTQ-Int8",

From 1131ddfaff92f0356055feeaa731a9c16d0fecbf Mon Sep 17 00:00:00 2001
From: sliderSun <291952004@qq.com>
Date: Sun, 7 Apr 2024 10:59:15 +0800
Subject: [PATCH 048/341] fix spell error

Former-commit-id: e6d36a2e593ebc1193b1735075c4ddb5d9f54990
---
 src/llmtuner/extras/constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index ec5a8fcf..2c7f5e5f 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -748,7 +748,7 @@ register_model_group(
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat-AWQ",
         },
         "Qwen1.5-32B-int4-Chat": {
-            DownloadSource.DEFAULT: "qwen/Qwen1.5-32B-Chat-GPTQ-Int4",
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B-Chat-GPTQ-Int4",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-32B-Chat-GPTQ-Int4",
         },
         "Qwen1.5-72B-int8-Chat": {

From 9e4fda326d9c5392b4ce7169768f52b46bdbb59c Mon Sep 17 00:00:00 2001
From: codingma <codingma@163.com>
Date: Sun, 7 Apr 2024 11:34:01 +0800
Subject: [PATCH 049/341] support
 https://github.com/hiyouga/LLaMA-Factory/issues/3152

Former-commit-id: 708f0ab4b0aa72e2c73ca36eb9ed058910e43092
---
 src/llmtuner/data/template.py    |  9 +++++++++
 src/llmtuner/extras/constants.py | 14 ++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 6cc12c56..440030db 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -773,3 +773,12 @@ _register_template(
     format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
     format_separator=EmptyFormatter(slots=["\n"]),
 )
+
+_register_template(
+    name="mediatek",
+    format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
+    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
+    default_system="You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan. ",
+    force_system=True,
+    efficient_eos=True,
+)
\ No newline at end of file
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 6e46218b..535c7cdb 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -969,3 +969,17 @@ register_model_group(
     },
     template="atom",
 )
+
+register_model_group(
+    models={
+        "Breeze-7B": {
+            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Base-v1_0",
+            DownloadSource.MODELSCOPE: "MediaTek-Research/Breeze-7B-Base-v1_0",
+        },
+        "Breeze-7B-Chat": {
+            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Instruct-v1_0",
+            DownloadSource.MODELSCOPE: "MediaTek-Research/Breeze-7B-Instruct-v1_0",
+        }
+    },
+    template="mediatek",
+)
\ No newline at end of file

From 75866aa0207d131e45eb736201c1fd971536e81b Mon Sep 17 00:00:00 2001
From: codingma <codingma@163.com>
Date: Sun, 7 Apr 2024 11:39:54 +0800
Subject: [PATCH 050/341] rename template to breeze

Former-commit-id: 1d894e7cfb73b8a29dababb554d051bd50e4f01d
---
 src/llmtuner/data/template.py    | 2 +-
 src/llmtuner/extras/constants.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 440030db..bd9e65e9 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -775,7 +775,7 @@ _register_template(
 )
 
 _register_template(
-    name="mediatek",
+    name="breeze",
     format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
     default_system="You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan. ",
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 535c7cdb..b2ee3058 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -981,5 +981,5 @@ register_model_group(
             DownloadSource.MODELSCOPE: "MediaTek-Research/Breeze-7B-Instruct-v1_0",
         }
     },
-    template="mediatek",
+    template="breeze",
 )
\ No newline at end of file

From 1a8a8b8651e30b141daf015cfc1c597712e60272 Mon Sep 17 00:00:00 2001
From: codingma <codingma@163.com>
Date: Sun, 7 Apr 2024 18:27:20 +0800
Subject: [PATCH 051/341] rename template to breeze

Former-commit-id: 1223e6358dab52b4e1505057f1b16fd9d527c79e
---
 src/llmtuner/data/template.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index bd9e65e9..88d7cb86 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -779,6 +779,5 @@ _register_template(
     format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
     default_system="You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan. ",
-    force_system=True,
     efficient_eos=True,
-)
\ No newline at end of file
+)

From 325dafcbb0daee72c436b066edfc0e28c9f03a25 Mon Sep 17 00:00:00 2001
From: codingma <codingma@163.com>
Date: Sun, 7 Apr 2024 18:28:08 +0800
Subject: [PATCH 052/341] add empty line

Former-commit-id: 1c6c2e611d10e9fa662e3f4e1e7d23b80ae496cb
---
 src/llmtuner/extras/constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index b2ee3058..34f89324 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -982,4 +982,4 @@ register_model_group(
         }
     },
     template="breeze",
-)
\ No newline at end of file
+)

From 6030a4a7208e7779c8e9faaf37239628e5f531cf Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 8 Apr 2024 21:28:39 +0800
Subject: [PATCH 053/341] tiny fix

Former-commit-id: d8f1ff51d4c920d4d0aeb9d53db29d1efb733c85
---
 src/llmtuner/data/template.py    | 20 ++++++++++++--------
 src/llmtuner/extras/constants.py | 31 +++++++++++++++----------------
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 88d7cb86..52358c1e 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -441,6 +441,18 @@ _register_template(
 )
 
 
+_register_template(
+    name="breeze",
+    format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
+    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
+    default_system=(
+        "You are a helpful AI assistant built by MediaTek Research. "
+        "The user you are helping speaks Traditional Chinese and comes from Taiwan."
+    ),
+    efficient_eos=True,
+)
+
+
 _register_template(
     name="chatglm2",
     format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问：{{content}}\n\n答："]),
@@ -773,11 +785,3 @@ _register_template(
     format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
     format_separator=EmptyFormatter(slots=["\n"]),
 )
-
-_register_template(
-    name="breeze",
-    format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
-    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
-    default_system="You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan. ",
-    efficient_eos=True,
-)
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 38fde712..729e0fa6 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -170,6 +170,19 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "Breeze-7B": {
+            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Base-v1_0",
+        },
+        "Breeze-7B-Chat": {
+            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Instruct-v1_0",
+        },
+    },
+    template="breeze",
+)
+
+
 register_model_group(
     models={
         "ChatGLM2-6B-Chat": {
@@ -748,8 +761,8 @@ register_model_group(
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat-AWQ",
         },
         "Qwen1.5-32B-int4-Chat": {
-            DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B-Chat-GPTQ-Int4",
-            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-32B-Chat-GPTQ-Int4",
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B-Chat-AWQ",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-32B-Chat-AWQ",
         },
         "Qwen1.5-72B-int8-Chat": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-GPTQ-Int8",
@@ -981,17 +994,3 @@ register_model_group(
     },
     template="atom",
 )
-
-register_model_group(
-    models={
-        "Breeze-7B": {
-            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Base-v1_0",
-            DownloadSource.MODELSCOPE: "MediaTek-Research/Breeze-7B-Base-v1_0",
-        },
-        "Breeze-7B-Chat": {
-            DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Instruct-v1_0",
-            DownloadSource.MODELSCOPE: "MediaTek-Research/Breeze-7B-Instruct-v1_0",
-        }
-    },
-    template="breeze",
-)

From 566d71b7a9069b784073fa64294c257891c38849 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 9 Apr 2024 17:12:59 +0800
Subject: [PATCH 054/341] fix quant infer and qwen2moe

Former-commit-id: b75d16767f35c36e2cf2aaab8a3844135085bccf
---
 src/llmtuner/model/loader.py  | 3 ---
 src/llmtuner/model/patcher.py | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index e91a7b68..2acbadb0 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -109,9 +109,6 @@ def load_model(
     if not is_trainable:
         model.requires_grad_(False)
         model.eval()
-        for param in model.parameters():
-            if param.device.type == "cuda":
-                param.data = param.data.to(model_args.compute_dtype)
     else:
         model.train()
 
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 434a3a84..a23d0ef3 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -316,6 +316,9 @@ def patch_config(
     if getattr(config, "model_type", None) == "qwen2" and is_trainable and model_args.flash_attn:
         setattr(config, "use_cache", False)  # qwen2 does not support use_cache when using flashattn
 
+    if getattr(config, "model_type", None) == "qwen2_moe" and is_trainable:
+        setattr(config, "output_router_logits", True)
+
     init_kwargs["torch_dtype"] = model_args.compute_dtype
     if not is_deepspeed_zero3_enabled():
         init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage

From 95a4589bbf8c05df47751e07865b0458617ded17 Mon Sep 17 00:00:00 2001
From: Erich Schubert <kno10@users.noreply.github.com>
Date: Tue, 9 Apr 2024 17:53:40 +0200
Subject: [PATCH 055/341] Pass additional_target to unsloth

Fixes #3200

Former-commit-id: f8f87f5b0549cba6a011749c42064047f82ba577
---
 src/llmtuner/model/adapter.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index eb6d3878..861b8008 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -145,6 +145,8 @@ def init_adapter(
                 from unsloth import FastLanguageModel  # type: ignore
 
                 unsloth_peft_kwargs = {"model": model, "max_seq_length": model_args.model_max_length}
+                if finetuning_args.additional_target:
+                    unsloth_peft_kwargs["modules_to_save"] = finetuning_args.additional_target                
                 model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
             else:
                 lora_config = LoraConfig(

From e25ddef08c9ed71e044c65a0714a1f9382097cd4 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 10 Apr 2024 00:57:30 +0800
Subject: [PATCH 056/341] Update adapter.py

Former-commit-id: a84b8d17dbf221259212e81931d80bcdd6284ad7
---
 src/llmtuner/model/adapter.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 861b8008..bf206907 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -139,20 +139,18 @@ def init_adapter(
                 "lora_alpha": finetuning_args.lora_alpha,
                 "lora_dropout": finetuning_args.lora_dropout,
                 "use_rslora": finetuning_args.use_rslora,
+                "modules_to_save": finetuning_args.additional_target,
             }
 
             if model_args.use_unsloth:
                 from unsloth import FastLanguageModel  # type: ignore
 
-                unsloth_peft_kwargs = {"model": model, "max_seq_length": model_args.model_max_length}
-                if finetuning_args.additional_target:
-                    unsloth_peft_kwargs["modules_to_save"] = finetuning_args.additional_target                
+                unsloth_peft_kwargs = {"model": model, "max_seq_length": model_args.model_max_length}       
                 model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
             else:
                 lora_config = LoraConfig(
                     task_type=TaskType.CAUSAL_LM,
                     inference_mode=False,
-                    modules_to_save=finetuning_args.additional_target,
                     use_dora=finetuning_args.use_dora,
                     **peft_kwargs,
                 )

From 7856f98965793a2cb911214586bafcfe2615f99d Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 10 Apr 2024 00:57:51 +0800
Subject: [PATCH 057/341] Update adapter.py

Former-commit-id: 720fde3683529ed7e08ac27c7c4598c6bdc30d44
---
 src/llmtuner/model/adapter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index bf206907..bf6f9381 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -145,7 +145,7 @@ def init_adapter(
             if model_args.use_unsloth:
                 from unsloth import FastLanguageModel  # type: ignore
 
-                unsloth_peft_kwargs = {"model": model, "max_seq_length": model_args.model_max_length}       
+                unsloth_peft_kwargs = {"model": model, "max_seq_length": model_args.model_max_length}
                 model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
             else:
                 lora_config = LoraConfig(

From 2bc2fe7b5ed241996b44380847a63309562ed5a2 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 10 Apr 2024 23:57:59 +0800
Subject: [PATCH 058/341] fix #3225

Former-commit-id: 94110ecf27c32e263f1f2ee61842a3a301b9e089
---
 src/llmtuner/extras/ploting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/extras/ploting.py b/src/llmtuner/extras/ploting.py
index aa101cb7..fd3cb8a3 100644
--- a/src/llmtuner/extras/ploting.py
+++ b/src/llmtuner/extras/ploting.py
@@ -52,6 +52,6 @@ def plot_loss(save_dictionary: os.PathLike, keys: List[str] = ["loss"]) -> None:
         plt.xlabel("step")
         plt.ylabel(key)
         plt.legend()
-        figure_path = os.path.join(save_dictionary, "training_{}.png".format(key.replace(os.path.sep, "_")))
+        figure_path = os.path.join(save_dictionary, "training_{}.png".format(key.replace("/", "_")))
         plt.savefig(figure_path, format="png", dpi=100)
         print("Figure saved at:", figure_path)

From 7468f2535cfc70017825571b2858717195216989 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 11 Apr 2024 20:08:51 +0800
Subject: [PATCH 059/341] release v0.6.2

Former-commit-id: f92ad0a62d957b595f6a76a5403216b163eb3d17
---
 README.md                              |  2 +-
 README_zh.md                           |  2 +-
 setup.py                               |  1 +
 src/llmtuner/__init__.py               |  2 +-
 src/llmtuner/data/loader.py            |  8 ++++---
 src/llmtuner/data/parser.py            | 31 ++++++++++++++++++--------
 src/llmtuner/extras/misc.py            | 11 +++------
 src/llmtuner/model/adapter.py          |  2 +-
 src/llmtuner/webui/components/eval.py  |  2 +-
 src/llmtuner/webui/components/train.py |  2 +-
 10 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 33c358b0..273a3a2f 100644
--- a/README.md
+++ b/README.md
@@ -305,7 +305,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-Extra dependencies available: deepspeed, metrics, unsloth, galore, vllm, bitsandbytes, gptq, awq, aqlm, qwen, quality
+Extra dependencies available: deepspeed, metrics, unsloth, galore, vllm, bitsandbytes, gptq, awq, aqlm, qwen, modelscope, quality
 
 <details><summary>For Windows users</summary>
 
diff --git a/README_zh.md b/README_zh.md
index 206af6f5..844a5f93 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -305,7 +305,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-可选的额外依赖项：deepspeed、metrics、unsloth、galore、vllm、bitsandbytes、gptq、awq、aqlm、qwen、quality
+可选的额外依赖项：deepspeed、metrics、unsloth、galore、vllm、bitsandbytes、gptq、awq、aqlm、qwen、modelscope、quality
 
 <details><summary>Windows 用户指南</summary>
 
diff --git a/setup.py b/setup.py
index 67b6f98d..fd5bdf7e 100644
--- a/setup.py
+++ b/setup.py
@@ -30,6 +30,7 @@ extra_require = {
     "awq": ["autoawq"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
     "qwen": ["tiktoken", "transformers_stream_generator"],
+    "modelscope": ["modelscope"],
     "quality": ["ruff"],
 }
 
diff --git a/src/llmtuner/__init__.py b/src/llmtuner/__init__.py
index 1a0189bb..f789dd83 100644
--- a/src/llmtuner/__init__.py
+++ b/src/llmtuner/__init__.py
@@ -7,5 +7,5 @@ from .train import export_model, run_exp
 from .webui import create_ui, create_web_demo
 
 
-__version__ = "0.6.2.dev0"
+__version__ = "0.6.2"
 __all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"]
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index c22f9a77..5414150e 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -6,7 +6,7 @@ from datasets import load_dataset, load_from_disk
 
 from ..extras.constants import FILEEXT2TYPE
 from ..extras.logging import get_logger
-from ..extras.misc import is_path_available
+from ..extras.misc import has_tokenized_data
 from .aligner import align_dataset
 from .parser import get_dataset_list
 from .preprocess import get_preprocess_and_print_func
@@ -81,7 +81,9 @@ def load_single_dataset(
                 cache_dir=cache_dir,
                 token=model_args.ms_hub_token,
                 use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
-            ).to_hf_dataset()
+            )
+            if isinstance(dataset, MsDataset):
+                dataset = dataset.to_hf_dataset()
         except ImportError:
             raise ImportError("Please install modelscope via `pip install modelscope -U`")
     else:
@@ -125,7 +127,7 @@ def get_dataset(
 
     # Load tokenized dataset
     if data_args.tokenized_path is not None:
-        if not is_path_available(data_args.tokenized_path):
+        if has_tokenized_data(data_args.tokenized_path):
             logger.warning("Loading dataset from disk will ignore other data arguments.")
             dataset = load_from_disk(data_args.tokenized_path)
             logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path))
diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py
index 861396a0..b9c8782a 100644
--- a/src/llmtuner/data/parser.py
+++ b/src/llmtuner/data/parser.py
@@ -53,22 +53,35 @@ class DatasetAttr:
 
 
 def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
-    dataset_names = [ds.strip() for ds in data_args.dataset.split(",")] if data_args.dataset is not None else []
-    try:
-        with open(os.path.join(data_args.dataset_dir, DATA_CONFIG), "r") as f:
-            dataset_info = json.load(f)
-    except Exception as err:
-        if data_args.dataset is not None:
-            raise ValueError(
-                "Cannot open {} due to {}.".format(os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err))
-            )
+    if data_args.dataset is not None:
+        dataset_names = [ds.strip() for ds in data_args.dataset.split(",")]
+    else:
+        dataset_names = []
+
+    if data_args.dataset_dir == "ONLINE":
         dataset_info = None
+    else:
+        try:
+            with open(os.path.join(data_args.dataset_dir, DATA_CONFIG), "r") as f:
+                dataset_info = json.load(f)
+        except Exception as err:
+            if len(dataset_names) != 0:
+                raise ValueError(
+                    "Cannot open {} due to {}.".format(os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err))
+                )
+            dataset_info = None
 
     if data_args.interleave_probs is not None:
         data_args.interleave_probs = [float(prob.strip()) for prob in data_args.interleave_probs.split(",")]
 
     dataset_list: List[DatasetAttr] = []
     for name in dataset_names:
+        if dataset_info is None:
+            load_from = "ms_hub" if use_modelscope() else "hf_hub"
+            dataset_attr = DatasetAttr(load_from, dataset_name=name)
+            dataset_list.append(dataset_attr)
+            continue
+
         if name not in dataset_info:
             raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
 
diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index 49b99eee..12d1446f 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -193,16 +193,11 @@ def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype:
         return torch.float32
 
 
-def is_path_available(path: os.PathLike) -> bool:
+def has_tokenized_data(path: os.PathLike) -> bool:
     r"""
-    Checks if the path is empty or not exist.
+    Checks if the path has a tokenized dataset.
     """
-    if not os.path.exists(path):
-        return True
-    elif os.path.isdir(path) and not os.listdir(path):
-        return True
-    else:
-        return False
+    return os.path.isdir(path) and len(os.listdir(path)) > 0
 
 
 def torch_gc() -> None:
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index bf6f9381..eb6d3878 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -139,7 +139,6 @@ def init_adapter(
                 "lora_alpha": finetuning_args.lora_alpha,
                 "lora_dropout": finetuning_args.lora_dropout,
                 "use_rslora": finetuning_args.use_rslora,
-                "modules_to_save": finetuning_args.additional_target,
             }
 
             if model_args.use_unsloth:
@@ -151,6 +150,7 @@ def init_adapter(
                 lora_config = LoraConfig(
                     task_type=TaskType.CAUSAL_LM,
                     inference_mode=False,
+                    modules_to_save=finetuning_args.additional_target,
                     use_dora=finetuning_args.use_dora,
                     **peft_kwargs,
                 )
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index 87611da5..d41ef857 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -18,7 +18,7 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Row():
         dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=2)
-        dataset = gr.Dropdown(multiselect=True, scale=4)
+        dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
     input_elems.update({dataset_dir, dataset})
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 4f108db0..10954c1b 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -23,7 +23,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=1
         )
         dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=1)
-        dataset = gr.Dropdown(multiselect=True, scale=4, allow_custom_value=True)
+        dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
     input_elems.update({training_stage, dataset_dir, dataset})

From 1a77de82fa4d1b9070ff2eb2b7cf2402007d7f84 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 11 Apr 2024 20:27:34 +0800
Subject: [PATCH 060/341] set dev version

Former-commit-id: f6cc76571d2c789675883a18e0db3d0c61f33808
---
 src/llmtuner/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/__init__.py b/src/llmtuner/__init__.py
index f789dd83..9d90a59e 100644
--- a/src/llmtuner/__init__.py
+++ b/src/llmtuner/__init__.py
@@ -7,5 +7,5 @@ from .train import export_model, run_exp
 from .webui import create_ui, create_web_demo
 
 
-__version__ = "0.6.2"
+__version__ = "0.6.3.dev0"
 __all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"]

From 31bbbb6d13c04e85bc3ad40037ed31ea67b2f117 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 12 Apr 2024 14:28:11 +0800
Subject: [PATCH 061/341] fix #3238

Former-commit-id: 4d7e81ab4722d13bec6ca1af141f94bdc74d0883
---
 src/llmtuner/hparams/parser.py | 6 +++++-
 src/llmtuner/train/tuner.py    | 3 +--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 9264d1ee..4abd3f03 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -277,7 +277,11 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
 
     _verify_model_args(model_args, finetuning_args)
 
-    model_args.device_map = "auto"
+    if model_args.export_dir is not None:
+        model_args.device_map = {"": "cpu"}
+        model_args.compute_dtype = torch.float32
+    else:
+        model_args.device_map = "auto"
 
     return model_args, data_args, finetuning_args, generating_args
 
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index f6c2e16b..a8a2b8e9 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -65,8 +65,7 @@ def export_model(args: Optional[Dict[str, Any]] = None):
     if getattr(model, "quantization_method", None) is None:  # cannot convert dtype of a quantized model
         output_dtype = getattr(model.config, "torch_dtype", torch.float16)
         setattr(model.config, "torch_dtype", output_dtype)
-        for param in model.parameters():
-            param.data = param.data.to(output_dtype)
+        model = model.to(output_dtype)
 
     model.save_pretrained(
         save_directory=model_args.export_dir,

From 5486ea09e3e7673e802cc97eb97e8da0a4392cfd Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 12 Apr 2024 17:11:59 +0800
Subject: [PATCH 062/341] fix model card

Former-commit-id: 920e7149bf2b559c9829aa4b11cfb6d00bbb2f9e
---
 src/llmtuner/train/utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index cf199633..fef63c39 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -57,9 +57,11 @@ def create_modelcard_and_push(
     kwargs = {
         "tasks": "text-generation",
         "finetuned_from": model_args.model_name_or_path,
-        "dataset": [dataset.strip() for dataset in data_args.dataset.split(",")],
         "tags": ["llama-factory", finetuning_args.finetuning_type],
     }
+    if data_args.dataset is not None:
+        kwargs["dataset"] = [dataset.strip() for dataset in data_args.dataset.split(",")]
+
     if not training_args.do_train:
         pass
     elif training_args.push_to_hub:

From 106a0104da0930a91c987cc0b02a870f63f6704f Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 12 Apr 2024 17:41:33 +0800
Subject: [PATCH 063/341] fix #3247

Former-commit-id: bb67c66f80627805b585d157ba807c0ce378d3f2
---
 data/alpaca_data_zh_51k.json.REMOVED.git-id | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/alpaca_data_zh_51k.json.REMOVED.git-id b/data/alpaca_data_zh_51k.json.REMOVED.git-id
index f28dd6e9..0cd1db46 100644
--- a/data/alpaca_data_zh_51k.json.REMOVED.git-id
+++ b/data/alpaca_data_zh_51k.json.REMOVED.git-id
@@ -1 +1 @@
-34c723573fbc2d7601f6d9c882ccf5aa4f9bcc4b
\ No newline at end of file
+a97cf9475291591843976554878568e046d8a46d
\ No newline at end of file

From f3284136467a04863c27046bf4a106551dcd915e Mon Sep 17 00:00:00 2001
From: marko1616 <marko1616@outlook.com>
Date: Sat, 13 Apr 2024 04:31:33 +0800
Subject: [PATCH 064/341] Add c4ai template & model support (not tested)

Former-commit-id: 60bb60c4dc30a9641ddb57a44ef126f0768566c4
---
 src/llmtuner/data/template.py    |  8 ++++++++
 src/llmtuner/extras/constants.py | 10 ++++++++++
 2 files changed, 18 insertions(+)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 52358c1e..aefa3e45 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -785,3 +785,11 @@ _register_template(
     format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
     format_separator=EmptyFormatter(slots=["\n"]),
 )
+
+_register_template(
+    name="c4ai",
+    format_user=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]),
+    format_assistant=StringFormatter(slots=["{{content}}", "<|END_OF_TURN_TOKEN|>"]),
+    format_system=StringFormatter(slots=[{"bos_token"},"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>"]),
+    default_system="You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.",
+)
\ No newline at end of file
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 729e0fa6..0cd2f987 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -994,3 +994,13 @@ register_model_group(
     },
     template="atom",
 )
+
+register_model_group(
+    models={
+        "C4AI-Command-R-35B": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-v01",
+        }
+    },
+    template="c4ai",
+)
\ No newline at end of file

From c991654cb4d70b076f424a5a7273b560c6fedf68 Mon Sep 17 00:00:00 2001
From: marko1616 <marko1616@outlook.com>
Date: Sat, 13 Apr 2024 07:32:40 +0800
Subject: [PATCH 065/341] Add c4ai-command-r-plus link

Former-commit-id: acaf953ca46eca8fb378067f4ada133654e4f088
---
 src/llmtuner/extras/constants.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 0cd2f987..cf68a225 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -1003,4 +1003,14 @@ register_model_group(
         }
     },
     template="c4ai",
+)
+
+register_model_group(
+    models={
+        "C4AI-Command-R-plus-104B": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-plus",
+        }
+    },
+    template="c4ai",
 )
\ No newline at end of file

From aeec78b35c69e133c0f6f39a9c665c29ef6c750e Mon Sep 17 00:00:00 2001
From: marko1616 <marko1616@outlook.com>
Date: Sat, 13 Apr 2024 07:52:11 +0800
Subject: [PATCH 066/341] Fix formatting in template.py (add blank line)

Former-commit-id: 51b1e49e288e66c1b0c24ac070201c988fb2a389
---
 src/llmtuner/data/template.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index aefa3e45..f409cd9a 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -786,6 +786,7 @@ _register_template(
     format_separator=EmptyFormatter(slots=["\n"]),
 )
 
+
 _register_template(
     name="c4ai",
     format_user=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]),

From 0dcc9e0bcae7bd6a032e73042e8ff8c0754dd943 Mon Sep 17 00:00:00 2001
From: marko1616 <marko1616@outlook.com>
Date: Sat, 13 Apr 2024 17:30:21 +0800
Subject: [PATCH 067/341] Fix formatting in constants.py (add blank lines)

Former-commit-id: 607625497738b2c8be736be7b0bd5c6f4cbaad5e
---
 src/llmtuner/extras/constants.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index cf68a225..5b81a9a9 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -995,6 +995,7 @@ register_model_group(
     template="atom",
 )
 
+
 register_model_group(
     models={
         "C4AI-Command-R-35B": {
@@ -1005,6 +1006,7 @@ register_model_group(
     template="c4ai",
 )
 
+
 register_model_group(
     models={
         "C4AI-Command-R-plus-104B": {

From daaafa900a2b0d232ba5157e652dd042c7932d32 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Sat, 13 Apr 2024 20:45:19 +0800
Subject: [PATCH 068/341] Add example script for single-GPU full-parameter
 prediction

Former-commit-id: d8d4fb9fa4b0e1950a453682e5e186f34f085dee
---
 examples/full_multi_gpu/predict.sh | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 examples/full_multi_gpu/predict.sh

diff --git a/examples/full_multi_gpu/predict.sh b/examples/full_multi_gpu/predict.sh
new file mode 100644
index 00000000..af94d0a8
--- /dev/null
+++ b/examples/full_multi_gpu/predict.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+    --stage sft \
+    --do_predict \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../data \
+    --template default \
+    --finetuning_type full \
+    --output_dir ../../saves/LLaMA2-7B/full/predict \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_eval_batch_size 1 \
+    --max_samples 20 \
+    --predict_with_generate

From 19adfb88a9512f30177e709056b3fe3278a23e31 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Sat, 13 Apr 2024 20:50:49 +0800
Subject: [PATCH 069/341] Update examples/README_zh.md (add predict.sh entry)

Former-commit-id: 697f768d7185789ee054c94f4f161a65b8a505bc
---
 examples/README_zh.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/README_zh.md b/examples/README_zh.md
index a77209b3..de4bc6e4 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -22,6 +22,7 @@ examples/
 ├── full_multi_gpu/
 │   ├── single_node.sh: 使用 DeepSpeed 进行单节点训练
 │   └── multi_node.sh: 使用 DeepSpeed 进行多节点训练
+|   └── predict.sh: 使用单卡做全参批量预测
 ├── merge_lora/
 │   ├── merge.sh: 将 LoRA 权重合并到预训练模型中
 │   └── quantize.sh: 使用 AutoGPTQ 量化模型

From fde3d91242eec49ffc95661db2e86c94dd26456c Mon Sep 17 00:00:00 2001
From: liuzc <lzc410374@alibaba-inc.com>
Date: Mon, 15 Apr 2024 12:11:49 +0800
Subject: [PATCH 070/341] fix: mixtral output_router_logits

Former-commit-id: ab3171ea97ec968b972287287ef9ee2502c6d37c
---
 src/llmtuner/model/patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index a23d0ef3..e7807e56 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -316,7 +316,7 @@ def patch_config(
     if getattr(config, "model_type", None) == "qwen2" and is_trainable and model_args.flash_attn:
         setattr(config, "use_cache", False)  # qwen2 does not support use_cache when using flashattn
 
-    if getattr(config, "model_type", None) == "qwen2_moe" and is_trainable:
+    if getattr(config, "model_type", None) in ["mixtral", "qwen2_moe"] and is_trainable:
         setattr(config, "output_router_logits", True)
 
     init_kwargs["torch_dtype"] = model_args.compute_dtype

From 9338f878a398d78c0b3284f8150c84fab608a532 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 15 Apr 2024 15:32:58 +0800
Subject: [PATCH 071/341] fix #3273

Former-commit-id: 3b20c89b342a068356ffc29c3724b645775c65db
---
 src/llmtuner/hparams/model_args.py |  4 ++++
 src/llmtuner/hparams/parser.py     |  6 +++---
 src/llmtuner/model/adapter.py      | 12 +++++++++---
 src/llmtuner/model/patcher.py      |  4 ++--
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index be71d32f..514c8714 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -129,6 +129,10 @@ class ModelArguments:
         default=1,
         metadata={"help": "The file shard size (in GB) of the exported model."},
     )
+    export_device: str = field(
+        default="cpu",
+        metadata={"help": "The device used in model export."},
+    )
     export_quantization_bit: Optional[int] = field(
         default=None,
         metadata={"help": "The number of bits to quantize the exported model."},
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 4abd3f03..1865ff17 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -10,7 +10,7 @@ from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import is_torch_bf16_gpu_available
 
 from ..extras.logging import get_logger
-from ..extras.misc import check_dependencies
+from ..extras.misc import check_dependencies, get_current_device
 from ..extras.packages import is_unsloth_available
 from .data_args import DataArguments
 from .evaluation_args import EvaluationArguments
@@ -235,6 +235,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     elif training_args.fp16:
         model_args.compute_dtype = torch.float16
 
+    model_args.device_map = {"": get_current_device()}
     model_args.model_max_length = data_args.cutoff_len
     data_args.packing = data_args.packing if data_args.packing is not None else finetuning_args.stage == "pt"
 
@@ -278,8 +279,7 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
     _verify_model_args(model_args, finetuning_args)
 
     if model_args.export_dir is not None:
-        model_args.device_map = {"": "cpu"}
-        model_args.compute_dtype = torch.float32
+        model_args.device_map = {"": torch.device(model_args.export_device)}
     else:
         model_args.device_map = "auto"
 
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index eb6d3878..4bb4057d 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -32,6 +32,9 @@ def init_adapter(
         logger.info("Adapter is not found at evaluation, load the base model.")
         return model
 
+    if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None):
+        raise ValueError("You can only use lora for quantized models.")
+
     if finetuning_args.finetuning_type == "full" and is_trainable:
         logger.info("Fine-tuning method: Full")
         if not finetuning_args.pure_bf16:
@@ -129,9 +132,12 @@ def init_adapter(
             if finetuning_args.use_llama_pro:
                 target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable)
 
-            if finetuning_args.use_dora and getattr(model, "quantization_method", None) is not None:
-                if getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES:
-                    raise ValueError("DoRA is not compatible with PTQ-quantized models.")
+            if (
+                finetuning_args.use_dora
+                and getattr(model, "quantization_method", None) is not None
+                and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
+            ):
+                raise ValueError("DoRA is not compatible with PTQ-quantized models.")
 
             peft_kwargs = {
                 "r": finetuning_args.lora_rank,
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index a23d0ef3..fe707af7 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -323,8 +323,8 @@ def patch_config(
     if not is_deepspeed_zero3_enabled():
         init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage
         if init_kwargs["low_cpu_mem_usage"]:
-            if "device_map" not in init_kwargs:
-                init_kwargs["device_map"] = model_args.device_map or {"": get_current_device()}
+            if "device_map" not in init_kwargs and model_args.device_map:
+                init_kwargs["device_map"] = model_args.device_map
 
             if init_kwargs["device_map"] == "auto":
                 init_kwargs["offload_folder"] = model_args.offload_folder

From dfaa31e99173cc96be65a78c5931a16e002dd929 Mon Sep 17 00:00:00 2001
From: marko1616 <marko1616@outlook.com>
Date: Mon, 15 Apr 2024 20:16:52 +0800
Subject: [PATCH 072/341] Add support for function call(Not strictly following
 origin)

Former-commit-id: 44f3ada4e394c06b0d972329ed2a62d2be2ea0c6
---
 src/llmtuner/data/template.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index f409cd9a..b41d2642 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -792,5 +792,21 @@ _register_template(
     format_user=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]),
     format_assistant=StringFormatter(slots=["{{content}}", "<|END_OF_TURN_TOKEN|>"]),
     format_system=StringFormatter(slots=[{"bos_token"},"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>"]),
-    default_system="You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.",
+    format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]),
+    format_observation=StringFormatter(
+        slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}","<|END_OF_TURN_TOKEN|>","<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]
+    ),
+    default_system=("# Safety Preamble\n",
+        "The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n\n",
+        "# System Preamble\n",
+        "## Basic Rules\n",
+        "You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n",
+        "# User Preamble\n",
+        "## Task and Context\n",
+        "You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n",
+        "## Style Guide\n",
+        "Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.\n"
+        "## Available Tools\n",
+        "Here is a list of tools that you have available to you:\n"
+    )
 )
\ No newline at end of file

From 72dd67620800ccfec1178fb74d5f08e20aa1c40b Mon Sep 17 00:00:00 2001
From: marko1616 <marko1616@outlook.com>
Date: Mon, 15 Apr 2024 20:27:09 +0800
Subject: [PATCH 073/341] Revert "Add support for function call(Not strictly
 following origin)"

This reverts commit dfaa31e99173cc96be65a78c5931a16e002dd929 [formerly 44f3ada4e394c06b0d972329ed2a62d2be2ea0c6].


Former-commit-id: fac9cc6e01dd8f3bc449b656804476e1871326f0
---
 src/llmtuner/data/template.py | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index b41d2642..f409cd9a 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -792,21 +792,5 @@ _register_template(
     format_user=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]),
     format_assistant=StringFormatter(slots=["{{content}}", "<|END_OF_TURN_TOKEN|>"]),
     format_system=StringFormatter(slots=[{"bos_token"},"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>"]),
-    format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]),
-    format_observation=StringFormatter(
-        slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}","<|END_OF_TURN_TOKEN|>","<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]
-    ),
-    default_system=("# Safety Preamble\n",
-        "The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n\n",
-        "# System Preamble\n",
-        "## Basic Rules\n",
-        "You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n",
-        "# User Preamble\n",
-        "## Task and Context\n",
-        "You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n",
-        "## Style Guide\n",
-        "Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.\n"
-        "## Available Tools\n",
-        "Here is a list of tools that you have available to you:\n"
-    )
+    default_system="You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.",
 )
\ No newline at end of file

From 952b785bb3c23522b21056832dbfbf3b919ab69a Mon Sep 17 00:00:00 2001
From: marko1616 <marko1616@outlook.com>
Date: Mon, 15 Apr 2024 20:45:46 +0800
Subject: [PATCH 074/341] change default_system according to official template

Former-commit-id: 7ad9029c5e77a87a7c324b8f90b4f80a31a5c78b
---
 src/llmtuner/data/template.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index f409cd9a..d85adcf3 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -789,8 +789,20 @@ _register_template(
 
 _register_template(
     name="c4ai",
-    format_user=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]),
+    format_user=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]),
     format_assistant=StringFormatter(slots=["{{content}}", "<|END_OF_TURN_TOKEN|>"]),
     format_system=StringFormatter(slots=[{"bos_token"},"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>"]),
-    default_system="You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.",
+    format_observation=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]),
+    default_system=(
+        "# Safety Preamble\n",
+        "The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n\n",
+        "# System Preamble\n",
+        "## Basic Rules\n",
+        "You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n",
+        "# User Preamble\n",
+        "## Task and Context\n",
+        "You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n",
+        "## Style Guide\n",
+        "Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling."
+    )
 )
\ No newline at end of file

From 276f2cb24e2af7f731b6c3f851f4964a6bea7d19 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 15 Apr 2024 22:14:34 +0800
Subject: [PATCH 075/341] update examples

Former-commit-id: 369294b31c8a03a1cafcee83eb31a817007d3c49
---
 examples/README.md                 | 47 +++++++++++++++---------------
 examples/README_zh.md              | 46 ++++++++++++++---------------
 examples/extras/loraplus/sft.sh    |  4 +--
 examples/full_multi_gpu/predict.sh |  2 +-
 examples/merge_lora/merge.sh       |  2 +-
 src/llmtuner/chat/vllm_engine.py   |  3 --
 src/llmtuner/extras/packages.py    |  4 ---
 src/llmtuner/hparams/parser.py     | 28 +++++++++++++++---
 src/llmtuner/model/loader.py       |  4 ++-
 src/llmtuner/train/sft/metric.py   |  5 ----
 src/llmtuner/train/utils.py        |  3 --
 11 files changed, 78 insertions(+), 70 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 4f34be52..4e771c2e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,41 +3,42 @@ We provide diverse examples about fine-tuning LLMs.
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: Do pre-training
-│   ├── sft.sh: Do supervised fine-tuning
-│   ├── reward.sh: Do reward modeling
-│   ├── ppo.sh: Do PPO training
-│   ├── dpo.sh: Do DPO training
-│   ├── orpo.sh: Do ORPO training
+│   ├── pretrain.sh: Do pre-training using LoRA
+│   ├── sft.sh: Do supervised fine-tuning using LoRA
+│   ├── reward.sh: Do reward modeling using LoRA
+│   ├── ppo.sh: Do PPO training using LoRA
+│   ├── dpo.sh: Do DPO training using LoRA
+│   ├── orpo.sh: Do ORPO training using LoRA
 │   ├── prepare.sh: Save tokenized dataset
-│   └── predict.sh: Do batch predict
+│   └── predict.sh: Do batch predict and compute BLEU and ROUGE scores after LoRA tuning
 ├── qlora_single_gpu/
-│   ├── bitsandbytes.sh: Fine-tune 4/8-bit BNB models
-│   ├── gptq.sh: Fine-tune 4/8-bit GPTQ models
-│   ├── awq.sh: Fine-tune 4-bit AWQ models
-│   └── aqlm.sh: Fine-tune 2-bit AQLM models
+│   ├── bitsandbytes.sh: Fine-tune 4/8-bit BNB models using QLoRA
+│   ├── gptq.sh: Fine-tune 4/8-bit GPTQ models using QLoRA
+│   ├── awq.sh: Fine-tune 4-bit AWQ models using QLoRA
+│   └── aqlm.sh: Fine-tune 2-bit AQLM models using QLoRA
 ├── lora_multi_gpu/
-│   ├── single_node.sh: Fine-tune model with Accelerate on single node
-│   └── multi_node.sh: Fine-tune model with Accelerate on multiple nodes
+│   ├── single_node.sh: Fine-tune model with Accelerate on single node using LoRA
+│   └── multi_node.sh: Fine-tune model with Accelerate on multiple nodes using LoRA
 ├── full_multi_gpu/
-│   ├── single_node.sh: Fine-tune model with DeepSpeed on single node
-│   └── multi_node.sh: Fine-tune model with DeepSpeed on multiple nodes
+│   ├── single_node.sh: Full fine-tune model with DeepSpeed on single node
+│   ├── multi_node.sh: Full fine-tune model with DeepSpeed on multiple nodes
+│   └── predict.sh: Do batch predict and compute BLEU and ROUGE scores after full tuning
 ├── merge_lora/
 │   ├── merge.sh: Merge LoRA weights into the pre-trained models
-│   └── quantize.sh: Quantize fine-tuned model with AutoGPTQ
+│   └── quantize.sh: Quantize the fine-tuned model with AutoGPTQ
 ├── inference/
-│   ├── cli_demo.sh: Launch a command line interface
-│   ├── api_demo.sh: Launch an OpenAI-style API
-│   ├── web_demo.sh: Launch a web interface
-│   └── evaluate.sh: Evaluate model on the MMLU benchmark
+│   ├── cli_demo.sh: Launch a command line interface with LoRA adapters
+│   ├── api_demo.sh: Launch an OpenAI-style API with LoRA adapters
+│   ├── web_demo.sh: Launch a web interface with LoRA adapters
+│   └── evaluate.sh: Evaluate model on the MMLU/CMMLU/C-Eval benchmarks with LoRA adapters
 └── extras/
     ├── galore/
     │   └── sft.sh: Fine-tune model with GaLore
     ├── loraplus/
-    │   └── sft.sh: Fine-tune model with LoRA+
+    │   └── sft.sh: Fine-tune model using LoRA+
     ├── llama_pro/
     │   ├── expand.sh: Expand layers in the model
-    │   └── sft.sh: Fine-tune expanded model
+    │   └── sft.sh: Fine-tune the expanded model
     └── fsdp_qlora/
-        └── sft.sh: Fine-tune quantized model with FSDP
+        └── sft.sh: Fine-tune quantized model with FSDP+QLoRA
 ```
diff --git a/examples/README_zh.md b/examples/README_zh.md
index de4bc6e4..badda0fe 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -1,36 +1,36 @@
-我们提供了多样化的示例脚本。
+我们提供了多样化的大模型微调示例脚本。
 
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: 进行预训练
-│   ├── sft.sh: 进行指令监督微调
-│   ├── reward.sh: 进行奖励模型训练
-│   ├── ppo.sh: 进行 PPO 训练
-│   ├── dpo.sh: 进行 DPO 训练
-│   ├── orpo.sh: 进行 ORPO 训练
+│   ├── pretrain.sh: 基于 LoRA 进行预训练
+│   ├── sft.sh: 基于 LoRA 进行指令监督微调
+│   ├── reward.sh: 基于 LoRA 进行奖励模型训练
+│   ├── ppo.sh: 基于 LoRA 进行 PPO 训练
+│   ├── dpo.sh: 基于 LoRA 进行 DPO 训练
+│   ├── orpo.sh: 基于 LoRA 进行 ORPO 训练
 │   ├── prepare.sh: 保存预处理后的数据集
-│   └── predict.sh: 进行批量预测
+│   └── predict.sh: 基于 LoRA 进行批量预测并计算 BLEU 和 ROUGE 分数
 ├── qlora_single_gpu/
-│   ├── bitsandbytes.sh: 微调 4/8 比特 BNB 模型
-│   ├── gptq.sh: 微调 4/8 比特 GPTQ 模型
-│   ├── awq.sh: 微调 4 比特 AWQ 模型
-│   └── aqlm.sh: 微调 2 比特 AQLM 模型
+│   ├── bitsandbytes.sh: 基于 QLoRA 微调 4/8 比特 BNB 模型
+│   ├── gptq.sh: 基于 QLoRA 微调 4/8 比特 GPTQ 模型
+│   ├── awq.sh: 基于 QLoRA 微调 4 比特 AWQ 模型
+│   └── aqlm.sh: 基于 QLoRA 微调 2 比特 AQLM 模型
 ├── lora_multi_gpu/
-│   ├── single_node.sh: 使用 Accelerate 进行单节点训练
-│   └── multi_node.sh: 使用 Accelerate 进行多节点训练
+│   ├── single_node.sh: 使用 Accelerate 进行单节点 LoRA 训练
+│   └── multi_node.sh: 使用 Accelerate 进行多节点 LoRA 训练
 ├── full_multi_gpu/
-│   ├── single_node.sh: 使用 DeepSpeed 进行单节点训练
-│   └── multi_node.sh: 使用 DeepSpeed 进行多节点训练
-|   └── predict.sh: 使用单卡做全参批量预测
+│   ├── single_node.sh: 使用 DeepSpeed 进行单节点全量训练
+│   ├── multi_node.sh: 使用 DeepSpeed 进行多节点全量训练
+│   └── predict.sh: 基于全量训练进行批量预测并计算 BLEU 和 ROUGE 分数
 ├── merge_lora/
 │   ├── merge.sh: 将 LoRA 权重合并到预训练模型中
-│   └── quantize.sh: 使用 AutoGPTQ 量化模型
+│   └── quantize.sh: 使用 AutoGPTQ 量化微调后的模型
 ├── inference/
-│   ├── cli_demo.sh: 启动命令行推理接口
-│   ├── api_demo.sh: 启动 OpenAI 风格 API
-│   ├── web_demo.sh: 启动浏览器推理接口
-│   └── evaluate.sh: 在 MMLU 数据集上评测模型
+│   ├── cli_demo.sh: 启动 LoRA 模型的命令行推理接口
+│   ├── api_demo.sh: 启动 LoRA 模型的 OpenAI 风格 API
+│   ├── web_demo.sh: 启动 LoRA 模型的浏览器推理接口
+│   └── evaluate.sh: 在 MMLU/CMMLU/C-Eval 数据集上评测 LoRA 模型
 └── extras/
     ├── galore/
     │   └── sft.sh: 使用 GaLore 训练模型
@@ -40,5 +40,5 @@ examples/
     │   ├── expand.sh: 扩展模型中的层
     │   └── sft.sh: 训练扩展后的模型
     └── fsdp_qlora/
-        └── sft.sh: 使用 FSDP 微调量化模型
+        └── sft.sh: 使用 FSDP+QLoRA 微调量化模型
 ```
diff --git a/examples/extras/loraplus/sft.sh b/examples/extras/loraplus/sft.sh
index 8bc16cdf..cb334e7d 100644
--- a/examples/extras/loraplus/sft.sh
+++ b/examples/extras/loraplus/sft.sh
@@ -9,6 +9,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --template default \
     --finetuning_type lora \
     --lora_target q_proj,v_proj \
+    --loraplus_lr_ratio 16.0 \
     --output_dir ../../saves/LLaMA2-7B/loraplus/sft \
     --overwrite_cache \
     --overwrite_output_dir \
@@ -29,5 +30,4 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --max_samples 3000 \
     --val_size 0.1 \
     --plot_loss \
-    --fp16 \
-    --loraplus_lr_ratio 16.0
+    --fp16
diff --git a/examples/full_multi_gpu/predict.sh b/examples/full_multi_gpu/predict.sh
index af94d0a8..52fdc7a0 100644
--- a/examples/full_multi_gpu/predict.sh
+++ b/examples/full_multi_gpu/predict.sh
@@ -3,7 +3,7 @@
 CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
     --stage sft \
     --do_predict \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --model_name_or_path ../../saves/LLaMA2-7B/full/sft \
     --dataset alpaca_gpt4_en,glaive_toolcall \
     --dataset_dir ../../data \
     --template default \
diff --git a/examples/merge_lora/merge.sh b/examples/merge_lora/merge.sh
index c1f15fce..8c50591e 100644
--- a/examples/merge_lora/merge.sh
+++ b/examples/merge_lora/merge.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # DO NOT use quantized model or quantization_bit when merging lora weights
 
-CUDA_VISIBLE_DEVICES= python ../../src/export_model.py \
+CUDA_VISIBLE_DEVICES=0 python ../../src/export_model.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template default \
diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index 9911e361..e924ef6e 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -1,8 +1,6 @@
 import uuid
 from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence
 
-from transformers.utils.versions import require_version
-
 from ..data import get_template_and_fix_tokenizer
 from ..extras.misc import get_device_count
 from ..extras.packages import is_vllm_available
@@ -25,7 +23,6 @@ class VllmEngine(BaseEngine):
         finetuning_args: "FinetuningArguments",
         generating_args: "GeneratingArguments",
     ) -> None:
-        require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3")
         self.can_generate = finetuning_args.stage == "sft"
         engine_args = AsyncEngineArgs(
             model=model_args.model_name_or_path,
diff --git a/src/llmtuner/extras/packages.py b/src/llmtuner/extras/packages.py
index cf10ffd4..b134ddab 100644
--- a/src/llmtuner/extras/packages.py
+++ b/src/llmtuner/extras/packages.py
@@ -49,10 +49,6 @@ def is_starlette_available():
     return _is_package_available("sse_starlette")
 
 
-def is_unsloth_available():
-    return _is_package_available("unsloth")
-
-
 def is_uvicorn_available():
     return _is_package_available("uvicorn")
 
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 1865ff17..8f3bd18a 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -8,10 +8,10 @@ import transformers
 from transformers import HfArgumentParser, Seq2SeqTrainingArguments
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import is_torch_bf16_gpu_available
+from transformers.utils.versions import require_version
 
 from ..extras.logging import get_logger
 from ..extras.misc import check_dependencies, get_current_device
-from ..extras.packages import is_unsloth_available
 from .data_args import DataArguments
 from .evaluation_args import EvaluationArguments
 from .finetuning_args import FinetuningArguments
@@ -74,6 +74,26 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin
             raise ValueError("Quantized model only accepts a single adapter. Merge them first.")
 
 
+def _check_extra_dependencies(
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+    training_args: Optional["Seq2SeqTrainingArguments"] = None,
+) -> None:
+    if model_args.use_unsloth:
+        require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth")
+
+    if model_args.infer_backend == "vllm":
+        require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3")
+
+    if finetuning_args.use_galore:
+        require_version("galore_torch", "To fix: pip install galore_torch")
+
+    if training_args is not None and training_args.predict_with_generate:
+        require_version("jieba", "To fix: pip install jieba")
+        require_version("nltk", "To fix: pip install nltk")
+        require_version("rouge_chinese", "To fix: pip install rouge-chinese")
+
+
 def _parse_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     parser = HfArgumentParser(_TRAIN_ARGS)
     return _parse_args(parser, args)
@@ -131,9 +151,6 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if training_args.do_train and training_args.predict_with_generate:
         raise ValueError("`predict_with_generate` cannot be set as True while training.")
 
-    if training_args.do_train and model_args.use_unsloth and not is_unsloth_available():
-        raise ValueError("Unsloth was not installed: https://github.com/unslothai/unsloth")
-
     if finetuning_args.use_dora and model_args.use_unsloth:
         raise ValueError("Unsloth does not support DoRA.")
 
@@ -158,6 +175,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
 
     _verify_model_args(model_args, finetuning_args)
+    _check_extra_dependencies(model_args, finetuning_args, training_args)
 
     if (
         training_args.do_train
@@ -277,6 +295,7 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
             raise ValueError("vLLM engine does not support RoPE scaling.")
 
     _verify_model_args(model_args, finetuning_args)
+    _check_extra_dependencies(model_args, finetuning_args)
 
     if model_args.export_dir is not None:
         model_args.device_map = {"": torch.device(model_args.export_device)}
@@ -298,6 +317,7 @@ def get_eval_args(args: Optional[Dict[str, Any]] = None) -> _EVAL_CLS:
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
 
     _verify_model_args(model_args, finetuning_args)
+    _check_extra_dependencies(model_args, finetuning_args)
 
     model_args.device_map = "auto"
 
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 2acbadb0..60bcb970 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -85,7 +85,9 @@ def load_model(
             logger.warning("Unsloth does not support loading adapters.")
 
     if model is None:
-        model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config, **init_kwargs)
+        init_kwargs["config"] = config
+        init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path
+        model: "PreTrainedModel" = AutoModelForCausalLM.from_pretrained(**init_kwargs)
 
     patch_model(model, tokenizer, model_args, is_trainable)
     register_autoclass(config, model, tokenizer)
diff --git a/src/llmtuner/train/sft/metric.py b/src/llmtuner/train/sft/metric.py
index 35f89f56..d1af4c17 100644
--- a/src/llmtuner/train/sft/metric.py
+++ b/src/llmtuner/train/sft/metric.py
@@ -2,7 +2,6 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union
 
 import numpy as np
-from transformers.utils.versions import require_version
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available
@@ -33,10 +32,6 @@ class ComputeMetrics:
         r"""
         Uses the model predictions to compute metrics.
         """
-        require_version("jieba", "To fix: pip install jieba")
-        require_version("nltk", "To fix: pip install nltk")
-        require_version("rouge_chinese", "To fix: pip install rouge-chinese")
-
         preds, labels = eval_preds
         score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []}
 
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index fef63c39..d921aec4 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -5,7 +5,6 @@ from transformers import Trainer
 from transformers.optimization import get_scheduler
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.trainer_pt_utils import get_parameter_names
-from transformers.utils.versions import require_version
 
 from ..extras.logging import get_logger
 from ..extras.packages import is_galore_available
@@ -168,8 +167,6 @@ def _create_galore_optimizer(
     training_args: "Seq2SeqTrainingArguments",
     finetuning_args: "FinetuningArguments",
 ) -> "torch.optim.Optimizer":
-    require_version("galore_torch", "To fix: pip install galore_torch")
-
     if len(finetuning_args.galore_target) == 1 and finetuning_args.galore_target[0] == "all":
         galore_targets = find_all_linear_modules(model)
     else:

From f5f15896623e3f563f1b2b2a74b049c1053b9488 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 15 Apr 2024 22:56:55 +0800
Subject: [PATCH 076/341] Update constants.py

Former-commit-id: 39199f712aa7b7a1c66080d9c84651fd2eb0b425
---
 src/llmtuner/extras/constants.py | 44 ++++++++++++++++----------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 5b81a9a9..321c36a4 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -242,6 +242,28 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "CommandR-35B-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-v01",
+        },
+        "CommandR-Plus-104B-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-plus",
+        },
+        "CommandR-35B-4bit-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01-4bit",
+            DownloadSource.MODELSCOPE: "mirror013/c4ai-command-r-v01-4bit",
+        },
+        "CommandR-Plus-104B-4bit-Chat": {
+            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus-4bit",
+        },
+    },
+    template="cohere",
+)
+
+
 register_model_group(
     models={
         "DeepSeek-LLM-7B-Base": {
@@ -994,25 +1016,3 @@ register_model_group(
     },
     template="atom",
 )
-
-
-register_model_group(
-    models={
-        "C4AI-Command-R-35B": {
-            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01",
-            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-v01",
-        }
-    },
-    template="c4ai",
-)
-
-
-register_model_group(
-    models={
-        "C4AI-Command-R-plus-104B": {
-            DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus",
-            DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-plus",
-        }
-    },
-    template="c4ai",
-)
\ No newline at end of file

From c6d6c4c2090f3afb06b61216dbd428b3958c814a Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 15 Apr 2024 22:58:01 +0800
Subject: [PATCH 077/341] Update template.py

Former-commit-id: 00b8be7dafa65e13b344724a8d3855919ee4f631
---
 src/llmtuner/data/template.py | 36 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index d85adcf3..7a1f4ab8 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -526,6 +526,21 @@ _register_template(
 )
 
 
+_register_template(
+    name="cohere",
+    format_user=StringFormatter(
+        slots=[
+            (
+                "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+            )
+        ]
+    ),
+    format_system=EmptyFormatter(slots=[{"bos_token"}]),
+    force_system=True,
+)
+
+
 _register_template(
     name="cpm",
     format_user=StringFormatter(slots=["<用户>{{content}}<AI>"]),
@@ -785,24 +800,3 @@ _register_template(
     format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
     format_separator=EmptyFormatter(slots=["\n"]),
 )
-
-
-_register_template(
-    name="c4ai",
-    format_user=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]),
-    format_assistant=StringFormatter(slots=["{{content}}", "<|END_OF_TURN_TOKEN|>"]),
-    format_system=StringFormatter(slots=[{"bos_token"},"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|>"]),
-    format_observation=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}", "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"]),
-    default_system=(
-        "# Safety Preamble\n",
-        "The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.\n\n",
-        "# System Preamble\n",
-        "## Basic Rules\n",
-        "You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.\n\n",
-        "# User Preamble\n",
-        "## Task and Context\n",
-        "You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.\n\n",
-        "## Style Guide\n",
-        "Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling."
-    )
-)
\ No newline at end of file

From d4d471450fc8cb875670c029691f3087d611052d Mon Sep 17 00:00:00 2001
From: Jonery <qijunluo@link.cuhk.edu.cn>
Date: Mon, 15 Apr 2024 23:15:27 +0800
Subject: [PATCH 078/341] Feature BAdam

Former-commit-id: d8d2807fbcf587c37f7fd34a23e9397d2775ceed
---
 examples/extras/badam/sft.sh            | 36 ++++++++++++++++
 requirements.txt                        |  1 +
 src/llmtuner/hparams/finetuning_args.py | 43 ++++++++++++++++++-
 src/llmtuner/hparams/parser.py          |  6 +++
 src/llmtuner/model/adapter.py           |  6 +--
 src/llmtuner/model/patcher.py           |  5 ++-
 src/llmtuner/model/utils.py             | 42 ++++++++++++++++++
 src/llmtuner/train/sft/trainer.py       |  6 ++-
 src/llmtuner/train/utils.py             | 57 +++++++++++++++++++++++++
 9 files changed, 195 insertions(+), 7 deletions(-)
 create mode 100644 examples/extras/badam/sft.sh

diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
new file mode 100644
index 00000000..daa63913
--- /dev/null
+++ b/examples/extras/badam/sft.sh
@@ -0,0 +1,36 @@
+# BAdam layer-wise
+export CUDA_VISIBLE_DEVICES=0
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+python ../../../src/train_bash.py \
+--stage sft \
+--do_train \
+--model_name_or_path meta-llama/Llama-2-7b-hf \
+--dataset alpaca_gpt4_en,glaive_toolcall \
+--dataset_dir ../../../data \
+--template default \
+--finetuning_type full \
+--output_dir ../../../saves/LLaMA2-7B/badam \
+--overwrite_cache \
+--overwrite_output_dir \
+--cutoff_len 1024 \
+--preprocessing_num_workers 32 \
+--per_device_train_batch_size 8 \
+--per_device_eval_batch_size 5 \
+--gradient_accumulation_steps 2 \
+--lr_scheduler_type cosine \
+--logging_steps 10 \
+--warmup_steps 20 \
+--save_steps 100 \
+--eval_steps 100 \
+--evaluation_strategy steps \
+--load_best_model_at_end \
+--learning_rate 5e-5 \
+--num_train_epochs 3.0 \
+--val_size 0.1 \
+--plot_loss \
+--use_badam \
+--switch_mode descending \
+--badam_verbose 2 \
+--switch_block_every 50 \
+--pure_bf16 \
+
diff --git a/requirements.txt b/requirements.txt
index 1fa5a142..9d58d75a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,4 @@ fastapi
 sse-starlette
 matplotlib
 fire
+badam
\ No newline at end of file
diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index 177a9f8a..d64f1583 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -163,6 +163,47 @@ class RLHFArguments:
         metadata={"help": "The type of the reward model in PPO training. Lora model only supports lora training."},
     )
 
+@dataclass
+class BAdamArgument:
+    r"""
+    Arguments for BAdam optimizer.
+    """
+    use_badam: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use BAdam optimizer."},
+    )
+    badam_mode: Literal["layer", "ratio"] = field(
+        default="layer",
+        metadata={"help": "The mode of BAdam optimizer. 'layer' for layer-wise, 'ratio' for ratio-wise."},
+    )
+    
+    # ======== Arguments for layer-wise update ========
+    start_block: Optional[int] = field(
+        default=None,
+        metadata={"help": "The starting block index for block-wise fine-tuning."}
+    )
+    switch_block_every: Optional[int] = field(
+        default=50,
+        metadata={"help": "how often to switch model's block update. Set to -1 to disable the block update."}
+    )
+    switch_mode: Optional[Literal["ascending", "descending", "random", "fixed"]] = field(
+        default="ascending",
+        metadata={"help": "the strategy of picking block to update."}
+    )
+    
+    # ======== Arguments for ratio-wise update ========
+    badam_update_ratio: float = field(
+        default=0.,
+        metadata={"help": "The ratio of the update for the BAdam optimizer."}
+    )
+    badam_mask_mode: Literal["adjacent", "scatter"] = field(
+        default="adjacent",
+        metadata={"help": "The mode of the mask for BAdam optimizer. `adjacent` means that the trainable parameters are adjacent to each other; `scatter` means that trainable parameters are randomly choosed from the weight."}
+    )
+    badam_verbose: int = field(
+        default=0,
+        metadata={"help": "The verbosity level of BAdam optimizer. 0 for no print, 1 for print the block prefix, 2 for print trainable parameters"}
+    )
 
 @dataclass
 class GaloreArguments:
@@ -204,7 +245,7 @@ class GaloreArguments:
 
 
 @dataclass
-class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreArguments):
+class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreArguments, BAdamArgument):
     r"""
     Arguments pertaining to which techniques we are going to fine-tuning with.
     """
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 8f3bd18a..032a1a4b 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -171,6 +171,12 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if finetuning_args.use_galore and training_args.deepspeed is not None:
         raise ValueError("GaLore is incompatible with DeepSpeed.")
 
+    if (finetuning_args.use_badam
+        and finetuning_args.badam_mode == "layer"
+        and training_args.parallel_mode.value == "distributed"
+    ):
+        raise ValueError("BAdam with layer-wise mode is not supported in distributed training by now, use ratio mode instead.")
+    
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
 
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 4bb4057d..71c9c2f4 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -37,7 +37,7 @@ def init_adapter(
 
     if finetuning_args.finetuning_type == "full" and is_trainable:
         logger.info("Fine-tuning method: Full")
-        if not finetuning_args.pure_bf16:
+        if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
             model = model.float()
 
     if finetuning_args.finetuning_type == "freeze" and is_trainable:
@@ -82,7 +82,7 @@ def init_adapter(
 
         for name, param in model.named_parameters():
             if any(trainable_layer in name for trainable_layer in trainable_layers):
-                if not finetuning_args.pure_bf16:
+                if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
                     param.data = param.data.to(torch.float32)
             else:
                 param.requires_grad_(False)
@@ -162,7 +162,7 @@ def init_adapter(
                 )
                 model = get_peft_model(model, lora_config)
 
-        if not finetuning_args.pure_bf16:
+        if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
             for param in filter(lambda p: p.requires_grad, model.parameters()):
                 param.data = param.data.to(torch.float32)
 
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index c48df995..563b1827 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -17,7 +17,7 @@ from ..extras.logging import get_logger
 from ..extras.misc import get_current_device, infer_optim_dtype
 from ..extras.packages import is_flash_attn2_available
 from ..extras.patches.llama_patch import apply_llama_patch
-from .utils import QuantizationMethod, add_z3_leaf_module
+from .utils import QuantizationMethod, add_z3_leaf_module, gradient_checkpointing_enable
 
 
 if TYPE_CHECKING:
@@ -266,8 +266,9 @@ def _prepare_model_for_training(
         else:
             # use_reentrant=False might increase VRAM usage (have not been empirically verified yet)
             # According to: https://github.com/huggingface/transformers/issues/28339
+            model.gradient_checkpointing_enable = MethodType(gradient_checkpointing_enable, model)
             model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True})
-            model.enable_input_require_grads()
+            # model.enable_input_require_grads()
             setattr(model.config, "use_cache", False)  # turn off when gradient checkpointing is enabled
             logger.info("Gradient checkpointing enabled.")
 
diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py
index 771e6112..e83a903e 100644
--- a/src/llmtuner/model/utils.py
+++ b/src/llmtuner/model/utils.py
@@ -135,3 +135,45 @@ def register_autoclass(config: "PretrainedConfig", model: "PreTrainedModel", tok
         model.__class__.register_for_auto_class()
     if "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}):
         tokenizer.__class__.register_for_auto_class()
+
+def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
+    """
+    Activates gradient checkpointing for the current model, modified for use with a block-wise optimizer.
+
+    Unlike the original method, the checkpoint function installed here first marks every floating-point
+    positional tensor argument as requiring grad whenever the called module owns a trainable parameter.
+    `func` must be a bound method (e.g. a module's `__call__`): the module is read from `func.__self__`.
+
+    Args:
+        gradient_checkpointing_kwargs (dict, *optional*):
+            Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function.
+    """
+    from functools import partial
+    from torch.utils.checkpoint import checkpoint
+
+    if not self.supports_gradient_checkpointing:
+        raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
+
+    if gradient_checkpointing_kwargs is None:
+        gradient_checkpointing_kwargs = {}
+
+    # bind the caller's kwargs (e.g. `use_reentrant`) so that they actually reach `checkpoint`
+    checkpoint_func = partial(checkpoint, **gradient_checkpointing_kwargs)
+
+    def gradient_checkpointing_func(func, *args, **kwargs):
+        module = func.__self__
+        if any(p.requires_grad for p in module.parameters()):
+            for arg in args:
+                if torch.is_tensor(arg) and torch.is_floating_point(arg):
+                    arg.requires_grad_(True)
+
+        return checkpoint_func(func, *args, **kwargs)
+
+    self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func)
+
+    if getattr(self, "_hf_peft_config_loaded", False):
+        # When using PEFT + gradient checkpointing + Trainer we need to make sure the input has requires_grad=True
+        # we do it also on PEFT: https://github.com/huggingface/peft/blob/85013987aa82aa1af3da1236b6902556ce3e483e/src/peft/peft_model.py#L334
+        # When training with PEFT, only LoRA layers will have requires grad set to True, but the output of frozen layers need to propagate
+        # the gradients to make sure the gradient flows.
+        self.enable_input_require_grads()
\ No newline at end of file
diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py
index 8d2f9fa0..d750f491 100644
--- a/src/llmtuner/train/sft/trainer.py
+++ b/src/llmtuner/train/sft/trainer.py
@@ -9,7 +9,8 @@ from transformers import Seq2SeqTrainer
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
 from ..utils import create_custom_optimzer, create_custom_scheduler
-
+from types import MethodType
+from packaging import version
 
 if TYPE_CHECKING:
     from transformers.trainer import PredictionOutput
@@ -28,6 +29,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
     def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
+        if finetuning_args.use_badam and version.parse(torch.__version__) >= version.parse("1.13"):
+            from badam import clip_grad_norm_for_sparse_tensor  # optional package: import only when BAdam is on
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index d921aec4..65233f72 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -287,12 +287,69 @@ def _create_loraplus_optimizer(
     logger.info("Using LoRA+ optimizer with loraplus lr ratio {:.2f}.".format(finetuning_args.loraplus_lr_ratio))
     return optimizer
 
+def _create_badam_optimizer(
+    model: "PreTrainedModel",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+) -> "torch.optim.Optimizer":
+    r"""
+    Creates the BAdam optimizer for `finetuning_args.badam_mode`: a `BlockOptimizer` around the base optimizer
+    in "layer" mode, a `BlockOptimizerRatio` in "ratio" mode (raises ValueError if the update ratio is <= 0).
+    """
+    from transformers.trainer_pt_utils import get_parameter_names
+
+    # no weight decay for layer norms and "bias" names; ratio mode also exempts "embed" names (TODO: generalize)
+    decay_parameters = {n for n in get_parameter_names(model, ALL_LAYERNORM_LAYERS) if "bias" not in n}
+    if finetuning_args.badam_mode == "ratio":
+        decay_parameters = {n for n in decay_parameters if "embed" not in n}
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if n in decay_parameters],
+            "weight_decay": training_args.weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
+            "weight_decay": 0.0,
+        },
+    ]
+
+    optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+    if finetuning_args.badam_mode == "layer":
+        from badam import BlockOptimizer
+        base_optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
+        optimizer = BlockOptimizer(base_optimizer=base_optimizer,
+                                named_parameters_list=list(model.named_parameters()),
+                                block_prefix_list=None,
+                                switch_block_every=finetuning_args.switch_block_every,
+                                start_block=finetuning_args.start_block,
+                                switch_mode=finetuning_args.switch_mode,
+                                verbose=finetuning_args.badam_verbose)
+        logger.info(f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.switch_mode}, "
+                    f"switch block every {finetuning_args.switch_block_every} steps, "
+                    f"default start block is {finetuning_args.start_block}")
+    elif finetuning_args.badam_mode == "ratio":
+        if not finetuning_args.badam_update_ratio > 0.0:  # explicit check: `assert` vanishes under `python -O`
+            raise ValueError("`badam_update_ratio` must be greater than 0 when `badam_mode` is `ratio`.")
+        from badam import BlockOptimizerRatio
+        optimizer = BlockOptimizerRatio(param_groups=optimizer_grouped_parameters,
+                                        named_parameters_list=list(model.named_parameters()),
+                                        update_ratio=finetuning_args.badam_update_ratio,
+                                        mask_mode=finetuning_args.badam_mask_mode,
+                                        verbose=finetuning_args.badam_verbose,
+                                        **optimizer_kwargs)
+        logger.info(f"Using BAdam optimizer with ratio update, update ratio is {finetuning_args.badam_update_ratio}, "
+                    f"mask mode is {finetuning_args.badam_mask_mode}")
+
+    return optimizer
 
 def create_custom_optimzer(
     model: "PreTrainedModel",
     training_args: "Seq2SeqTrainingArguments",
     finetuning_args: "FinetuningArguments",
 ) -> Optional["torch.optim.Optimizer"]:
+    if finetuning_args.use_badam:
+        return _create_badam_optimizer(model, training_args, finetuning_args)
+    
     if finetuning_args.use_galore:
         return _create_galore_optimizer(model, training_args, finetuning_args)
 

From b638c65519d57d019bf442e5a386ca561adcd2c7 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 15 Apr 2024 23:26:42 +0800
Subject: [PATCH 079/341] support cohere commandR #3184

Former-commit-id: e077c36872740f6b2ac255aee9da6c4c70f28977
---
 README.md                          |  9 +++++----
 README_zh.md                       |  9 +++++----
 src/llmtuner/extras/constants.py   | 15 ---------------
 src/llmtuner/hparams/model_args.py |  2 +-
 src/llmtuner/hparams/parser.py     |  3 +++
 src/llmtuner/model/loader.py       | 24 +++++++++++++++++-------
 src/llmtuner/model/patcher.py      |  4 +++-
 7 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 273a3a2f..cb8a691f 100644
--- a/README.md
+++ b/README.md
@@ -129,9 +129,10 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | Model                                                    | Model size                  | Default module    | Template  |
 | -------------------------------------------------------- | --------------------------- | ----------------- | --------- |
 | [Baichuan2](https://huggingface.co/baichuan-inc)         | 7B/13B                      | W_pack            | baichuan2 |
-| [BLOOM](https://huggingface.co/bigscience/bloom)         | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
-| [BLOOMZ](https://huggingface.co/bigscience/bloomz)       | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
-| [ChatGLM3](https://huggingface.co/THUDM/chatglm3-6b)     | 6B                          | query_key_value   | chatglm3  |
+| [BLOOM](https://huggingface.co/bigscience)               | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
+| [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
+| [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                          | query_key_value   | chatglm3  |
+| [CommandR](https://huggingface.co/CohereForAI)           | 35B/104B                    | q_proj,v_proj     | cohere    |
 | [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                  | q_proj,v_proj     | deepseek  |
 | [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                 | query_key_value   | falcon    |
 | [Gemma](https://huggingface.co/google)                   | 2B/7B                       | q_proj,v_proj     | gemma     |
@@ -427,7 +428,7 @@ If you have a project that should be incorporated, please contact via email or c
 
 This repository is licensed under the [Apache-2.0 License](LICENSE).
 
-Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [CommandR](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## Citation
 
diff --git a/README_zh.md b/README_zh.md
index 844a5f93..96ddf20d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -129,9 +129,10 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | 模型名                                                   | 模型大小                     | 默认模块           | Template  |
 | -------------------------------------------------------- | --------------------------- | ----------------- | --------- |
 | [Baichuan2](https://huggingface.co/baichuan-inc)         | 7B/13B                      | W_pack            | baichuan2 |
-| [BLOOM](https://huggingface.co/bigscience/bloom)         | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
-| [BLOOMZ](https://huggingface.co/bigscience/bloomz)       | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
-| [ChatGLM3](https://huggingface.co/THUDM/chatglm3-6b)     | 6B                          | query_key_value   | chatglm3  |
+| [BLOOM](https://huggingface.co/bigscience)               | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
+| [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
+| [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                          | query_key_value   | chatglm3  |
+| [CommandR](https://huggingface.co/CohereForAI)           | 35B/104B                    | q_proj,v_proj     | cohere    |
 | [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                  | q_proj,v_proj     | deepseek  |
 | [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                 | query_key_value   | falcon    |
 | [Gemma](https://huggingface.co/google)                   | 2B/7B                       | q_proj,v_proj     | gemma     |
@@ -427,7 +428,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 
 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。
 
-使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [CommandR](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## 引用
 
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 321c36a4..6ba88bbd 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -1001,18 +1001,3 @@ register_model_group(
     },
     template="zephyr",
 )
-
-
-register_model_group(
-    models={
-        "Atom-7B": {
-            DownloadSource.DEFAULT: "FlagAlpha/Atom-7B",
-            DownloadSource.MODELSCOPE: "FlagAlpha/Atom-7B",
-        },
-        "Atom-7B-Chat": {
-            DownloadSource.DEFAULT: "FlagAlpha/Atom-7B-Chat",
-            DownloadSource.MODELSCOPE: "FlagAlpha/Atom-7B-Chat",
-        },
-    },
-    template="atom",
-)
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index 514c8714..57213470 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -55,7 +55,7 @@ class ModelArguments:
     )
     quantization_device_map: Optional[Literal["auto"]] = field(
         default=None,
-        metadata={"help": "Device map used for loading the 4-bit quantized model, needs bitsandbytes>=0.43.0."},
+        metadata={"help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0."},
     )
     rope_scaling: Optional[Literal["linear", "dynamic"]] = field(
         default=None,
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 8f3bd18a..84712b3b 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -151,6 +151,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if training_args.do_train and training_args.predict_with_generate:
         raise ValueError("`predict_with_generate` cannot be set as True while training.")
 
+    if training_args.do_train and model_args.quantization_device_map == "auto":
+        raise ValueError("Cannot use device map for quantized models in training.")
+
     if finetuning_args.use_dora and model_args.use_unsloth:
         raise ValueError("Unsloth does not support DoRA.")
 
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 60bcb970..8a89be33 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -36,13 +36,23 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
     Note: including inplace operation of model_args.
     """
     init_kwargs = _get_init_kwargs(model_args)
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.model_name_or_path,
-        use_fast=model_args.use_fast_tokenizer,
-        split_special_tokens=model_args.split_special_tokens,
-        padding_side="right",
-        **init_kwargs,
-    )
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            use_fast=model_args.use_fast_tokenizer,
+            split_special_tokens=model_args.split_special_tokens,
+            padding_side="right",
+            **init_kwargs,
+        )
+    except ValueError:  # try the fast one
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            use_fast=True,
+            split_special_tokens=model_args.split_special_tokens,
+            padding_side="right",
+            **init_kwargs,
+        )
+
     patch_tokenizer(tokenizer)
     return tokenizer
 
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index c48df995..49c9f598 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -133,7 +133,9 @@ def _configure_quantization(
         if is_deepspeed_zero3_enabled():
             raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantized models.")
 
-        init_kwargs["device_map"] = {"": get_current_device()}
+        if model_args.quantization_device_map != "auto":
+            init_kwargs["device_map"] = {"": get_current_device()}
+
         quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None)
         quant_method = quantization_config.get("quant_method", "")
 

From b5c5283dd6c037832b502bfd2748a105f02ba80a Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 00:11:15 +0800
Subject: [PATCH 080/341] add codegemma

Former-commit-id: 9324176525c2eda22962b0ca1895009b6237e6e3
---
 README.md                        |  6 +++---
 README_zh.md                     |  6 +++---
 src/llmtuner/extras/constants.py | 17 +++++++++++++++++
 src/llmtuner/model/loader.py     |  1 -
 4 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index cb8a691f..ab6a37ea 100644
--- a/README.md
+++ b/README.md
@@ -132,10 +132,10 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [BLOOM](https://huggingface.co/bigscience)               | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
 | [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
 | [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                          | query_key_value   | chatglm3  |
-| [CommandR](https://huggingface.co/CohereForAI)           | 35B/104B                    | q_proj,v_proj     | cohere    |
+| [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                    | q_proj,v_proj     | cohere    |
 | [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                  | q_proj,v_proj     | deepseek  |
 | [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                 | query_key_value   | falcon    |
-| [Gemma](https://huggingface.co/google)                   | 2B/7B                       | q_proj,v_proj     | gemma     |
+| [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                       | q_proj,v_proj     | gemma     |
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
@@ -428,7 +428,7 @@ If you have a project that should be incorporated, please contact via email or c
 
 This repository is licensed under the [Apache-2.0 License](LICENSE).
 
-Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [CommandR](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## Citation
 
diff --git a/README_zh.md b/README_zh.md
index 96ddf20d..6c9d8b29 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -132,10 +132,10 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [BLOOM](https://huggingface.co/bigscience)               | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
 | [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
 | [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                          | query_key_value   | chatglm3  |
-| [CommandR](https://huggingface.co/CohereForAI)           | 35B/104B                    | q_proj,v_proj     | cohere    |
+| [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                    | q_proj,v_proj     | cohere    |
 | [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                  | q_proj,v_proj     | deepseek  |
 | [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                 | query_key_value   | falcon    |
-| [Gemma](https://huggingface.co/google)                   | 2B/7B                       | q_proj,v_proj     | gemma     |
+| [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                       | q_proj,v_proj     | gemma     |
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
@@ -428,7 +428,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 
 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。
 
-使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [CommandR](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## 引用
 
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 6ba88bbd..f1be79f7 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -385,6 +385,23 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "CodeGemma-2B": {
+            DownloadSource.DEFAULT: "google/codegemma-2b",
+        },
+        "CodeGemma-7B": {
+            DownloadSource.DEFAULT: "google/codegemma-7b",
+        },
+        "CodeGemma-7B-Chat": {
+            DownloadSource.DEFAULT: "google/codegemma-7b-it",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/codegemma-7b-it",
+        },
+    },
+    template="gemma",
+)
+
+
 register_model_group(
     models={
         "InternLM-7B": {
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 8a89be33..aeb22850 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -48,7 +48,6 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.model_name_or_path,
             use_fast=True,
-            split_special_tokens=model_args.split_special_tokens,
             padding_side="right",
             **init_kwargs,
         )

From efa808069a8abcd4875185f3ddc4605849d45e5d Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 00:25:03 +0800
Subject: [PATCH 081/341] support unsloth 2024.4

Former-commit-id: 14a83f8bc4fe44783252378fce59198194a96bb8
---
 src/llmtuner/model/adapter.py | 8 ++++++--
 src/llmtuner/model/loader.py  | 2 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 4bb4057d..cabb21df 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -145,18 +145,22 @@ def init_adapter(
                 "lora_alpha": finetuning_args.lora_alpha,
                 "lora_dropout": finetuning_args.lora_dropout,
                 "use_rslora": finetuning_args.use_rslora,
+                "modules_to_save": finetuning_args.additional_target,
             }
 
             if model_args.use_unsloth:
                 from unsloth import FastLanguageModel  # type: ignore
 
-                unsloth_peft_kwargs = {"model": model, "max_seq_length": model_args.model_max_length}
+                unsloth_peft_kwargs = {
+                    "model": model,
+                    "max_seq_length": model_args.model_max_length,
+                    "use_gradient_checkpointing": "unsloth",
+                }
                 model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
             else:
                 lora_config = LoraConfig(
                     task_type=TaskType.CAUSAL_LM,
                     inference_mode=False,
-                    modules_to_save=finetuning_args.additional_target,
                     use_dora=finetuning_args.use_dora,
                     **peft_kwargs,
                 )
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index aeb22850..7ab8222f 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -82,6 +82,8 @@ def load_model(
             "token": model_args.hf_hub_token,
             "device_map": {"": get_current_device()},
             "rope_scaling": getattr(config, "rope_scaling", None),
+            "fix_tokenizer": False,
+            "trust_remote_code": True,
         }
         try:
             model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)

From ebf0f4a77cbb7d6a8f6bee5568c0e5b84d936871 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 02:35:36 +0800
Subject: [PATCH 082/341] update readme

Former-commit-id: f9a246572c1ec0e4b36bff237c6523ce629b7000
---
 README.md    | 6 ++++--
 README_zh.md | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index ab6a37ea..c89598e2 100644
--- a/README.md
+++ b/README.md
@@ -68,14 +68,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
+
 [24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
 
 [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
 
-[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
-
 <details><summary>Full Changelog</summary>
 
+[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
+
 [24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See `examples/extras/loraplus` for usage.
 
 [24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. See `examples/extras/galore` for usage.
diff --git a/README_zh.md b/README_zh.md
index 6c9d8b29..fac92600 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -68,14 +68,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 56k 文本）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
+
 [24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
 
 [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看！
 
-[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
-
 <details><summary>展开日志</summary>
 
+[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
+
 [24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 `examples/extras/loraplus`。
 
 [24/03/07] 我们支持了梯度低秩投影（**[GaLore](https://arxiv.org/abs/2403.03507)**）算法。详细用法请参照 `examples/extras/galore`。

From b053c6454eb43608193a461110ab064225ea0722 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 02:36:54 +0800
Subject: [PATCH 083/341] update readme

Former-commit-id: 8f233745c3aa7a6ef57f275bec80ee731ff76de3
---
 README.md    | 2 +-
 README_zh.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c89598e2..2fc9ba88 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
-[24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
+[24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
 
 [24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
 
diff --git a/README_zh.md b/README_zh.md
index fac92600..6564ad4f 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -68,7 +68,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
-[24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 56k 文本）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
+[24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 Llama-2-7B-56k）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
 
 [24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
 

From ce4f653121445e1c5ea906346f6f8e6752788621 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 03:10:02 +0800
Subject: [PATCH 084/341] add empty template

Former-commit-id: a325ffa8a668bec354d2636683806acef105e196
---
 src/llmtuner/data/template.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 7a1f4ab8..1311eda5 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -581,6 +581,13 @@ _register_template(
 )
 
 
+_register_template(
+    name="empty",
+    format_user=StringFormatter(slots=["{{content}}"]),
+    format_assistant=StringFormatter(slots=["{{content}}"]),
+)
+
+
 _register_template(
     name="falcon",
     format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),

From 7b97a79efc16eba02d8c780c1df55360b03c8d4d Mon Sep 17 00:00:00 2001
From: codingma <codingma@163.com>
Date: Tue, 16 Apr 2024 10:43:14 +0800
Subject: [PATCH 085/341] support for previewing custom dataset in directory
 format

Former-commit-id: 501cff38c819f06f15194907ce7e052d5f28025a
---
 src/llmtuner/webui/components/data.py | 38 ++++++++++++++++++---------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/src/llmtuner/webui/components/data.py b/src/llmtuner/webui/components/data.py
index 46274417..c0f113ea 100644
--- a/src/llmtuner/webui/components/data.py
+++ b/src/llmtuner/webui/components/data.py
@@ -28,30 +28,44 @@ def can_preview(dataset_dir: str, dataset: list) -> "gr.Button":
             dataset_info = json.load(f)
     except Exception:
         return gr.Button(interactive=False)
-
+    
+    local_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])
     if (
-        len(dataset) > 0
-        and "file_name" in dataset_info[dataset[0]]
-        and os.path.isfile(os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"]))
+            len(dataset) > 0
+            and "file_name" in dataset_info[dataset[0]]
+            and (os.path.isfile(local_path)
+                 or (os.path.isdir(local_path)) and len(os.listdir(local_path)) != 0)
     ):
         return gr.Button(interactive=True)
     else:
         return gr.Button(interactive=False)
 
 
+def load_single_data(data_file_path):
+    with open(os.path.join(data_file_path), "r", encoding="utf-8") as f:
+        if data_file_path.endswith(".json"):
+            data = json.load(f)
+        elif data_file_path.endswith(".jsonl"):
+            data = [json.loads(line) for line in f]
+        else:
+            data = [line for line in f]  # noqa: C416
+        return data
+
+
 def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, "gr.Column"]:
     with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f:
         dataset_info = json.load(f)
 
     data_file: str = dataset_info[dataset[0]]["file_name"]
-    with open(os.path.join(dataset_dir, data_file), "r", encoding="utf-8") as f:
-        if data_file.endswith(".json"):
-            data = json.load(f)
-        elif data_file.endswith(".jsonl"):
-            data = [json.loads(line) for line in f]
-        else:
-            data = [line for line in f]  # noqa: C416
-    return len(data), data[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)], gr.Column(visible=True)
+    local_path = os.path.join(dataset_dir, data_file)
+    if os.path.isdir(local_path):
+        data = []
+        for file_name in os.listdir(local_path):
+            data.extend(load_single_data(os.path.join(local_path, file_name)))
+    else:
+        data = load_single_data(local_path)
+
+    return len(data), data[PAGE_SIZE * page_index: PAGE_SIZE * (page_index + 1)], gr.Column(visible=True)
 
 
 def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dict[str, "Component"]:

From 5f1418a68b3757ef798363267efb29c0ffb22601 Mon Sep 17 00:00:00 2001
From: codingma <codingma@163.com>
Date: Tue, 16 Apr 2024 10:56:39 +0800
Subject: [PATCH 086/341] add check

Former-commit-id: 008f6498977c243c80e87242f05c9cf9573541ac
---
 src/llmtuner/webui/components/data.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/llmtuner/webui/components/data.py b/src/llmtuner/webui/components/data.py
index c0f113ea..ab6b5de4 100644
--- a/src/llmtuner/webui/components/data.py
+++ b/src/llmtuner/webui/components/data.py
@@ -28,14 +28,13 @@ def can_preview(dataset_dir: str, dataset: list) -> "gr.Button":
             dataset_info = json.load(f)
     except Exception:
         return gr.Button(interactive=False)
-    
+
+    if len(dataset) == 0 or "file_name" not in dataset_info[dataset[0]]:
+        return gr.Button(interactive=False)
+
     local_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])
-    if (
-            len(dataset) > 0
-            and "file_name" in dataset_info[dataset[0]]
-            and (os.path.isfile(local_path)
-                 or (os.path.isdir(local_path)) and len(os.listdir(local_path)) != 0)
-    ):
+    if (os.path.isfile(local_path)
+            or (os.path.isdir(local_path) and len(os.listdir(local_path)) != 0)):
         return gr.Button(interactive=True)
     else:
         return gr.Button(interactive=False)

From 6dd6b3e39679cfeda2d2dcf0c4b7512b95acd64e Mon Sep 17 00:00:00 2001
From: Jonery <qijunluo@link.cuhk.edu.cn>
Date: Tue, 16 Apr 2024 12:05:27 +0800
Subject: [PATCH 087/341] resolve gradient checkpointing issue.

Former-commit-id: 6df9135d063bb6102f0cbcdf0d702076f5febbae
---
 examples/extras/badam/sft.sh      |  3 +--
 setup.py                          |  1 +
 src/llmtuner/model/utils.py       | 16 +++++-----------
 src/llmtuner/train/sft/trainer.py |  2 +-
 4 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
index daa63913..656cfdba 100644
--- a/examples/extras/badam/sft.sh
+++ b/examples/extras/badam/sft.sh
@@ -31,6 +31,5 @@ python ../../../src/train_bash.py \
 --use_badam \
 --switch_mode descending \
 --badam_verbose 2 \
---switch_block_every 50 \
---pure_bf16 \
+--switch_block_every 50
 
diff --git a/setup.py b/setup.py
index fd5bdf7e..b2eb4afd 100644
--- a/setup.py
+++ b/setup.py
@@ -24,6 +24,7 @@ extra_require = {
     "metrics": ["nltk", "jieba", "rouge-chinese"],
     "unsloth": ["torch==2.2.0", "unsloth[cu121-ampere-torch220]"],
     "galore": ["galore-torch"],
+    "badam": ["torch>=2.1.0"],
     "vllm": ["vllm>=0.3.3"],
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],
diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py
index e83a903e..fd587efd 100644
--- a/src/llmtuner/model/utils.py
+++ b/src/llmtuner/model/utils.py
@@ -150,30 +150,24 @@ def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
             Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function.
     """
     from torch.utils.checkpoint import checkpoint
+    import functools
     
     if not self.supports_gradient_checkpointing:
         raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
 
     if gradient_checkpointing_kwargs is None:
-        gradient_checkpointing_kwargs = {}
+        gradient_checkpointing_kwargs = {"use_reentrant": True}
 
-    # gradient_checkpointing_func = functools.partial(checkpoint, **gradient_checkpointing_kwargs)
+    checkpoint = functools.partial(checkpoint, **gradient_checkpointing_kwargs)
     
     def gradient_checkpointing_func(func, *args, **kwargs):
         module = func.__self__
         
-        if any([p.requires_grad for p in module.parameters()]):
+        if any(p.requires_grad for p in module.parameters()):
             for arg in args:
                 if torch.is_tensor(arg) and torch.is_floating_point(arg):
                     arg.requires_grad_(True)
         
         return checkpoint(func, *args, **kwargs)            
         
-    self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func)
-
-    if getattr(self, "_hf_peft_config_loaded", False):
-        # When using PEFT + gradient checkpointing + Trainer we need to make sure the input has requires_grad=True
-        # we do it also on PEFT: https://github.com/huggingface/peft/blob/85013987aa82aa1af3da1236b6902556ce3e483e/src/peft/peft_model.py#L334
-        # When training with PEFT, only LoRA layers will have requires grad set to True, but the output of frozen layers need to propagate
-        # the gradients to make sure the gradient flows.
-        self.enable_input_require_grads()
\ No newline at end of file
+    self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func)
\ No newline at end of file
diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py
index d750f491..de741426 100644
--- a/src/llmtuner/train/sft/trainer.py
+++ b/src/llmtuner/train/sft/trainer.py
@@ -29,7 +29,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
     def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
-        if version.parse(torch.__version__) >= version.parse("1.13"):
+        if finetuning_args.use_badam:
             from badam import clip_grad_norm_for_sparse_tensor
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 

From 86ab47e121fdaff3a2753a06d3151800331d5021 Mon Sep 17 00:00:00 2001
From: Jonery <qijunluo@link.cuhk.edu.cn>
Date: Tue, 16 Apr 2024 12:25:50 +0800
Subject: [PATCH 088/341] remove badam from core requirements

Former-commit-id: fa5898944a3867ac5108dd0d579ca0677c87d3d6
---
 requirements.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9d58d75a..4b5651b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,5 +14,4 @@ pydantic
 fastapi
 sse-starlette
 matplotlib
-fire
-badam
\ No newline at end of file
+fire
\ No newline at end of file

From 351493b183681a74d1db277361145af34253bddc Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:10:02 +0800
Subject: [PATCH 089/341] Update setup.py

Former-commit-id: 5df30ea166aff29d48ff83a22ac6ef1611ce3e35
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b2eb4afd..9ef881e2 100644
--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,7 @@ extra_require = {
     "metrics": ["nltk", "jieba", "rouge-chinese"],
     "unsloth": ["torch==2.2.0", "unsloth[cu121-ampere-torch220]"],
     "galore": ["galore-torch"],
-    "badam": ["torch>=2.1.0"],
+    "badam": ["badam"],
     "vllm": ["vllm>=0.3.3"],
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],

From d08e09642d92d211e75c79e1c844f77ffc5c2bc0 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:10:17 +0800
Subject: [PATCH 090/341] Update requirements.txt

Former-commit-id: 1e45537ca0bb4d49b4147df01122e365b3d617e4
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 4b5651b4..1fa5a142 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,4 @@ pydantic
 fastapi
 sse-starlette
 matplotlib
-fire
\ No newline at end of file
+fire

From de728d0371562c744a9d2a96b6cc2b2b87904d22 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:25:40 +0800
Subject: [PATCH 091/341] Update sft.sh

Former-commit-id: 2b4b1562e91bbb02e345e71b7721da9333c0791b
---
 examples/extras/badam/sft.sh | 68 ++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
index 656cfdba..c2319caa 100644
--- a/examples/extras/badam/sft.sh
+++ b/examples/extras/badam/sft.sh
@@ -1,35 +1,35 @@
-# BAdam layer-wise
-export CUDA_VISIBLE_DEVICES=0
-export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-python ../../../src/train_bash.py \
---stage sft \
---do_train \
---model_name_or_path meta-llama/Llama-2-7b-hf \
---dataset alpaca_gpt4_en,glaive_toolcall \
---dataset_dir ../../../data \
---template default \
---finetuning_type full \
---output_dir ../../../saves/LLaMA2-7B/badam \
---overwrite_cache \
---overwrite_output_dir \
---cutoff_len 1024 \
---preprocessing_num_workers 32 \
---per_device_train_batch_size 8 \
---per_device_eval_batch_size 5 \
---gradient_accumulation_steps 2 \
---lr_scheduler_type cosine \
---logging_steps 10 \
---warmup_steps 20 \
---save_steps 100 \
---eval_steps 100 \
---evaluation_strategy steps \
---load_best_model_at_end \
---learning_rate 5e-5 \
---num_train_epochs 3.0 \
---val_size 0.1 \
---plot_loss \
---use_badam \
---switch_mode descending \
---badam_verbose 2 \
---switch_block_every 50
+#!/bin/bash
 
+CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../../data \
+    --template default \
+    --finetuning_type full \
+    --use_badam \
+    --badam_switch_mode descending \
+    --badam_switch_block_every 50 \
+    --badam_verbose 2 \
+    --output_dir ../../../saves/LLaMA2-7B/badam/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --pure_bf16

From ff4f587dd92f5fa950887d349486231f6019a720 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:26:30 +0800
Subject: [PATCH 092/341] Update finetuning_args.py

Former-commit-id: 3a23d900aea74078f0bc8cf73fac860a4ce3df67
---
 src/llmtuner/hparams/finetuning_args.py | 90 +++++++++++++------------
 1 file changed, 48 insertions(+), 42 deletions(-)

diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index d64f1583..899c7284 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -163,47 +163,6 @@ class RLHFArguments:
         metadata={"help": "The type of the reward model in PPO training. Lora model only supports lora training."},
     )
 
-@dataclass
-class BAdamArgument:
-    r"""
-    Arguments for BAdam optimizer.
-    """
-    use_badam: bool = field(
-        default=False,
-        metadata={"help": "Whether or not to use BAdam optimizer."},
-    )
-    badam_mode: Literal["layer", "ratio"] = field(
-        default="layer",
-        metadata={"help": "The mode of BAdam optimizer. 'layer' for layer-wise, 'ratio' for ratio-wise."},
-    )
-    
-    # ======== Arguments for layer-wise update ========
-    start_block: Optional[int] = field(
-        default=None,
-        metadata={"help": "The starting block index for block-wise fine-tuning."}
-    )
-    switch_block_every: Optional[int] = field(
-        default=50,
-        metadata={"help": "how often to switch model's block update. Set to -1 to disable the block update."}
-    )
-    switch_mode: Optional[Literal["ascending", "descending", "random", "fixed"]] = field(
-        default="ascending",
-        metadata={"help": "the strategy of picking block to update."}
-    )
-    
-    # ======== Arguments for ratio-wise update ========
-    badam_update_ratio: float = field(
-        default=0.,
-        metadata={"help": "The ratio of the update for the BAdam optimizer."}
-    )
-    badam_mask_mode: Literal["adjacent", "scatter"] = field(
-        default="adjacent",
-        metadata={"help": "The mode of the mask for BAdam optimizer. `adjacent` means that the trainable parameters are adjacent to each other; `scatter` means that trainable parameters are randomly choosed from the weight."}
-    )
-    badam_verbose: int = field(
-        default=0,
-        metadata={"help": "The verbosity level of BAdam optimizer. 0 for no print, 1 for print the block prefix, 2 for print trainable parameters"}
-    )
 
 @dataclass
 class GaloreArguments:
@@ -213,7 +172,7 @@ class GaloreArguments:
 
     use_galore: bool = field(
         default=False,
-        metadata={"help": "Whether or not to use gradient low-Rank projection."},
+        metadata={"help": "Whether or not to use the gradient low-Rank projection (GaLore)."},
     )
     galore_target: str = field(
         default="all",
@@ -244,6 +203,53 @@ class GaloreArguments:
     )
 
 
+@dataclass
+class BAdamArgument:
+    r"""
+    Arguments pertaining to the BAdam optimizer.
+    """
+
+    use_badam: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use the BAdam optimizer."},
+    )
+    badam_mode: Literal["layer", "ratio"] = field(
+        default="layer",
+        metadata={"help": "Whether to use layer-wise or ratio-wise BAdam optimizer."},
+    )
+    badam_start_block: Optional[int] = field(
+        default=None,
+        metadata={"help": "The starting block index for layer-wise BAdam."},
+    )
+    badam_switch_block_every: Optional[int] = field(
+        default=50,
+        metadata={"help": "How often to switch model's block update. Set to -1 to disable the block update."},
+    )
+    badam_switch_mode: Optional[Literal["ascending", "descending", "random", "fixed"]] = field(
+        default="ascending",
+        metadata={"help": "the strategy of picking block to update for layer-wise BAdam."},
+    )
+    badam_update_ratio: float = field(
+        default=0.0,
+        metadata={"help": "The ratio of the update for ratio-wise BAdam."},
+    )
+    badam_mask_mode: Literal["adjacent", "scatter"] = field(
+        default="adjacent",
+        metadata={
+            "help": """The mode of the mask for BAdam optimizer. \
+                    `adjacent` means that the trainable parameters are adjacent to each other, \
+                    `scatter` means that trainable parameters are randomly choosed from the weight."""
+        },
+    )
+    badam_verbose: int = field(
+        default=0,
+        metadata={
+            "help": """The verbosity level of BAdam optimizer. \
+                    0 for no print, 1 for print the block prefix, 2 for print trainable parameters"""
+        },
+    )
+
+
 @dataclass
 class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreArguments, BAdamArgument):
     r"""

From 191971865dff7cb1170ee5363d667ebe3c489708 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:27:02 +0800
Subject: [PATCH 093/341] Update parser.py

Former-commit-id: 2f3da8169d18b026760cc0ac7dd6141bdd08c932
---
 src/llmtuner/hparams/parser.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 032a1a4b..baa65978 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -82,12 +82,18 @@ def _check_extra_dependencies(
     if model_args.use_unsloth:
         require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth")
 
+    if model_args.mixture_of_depths:
+        require_version("mixture-of-depth", "To fix: pip install mixture-of-depth")
+
     if model_args.infer_backend == "vllm":
         require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3")
 
     if finetuning_args.use_galore:
         require_version("galore_torch", "To fix: pip install galore_torch")
 
+    if finetuning_args.use_badam:
+        require_version("badam", "To fix: pip install badam")
+
     if training_args is not None and training_args.predict_with_generate:
         require_version("jieba", "To fix: pip install jieba")
         require_version("nltk", "To fix: pip install nltk")
@@ -151,6 +157,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if training_args.do_train and training_args.predict_with_generate:
         raise ValueError("`predict_with_generate` cannot be set as True while training.")
 
+    if training_args.do_train and model_args.quantization_device_map == "auto":
+        raise ValueError("Cannot use device map for quantized models in training.")
+
     if finetuning_args.use_dora and model_args.use_unsloth:
         raise ValueError("Unsloth does not support DoRA.")
 
@@ -169,14 +178,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
         raise ValueError("Distributed training does not support layer-wise GaLore.")
 
     if finetuning_args.use_galore and training_args.deepspeed is not None:
-        raise ValueError("GaLore is incompatible with DeepSpeed.")
+        raise ValueError("GaLore is incompatible with DeepSpeed yet.")
 
-    if (finetuning_args.use_badam
+    if (
+        finetuning_args.use_badam
         and finetuning_args.badam_mode == "layer"
         and training_args.parallel_mode.value == "distributed"
     ):
-        raise ValueError("BAdam with layer-wise mode is not supported in distributed training by now, use ratio mode instead.")
-    
+        raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.")
+
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
 

From 7ecea08b9b94a8427fd4d1b16e8caa32829f5668 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:27:25 +0800
Subject: [PATCH 094/341] Update parser.py

Former-commit-id: 898239883afc79f03abd0dc276eef901662a9591
---
 src/llmtuner/hparams/parser.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index baa65978..a9f8ffd7 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -82,9 +82,6 @@ def _check_extra_dependencies(
     if model_args.use_unsloth:
         require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth")
 
-    if model_args.mixture_of_depths:
-        require_version("mixture-of-depth", "To fix: pip install mixture-of-depth")
-
     if model_args.infer_backend == "vllm":
         require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3")
 

From 96213f04b0f28cc84937ff8c92985b9eabeac20e Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:28:12 +0800
Subject: [PATCH 095/341] Update adapter.py

Former-commit-id: 8f7b75b26f020d8ae85baab7b082475c3bfeb512
---
 src/llmtuner/model/adapter.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 71c9c2f4..b712bdcf 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -145,18 +145,22 @@ def init_adapter(
                 "lora_alpha": finetuning_args.lora_alpha,
                 "lora_dropout": finetuning_args.lora_dropout,
                 "use_rslora": finetuning_args.use_rslora,
+                "modules_to_save": finetuning_args.additional_target,
             }
 
             if model_args.use_unsloth:
                 from unsloth import FastLanguageModel  # type: ignore
 
-                unsloth_peft_kwargs = {"model": model, "max_seq_length": model_args.model_max_length}
+                unsloth_peft_kwargs = {
+                    "model": model,
+                    "max_seq_length": model_args.model_max_length,
+                    "use_gradient_checkpointing": "unsloth",
+                }
                 model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
             else:
                 lora_config = LoraConfig(
                     task_type=TaskType.CAUSAL_LM,
                     inference_mode=False,
-                    modules_to_save=finetuning_args.additional_target,
                     use_dora=finetuning_args.use_dora,
                     **peft_kwargs,
                 )

From cde9d1b9179cce7c3b0439ebbdc676b980c65c85 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:29:19 +0800
Subject: [PATCH 096/341] Update patcher.py

Former-commit-id: 494e6a1e05b38f5ff61d83327303614f53c92e64
---
 src/llmtuner/model/patcher.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 563b1827..fb2835e8 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -133,7 +133,9 @@ def _configure_quantization(
         if is_deepspeed_zero3_enabled():
             raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantized models.")
 
-        init_kwargs["device_map"] = {"": get_current_device()}
+        if model_args.quantization_device_map != "auto":
+            init_kwargs["device_map"] = {"": get_current_device()}
+
         quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None)
         quant_method = quantization_config.get("quant_method", "")
 
@@ -268,7 +270,6 @@ def _prepare_model_for_training(
             # According to: https://github.com/huggingface/transformers/issues/28339
             model.gradient_checkpointing_enable = MethodType(gradient_checkpointing_enable, model)
             model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True})
-            # model.enable_input_require_grads()
             setattr(model.config, "use_cache", False)  # turn off when gradient checkpointing is enabled
             logger.info("Gradient checkpointing enabled.")
 

From c7c216069c3fa5ddd82e6fcdedc242724dc797d7 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:29:30 +0800
Subject: [PATCH 097/341] Update utils.py

Former-commit-id: 7edf4dbed88b8034282f14fd6e0cb6f7f9e5f805
---
 src/llmtuner/model/utils.py | 70 +++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 37 deletions(-)

diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py
index fd587efd..7e4430d1 100644
--- a/src/llmtuner/model/utils.py
+++ b/src/llmtuner/model/utils.py
@@ -1,5 +1,6 @@
 from enum import Enum, unique
-from typing import TYPE_CHECKING, Dict, List
+from functools import partial
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import torch
 from transformers import PreTrainedModel
@@ -100,6 +101,37 @@ def find_expanded_modules(model: "PreTrainedModel", target_modules: List[str], n
     return module_names
 
 
+def gradient_checkpointing_enable(
+    self: "PreTrainedModel", gradient_checkpointing_kwargs: Optional[Dict[str, Any]] = None
+) -> None:
+    r"""
+    Activates gradient checkpointing for the current model.
+
+    Modification of the original method to enable gradient checkpointing for block-wise optimizer.
+    """
+    from torch.utils.checkpoint import checkpoint
+
+    if not self.supports_gradient_checkpointing:
+        raise ValueError("{} does not support gradient checkpointing.".format(self.__class__.__name__))
+
+    if gradient_checkpointing_kwargs is None:
+        gradient_checkpointing_kwargs = {"use_reentrant": True}
+
+    gradient_checkpointing_func = partial(checkpoint, **gradient_checkpointing_kwargs)
+
+    def custom_gradient_checkpointing_func(func, *args, **kwargs):
+        module: "torch.nn.Module" = func.__self__
+
+        if any(param.requires_grad for param in module.parameters()):
+            for arg in args:
+                if torch.is_tensor(arg) and torch.is_floating_point(arg):
+                    arg.requires_grad_(True)
+
+        return gradient_checkpointing_func(func, *args, **kwargs)
+
+    self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=custom_gradient_checkpointing_func)
+
+
 def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]:
     r"""
     Loads value head parameters from Hugging Face Hub or local disk.
@@ -135,39 +167,3 @@ def register_autoclass(config: "PretrainedConfig", model: "PreTrainedModel", tok
         model.__class__.register_for_auto_class()
     if "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}):
         tokenizer.__class__.register_for_auto_class()
-
-def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
-    """
-    Modification of the original method to enable gradient checkpointing for block-wise optimizer.
-    
-    Activates gradient checkpointing for the current model.
-
-    We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of
-    the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
-
-    Args:
-        gradient_checkpointing_kwargs (dict, *optional*):
-            Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function.
-    """
-    from torch.utils.checkpoint import checkpoint
-    import functools
-    
-    if not self.supports_gradient_checkpointing:
-        raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
-
-    if gradient_checkpointing_kwargs is None:
-        gradient_checkpointing_kwargs = {"use_reentrant": True}
-
-    checkpoint = functools.partial(checkpoint, **gradient_checkpointing_kwargs)
-    
-    def gradient_checkpointing_func(func, *args, **kwargs):
-        module = func.__self__
-        
-        if any(p.requires_grad for p in module.parameters()):
-            for arg in args:
-                if torch.is_tensor(arg) and torch.is_floating_point(arg):
-                    arg.requires_grad_(True)
-        
-        return checkpoint(func, *args, **kwargs)            
-        
-    self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func)
\ No newline at end of file

From 5978427ae0e86f53cfc88fc3af655adbc3b0f4f0 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:29:52 +0800
Subject: [PATCH 098/341] Update trainer.py

Former-commit-id: c6163be1444c00dd000f288e2f834968bd932981
---
 src/llmtuner/train/sft/trainer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py
index de741426..def427fd 100644
--- a/src/llmtuner/train/sft/trainer.py
+++ b/src/llmtuner/train/sft/trainer.py
@@ -1,5 +1,6 @@
 import json
 import os
+from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
@@ -9,8 +10,7 @@ from transformers import Seq2SeqTrainer
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
 from ..utils import create_custom_optimzer, create_custom_scheduler
-from types import MethodType
-from packaging import version
+
 
 if TYPE_CHECKING:
     from transformers.trainer import PredictionOutput
@@ -31,6 +31,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
         self.finetuning_args = finetuning_args
         if finetuning_args.use_badam:
             from badam import clip_grad_norm_for_sparse_tensor
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":

From 9d23f5dc890afb6ad7d618150fa2603a95c9fad4 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:30:12 +0800
Subject: [PATCH 099/341] Update utils.py

Former-commit-id: 01147536b2bb507e87e033fa696e9eb39fe96bbe
---
 src/llmtuner/train/utils.py | 111 +++++++++++++++++++++---------------
 1 file changed, 64 insertions(+), 47 deletions(-)

diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index 65233f72..2835eddf 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -162,6 +162,15 @@ def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]:
     return decay_parameters
 
 
+def _get_embedding_names(model: "PreTrainedModel") -> List[str]:
+    r"""
+    Returns a list of names of parameters in embedding.
+    """
+    result = {name for name, _ in model.get_input_embeddings().named_parameters()}
+    result.update(name for name, _ in model.get_output_embeddings().named_parameters())
+    return result
+
+
 def _create_galore_optimizer(
     model: "PreTrainedModel",
     training_args: "Seq2SeqTrainingArguments",
@@ -236,7 +245,7 @@ def _create_galore_optimizer(
         optimizer = DummyOptimizer(lr=training_args.learning_rate, optimizer_dict=optimizer_dict)
     else:
         param_groups = [
-            dict(params=nodecay_params),
+            dict(params=nodecay_params, weight_decay=0.0),
             dict(params=decay_params, weight_decay=training_args.weight_decay),
             dict(params=galore_params, weight_decay=training_args.weight_decay, **galore_kwargs),
         ]
@@ -280,82 +289,90 @@ def _create_loraplus_optimizer(
     param_groups = [
         dict(params=param_dict["lora_a"], **decay_args),
         dict(params=param_dict["lora_b"], lr=loraplus_lr, **decay_args),
-        dict(params=param_dict["lora_b_nodecay"], lr=loraplus_lr),
+        dict(params=param_dict["lora_b_nodecay"], lr=loraplus_lr, weight_decay=0.0),
         dict(params=param_dict["embedding"], lr=finetuning_args.loraplus_lr_embedding, **decay_args),
     ]
     optimizer = optim_class(param_groups, **optim_kwargs)
     logger.info("Using LoRA+ optimizer with loraplus lr ratio {:.2f}.".format(finetuning_args.loraplus_lr_ratio))
     return optimizer
 
+
 def _create_badam_optimizer(
     model: "PreTrainedModel",
     training_args: "Seq2SeqTrainingArguments",
     finetuning_args: "FinetuningArguments",
 ) -> "torch.optim.Optimizer":
-    
-    from transformers.trainer_pt_utils import get_parameter_names
-    decay_parameters = list(filter(lambda n: "bias" not in n, get_parameter_names(model, ALL_LAYERNORM_LAYERS)))
-    # filter out the embedding layers when using badam ratio mode
-    if finetuning_args.badam_mode == "ratio":
-        decay_parameters = list(filter(lambda n: "embed" not in n, decay_parameters)) # TODO: make it more general
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if n in decay_parameters],
-            "weight_decay": training_args.weight_decay,
-        },
-        {
-            "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
-            "weight_decay": 0.0,
-        },
+    decay_param_names = _get_decay_parameter_names(model)
+    if finetuning_args.badam_mode == "ratio":  # filter out the embedding layers for ratio-wise badam
+        decay_param_names = [name for name in decay_param_names if name not in _get_embedding_names(model)]
+
+    decay_params, nodecay_params = [], []
+    for name, param in model.named_parameters():
+        if param.requires_grad:
+            if name in decay_param_names:
+                decay_params.append(param)
+            else:
+                nodecay_params.append(param)
+
+    optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+    param_groups = [
+        dict(params=nodecay_params, weight_decay=0.0),
+        dict(params=decay_params, weight_decay=training_args.weight_decay),
     ]
 
-    optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
-
-    # create BlockOptimizer
     if finetuning_args.badam_mode == "layer":
         from badam import BlockOptimizer
-        base_optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
-        optimizer = BlockOptimizer(base_optimizer=base_optimizer,
-                                named_parameters_list=list(model.named_parameters()),
-                                block_prefix_list=None,
-                                switch_block_every=finetuning_args.switch_block_every,
-                                start_block=finetuning_args.start_block,
-                                switch_mode=finetuning_args.switch_mode,
-                                verbose=finetuning_args.badam_verbose)
-        
-        logger.info(f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.switch_mode}, "
-                    f"switch block every {finetuning_args.switch_block_every} steps, "
-                    f"default start block is {finetuning_args.start_block}")
-    
+
+        base_optimizer = optim_class(param_groups, **optim_kwargs)
+        optimizer = BlockOptimizer(
+            base_optimizer=base_optimizer,
+            named_parameters_list=list(model.named_parameters()),
+            block_prefix_list=None,
+            switch_block_every=finetuning_args.badam_switch_block_every,
+            start_block=finetuning_args.badam_start_block,
+            switch_mode=finetuning_args.badam_switch_mode,
+            verbose=finetuning_args.badam_verbose,
+        )
+        logger.info(
+            f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
+            f"switch block every {finetuning_args.badam_switch_block_every} steps, "
+            f"default start block is {finetuning_args.badam_start_block}"
+        )
+
     elif finetuning_args.badam_mode == "ratio":
-        assert finetuning_args.badam_update_ratio > 0.
         from badam import BlockOptimizerRatio
-        optimizer = BlockOptimizerRatio(param_groups=optimizer_grouped_parameters,
-                                        named_parameters_list=list(model.named_parameters()),
-                                        update_ratio=finetuning_args.badam_update_ratio,
-                                        mask_mode=finetuning_args.badam_mask_mode,
-                                        verbose=finetuning_args.badam_verbose,
-                                        **optimizer_kwargs)
-        
-        logger.info(f"Using BAdam optimizer with ratio update, update ratio is {finetuning_args.badam_update_ratio}, "
-                    f"mask mode is {finetuning_args.badam_mask_mode}")
-    
+
+        assert finetuning_args.badam_update_ratio > 1e-6
+        optimizer = BlockOptimizerRatio(
+            param_groups=param_groups,
+            named_parameters_list=list(model.named_parameters()),
+            update_ratio=finetuning_args.badam_update_ratio,
+            mask_mode=finetuning_args.badam_mask_mode,
+            verbose=finetuning_args.badam_verbose,
+            **optim_kwargs,
+        )
+        logger.info(
+            f"Using BAdam optimizer with ratio-wise update, update ratio is {finetuning_args.badam_update_ratio}, "
+            f"mask mode is {finetuning_args.badam_mask_mode}"
+        )
+
     return optimizer
 
+
 def create_custom_optimzer(
     model: "PreTrainedModel",
     training_args: "Seq2SeqTrainingArguments",
     finetuning_args: "FinetuningArguments",
 ) -> Optional["torch.optim.Optimizer"]:
-    if finetuning_args.use_badam:
-        return _create_badam_optimizer(model, training_args, finetuning_args)
-    
     if finetuning_args.use_galore:
         return _create_galore_optimizer(model, training_args, finetuning_args)
 
     if finetuning_args.loraplus_lr_ratio is not None:
         return _create_loraplus_optimizer(model, training_args, finetuning_args)
 
+    if finetuning_args.use_badam:
+        return _create_badam_optimizer(model, training_args, finetuning_args)
+
 
 def create_custom_scheduler(
     training_args: "Seq2SeqTrainingArguments",

From a4167fd925ba6e78e08e51f33872f6a91daff7d8 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 17:44:48 +0800
Subject: [PATCH 100/341] support badam for all stages

Former-commit-id: 7a1380646119bfe6855f73dd90570defcea05281
---
 README.md                          | 27 ++++++++++++++-------------
 README_zh.md                       | 27 ++++++++++++++-------------
 examples/README.md                 |  4 +++-
 examples/README_zh.md              |  4 +++-
 src/llmtuner/train/dpo/trainer.py  |  6 ++++++
 src/llmtuner/train/orpo/trainer.py |  5 +++++
 src/llmtuner/train/ppo/trainer.py  |  6 ++++++
 src/llmtuner/train/pt/trainer.py   |  5 +++++
 src/llmtuner/train/rm/trainer.py   |  5 +++++
 9 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 2fc9ba88..276bc6a7 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Choose your path:
 - **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
 - **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
-- **Advanced algorithms**: GaLore, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
+- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
 - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc.
 - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
@@ -68,14 +68,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See `examples/extras/badam` for usage.
+
 [24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
 
 [24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
 
-[24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
-
 <details><summary>Full Changelog</summary>
 
+[24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
+
 [24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
 
 [24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See `examples/extras/loraplus` for usage.
@@ -278,16 +280,15 @@ huggingface-cli login
 
 \* *estimated*
 
-| Method | Bits |   7B  |  13B  |  30B  |   70B  |   8x7B |
-| ------ | ---- | ----- | ----- | ----- | ------ | ------ |
-| Full   | AMP  | 120GB | 240GB | 600GB | 1200GB |  900GB |
-| Full   |  16  |  60GB | 120GB | 300GB |  600GB |  400GB |
-| GaLore |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
-| Freeze |  16  |  20GB |  40GB |  80GB |  200GB |  160GB |
-| LoRA   |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
-| QLoRA  |   8  |  10GB |  20GB |  40GB |   80GB |   60GB |
-| QLoRA  |   4  |   6GB |  12GB |  24GB |   48GB |   30GB |
-| QLoRA  |   2  |   4GB |   8GB |  16GB |   24GB |   18GB |
+| Method            | Bits |   7B  |  13B  |  30B  |   70B  |   8x7B |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB |  900GB |
+| Full              |  16  |  60GB | 120GB | 300GB |  600GB |  400GB |
+| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB |  160GB |
+| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
+| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |   60GB |
+| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |   30GB |
+| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |   18GB |
 
 ## Getting Started
 
diff --git a/README_zh.md b/README_zh.md
index 6564ad4f..4420d8bb 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -46,7 +46,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - **多种模型**：LLaMA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
 - **集成方法**：（增量）预训练、指令监督微调、奖励模型训练、PPO 训练、DPO 训练和 ORPO 训练。
 - **多种精度**：32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
-- **先进算法**：GaLore、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
+- **先进算法**：GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
 - **实用技巧**：FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。
 - **实验监控**：LlamaBoard、TensorBoard、Wandb、MLflow 等等。
 - **极速推理**：基于 vLLM 的 OpenAI 风格 API、浏览器界面和命令行接口。
@@ -68,14 +68,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 `examples/extras/badam`。
+
 [24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 Llama-2-7B-56k）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
 
 [24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
 
-[24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看！
-
 <details><summary>展开日志</summary>
 
+[24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看！
+
 [24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
 
 [24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 `examples/extras/loraplus`。
@@ -278,16 +280,15 @@ huggingface-cli login
 
 \* *估算值*
 
-| 训练方法 | 精度 |   7B  |  13B  |  30B  |   70B  |   8x7B |
-| ------- | ---- | ----- | ----- | ----- | ------ | ------ |
-| 全参数   | AMP  | 120GB | 240GB | 600GB | 1200GB |  900GB |
-| 全参数   |  16  |  60GB | 120GB | 300GB |  600GB |  400GB |
-| GaLore  |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
-| 部分参数 |  16  |  20GB |  40GB |  80GB |  200GB |  160GB |
-| LoRA    |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
-| QLoRA   |   8  |  10GB |  20GB |  40GB |   80GB |   60GB |
-| QLoRA   |   4  |   6GB |  12GB |  24GB |   48GB |   30GB |
-| QLoRA   |   2  |   4GB |   8GB |  16GB |   24GB |   18GB |
+| 训练方法           | 精度 |   7B  |  13B  |  30B  |   70B  |   8x7B |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB |  900GB |
+| Full              |  16  |  60GB | 120GB | 300GB |  600GB |  400GB |
+| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB |  160GB |
+| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
+| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |   60GB |
+| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |   30GB |
+| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |   18GB |
 
 ## 如何使用
 
diff --git a/examples/README.md b/examples/README.md
index 4e771c2e..c0c0088e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,7 +3,7 @@ We provide diverse examples about fine-tuning LLMs.
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: Do pre-training using LoRA
+│   ├── pretrain.sh: Do continuous pre-training using LoRA
 │   ├── sft.sh: Do supervised fine-tuning using LoRA
 │   ├── reward.sh: Do reward modeling using LoRA
 │   ├── ppo.sh: Do PPO training using LoRA
@@ -34,6 +34,8 @@ examples/
 └── extras/
     ├── galore/
     │   └── sft.sh: Fine-tune model with GaLore
+    ├── badam/
+    │   └── sft.sh: Fine-tune model with BAdam
     ├── loraplus/
     │   └── sft.sh: Fine-tune model using LoRA+
     ├── llama_pro/
diff --git a/examples/README_zh.md b/examples/README_zh.md
index badda0fe..3f31ffc7 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -3,7 +3,7 @@
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: 基于 LoRA 进行预训练
+│   ├── pretrain.sh: 基于 LoRA 进行增量预训练
 │   ├── sft.sh: 基于 LoRA 进行指令监督微调
 │   ├── reward.sh: 基于 LoRA 进行奖励模型训练
 │   ├── ppo.sh: 基于 LoRA 进行 PPO 训练
@@ -34,6 +34,8 @@ examples/
 └── extras/
     ├── galore/
     │   └── sft.sh: 使用 GaLore 训练模型
+    ├── badam/
+    │   └── sft.sh: 使用 BAdam 训练模型
     ├── loraplus/
     │   └── sft.sh: 使用 LoRA+ 训练模型
     ├── llama_pro/
diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index 0b316c62..35dcd8db 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from contextlib import nullcontext
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union
 
 import torch
@@ -63,6 +64,11 @@ class CustomDPOTrainer(DPOTrainer):
             else:
                 self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
 
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index d84e0199..5e0d70d9 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -1,4 +1,5 @@
 from collections import defaultdict
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union
 
 import torch
@@ -44,6 +45,10 @@ class CustomORPOTrainer(DPOTrainer):
         self._stored_metrics = defaultdict(lambda: defaultdict(list))
 
         Trainer.__init__(self, model=model, **kwargs)
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llmtuner/train/ppo/trainer.py b/src/llmtuner/train/ppo/trainer.py
index 020d54cf..ef769968 100644
--- a/src/llmtuner/train/ppo/trainer.py
+++ b/src/llmtuner/train/ppo/trainer.py
@@ -1,6 +1,7 @@
 import math
 import os
 import sys
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import torch
@@ -124,6 +125,11 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             else:
                 self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True)
 
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+
     def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None:
         r"""
         Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer.
diff --git a/src/llmtuner/train/pt/trainer.py b/src/llmtuner/train/pt/trainer.py
index af2848fb..969ebf04 100644
--- a/src/llmtuner/train/pt/trainer.py
+++ b/src/llmtuner/train/pt/trainer.py
@@ -1,3 +1,4 @@
+from types import MethodType
 from typing import TYPE_CHECKING, Optional
 
 from transformers import Trainer
@@ -23,6 +24,10 @@ class CustomTrainer(Trainer):
     def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llmtuner/train/rm/trainer.py b/src/llmtuner/train/rm/trainer.py
index 8d0f2763..0f5d88d3 100644
--- a/src/llmtuner/train/rm/trainer.py
+++ b/src/llmtuner/train/rm/trainer.py
@@ -1,5 +1,6 @@
 import json
 import os
+from types import MethodType
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
@@ -28,6 +29,10 @@ class PairwiseTrainer(Trainer):
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
         self.can_return_loss = True  # override property to return eval_loss
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:

From 0a1578e4e3cc5d6e4c6665378f2ff77427410af0 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 18:09:16 +0800
Subject: [PATCH 101/341] update readme and gradio version

Former-commit-id: 4029b60ddcbd15b5354503c51178f0f5e7e9aedf
---
 README.md                   |  9 ++++++++-
 README_zh.md                | 11 +++++++++--
 requirements.txt            |  2 +-
 src/llmtuner/extras/misc.py |  2 +-
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 276bc6a7..95e3e8a0 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-28-green)](#projects-using-llama-factory)
+[![Citation](https://img.shields.io/badge/citation-34-green)](#projects-using-llama-factory)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -332,6 +332,7 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 
 ```bash
 export CUDA_VISIBLE_DEVICES=0 # `set CUDA_VISIBLE_DEVICES=0` for Windows
+export GRADIO_SERVER_PORT=7860 # `set GRADIO_SERVER_PORT=7860` for Windows
 python src/train_web.py # or python -m llmtuner.webui.interface
 ```
 
@@ -417,8 +418,14 @@ If you have a project that should be incorporated, please contact via email or c
 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
+1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541)
 1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246)
 1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
+1. Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443)
+1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604)
+1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827)
+1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167)
+1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B.
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge.
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.
diff --git a/README_zh.md b/README_zh.md
index 4420d8bb..d8b0c518 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-28-green)](#使用了-llama-factory-的项目)
+[![Citation](https://img.shields.io/badge/citation-34-green)](#使用了-llama-factory-的项目)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -332,6 +332,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 ```bash
 export CUDA_VISIBLE_DEVICES=0 # Windows 使用 `set CUDA_VISIBLE_DEVICES=0`
+export GRADIO_SERVER_PORT=7860 # Windows 使用 `set GRADIO_SERVER_PORT=7860`
 python src/train_web.py # 或 python -m llmtuner.webui.interface
 ```
 
@@ -392,7 +393,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 
 ## 使用了 LLaMA Factory 的项目
 
-如果您有项目希望添加至上述列表，请通过邮件联系或者创建一个 PR。
+如果您有项目希望添加至下述列表，请通过邮件联系或者创建一个 PR。
 
 <details><summary>点击显示</summary>
 
@@ -417,8 +418,14 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
+1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541)
 1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246)
 1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
+1. Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443)
+1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604)
+1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827)
+1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167)
+1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper，基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM，基于 Baichuan-13B 微调而得，具有法律推理和知识检索能力。
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao，基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。
diff --git a/requirements.txt b/requirements.txt
index 1fa5a142..3928d28d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ datasets>=2.14.3
 accelerate>=0.27.2
 peft>=0.10.0
 trl>=0.8.1
-gradio>=4.0.0,<=4.21.0
+gradio>=4.0.0
 scipy
 einops
 sentencepiece
diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index 12d1446f..1a1f81a0 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -66,7 +66,7 @@ def check_dependencies() -> None:
         require_version("accelerate>=0.27.2", "To fix: pip install accelerate>=0.27.2")
         require_version("peft>=0.10.0", "To fix: pip install peft>=0.10.0")
         require_version("trl>=0.8.1", "To fix: pip install trl>=0.8.1")
-        require_version("gradio>=4.0.0,<=4.21.0", "To fix: pip install gradio==4.21.0")
+        require_version("gradio>=4.0.0", "To fix: pip install gradio>=4.0.0")
 
 
 def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:

From d301f0a64b53776e1e8a79852e895759aee98575 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 18:09:31 +0800
Subject: [PATCH 102/341] Update parser.py

Former-commit-id: 92c2133896c20054db86dd53508c982e39bd5ca0
---
 src/llmtuner/hparams/parser.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index a9f8ffd7..03ab0c50 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -174,9 +174,6 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     ):
         raise ValueError("Distributed training does not support layer-wise GaLore.")
 
-    if finetuning_args.use_galore and training_args.deepspeed is not None:
-        raise ValueError("GaLore is incompatible with DeepSpeed yet.")
-
     if (
         finetuning_args.use_badam
         and finetuning_args.badam_mode == "layer"
@@ -184,6 +181,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     ):
         raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.")
 
+    if (finetuning_args.use_galore or finetuning_args.use_badam) and training_args.deepspeed is not None:
+        raise ValueError("GaLore and BAdam are incompatible with DeepSpeed yet.")
+
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
 

From 171a029c5e5d836026dbbb2c3e59994af0edf54e Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 16 Apr 2024 18:21:09 +0800
Subject: [PATCH 103/341] lint

Former-commit-id: 917d65ce65024d17a5030bc57083a427cfae16d7
---
 src/llmtuner/webui/components/data.py | 37 ++++++++++++---------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/llmtuner/webui/components/data.py b/src/llmtuner/webui/components/data.py
index ab6b5de4..8e2e04bf 100644
--- a/src/llmtuner/webui/components/data.py
+++ b/src/llmtuner/webui/components/data.py
@@ -1,6 +1,6 @@
 import json
 import os
-from typing import TYPE_CHECKING, Dict, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Tuple
 
 import gradio as gr
 
@@ -32,39 +32,36 @@ def can_preview(dataset_dir: str, dataset: list) -> "gr.Button":
     if len(dataset) == 0 or "file_name" not in dataset_info[dataset[0]]:
         return gr.Button(interactive=False)
 
-    local_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])
-    if (os.path.isfile(local_path)
-            or (os.path.isdir(local_path) and len(os.listdir(local_path)) != 0)):
+    data_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])
+    if os.path.isfile(data_path) or (os.path.isdir(data_path) and os.listdir(data_path)):
         return gr.Button(interactive=True)
     else:
         return gr.Button(interactive=False)
 
 
-def load_single_data(data_file_path):
-    with open(os.path.join(data_file_path), "r", encoding="utf-8") as f:
-        if data_file_path.endswith(".json"):
-            data = json.load(f)
-        elif data_file_path.endswith(".jsonl"):
-            data = [json.loads(line) for line in f]
+def _load_data_file(file_path: str) -> List[Any]:
+    with open(file_path, "r", encoding="utf-8") as f:
+        if file_path.endswith(".json"):
+            return json.load(f)
+        elif file_path.endswith(".jsonl"):
+            return [json.loads(line) for line in f]
         else:
-            data = [line for line in f]  # noqa: C416
-        return data
+            return list(f)
 
 
 def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, "gr.Column"]:
     with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f:
         dataset_info = json.load(f)
 
-    data_file: str = dataset_info[dataset[0]]["file_name"]
-    local_path = os.path.join(dataset_dir, data_file)
-    if os.path.isdir(local_path):
-        data = []
-        for file_name in os.listdir(local_path):
-            data.extend(load_single_data(os.path.join(local_path, file_name)))
+    data_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])
+    if os.path.isfile(data_path):
+        data = _load_data_file(data_path)
     else:
-        data = load_single_data(local_path)
+        data = []
+        for file_name in os.listdir(data_path):
+            data.extend(_load_data_file(os.path.join(data_path, file_name)))
 
-    return len(data), data[PAGE_SIZE * page_index: PAGE_SIZE * (page_index + 1)], gr.Column(visible=True)
+    return len(data), data[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)], gr.Column(visible=True)
 
 
 def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dict[str, "Component"]:

From 9f6349a333a196cb10ebb27269515f4dc6ddab27 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 17 Apr 2024 22:17:19 +0800
Subject: [PATCH 104/341] fix #3317

Former-commit-id: 7dce1763be4374cf616d96db95ae964ff510a9d6
---
 src/llmtuner/chat/base_engine.py         |  5 +----
 src/llmtuner/extras/packages.py          |  4 ++++
 src/llmtuner/webui/chatter.py            |  9 ++++++---
 src/llmtuner/webui/common.py             |  6 +++++-
 src/llmtuner/webui/components/chatbot.py |  7 +++++--
 src/llmtuner/webui/components/data.py    |  7 +++++--
 src/llmtuner/webui/components/eval.py    |  7 +++++--
 src/llmtuner/webui/components/export.py  |  7 +++++--
 src/llmtuner/webui/components/infer.py   |  7 +++++--
 src/llmtuner/webui/components/top.py     |  7 +++++--
 src/llmtuner/webui/components/train.py   |  6 +++++-
 src/llmtuner/webui/engine.py             |  7 +++++--
 src/llmtuner/webui/interface.py          |  7 +++++--
 src/llmtuner/webui/runner.py             |  8 ++++++--
 src/llmtuner/webui/utils.py              | 13 ++++++++-----
 15 files changed, 75 insertions(+), 32 deletions(-)

diff --git a/src/llmtuner/chat/base_engine.py b/src/llmtuner/chat/base_engine.py
index c5db41da..e19db676 100644
--- a/src/llmtuner/chat/base_engine.py
+++ b/src/llmtuner/chat/base_engine.py
@@ -5,14 +5,11 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Literal, Opti
 
 if TYPE_CHECKING:
     from transformers import PreTrainedModel, PreTrainedTokenizer
+    from vllm import AsyncLLMEngine
 
     from ..data import Template
-    from ..extras.packages import is_vllm_available
     from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
 
-    if is_vllm_available():
-        from vllm import AsyncLLMEngine
-
 
 @dataclass
 class Response:
diff --git a/src/llmtuner/extras/packages.py b/src/llmtuner/extras/packages.py
index b134ddab..8494cb2c 100644
--- a/src/llmtuner/extras/packages.py
+++ b/src/llmtuner/extras/packages.py
@@ -25,6 +25,10 @@ def is_galore_available():
     return _is_package_available("galore_torch")
 
 
+def is_gradio_available():
+    return _is_package_available("gradio")
+
+
 def is_jieba_available():
     return _is_package_available("jieba")
 
diff --git a/src/llmtuner/webui/chatter.py b/src/llmtuner/webui/chatter.py
index 8c744153..479846ca 100644
--- a/src/llmtuner/webui/chatter.py
+++ b/src/llmtuner/webui/chatter.py
@@ -2,12 +2,10 @@ import json
 import os
 from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Sequence, Tuple
 
-import gradio as gr
-from gradio.components import Component  # cannot use TYPE_CHECKING here
-
 from ..chat import ChatModel
 from ..data import Role
 from ..extras.misc import torch_gc
+from ..extras.packages import is_gradio_available
 from .common import get_save_dir
 from .locales import ALERTS
 
@@ -17,6 +15,11 @@ if TYPE_CHECKING:
     from .manager import Manager
 
 
+if is_gradio_available():
+    import gradio as gr
+    from gradio.components import Component  # cannot use TYPE_CHECKING here
+
+
 class WebChatModel(ChatModel):
     def __init__(self, manager: "Manager", demo_mode: bool = False, lazy_init: bool = True) -> None:
         self.manager = manager
diff --git a/src/llmtuner/webui/common.py b/src/llmtuner/webui/common.py
index 96ef2737..659c35c3 100644
--- a/src/llmtuner/webui/common.py
+++ b/src/llmtuner/webui/common.py
@@ -3,7 +3,6 @@ import os
 from collections import defaultdict
 from typing import Any, Dict, Optional
 
-import gradio as gr
 from peft.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME
 
 from ..extras.constants import (
@@ -17,6 +16,11 @@ from ..extras.constants import (
     DownloadSource,
 )
 from ..extras.misc import use_modelscope
+from ..extras.packages import is_gradio_available
+
+
+if is_gradio_available():
+    import gradio as gr
 
 
 ADAPTER_NAMES = {WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME}
diff --git a/src/llmtuner/webui/components/chatbot.py b/src/llmtuner/webui/components/chatbot.py
index 8efd333c..82bc4f29 100644
--- a/src/llmtuner/webui/components/chatbot.py
+++ b/src/llmtuner/webui/components/chatbot.py
@@ -1,11 +1,14 @@
 from typing import TYPE_CHECKING, Dict, Tuple
 
-import gradio as gr
-
 from ...data import Role
+from ...extras.packages import is_gradio_available
 from ..utils import check_json_schema
 
 
+if is_gradio_available():
+    import gradio as gr
+
+
 if TYPE_CHECKING:
     from gradio.components import Component
 
diff --git a/src/llmtuner/webui/components/data.py b/src/llmtuner/webui/components/data.py
index 8e2e04bf..232b973d 100644
--- a/src/llmtuner/webui/components/data.py
+++ b/src/llmtuner/webui/components/data.py
@@ -2,9 +2,12 @@ import json
 import os
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple
 
-import gradio as gr
-
 from ...extras.constants import DATA_CONFIG
+from ...extras.packages import is_gradio_available
+
+
+if is_gradio_available():
+    import gradio as gr
 
 
 if TYPE_CHECKING:
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index d41ef857..0b3bfc8c 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -1,11 +1,14 @@
 from typing import TYPE_CHECKING, Dict
 
-import gradio as gr
-
+from ...extras.packages import is_gradio_available
 from ..common import DEFAULT_DATA_DIR, list_dataset
 from .data import create_preview_box
 
 
+if is_gradio_available():
+    import gradio as gr
+
+
 if TYPE_CHECKING:
     from gradio.components import Component
 
diff --git a/src/llmtuner/webui/components/export.py b/src/llmtuner/webui/components/export.py
index b394d75c..d9c2d8e4 100644
--- a/src/llmtuner/webui/components/export.py
+++ b/src/llmtuner/webui/components/export.py
@@ -1,12 +1,15 @@
 from typing import TYPE_CHECKING, Dict, Generator, List
 
-import gradio as gr
-
+from ...extras.packages import is_gradio_available
 from ...train import export_model
 from ..common import get_save_dir
 from ..locales import ALERTS
 
 
+if is_gradio_available():
+    import gradio as gr
+
+
 if TYPE_CHECKING:
     from gradio.components import Component
 
diff --git a/src/llmtuner/webui/components/infer.py b/src/llmtuner/webui/components/infer.py
index 1e56d432..d565347e 100644
--- a/src/llmtuner/webui/components/infer.py
+++ b/src/llmtuner/webui/components/infer.py
@@ -1,10 +1,13 @@
 from typing import TYPE_CHECKING, Dict
 
-import gradio as gr
-
+from ...extras.packages import is_gradio_available
 from .chatbot import create_chat_box
 
 
+if is_gradio_available():
+    import gradio as gr
+
+
 if TYPE_CHECKING:
     from gradio.components import Component
 
diff --git a/src/llmtuner/webui/components/top.py b/src/llmtuner/webui/components/top.py
index 6c5030cd..6cbf6e0d 100644
--- a/src/llmtuner/webui/components/top.py
+++ b/src/llmtuner/webui/components/top.py
@@ -1,13 +1,16 @@
 from typing import TYPE_CHECKING, Dict
 
-import gradio as gr
-
 from ...data import templates
 from ...extras.constants import METHODS, SUPPORTED_MODELS
+from ...extras.packages import is_gradio_available
 from ..common import get_model_path, get_template, list_adapters, save_config
 from ..utils import can_quantize
 
 
+if is_gradio_available():
+    import gradio as gr
+
+
 if TYPE_CHECKING:
     from gradio.components import Component
 
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 10954c1b..eaa266d9 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -1,13 +1,17 @@
 from typing import TYPE_CHECKING, Dict
 
-import gradio as gr
 from transformers.trainer_utils import SchedulerType
 
 from ...extras.constants import TRAINING_STAGES
+from ...extras.packages import is_gradio_available
 from ..common import DEFAULT_DATA_DIR, autoset_packing, list_adapters, list_dataset
 from ..components.data import create_preview_box
 
 
+if is_gradio_available():
+    import gradio as gr
+
+
 if TYPE_CHECKING:
     from gradio.components import Component
 
diff --git a/src/llmtuner/webui/engine.py b/src/llmtuner/webui/engine.py
index 0ee7f047..65945533 100644
--- a/src/llmtuner/webui/engine.py
+++ b/src/llmtuner/webui/engine.py
@@ -1,7 +1,6 @@
 from typing import Any, Dict, Generator
 
-from gradio.components import Component  # cannot use TYPE_CHECKING here
-
+from ..extras.packages import is_gradio_available
 from .chatter import WebChatModel
 from .common import get_model_path, list_dataset, load_config
 from .locales import LOCALES
@@ -10,6 +9,10 @@ from .runner import Runner
 from .utils import get_time
 
 
+if is_gradio_available():
+    from gradio.components import Component  # cannot use TYPE_CHECKING here
+
+
 class Engine:
     def __init__(self, demo_mode: bool = False, pure_chat: bool = False) -> None:
         self.demo_mode = demo_mode
diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index f89d3ca5..0359d082 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -1,5 +1,4 @@
-import gradio as gr
-
+from ..extras.packages import is_gradio_available
 from .common import save_config
 from .components import (
     create_chat_box,
@@ -13,6 +12,10 @@ from .css import CSS
 from .engine import Engine
 
 
+if is_gradio_available():
+    import gradio as gr
+
+
 def create_ui(demo_mode: bool = False) -> gr.Blocks:
     engine = Engine(demo_mode=demo_mode, pure_chat=False)
 
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index ef5379cd..12307234 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -4,9 +4,7 @@ import time
 from threading import Thread
 from typing import TYPE_CHECKING, Any, Dict, Generator
 
-import gradio as gr
 import transformers
-from gradio.components import Component  # cannot use TYPE_CHECKING here
 from transformers.trainer import TRAINING_ARGS_NAME
 from transformers.utils import is_torch_cuda_available
 
@@ -14,12 +12,18 @@ from ..extras.callbacks import LogCallback
 from ..extras.constants import TRAINING_STAGES
 from ..extras.logging import LoggerHandler
 from ..extras.misc import get_device_count, torch_gc
+from ..extras.packages import is_gradio_available
 from ..train import run_exp
 from .common import get_module, get_save_dir, load_args, load_config, save_args
 from .locales import ALERTS
 from .utils import gen_cmd, gen_plot, get_eval_results, update_process_bar
 
 
+if is_gradio_available():
+    import gradio as gr
+    from gradio.components import Component  # cannot use TYPE_CHECKING here
+
+
 if TYPE_CHECKING:
     from .manager import Manager
 
diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py
index d96b1f6b..74f74e6a 100644
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -3,21 +3,24 @@ import os
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
-import gradio as gr
-
-from ..extras.packages import is_matplotlib_available
+from ..extras.packages import is_gradio_available, is_matplotlib_available
 from ..extras.ploting import smooth
 from .locales import ALERTS
 
 
-if TYPE_CHECKING:
-    from ..extras.callbacks import LogCallback
+if is_gradio_available():
+    import gradio as gr
+
 
 if is_matplotlib_available():
     import matplotlib.figure
     import matplotlib.pyplot as plt
 
 
+if TYPE_CHECKING:
+    from ..extras.callbacks import LogCallback
+
+
 def update_process_bar(callback: "LogCallback") -> "gr.Slider":
     if not callback.max_steps:
         return gr.Slider(visible=False)

From bee796f6b50d86315f24ed993365f25bdf50f251 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 17 Apr 2024 22:54:34 +0800
Subject: [PATCH 105/341] fix #3316

Former-commit-id: 7395e9e90a209228ff563ab54319955608850fc3
---
 src/llmtuner/model/utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py
index 7e4430d1..17b09a60 100644
--- a/src/llmtuner/model/utils.py
+++ b/src/llmtuner/model/utils.py
@@ -1,3 +1,4 @@
+import inspect
 from enum import Enum, unique
 from functools import partial
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
@@ -129,7 +130,11 @@ def gradient_checkpointing_enable(
 
         return gradient_checkpointing_func(func, *args, **kwargs)
 
-    self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=custom_gradient_checkpointing_func)
+    if "value" in inspect.signature(self._set_gradient_checkpointing).parameters:  # old GC format
+        self.apply(partial(self._set_gradient_checkpointing, value=True))
+        logger.warning("You are using the old GC format, some features (e.g. BAdam) will be invalid.")
+    else:
+        self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=custom_gradient_checkpointing_func)
 
 
 def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]:

From 0c8d6369ac46aad61c09bd479597c81de45c0afa Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 17 Apr 2024 23:27:22 +0800
Subject: [PATCH 106/341] add CodeQwen models

Former-commit-id: 9f6094241391f8f717818c8ba94e11d1791b4a5c
---
 src/llmtuner/extras/constants.py       | 12 ++++++++++++
 src/llmtuner/webui/components/eval.py  |  2 +-
 src/llmtuner/webui/components/train.py |  2 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index f1be79f7..e45d6ac6 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -727,6 +727,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B",
         },
+        "Qwen1.5-Code-7B": {
+            DownloadSource.DEFAULT: "Qwen/CodeQwen1.5-7B",
+            DownloadSource.MODELSCOPE: "qwen/CodeQwen1.5-7B",
+        },
         "Qwen1.5-0.5B-Chat": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat",
@@ -759,6 +763,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat",
         },
+        "Qwen1.5-Code-7B-Chat": {
+            DownloadSource.DEFAULT: "Qwen/CodeQwen1.5-7B-Chat",
+            DownloadSource.MODELSCOPE: "qwen/CodeQwen1.5-7B-Chat",
+        },
         "Qwen1.5-0.5B-int8-Chat": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8",
@@ -815,6 +823,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
         },
+        "Qwen1.5-Code-7B-int4-Chat": {
+            DownloadSource.DEFAULT: "Qwen/CodeQwen1.5-7B-Chat-AWQ",
+            DownloadSource.MODELSCOPE: "qwen/CodeQwen1.5-7B-Chat-AWQ",
+        },
     },
     template="qwen",
 )
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index 0b3bfc8c..3910a746 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -21,7 +21,7 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Row():
         dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=2)
-        dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4)
+        dataset = gr.Dropdown(multiselect=True, scale=4)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
     input_elems.update({dataset_dir, dataset})
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index eaa266d9..0f425bc9 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -27,7 +27,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=1
         )
         dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=1)
-        dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4)
+        dataset = gr.Dropdown(multiselect=True, scale=4)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
     input_elems.update({training_stage, dataset_dir, dataset})

From ce9bdb3509deebae1aab231dc6f5b40db78739a4 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 17 Apr 2024 23:35:59 +0800
Subject: [PATCH 107/341] add mixtral 8x22B models

Former-commit-id: eccbeecff0909e1fa124b5439ffbbfbc5607e1d6
---
 README.md                        |  4 ++--
 README_zh.md                     |  4 ++--
 src/llmtuner/data/template.py    |  2 +-
 src/llmtuner/extras/constants.py | 10 ++++++++--
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 95e3e8a0..aad905d0 100644
--- a/README.md
+++ b/README.md
@@ -143,11 +143,11 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
-| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B                     | q_proj,v_proj     | mistral   |
+| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
-| [Qwen1.5 (MoE)](https://huggingface.co/Qwen)             | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
+| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                  | q_proj,v_proj     | xverse    |
 | [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                   | q_proj,v_proj     | yi        |
diff --git a/README_zh.md b/README_zh.md
index d8b0c518..9ffc85e9 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -143,11 +143,11 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
-| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B                     | q_proj,v_proj     | mistral   |
+| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
-| [Qwen1.5 (MoE)](https://huggingface.co/Qwen)             | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
+| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                  | q_proj,v_proj     | xverse    |
 | [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                   | q_proj,v_proj     | yi        |
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 1311eda5..286280e6 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -659,7 +659,7 @@ _register_template(
 
 _register_template(
     name="mistral",
-    format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]),
+    format_user=StringFormatter(slots=[" [INST] {{content}} [/INST]"]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
     force_system=True,
 )
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index e45d6ac6..cacb6c5f 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -538,14 +538,20 @@ register_model_group(
 
 register_model_group(
     models={
-        "Mixtral-8x7B": {
+        "Mixtral-8x7B-v0.1": {
             DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-v0.1",
             DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-v0.1",
         },
-        "Mixtral-8x7B-Chat": {
+        "Mixtral-8x7B-v0.1-Chat": {
             DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-Instruct-v0.1",
             DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-Instruct-v0.1",
         },
+        "Mixtral-8x22B-v0.1": {
+            DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-v0.1",
+        },
+        "Mixtral-8x22B-v0.1-Chat": {
+            DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-Instruct-v0.1",
+        },
     },
     template="mistral",
 )

From 1ebac62e3dae519d8ba16702727217e0836fd9f9 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 17 Apr 2024 23:40:49 +0800
Subject: [PATCH 108/341] update readme

Former-commit-id: a49112a74339ba77bfec53f7870e821fe148db2c
---
 README.md    | 18 +++++++++---------
 README_zh.md | 18 +++++++++---------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index aad905d0..0422c00c 100644
--- a/README.md
+++ b/README.md
@@ -280,15 +280,15 @@ huggingface-cli login
 
 \* *estimated*
 
-| Method            | Bits |   7B  |  13B  |  30B  |   70B  |   8x7B |
-| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ |
-| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB |  900GB |
-| Full              |  16  |  60GB | 120GB | 300GB |  600GB |  400GB |
-| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB |  160GB |
-| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
-| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |   60GB |
-| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |   30GB |
-| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |   18GB |
+| Method            | Bits |   7B  |  13B  |  30B  |   70B  |  8x7B |  8x22B |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ----- | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 900GB | 2400GB |
+| Full              |  16  |  60GB | 120GB | 300GB |  600GB | 400GB | 1200GB |
+| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB | 160GB |  400GB |
+| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB | 120GB |  320GB |
+| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |  60GB |  160GB |
+| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |  30GB |   96GB |
+| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |  18GB |   48GB |
 
 ## Getting Started
 
diff --git a/README_zh.md b/README_zh.md
index 9ffc85e9..e25744fd 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -280,15 +280,15 @@ huggingface-cli login
 
 \* *估算值*
 
-| 训练方法           | 精度 |   7B  |  13B  |  30B  |   70B  |   8x7B |
-| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ |
-| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB |  900GB |
-| Full              |  16  |  60GB | 120GB | 300GB |  600GB |  400GB |
-| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB |  160GB |
-| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB |  120GB |
-| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |   60GB |
-| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |   30GB |
-| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |   18GB |
+| 方法              | 精度 |   7B  |  13B  |  30B  |   70B  |  8x7B |  8x22B |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ----- | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 900GB | 2400GB |
+| Full              |  16  |  60GB | 120GB | 300GB |  600GB | 400GB | 1200GB |
+| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB | 160GB |  400GB |
+| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB | 120GB |  320GB |
+| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |  60GB |  160GB |
+| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |  30GB |   96GB |
+| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |  18GB |   48GB |
 
 ## 如何使用
 

From 619264c8546b4e0058c2da9617d9a3a30ee914d6 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 18 Apr 2024 00:22:17 +0800
Subject: [PATCH 109/341] tiny fix

Former-commit-id: 86399ca8c06273c42c2b184664ae25d3405b3bf6
---
 README.md                   | 4 ++--
 README_zh.md                | 2 +-
 src/llmtuner/extras/misc.py | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 0422c00c..66fdbbc0 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Choose your path:
 - **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
 - **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
-- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
+- **Advanced algorithms**: GaLore, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
 - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc.
 - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
@@ -309,7 +309,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-Extra dependencies available: deepspeed, metrics, unsloth, galore, vllm, bitsandbytes, gptq, awq, aqlm, qwen, modelscope, quality
+Extra dependencies available: deepspeed, metrics, unsloth, galore, badam, vllm, bitsandbytes, gptq, awq, aqlm, qwen, modelscope, quality
 
 <details><summary>For Windows users</summary>
 
diff --git a/README_zh.md b/README_zh.md
index e25744fd..0013c074 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -309,7 +309,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-可选的额外依赖项：deepspeed、metrics、unsloth、galore、vllm、bitsandbytes、gptq、awq、aqlm、qwen、modelscope、quality
+可选的额外依赖项：deepspeed、metrics、unsloth、galore、badam、vllm、bitsandbytes、gptq、awq、aqlm、qwen、modelscope、quality
 
 <details><summary>Windows 用户指南</summary>
 
diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index 1a1f81a0..ecb6797c 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -66,7 +66,6 @@ def check_dependencies() -> None:
         require_version("accelerate>=0.27.2", "To fix: pip install accelerate>=0.27.2")
         require_version("peft>=0.10.0", "To fix: pip install peft>=0.10.0")
         require_version("trl>=0.8.1", "To fix: pip install trl>=0.8.1")
-        require_version("gradio>=4.0.0", "To fix: pip install gradio>=4.0.0")
 
 
 def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:

From 9e1bd6420de8ca2f38ff35e2534fb23c254503e1 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 18 Apr 2024 15:34:45 +0800
Subject: [PATCH 110/341] fix #3324

Former-commit-id: 5e710c4ac331f3400534d33b2646c4108c898d98
---
 README.md                   | 2 +-
 src/llmtuner/model/utils.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 66fdbbc0..476f6fe6 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Choose your path:
 - **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
 - **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
-- **Advanced algorithms**: GaLore, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
+- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
 - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc.
 - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py
index 17b09a60..51dbca8e 100644
--- a/src/llmtuner/model/utils.py
+++ b/src/llmtuner/model/utils.py
@@ -132,8 +132,9 @@ def gradient_checkpointing_enable(
 
     if "value" in inspect.signature(self._set_gradient_checkpointing).parameters:  # old GC format
         self.apply(partial(self._set_gradient_checkpointing, value=True))
+        self.enable_input_require_grads()
         logger.warning("You are using the old GC format, some features (e.g. BAdam) will be invalid.")
-    else:
+    else:  # have already enabled input require gradients
         self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=custom_gradient_checkpointing_func)
 
 
From 8397808d1d24cfe5b92a7f1f1cc32cd333d62f25 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 19 Apr 2024 01:13:50 +0800
Subject: [PATCH 111/341] support llama3

Former-commit-id: c1eabb751a5fd73b710714451b146732e0ed4558
---
 README.md                        |  9 ++++++---
 README_zh.md                     |  9 ++++++---
 src/llmtuner/data/template.py    | 14 ++++++++++++++
 src/llmtuner/extras/constants.py | 19 +++++++++++++++++++
 4 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 476f6fe6..365f8b34 100644
--- a/README.md
+++ b/README.md
@@ -68,14 +68,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/04/19] We supported **Meta Llama 3** model series.
+
 [24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See `examples/extras/badam` for usage.
 
 [24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
 
-[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
-
 <details><summary>Full Changelog</summary>
 
+[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
+
 [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
 
 [24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
@@ -143,6 +145,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
+| [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                      | q_proj,v_proj     | llama3    |
 | [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
@@ -438,7 +441,7 @@ If you have a project that should be incorporated, please contact via email or c
 
 This repository is licensed under the [Apache-2.0 License](LICENSE).
 
-Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## Citation
 
diff --git a/README_zh.md b/README_zh.md
index 0013c074..ac9e1426 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -68,14 +68,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/04/19] 我们支持了 **Meta Llama 3** 系列模型。
+
 [24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 `examples/extras/badam`。
 
 [24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 Llama-2-7B-56k）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
 
-[24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
-
 <details><summary>展开日志</summary>
 
+[24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
+
 [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看！
 
 [24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
@@ -143,6 +145,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
+| [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                      | q_proj,v_proj     | llama3    |
 | [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
@@ -438,7 +441,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 
 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。
 
-使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## 引用
 
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 286280e6..00bdbf10 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -649,6 +649,20 @@ _register_template(
 )
 
 
+_register_template(
+    name="llama3",
+    format_user=StringFormatter(
+        slots=[
+            "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+        ]
+    ),
+    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
+    format_separator=EmptyFormatter(slots=["<|eot_id|>"]),
+    efficient_eos=True,
+    force_system=True,
+)
+
+
 _register_template(
     name="llama2_zh",
     format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index cacb6c5f..07ccbc0d 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -513,6 +513,25 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "LLaMA3-8B": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B",
+        },
+        "LLaMA3-70B": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B",
+        },
+        "LLaMA3-8B-Chat": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B-Instruct",
+        },
+        "LLaMA3-70B-Chat": {
+            DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B-Instruct",
+        },
+    },
+    template="llama3",
+)
+
+
 register_model_group(
     models={
         "Mistral-7B-v0.1": {

From 44cda2eececc911070e989cefc9a4810d08803d7 Mon Sep 17 00:00:00 2001
From: Marco <121761685+mlinmg@users.noreply.github.com>
Date: Thu, 18 Apr 2024 20:31:24 +0200
Subject: [PATCH 112/341] Added Mixture of Depths

Former-commit-id: 75dd98b9abc847e22cb263c17ebcd2ca5dd98345
---
 README.md                          |  8 +++++---
 README_zh.md                       |  8 +++++---
 examples/README.md                 |  3 +++
 examples/README_zh.md              |  3 +++
 examples/extras/MoD/freeze_sft.sh  | 33 ++++++++++++++++++++++++++++++
 examples/extras/MoD/sft.sh         | 32 +++++++++++++++++++++++++++++
 src/llmtuner/hparams/model_args.py |  4 ++++
 src/llmtuner/hparams/parser.py     |  3 +++
 src/llmtuner/model/adapter.py      |  2 ++
 src/llmtuner/model/loader.py       | 13 ++++++++++++
 10 files changed, 103 insertions(+), 6 deletions(-)
 create mode 100644 examples/extras/MoD/freeze_sft.sh
 create mode 100644 examples/extras/MoD/sft.sh

diff --git a/README.md b/README.md
index 365f8b34..3bf284b2 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Choose your path:
 - **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
 - **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
-- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
+- **Advanced algorithms**: GaLore, Mixture of Depths, BAdam, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
 - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc.
 - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
@@ -68,14 +68,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/04/19] We integrated **[Mixture of Depths](https://github.com/astramind-ai/Mixture-of-depths)**. See `examples/extras/MoD` for usage.
+
 [24/04/19] We supported **Meta Llama 3** model series.
 
 [24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See `examples/extras/badam` for usage.
 
-[24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
-
 <details><summary>Full Changelog</summary>
 
+[24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
+
 [24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
 
 [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
diff --git a/README_zh.md b/README_zh.md
index ac9e1426..7565664e 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -46,7 +46,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - **多种模型**：LLaMA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
 - **集成方法**：（增量）预训练、指令监督微调、奖励模型训练、PPO 训练、DPO 训练和 ORPO 训练。
 - **多种精度**：32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
-- **先进算法**：GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
+- **先进算法**：GaLore、Mixture of Depths、BAdam、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
 - **实用技巧**：FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。
 - **实验监控**：LlamaBoard、TensorBoard、Wandb、MLflow 等等。
 - **极速推理**：基于 vLLM 的 OpenAI 风格 API、浏览器界面和命令行接口。
@@ -68,14 +68,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/04/19] 我们整合了 **[深度混合](https://github.com/astramind-ai/Mixture-of-depths)**。用法请参见 `examples/extras/MoD`。
+
 [24/04/19] 我们支持了 **Meta Llama 3** 系列模型。
 
 [24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 `examples/extras/badam`。
 
-[24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 Llama-2-7B-56k）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
-
 <details><summary>展开日志</summary>
 
+[24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 Llama-2-7B-56k）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
+
 [24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
 
 [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看！
diff --git a/examples/README.md b/examples/README.md
index c0c0088e..dd526ba8 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -41,6 +41,9 @@ examples/
     ├── llama_pro/
     │   ├── expand.sh: Expand layers in the model
     │   └── sft.sh: Fine-tune the expanded model
+    ├── MoD/
+    │   ├── freeze_sft.sh: Freeze finetune a model, updating only the MoD router
+    │   └── sft.sh: Fine-tune the MoD model
     └── fsdp_qlora/
         └── sft.sh: Fine-tune quantized model with FSDP+QLoRA
 ```
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 3f31ffc7..cdef207b 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -41,6 +41,9 @@ examples/
     ├── llama_pro/
     │   ├── expand.sh: 扩展模型中的层
     │   └── sft.sh: 训练扩展后的模型
+    ├── MoD/
+    │   ├── freeze_sft.sh: 冻结微调模型，仅更新 MoD 路由器
+    │   └── sft.sh: 微调 MoD 模型
     └── fsdp_qlora/
         └── sft.sh: 使用 FSDP+QLoRA 微调量化模型
 ```
diff --git a/examples/extras/MoD/freeze_sft.sh b/examples/extras/MoD/freeze_sft.sh
new file mode 100644
index 00000000..867fad47
--- /dev/null
+++ b/examples/extras/MoD/freeze_sft.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../../data \
+    --template default \
+    --finetuning_type freeze \
+    --name_module_trainable router \
+    --output_dir ../../../saves/TinyLlama/TinyLlama-1.1B-Chat-v1.0/sft \
+    --mixture_of_depths convert \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --pure_bf16
diff --git a/examples/extras/MoD/sft.sh b/examples/extras/MoD/sft.sh
new file mode 100644
index 00000000..b0257f9f
--- /dev/null
+++ b/examples/extras/MoD/sft.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../../data \
+    --template default \
+    --finetuning_type full \
+    --output_dir ../../../saves/TinyLlama/TinyLlama-1.1B-Chat-v1.0/sft \
+    --mixture_of_depths convert \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --pure_bf16
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index 57213470..bc80d304 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -69,6 +69,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."},
     )
+    mixture_of_depths: Optional[Literal["convert", "continue"]] = field(
+        default=None,
+        metadata={"help": "Whether or not to use MoD in the model."},
+    )
     use_unsloth: bool = field(
         default=False,
         metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."},
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 03ab0c50..ea4d169f 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -82,6 +82,9 @@ def _check_extra_dependencies(
     if model_args.use_unsloth:
         require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth")
 
+    if model_args.mixture_of_depths:
+        require_version("mixture-of-depth", "To fix: pip install mixture-of-depth")
+
     if model_args.infer_backend == "vllm":
         require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3")
 
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index b712bdcf..2aafd663 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -69,6 +69,8 @@ def init_adapter(
         for name, _ in model.named_modules():
             if ".0." in name:
                 freeze_modules.add(name.split(".0.")[-1].split(".")[0])
+            elif ".1." in name:  # here since MoD starts from layer 1
+                freeze_modules.add(name.split(".1.")[-1].split(".")[0])
 
         trainable_layers = []
         for module_name in finetuning_args.name_module_trainable:
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 7ab8222f..5626e79f 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -71,6 +71,12 @@ def load_model(
     patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
 
     model = None
+    if model_args.mixture_of_depths == 'continue':
+        from MoD import AutoMoDModelForCausalLM
+        model = AutoMoDModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config)
+        if model.config.model_type == 'qwen2':
+            RuntimeError("Qwen models are not supported for MoD training.")
+
     if is_trainable and model_args.use_unsloth:
         from unsloth import FastLanguageModel  # type: ignore
 
@@ -100,6 +106,13 @@ def load_model(
         init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path
         model: "PreTrainedModel" = AutoModelForCausalLM.from_pretrained(**init_kwargs)
 
+    if model_args.mixture_of_depths == 'convert':
+        from MoD import convert_hf_model
+        if model.config.model_type == 'qwen2':
+            RuntimeError("Qwen models are not supported for MoD training.")
+        model = convert_hf_model(model)
+
+
     patch_model(model, tokenizer, model_args, is_trainable)
     register_autoclass(config, model, tokenizer)
 

From abd9fed445a96810f4c69f2b2e4416dda97cb07c Mon Sep 17 00:00:00 2001
From: Marco <121761685+mlinmg@users.noreply.github.com>
Date: Thu, 18 Apr 2024 20:33:29 +0200
Subject: [PATCH 113/341] fix small typo

Former-commit-id: 5638a03cd0cf8119ff366b3b3e303b5a2351b065
---
 src/llmtuner/hparams/parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index ea4d169f..246d97cf 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -82,7 +82,7 @@ def _check_extra_dependencies(
     if model_args.use_unsloth:
         require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth")
 
-    if model_args.mixture_of_depths:
+    if model_args.mixture_of_depths == 'convert' or model_args.mixture_of_depths == 'continue':
         require_version("mixture-of-depth", "To fix: pip install mixture-of-depth")
 
     if model_args.infer_backend == "vllm":

From 9225c15c885b64b44725ee54f50c31ca29756ea6 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 19 Apr 2024 15:46:51 +0800
Subject: [PATCH 114/341] fix llama3 template

Former-commit-id: 20e95250168fbe081c779b2e1ff23f5df3ce02f7
---
 src/llmtuner/data/template.py    | 20 ++++++++++----------
 src/llmtuner/extras/constants.py |  4 ++++
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 00bdbf10..67b447f0 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -649,6 +649,14 @@ _register_template(
 )
 
 
+_register_template(
+    name="llama2_zh",
+    format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
+    format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
+    default_system="You are a helpful assistant. 你是一个乐于助人的助手。",
+)
+
+
 _register_template(
     name="llama3",
     format_user=StringFormatter(
@@ -657,20 +665,12 @@ _register_template(
         ]
     ),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
-    format_separator=EmptyFormatter(slots=["<|eot_id|>"]),
-    efficient_eos=True,
+    stop_words=["<|eot_id|>"],
+    replace_eos=True,
     force_system=True,
 )
 
 
-_register_template(
-    name="llama2_zh",
-    format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
-    format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
-    default_system="You are a helpful assistant. 你是一个乐于助人的助手。",
-)
-
-
 _register_template(
     name="mistral",
     format_user=StringFormatter(slots=[" [INST] {{content}} [/INST]"]),
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 07ccbc0d..78352a01 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -517,15 +517,19 @@ register_model_group(
     models={
         "LLaMA3-8B": {
             DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-8B",
         },
         "LLaMA3-70B": {
             DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B",
         },
         "LLaMA3-8B-Chat": {
             DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-8B-Instruct",
         },
         "LLaMA3-70B-Chat": {
             DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B-Instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B-Instruct",
         },
     },
     template="llama3",

From b3b5b530d10ddb97bdf5fee843f1e67bd3723fef Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 19 Apr 2024 22:40:01 +0800
Subject: [PATCH 115/341] fix #3352

Former-commit-id: f315f8e8ec916b82bac94a159e55839ff155c6b5
---
 src/llmtuner/data/template.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 67b447f0..c74becc4 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -661,13 +661,18 @@ _register_template(
     name="llama3",
     format_user=StringFormatter(
         slots=[
-            "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+            (
+                "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
         ]
     ),
-    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
+    format_system=StringFormatter(
+        slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]
+    ),
+    default_system="You are a helpful assistant.",
     stop_words=["<|eot_id|>"],
     replace_eos=True,
-    force_system=True,
 )
 
 
From 0cb596fee1d99f14117d27f9730254d647116cb2 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 20 Apr 2024 01:31:38 +0800
Subject: [PATCH 116/341] add dpo mix dataset

Former-commit-id: 6def3f8bfa51b2d9d73af112352ce07db972e4c9
---
 data/belle_multiturn/belle_multiturn.py | 31 ++++---------
 data/example_dataset/example_dataset.py | 33 +++++--------
 data/hh_rlhf_en/hh_rlhf_en.py           | 61 ++++++++++---------------
 data/ultra_chat/ultra_chat.py           | 39 ++++++----------
 4 files changed, 59 insertions(+), 105 deletions(-)

diff --git a/data/belle_multiturn/belle_multiturn.py b/data/belle_multiturn/belle_multiturn.py
index 6e31f0e6..5c3fce26 100644
--- a/data/belle_multiturn/belle_multiturn.py
+++ b/data/belle_multiturn/belle_multiturn.py
@@ -1,5 +1,6 @@
-import os
 import json
+import os
+
 import datasets
 
 
@@ -22,31 +23,19 @@ _URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0
 
 
 class BelleMultiturn(datasets.GeneratorBasedBuilder):
-
     VERSION = datasets.Version("0.0.0")
 
     def _info(self):
-        features = datasets.Features({
-            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
-        })
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_path = dl_manager.download(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": file_path
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
 
     def _generate_examples(self, filepath: str):
         with open(filepath, "r", encoding="utf-8") as f:
@@ -58,7 +47,7 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
 
                 assist_idx = prompt.rfind("Assistant:")
                 human_idx = prompt.rfind("Human:")
-                query = prompt[human_idx+6:assist_idx].strip()
+                query = prompt[human_idx + 6 : assist_idx].strip()
                 prompt = prompt[:human_idx].strip()
                 conversations.insert(0, {"from": "gpt", "value": response})
                 conversations.insert(0, {"from": "human", "value": query})
@@ -67,8 +56,8 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
                     assist_idx = prompt.rfind("Assistant:")
                     human_idx = prompt.rfind("Human:")
                     if human_idx != -1:
-                        old_query = prompt[human_idx+6:assist_idx].strip()
-                        old_resp = prompt[assist_idx+10:].strip()
+                        old_query = prompt[human_idx + 6 : assist_idx].strip()
+                        old_resp = prompt[assist_idx + 10 :].strip()
                         conversations.insert(0, {"from": "gpt", "value": old_resp})
                         conversations.insert(0, {"from": "human", "value": old_query})
                     else:
diff --git a/data/example_dataset/example_dataset.py b/data/example_dataset/example_dataset.py
index 5d6cfa22..bf0baa54 100644
--- a/data/example_dataset/example_dataset.py
+++ b/data/example_dataset/example_dataset.py
@@ -1,7 +1,8 @@
 import json
-import datasets
 from typing import Any, Dict, Generator, List, Tuple
 
+import datasets
+
 
 _DESCRIPTION = "An example of dataset."
 _CITATION = ""
@@ -11,34 +12,24 @@ _URL = "examples.json"
 
 
 class ExampleDataset(datasets.GeneratorBasedBuilder):
-
     VERSION = datasets.Version("0.0.0")
 
     def _info(self) -> datasets.DatasetInfo:
-        features = datasets.Features({
-            "instruction": datasets.Value("string"),
-            "input": datasets.Value("string"),
-            "output": datasets.Value("string"),
-            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
-        })
+        features = datasets.Features(
+            {
+                "instruction": datasets.Value("string"),
+                "input": datasets.Value("string"),
+                "output": datasets.Value("string"),
+                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+            }
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
         file_path = dl_manager.download(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": file_path
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
 
     def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
         example_dataset = json.load(open(filepath, "r", encoding="utf-8"))
diff --git a/data/hh_rlhf_en/hh_rlhf_en.py b/data/hh_rlhf_en/hh_rlhf_en.py
index 2839af7d..abe4673c 100644
--- a/data/hh_rlhf_en/hh_rlhf_en.py
+++ b/data/hh_rlhf_en/hh_rlhf_en.py
@@ -1,8 +1,10 @@
-import os
 import json
-import datasets
+import os
 from typing import List
 
+import datasets
+
+
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 _DESCRIPTION = "Human preference data about helpfulness and harmlessness."
 _CITATION = ""
@@ -14,50 +16,37 @@ _URLS = {
         _URL + "harmless-base/train.jsonl.gz",
         _URL + "helpful-base/train.jsonl.gz",
         _URL + "helpful-online/train.jsonl.gz",
-        _URL + "helpful-rejection-sampled/train.jsonl.gz"
+        _URL + "helpful-rejection-sampled/train.jsonl.gz",
     ],
     "test": [
         _URL + "harmless-base/test.jsonl.gz",
         _URL + "helpful-base/test.jsonl.gz",
         _URL + "helpful-online/test.jsonl.gz",
-        _URL + "helpful-rejection-sampled/test.jsonl.gz"
-    ]
+        _URL + "helpful-rejection-sampled/test.jsonl.gz",
+    ],
 }
 
 
 class HhRlhfEn(datasets.GeneratorBasedBuilder):
-
     VERSION = datasets.Version("0.0.0")
 
     def _info(self) -> datasets.DatasetInfo:
-        features = datasets.Features({
-            "instruction": datasets.Value("string"),
-            "output": datasets.Sequence(datasets.Value("string")),
-            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
-        })
+        features = datasets.Features(
+            {
+                "instruction": datasets.Value("string"),
+                "output": datasets.Sequence(datasets.Value("string")),
+                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+            }
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_path = dl_manager.download_and_extract(_URLS)
         return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepaths": file_path["train"]
-                }
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    "filepaths": file_path["test"]
-                }
-            )
+            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_path["train"]}),
+            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": file_path["test"]}),
         ]
 
     def _generate_examples(self, filepaths: List[str]):
@@ -70,12 +59,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
                     rejected = data["rejected"]
 
                     assist_idx = rejected.rfind("\n\nAssistant: ")
-                    r_reject = rejected[assist_idx+13:].strip()
+                    r_reject = rejected[assist_idx + 13 :].strip()
                     assist_idx = chosen.rfind("\n\nAssistant: ")
-                    r_accept = chosen[assist_idx+13:].strip()
+                    r_accept = chosen[assist_idx + 13 :].strip()
 
                     human_idx = chosen.rfind("\n\nHuman: ")
-                    query = chosen[human_idx+9:assist_idx].strip()
+                    query = chosen[human_idx + 9 : assist_idx].strip()
                     prompt = chosen[:human_idx]
                     history = []
 
@@ -83,16 +72,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
                         assist_idx = prompt.rfind("\n\nAssistant: ")
                         human_idx = prompt.rfind("\n\nHuman: ")
                         if human_idx != -1:
-                            old_query = prompt[human_idx+9:assist_idx].strip()
-                            old_resp = prompt[assist_idx+13:].strip()
+                            old_query = prompt[human_idx + 9 : assist_idx].strip()
+                            old_resp = prompt[assist_idx + 13 :].strip()
                             history.insert(0, (old_query, old_resp))
                         else:
                             break
                         prompt = prompt[:human_idx]
 
-                    yield key, {
-                        "instruction": query,
-                        "output": [r_accept, r_reject],
-                        "history": history
-                    }
+                    yield key, {"instruction": query, "output": [r_accept, r_reject], "history": history}
                     key += 1
diff --git a/data/ultra_chat/ultra_chat.py b/data/ultra_chat/ultra_chat.py
index 2e8a75e1..e7df3ff3 100644
--- a/data/ultra_chat/ultra_chat.py
+++ b/data/ultra_chat/ultra_chat.py
@@ -1,8 +1,10 @@
-import os
 import json
-import datasets
+import os
 from typing import List
 
+import datasets
+
+
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 
 _DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
@@ -24,31 +26,19 @@ _BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jso
 
 
 class UltraChat(datasets.GeneratorBasedBuilder):
-
     VERSION = datasets.Version("0.0.0")
 
     def _info(self):
-        features = datasets.Features({
-            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
-        })
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
-        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepaths": file_paths
-                }
-            )
-        ]
+        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)]  # multiple shards
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_paths})]
 
     def _generate_examples(self, filepaths: List[str]):
         for filepath in filepaths:
@@ -56,7 +46,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
                 for row in f:
                     try:
                         data = json.loads(row)
-                    except:
+                    except Exception:
                         continue
                     key: int = data["id"]
                     content: List[str] = data["data"]
@@ -64,8 +54,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
                         content.pop(-1)
                     if len(content) < 2:
                         continue
-                    conversations = [{
-                        "from": "human" if i % 2 == 0 else "gpt",
-                        "value": content[i]
-                    } for i in range(len(content))]
+                    conversations = [
+                        {"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))
+                    ]
                     yield key, {"conversations": conversations}

From 3a5e68b7d937b884b9860e4b2b2ee082eac050f9 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 20 Apr 2024 10:34:09 +0800
Subject: [PATCH 117/341] fix #3348

Former-commit-id: aa5e921c00f60074eceb2f9d4d8837cc713edba6
---
 src/llmtuner/model/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 7ab8222f..9376998d 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -44,7 +44,7 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
             padding_side="right",
             **init_kwargs,
         )
-    except ValueError:  # try the fast one
+    except Exception:  # try the fast one
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.model_name_or_path,
             use_fast=True,

From f8e219dc81bbfb5418fd6155c019190f2ac78caf Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 21 Apr 2024 18:11:10 +0800
Subject: [PATCH 118/341] fix mod stuff

Former-commit-id: cf3988226e6398c67bb2955578e436fc505aa5c5
---
 README.md                          |  9 ++++----
 README_zh.md                       |  9 ++++----
 examples/README.md                 |  5 ++---
 examples/README_zh.md              |  5 ++---
 examples/extras/MoD/freeze_sft.sh  | 33 ------------------------------
 examples/extras/MoD/sft.sh         |  7 ++++---
 examples/extras/galore/sft.sh      |  5 +++--
 examples/inference/evaluate.sh     |  2 +-
 src/llmtuner/data/template.py      | 19 +++++++++--------
 src/llmtuner/extras/constants.py   |  2 ++
 src/llmtuner/extras/misc.py        |  2 ++
 src/llmtuner/hparams/model_args.py |  6 +++---
 src/llmtuner/hparams/parser.py     |  4 ++--
 src/llmtuner/model/adapter.py      |  2 +-
 src/llmtuner/model/loader.py       | 31 +++++++++++++++-------------
 src/llmtuner/model/patcher.py      | 10 ++++-----
 16 files changed, 63 insertions(+), 88 deletions(-)
 delete mode 100644 examples/extras/MoD/freeze_sft.sh

diff --git a/README.md b/README.md
index 3bf284b2..ed56baba 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Choose your path:
 - **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
 - **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
-- **Advanced algorithms**: GaLore, Mixture of Depths, BAdam, DoRA, LongLoRA, LLaMA Pro, LoRA+, LoftQ and Agent tuning.
+- **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
 - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc.
 - **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
@@ -68,16 +68,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
-[24/04/19] We integrated **[Mixture of Depths](https://github.com/astramind-ai/Mixture-of-depths)**. see `examples/extras/MoD` for usage.
+[24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See `examples/extras/mod` for usage.
 
 [24/04/19] We supported **Meta Llama 3** model series.
 
 [24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See `examples/extras/badam` for usage.
 
-<details><summary>Full Changelog</summary>
-
 [24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
 
+<details><summary>Full Changelog</summary>
+
 [24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
 
 [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
@@ -251,6 +251,7 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Orca DPO (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
+- [DPO mix (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
 
 </details>
diff --git a/README_zh.md b/README_zh.md
index 7565664e..586ee38a 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -46,7 +46,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - **多种模型**：LLaMA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
 - **集成方法**：（增量）预训练、指令监督微调、奖励模型训练、PPO 训练、DPO 训练和 ORPO 训练。
 - **多种精度**：32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
-- **先进算法**：GaLore、Mixture of Depths、BAdam、DoRA、LongLoRA、LLaMA Pro、LoRA+、LoftQ 和 Agent 微调。
+- **先进算法**：GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ 和 Agent 微调。
 - **实用技巧**：FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。
 - **实验监控**：LlamaBoard、TensorBoard、Wandb、MLflow 等等。
 - **极速推理**：基于 vLLM 的 OpenAI 风格 API、浏览器界面和命令行接口。
@@ -68,16 +68,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
-[24/04/19] 我们整合了 **[深度混合](https://github.com/astramind-ai/Mixture-of-depths)**。用法请参见 `examples/extras/MoD`。
+[24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 `examples/extras/mod`。
 
 [24/04/19] 我们支持了 **Meta Llama 3** 系列模型。
 
 [24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 `examples/extras/badam`。
 
-<details><summary>展开日志</summary>
-
 [24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 Llama-2-7B-56k）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
 
+<details><summary>展开日志</summary>
+
 [24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
 
 [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看！
@@ -251,6 +251,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Orca DPO (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
+- [DPO mix (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
 
 </details>
diff --git a/examples/README.md b/examples/README.md
index dd526ba8..8218d113 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -38,12 +38,11 @@ examples/
     │   └── sft.sh: Fine-tune model with BAdam
     ├── loraplus/
     │   └── sft.sh: Fine-tune model using LoRA+
+    ├── mod/
+    │   └── sft.sh: Fine-tune model using Mixture-of-Depths
     ├── llama_pro/
     │   ├── expand.sh: Expand layers in the model
     │   └── sft.sh: Fine-tune the expanded model
-    ├── MoD/
-    │   ├── freeze_sft.sh: Freeze finetune a model, updating only the MoD router
-    │   └── sft.sh: Fine-tune the MoD model
     └── fsdp_qlora/
         └── sft.sh: Fine-tune quantized model with FSDP+QLoRA
 ```
diff --git a/examples/README_zh.md b/examples/README_zh.md
index cdef207b..ed0d244d 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -38,12 +38,11 @@ examples/
     │   └── sft.sh: 使用 BAdam 训练模型
     ├── loraplus/
     │   └── sft.sh: 使用 LoRA+ 训练模型
+    ├── mod/
+    │   └── sft.sh: 使用深度混合训练模型
     ├── llama_pro/
     │   ├── expand.sh: 扩展模型中的层
     │   └── sft.sh: 训练扩展后的模型
-    ├── MoD/
-    │   ├── freeze_sft.sh: 冻结微调模型，仅更新 MoD 路由器
-    │   └── sft.sh: 微调国防部模型
     └── fsdp_qlora/
         └── sft.sh: 使用 FSDP+QLoRA 微调量化模型
 ```
diff --git a/examples/extras/MoD/freeze_sft.sh b/examples/extras/MoD/freeze_sft.sh
deleted file mode 100644
index 867fad47..00000000
--- a/examples/extras/MoD/freeze_sft.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type freeze \
-    --name_module_trainable router \
-    --output_dir ../../../saves/TinyLlama/TinyLlama-1.1B-Chat-v1.0/sft \
-    --mixture_of_depths convert \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/MoD/sft.sh b/examples/extras/MoD/sft.sh
index b0257f9f..2c8f04a3 100644
--- a/examples/extras/MoD/sft.sh
+++ b/examples/extras/MoD/sft.sh
@@ -3,20 +3,21 @@
 CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
     --stage sft \
     --do_train \
-    --model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
     --dataset alpaca_gpt4_en,glaive_toolcall \
     --dataset_dir ../../../data \
     --template default \
     --finetuning_type full \
-    --output_dir ../../../saves/TinyLlama/TinyLlama-1.1B-Chat-v1.0/sft \
     --mixture_of_depths convert \
+    --output_dir ../../../saves/LLaMA2-7B/mod/sft \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
     --preprocessing_num_workers 16 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
+    --gradient_accumulation_steps 8 \
+    --optim paged_adamw_8bit \
     --lr_scheduler_type cosine \
     --logging_steps 10 \
     --warmup_steps 20 \
diff --git a/examples/extras/galore/sft.sh b/examples/extras/galore/sft.sh
index 1ffeb5ca..1e46ac1f 100644
--- a/examples/extras/galore/sft.sh
+++ b/examples/extras/galore/sft.sh
@@ -11,6 +11,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
     --use_galore \
     --galore_layerwise \
     --galore_target mlp,self_attn \
+    --galore_scale 2.0 \
     --galore_rank 128 \
     --output_dir ../../../saves/LLaMA2-7B/galore/sft \
     --overwrite_cache \
@@ -28,8 +29,8 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
     --evaluation_strategy steps \
     --load_best_model_at_end \
     --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
+    --num_train_epochs 30.0 \
+    --max_samples 300 \
     --val_size 0.1 \
     --plot_loss \
     --pure_bf16
diff --git a/examples/inference/evaluate.sh b/examples/inference/evaluate.sh
index b54c2a60..1fc6ccf8 100644
--- a/examples/inference/evaluate.sh
+++ b/examples/inference/evaluate.sh
@@ -3,7 +3,7 @@
 CUDA_VISIBLE_DEVICES=0 python ../../src/evaluate.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template vanilla \
+    --template fewshot \
     --finetuning_type lora \
     --task mmlu \
     --split test \
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index c74becc4..04538510 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -343,7 +343,7 @@ def get_template_and_fix_tokenizer(
     name: Optional[str] = None,
 ) -> Template:
     if name is None:
-        template = templates["vanilla"]  # placeholder
+        template = templates["empty"]  # placeholder
     else:
         template = templates.get(name, None)
         if template is None:
@@ -385,7 +385,8 @@ _register_template(
     format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
     format_separator=EmptyFormatter(slots=["\n\n"]),
     default_system=(
-        "Below is an instruction that describes a task. " "Write a response that appropriately completes the request."
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
     ),
 )
 
@@ -596,6 +597,13 @@ _register_template(
 )
 
 
+_register_template(
+    name="fewshot",
+    format_separator=EmptyFormatter(slots=["\n\n"]),
+    efficient_eos=True,
+)
+
+
 _register_template(
     name="gemma",
     format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
@@ -740,13 +748,6 @@ _register_template(
 )
 
 
-_register_template(
-    name="vanilla",
-    format_separator=EmptyFormatter(slots=["\n"]),
-    efficient_eos=True,
-)
-
-
 _register_template(
     name="vicuna",
     format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 78352a01..a0e51d17 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -28,6 +28,8 @@ LOG_FILE_NAME = "trainer_log.jsonl"
 
 METHODS = ["full", "freeze", "lora"]
 
+MOD_SUPPORTED_MODELS = ["bloom", "falcon", "gemma", "llama", "mistral", "mixtral", "phi", "starcoder2"]
+
 PEFT_METHODS = ["lora"]
 
 SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"]
diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index ecb6797c..8ce25d18 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -83,6 +83,8 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
         if param.__class__.__name__ == "Params4bit":
             if hasattr(param, "quant_storage") and hasattr(param.quant_storage, "itemsize"):
                 num_bytes = param.quant_storage.itemsize
+            elif hasattr(param, "element_size"):  # for older pytorch version
+                num_bytes = param.element_size()
             else:
                 num_bytes = 1
 
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index bc80d304..0e42033f 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -63,15 +63,15 @@ class ModelArguments:
     )
     flash_attn: bool = field(
         default=False,
-        metadata={"help": "Enable FlashAttention-2 for faster training."},
+        metadata={"help": "Enable FlashAttention for faster training."},
     )
     shift_attn: bool = field(
         default=False,
         metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."},
     )
-    mixture_of_depths: Optional[Literal["convert", "continue"]] = field(
+    mixture_of_depths: Optional[Literal["convert", "load"]] = field(
         default=None,
-        metadata={"help": "Whether or not to use MoD in the model."},
+        metadata={"help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."},
     )
     use_unsloth: bool = field(
         default=False,
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 246d97cf..b22db652 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -82,8 +82,8 @@ def _check_extra_dependencies(
     if model_args.use_unsloth:
         require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth")
 
-    if model_args.mixture_of_depths == 'convert' or model_args.mixture_of_depths == 'continue':
-        require_version("mixture-of-depth", "To fix: pip install mixture-of-depth")
+    if model_args.mixture_of_depths is not None:
+        require_version("mixture-of-depth>=1.1.6", "To fix: pip install mixture-of-depth>=1.1.6")
 
     if model_args.infer_backend == "vllm":
         require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3")
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 2aafd663..f73666d5 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -69,7 +69,7 @@ def init_adapter(
         for name, _ in model.named_modules():
             if ".0." in name:
                 freeze_modules.add(name.split(".0.")[-1].split(".")[0])
-            elif ".1." in name:  # here since MoD starts from layer 1
+            elif ".1." in name:  # MoD starts from layer 1
                 freeze_modules.add(name.split(".1.")[-1].split(".")[0])
 
         trainable_layers = []
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index e4624d65..4935dd52 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, Dict
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from trl import AutoModelForCausalLMWithValueHead
 
+from ..extras.constants import MOD_SUPPORTED_MODELS
 from ..extras.logging import get_logger
 from ..extras.misc import count_parameters, get_current_device, try_download_model_from_ms
 from .adapter import init_adapter
@@ -44,7 +45,7 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
             padding_side="right",
             **init_kwargs,
         )
-    except Exception:  # try the fast one
+    except ValueError:  # try the fast one
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.model_name_or_path,
             use_fast=True,
@@ -71,12 +72,6 @@ def load_model(
     patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
 
     model = None
-    if model_args.mixture_of_depths == 'continue':
-        from MoD import AutoMoDModelForCausalLM
-        model = AutoMoDModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config)
-        if model.config.model_type == 'qwen2':
-            RuntimeError("Qwen models are not supported for MoD training.")
-
     if is_trainable and model_args.use_unsloth:
         from unsloth import FastLanguageModel  # type: ignore
 
@@ -104,14 +99,22 @@ def load_model(
     if model is None:
         init_kwargs["config"] = config
         init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path
-        model: "PreTrainedModel" = AutoModelForCausalLM.from_pretrained(**init_kwargs)
 
-    if model_args.mixture_of_depths == 'convert':
-        from MoD import convert_hf_model
-        if model.config.model_type == 'qwen2':
-            RuntimeError("Qwen models are not supported for MoD training.")
-        model = convert_hf_model(model)
+        if model_args.mixture_of_depths == "load":
+            from MoD import AutoMoDModelForCausalLM
 
+            model = AutoMoDModelForCausalLM.from_pretrained(**init_kwargs)
+        else:
+            model = AutoModelForCausalLM.from_pretrained(**init_kwargs)
+
+        if model_args.mixture_of_depths == "convert":
+            from MoD import apply_mod_to_hf
+
+            if getattr(config, "model_type", None) not in MOD_SUPPORTED_MODELS:
+                raise ValueError("Current model is not supported by mixture-of-depth.")
+
+            model = apply_mod_to_hf(model)
+            model = model.to(model_args.compute_dtype)
 
     patch_model(model, tokenizer, model_args, is_trainable)
     register_autoclass(config, model, tokenizer)
@@ -119,7 +122,7 @@ def load_model(
     model = init_adapter(model, model_args, finetuning_args, is_trainable)
 
     if add_valuehead:
-        model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained(model)
+        model = AutoModelForCausalLMWithValueHead.from_pretrained(model)
         patch_valuehead_model(model)
 
         if model_args.adapter_name_or_path is not None:
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index fb2835e8..a1b19fb1 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -61,9 +61,7 @@ def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "Mod
     return samples
 
 
-def _configure_attn_implementation(
-    config: "PretrainedConfig", model_args: "ModelArguments", init_kwargs: Dict[str, Any]
-) -> None:
+def _configure_attn_implementation(config: "PretrainedConfig", model_args: "ModelArguments") -> None:
     if model_args.flash_attn:
         if not is_flash_attn2_available():
             logger.warning("FlashAttention2 is not installed.")
@@ -73,9 +71,9 @@ def _configure_attn_implementation(
         if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
             setattr(config, "attn_implementation", "flash_attention_2")
         else:
-            init_kwargs["attn_implementation"] = "flash_attention_2"
+            setattr(config, "_attn_implementation", "flash_attention_2")
     else:
-        init_kwargs["attn_implementation"] = "eager"
+        setattr(config, "_attn_implementation", "eager")
 
 
 def _configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
@@ -295,7 +293,7 @@ def patch_config(
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
-    _configure_attn_implementation(config, model_args, init_kwargs)
+    _configure_attn_implementation(config, model_args)
     _configure_rope(config, model_args, is_trainable)
     _configure_longlora(config, model_args, is_trainable)
     _configure_quantization(config, tokenizer, model_args, init_kwargs)

From d16561e7a49dd1af3ee773e54ea01df9d678afe8 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 21 Apr 2024 18:53:22 +0800
Subject: [PATCH 119/341] fix bug in galore optimizer

Former-commit-id: c05ac23261a5a8ba893c2918a43dc7777307407b
---
 examples/extras/galore/sft.sh |  6 +++---
 src/llmtuner/train/utils.py   | 14 ++++----------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/examples/extras/galore/sft.sh b/examples/extras/galore/sft.sh
index 1e46ac1f..da1779ed 100644
--- a/examples/extras/galore/sft.sh
+++ b/examples/extras/galore/sft.sh
@@ -11,8 +11,8 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
     --use_galore \
     --galore_layerwise \
     --galore_target mlp,self_attn \
-    --galore_scale 2.0 \
     --galore_rank 128 \
+    --galore_scale 2.0 \
     --output_dir ../../../saves/LLaMA2-7B/galore/sft \
     --overwrite_cache \
     --overwrite_output_dir \
@@ -29,8 +29,8 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
     --evaluation_strategy steps \
     --load_best_model_at_end \
     --learning_rate 5e-5 \
-    --num_train_epochs 30.0 \
-    --max_samples 300 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
     --val_size 0.1 \
     --plot_loss \
     --pure_bf16
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index 2835eddf..d3f17116 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -234,14 +234,6 @@ def _create_galore_optimizer(
             param_groups = [dict(params=[param], weight_decay=training_args.weight_decay, **galore_kwargs)]
             optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
 
-        def optimizer_hook(param: "torch.nn.Parameter"):
-            if param.grad is not None:
-                optimizer_dict[param].step()
-                optimizer_dict[param].zero_grad()
-
-        for param in trainable_params:
-            param.register_post_accumulate_grad_hook(optimizer_hook)
-
         optimizer = DummyOptimizer(lr=training_args.learning_rate, optimizer_dict=optimizer_dict)
     else:
         param_groups = [
@@ -391,9 +383,11 @@ def create_custom_scheduler(
                 num_training_steps=num_training_steps * 2,
             )
 
-        def scheduler_hook(param: "torch.nn.Parameter"):
+        def optimizer_hook(param: "torch.nn.Parameter"):
             if param.grad is not None:
+                optimizer_dict[param].step()
+                optimizer_dict[param].zero_grad()
                 scheduler_dict[param].step()
 
         for param in optimizer_dict.keys():
-            param.register_post_accumulate_grad_hook(scheduler_hook)
+            param.register_post_accumulate_grad_hook(optimizer_hook)

From 1d341dcd8341866bd07d029f2956ad68fd3565f0 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 21 Apr 2024 19:20:18 +0800
Subject: [PATCH 120/341] fix #3365

Former-commit-id: 415ce41e8fa887e980e5bd575c8e95bd4076b90b
---
 src/llmtuner/model/patcher.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index a1b19fb1..53616dd9 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -139,6 +139,7 @@ def _configure_quantization(
 
         if quant_method == QuantizationMethod.GPTQ:
             require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0")
+            quantization_config.pop("disable_exllama", None)  # remove deprecated args
             quantization_config["use_exllama"] = False  # disable exllama
 
         if quant_method == QuantizationMethod.AWQ:

From 233e167f68030047070759c2f5823f1d8ea4b123 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 21 Apr 2024 20:40:54 +0800
Subject: [PATCH 121/341] fix optimizers

Former-commit-id: f811eee2fa12a89a55a9c5d3a05a1521b4347727
---
 src/llmtuner/hparams/finetuning_args.py |  5 ++-
 src/llmtuner/train/utils.py             | 50 +++++++++++--------------
 2 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index 899c7284..f4f71bc5 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -303,11 +303,14 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
             raise ValueError("`dpo_label_smoothing` is only valid for sigmoid loss function.")
 
         if self.use_llama_pro and self.finetuning_type == "full":
-            raise ValueError("`use_llama_pro` is only valid for the Freeze or LoRA method.")
+            raise ValueError("`use_llama_pro` is only valid for the Freeze or LoRA training.")
 
         if self.use_galore and self.finetuning_type == "lora":
             raise ValueError("Cannot use LoRA with GaLore together.")
 
+        if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora":
+            raise ValueError("`loraplus_lr_ratio` is only valid for the LoRA training.")
+
     def save_to_json(self, json_path: str):
         r"""Saves the content of this instance in JSON format inside `json_path`."""
         json_string = json.dumps(asdict(self), indent=2, sort_keys=True) + "\n"
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index d3f17116..fa9e36e5 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -162,15 +162,6 @@ def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]:
     return decay_parameters
 
 
-def _get_embedding_names(model: "PreTrainedModel") -> List[str]:
-    r"""
-    Returns a list of names of parameters in embedding.
-    """
-    result = {name for name, _ in model.get_input_embeddings().named_parameters()}
-    result.update(name for name, _ in model.get_output_embeddings().named_parameters())
-    return result
-
-
 def _create_galore_optimizer(
     model: "PreTrainedModel",
     training_args: "Seq2SeqTrainingArguments",
@@ -225,7 +216,7 @@ def _create_galore_optimizer(
 
         optimizer_dict: Dict["torch.Tensor", "torch.optim.Optimizer"] = {}
         for param in nodecay_params:
-            param_groups = [dict(params=[param])]
+            param_groups = [dict(params=[param], weight_decay=0.0)]
             optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
         for param in decay_params:
             param_groups = [dict(params=[param], weight_decay=training_args.weight_decay)]
@@ -234,6 +225,14 @@ def _create_galore_optimizer(
             param_groups = [dict(params=[param], weight_decay=training_args.weight_decay, **galore_kwargs)]
             optimizer_dict[param] = optim_class(param_groups, **optim_kwargs)
 
+        def optimizer_hook(param: "torch.nn.Parameter"):
+            if param.grad is not None:
+                optimizer_dict[param].step()
+                optimizer_dict[param].zero_grad()
+
+        for param in trainable_params:
+            param.register_post_accumulate_grad_hook(optimizer_hook)
+
         optimizer = DummyOptimizer(lr=training_args.learning_rate, optimizer_dict=optimizer_dict)
     else:
         param_groups = [
@@ -252,11 +251,9 @@ def _create_loraplus_optimizer(
     training_args: "Seq2SeqTrainingArguments",
     finetuning_args: "FinetuningArguments",
 ) -> "torch.optim.Optimizer":
-    if finetuning_args.finetuning_type != "lora":
-        raise ValueError("You should use LoRA tuning to activate LoRA+.")
-
+    default_lr = training_args.learning_rate
     loraplus_lr = training_args.learning_rate * finetuning_args.loraplus_lr_ratio
-    decay_args = {"weight_decay": training_args.weight_decay}
+    embedding_lr = finetuning_args.loraplus_lr_embedding
 
     decay_param_names = _get_decay_parameter_names(model)
     param_dict: Dict[str, List["torch.nn.Parameter"]] = {
@@ -279,10 +276,10 @@ def _create_loraplus_optimizer(
 
     optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
     param_groups = [
-        dict(params=param_dict["lora_a"], **decay_args),
-        dict(params=param_dict["lora_b"], lr=loraplus_lr, **decay_args),
+        dict(params=param_dict["lora_a"], lr=default_lr, weight_decay=training_args.weight_decay),
+        dict(params=param_dict["lora_b"], lr=loraplus_lr, weight_decay=training_args.weight_decay),
         dict(params=param_dict["lora_b_nodecay"], lr=loraplus_lr, weight_decay=0.0),
-        dict(params=param_dict["embedding"], lr=finetuning_args.loraplus_lr_embedding, **decay_args),
+        dict(params=param_dict["embedding"], lr=embedding_lr, weight_decay=training_args.weight_decay),
     ]
     optimizer = optim_class(param_groups, **optim_kwargs)
     logger.info("Using LoRA+ optimizer with loraplus lr ratio {:.2f}.".format(finetuning_args.loraplus_lr_ratio))
@@ -294,11 +291,8 @@ def _create_badam_optimizer(
     training_args: "Seq2SeqTrainingArguments",
     finetuning_args: "FinetuningArguments",
 ) -> "torch.optim.Optimizer":
-    decay_param_names = _get_decay_parameter_names(model)
-    if finetuning_args.badam_mode == "ratio":  # filter out the embedding layers for ratio-wise badam
-        decay_param_names = [name for name in decay_param_names if name not in _get_embedding_names(model)]
-
     decay_params, nodecay_params = [], []
+    decay_param_names = _get_decay_parameter_names(model)
     for name, param in model.named_parameters():
         if param.requires_grad:
             if name in decay_param_names:
@@ -341,6 +335,7 @@ def _create_badam_optimizer(
             update_ratio=finetuning_args.badam_update_ratio,
             mask_mode=finetuning_args.badam_mask_mode,
             verbose=finetuning_args.badam_verbose,
+            include_embedding=False,
             **optim_kwargs,
         )
         logger.info(
@@ -379,15 +374,12 @@ def create_custom_scheduler(
             scheduler_dict[param] = get_scheduler(
                 training_args.lr_scheduler_type,
                 optimizer=optimizer_dict[param],
-                num_warmup_steps=training_args.get_warmup_steps(num_training_steps) * 2,
-                num_training_steps=num_training_steps * 2,
+                num_warmup_steps=training_args.get_warmup_steps(num_training_steps),
+                num_training_steps=num_training_steps,
             )
 
-        def optimizer_hook(param: "torch.nn.Parameter"):
-            if param.grad is not None:
-                optimizer_dict[param].step()
-                optimizer_dict[param].zero_grad()
-                scheduler_dict[param].step()
+        def scheduler_hook(param: "torch.nn.Parameter"):
+            scheduler_dict[param].step()
 
         for param in optimizer_dict.keys():
-            param.register_post_accumulate_grad_hook(optimizer_hook)
+            param.register_post_accumulate_grad_hook(scheduler_hook)

From 0f1ad7140fb8662b0f33a40792ce62da31ffe8f1 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 21 Apr 2024 21:34:25 +0800
Subject: [PATCH 122/341] fix #3366

Former-commit-id: dc20237455c36de44f8922539d7dfadd8bedb12f
---
 src/llmtuner/webui/chatter.py |  7 +++----
 src/llmtuner/webui/engine.py  | 11 +++++------
 src/llmtuner/webui/runner.py  | 21 +++++++++++----------
 3 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/llmtuner/webui/chatter.py b/src/llmtuner/webui/chatter.py
index 479846ca..dac7dd67 100644
--- a/src/llmtuner/webui/chatter.py
+++ b/src/llmtuner/webui/chatter.py
@@ -1,6 +1,6 @@
 import json
 import os
-from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Sequence, Tuple
+from typing import TYPE_CHECKING, Dict, Generator, List, Optional, Sequence, Tuple
 
 from ..chat import ChatModel
 from ..data import Role
@@ -17,7 +17,6 @@ if TYPE_CHECKING:
 
 if is_gradio_available():
     import gradio as gr
-    from gradio.components import Component  # cannot use TYPE_CHECKING here
 
 
 class WebChatModel(ChatModel):
@@ -38,7 +37,7 @@ class WebChatModel(ChatModel):
     def loaded(self) -> bool:
         return self.engine is not None
 
-    def load_model(self, data: Dict[Component, Any]) -> Generator[str, None, None]:
+    def load_model(self, data) -> Generator[str, None, None]:
         get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
         lang = get("top.lang")
         error = ""
@@ -82,7 +81,7 @@ class WebChatModel(ChatModel):
 
         yield ALERTS["info_loaded"][lang]
 
-    def unload_model(self, data: Dict[Component, Any]) -> Generator[str, None, None]:
+    def unload_model(self, data) -> Generator[str, None, None]:
         lang = data[self.manager.get_elem_by_id("top.lang")]
 
         if self.demo_mode:
diff --git a/src/llmtuner/webui/engine.py b/src/llmtuner/webui/engine.py
index 65945533..b9ee61d2 100644
--- a/src/llmtuner/webui/engine.py
+++ b/src/llmtuner/webui/engine.py
@@ -1,6 +1,5 @@
-from typing import Any, Dict, Generator
+from typing import TYPE_CHECKING, Any, Dict
 
-from ..extras.packages import is_gradio_available
 from .chatter import WebChatModel
 from .common import get_model_path, list_dataset, load_config
 from .locales import LOCALES
@@ -9,8 +8,8 @@ from .runner import Runner
 from .utils import get_time
 
 
-if is_gradio_available():
-    from gradio.components import Component  # cannot use TYPE_CHECKING here
+if TYPE_CHECKING:
+    from gradio.components import Component
 
 
 class Engine:
@@ -32,7 +31,7 @@ class Engine:
 
         return output_dict
 
-    def resume(self) -> Generator[Dict[Component, Component], None, None]:
+    def resume(self):
         user_config = load_config() if not self.demo_mode else {}
         lang = user_config.get("lang", None) or "en"
 
@@ -58,7 +57,7 @@ class Engine:
             else:
                 yield self._update_component({"eval.resume_btn": {"value": True}})
 
-    def change_lang(self, lang: str) -> Dict[Component, Component]:
+    def change_lang(self, lang: str):
         return {
             elem: elem.__class__(**LOCALES[elem_name][lang])
             for elem_name, elem in self.manager.get_elem_iter()
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 12307234..ec493c96 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -21,10 +21,11 @@ from .utils import gen_cmd, gen_plot, get_eval_results, update_process_bar
 
 if is_gradio_available():
     import gradio as gr
-    from gradio.components import Component  # cannot use TYPE_CHECKING here
 
 
 if TYPE_CHECKING:
+    from gradio.components import Component
+
     from .manager import Manager
 
 
@@ -243,7 +244,7 @@ class Runner:
 
         return args
 
-    def _preview(self, data: Dict["Component", Any], do_train: bool) -> Generator[Dict[Component, str], None, None]:
+    def _preview(self, data: Dict["Component", Any], do_train: bool) -> Generator[Dict["Component", str], None, None]:
         output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval"))
         error = self._initialize(data, do_train, from_preview=True)
         if error:
@@ -253,7 +254,7 @@ class Runner:
             args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
             yield {output_box: gen_cmd(args)}
 
-    def _launch(self, data: Dict["Component", Any], do_train: bool) -> Generator[Dict[Component, Any], None, None]:
+    def _launch(self, data: Dict["Component", Any], do_train: bool) -> Generator[Dict["Component", Any], None, None]:
         output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval"))
         error = self._initialize(data, do_train, from_preview=False)
         if error:
@@ -267,19 +268,19 @@ class Runner:
             self.thread.start()
             yield from self.monitor()
 
-    def preview_train(self, data: Dict[Component, Any]) -> Generator[Dict[Component, str], None, None]:
+    def preview_train(self, data):
         yield from self._preview(data, do_train=True)
 
-    def preview_eval(self, data: Dict[Component, Any]) -> Generator[Dict[Component, str], None, None]:
+    def preview_eval(self, data):
         yield from self._preview(data, do_train=False)
 
-    def run_train(self, data: Dict[Component, Any]) -> Generator[Dict[Component, Any], None, None]:
+    def run_train(self, data):
         yield from self._launch(data, do_train=True)
 
-    def run_eval(self, data: Dict[Component, Any]) -> Generator[Dict[Component, Any], None, None]:
+    def run_eval(self, data):
         yield from self._launch(data, do_train=False)
 
-    def monitor(self) -> Generator[Dict[Component, Any], None, None]:
+    def monitor(self):
         get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)]
         self.aborted = False
         self.running = True
@@ -336,7 +337,7 @@ class Runner:
 
         yield return_dict
 
-    def save_args(self, data: Dict[Component, Any]) -> Dict[Component, str]:
+    def save_args(self, data):
         output_box = self.manager.get_elem_by_id("train.output_box")
         error = self._initialize(data, do_train=True, from_preview=True)
         if error:
@@ -355,7 +356,7 @@ class Runner:
         save_path = save_args(config_path, config_dict)
         return {output_box: ALERTS["info_config_saved"][lang] + save_path}
 
-    def load_args(self, lang: str, config_path: str) -> Dict[Component, Any]:
+    def load_args(self, lang: str, config_path: str):
         output_box = self.manager.get_elem_by_id("train.output_box")
         config_dict = load_args(config_path)
         if config_dict is None:

From c8fe3f544b504c310383b90e1875230b3f83cf80 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 21 Apr 2024 23:13:23 +0800
Subject: [PATCH 123/341] release v0.6.3

Former-commit-id: 947572af8de201669598f54735f35b50bb719d71
---
 src/llmtuner/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/__init__.py b/src/llmtuner/__init__.py
index 9d90a59e..829f7cb9 100644
--- a/src/llmtuner/__init__.py
+++ b/src/llmtuner/__init__.py
@@ -7,5 +7,5 @@ from .train import export_model, run_exp
 from .webui import create_ui, create_web_demo
 
 
-__version__ = "0.6.3.dev0"
+__version__ = "0.6.3"
 __all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"]

From 348f29aa50632c2c70c849e35779cc25a1371426 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 21 Apr 2024 23:14:30 +0800
Subject: [PATCH 124/341] set dev version

Former-commit-id: b9557887d7506ff57b2b2bf490092aac4e4becf0
---
 src/llmtuner/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/__init__.py b/src/llmtuner/__init__.py
index 829f7cb9..6cb78806 100644
--- a/src/llmtuner/__init__.py
+++ b/src/llmtuner/__init__.py
@@ -7,5 +7,5 @@ from .train import export_model, run_exp
 from .webui import create_ui, create_web_demo
 
 
-__version__ = "0.6.3"
+__version__ = "0.6.4.dev0"
 __all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"]

From 719585a12868c58e0ad7047fda633e97430e8cf8 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 22 Apr 2024 00:21:01 +0800
Subject: [PATCH 125/341] update readme

Former-commit-id: 3a8c17907c71f46b1b37501e2afdc99ad89fb4bc
---
 README.md    | 14 ++++++++++++--
 README_zh.md | 14 ++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index ed56baba..05a75949 100644
--- a/README.md
+++ b/README.md
@@ -342,6 +342,16 @@ export GRADIO_SERVER_PORT=7860 # `set GRADIO_SERVER_PORT=7860` for Windows
 python src/train_web.py # or python -m llmtuner.webui.interface
 ```
 
+<details><summary>For Aliyun users</summary>
+
+If you encountered display problems in LLaMA Board GUI, try using the following command to set environment variables before starting LLaMA Board:
+
+```bash
+export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
+```
+
+</details>
+
 #### Use Docker
 
 ```bash
@@ -381,8 +391,8 @@ Use `python src/train_bash.py -h` to display arguments description.
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
-    --model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 \
-    --template mistral \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --template llama3 \
     --infer_backend vllm \
     --vllm_enforce_eager
 ```
diff --git a/README_zh.md b/README_zh.md
index 586ee38a..0e01e2c2 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -342,6 +342,16 @@ export GRADIO_SERVER_PORT=7860 # Windows 使用 `set GRADIO_SERVER_PORT=7860`
 python src/train_web.py # 或 python -m llmtuner.webui.interface
 ```
 
+<details><summary>阿里云用户指南</summary>
+
+如果您在 LLaMA Board GUI 中遇到显示问题，请尝试在启动前使用以下命令设置环境变量：
+
+```bash
+export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
+```
+
+</details>
+
 #### 使用 Docker
 
 ```bash
@@ -381,8 +391,8 @@ docker compose -f ./docker-compose.yml up -d
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
-    --model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 \
-    --template mistral \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --template llama3 \
     --infer_backend vllm \
     --vllm_enforce_eager
 ```

From be716972fe588a736234ccf23c5767b91516d580 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 22 Apr 2024 00:35:41 +0800
Subject: [PATCH 126/341] remove extras

Former-commit-id: d67e972f8c3d5273e589c8c85c0a1620f59785c5
---
 examples/extras/MoD/sft.sh          | 33 ------------------------
 examples/extras/badam/sft.sh        | 35 -------------------------
 examples/extras/fsdp_qlora/sft.sh   | 40 -----------------------------
 examples/extras/galore/sft.sh       | 36 --------------------------
 examples/extras/llama_pro/expand.sh |  6 -----
 examples/extras/llama_pro/sft.sh    | 34 ------------------------
 examples/extras/loraplus/sft.sh     | 33 ------------------------
 7 files changed, 217 deletions(-)
 delete mode 100644 examples/extras/MoD/sft.sh
 delete mode 100644 examples/extras/badam/sft.sh
 delete mode 100644 examples/extras/fsdp_qlora/sft.sh
 delete mode 100644 examples/extras/galore/sft.sh
 delete mode 100644 examples/extras/llama_pro/expand.sh
 delete mode 100644 examples/extras/llama_pro/sft.sh
 delete mode 100644 examples/extras/loraplus/sft.sh

diff --git a/examples/extras/MoD/sft.sh b/examples/extras/MoD/sft.sh
deleted file mode 100644
index 2c8f04a3..00000000
--- a/examples/extras/MoD/sft.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --mixture_of_depths convert \
-    --output_dir ../../../saves/LLaMA2-7B/mod/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --optim paged_adamw_8bit \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
deleted file mode 100644
index c2319caa..00000000
--- a/examples/extras/badam/sft.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --use_badam \
-    --badam_switch_mode descending \
-    --badam_switch_block_every 50 \
-    --badam_verbose 2 \
-    --output_dir ../../../saves/LLaMA2-7B/badam/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/fsdp_qlora/sft.sh b/examples/extras/fsdp_qlora/sft.sh
deleted file mode 100644
index 614245d3..00000000
--- a/examples/extras/fsdp_qlora/sft.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-
-pip install "transformers>=4.39.1"
-pip install "accelerate>=0.28.0"
-pip install "bitsandbytes>=0.43.0"
-
-CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
-    --config_file ../../accelerate/fsdp_config.yaml \
-    ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-70b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../../saves/LLaMA2-70B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --quantization_bit 4 \
-    --plot_loss \
-    --fp16
diff --git a/examples/extras/galore/sft.sh b/examples/extras/galore/sft.sh
deleted file mode 100644
index da1779ed..00000000
--- a/examples/extras/galore/sft.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --use_galore \
-    --galore_layerwise \
-    --galore_target mlp,self_attn \
-    --galore_rank 128 \
-    --galore_scale 2.0 \
-    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/llama_pro/expand.sh b/examples/extras/llama_pro/expand.sh
deleted file mode 100644
index b260902c..00000000
--- a/examples/extras/llama_pro/expand.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-python ../../../scripts/llama_pro.py \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --output_dir ../../../models/llama2-7b-pro \
-    --num_expand 8
diff --git a/examples/extras/llama_pro/sft.sh b/examples/extras/llama_pro/sft.sh
deleted file mode 100644
index 573078ff..00000000
--- a/examples/extras/llama_pro/sft.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path ../../../models/llama2-7b-pro \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type freeze \
-    --name_module_trainable all \
-    --num_layer_trainable 8 \
-    --use_llama_pro \
-    --output_dir ../../../saves/LLaMA2-7B-Pro/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/extras/loraplus/sft.sh b/examples/extras/loraplus/sft.sh
deleted file mode 100644
index cb334e7d..00000000
--- a/examples/extras/loraplus/sft.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --loraplus_lr_ratio 16.0 \
-    --output_dir ../../saves/LLaMA2-7B/loraplus/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16

From ba06eb65ca01ca876ccffc327f8ab9f4189a9a8b Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 22 Apr 2024 00:37:32 +0800
Subject: [PATCH 127/341] update readme and examples

Former-commit-id: 27dd9bf201c24f7804811398bc2758966ec78432
---
 README.md                           |  8 +++---
 README_zh.md                        | 14 +++++-----
 examples/README.md                  |  3 ++-
 examples/README_zh.md               |  3 ++-
 examples/extras/badam/sft.sh        | 35 ++++++++++++++++++++++++
 examples/extras/fsdp_qlora/sft.sh   | 41 +++++++++++++++++++++++++++++
 examples/extras/galore/sft.sh       | 36 +++++++++++++++++++++++++
 examples/extras/llama_pro/expand.sh |  6 +++++
 examples/extras/llama_pro/sft.sh    | 34 ++++++++++++++++++++++++
 examples/extras/loraplus/sft.sh     | 33 +++++++++++++++++++++++
 examples/extras/mod/sft.sh          | 33 +++++++++++++++++++++++
 examples/lora_multi_gpu/ds_zero3.sh | 33 +++++++++++++++++++++++
 12 files changed, 266 insertions(+), 13 deletions(-)
 create mode 100644 examples/extras/badam/sft.sh
 create mode 100644 examples/extras/fsdp_qlora/sft.sh
 create mode 100644 examples/extras/galore/sft.sh
 create mode 100644 examples/extras/llama_pro/expand.sh
 create mode 100644 examples/extras/llama_pro/sft.sh
 create mode 100644 examples/extras/loraplus/sft.sh
 create mode 100644 examples/extras/mod/sft.sh
 create mode 100644 examples/lora_multi_gpu/ds_zero3.sh

diff --git a/README.md b/README.md
index 05a75949..eb260003 100644
--- a/README.md
+++ b/README.md
@@ -329,7 +329,7 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 
 </details>
 
-### LLaMA Board GUI
+### Train with LLaMA Board GUI
 
 > [!IMPORTANT]
 > LLaMA Board GUI only supports training on a single GPU, please use [CLI](#command-line-interface) for distributed training.
@@ -381,7 +381,7 @@ docker compose -f ./docker-compose.yml up -d
 
 </details>
 
-### Command Line Interface
+### Train with Command Line Interface
 
 See [examples/README.md](examples/README.md) for usage.
 
@@ -397,7 +397,7 @@ CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
     --vllm_enforce_eager
 ```
 
-### Use ModelScope Hub
+### Download from ModelScope Hub
 
 If you have trouble with downloading models and datasets from Hugging Face, you can use ModelScope.
 
@@ -405,7 +405,7 @@ If you have trouble with downloading models and datasets from Hugging Face, you
 export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows
 ```
 
-Train the model by specifying a model ID of the ModelScope Hub as the `--model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `modelscope/Llama-2-7b-ms`.
+Train the model by specifying a model ID of the ModelScope Hub as the `--model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `LLM-Research/Meta-Llama-3-8B-Instruct`.
 
 ## Projects using LLaMA Factory
 
diff --git a/README_zh.md b/README_zh.md
index 0e01e2c2..ab43fa26 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -329,10 +329,10 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 </details>
 
-### LLaMA Board 可视化界面
+### 利用 LLaMA Board 可视化界面训练
 
 > [!IMPORTANT]
-> LLaMA Board 可视化界面目前仅支持单 GPU 训练，请使用[命令行接口](#命令行接口)来进行分布式训练。
+> LLaMA Board 可视化界面目前仅支持单 GPU 训练，请使用[命令行接口](#命令行接口)来进行多 GPU 分布式训练。
 
 #### 使用本地环境
 
@@ -381,13 +381,13 @@ docker compose -f ./docker-compose.yml up -d
 
 </details>
 
-### 命令行接口
+### 利用命令行接口训练
 
 使用方法请参考 [examples/README_zh.md](examples/README_zh.md)。
 
-使用 `python src/train_bash.py -h` 查看参数文档。
+您可以执行 `python src/train_bash.py -h` 来查看参数文档。
 
-### 使用 OpenAI 风格 API 和 vLLM 部署
+### 利用 vLLM 部署 OpenAI API
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
@@ -397,7 +397,7 @@ CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
     --vllm_enforce_eager
 ```
 
-### 使用魔搭社区
+### 从魔搭社区下载
 
 如果您在 Hugging Face 模型和数据集的下载中遇到了问题，可以通过下述方法使用魔搭社区。
 
@@ -405,7 +405,7 @@ CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
 export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 ```
 
-将 `--model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型，例如 `modelscope/Llama-2-7b-ms`。
+将 `--model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型，例如 `LLM-Research/Meta-Llama-3-8B-Instruct`。
 
 ## 使用了 LLaMA Factory 的项目
 
diff --git a/examples/README.md b/examples/README.md
index 8218d113..871bf0de 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -18,7 +18,8 @@ examples/
 │   └── aqlm.sh: Fine-tune 2-bit AQLM models using QLoRA
 ├── lora_multi_gpu/
 │   ├── single_node.sh: Fine-tune model with Accelerate on single node using LoRA
-│   └── multi_node.sh: Fine-tune model with Accelerate on multiple nodes using LoRA
+│   ├── multi_node.sh: Fine-tune model with Accelerate on multiple nodes using LoRA
+│   └── ds_zero3.sh: Fine-tune model with DeepSpeed ZeRO-3 using LoRA
 ├── full_multi_gpu/
 │   ├── single_node.sh: Full fine-tune model with DeepSpeed on single node
 │   ├── multi_node.sh: Full fine-tune model with DeepSpeed on multiple nodes
diff --git a/examples/README_zh.md b/examples/README_zh.md
index ed0d244d..c4f2062e 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -18,7 +18,8 @@ examples/
 │   └── aqlm.sh: 基于 QLoRA 微调 2 比特 AQLM 模型
 ├── lora_multi_gpu/
 │   ├── single_node.sh: 使用 Accelerate 进行单节点 LoRA 训练
-│   └── multi_node.sh: 使用 Accelerate 进行多节点 LoRA 训练
+│   ├── multi_node.sh: 使用 Accelerate 进行多节点 LoRA 训练
+│   └── ds_zero3.sh: 使用 DeepSpeed ZeRO-3 进行 LoRA 训练
 ├── full_multi_gpu/
 │   ├── single_node.sh: 使用 DeepSpeed 进行单节点全量训练
 │   ├── multi_node.sh: 使用 DeepSpeed 进行多节点全量训练
diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
new file mode 100644
index 00000000..c2319caa
--- /dev/null
+++ b/examples/extras/badam/sft.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../../data \
+    --template default \
+    --finetuning_type full \
+    --use_badam \
+    --badam_switch_mode descending \
+    --badam_switch_block_every 50 \
+    --badam_verbose 2 \
+    --output_dir ../../../saves/LLaMA2-7B/badam/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --pure_bf16
diff --git a/examples/extras/fsdp_qlora/sft.sh b/examples/extras/fsdp_qlora/sft.sh
new file mode 100644
index 00000000..e8b9ece7
--- /dev/null
+++ b/examples/extras/fsdp_qlora/sft.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# DO NOT use GPTQ/AWQ model in FSDP+QLoRA
+
+pip install "transformers>=4.39.1"
+pip install "accelerate>=0.28.0"
+pip install "bitsandbytes>=0.43.0"
+
+CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
+    --config_file ../../accelerate/fsdp_config.yaml \
+    ../../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-70b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../../data \
+    --template default \
+    --finetuning_type lora \
+    --lora_target q_proj,v_proj \
+    --output_dir ../../../saves/LLaMA2-70B/lora/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 4 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --ddp_timeout 180000000 \
+    --quantization_bit 4 \
+    --plot_loss \
+    --fp16
diff --git a/examples/extras/galore/sft.sh b/examples/extras/galore/sft.sh
new file mode 100644
index 00000000..da1779ed
--- /dev/null
+++ b/examples/extras/galore/sft.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../../data \
+    --template default \
+    --finetuning_type full \
+    --use_galore \
+    --galore_layerwise \
+    --galore_target mlp,self_attn \
+    --galore_rank 128 \
+    --galore_scale 2.0 \
+    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --pure_bf16
diff --git a/examples/extras/llama_pro/expand.sh b/examples/extras/llama_pro/expand.sh
new file mode 100644
index 00000000..b260902c
--- /dev/null
+++ b/examples/extras/llama_pro/expand.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+python ../../../scripts/llama_pro.py \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --output_dir ../../../models/llama2-7b-pro \
+    --num_expand 8
diff --git a/examples/extras/llama_pro/sft.sh b/examples/extras/llama_pro/sft.sh
new file mode 100644
index 00000000..573078ff
--- /dev/null
+++ b/examples/extras/llama_pro/sft.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path ../../../models/llama2-7b-pro \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../../data \
+    --template default \
+    --finetuning_type freeze \
+    --name_module_trainable all \
+    --num_layer_trainable 8 \
+    --use_llama_pro \
+    --output_dir ../../../saves/LLaMA2-7B-Pro/lora/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --fp16
diff --git a/examples/extras/loraplus/sft.sh b/examples/extras/loraplus/sft.sh
new file mode 100644
index 00000000..cb334e7d
--- /dev/null
+++ b/examples/extras/loraplus/sft.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../data \
+    --template default \
+    --finetuning_type lora \
+    --lora_target q_proj,v_proj \
+    --loraplus_lr_ratio 16.0 \
+    --output_dir ../../saves/LLaMA2-7B/loraplus/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --fp16
diff --git a/examples/extras/mod/sft.sh b/examples/extras/mod/sft.sh
new file mode 100644
index 00000000..2c8f04a3
--- /dev/null
+++ b/examples/extras/mod/sft.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../../data \
+    --template default \
+    --finetuning_type full \
+    --mixture_of_depths convert \
+    --output_dir ../../../saves/LLaMA2-7B/mod/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --optim paged_adamw_8bit \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --pure_bf16
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
new file mode 100644
index 00000000..f429d15b
--- /dev/null
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+deepspeed --num_gpus 4 ../../src/train_bash.py \
+    --deepspeed ../deepspeed/ds_z3_config.json \
+    --stage sft \
+    --do_train \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --dataset alpaca_gpt4_en,glaive_toolcall \
+    --dataset_dir ../../data \
+    --template default \
+    --finetuning_type lora \
+    --lora_target q_proj,v_proj \
+    --output_dir ../../saves/LLaMA2-7B/lora/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 2 \
+    --lr_scheduler_type cosine \
+    --logging_steps 10 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --ddp_timeout 180000000 \
+    --plot_loss \
+    --fp16

From 4ec56dd958bea9228f8265db7317a3c2cb57ccba Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 22 Apr 2024 00:42:25 +0800
Subject: [PATCH 128/341] update readme

Former-commit-id: fdca136309709e43d75a831252b9375a5a99635a
---
 README.md    | 6 ++++--
 README_zh.md | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index eb260003..fc36b405 100644
--- a/README.md
+++ b/README.md
@@ -159,9 +159,11 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                 | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
-> **Default module** is used for the `--lora_target` argument, you can use `--lora_target all` to specify all the available modules.
+> **Default module** is used for the `--lora_target` argument, you can use `--lora_target all` to specify all the available modules for better convergence.
 >
-> For the "base" models, the `--template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "chat" models.
+> For the "base" models, the `--template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.
+>
+> Remember to use the **SAME** template in training and inference.
 
 Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list of models we supported.
 
diff --git a/README_zh.md b/README_zh.md
index ab43fa26..bf43b656 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -159,9 +159,11 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                 | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
-> **默认模块**应作为 `--lora_target` 参数的默认值，可使用 `--lora_target all` 参数指定全部模块。
+> **默认模块**应作为 `--lora_target` 参数的默认值，可使用 `--lora_target all` 参数指定全部模块以得到更好的效果。
 >
-> 对于所有“基座”（Base）模型，`--template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”（Chat）模型请务必使用**对应的模板**。
+> 对于所有“基座”（Base）模型，`--template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”（Instruct/Chat）模型请务必使用**对应的模板**。
+>
+> 请务必在训练和推理时使用**完全一致**的模板。
 
 项目所支持模型的完整列表请参阅 [constants.py](src/llmtuner/extras/constants.py)。
 

From f6a53d83c80ad4368b8cd7d5d0f00c7be58faa73 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 22 Apr 2024 00:51:35 +0800
Subject: [PATCH 129/341] update readme

Former-commit-id: 3eab580703ee01a0d2d75e7f01df5165af551386
---
 README.md    | 4 ++--
 README_zh.md | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index fc36b405..8d8a6423 100644
--- a/README.md
+++ b/README.md
@@ -344,9 +344,9 @@ export GRADIO_SERVER_PORT=7860 # `set GRADIO_SERVER_PORT=7860` for Windows
 python src/train_web.py # or python -m llmtuner.webui.interface
 ```
 
-<details><summary>For Aliyun users</summary>
+<details><summary>For Alibaba Cloud users</summary>
 
-If you encountered display problems in LLaMA Board GUI, try using the following command to set environment variables before starting LLaMA Board:
+If you encountered display problems in LLaMA Board on Alibaba Cloud, try using the following command to set environment variables before starting LLaMA Board:
 
 ```bash
 export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
diff --git a/README_zh.md b/README_zh.md
index bf43b656..76c8c358 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -346,7 +346,7 @@ python src/train_web.py # 或 python -m llmtuner.webui.interface
 
 <details><summary>阿里云用户指南</summary>
 
-如果您在 LLaMA Board GUI 中遇到显示问题，请尝试在启动前使用以下命令设置环境变量：
+如果您在阿里云上使用 LLaMA Board 时遇到显示问题，请尝试在启动前使用以下命令设置环境变量：
 
 ```bash
 export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/

From 1d2e372a8e6a1cbc1b626f5ba363065bf1a4af32 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 22 Apr 2024 17:09:17 +0800
Subject: [PATCH 130/341] update readme

Former-commit-id: d4eaee262a64e716ce475dc4eb18d8d9697d8dd8
---
 README.md                      | 2 ++
 README_zh.md                   | 6 ++++--
 src/llmtuner/hparams/parser.py | 3 +++
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8d8a6423..faa1c7d8 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,8 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details.
+
 [24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See `examples/extras/mod` for usage.
 
 [24/04/19] We supported **Meta Llama 3** model series.
diff --git a/README_zh.md b/README_zh.md
index 76c8c358..1b4e3f1a 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -11,7 +11,7 @@
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
 [![Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue)](https://huggingface.co/spaces/hiyouga/LLaMA-Board)
 [![Studios](https://img.shields.io/badge/ModelScope-Open%20in%20Studios-blue)](https://modelscope.cn/studios/hiyouga/LLaMA-Board)
-[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)
+[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)
 
 👋 加入我们的[微信群](assets/wechat.jpg)。
 
@@ -23,7 +23,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 选择你的打开方式：
 
-- **Colab**：https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing
+- **Colab**：https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing
 - **本地机器**：请见[如何使用](#如何使用)
 
 ## 目录
@@ -68,6 +68,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型，详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。
+
 [24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 `examples/extras/mod`。
 
 [24/04/19] 我们支持了 **Meta Llama 3** 系列模型。
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index b22db652..0d286819 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -94,6 +94,9 @@ def _check_extra_dependencies(
     if finetuning_args.use_badam:
         require_version("badam", "To fix: pip install badam")
 
+    if finetuning_args.plot_loss:
+        require_version("matplotlib", "To fix: pip install matplotlib")
+
     if training_args is not None and training_args.predict_with_generate:
         require_version("jieba", "To fix: pip install jieba")
         require_version("nltk", "To fix: pip install nltk")

From 79d34ce0f335615bba3a949456301242a5b53e24 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 23 Apr 2024 18:29:46 +0800
Subject: [PATCH 131/341] update examples

Former-commit-id: 8bf55682cdfbbdca0f01073eac0084c20a6a09d1
---
 examples/README.md                     | 4 ++--
 examples/README_zh.md                  | 4 ++--
 examples/accelerate/master_config.yaml | 2 +-
 examples/accelerate/slave_config.yaml  | 2 +-
 examples/full_multi_gpu/predict.sh     | 4 +++-
 examples/lora_multi_gpu/multi_node.sh  | 1 +
 examples/lora_multi_gpu/single_node.sh | 2 +-
 7 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 871bf0de..111f50bd 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -19,11 +19,11 @@ examples/
 ├── lora_multi_gpu/
 │   ├── single_node.sh: Fine-tune model with Accelerate on single node using LoRA
 │   ├── multi_node.sh: Fine-tune model with Accelerate on multiple nodes using LoRA
-│   └── ds_zero3.sh: Fine-tune model with DeepSpeed ZeRO-3 using LoRA
+│   └── ds_zero3.sh: Fine-tune model with DeepSpeed ZeRO-3 using LoRA (weight sharding)
 ├── full_multi_gpu/
 │   ├── single_node.sh: Full fine-tune model with DeepSpeed on single node
 │   ├── multi_node.sh: Full fine-tune model with DeepSpeed on multiple nodes
-│   └── predict.sh: Do batch predict and compute BLEU and ROUGE scores after full tuning
+│   └── predict.sh: Do parallel batch predict and compute BLEU and ROUGE scores after full tuning
 ├── merge_lora/
 │   ├── merge.sh: Merge LoRA weights into the pre-trained models
 │   └── quantize.sh: Quantize the fine-tuned model with AutoGPTQ
diff --git a/examples/README_zh.md b/examples/README_zh.md
index c4f2062e..fecbdb2f 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -19,11 +19,11 @@ examples/
 ├── lora_multi_gpu/
 │   ├── single_node.sh: 使用 Accelerate 进行单节点 LoRA 训练
 │   ├── multi_node.sh: 使用 Accelerate 进行多节点 LoRA 训练
-│   └── ds_zero3.sh: 使用 DeepSpeed ZeRO-3 进行 LoRA 训练
+│   └── ds_zero3.sh: 使用 DeepSpeed ZeRO-3 进行 LoRA 训练（拆分权重）
 ├── full_multi_gpu/
 │   ├── single_node.sh: 使用 DeepSpeed 进行单节点全量训练
 │   ├── multi_node.sh: 使用 DeepSpeed 进行多节点全量训练
-│   └── predict.sh: 基于全量训练进行批量预测并计算 BLEU 和 ROUGE 分数
+│   └── predict.sh: 基于全量训练进行多卡批量预测并计算 BLEU 和 ROUGE 分数
 ├── merge_lora/
 │   ├── merge.sh: 将 LoRA 权重合并到预训练模型中
 │   └── quantize.sh: 使用 AutoGPTQ 量化微调后的模型
diff --git a/examples/accelerate/master_config.yaml b/examples/accelerate/master_config.yaml
index 9c8fc275..a1018313 100644
--- a/examples/accelerate/master_config.yaml
+++ b/examples/accelerate/master_config.yaml
@@ -9,7 +9,7 @@ main_process_port: 29555
 main_training_function: main
 mixed_precision: fp16
 num_machines: 2 # the number of nodes
-num_processes: 16 # the number of GPUs in all nodes
+num_processes: 8 # the number of GPUs in all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
diff --git a/examples/accelerate/slave_config.yaml b/examples/accelerate/slave_config.yaml
index e4a63e82..e610fd0e 100644
--- a/examples/accelerate/slave_config.yaml
+++ b/examples/accelerate/slave_config.yaml
@@ -9,7 +9,7 @@ main_process_port: 29555
 main_training_function: main
 mixed_precision: fp16
 num_machines: 2 # the number of nodes
-num_processes: 16 # the number of GPUs in all nodes
+num_processes: 8 # the number of GPUs in all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
diff --git a/examples/full_multi_gpu/predict.sh b/examples/full_multi_gpu/predict.sh
index 52fdc7a0..801df85a 100644
--- a/examples/full_multi_gpu/predict.sh
+++ b/examples/full_multi_gpu/predict.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
+    --config_file ../accelerate/single_config.yaml \
+    ../../src/train_bash.py \
     --stage sft \
     --do_predict \
     --model_name_or_path ../../saves/LLaMA2-7B/full/sft \
diff --git a/examples/lora_multi_gpu/multi_node.sh b/examples/lora_multi_gpu/multi_node.sh
index 5172b9a6..85a3e026 100644
--- a/examples/lora_multi_gpu/multi_node.sh
+++ b/examples/lora_multi_gpu/multi_node.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# also launch it on slave machine using slave_config.yaml
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
     --config_file ../accelerate/master_config.yaml \
diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh
index 269d76d7..04529cf0 100644
--- a/examples/lora_multi_gpu/single_node.sh
+++ b/examples/lora_multi_gpu/single_node.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch \
+CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
     --config_file ../accelerate/single_config.yaml \
     ../../src/train_bash.py \
     --stage sft \

From ab6dc0ea30fd14c331d777109d5b5ca4d632fadf Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Tue, 23 Apr 2024 18:45:43 +0800
Subject: [PATCH 132/341] add multimodal LLM BLIP-2 and InstructBLIP

Former-commit-id: a730f89a972f1a9d37c718c716f199cb8d4903b2
---
 examples/mllm/sft_blip2.sh              |  34 ++++++
 examples/mllm/sft_instructblip.sh       |  35 ++++++
 src/llmtuner/data/__init__.py           |   4 +-
 src/llmtuner/data/loader.py             |  50 +++++++--
 src/llmtuner/hparams/data_args.py       |   4 +
 src/llmtuner/hparams/finetuning_args.py |   2 +-
 src/llmtuner/hparams/model_args.py      |   4 +
 src/llmtuner/model/__init__.py          |   5 +-
 src/llmtuner/model/adapter.py           | 114 ++++++++++++++++++--
 src/llmtuner/model/loader.py            | 116 ++++++++++++++++++--
 src/llmtuner/train/sftmm/__init__.py    |   3 +
 src/llmtuner/train/sftmm/collator.py    |  69 ++++++++++++
 src/llmtuner/train/sftmm/metric.py      |  61 +++++++++++
 src/llmtuner/train/sftmm/trainer.py     | 137 ++++++++++++++++++++++++
 src/llmtuner/train/sftmm/workflow.py    | 105 ++++++++++++++++++
 src/llmtuner/train/tuner.py             |   5 +-
 16 files changed, 710 insertions(+), 38 deletions(-)
 create mode 100644 examples/mllm/sft_blip2.sh
 create mode 100644 examples/mllm/sft_instructblip.sh
 create mode 100644 src/llmtuner/train/sftmm/__init__.py
 create mode 100644 src/llmtuner/train/sftmm/collator.py
 create mode 100644 src/llmtuner/train/sftmm/metric.py
 create mode 100644 src/llmtuner/train/sftmm/trainer.py
 create mode 100644 src/llmtuner/train/sftmm/workflow.py

diff --git a/examples/mllm/sft_blip2.sh b/examples/mllm/sft_blip2.sh
new file mode 100644
index 00000000..416bb9cd
--- /dev/null
+++ b/examples/mllm/sft_blip2.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage sft_mm \
+    --do_train \
+    --model_name_or_path /home/LAB/fengzc/LLM/checkpoints/Salesforce/blip2-opt-2.7b \
+    --dataset llava_instruct_100 \
+    --dataset_dir data \
+    --template default \
+    --finetuning_type lora \
+    --lora_target q_proj,k_proj \
+    --output_dir saves/blip2-opt-2.7b/lora/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --lr_scheduler_type cosine \
+    --logging_steps 1 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --quantization_bit 8 \
+    --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017
+
diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh
new file mode 100644
index 00000000..a4330a84
--- /dev/null
+++ b/examples/mllm/sft_instructblip.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
+    --stage sft_mm \
+    --do_train \
+    --model_name_or_path /home/LAB/fengzc/LLM/checkpoints/Salesforce/instructblip-vicuna-7b \
+    --dataset llava_instruct_100 \
+    --dataset_dir data \
+    --template default \
+    --finetuning_type lora \
+    --lora_target q_proj,k_proj \
+    --output_dir saves/instructblip-vicuna-7b/lora/sft \
+    --overwrite_cache \
+    --overwrite_output_dir \
+    --cutoff_len 1024 \
+    --preprocessing_num_workers 16 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --lr_scheduler_type cosine \
+    --logging_steps 1 \
+    --warmup_steps 20 \
+    --save_steps 100 \
+    --eval_steps 100 \
+    --evaluation_strategy steps \
+    --load_best_model_at_end \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --max_samples 3000 \
+    --val_size 0.1 \
+    --plot_loss \
+    --quantization_bit 8 \
+    --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 \
+    --use_qformer
+
diff --git a/src/llmtuner/data/__init__.py b/src/llmtuner/data/__init__.py
index 792e89d9..27a2f3b8 100644
--- a/src/llmtuner/data/__init__.py
+++ b/src/llmtuner/data/__init__.py
@@ -1,12 +1,12 @@
 from .collator import PairwiseDataCollatorWithPadding
-from .loader import get_dataset
+from .loader import get_dataset, get_mm_dataset
 from .template import Template, get_template_and_fix_tokenizer, templates
 from .utils import Role, split_dataset
 
-
 __all__ = [
     "PairwiseDataCollatorWithPadding",
     "get_dataset",
+    "get_mm_dataset",
     "Template",
     "get_template_and_fix_tokenizer",
     "templates",
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 5414150e..b7377379 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -13,23 +13,21 @@ from .preprocess import get_preprocess_and_print_func
 from .template import get_template_and_fix_tokenizer
 from .utils import checksum, merge_dataset
 
-
 if TYPE_CHECKING:
     from datasets import Dataset, IterableDataset
-    from transformers import Seq2SeqTrainingArguments
+    from transformers import Seq2SeqTrainingArguments, AutoProcessor
     from transformers.tokenization_utils import PreTrainedTokenizer
 
     from ..hparams import DataArguments, ModelArguments
     from .parser import DatasetAttr
 
-
 logger = get_logger(__name__)
 
 
 def load_single_dataset(
-    dataset_attr: "DatasetAttr",
-    model_args: "ModelArguments",
-    data_args: "DataArguments",
+        dataset_attr: "DatasetAttr",
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
 ) -> Union["Dataset", "IterableDataset"]:
     logger.info("Loading dataset {}...".format(dataset_attr))
     data_path, data_name, data_dir, data_files = None, None, None, None
@@ -115,11 +113,11 @@ def load_single_dataset(
 
 
 def get_dataset(
-    tokenizer: "PreTrainedTokenizer",
-    model_args: "ModelArguments",
-    data_args: "DataArguments",
-    training_args: "Seq2SeqTrainingArguments",
-    stage: Literal["pt", "sft", "rm", "ppo"],
+        tokenizer: "PreTrainedTokenizer",
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
+        training_args: "Seq2SeqTrainingArguments",
+        stage: Literal["pt", "sft", "rm", "ppo"],
 ) -> Union["Dataset", "IterableDataset"]:
     template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
     if data_args.train_on_prompt and template.efficient_eos:
@@ -177,3 +175,33 @@ def get_dataset(
                 raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
 
         return dataset
+
+
+def get_mm_dataset(
+        processor: "AutoProcessor",
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
+        training_args: "Seq2SeqTrainingArguments",
+        stage: Literal["pt", "sft", "rm", "ppo"],
+) -> Union["Dataset", "IterableDataset"]:
+    tokenizer = processor.tokenizer
+    if data_args.tokenized_path is not None:
+        if has_tokenized_data(data_args.tokenized_path):
+            logger.warning("Loading dataset from disk will ignore other data arguments.")
+            dataset = load_from_disk(data_args.tokenized_path)
+            logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path))
+            if data_args.streaming:
+                dataset = dataset.to_iterable_dataset()
+            return dataset
+
+        if data_args.streaming:
+            raise ValueError("Turn off `streaming` when saving dataset to disk.")
+
+    with training_args.main_process_first(desc="load dataset"):
+        all_datasets = []
+        for dataset_attr in get_dataset_list(data_args):
+            local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
+            all_datasets.append(load_dataset("json", data_files=local_path)['train'])
+        dataset = merge_dataset(all_datasets, data_args, training_args)
+
+    return dataset
diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py
index f5f75c77..3b52f1ea 100644
--- a/src/llmtuner/hparams/data_args.py
+++ b/src/llmtuner/hparams/data_args.py
@@ -88,6 +88,10 @@ class DataArguments:
         default=None,
         metadata={"help": "Path to save or load the tokenized datasets."},
     )
+    image_path: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to images."},
+    )
 
     def __post_init__(self):
         if self.reserved_label_len >= self.cutoff_len:
diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index f4f71bc5..cb525699 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -260,7 +260,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
         default=False,
         metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."},
     )
-    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo"] = field(
+    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo", "sft_mm"] = field(
         default="sft",
         metadata={"help": "Which stage will be performed in training."},
     )
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index 0e42033f..32637f59 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -165,6 +165,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "For debugging purposes, print the status of the parameters in the model."},
     )
+    use_qformer: bool = field(
+        default=False,
+        metadata={"help": "Whether use qformer for Multimodal LLM."},
+    )
 
     def __post_init__(self):
         self.compute_dtype = None
diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index 1eaf4271..cf54dafe 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,10 +1,11 @@
-from .loader import load_model, load_tokenizer
+from .loader import load_model, load_tokenizer, load_processor, load_mm_model
 from .utils import find_all_linear_modules, load_valuehead_params
 
-
 __all__ = [
     "load_model",
+    "load_mm_model",
     "load_tokenizer",
+    "load_processor",
     "load_valuehead_params",
     "find_all_linear_modules",
 ]
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index f73666d5..624d8a85 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -1,24 +1,25 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Union
 
 import torch
 from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model
+from transformers import AutoModelForVision2Seq
 from transformers.integrations import is_deepspeed_zero3_enabled
 
 from ..extras.logging import get_logger
 from .utils import QuantizationMethod, find_all_linear_modules, find_expanded_modules
 
-
 if TYPE_CHECKING:
-    from transformers.modeling_utils import PreTrainedModel
+    from transformers.modeling_utils import PreTrainedModel, AutoModelForVision2Seq
 
     from ..hparams import FinetuningArguments, ModelArguments
 
-
 logger = get_logger(__name__)
 
 
 def init_adapter(
-    model: "PreTrainedModel", model_args: "ModelArguments", finetuning_args: "FinetuningArguments", is_trainable: bool
+        model: "PreTrainedModel", model_args: "ModelArguments",
+        finetuning_args: "FinetuningArguments",
+        is_trainable: bool
 ) -> "PreTrainedModel":
     r"""
     Initializes the adapters.
@@ -43,9 +44,9 @@ def init_adapter(
     if finetuning_args.finetuning_type == "freeze" and is_trainable:
         logger.info("Fine-tuning method: Freeze")
         num_layers = (
-            getattr(model.config, "num_hidden_layers", None)
-            or getattr(model.config, "num_layers", None)
-            or getattr(model.config, "n_layer", None)
+                getattr(model.config, "num_hidden_layers", None)
+                or getattr(model.config, "num_layers", None)
+                or getattr(model.config, "n_layer", None)
         )
         if not num_layers:
             raise ValueError("Current model does not support freeze tuning.")
@@ -135,9 +136,9 @@ def init_adapter(
                 target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable)
 
             if (
-                finetuning_args.use_dora
-                and getattr(model, "quantization_method", None) is not None
-                and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
+                    finetuning_args.use_dora
+                    and getattr(model, "quantization_method", None) is not None
+                    and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
             ):
                 raise ValueError("DoRA is not compatible with PTQ-quantized models.")
 
@@ -176,3 +177,94 @@ def init_adapter(
             logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
 
     return model
+
+
+def init_mm_adapter(
+        model: "AutoModelForVision2Seq", model_args: "ModelArguments",
+        finetuning_args: "FinetuningArguments",
+        is_trainable: bool
+) -> "AutoModelForVision2Seq":
+    if finetuning_args.finetuning_type == "lora":
+        logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA"))
+        adapter_to_resume = None
+
+        if model_args.adapter_name_or_path is not None:
+            is_mergeable = True
+            if getattr(model, "quantization_method", None):  # merge lora in quantized model is unstable
+                assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter."
+                is_mergeable = False
+
+            if is_deepspeed_zero3_enabled():
+                assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3."
+                is_mergeable = False
+
+            if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable):
+                adapter_to_merge = model_args.adapter_name_or_path[:-1]
+                adapter_to_resume = model_args.adapter_name_or_path[-1]
+            else:
+                adapter_to_merge = model_args.adapter_name_or_path
+
+            for adapter in adapter_to_merge:
+                model: "LoraModel" = PeftModel.from_pretrained(
+                    model, adapter, offload_folder=model_args.offload_folder
+                )
+                model = model.merge_and_unload()
+
+            if len(adapter_to_merge) > 0:
+                logger.info("Merged {} adapter(s).".format(len(adapter_to_merge)))
+
+            if adapter_to_resume is not None:  # resume lora training
+                model = PeftModel.from_pretrained(
+                    model, adapter_to_resume, is_trainable=is_trainable, offload_folder=model_args.offload_folder
+                )
+
+        if is_trainable and adapter_to_resume is None:  # create new lora weights while training
+            if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all":
+                target_modules = find_all_linear_modules(model)
+            else:
+                target_modules = finetuning_args.lora_target
+
+            if finetuning_args.use_llama_pro:
+                target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable)
+
+            if (
+                    finetuning_args.use_dora
+                    and getattr(model, "quantization_method", None) is not None
+                    and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
+            ):
+                raise ValueError("DoRA is not compatible with PTQ-quantized models.")
+
+            peft_kwargs = {
+                "r": finetuning_args.lora_rank,
+                "target_modules": target_modules,
+                "lora_alpha": finetuning_args.lora_alpha,
+                "lora_dropout": finetuning_args.lora_dropout,
+                "use_rslora": finetuning_args.use_rslora,
+                "modules_to_save": finetuning_args.additional_target,
+            }
+
+            if model_args.use_unsloth:
+                from unsloth import FastLanguageModel  # type: ignore
+
+                unsloth_peft_kwargs = {
+                    "model": model,
+                    "max_seq_length": model_args.model_max_length,
+                    "use_gradient_checkpointing": "unsloth",
+                }
+                model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
+            else:
+                lora_config = LoraConfig(
+                    # task_type=TaskType.CAUSAL_LM,
+                    inference_mode=False,
+                    use_dora=finetuning_args.use_dora,
+                    **peft_kwargs,
+                )
+                model = get_peft_model(model, lora_config)
+
+        if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
+            for param in filter(lambda p: p.requires_grad, model.parameters()):
+                param.data = param.data.to(torch.float32)
+
+        if model_args.adapter_name_or_path is not None:
+            logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
+    return model
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 4935dd52..eeee69a6 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -1,22 +1,20 @@
 from typing import TYPE_CHECKING, Any, Dict
 
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModelForVision2Seq
 from trl import AutoModelForCausalLMWithValueHead
 
 from ..extras.constants import MOD_SUPPORTED_MODELS
 from ..extras.logging import get_logger
 from ..extras.misc import count_parameters, get_current_device, try_download_model_from_ms
-from .adapter import init_adapter
+from .adapter import init_adapter, init_mm_adapter
 from .patcher import patch_config, patch_model, patch_tokenizer, patch_valuehead_model
 from .utils import load_valuehead_params, register_autoclass
 
-
 if TYPE_CHECKING:
     from transformers import PreTrainedModel, PreTrainedTokenizer
 
     from ..hparams import FinetuningArguments, ModelArguments
 
-
 logger = get_logger(__name__)
 
 
@@ -57,12 +55,38 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
     return tokenizer
 
 
+def load_processor(model_args: "ModelArguments") -> "AutoProcessor":
+    r"""
+    Loads processor. Must be called before loading the model.
+
+    Note: including inplace operation of model_args.
+    """
+    init_kwargs = _get_init_kwargs(model_args)
+    try:
+        processor = AutoProcessor.from_pretrained(
+            model_args.model_name_or_path,
+            use_fast=model_args.use_fast_tokenizer,
+            split_special_tokens=model_args.split_special_tokens,
+            padding_side="right",
+            **init_kwargs,
+        )
+    except Exception:  # try the fast one
+        processor = AutoProcessor.from_pretrained(
+            model_args.model_name_or_path,
+            use_fast=True,
+            padding_side="right",
+            **init_kwargs,
+        )
+
+    return processor
+
+
 def load_model(
-    tokenizer: "PreTrainedTokenizer",
-    model_args: "ModelArguments",
-    finetuning_args: "FinetuningArguments",
-    is_trainable: bool = False,
-    add_valuehead: bool = False,
+        tokenizer: "PreTrainedTokenizer",
+        model_args: "ModelArguments",
+        finetuning_args: "FinetuningArguments",
+        is_trainable: bool = False,
+        add_valuehead: bool = False,
 ) -> "PreTrainedModel":
     r"""
     Loads pretrained model. Must after load_tokenizer.
@@ -159,3 +183,77 @@ def load_model(
             )
 
     return model
+
+
+def load_mm_model(
+        processor: "AutoProcessor",
+        model_args: "ModelArguments",
+        finetuning_args: "FinetuningArguments",
+        is_trainable: bool = False,
+        add_valuehead: bool = False,
+) -> "AutoModelForVision2Seq":
+    r"""
+    Loads pretrained multimodal model. Must be called after load_processor.
+    """
+    tokenizer = processor.tokenizer
+    init_kwargs = _get_init_kwargs(model_args)
+    config = AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
+    patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
+
+    model = None
+    if is_trainable and model_args.use_unsloth:
+        from unsloth import FastLanguageModel  # type: ignore
+
+        unsloth_kwargs = {
+            "model_name": model_args.model_name_or_path,
+            "max_seq_length": model_args.model_max_length,
+            "dtype": model_args.compute_dtype,
+            "load_in_4bit": model_args.quantization_bit == 4,
+            "token": model_args.hf_hub_token,
+            "device_map": {"": get_current_device()},
+            "rope_scaling": getattr(config, "rope_scaling", None),
+            "fix_tokenizer": False,
+            "trust_remote_code": True,
+        }
+        try:
+            model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
+        except NotImplementedError:
+            logger.warning("Unsloth does not support model type {}.".format(getattr(config, "model_type", None)))
+            model_args.use_unsloth = False
+
+        if model_args.adapter_name_or_path:
+            model_args.adapter_name_or_path = None
+            logger.warning("Unsloth does not support loading adapters.")
+    if model is None:
+        init_kwargs["config"] = config
+        init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path
+        model: "AutoModelForVision2Seq" = AutoModelForVision2Seq.from_pretrained(**init_kwargs)
+    patch_model(model, tokenizer, model_args, is_trainable)
+    register_autoclass(config, model, tokenizer)
+
+    model = init_mm_adapter(model, model_args, finetuning_args, is_trainable)
+
+    if not is_trainable:
+        model.requires_grad_(False)
+        model.eval()
+    else:
+        model.train()
+
+    trainable_params, all_param = count_parameters(model)
+    if is_trainable:
+        param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
+            trainable_params, all_param, 100 * trainable_params / all_param
+        )
+    else:
+        param_stats = "all params: {:d}".format(all_param)
+    logger.info(param_stats)
+
+    if model_args.print_param_status:
+        for name, param in model.named_parameters():
+            print(
+                "name: {}, dtype: {}, device: {}, trainable: {}".format(
+                    name, param.dtype, param.device, param.requires_grad
+                )
+            )
+
+    return model
diff --git a/src/llmtuner/train/sftmm/__init__.py b/src/llmtuner/train/sftmm/__init__.py
new file mode 100644
index 00000000..3eb8b2e2
--- /dev/null
+++ b/src/llmtuner/train/sftmm/__init__.py
@@ -0,0 +1,3 @@
+from .workflow import run_sft_mm
+
+__all__ = ["run_sft_mm"]
diff --git a/src/llmtuner/train/sftmm/collator.py b/src/llmtuner/train/sftmm/collator.py
new file mode 100644
index 00000000..e91374bc
--- /dev/null
+++ b/src/llmtuner/train/sftmm/collator.py
@@ -0,0 +1,69 @@
+import json
+import os
+from dataclasses import dataclass
+
+import torch
+from torch.utils.data import Dataset as Dataset_torch
+from datasets import Dataset
+from PIL import Image
+from transformers import AutoProcessor
+
+
+class ImageCaptioningDataset(Dataset_torch):
+    def __init__(self, dataset: Dataset, image_path: str, processor: AutoProcessor):
+        self.processor = processor
+        self.dataset = dataset
+        self.image_path = image_path
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def __getitem__(self, idx):
+        source = self.dataset[idx]
+        image_id = source['image']
+        image = Image.open(os.path.join(self.image_path, image_id))
+        convs = source['conversations']
+        prompt = convs[0]['value']
+        label = convs[1]['value']
+        image_inputs = self.processor(image, return_tensors="pt")
+        image_inputs = {k: v.squeeze() for k, v in image_inputs.items()}
+        inputs = {
+            "input_ids": prompt,
+            "labels": label,
+        }
+        for key in image_inputs:
+            inputs[key] = image_inputs[key]
+        return inputs
+
+
+@dataclass
+class DataCollatorForVis2Seq:
+    processor: AutoProcessor
+    use_qformer: bool = False
+
+    def __call__(self, features, return_tensors=None):
+        processed_batch = {}
+        for key in features[0].keys():
+            if key == 'pixel_values':
+                processed_batch[key] = torch.stack([example[key] for example in features])
+            elif key == 'input_ids':
+                text_inputs = self.processor.tokenizer(
+                    [example[key] for example in features], padding="max_length", return_tensors="pt",
+                    max_length=512,
+                )
+                processed_batch["input_ids"] = text_inputs["input_ids"]
+                processed_batch["attention_mask"] = text_inputs["attention_mask"]
+                if self.use_qformer:
+                    qformer_text_inputs = self.processor.qformer_tokenizer(
+                        [example[key] for example in features], padding="max_length", return_tensors="pt",
+                        max_length=512,
+                    )
+                    processed_batch["qformer_input_ids"] = qformer_text_inputs["input_ids"]
+                    processed_batch["qformer_attention_mask"] = qformer_text_inputs["attention_mask"]
+            elif key == 'labels':
+                text_inputs = self.processor.tokenizer(
+                    [example[key] for example in features], padding="max_length", return_tensors="pt",
+                    max_length=512,
+                )
+                processed_batch["labels"] = text_inputs["input_ids"]
+        return processed_batch
diff --git a/src/llmtuner/train/sftmm/metric.py b/src/llmtuner/train/sftmm/metric.py
new file mode 100644
index 00000000..d1af4c17
--- /dev/null
+++ b/src/llmtuner/train/sftmm/metric.py
@@ -0,0 +1,61 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union
+
+import numpy as np
+
+from ...extras.constants import IGNORE_INDEX
+from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available
+
+
+if TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+if is_jieba_available():
+    import jieba  # type: ignore
+
+if is_nltk_available():
+    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+
+if is_rouge_available():
+    from rouge_chinese import Rouge
+
+
+@dataclass
+class ComputeMetrics:
+    r"""
+    Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer.
+    """
+
+    tokenizer: "PreTrainedTokenizer"
+
+    def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]:
+        r"""
+        Uses the model predictions to compute metrics.
+        """
+        preds, labels = eval_preds
+        score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []}
+
+        preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id)
+        labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id)
+
+        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
+        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+        for pred, label in zip(decoded_preds, decoded_labels):
+            hypothesis = list(jieba.cut(pred))
+            reference = list(jieba.cut(label))
+
+            if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
+                result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
+            else:
+                rouge = Rouge()
+                scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
+                result = scores[0]
+
+            for k, v in result.items():
+                score_dict[k].append(round(v["f"] * 100, 4))
+
+            bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
+            score_dict["bleu-4"].append(round(bleu_score * 100, 4))
+
+        return {k: float(np.mean(v)) for k, v in score_dict.items()}
diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py
new file mode 100644
index 00000000..96b86b44
--- /dev/null
+++ b/src/llmtuner/train/sftmm/trainer.py
@@ -0,0 +1,137 @@
+import json
+import os
+from types import MethodType
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from transformers import Seq2SeqTrainer
+
+from ...extras.constants import IGNORE_INDEX
+from ...extras.logging import get_logger
+from ..utils import create_custom_optimzer, create_custom_scheduler
+
+if TYPE_CHECKING:
+    from transformers.trainer import PredictionOutput
+    from peft import PeftModelForCausalLM
+    from ...hparams import FinetuningArguments
+
+logger = get_logger(__name__)
+
+
+class CustomSeq2SeqTrainer(Seq2SeqTrainer):
+    r"""
+    Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE.
+    """
+
+    def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.finetuning_args = finetuning_args
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+
+    # def compute_loss(self, model, inputs, return_outputs=False):
+    #     print(inputs.keys())
+    #     device = "cuda"
+    #     input_ids = inputs.get("input_ids").to(device)
+    #     pixel_values = inputs.get("pixel_values").to(device, torch.float16)
+    #     attention_mask = inputs.get("attention_mask").to(device)
+    #     labels = inputs.get("labels").to(device)
+    #
+    #     outputs = model(input_ids=input_ids,
+    #                     pixel_values=pixel_values,
+    #                     labels=labels,
+    #                     # attention_mask=attention_mask,
+    #                     )
+    #     loss = outputs.loss
+    #     print("Loss:", loss.item())
+    #     return (loss, outputs) if return_outputs else loss
+
+    def create_optimizer(self) -> "torch.optim.Optimizer":
+        if self.optimizer is None:
+            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
+
+    def create_scheduler(
+            self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)
+        return super().create_scheduler(num_training_steps, optimizer)
+
+    def prediction_step(
+            self,
+            model: "torch.nn.Module",
+            inputs: Dict[str, Union[torch.Tensor, Any]],
+            prediction_loss_only: bool,
+            ignore_keys: Optional[List[str]] = None,
+    ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
+        r"""
+        Removes the prompt part in the generated tokens.
+
+        Subclass and override to inject custom behavior.
+        """
+        labels = inputs["labels"].detach().clone() if "labels" in inputs else None  # backup labels
+        if self.args.predict_with_generate:
+            assert self.tokenizer.padding_side == "left", "This method only accepts left-padded tensor."
+            prompt_len, label_len = inputs["input_ids"].size(-1), inputs["labels"].size(-1)
+            if prompt_len > label_len:
+                inputs["labels"] = self._pad_tensors_to_target_len(inputs["labels"], inputs["input_ids"])
+            if label_len > prompt_len:  # truncate the labels instead of padding the inputs (llama2 fp16 compatibility)
+                inputs["labels"] = inputs["labels"][:, :prompt_len]
+
+        loss, generated_tokens, _ = super().prediction_step(  # ignore the returned labels (may be truncated)
+            model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
+        )
+        if generated_tokens is not None and self.args.predict_with_generate:
+            generated_tokens[:, :prompt_len] = self.tokenizer.pad_token_id
+            generated_tokens = generated_tokens.contiguous()
+
+        return loss, generated_tokens, labels
+
+    def _pad_tensors_to_target_len(self, src_tensor: torch.Tensor, tgt_tensor: torch.Tensor) -> torch.Tensor:
+        r"""
+        Pads the tensor to the same length as the target tensor.
+        """
+        assert self.tokenizer.pad_token_id is not None, "Pad token is required."
+        padded_tensor = self.tokenizer.pad_token_id * torch.ones_like(tgt_tensor)
+        padded_tensor[:, -src_tensor.shape[-1]:] = src_tensor  # adopt left-padding
+        return padded_tensor.contiguous()  # in contiguous memory
+
+    def save_predictions(self, predict_results: "PredictionOutput") -> None:
+        r"""
+        Saves model predictions to `output_dir`.
+
+        A custom behavior that not contained in Seq2SeqTrainer.
+        """
+        if not self.is_world_process_zero():
+            return
+
+        output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl")
+        logger.info(f"Saving prediction results to {output_prediction_file}")
+
+        labels = np.where(
+            predict_results.label_ids != IGNORE_INDEX, predict_results.label_ids, self.tokenizer.pad_token_id
+        )
+        preds = np.where(
+            predict_results.predictions != IGNORE_INDEX, predict_results.predictions, self.tokenizer.pad_token_id
+        )
+
+        for i in range(len(preds)):
+            pad_len = np.nonzero(preds[i] != self.tokenizer.pad_token_id)[0]
+            if len(pad_len):
+                preds[i] = np.concatenate(
+                    (preds[i][pad_len[0]:], preds[i][: pad_len[0]]), axis=-1
+                )  # move pad token to last
+
+        decoded_labels = self.tokenizer.batch_decode(
+            labels, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
+        with open(output_prediction_file, "w", encoding="utf-8") as writer:
+            res: List[str] = []
+            for label, pred in zip(decoded_labels, decoded_preds):
+                res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False))
+            writer.write("\n".join(res))
diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py
new file mode 100644
index 00000000..9f952772
--- /dev/null
+++ b/src/llmtuner/train/sftmm/workflow.py
@@ -0,0 +1,105 @@
+# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py
+import os
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+from PIL import Image
+from torch.utils.data import Dataset
+from transformers import DataCollatorForSeq2Seq, LlavaNextForConditionalGeneration, AutoModelForVision2Seq
+
+from ...data import split_dataset, get_mm_dataset
+from ...extras.constants import IGNORE_INDEX
+from ...extras.misc import get_logits_processor
+from ...extras.ploting import plot_loss
+from ...model import load_model, load_tokenizer, load_processor, load_mm_model
+from ..utils import create_modelcard_and_push
+from .metric import ComputeMetrics
+from .trainer import CustomSeq2SeqTrainer
+from .collator import DataCollatorForVis2Seq, ImageCaptioningDataset
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+def run_sft_mm(
+        model_args: "ModelArguments",
+        data_args: "DataArguments",
+        training_args: "Seq2SeqTrainingArguments",
+        finetuning_args: "FinetuningArguments",
+        generating_args: "GeneratingArguments",
+        callbacks: Optional[List["TrainerCallback"]] = None,
+):
+    processor = load_processor(model_args)
+    tokenizer = processor.tokenizer
+    model = load_mm_model(processor, model_args, finetuning_args, training_args.do_train)
+    dataset = get_mm_dataset(processor, model_args, data_args, training_args, stage="sft")
+    if training_args.predict_with_generate:
+        tokenizer.padding_side = "left"  # use left-padding in generation
+    if getattr(model, "is_quantized", False) and not training_args.do_train:
+        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
+    splited_dataset = split_dataset(dataset, data_args, training_args)
+    splited_dataset['train_dataset'].set_format(type=splited_dataset['train_dataset'].format["type"],
+                                                columns=list(splited_dataset['train_dataset'].features.keys()))
+    splited_dataset['eval_dataset'].set_format(type=splited_dataset['eval_dataset'].format["type"],
+                                               columns=list(splited_dataset['eval_dataset'].features.keys()))
+    train_dataset = ImageCaptioningDataset(splited_dataset['train_dataset'], data_args.image_path, processor)
+    eval_dataset = ImageCaptioningDataset(splited_dataset['eval_dataset'], data_args.image_path, processor)
+    data_collator = DataCollatorForVis2Seq(
+        processor=processor,
+        use_qformer=model_args.use_qformer,
+    )
+
+    # Override the decoding parameters of Seq2SeqTrainer
+    training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
+    training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
+
+    # Initialize our Trainer
+    trainer = CustomSeq2SeqTrainer(
+        model=model,
+        args=training_args,
+        finetuning_args=finetuning_args,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
+
+    # Keyword arguments for `model.generate`
+    gen_kwargs = generating_args.to_dict()
+    gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids
+    gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
+    gen_kwargs["logits_processor"] = get_logits_processor()
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        trainer.save_model()
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            plot_loss(training_args.output_dir, keys=["loss", "eval_loss"])
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
+        if training_args.predict_with_generate:  # eval_loss will be wrong if predict_with_generate is enabled
+            metrics.pop("eval_loss", None)
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Predict
+    if training_args.do_predict:
+        predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs)
+        if training_args.predict_with_generate:  # predict_loss will be wrong if predict_with_generate is enabled
+            predict_results.metrics.pop("predict_loss", None)
+        trainer.log_metrics("predict", predict_results.metrics)
+        trainer.save_metrics("predict", predict_results.metrics)
+        trainer.save_predictions(predict_results)
+
+    # Create model card
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index a8a2b8e9..ac56289c 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -14,12 +14,11 @@ from .ppo import run_ppo
 from .pt import run_pt
 from .rm import run_rm
 from .sft import run_sft
-
+from .sftmm import run_sft_mm
 
 if TYPE_CHECKING:
     from transformers import TrainerCallback
 
-
 logger = get_logger(__name__)
 
 
@@ -31,6 +30,8 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["Tra
         run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
     elif finetuning_args.stage == "sft":
         run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
+    elif finetuning_args.stage == "sft_mm":
+        run_sft_mm(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
     elif finetuning_args.stage == "rm":
         run_rm(model_args, data_args, training_args, finetuning_args, callbacks)
     elif finetuning_args.stage == "ppo":

From 0b99b13786f27ef7253332b0d820d68a0e304b31 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Tue, 23 Apr 2024 18:47:03 +0800
Subject: [PATCH 133/341] add multimodal LLM BLIP-2 and InstructBLIP

Former-commit-id: b78b5f290aa38a7454e101ee9703fb6fac5064ac
---
 examples/mllm/sft_instructblip.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh
index a4330a84..055c639a 100644
--- a/examples/mllm/sft_instructblip.sh
+++ b/examples/mllm/sft_instructblip.sh
@@ -31,5 +31,4 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --plot_loss \
     --quantization_bit 8 \
     --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 \
-    --use_qformer
-
+    --use_qformer
\ No newline at end of file

From 1451297c78f4a75f0b05884ca1d02aa08d561d61 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Tue, 23 Apr 2024 19:22:42 +0800
Subject: [PATCH 134/341] add multimodal LLM BLIP-2 and InstructBLIP

Former-commit-id: 67800c565b086f362b8cf131b0c9babaa7a7ebc7
---
 examples/mllm/sft_blip2.sh        | 5 ++---
 examples/mllm/sft_instructblip.sh | 4 ++--
 src/llmtuner/data/loader.py       | 1 -
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/examples/mllm/sft_blip2.sh b/examples/mllm/sft_blip2.sh
index 416bb9cd..ac0a3f11 100644
--- a/examples/mllm/sft_blip2.sh
+++ b/examples/mllm/sft_blip2.sh
@@ -14,7 +14,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --overwrite_output_dir \
     --cutoff_len 1024 \
     --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
+    --per_device_train_batch_size 4 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 8 \
     --lr_scheduler_type cosine \
@@ -30,5 +30,4 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --val_size 0.1 \
     --plot_loss \
     --quantization_bit 8 \
-    --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017
-
+    --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017
\ No newline at end of file
diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh
index 055c639a..92478500 100644
--- a/examples/mllm/sft_instructblip.sh
+++ b/examples/mllm/sft_instructblip.sh
@@ -14,7 +14,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --overwrite_output_dir \
     --cutoff_len 1024 \
     --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
+    --per_device_train_batch_size 4 \
     --per_device_eval_batch_size 1 \
     --gradient_accumulation_steps 8 \
     --lr_scheduler_type cosine \
@@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --eval_steps 100 \
     --evaluation_strategy steps \
     --load_best_model_at_end \
-    --learning_rate 5e-5 \
+    --learning_rate 1e-5 \
     --num_train_epochs 3.0 \
     --max_samples 3000 \
     --val_size 0.1 \
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index b7377379..b3af434b 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -184,7 +184,6 @@ def get_mm_dataset(
         training_args: "Seq2SeqTrainingArguments",
         stage: Literal["pt", "sft", "rm", "ppo"],
 ) -> Union["Dataset", "IterableDataset"]:
-    tokenizer = processor.tokenizer
     if data_args.tokenized_path is not None:
         if has_tokenized_data(data_args.tokenized_path):
             logger.warning("Loading dataset from disk will ignore other data arguments.")

From 1e4010a1fb019b1d802685bd6c24a6aea5ecaf5b Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 00:28:53 +0800
Subject: [PATCH 135/341] support phi-3

Former-commit-id: 7e8ffa9beee3893e051ceeade443bd56c4a07b1c
---
 README.md                        |  5 +++--
 README_zh.md                     |  5 +++--
 src/llmtuner/data/template.py    |  9 +++++++++
 src/llmtuner/extras/constants.py | 14 ++++++++++++++
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index faa1c7d8..0bf9f731 100644
--- a/README.md
+++ b/README.md
@@ -153,6 +153,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
+| [Phi-3](https://huggingface.co/microsoft)                | 3.8B                        | qkv_proj          | phi       |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
 | [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
@@ -333,7 +334,7 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 
 </details>
 
-### Train with LLaMA Board GUI
+### Train with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
 
 > [!IMPORTANT]
 > LLaMA Board GUI only supports training on a single GPU, please use [CLI](#command-line-interface) for distributed training.
@@ -458,7 +459,7 @@ If you have a project that should be incorporated, please contact via email or c
 
 This repository is licensed under the [Apache-2.0 License](LICENSE).
 
-Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## Citation
 
diff --git a/README_zh.md b/README_zh.md
index 1b4e3f1a..69ba2562 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -153,6 +153,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
+| [Phi-3](https://huggingface.co/microsoft)                | 3.8B                        | qkv_proj          | phi       |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
 | [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
@@ -333,7 +334,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 </details>
 
-### 利用 LLaMA Board 可视化界面训练
+### 利用 LLaMA Board 可视化界面训练（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
 
 > [!IMPORTANT]
 > LLaMA Board 可视化界面目前仅支持单 GPU 训练，请使用[命令行接口](#命令行接口)来进行多 GPU 分布式训练。
@@ -458,7 +459,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 
 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。
 
-使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## 引用
 
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 04538510..cd567a7b 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -718,6 +718,15 @@ _register_template(
 )
 
 
+_register_template(
+    name="phi",
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
+    format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]),
+    format_separator=EmptyFormatter(slots=["<|end|>\n"]),
+    default_system="You are a helpful AI assistant.",
+)
+
+
 _register_template(
     name="qwen",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index a0e51d17..38d715f5 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -652,6 +652,20 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "Phi3-3.8B-4k-Chat": {
+            DownloadSource.DEFAULT: "microsoft/Phi-3-mini-4k-instruct",
+        },
+        "Phi3-3.8B-128k-Chat": {
+            DownloadSource.DEFAULT: "microsoft/Phi-3-mini-128k-instruct",
+        },
+    },
+    module="qkv_proj",
+    template="phi",
+)
+
+
 register_model_group(
     models={
         "Qwen-1.8B": {

From 35c4a2c212e6fe0b1df464236555b107e5dba620 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 01:30:16 +0800
Subject: [PATCH 136/341] fix #3347 #3387

Former-commit-id: c253c18185a29b59190f3e0ed236c2bb4c788085
---
 src/llmtuner/chat/vllm_engine.py | 18 ++++++++++++------
 src/llmtuner/model/__init__.py   |  3 ++-
 src/llmtuner/model/loader.py     | 23 +++++++++++++++++------
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index e924ef6e..67a19b68 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -2,9 +2,9 @@ import uuid
 from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence
 
 from ..data import get_template_and_fix_tokenizer
-from ..extras.misc import get_device_count
+from ..extras.misc import get_device_count, infer_optim_dtype
 from ..extras.packages import is_vllm_available
-from ..model import load_tokenizer
+from ..model import load_config, load_tokenizer
 from .base_engine import BaseEngine, Response
 
 
@@ -23,10 +23,20 @@ class VllmEngine(BaseEngine):
         finetuning_args: "FinetuningArguments",
         generating_args: "GeneratingArguments",
     ) -> None:
+        config = load_config(model_args)  # may download model from ms hub
+        load_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
+
         self.can_generate = finetuning_args.stage == "sft"
+        self.tokenizer = load_tokenizer(model_args)
+        self.tokenizer.padding_side = "left"
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)
+        self.generating_args = generating_args.to_dict()
+
         engine_args = AsyncEngineArgs(
             model=model_args.model_name_or_path,
             trust_remote_code=True,
+            download_dir=model_args.cache_dir,
+            dtype=str(load_dtype).split(".")[-1],
             max_model_len=model_args.vllm_maxlen,
             tensor_parallel_size=get_device_count() or 1,
             gpu_memory_utilization=model_args.vllm_gpu_util,
@@ -35,10 +45,6 @@ class VllmEngine(BaseEngine):
             enforce_eager=model_args.vllm_enforce_eager,
         )
         self.model = AsyncLLMEngine.from_engine_args(engine_args)
-        self.tokenizer = load_tokenizer(model_args)
-        self.tokenizer.padding_side = "left"
-        self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)
-        self.generating_args = generating_args.to_dict()
 
     async def _generate(
         self,
diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index 1eaf4271..e0b1c9cd 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,8 +1,9 @@
-from .loader import load_model, load_tokenizer
+from .loader import load_config, load_model, load_tokenizer
 from .utils import find_all_linear_modules, load_valuehead_params
 
 
 __all__ = [
+    "load_config",
     "load_model",
     "load_tokenizer",
     "load_valuehead_params",
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 4935dd52..57f5a763 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -12,7 +12,7 @@ from .utils import load_valuehead_params, register_autoclass
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel, PreTrainedTokenizer
+    from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
 
     from ..hparams import FinetuningArguments, ModelArguments
 
@@ -21,6 +21,11 @@ logger = get_logger(__name__)
 
 
 def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
+    r"""
+    Gets arguments to load config/tokenizer/model.
+
+    Note: including inplace operation of model_args.
+    """
     model_args.model_name_or_path = try_download_model_from_ms(model_args)
     return {
         "trust_remote_code": True,
@@ -32,9 +37,7 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
 
 def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
     r"""
-    Loads pretrained tokenizer. Must before load_model.
-
-    Note: including inplace operation of model_args.
+    Loads pretrained tokenizer.
     """
     init_kwargs = _get_init_kwargs(model_args)
     try:
@@ -57,6 +60,14 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
     return tokenizer
 
 
+def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
+    r"""
+    Loads model config.
+    """
+    init_kwargs = _get_init_kwargs(model_args)
+    return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
+
+
 def load_model(
     tokenizer: "PreTrainedTokenizer",
     model_args: "ModelArguments",
@@ -65,10 +76,10 @@ def load_model(
     add_valuehead: bool = False,
 ) -> "PreTrainedModel":
     r"""
-    Loads pretrained model. Must after load_tokenizer.
+    Loads pretrained model.
     """
     init_kwargs = _get_init_kwargs(model_args)
-    config = AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
+    config = load_config(model_args)
     patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
 
     model = None

From d2bb1b3a6b0d517d0886058f6405797582f9cf2d Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 02:18:44 +0800
Subject: [PATCH 137/341] reenable sdpa and fast tok by default

Former-commit-id: 9e00902dbedc71d55743d1bf237843506a557891
---
 README.md                            |  4 +--
 README_zh.md                         |  4 +--
 requirements.txt                     |  1 +
 src/llmtuner/extras/packages.py      | 19 ++++++++---
 src/llmtuner/hparams/model_args.py   |  8 ++---
 src/llmtuner/model/patcher.py        | 49 ++++++++++++++++++++++------
 src/llmtuner/webui/components/top.py |  2 +-
 src/llmtuner/webui/runner.py         |  4 +--
 8 files changed, 64 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 0bf9f731..970dd8fc 100644
--- a/README.md
+++ b/README.md
@@ -72,8 +72,6 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 [24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See `examples/extras/mod` for usage.
 
-[24/04/19] We supported **Meta Llama 3** model series.
-
 [24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See `examples/extras/badam` for usage.
 
 [24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
@@ -112,7 +110,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 [23/09/23] We integrated MMLU, C-Eval and CMMLU benchmarks in this repo. See [this example](#evaluation) to evaluate your models.
 
-[23/09/10] We supported **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**. Try `--flash_attn` argument to enable FlashAttention-2 if you are using RTX4090, A100 or H100 GPUs.
+[23/09/10] We supported **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**. Try `--flash_attn fa2` argument to enable FlashAttention-2 if you are using RTX4090, A100 or H100 GPUs.
 
 [23/08/12] We supported **RoPE scaling** to extend the context length of the LLaMA models. Try `--rope_scaling linear` argument in training and `--rope_scaling dynamic` argument at inference to extrapolate the position embeddings.
 
diff --git a/README_zh.md b/README_zh.md
index 69ba2562..583c89ca 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -72,8 +72,6 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 [24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 `examples/extras/mod`。
 
-[24/04/19] 我们支持了 **Meta Llama 3** 系列模型。
-
 [24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 `examples/extras/badam`。
 
 [24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 Llama-2-7B-56k）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
@@ -112,7 +110,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 [23/09/23] 我们在项目中集成了 MMLU、C-Eval 和 CMMLU 评估集。使用方法请参阅[此示例](#模型评估)。
 
-[23/09/10] 我们支持了 **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**。如果您使用的是 RTX4090、A100 或 H100 GPU，请使用 `--flash_attn` 参数以启用 FlashAttention-2。
+[23/09/10] 我们支持了 **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**。如果您使用的是 RTX4090、A100 或 H100 GPU，请使用 `--flash_attn fa2` 参数以启用 FlashAttention-2。
 
 [23/08/12] 我们支持了 **RoPE 插值**来扩展 LLaMA 模型的上下文长度。请使用 `--rope_scaling linear` 参数训练模型或使用 `--rope_scaling dynamic` 参数评估模型。
 
diff --git a/requirements.txt b/requirements.txt
index 3928d28d..ecba3ce1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,4 @@ fastapi
 sse-starlette
 matplotlib
 fire
+packaging
diff --git a/src/llmtuner/extras/packages.py b/src/llmtuner/extras/packages.py
index 8494cb2c..aeeba084 100644
--- a/src/llmtuner/extras/packages.py
+++ b/src/llmtuner/extras/packages.py
@@ -1,16 +1,23 @@
 import importlib.metadata
 import importlib.util
+from typing import TYPE_CHECKING
+
+from packaging import version
+
+
+if TYPE_CHECKING:
+    from packaging.version import Version
 
 
 def _is_package_available(name: str) -> bool:
     return importlib.util.find_spec(name) is not None
 
 
-def _get_package_version(name: str) -> str:
+def _get_package_version(name: str) -> "Version":
     try:
-        return importlib.metadata.version(name)
+        return version.parse(importlib.metadata.version(name))
     except Exception:
-        return "0.0.0"
+        return version.parse("0.0.0")
 
 
 def is_fastapi_availble():
@@ -18,7 +25,7 @@ def is_fastapi_availble():
 
 
 def is_flash_attn2_available():
-    return _is_package_available("flash_attn") and _get_package_version("flash_attn").startswith("2")
+    return _is_package_available("flash_attn") and _get_package_version("flash_attn") > version.parse("2.0.0")
 
 
 def is_galore_available():
@@ -49,6 +56,10 @@ def is_rouge_available():
     return _is_package_available("rouge_chinese")
 
 
+def is_sdpa_available():
+    return _get_package_version("torch") > version.parse("2.1.1")
+
+
 def is_starlette_available():
     return _is_package_available("sse_starlette")
 
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index 0e42033f..eb6366d9 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -22,7 +22,7 @@ class ModelArguments:
         metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."},
     )
     use_fast_tokenizer: bool = field(
-        default=False,
+        default=True,
         metadata={"help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."},
     )
     resize_vocab: bool = field(
@@ -61,9 +61,9 @@ class ModelArguments:
         default=None,
         metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."},
     )
-    flash_attn: bool = field(
-        default=False,
-        metadata={"help": "Enable FlashAttention for faster training."},
+    flash_attn: Literal["off", "sdpa", "fa2", "auto"] = field(
+        default="auto",
+        metadata={"help": "Enable FlashAttention for faster training and inference."},
     )
     shift_attn: bool = field(
         default=False,
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 53616dd9..6c79992a 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -15,7 +15,7 @@ from transformers.utils.versions import require_version
 from ..extras.constants import FILEEXT2TYPE, LAYERNORM_NAMES
 from ..extras.logging import get_logger
 from ..extras.misc import get_current_device, infer_optim_dtype
-from ..extras.packages import is_flash_attn2_available
+from ..extras.packages import is_flash_attn2_available, is_sdpa_available
 from ..extras.patches.llama_patch import apply_llama_patch
 from .utils import QuantizationMethod, add_z3_leaf_module, gradient_checkpointing_enable
 
@@ -62,18 +62,45 @@ def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "Mod
 
 
 def _configure_attn_implementation(config: "PretrainedConfig", model_args: "ModelArguments") -> None:
-    if model_args.flash_attn:
-        if not is_flash_attn2_available():
-            logger.warning("FlashAttention2 is not installed.")
+    if model_args.flash_attn == "auto":
+        return
+
+    elif model_args.flash_attn == "off":
+        requested_attn_implementation = "eager"
+
+    elif model_args.flash_attn == "sdpa":
+        if not is_sdpa_available():
+            logger.warning("Torch>=2.1.1 is required for SDPA attention.")
             return
 
-        logger.info("Using FlashAttention-2 for faster training and inference.")
-        if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
-            setattr(config, "attn_implementation", "flash_attention_2")
-        else:
-            setattr(config, "_attn_implementation", "flash_attention_2")
+        requested_attn_implementation = "sdpa"
+    elif model_args.flash_attn == "fa2":
+        if not is_flash_attn2_available():
+            logger.warning("FlashAttention-2 is not installed.")
+            return
+
+        requested_attn_implementation = "flash_attention_2"
     else:
-        setattr(config, "_attn_implementation", "eager")
+        raise NotImplementedError("Unknown attention type: {}".format(model_args.flash_attn))
+
+    if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
+        setattr(config, "attn_implementation", requested_attn_implementation)
+    else:
+        setattr(config, "_attn_implementation", requested_attn_implementation)
+
+
+def _print_attn_implementation(config: "PretrainedConfig") -> None:
+    if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
+        attn_implementation = getattr(config, "attn_implementation", None)
+    else:
+        attn_implementation = getattr(config, "_attn_implementation", None)
+
+    if attn_implementation == "flash_attention_2":
+        logger.info("Using FlashAttention-2 for faster training and inference.")
+    elif attn_implementation == "sdpa":
+        logger.info("Using torch SDPA for faster training and inference.")
+    else:
+        logger.info("Using vanilla Attention implementation.")
 
 
 def _configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
@@ -365,6 +392,8 @@ def patch_model(
 
         add_z3_leaf_module(model, Qwen2MoeSparseMoeBlock)
 
+    _print_attn_implementation(model.config)
+
     try:
         model.add_model_tags(["llama-factory"])
     except Exception:
diff --git a/src/llmtuner/webui/components/top.py b/src/llmtuner/webui/components/top.py
index 6cbf6e0d..c67d7cc5 100644
--- a/src/llmtuner/webui/components/top.py
+++ b/src/llmtuner/webui/components/top.py
@@ -33,7 +33,7 @@ def create_top() -> Dict[str, "Component"]:
             quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none")
             template = gr.Dropdown(choices=list(templates.keys()), value="default")
             rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none")
-            booster = gr.Radio(choices=["none", "flashattn", "unsloth"], value="none")
+            booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none")
 
     model_name.change(list_adapters, [model_name, finetuning_type], [adapter_path], queue=False).then(
         get_model_path, [model_name], [model_path], queue=False
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index ec493c96..b64a015c 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -67,7 +67,7 @@ class Runner:
         if not model_path:
             return ALERTS["err_no_path"][lang]
 
-        if len(dataset) == 0:
+        if not dataset:
             return ALERTS["err_no_dataset"][lang]
 
         if not from_preview and self.demo_mode:
@@ -122,7 +122,7 @@ class Runner:
             quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None,
             template=get("top.template"),
             rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None,
-            flash_attn=(get("top.booster") == "flashattn"),
+            flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
             use_unsloth=(get("top.booster") == "unsloth"),
             dataset_dir=get("train.dataset_dir"),
             dataset=",".join(get("train.dataset")),

From 03f2e3284ae2e5216ddf5746d51bb9d7efcb947b Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 03:02:23 +0800
Subject: [PATCH 138/341] refactor patcher

Former-commit-id: 263cfe1294f5c3188f5e8d65791f35ee0d87315a
---
 src/llmtuner/extras/constants.py              |   2 +
 src/llmtuner/model/__init__.py                |   2 +-
 src/llmtuner/model/adapter.py                 |   3 +-
 src/llmtuner/model/loader.py                  |   2 +-
 src/llmtuner/model/patcher.py                 | 325 +-----------------
 .../patches => model/utils}/__init__.py       |   0
 src/llmtuner/model/utils/attention.py         |  55 +++
 src/llmtuner/model/utils/checkpointing.py     |  94 +++++
 src/llmtuner/model/utils/embedding.py         |  56 +++
 .../utils/longlora.py}                        | 151 +++++++-
 .../model/{utils.py => utils/misc.py}         |  74 +---
 src/llmtuner/model/utils/moe.py               |  39 +++
 src/llmtuner/model/utils/quantization.py      | 146 ++++++++
 src/llmtuner/model/utils/rope.py              |  43 +++
 14 files changed, 598 insertions(+), 394 deletions(-)
 rename src/llmtuner/{extras/patches => model/utils}/__init__.py (100%)
 create mode 100644 src/llmtuner/model/utils/attention.py
 create mode 100644 src/llmtuner/model/utils/checkpointing.py
 create mode 100644 src/llmtuner/model/utils/embedding.py
 rename src/llmtuner/{extras/patches/llama_patch.py => model/utils/longlora.py} (58%)
 rename src/llmtuner/model/{utils.py => utils/misc.py} (61%)
 create mode 100644 src/llmtuner/model/utils/moe.py
 create mode 100644 src/llmtuner/model/utils/quantization.py
 create mode 100644 src/llmtuner/model/utils/rope.py

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 38d715f5..0a29f971 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -47,6 +47,8 @@ TRAINING_STAGES = {
 
 STAGES_USE_PAIR_DATA = ["rm", "dpo", "orpo"]
 
+SUPPORTED_CLASS_FOR_S2ATTN = ["llama"]
+
 V_HEAD_WEIGHTS_NAME = "value_head.bin"
 
 V_HEAD_SAFE_WEIGHTS_NAME = "value_head.safetensors"
diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index e0b1c9cd..1824f084 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,5 +1,5 @@
 from .loader import load_config, load_model, load_tokenizer
-from .utils import find_all_linear_modules, load_valuehead_params
+from .utils.misc import find_all_linear_modules, load_valuehead_params
 
 
 __all__ = [
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index f73666d5..efc63cde 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -5,7 +5,8 @@ from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model
 from transformers.integrations import is_deepspeed_zero3_enabled
 
 from ..extras.logging import get_logger
-from .utils import QuantizationMethod, find_all_linear_modules, find_expanded_modules
+from .utils.misc import find_all_linear_modules, find_expanded_modules
+from .utils.quantization import QuantizationMethod
 
 
 if TYPE_CHECKING:
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 57f5a763..b8558542 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -8,7 +8,7 @@ from ..extras.logging import get_logger
 from ..extras.misc import count_parameters, get_current_device, try_download_model_from_ms
 from .adapter import init_adapter
 from .patcher import patch_config, patch_model, patch_tokenizer, patch_valuehead_model
-from .utils import load_valuehead_params, register_autoclass
+from .utils.misc import load_valuehead_params, register_autoclass
 
 
 if TYPE_CHECKING:
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 6c79992a..c0166a8a 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -1,23 +1,20 @@
-import math
-import os
-import random
-from contextlib import nullcontext
 from types import MethodType
-from typing import TYPE_CHECKING, Any, Dict, List, Tuple
+from typing import TYPE_CHECKING, Any, Dict
 
 import torch
-from datasets import load_dataset
 from peft import PeftModel
-from transformers import BitsAndBytesConfig, GPTQConfig, PreTrainedModel, PreTrainedTokenizerBase
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
 from transformers.integrations import is_deepspeed_zero3_enabled
-from transformers.utils.versions import require_version
 
-from ..extras.constants import FILEEXT2TYPE, LAYERNORM_NAMES
 from ..extras.logging import get_logger
-from ..extras.misc import get_current_device, infer_optim_dtype
-from ..extras.packages import is_flash_attn2_available, is_sdpa_available
-from ..extras.patches.llama_patch import apply_llama_patch
-from .utils import QuantizationMethod, add_z3_leaf_module, gradient_checkpointing_enable
+from ..extras.misc import infer_optim_dtype
+from .utils.attention import configure_attn_implementation, print_attn_implementation
+from .utils.checkpointing import prepare_model_for_training
+from .utils.embedding import resize_embedding_layer
+from .utils.longlora import configure_longlora
+from .utils.moe import add_z3_leaf_module
+from .utils.quantization import configure_quantization
+from .utils.rope import configure_rope
 
 
 if TYPE_CHECKING:
@@ -28,282 +25,6 @@ if TYPE_CHECKING:
 
 
 logger = get_logger(__name__)
-SUPPORTED_CLASS_FOR_S2ATTN = ["llama"]
-
-
-def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]:
-    r"""
-    Inspired by: https://github.com/huggingface/optimum/blob/v1.16.0/optimum/gptq/data.py#L133
-    TODO: remove tokenizer.decode() https://github.com/huggingface/optimum/pull/1600
-    """
-    if os.path.isfile(model_args.export_quantization_dataset):
-        data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None)
-        data_files = model_args.export_quantization_dataset
-    else:
-        data_path = model_args.export_quantization_dataset
-        data_files = None
-
-    dataset = load_dataset(path=data_path, data_files=data_files, split="train", cache_dir=model_args.cache_dir)
-    maxlen = model_args.export_quantization_maxlen
-
-    samples = []
-    for _ in range(model_args.export_quantization_nsamples):
-        while True:
-            sample_idx = random.randint(0, len(dataset) - 1)
-            sample: Dict[str, torch.Tensor] = tokenizer(dataset[sample_idx]["text"], return_tensors="pt")
-            if sample["input_ids"].size(1) >= maxlen:
-                break  # TODO: fix large maxlen
-
-        word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen - 1)
-        input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen]
-        samples.append(tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True))
-
-    return samples
-
-
-def _configure_attn_implementation(config: "PretrainedConfig", model_args: "ModelArguments") -> None:
-    if model_args.flash_attn == "auto":
-        return
-
-    elif model_args.flash_attn == "off":
-        requested_attn_implementation = "eager"
-
-    elif model_args.flash_attn == "sdpa":
-        if not is_sdpa_available():
-            logger.warning("Torch>=2.1.1 is required for SDPA attention.")
-            return
-
-        requested_attn_implementation = "sdpa"
-    elif model_args.flash_attn == "fa2":
-        if not is_flash_attn2_available():
-            logger.warning("FlashAttention-2 is not installed.")
-            return
-
-        requested_attn_implementation = "flash_attention_2"
-    else:
-        raise NotImplementedError("Unknown attention type: {}".format(model_args.flash_attn))
-
-    if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
-        setattr(config, "attn_implementation", requested_attn_implementation)
-    else:
-        setattr(config, "_attn_implementation", requested_attn_implementation)
-
-
-def _print_attn_implementation(config: "PretrainedConfig") -> None:
-    if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
-        attn_implementation = getattr(config, "attn_implementation", None)
-    else:
-        attn_implementation = getattr(config, "_attn_implementation", None)
-
-    if attn_implementation == "flash_attention_2":
-        logger.info("Using FlashAttention-2 for faster training and inference.")
-    elif attn_implementation == "sdpa":
-        logger.info("Using torch SDPA for faster training and inference.")
-    else:
-        logger.info("Using vanilla Attention implementation.")
-
-
-def _configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
-    if model_args.rope_scaling is None:
-        return
-
-    if not hasattr(config, "rope_scaling"):
-        logger.warning("Current model does not support RoPE scaling.")
-        return
-
-    if is_trainable:
-        if model_args.rope_scaling == "dynamic":
-            logger.warning(
-                "Dynamic NTK scaling may not work well with fine-tuning. "
-                "See: https://github.com/huggingface/transformers/pull/24653"
-            )
-
-        current_max_length = getattr(config, "max_position_embeddings", None)
-        if current_max_length and model_args.model_max_length > current_max_length:
-            scaling_factor = float(math.ceil(model_args.model_max_length / current_max_length))
-        else:
-            logger.warning("Input length is smaller than max length. Consider increase input length.")
-            scaling_factor = 1.0
-    else:
-        scaling_factor = 2.0
-
-    setattr(config, "rope_scaling", {"type": model_args.rope_scaling, "factor": scaling_factor})
-    logger.info(
-        "Using {} scaling strategy and setting scaling factor to {}".format(model_args.rope_scaling, scaling_factor)
-    )
-
-
-def _configure_longlora(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
-    if not is_trainable or not model_args.shift_attn:
-        return
-
-    if getattr(config, "model_type", None) in SUPPORTED_CLASS_FOR_S2ATTN:
-        setattr(config, "group_size_ratio", 0.25)
-        apply_llama_patch()
-        logger.info("Using shift short attention with group_size_ratio=1/4.")
-    else:
-        logger.warning("Current model does not support shift short attention.")
-
-
-def _configure_quantization(
-    config: "PretrainedConfig",
-    tokenizer: "PreTrainedTokenizer",
-    model_args: "ModelArguments",
-    init_kwargs: Dict[str, Any],
-) -> None:
-    r"""
-    Priority: PTQ-quantized (training) > AutoGPTQ (export) > Bitsandbytes (training)
-    """
-    if getattr(config, "quantization_config", None):  # ptq
-        if is_deepspeed_zero3_enabled():
-            raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantized models.")
-
-        if model_args.quantization_device_map != "auto":
-            init_kwargs["device_map"] = {"": get_current_device()}
-
-        quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None)
-        quant_method = quantization_config.get("quant_method", "")
-
-        if quant_method == QuantizationMethod.GPTQ:
-            require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0")
-            quantization_config.pop("disable_exllama", None)  # remove deprecated args
-            quantization_config["use_exllama"] = False  # disable exllama
-
-        if quant_method == QuantizationMethod.AWQ:
-            require_version("autoawq", "To fix: pip install autoawq")
-
-        if quant_method == QuantizationMethod.AQLM:
-            require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0")
-            require_version("aqlm>=1.1.0", "To fix: pip install aqlm[gpu]>=1.1.0")
-            quantization_config["bits"] = 2
-
-        quant_bits = quantization_config.get("bits", "?")
-        logger.info("Loading {}-bit {}-quantized model.".format(quant_bits, quant_method.upper()))
-
-    elif model_args.export_quantization_bit is not None:  # auto-gptq
-        require_version("optimum>=1.16.0", "To fix: pip install optimum>=1.16.0")
-        require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0")
-        from accelerate.utils import get_max_memory
-
-        if getattr(config, "model_type", None) == "chatglm":
-            raise ValueError("ChatGLM model is not supported.")
-
-        init_kwargs["quantization_config"] = GPTQConfig(
-            bits=model_args.export_quantization_bit,
-            tokenizer=tokenizer,
-            dataset=_get_quantization_dataset(tokenizer, model_args),
-        )
-        init_kwargs["device_map"] = "auto"
-        init_kwargs["max_memory"] = get_max_memory()
-        logger.info("Quantizing model to {} bit.".format(model_args.export_quantization_bit))
-
-    elif model_args.quantization_bit is not None:  # bnb
-        if model_args.quantization_bit == 8:
-            require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0")
-            init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
-
-        elif model_args.quantization_bit == 4:
-            require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0")
-            init_kwargs["quantization_config"] = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=model_args.compute_dtype,
-                bnb_4bit_use_double_quant=model_args.double_quantization,
-                bnb_4bit_quant_type=model_args.quantization_type,
-                bnb_4bit_quant_storage=model_args.compute_dtype,  # crucial for fsdp qlora
-            )
-
-        if is_deepspeed_zero3_enabled() or model_args.quantization_device_map == "auto":
-            if model_args.quantization_bit != 4:
-                raise ValueError("Only 4-bit quantized model can use auto device map.")
-
-            require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0")
-            require_version("accelerate>=0.28.0", "To fix: pip install accelerate>=0.28.0")
-            require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0")
-        else:
-            init_kwargs["device_map"] = {"": get_current_device()}
-
-        logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit))
-
-
-def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int):
-    embedding_dim = embed_weight.size(1)
-    avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)
-    noise_weight = torch.empty_like(embed_weight[-num_new_tokens:])
-    noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim)))
-    embed_weight[-num_new_tokens:] = avg_weight + noise_weight
-
-
-def _resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None:
-    r"""
-    Resize token embeddings.
-    """
-    if is_deepspeed_zero3_enabled():
-        import deepspeed  # type: ignore
-
-        params = [model.get_input_embeddings().weight]
-        if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings:
-            params.append(model.get_output_embeddings().weight)
-
-        context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0)
-    else:
-        context_maybe_zero3 = nullcontext()
-
-    with context_maybe_zero3:
-        current_embedding_size = model.get_input_embeddings().weight.size(0)
-
-    if len(tokenizer) > current_embedding_size:
-        if not isinstance(model.get_output_embeddings(), torch.nn.Linear):
-            logger.warning("Current model does not support resizing token embeddings.")
-            return
-
-        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
-        with context_maybe_zero3:
-            new_embedding_size = model.get_input_embeddings().weight.size(0)
-            num_new_tokens = new_embedding_size - current_embedding_size
-            _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens)
-            _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens)
-
-        logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size))
-
-
-def _fp32_forward_post_hook(
-    module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor"
-) -> "torch.Tensor":
-    return output.to(torch.float32)
-
-
-def _prepare_model_for_training(
-    model: "PreTrainedModel", model_args: "ModelArguments", output_layer_name: str = "lm_head"
-) -> None:
-    r"""
-    Includes:
-        (1) cast the layernorm in fp32
-        (2) make output embedding layer require grads
-        (3) add the upcasting of the lm_head in fp32
-    Inspired by: https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/other.py#L72
-    """
-    if model_args.upcast_layernorm:
-        logger.info("Upcasting layernorm weights in float32.")
-        for name, param in model.named_parameters():
-            if param.ndim == 1 and any(ln_name in name for ln_name in LAYERNORM_NAMES):
-                param.data = param.data.to(torch.float32)
-
-    if not model_args.disable_gradient_checkpointing:
-        if not getattr(model, "supports_gradient_checkpointing", False):
-            logger.warning("Current model does not support gradient checkpointing.")
-        else:
-            # use_reentrant=False might increase VRAM usage (have not been empirically verified yet)
-            # According to: https://github.com/huggingface/transformers/issues/28339
-            model.gradient_checkpointing_enable = MethodType(gradient_checkpointing_enable, model)
-            model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True})
-            setattr(model.config, "use_cache", False)  # turn off when gradient checkpointing is enabled
-            logger.info("Gradient checkpointing enabled.")
-
-    if hasattr(model, output_layer_name) and model_args.upcast_lmhead_output:
-        logger.info("Upcasting lm_head outputs in float32.")
-        output_layer = getattr(model, output_layer_name)
-        if isinstance(output_layer, torch.nn.Linear) and output_layer.weight.dtype != torch.float32:
-            output_layer.register_forward_hook(_fp32_forward_post_hook)
 
 
 def patch_tokenizer(tokenizer: "PreTrainedTokenizer") -> None:
@@ -321,10 +42,10 @@ def patch_config(
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
-    _configure_attn_implementation(config, model_args)
-    _configure_rope(config, model_args, is_trainable)
-    _configure_longlora(config, model_args, is_trainable)
-    _configure_quantization(config, tokenizer, model_args, init_kwargs)
+    configure_attn_implementation(config, model_args)
+    configure_rope(config, model_args, is_trainable)
+    configure_longlora(config, model_args, is_trainable)
+    configure_quantization(config, tokenizer, model_args, init_kwargs)
 
     if model_args.use_cache and not is_trainable:
         setattr(config, "use_cache", True)
@@ -377,22 +98,14 @@ def patch_model(
         setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
 
     if model_args.resize_vocab:
-        _resize_embedding_layer(model, tokenizer)
+        resize_embedding_layer(model, tokenizer)
 
     if is_trainable:
-        _prepare_model_for_training(model, model_args)
+        prepare_model_for_training(model, model_args)
+        add_z3_leaf_module(model)
 
-    if getattr(model.config, "model_type", None) == "mixtral":
-        from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
-
-        add_z3_leaf_module(model, MixtralSparseMoeBlock)
-
-    if getattr(model.config, "model_type", None) == "qwen2moe":
-        from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
-
-        add_z3_leaf_module(model, Qwen2MoeSparseMoeBlock)
-
-    _print_attn_implementation(model.config)
+    if not model_args.use_unsloth:
+        print_attn_implementation(model.config)
 
     try:
         model.add_model_tags(["llama-factory"])
diff --git a/src/llmtuner/extras/patches/__init__.py b/src/llmtuner/model/utils/__init__.py
similarity index 100%
rename from src/llmtuner/extras/patches/__init__.py
rename to src/llmtuner/model/utils/__init__.py
diff --git a/src/llmtuner/model/utils/attention.py b/src/llmtuner/model/utils/attention.py
new file mode 100644
index 00000000..f4686489
--- /dev/null
+++ b/src/llmtuner/model/utils/attention.py
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+
+from ...extras.logging import get_logger
+from ...extras.packages import is_flash_attn2_available, is_sdpa_available
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+    from ...hparams import ModelArguments
+
+
+logger = get_logger(__name__)
+
+
+def configure_attn_implementation(config: "PretrainedConfig", model_args: "ModelArguments") -> None:
+    r"""
+    Records the attention backend requested by `model_args.flash_attn` on the config.
+    """
+    attn_choice = model_args.flash_attn
+    if attn_choice == "auto":  # leave the config untouched
+        return
+
+    if attn_choice == "off":
+        requested_attn_implementation = "eager"
+    elif attn_choice == "sdpa":
+        requested_attn_implementation = "sdpa"
+        if not is_sdpa_available():
+            logger.warning("Torch>=2.1.1 is required for SDPA attention.")
+            return
+    elif attn_choice == "fa2":
+        requested_attn_implementation = "flash_attention_2"
+        if not is_flash_attn2_available():
+            logger.warning("FlashAttention-2 is not installed.")
+            return
+    else:
+        raise NotImplementedError("Unknown attention type: {}".format(attn_choice))
+
+    # special case for custom models: internlm2 uses a differently named config attribute
+    is_custom = getattr(config, "model_type", None) == "internlm2"
+    setattr(config, "attn_implementation" if is_custom else "_attn_implementation", requested_attn_implementation)
+
+
+def print_attn_implementation(config: "PretrainedConfig") -> None:
+    # Logs the attention implementation stored on the config (attribute name differs for custom models).
+    is_custom = getattr(config, "model_type", None) == "internlm2"  # special case for custom models
+    attn_implementation = getattr(config, "attn_implementation" if is_custom else "_attn_implementation", None)
+    if attn_implementation == "flash_attention_2":
+        message = "Using FlashAttention-2 for faster training and inference."
+    elif attn_implementation == "sdpa":
+        message = "Using torch SDPA for faster training and inference."
+    else:
+        message = "Using vanilla Attention implementation."
+
+    logger.info(message)
diff --git a/src/llmtuner/model/utils/checkpointing.py b/src/llmtuner/model/utils/checkpointing.py
new file mode 100644
index 00000000..e0657be8
--- /dev/null
+++ b/src/llmtuner/model/utils/checkpointing.py
@@ -0,0 +1,94 @@
+import inspect
+from functools import partial
+from types import MethodType
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
+
+import torch
+
+from ...extras.constants import LAYERNORM_NAMES
+from ...extras.logging import get_logger
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+    from ...hparams import ModelArguments
+
+
+logger = get_logger(__name__)
+
+
+def _gradient_checkpointing_enable(
+    self: "PreTrainedModel", gradient_checkpointing_kwargs: Optional[Dict[str, Any]] = None
+) -> None:
+    r"""
+    Activates gradient checkpointing for the current model (`use_reentrant=True` unless kwargs are given).
+    Modification of the original method to enable gradient checkpointing for block-wise optimizer: in the new GC
+    format, floating-point positional args of a checkpointed module require grad if any of its parameters does.
+    """
+    from torch.utils.checkpoint import checkpoint
+
+    if not self.supports_gradient_checkpointing:
+        raise ValueError("{} does not support gradient checkpointing.".format(self.__class__.__name__))
+
+    if gradient_checkpointing_kwargs is None:
+        gradient_checkpointing_kwargs = {"use_reentrant": True}
+
+    gradient_checkpointing_func = partial(checkpoint, **gradient_checkpointing_kwargs)
+
+    def custom_gradient_checkpointing_func(func, *args, **kwargs):
+        module: "torch.nn.Module" = func.__self__  # `func` must be a bound method so its module can be inspected
+
+        if any(param.requires_grad for param in module.parameters()):
+            for arg in args:
+                if torch.is_tensor(arg) and torch.is_floating_point(arg):
+                    arg.requires_grad_(True)
+
+        return gradient_checkpointing_func(func, *args, **kwargs)
+
+    if "value" in inspect.signature(self._set_gradient_checkpointing).parameters:  # old GC format
+        self.apply(partial(self._set_gradient_checkpointing, value=True))
+        self.enable_input_require_grads()
+        logger.warning("You are using the old GC format, some features (e.g. BAdam) will be invalid.")
+    else:  # have already enabled input require gradients
+        self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=custom_gradient_checkpointing_func)
+
+
+def _fp32_forward_post_hook(
+    module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor"
+) -> "torch.Tensor":
+    return output.float()  # forward hook: hand back the module output upcast to float32
+
+
+def prepare_model_for_training(
+    model: "PreTrainedModel", model_args: "ModelArguments", output_layer_name: str = "lm_head"
+) -> None:
+    r"""
+    Includes:
+        (1) cast the layernorm in fp32
+        (2) enable gradient checkpointing and turn off `use_cache` (unless disabled or unsupported)
+        (3) add the upcasting of the lm_head in fp32
+    Inspired by: https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/other.py#L72
+    """
+    if model_args.upcast_layernorm:
+        logger.info("Upcasting layernorm weights in float32.")
+        for name, param in model.named_parameters():
+            if param.ndim == 1 and any(ln_name in name for ln_name in LAYERNORM_NAMES):
+                param.data = param.data.to(torch.float32)
+
+    if not model_args.disable_gradient_checkpointing:
+        if not getattr(model, "supports_gradient_checkpointing", False):
+            logger.warning("Current model does not support gradient checkpointing.")
+        else:
+            # use_reentrant=False might increase VRAM usage (have not been empirically verified yet)
+            # According to: https://github.com/huggingface/transformers/issues/28339
+            model.gradient_checkpointing_enable = MethodType(_gradient_checkpointing_enable, model)
+            model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True})
+            setattr(model.config, "use_cache", False)  # turn off when gradient checkpointing is enabled
+            logger.info("Gradient checkpointing enabled.")
+
+    if hasattr(model, output_layer_name) and model_args.upcast_lmhead_output:
+        logger.info("Upcasting lm_head outputs in float32.")  # NOTE(review): logged even if no hook is added below
+        output_layer = getattr(model, output_layer_name)
+        if isinstance(output_layer, torch.nn.Linear) and output_layer.weight.dtype != torch.float32:
+            output_layer.register_forward_hook(_fp32_forward_post_hook)
diff --git a/src/llmtuner/model/utils/embedding.py b/src/llmtuner/model/utils/embedding.py
new file mode 100644
index 00000000..7759fc0f
--- /dev/null
+++ b/src/llmtuner/model/utils/embedding.py
@@ -0,0 +1,56 @@
+import math
+from contextlib import nullcontext
+from typing import TYPE_CHECKING
+
+import torch
+from transformers.integrations import is_deepspeed_zero3_enabled
+
+from ...extras.logging import get_logger
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, PreTrainedTokenizer
+
+
+logger = get_logger(__name__)
+
+
+def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int) -> None:
+    # Overwrites the last `num_new_tokens` rows in place with the mean of the other rows plus Gaussian noise.
+    old_rows, new_rows = embed_weight[:-num_new_tokens], embed_weight[-num_new_tokens:]
+    noise_std = 1.0 / math.sqrt(embed_weight.size(1))
+    noise = torch.empty_like(new_rows).normal_(mean=0, std=noise_std)
+    embed_weight[-num_new_tokens:] = old_rows.mean(dim=0, keepdim=True) + noise
+
+
+def resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None:
+    r"""
+    Grows token embeddings (padded to a multiple of 64) to fit the tokenizer; new rows get a noisy mean initialization.
+    """
+    if is_deepspeed_zero3_enabled():
+        import deepspeed  # type: ignore
+
+        params = [model.get_input_embeddings().weight]
+        if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings:
+            params.append(model.get_output_embeddings().weight)
+
+        context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0)
+    else:
+        context_maybe_zero3 = nullcontext()
+
+    with context_maybe_zero3:
+        current_embedding_size = model.get_input_embeddings().weight.size(0)
+
+    if len(tokenizer) > current_embedding_size:
+        if not isinstance(model.get_output_embeddings(), torch.nn.Linear):  # only a Linear output head is handled
+            logger.warning("Current model does not support resizing token embeddings.")
+            return
+
+        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
+        with context_maybe_zero3:  # NOTE(review): built before the resize — confirm it still applies
+            new_embedding_size = model.get_input_embeddings().weight.size(0)
+            num_new_tokens = new_embedding_size - current_embedding_size
+            _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens)
+            _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens)
+
+        logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size))
diff --git a/src/llmtuner/extras/patches/llama_patch.py b/src/llmtuner/model/utils/longlora.py
similarity index 58%
rename from src/llmtuner/extras/patches/llama_patch.py
rename to src/llmtuner/model/utils/longlora.py
index 6a90c41a..c3740a73 100644
--- a/src/llmtuner/extras/patches/llama_patch.py
+++ b/src/llmtuner/model/utils/longlora.py
@@ -1,5 +1,5 @@
 import math
-from typing import Optional, Tuple
+from typing import TYPE_CHECKING, Optional, Tuple
 
 import torch
 import torch.nn as nn
@@ -7,19 +7,28 @@ from transformers.models.llama.modeling_llama import (
     Cache,
     LlamaAttention,
     LlamaFlashAttention2,
+    LlamaSdpaAttention,
     apply_rotary_pos_emb,
     repeat_kv,
 )
 from transformers.utils import logging
 from transformers.utils.versions import require_version
 
+from ...extras.constants import SUPPORTED_CLASS_FOR_S2ATTN
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+    from ...hparams import ModelArguments
+
 
 logger = logging.get_logger(__name__)
 
 
 # Modified from:
-# https://github.com/huggingface/transformers/blob/v4.39.1/src/transformers/models/llama/modeling_llama.py
-def llama_torch_attn_forward(
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
+def llama_attention_forward(
     self: "LlamaAttention",
     hidden_states: torch.Tensor,
     attention_mask: Optional[torch.Tensor] = None,
@@ -39,10 +48,11 @@ def llama_torch_attn_forward(
     key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
     value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
-    past_key_value = getattr(self, "past_key_value", past_key_value)
     cos, sin = self.rotary_emb(value_states, position_ids)
     query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
+    past_key_value = getattr(self, "past_key_value", past_key_value)
+
     if past_key_value is not None:
         cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
         key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
@@ -69,8 +79,9 @@ def llama_torch_attn_forward(
 
     attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
-    if attention_mask is not None:
-        attn_weights = attn_weights + attention_mask
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
 
     # upcast attention to fp32
     attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
@@ -97,8 +108,8 @@ def llama_torch_attn_forward(
 
 
 # Modified from:
-# https://github.com/huggingface/transformers/blob/v4.39.1/src/transformers/models/llama/modeling_llama.py
-def llama_flash_attn_forward(
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
+def llama_flash_attention_2_forward(
     self: "LlamaFlashAttention2",
     hidden_states: torch.Tensor,
     attention_mask: Optional[torch.Tensor] = None,
@@ -117,7 +128,6 @@ def llama_flash_attn_forward(
     key_states = self.k_proj(hidden_states)
     value_states = self.v_proj(hidden_states)
 
-    # FlashAttention requires the input to have the shape (bsz, seq_len, n_heads, head_dim)
     query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
     key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
     value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -134,9 +144,10 @@ def llama_flash_attn_forward(
     key_states = repeat_kv(key_states, self.num_key_value_groups)
     value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-    query_states = query_states.transpose(1, 2)  # (bsz, seq_len, n_heads, head_dim)
-    key_states = key_states.transpose(1, 2)  # (bsz, seq_len, n_heads, head_dim)
-    value_states = value_states.transpose(1, 2)  # (bsz, seq_len, n_heads, head_dim)
+    # FlashAttention requires the input to have the shape (bsz, seq_len, n_heads, head_dim)
+    query_states = query_states.transpose(1, 2)
+    key_states = key_states.transpose(1, 2)
+    value_states = value_states.transpose(1, 2)
 
     dropout_rate = self.attention_dropout if self.training else 0.0
 
@@ -192,7 +203,115 @@ def llama_flash_attn_forward(
     return attn_output, attn_weights, past_key_value
 
 
-def apply_llama_patch() -> None:
-    require_version("transformers==4.39.3", "To fix: pip install transformers==4.39.3")
-    LlamaAttention.forward = llama_torch_attn_forward
-    LlamaFlashAttention2.forward = llama_flash_attn_forward
+# Modified from:
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
+def llama_sdpa_attention_forward(
+    self: "LlamaSdpaAttention",
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional["Cache"] = None,
+    output_attentions: bool = False,
+    cache_position: Optional[torch.LongTensor] = None,
+    **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    r"""
+    SDPA attention forward with shift short attention: if `config.group_size_ratio` is set and the module is in
+    training mode, attention runs within sequence groups and the second half of the heads is rolled by half a group.
+    """
+    if output_attentions:
+        logger.warning_once("SDPA does not support `output_attentions=True`. Falling back to the vanilla attention")
+        return llama_attention_forward(
+            self,
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states = self.q_proj(hidden_states)
+    key_states = self.k_proj(hidden_states)
+    value_states = self.v_proj(hidden_states)
+
+    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+    cos, sin = self.rotary_emb(value_states, position_ids)
+    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+    if past_key_value is not None:
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift
+        groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
+        assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz)
+        num_groups = q_len // groupsz
+
+        def shift(state: torch.Tensor) -> torch.Tensor:
+            state = state.transpose(1, 2)  # output: (bsz, seq_len, n_heads, head_dim)
+            state = torch.cat(
+                (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
+                dim=2,
+            )
+            return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2)
+
+        query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
+        if attention_mask is not None:
+            attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1)
+
+    causal_mask = attention_mask
+    if attention_mask is not None:  # `groupsz` only exists on the shift path, so slice by the key length instead
+        causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+    query_states = query_states.contiguous()
+    key_states = key_states.contiguous()
+    value_states = value_states.contiguous()
+
+    attn_output = torch.nn.functional.scaled_dot_product_attention(
+        query_states,
+        key_states,
+        value_states,
+        attn_mask=causal_mask,
+        dropout_p=self.attention_dropout if self.training else 0.0,
+        is_causal=causal_mask is None and q_len > 1,
+    )
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)  # reshape is not in-place
+        half = self.num_heads // 2  # un-roll the second half of the heads, rejoin on the head dim
+        attn_output = torch.cat((attn_output[:, :, :half], attn_output[:, :, half:].roll(groupsz // 2, dims=1)), dim=2)
+
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+    attn_output = self.o_proj(attn_output)
+
+    return attn_output, None, past_key_value
+
+
+def _apply_llama_patch() -> None:
+    require_version("transformers==4.40.0", "To fix: pip install transformers==4.40.0")  # patches target v4.40.0 code
+    LlamaAttention.forward = llama_attention_forward  # replaces the method on the class itself
+    LlamaFlashAttention2.forward = llama_flash_attention_2_forward
+    LlamaSdpaAttention.forward = llama_sdpa_attention_forward
+
+
+def configure_longlora(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
+    # Turns on shift short attention for supported model types; does nothing unless training with `shift_attn`.
+    if not (is_trainable and model_args.shift_attn):
+        return
+    elif getattr(config, "model_type", None) not in SUPPORTED_CLASS_FOR_S2ATTN:
+        logger.warning("Current model does not support shift short attention.")
+    else:
+        setattr(config, "group_size_ratio", 0.25)
+        _apply_llama_patch()
+        logger.info("Using shift short attention with group_size_ratio=1/4.")
diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils/misc.py
similarity index 61%
rename from src/llmtuner/model/utils.py
rename to src/llmtuner/model/utils/misc.py
index 51dbca8e..57e772f7 100644
--- a/src/llmtuner/model/utils.py
+++ b/src/llmtuner/model/utils/misc.py
@@ -1,51 +1,23 @@
-import inspect
-from enum import Enum, unique
-from functools import partial
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Dict, List
 
 import torch
 from transformers import PreTrainedModel
-from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.utils import cached_file
-from transformers.utils.versions import require_version
 
-from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
-from ..extras.logging import get_logger
+from ...extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
+from ...extras.logging import get_logger
+from .quantization import QuantizationMethod
 
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig, PreTrainedTokenizer
 
-    from ..hparams import ModelArguments
+    from ...hparams import ModelArguments
 
 
 logger = get_logger(__name__)
 
 
-@unique
-class QuantizationMethod(str, Enum):
-    r"""
-    Borrowed from `transformers.utils.quantization_config.QuantizationMethod`.
-    """
-
-    BITS_AND_BYTES = "bitsandbytes"
-    GPTQ = "gptq"
-    AWQ = "awq"
-    AQLM = "aqlm"
-    QUANTO = "quanto"
-
-
-def add_z3_leaf_module(model: "PreTrainedModel", module: "torch.nn.Module") -> None:
-    r"""
-    Sets module as a leaf module to skip partitioning in deepspeed zero3.
-    """
-    if is_deepspeed_zero3_enabled():
-        require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0")
-        from deepspeed.utils import set_z3_leaf_modules  # type: ignore
-
-        set_z3_leaf_modules(model, [module])
-
-
 def find_all_linear_modules(model: "PreTrainedModel") -> List[str]:
     r"""
     Finds all available modules to apply lora or galore.
@@ -102,42 +74,6 @@ def find_expanded_modules(model: "PreTrainedModel", target_modules: List[str], n
     return module_names
 
 
-def gradient_checkpointing_enable(
-    self: "PreTrainedModel", gradient_checkpointing_kwargs: Optional[Dict[str, Any]] = None
-) -> None:
-    r"""
-    Activates gradient checkpointing for the current model.
-
-    Modification of the original method to enable gradient checkpointing for block-wise optimizer.
-    """
-    from torch.utils.checkpoint import checkpoint
-
-    if not self.supports_gradient_checkpointing:
-        raise ValueError("{} does not support gradient checkpointing.".format(self.__class__.__name__))
-
-    if gradient_checkpointing_kwargs is None:
-        gradient_checkpointing_kwargs = {"use_reentrant": True}
-
-    gradient_checkpointing_func = partial(checkpoint, **gradient_checkpointing_kwargs)
-
-    def custom_gradient_checkpointing_func(func, *args, **kwargs):
-        module: "torch.nn.Module" = func.__self__
-
-        if any(param.requires_grad for param in module.parameters()):
-            for arg in args:
-                if torch.is_tensor(arg) and torch.is_floating_point(arg):
-                    arg.requires_grad_(True)
-
-        return gradient_checkpointing_func(func, *args, **kwargs)
-
-    if "value" in inspect.signature(self._set_gradient_checkpointing).parameters:  # old GC format
-        self.apply(partial(self._set_gradient_checkpointing, value=True))
-        self.enable_input_require_grads()
-        logger.warning("You are using the old GC format, some features (e.g. BAdam) will be invalid.")
-    else:  # have already enabled input require gradients
-        self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=custom_gradient_checkpointing_func)
-
-
 def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]:
     r"""
     Loads value head parameters from Hugging Face Hub or local disk.
diff --git a/src/llmtuner/model/utils/moe.py b/src/llmtuner/model/utils/moe.py
new file mode 100644
index 00000000..020a8f55
--- /dev/null
+++ b/src/llmtuner/model/utils/moe.py
@@ -0,0 +1,39 @@
+from typing import TYPE_CHECKING
+
+from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.utils.versions import require_version
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+
+def add_z3_leaf_module(model: "PreTrainedModel") -> None:
+    r"""
+    Sets the MoE block class of known MoE model types as a leaf module to skip partitioning in deepspeed zero3.
+    """
+    if not is_deepspeed_zero3_enabled():
+        return
+
+    require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0")
+    from deepspeed.utils import set_z3_leaf_modules  # type: ignore
+
+    if getattr(model.config, "model_type", None) == "mixtral":
+        from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
+
+        set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
+
+    if getattr(model.config, "model_type", None) == "qwen2_moe":  # the HF config spells it with an underscore
+        from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
+
+        set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock])
+
+    if getattr(model.config, "model_type", None) == "jamba":
+        from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock
+
+        set_z3_leaf_modules(model, [JambaSparseMoeBlock])
+
+    if getattr(model.config, "model_type", None) == "dbrx":
+        from transformers.models.dbrx.modeling_dbrx import DbrxFFN
+
+        set_z3_leaf_modules(model, [DbrxFFN])
diff --git a/src/llmtuner/model/utils/quantization.py b/src/llmtuner/model/utils/quantization.py
new file mode 100644
index 00000000..3cf159c1
--- /dev/null
+++ b/src/llmtuner/model/utils/quantization.py
@@ -0,0 +1,146 @@
+import os
+import random
+from enum import Enum, unique
+from typing import TYPE_CHECKING, Any, Dict, List
+
+import torch
+from datasets import load_dataset
+from transformers import BitsAndBytesConfig, GPTQConfig
+from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.utils.versions import require_version
+
+from ...extras.constants import FILEEXT2TYPE
+from ...extras.logging import get_logger
+from ...extras.misc import get_current_device
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedTokenizer
+
+    from ...hparams import ModelArguments
+
+
+logger = get_logger(__name__)
+
+
+@unique
+class QuantizationMethod(str, Enum):  # str mixin: members compare equal to plain strings such as "gptq"
+    r"""
+    Borrowed from `transformers.utils.quantization_config.QuantizationMethod`.
+    """
+
+    BITS_AND_BYTES = "bitsandbytes"
+    GPTQ = "gptq"
+    AWQ = "awq"
+    AQLM = "aqlm"
+    QUANTO = "quanto"
+
+
+def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]:
+    r"""
+    Inspired by: https://github.com/huggingface/optimum/blob/v1.16.0/optimum/gptq/data.py#L133
+    TODO: remove tokenizer.decode() https://github.com/huggingface/optimum/pull/1600
+    """
+    if os.path.isfile(model_args.export_quantization_dataset):  # local file: loader type comes from its extension
+        data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None)
+        data_files = model_args.export_quantization_dataset
+    else:  # otherwise the value itself is handed to load_dataset as `path`
+        data_path = model_args.export_quantization_dataset
+        data_files = None
+
+    dataset = load_dataset(path=data_path, data_files=data_files, split="train", cache_dir=model_args.cache_dir)
+    maxlen = model_args.export_quantization_maxlen
+
+    samples = []
+    for _ in range(model_args.export_quantization_nsamples):
+        while True:  # resample until a document with at least `maxlen` tokens is drawn
+            sample_idx = random.randint(0, len(dataset) - 1)
+            sample: Dict[str, torch.Tensor] = tokenizer(dataset[sample_idx]["text"], return_tensors="pt")
+            if sample["input_ids"].size(1) >= maxlen:
+                break  # TODO: fix large maxlen
+
+        word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen)  # randint is inclusive on both ends
+        input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen]
+        samples.append(tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True))
+
+    return samples
+
+
+def configure_quantization(
+    config: "PretrainedConfig",
+    tokenizer: "PreTrainedTokenizer",
+    model_args: "ModelArguments",
+    init_kwargs: Dict[str, Any],
+) -> None:
+    r"""
+    Mutates `init_kwargs`. Priority: PTQ-quantized (training) > AutoGPTQ (export) > Bitsandbytes (training)
+    """
+    if getattr(config, "quantization_config", None):  # ptq: the loaded config already carries a quantization_config
+        if is_deepspeed_zero3_enabled():
+            raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantized models.")
+
+        if model_args.quantization_device_map != "auto":
+            init_kwargs["device_map"] = {"": get_current_device()}
+
+        quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None)
+        quant_method = quantization_config.get("quant_method", "")  # "" when the key is absent
+
+        if quant_method == QuantizationMethod.GPTQ:
+            require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0")
+            quantization_config.pop("disable_exllama", None)  # remove deprecated args
+            quantization_config["use_exllama"] = False  # disable exllama
+
+        if quant_method == QuantizationMethod.AWQ:
+            require_version("autoawq", "To fix: pip install autoawq")
+
+        if quant_method == QuantizationMethod.AQLM:
+            require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0")
+            require_version("aqlm>=1.1.0", "To fix: pip install aqlm[gpu]>=1.1.0")
+            quantization_config["bits"] = 2  # written into the config's own dict; read back for the log below
+
+        quant_bits = quantization_config.get("bits", "?")  # "?" is only a placeholder for the log message
+        logger.info("Loading {}-bit {}-quantized model.".format(quant_bits, quant_method.upper()))
+
+    elif model_args.export_quantization_bit is not None:  # auto-gptq
+        require_version("optimum>=1.16.0", "To fix: pip install optimum>=1.16.0")
+        require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0")
+        from accelerate.utils import get_max_memory
+
+        if getattr(config, "model_type", None) == "chatglm":
+            raise ValueError("ChatGLM model is not supported.")
+
+        init_kwargs["quantization_config"] = GPTQConfig(
+            bits=model_args.export_quantization_bit,
+            tokenizer=tokenizer,
+            dataset=_get_quantization_dataset(tokenizer, model_args),
+        )
+        init_kwargs["device_map"] = "auto"
+        init_kwargs["max_memory"] = get_max_memory()
+        logger.info("Quantizing model to {} bit.".format(model_args.export_quantization_bit))
+
+    elif model_args.quantization_bit is not None:  # bnb
+        if model_args.quantization_bit == 8:
+            require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0")
+            init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+
+        elif model_args.quantization_bit == 4:
+            require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0")
+            init_kwargs["quantization_config"] = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=model_args.compute_dtype,
+                bnb_4bit_use_double_quant=model_args.double_quantization,
+                bnb_4bit_quant_type=model_args.quantization_type,
+                bnb_4bit_quant_storage=model_args.compute_dtype,  # crucial for fsdp qlora
+            )
+
+        if is_deepspeed_zero3_enabled() or model_args.quantization_device_map == "auto":  # no device_map set here
+            if model_args.quantization_bit != 4:
+                raise ValueError("Only 4-bit quantized model can use auto device map.")
+
+            require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0")
+            require_version("accelerate>=0.28.0", "To fix: pip install accelerate>=0.28.0")
+            require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0")
+        else:  # single-entry device map built from get_current_device()
+            init_kwargs["device_map"] = {"": get_current_device()}
+
+        logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit))
diff --git a/src/llmtuner/model/utils/rope.py b/src/llmtuner/model/utils/rope.py
new file mode 100644
index 00000000..2a4cce7a
--- /dev/null
+++ b/src/llmtuner/model/utils/rope.py
@@ -0,0 +1,43 @@
+import math
+from typing import TYPE_CHECKING
+
+from ...extras.logging import get_logger
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+    from ...hparams import ModelArguments
+
+
+logger = get_logger(__name__)
+
+
+def configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
+    if model_args.rope_scaling is None:  # RoPE scaling was not requested
+        return
+
+    if not hasattr(config, "rope_scaling"):
+        logger.warning("Current model does not support RoPE scaling.")
+        return
+
+    if is_trainable:
+        if model_args.rope_scaling == "dynamic":
+            logger.warning(
+                "Dynamic NTK scaling may not work well with fine-tuning. "
+                "See: https://github.com/huggingface/transformers/pull/24653"
+            )
+
+        current_max_length = getattr(config, "max_position_embeddings", None)
+        if current_max_length and model_args.model_max_length > current_max_length:
+            scaling_factor = float(math.ceil(model_args.model_max_length / current_max_length))  # rounded up
+        else:  # also taken when the config has no (or a zero) max_position_embeddings
+            logger.warning("Input length is smaller than max length. Consider increase input length.")
+            scaling_factor = 1.0
+    else:  # not training: a fixed factor is used
+        scaling_factor = 2.0
+
+    setattr(config, "rope_scaling", {"type": model_args.rope_scaling, "factor": scaling_factor})
+    logger.info(
+        "Using {} scaling strategy and setting scaling factor to {}".format(model_args.rope_scaling, scaling_factor)
+    )

From 5420905a2ebdfaecdcbe734fc49c131f3b75e422 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 04:46:53 +0800
Subject: [PATCH 139/341] support unsloth generate

Former-commit-id: 0ef1ad9f505dba71db9342f524cc3a7565e5e09e
---
 src/llmtuner/model/adapter.py          | 35 +++++++----
 src/llmtuner/model/loader.py           | 55 +++++------------
 src/llmtuner/model/utils/mod.py        | 28 +++++++++
 src/llmtuner/model/utils/unsloth.py    | 85 ++++++++++++++++++++++++++
 src/llmtuner/train/utils.py            |  3 +
 src/llmtuner/webui/components/train.py |  2 +-
 6 files changed, 155 insertions(+), 53 deletions(-)
 create mode 100644 src/llmtuner/model/utils/mod.py
 create mode 100644 src/llmtuner/model/utils/unsloth.py

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index efc63cde..d8d8eaf0 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -7,10 +7,11 @@ from transformers.integrations import is_deepspeed_zero3_enabled
 from ..extras.logging import get_logger
 from .utils.misc import find_all_linear_modules, find_expanded_modules
 from .utils.quantization import QuantizationMethod
+from .utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model
 
 
 if TYPE_CHECKING:
-    from transformers.modeling_utils import PreTrainedModel
+    from transformers import PretrainedConfig, PreTrainedModel
 
     from ..hparams import FinetuningArguments, ModelArguments
 
@@ -19,7 +20,11 @@ logger = get_logger(__name__)
 
 
 def init_adapter(
-    model: "PreTrainedModel", model_args: "ModelArguments", finetuning_args: "FinetuningArguments", is_trainable: bool
+    config: "PretrainedConfig",
+    model: "PreTrainedModel",
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+    is_trainable: bool,
 ) -> "PreTrainedModel":
     r"""
     Initializes the adapters.
@@ -106,6 +111,10 @@ def init_adapter(
                 assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3."
                 is_mergeable = False
 
+            if model_args.use_unsloth:
+                assert len(model_args.adapter_name_or_path) == 1, "Unsloth model only accepts a single adapter."
+                is_mergeable = False
+
             if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable):
                 adapter_to_merge = model_args.adapter_name_or_path[:-1]
                 adapter_to_resume = model_args.adapter_name_or_path[-1]
@@ -122,9 +131,15 @@ def init_adapter(
                 logger.info("Merged {} adapter(s).".format(len(adapter_to_merge)))
 
             if adapter_to_resume is not None:  # resume lora training
-                model = PeftModel.from_pretrained(
-                    model, adapter_to_resume, is_trainable=is_trainable, offload_folder=model_args.offload_folder
-                )
+                if model_args.use_unsloth:
+                    model = load_unsloth_peft_model(config, model_args, is_trainable=is_trainable)
+                else:
+                    model = PeftModel.from_pretrained(
+                        model,
+                        adapter_to_resume,
+                        is_trainable=is_trainable,
+                        offload_folder=model_args.offload_folder,
+                    )
 
         if is_trainable and adapter_to_resume is None:  # create new lora weights while training
             if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all":
@@ -152,14 +167,8 @@ def init_adapter(
             }
 
             if model_args.use_unsloth:
-                from unsloth import FastLanguageModel  # type: ignore
-
-                unsloth_peft_kwargs = {
-                    "model": model,
-                    "max_seq_length": model_args.model_max_length,
-                    "use_gradient_checkpointing": "unsloth",
-                }
-                model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
+                print(model)
+                model = get_unsloth_peft_model(model, model_args, peft_kwargs)
             else:
                 lora_config = LoraConfig(
                     task_type=TaskType.CAUSAL_LM,
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index b8558542..06405219 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -3,12 +3,13 @@ from typing import TYPE_CHECKING, Any, Dict
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from trl import AutoModelForCausalLMWithValueHead
 
-from ..extras.constants import MOD_SUPPORTED_MODELS
 from ..extras.logging import get_logger
-from ..extras.misc import count_parameters, get_current_device, try_download_model_from_ms
+from ..extras.misc import count_parameters, try_download_model_from_ms
 from .adapter import init_adapter
 from .patcher import patch_config, patch_model, patch_tokenizer, patch_valuehead_model
 from .utils.misc import load_valuehead_params, register_autoclass
+from .utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
+from .utils.unsloth import load_unsloth_pretrained_model
 
 
 if TYPE_CHECKING:
@@ -83,54 +84,30 @@ def load_model(
     patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
 
     model = None
-    if is_trainable and model_args.use_unsloth:
-        from unsloth import FastLanguageModel  # type: ignore
+    lazy_load = False
+    if model_args.use_unsloth:
+        if model_args.adapter_name_or_path is not None:
+            lazy_load = True
+        elif is_trainable:
+            model = load_unsloth_pretrained_model(config, model_args)
 
-        unsloth_kwargs = {
-            "model_name": model_args.model_name_or_path,
-            "max_seq_length": model_args.model_max_length,
-            "dtype": model_args.compute_dtype,
-            "load_in_4bit": model_args.quantization_bit == 4,
-            "token": model_args.hf_hub_token,
-            "device_map": {"": get_current_device()},
-            "rope_scaling": getattr(config, "rope_scaling", None),
-            "fix_tokenizer": False,
-            "trust_remote_code": True,
-        }
-        try:
-            model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
-        except NotImplementedError:
-            logger.warning("Unsloth does not support model type {}.".format(getattr(config, "model_type", None)))
-            model_args.use_unsloth = False
-
-        if model_args.adapter_name_or_path:
-            model_args.adapter_name_or_path = None
-            logger.warning("Unsloth does not support loading adapters.")
-
-    if model is None:
+    if model is None and not lazy_load:
         init_kwargs["config"] = config
         init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path
 
         if model_args.mixture_of_depths == "load":
-            from MoD import AutoMoDModelForCausalLM
-
-            model = AutoMoDModelForCausalLM.from_pretrained(**init_kwargs)
+            model = load_mod_pretrained_model(**init_kwargs)
         else:
             model = AutoModelForCausalLM.from_pretrained(**init_kwargs)
 
         if model_args.mixture_of_depths == "convert":
-            from MoD import apply_mod_to_hf
+            model = convert_pretrained_model_to_mod(model, config, model_args)
 
-            if getattr(config, "model_type", None) not in MOD_SUPPORTED_MODELS:
-                raise ValueError("Current model is not supported by mixture-of-depth.")
+    if not lazy_load:
+        patch_model(model, tokenizer, model_args, is_trainable)
+        register_autoclass(config, model, tokenizer)
 
-            model = apply_mod_to_hf(model)
-            model = model.to(model_args.compute_dtype)
-
-    patch_model(model, tokenizer, model_args, is_trainable)
-    register_autoclass(config, model, tokenizer)
-
-    model = init_adapter(model, model_args, finetuning_args, is_trainable)
+    model = init_adapter(config, model, model_args, finetuning_args, is_trainable)
 
     if add_valuehead:
         model = AutoModelForCausalLMWithValueHead.from_pretrained(model)
diff --git a/src/llmtuner/model/utils/mod.py b/src/llmtuner/model/utils/mod.py
new file mode 100644
index 00000000..5708a1a8
--- /dev/null
+++ b/src/llmtuner/model/utils/mod.py
@@ -0,0 +1,28 @@
+from typing import TYPE_CHECKING
+
+from ...extras.constants import MOD_SUPPORTED_MODELS
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedModel
+
+    from ...hparams import ModelArguments
+
+
+def load_mod_pretrained_model(**init_kwargs) -> "PreTrainedModel":
+    from MoD import AutoMoDModelForCausalLM  # function-scope import: MoD is needed only when this path runs
+
+    return AutoMoDModelForCausalLM.from_pretrained(**init_kwargs)  # init_kwargs are forwarded unchanged
+
+
+def convert_pretrained_model_to_mod(
+    model: "PreTrainedModel", config: "PretrainedConfig", model_args: "ModelArguments"
+) -> "PreTrainedModel":
+    from MoD import apply_mod_to_hf  # function-scope import: runs before the model-type check below
+
+    if getattr(config, "model_type", None) not in MOD_SUPPORTED_MODELS:
+        raise ValueError("Current model is not supported by mixture-of-depth.")
+
+    model = apply_mod_to_hf(model)
+    model = model.to(model_args.compute_dtype)  # cast the converted model to the configured compute dtype
+    return model
diff --git a/src/llmtuner/model/utils/unsloth.py b/src/llmtuner/model/utils/unsloth.py
new file mode 100644
index 00000000..6c5f506f
--- /dev/null
+++ b/src/llmtuner/model/utils/unsloth.py
@@ -0,0 +1,85 @@
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+from ...extras.logging import get_logger
+from ...extras.misc import get_current_device
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedModel
+
+    from ...hparams import ModelArguments
+
+
+logger = get_logger(__name__)
+
+
+def _get_unsloth_kwargs(
+    config: "PretrainedConfig", model_name_or_path: str, model_args: "ModelArguments"
+) -> Dict[str, Any]:
+    return {
+        "model_name": model_name_or_path,
+        "max_seq_length": model_args.model_max_length,
+        "dtype": model_args.compute_dtype,
+        "load_in_4bit": model_args.quantization_bit == 4,
+        "token": model_args.hf_hub_token,
+        "device_map": {"": get_current_device()},
+        "rope_scaling": getattr(config, "rope_scaling", None),  # None when the config has no such attribute
+        "fix_tokenizer": False,
+        "trust_remote_code": True,
+        "use_gradient_checkpointing": "unsloth",
+    }  # callers in this module splat this dict into FastLanguageModel.from_pretrained
+
+
+def load_unsloth_pretrained_model(
+    config: "PretrainedConfig", model_args: "ModelArguments"
+) -> Optional["PreTrainedModel"]:
+    r"""
+    Optionally loads pretrained model with unsloth.
+    """
+    from unsloth import FastLanguageModel
+
+    unsloth_kwargs = _get_unsloth_kwargs(config, model_args.model_name_or_path, model_args)
+    try:
+        model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)  # second element of the pair is discarded
+    except NotImplementedError:  # treated as "model type unsupported by unsloth" (see warning below)
+        logger.warning("Unsloth does not support model type {}.".format(getattr(config, "model_type", None)))
+        model = None  # signals the caller that nothing was loaded
+        model_args.use_unsloth = False  # clears the flag on the shared args object, not on a copy
+
+    return model
+
+
+def get_unsloth_peft_model(
+    model: "PreTrainedModel", model_args: "ModelArguments", peft_kwargs: Dict[str, Any]
+) -> "PreTrainedModel":
+    r"""
+    Gets the peft model for the pretrained model with unsloth.
+    """
+    from unsloth import FastLanguageModel
+
+    unsloth_peft_kwargs = {  # keys must not collide with peft_kwargs: both dicts are splatted into one call
+        "model": model,
+        "max_seq_length": model_args.model_max_length,
+        "use_gradient_checkpointing": "unsloth",
+    }
+    return FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
+
+
+def load_unsloth_peft_model(
+    config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool
+) -> "PreTrainedModel":
+    r"""
+    Loads peft model with unsloth.
+    """
+    from unsloth import FastLanguageModel
+
+    unsloth_kwargs = _get_unsloth_kwargs(config, model_args.adapter_name_or_path, model_args)
+    try:
+        model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
+    except NotImplementedError:
+        raise ValueError("Unsloth does not support model type {}.".format(getattr(config, "model_type", None)))
+
+    if not is_trainable:  # inference path only
+        FastLanguageModel.for_inference(model)  # return value is ignored; `model` itself is returned below
+
+    return model
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index fa9e36e5..27dc8eb3 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -61,6 +61,9 @@ def create_modelcard_and_push(
     if data_args.dataset is not None:
         kwargs["dataset"] = [dataset.strip() for dataset in data_args.dataset.split(",")]
 
+    if model_args.use_unsloth:
+        kwargs["tags"] = kwargs["tags"] + ["unsloth"]
+
     if not training_args.do_train:
         pass
     elif training_args.push_to_hub:
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 0f425bc9..7dc324af 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -138,7 +138,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         with gr.Row():
             lora_rank = gr.Slider(value=8, minimum=1, maximum=1024, step=1)
             lora_alpha = gr.Slider(value=16, minimum=1, maximum=2048, step=1)
-            lora_dropout = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01)
+            lora_dropout = gr.Slider(value=0, minimum=0, maximum=1, step=0.01)
             loraplus_lr_ratio = gr.Slider(value=0, minimum=0, maximum=64, step=0.01)
             create_new_adapter = gr.Checkbox()
 

From c05027d14a4b62e930c016eb9229ad44042dc287 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 05:02:18 +0800
Subject: [PATCH 140/341] remove redundant code

Former-commit-id: 4a7a7ad2bcdc493458084f5f3d384239228b7d5a
---
 src/llmtuner/model/adapter.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index d8d8eaf0..af58b514 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -167,7 +167,6 @@ def init_adapter(
             }
 
             if model_args.use_unsloth:
-                print(model)
                 model = get_unsloth_peft_model(model, model_args, peft_kwargs)
             else:
                 lora_config = LoraConfig(

From ad24a2a0c9ad289ac86e36a65d03d02dfa5c8442 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 05:10:07 +0800
Subject: [PATCH 141/341] fix bug

Former-commit-id: 271c24d2c82d645fa9072e6de94ca38f20411537
---
 src/llmtuner/model/utils/unsloth.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/model/utils/unsloth.py b/src/llmtuner/model/utils/unsloth.py
index 6c5f506f..974b41c0 100644
--- a/src/llmtuner/model/utils/unsloth.py
+++ b/src/llmtuner/model/utils/unsloth.py
@@ -73,7 +73,7 @@ def load_unsloth_peft_model(
     """
     from unsloth import FastLanguageModel
 
-    unsloth_kwargs = _get_unsloth_kwargs(config, model_args.adapter_name_or_path, model_args)
+    unsloth_kwargs = _get_unsloth_kwargs(config, model_args.adapter_name_or_path[0], model_args)
     try:
         model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
     except NotImplementedError:

From 94c82195750202b196ef5adc7a6233153d42eb43 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 05:21:18 +0800
Subject: [PATCH 142/341] fix bug

Former-commit-id: 38e164fe4aaea6f0baf121a720291ca42643ba8c
---
 src/llmtuner/model/utils/unsloth.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/llmtuner/model/utils/unsloth.py b/src/llmtuner/model/utils/unsloth.py
index 974b41c0..8a16409d 100644
--- a/src/llmtuner/model/utils/unsloth.py
+++ b/src/llmtuner/model/utils/unsloth.py
@@ -18,7 +18,7 @@ def _get_unsloth_kwargs(
 ) -> Dict[str, Any]:
     return {
         "model_name": model_name_or_path,
-        "max_seq_length": model_args.model_max_length,
+        "max_seq_length": model_args.model_max_length or 4096,
         "dtype": model_args.compute_dtype,
         "load_in_4bit": model_args.quantization_bit == 4,
         "token": model_args.hf_hub_token,
@@ -34,7 +34,7 @@ def load_unsloth_pretrained_model(
     config: "PretrainedConfig", model_args: "ModelArguments"
 ) -> Optional["PreTrainedModel"]:
     r"""
-    Optionally loads pretrained model with unsloth.
+    Optionally loads pretrained model with unsloth. Used in training.
     """
     from unsloth import FastLanguageModel
 
@@ -53,7 +53,7 @@ def get_unsloth_peft_model(
     model: "PreTrainedModel", model_args: "ModelArguments", peft_kwargs: Dict[str, Any]
 ) -> "PreTrainedModel":
     r"""
-    Gets the peft model for the pretrained model with unsloth.
+    Gets the peft model for the pretrained model with unsloth. Used in training.
     """
     from unsloth import FastLanguageModel
 
@@ -69,12 +69,15 @@ def load_unsloth_peft_model(
     config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool
 ) -> "PreTrainedModel":
     r"""
-    Loads peft model with unsloth.
+    Loads peft model with unsloth. Used in both training and inference.
     """
     from unsloth import FastLanguageModel
 
     unsloth_kwargs = _get_unsloth_kwargs(config, model_args.adapter_name_or_path[0], model_args)
     try:
+        if not is_trainable:
+            unsloth_kwargs["use_gradient_checkpointing"] = False
+
         model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
     except NotImplementedError:
         raise ValueError("Unsloth does not support model type {}.".format(getattr(config, "model_type", None)))

From aa25716a5d01dbff7920f75cc045bddab917bc93 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 05:39:52 +0800
Subject: [PATCH 143/341] add dbrx and jamba models

Former-commit-id: ce35c80b4b00152185285d6064939803d14487f0
---
 src/llmtuner/data/template.py    | 25 +++++++++++++++++++++++++
 src/llmtuner/extras/constants.py | 26 ++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index cd567a7b..efdd44f3 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -550,6 +550,31 @@ _register_template(
 )
 
 
+_register_template(
+    name="dbrx",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_separator=EmptyFormatter(slots=["\n"]),
+    default_system=(
+        "You are DBRX, created by Databricks. You were last updated in December 2023. "
+        "You answer questions based on information available up to that point.\n"
+        "YOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough "
+        "responses to more complex and open-ended questions.\nYou assist with various tasks, "
+        "from writing to coding (using markdown for code blocks — remember to use ``` with "
+        "code, JSON, and tables).\n(You do not have real-time data access or code execution "
+        "capabilities. You avoid stereotyping and provide balanced perspectives on "
+        "controversial topics. You do not provide song lyrics, poems, or news articles and "
+        "do not divulge details of your training data.)\nThis is your system prompt, "
+        "guiding your responses. Do not reference it, just respond to the user. If you find "
+        "yourself talking about this message, stop. You should be responding appropriately "
+        "and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION "
+        "ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY."
+    ),
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
 _register_template(
     name="deepseek",
     format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 0a29f971..031e3e81 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -268,6 +268,22 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "DBRX-132B-Base": {
+            DownloadSource.DEFAULT: "databricks/dbrx-base",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/dbrx-base",
+        },
+        "DBRX-132B-Chat": {
+            DownloadSource.DEFAULT: "databricks/dbrx-instruct",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/dbrx-instruct",
+        },
+    },
+    module="Wqkv",
+    template="dbrx",
+)
+
+
 register_model_group(
     models={
         "DeepSeek-LLM-7B-Base": {
@@ -453,6 +469,16 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "Jamba-v0.1": {
+            DownloadSource.DEFAULT: "ai21labs/Jamba-v0.1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Jamba-v0.1",
+        }
+    },
+)
+
+
 register_model_group(
     models={
         "LingoWhale-8B": {

From a5eabbe93314a69aad1ebadabda544bec30ccd93 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 05:50:50 +0800
Subject: [PATCH 144/341] add olmo 1.7

Former-commit-id: 86a3fb3a141d2702b15af08df36ffcf9b3d6de14
---
 README.md                        |  2 +-
 README_zh.md                     |  2 +-
 src/llmtuner/extras/constants.py | 11 ++++-------
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 970dd8fc..4e87e369 100644
--- a/README.md
+++ b/README.md
@@ -149,7 +149,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
 | [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                      | q_proj,v_proj     | llama3    |
 | [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
-| [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
+| [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | q_proj,v_proj     | -         |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
 | [Phi-3](https://huggingface.co/microsoft)                | 3.8B                        | qkv_proj          | phi       |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
diff --git a/README_zh.md b/README_zh.md
index 583c89ca..599af301 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -149,7 +149,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
 | [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                      | q_proj,v_proj     | llama3    |
 | [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
-| [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | att_proj          | olmo      |
+| [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | q_proj,v_proj     | -         |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
 | [Phi-3](https://huggingface.co/microsoft)                | 3.8B                        | qkv_proj          | phi       |
 | [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 031e3e81..9f7d5c46 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -613,18 +613,15 @@ register_model_group(
 register_model_group(
     models={
         "OLMo-1B": {
-            DownloadSource.DEFAULT: "allenai/OLMo-1B",
+            DownloadSource.DEFAULT: "allenai/OLMo-1B-hf",
         },
         "OLMo-7B": {
-            DownloadSource.DEFAULT: "allenai/OLMo-7B",
-            DownloadSource.MODELSCOPE: "AI-ModelScope/OLMo-7B",
+            DownloadSource.DEFAULT: "allenai/OLMo-7B-hf",
         },
-        "OLMo-7B-Chat": {
-            DownloadSource.DEFAULT: "allenai/OLMo-7B-Instruct",
+        "OLMo-1.7-7B": {
+            DownloadSource.DEFAULT: "allenai/OLMo-1.7-7B-hf",
         },
     },
-    module="att_proj",
-    template="olmo",
 )
 
 
From 7cfcd69c646ffc5b53144554ac24f1a0e025ec2e Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 13:53:39 +0800
Subject: [PATCH 145/341] fix inference in llamaboard

Former-commit-id: 5e631915157083b61e2d5a183e0c91f2d11f416e
---
 src/llmtuner/webui/runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index b64a015c..77d5ea98 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -222,7 +222,7 @@ class Runner:
             quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None,
             template=get("top.template"),
             rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None,
-            flash_attn=(get("top.booster") == "flashattn"),
+            flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
             use_unsloth=(get("top.booster") == "unsloth"),
             dataset_dir=get("eval.dataset_dir"),
             dataset=",".join(get("eval.dataset")),

From a88873116a2a16b5548a866f24b9f9057150fcfb Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 13:54:21 +0800
Subject: [PATCH 146/341] fix webchatmodel

Former-commit-id: dc6d8b5dc42c363dd180aaf90c9a2f2d0cce6725
---
 src/llmtuner/webui/chatter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/webui/chatter.py b/src/llmtuner/webui/chatter.py
index dac7dd67..ee28603e 100644
--- a/src/llmtuner/webui/chatter.py
+++ b/src/llmtuner/webui/chatter.py
@@ -72,7 +72,7 @@ class WebChatModel(ChatModel):
             finetuning_type=get("top.finetuning_type"),
             quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None,
             template=get("top.template"),
-            flash_attn=(get("top.booster") == "flash_attn"),
+            flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
             use_unsloth=(get("top.booster") == "unsloth"),
             rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None,
             infer_backend=get("infer.infer_backend"),

From 12f852b8d42b1b1f5293278c5b8862c13c446369 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 13:55:14 +0800
Subject: [PATCH 147/341] fix phi template

Former-commit-id: 14a1ff665eaebfc618229efbe96f09848d52faec
---
 src/llmtuner/data/template.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index efdd44f3..dd355e97 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -747,8 +747,10 @@ _register_template(
     name="phi",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
     format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]),
-    format_separator=EmptyFormatter(slots=["<|end|>\n"]),
+    format_separator=EmptyFormatter(slots=["\n"]),
     default_system="You are a helpful AI assistant.",
+    stop_words=["<|end|>"],
+    replace_eos=True,
 )
 
 
From 83404c4fa902d37e728e1e52614b744dbfef1ba4 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 23:39:31 +0800
Subject: [PATCH 148/341] support new special token #3420

Former-commit-id: f5c6a47f5193ab3a6c137580992bdcce0b31fdd5
---
 src/llmtuner/hparams/data_args.py       |  4 ++--
 src/llmtuner/hparams/generating_args.py |  4 ++--
 src/llmtuner/hparams/model_args.py      |  7 +++++++
 src/llmtuner/hparams/parser.py          |  6 +++++-
 src/llmtuner/model/adapter.py           | 11 +++++++++++
 src/llmtuner/model/loader.py            | 12 ++++++++++++
 src/llmtuner/model/utils/embedding.py   |  6 ++++--
 src/llmtuner/model/utils/rope.py        |  4 ++++
 8 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py
index f5f75c77..1e0cd08c 100644
--- a/src/llmtuner/hparams/data_args.py
+++ b/src/llmtuner/hparams/data_args.py
@@ -26,11 +26,11 @@ class DataArguments:
     )
     cutoff_len: int = field(
         default=1024,
-        metadata={"help": "The cutoff length of the model inputs after tokenization."},
+        metadata={"help": "The cutoff length of the tokenized inputs in the dataset."},
     )
     reserved_label_len: int = field(
         default=1,
-        metadata={"help": "The minimum cutoff length reserved for label after tokenization."},
+        metadata={"help": "The minimum cutoff length reserved for the tokenized labels in the dataset."},
     )
     train_on_prompt: bool = field(
         default=False,
diff --git a/src/llmtuner/hparams/generating_args.py b/src/llmtuner/hparams/generating_args.py
index 70dabb3e..e792c003 100644
--- a/src/llmtuner/hparams/generating_args.py
+++ b/src/llmtuner/hparams/generating_args.py
@@ -31,11 +31,11 @@ class GeneratingArguments:
         metadata={"help": "Number of beams for beam search. 1 means no beam search."},
     )
     max_length: int = field(
-        default=512,
+        default=1024,
         metadata={"help": "The maximum length the generated tokens can have. It can be overridden by max_new_tokens."},
     )
     max_new_tokens: int = field(
-        default=512,
+        default=1024,
         metadata={"help": "The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt."},
     )
     repetition_penalty: float = field(
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index eb6366d9..b60492a0 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -33,6 +33,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "Whether or not the special tokens should be split during the tokenization process."},
     )
+    new_special_tokens: Optional[str] = field(
+        default=None,
+        metadata={"help": "Special tokens to be added into the tokenizer."},
+    )
     model_revision: str = field(
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
@@ -177,6 +181,9 @@ class ModelArguments:
         if self.adapter_name_or_path is not None:  # support merging multiple lora weights
             self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")]
 
+        if self.new_special_tokens is not None:  # support multiple special tokens
+            self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")]
+
         assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
         assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization."
 
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 0d286819..a7d0a17f 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -67,6 +67,9 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin
         if finetuning_args.finetuning_type != "lora":
             raise ValueError("Quantization is only compatible with the LoRA method.")
 
+        if model_args.resize_vocab:
+            raise ValueError("Cannot resize embedding layers of a quantized model.")
+
         if model_args.adapter_name_or_path is not None and finetuning_args.create_new_adapter:
             raise ValueError("Cannot create new adapter upon a quantized model.")
 
@@ -199,10 +202,11 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if (
         training_args.do_train
         and finetuning_args.finetuning_type == "lora"
+        and model_args.quantization_bit is None
         and model_args.resize_vocab
         and finetuning_args.additional_target is None
     ):
-        logger.warning("Add token embeddings to `additional_target` to make the added tokens trainable.")
+        logger.warning("Remember to add embedding layers to `additional_target` to make the added tokens trainable.")
 
     if training_args.do_train and model_args.quantization_bit is not None and (not model_args.upcast_layernorm):
         logger.warning("We recommend enable `upcast_layernorm` in quantized training.")
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index af58b514..d43e00f0 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -157,6 +157,17 @@ def init_adapter(
             ):
                 raise ValueError("DoRA is not compatible with PTQ-quantized models.")
 
+            if model_args.resize_vocab and finetuning_args.additional_target is None:
+                input_embeddings = model.get_input_embeddings()
+                output_embeddings = model.get_output_embeddings()
+                module_names = set()
+                for name, module in model.named_modules():
+                    if module in [input_embeddings, output_embeddings]:
+                        module_names.add(name.split(".")[-1])
+
+                finetuning_args.additional_target = module_names
+                logger.warning("Vocab has been resized, add {} to trainable params.".format(",".join(module_names)))
+
             peft_kwargs = {
                 "r": finetuning_args.lora_rank,
                 "target_modules": target_modules,
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 06405219..54048cc5 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -39,6 +39,8 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
 def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
     r"""
     Loads pretrained tokenizer.
+
+    Note: including inplace operation of model_args.
     """
     init_kwargs = _get_init_kwargs(model_args)
     try:
@@ -57,6 +59,16 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
             **init_kwargs,
         )
 
+    if model_args.new_special_tokens is not None:
+        num_added_tokens = tokenizer.add_special_tokens(
+            dict(additional_special_tokens=model_args.new_special_tokens),
+            replace_additional_special_tokens=False,
+        )
+        logger.info("Add {} to special tokens.".format(",".join(model_args.new_special_tokens)))
+        if num_added_tokens > 0 and not model_args.resize_vocab:
+            model_args.resize_vocab = True
+            logger.warning("New tokens have been added, changed `resize_vocab` to True.")
+
     patch_tokenizer(tokenizer)
     return tokenizer
 
diff --git a/src/llmtuner/model/utils/embedding.py b/src/llmtuner/model/utils/embedding.py
index 7759fc0f..357c9cc0 100644
--- a/src/llmtuner/model/utils/embedding.py
+++ b/src/llmtuner/model/utils/embedding.py
@@ -42,9 +42,11 @@ def resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedToken
         current_embedding_size = model.get_input_embeddings().weight.size(0)
 
     if len(tokenizer) > current_embedding_size:
+        if getattr(model, "quantization_method", None):
+            raise ValueError("Cannot resize embedding layers of a quantized model.")
+
         if not isinstance(model.get_output_embeddings(), torch.nn.Linear):
-            logger.warning("Current model does not support resizing token embeddings.")
-            return
+            raise ValueError("Current model does not support resizing embedding layers.")
 
         model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
         with context_maybe_zero3:
diff --git a/src/llmtuner/model/utils/rope.py b/src/llmtuner/model/utils/rope.py
index 2a4cce7a..9163253b 100644
--- a/src/llmtuner/model/utils/rope.py
+++ b/src/llmtuner/model/utils/rope.py
@@ -30,6 +30,10 @@ def configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_
 
         current_max_length = getattr(config, "max_position_embeddings", None)
         if current_max_length and model_args.model_max_length > current_max_length:
+            logger.warning(
+                "Enlarge max model length from {} to {}.".format(current_max_length, model_args.model_max_length)
+            )
+            setattr(config, "max_position_embeddings", model_args.model_max_length)
             scaling_factor = float(math.ceil(model_args.model_max_length / current_max_length))
         else:
             logger.warning("Input length is smaller than max length. Consider increase input length.")

From 21fac4c98c20933ea3bed27ffa666aef9b208853 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 24 Apr 2024 23:42:59 +0800
Subject: [PATCH 149/341] fix log level

Former-commit-id: 8d21302f6201b3f33c10f61f3559bd95be3363c2
---
 src/llmtuner/model/utils/rope.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/model/utils/rope.py b/src/llmtuner/model/utils/rope.py
index 9163253b..93ab8929 100644
--- a/src/llmtuner/model/utils/rope.py
+++ b/src/llmtuner/model/utils/rope.py
@@ -30,7 +30,7 @@ def configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_
 
         current_max_length = getattr(config, "max_position_embeddings", None)
         if current_max_length and model_args.model_max_length > current_max_length:
-            logger.warning(
+            logger.info(
                 "Enlarge max model length from {} to {}.".format(current_max_length, model_args.model_max_length)
             )
             setattr(config, "max_position_embeddings", model_args.model_max_length)

From 36be12a3b722e9865632df8df43b1f37e117d544 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 25 Apr 2024 00:21:34 +0800
Subject: [PATCH 150/341] update tool template

Former-commit-id: c72a1981859818c257c5271d32e03c9d3c344206
---
 src/llmtuner/data/template.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index dd355e97..73b22eb7 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -503,6 +503,7 @@ _register_template(
     name="chatml",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     stop_words=["<|im_end|>", "<|im_start|>"],
     replace_eos=True,
@@ -513,6 +514,7 @@ _register_template(
     name="chatml_de",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
     stop_words=["<|im_end|>", "<|im_start|>"],
@@ -554,6 +556,7 @@ _register_template(
     name="dbrx",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system=(
         "You are DBRX, created by Databricks. You were last updated in December 2023. "
@@ -633,6 +636,9 @@ _register_template(
     name="gemma",
     format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
+    format_observation=StringFormatter(
+        slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
     format_separator=EmptyFormatter(slots=["<end_of_turn>\n"]),
     efficient_eos=True,
     force_system=True,
@@ -703,6 +709,14 @@ _register_template(
     format_system=StringFormatter(
         slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]
     ),
+    format_observation=StringFormatter(
+        slots=[
+            (
+                "<|start_header_id|>tool<|end_header_id|>\n\n{{content}}<|eot_id|>"
+                "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+        ]
+    ),
     default_system="You are a helpful assistant.",
     stop_words=["<|eot_id|>"],
     replace_eos=True,
@@ -747,6 +761,7 @@ _register_template(
     name="phi",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
     format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]),
+    format_observation=StringFormatter(slots=["<|function_output|>\n{{content}}<|end|>\n<|assistant|>\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system="You are a helpful AI assistant.",
     stop_words=["<|end|>"],
@@ -758,6 +773,7 @@ _register_template(
     name="qwen",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system="You are a helpful assistant.",
     stop_words=["<|im_end|>"],

From 12c51655cebff8e60bff48d092916c6e96348852 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 00:22:43 +0800
Subject: [PATCH 151/341] add llava and instructblip

Former-commit-id: 142fb6f4541a1acfefe66ff2574dabde53b00c06
---
 data/mllm_example_dataset/README.md           |  25 +++++
 data/mllm_example_dataset/data/test-0.parquet | Bin 0 -> 4580 bytes
 .../mllm_example_dataset/data/train-0.parquet | Bin 0 -> 4580 bytes
 examples/mllm/sft_instructblip.sh             |  16 ++-
 examples/mllm/{sft_blip2.sh => sft_llava.sh}  |  17 ++--
 scripts/make_mllm_instruct.py                 |  95 ++++++++++++++++++
 scripts/test_mllm.py                          |  84 ++++++++++++++++
 src/llmtuner/data/loader.py                   |   3 +-
 src/llmtuner/data/preprocess.py               |   2 +-
 src/llmtuner/hparams/data_args.py             |   4 -
 src/llmtuner/hparams/model_args.py            |   4 -
 src/llmtuner/model/adapter.py                 |  22 ++--
 src/llmtuner/model/loader.py                  |   3 +-
 src/llmtuner/train/sftmm/collator.py          |  82 ++++-----------
 src/llmtuner/train/sftmm/trainer.py           |  95 +-----------------
 src/llmtuner/train/sftmm/workflow.py          |  35 +++----
 16 files changed, 273 insertions(+), 214 deletions(-)
 create mode 100644 data/mllm_example_dataset/README.md
 create mode 100644 data/mllm_example_dataset/data/test-0.parquet
 create mode 100644 data/mllm_example_dataset/data/train-0.parquet
 rename examples/mllm/{sft_blip2.sh => sft_llava.sh} (58%)
 create mode 100644 scripts/make_mllm_instruct.py
 create mode 100644 scripts/test_mllm.py

diff --git a/data/mllm_example_dataset/README.md b/data/mllm_example_dataset/README.md
new file mode 100644
index 00000000..d5c8c0e6
--- /dev/null
+++ b/data/mllm_example_dataset/README.md
@@ -0,0 +1,25 @@
+---
+dataset_info:
+  features:
+  - name: messages
+    list:
+    - name: content
+      list:
+      - name: index
+        dtype: int64
+      - name: text
+        dtype: string
+      - name: type
+        dtype: string
+    - name: role
+      dtype: string
+  - name: images
+    sequence: image
+configs:
+- config_name: default
+  data_files:
+  - split: train
+    path: data/train-*
+  - split: test
+    path: data/test-*
+---
\ No newline at end of file
diff --git a/data/mllm_example_dataset/data/test-0.parquet b/data/mllm_example_dataset/data/test-0.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..42c20b192497168523c3d39447cdae4495085b84
GIT binary patch
literal 4580
zcmdTIO>84qc_wRnlaN&LHsc*{qws2Fr_nZRJ5HP?tA%tXA^uq>&A)#Tg6ElM&t%6l
zW@a46Sr)C3s!AL=RTT%s38AX0RKx*L#Q`A>sHgT&sS*OM5OAnaFCYOb-+N;_KV-Ko
zDiw^9H}8G_|8J5_d3#m}2xG!{K^PFYD;zgC!F3;j6XHT@MwkS2NC-`cgFVd2E+S4}
z00p|Mr68=q5`I~@OgC>ge)D7d89u~G5BUip5KVKTvFOoYgSalP6E|QQu6LQ3q(Ln2
zvT9o%yw3oGhNY1sVIVRYd6$oTz)OqL;FFjXodG{NVSs~W3|<@u=Z8bWuLlc)$UOfj
z@aNnpz>B(#utSiilg{!C0Cr*X902ZMiy&-iDB}?C_%22@$8I16cZg%5^_FF*dVHJ-
zz#a-K6G*cStG+xvnhjRwRdBBAU=JY3ws>P6xwfhj2h|K>YQv*1Yle$-vMhmsEP2jF
zySm-LF32@a{>IujI8F>Ma-E^YaJf)-?3Sf1CxBETDsH(1>HNlsYZ}_<gQO0E?plPn
z$g@%Sy9E-z285d?!9b@GS9Of)v*X_bDx(3hp9SQKZde4c*!3LS!YvPFS_aBu@Ce)d
zD}#08kk_U#a+`#JlhNS!$G}A$^B*iCtg3dS3Hj5Bw)|1>kMpPvX5(fPocD;ve`lr&
zf>9H9OdD%_PE-v{v(5kZ4?8-sj&-|*W*1Pya9zW;kYS;G3*wwgrcyJIgO-b`1T@gN
zK}H~4j_aY7(Vyz7acGzZwvb(ejM%Lvnxouq6RX!(#^;h~9e4QkvAH<P68^nahqz7K
za*e79LTm?w@i(S!+D_H*kliHDz8`?JYz?h>u4)iVC5Rj~iDS@$#AeTYHxA`<TlnHh
zS1g34#iHIpEPIeyW@I-(VxpfOno*h`KmF)Tj(Vj*;*g5TsguUxhi?Dn<G+3UkIx5!
zW2xQ4SAW8B((j&{5JsN7eF7di@>usg7>WF8Sb8`j{q@qsr)&I$Efy1`0_{8)E#Vgf
z;2<@Gy2y$TqA3YCPDPWOW|oRUh6b{yv;BC`rk%XZ2bDqT!z=$`6go}9qVNMg@+L1m
zxFWqBo>(}i^g=&w2=SgD!Y1_ty3QGbH-@@(J#2a43phIi)5M>bR4gSDhx#Ny9)9r>
zticbTqTYxBKKR37>D!UMH`$9vV!*LYYPcBr5g+*(FTE;A?~F6U&uh5Qc$j+yFpnmI
z&cAEI<P41Tu9+bKjwY!&A&#w=IOz@X#G(*O#n}-Ztad!&f<d9k*CI9P_gAG~U4?4m
zF4X=_p-=#IRQvL781!h^!wFw%Vm5xztcPz29GBV<IOtY5f{93&I%Og^9X`#&h{Q7>
zCI&>0-9F_=VBpdeC;jr7$NLg!VoztTyg3m0)`0Z-HR%^o8rw4p9;x#pm!p3lOLJn#
zRdv$9LV!yTi2c06JsT?A22;}kk=<}`83Dd``T5knEAW$uLvV{-AAoM5mm?>Pz@GKb
zb*@0~@h$+0{tSQ?Qx^c5yZQYjRRLd`pZTt28o92ZNGLiHHUD>R_y4a!`B@LN|MJNB
z6z9ih-@cKbKOG54gOS&+z{gy4M*N)Ks@LJ(u3?pkZw=gw8CK1X-9q<bHG;hsd{?kT
z?^6~IU%nUXe!}_()}4$<^Pm@D2Qs{c`PdAKeYFSK$7YMf?gNP4kY}mkvkVM)6uB?*
zUaT+t(d75P0`h75|GV-TgFmVnOV?$#@0*^Z-;vDpK4^!0y+yyGV)x<U#1VOW`=#P7
ze;|sl4=3oeR9}^4c?*8&b4QlvWu-2!$#Pv@sWX_OoWZv#@JoGsrVVI%hKEUE_?#@a
zWkA5234ymvg~H>)uVQOO&BDdBRcWjpRJP<}%1b)=X0=f{%pKTu*f%Q0wWL<cn`$Sw
zkjuhHV{!jjO|4m#(z@}Ip;Uq9pqff2E5&q@@)YM>17zN<l<E}!Lawo$Bqhb9xLm_*
zX{+UA&S=+j=F$n=tmxJ3zL~R<Zf+%?tdw*n3uNEbvSz!o1G-o8$6A)!rW%D6*(l9z
zr`60J=#Y*5#+s`Y_l(*O$hCU#bGogKqzd{gogCFsJ0-VRE*{NRcJfQLeWt%tO&u|N
zpICosR-?Az;;ddt?H`|yVfTDmcfMMLSc2~JKJBIUkF?@`tDFMAO0!F~EqR4z&ZCQ;
z<Z|LLk${7;s<5Q16hjN4TMB!02X!RN8Ph8lHB*JM*x7s`kx1@7pGYiHx{VAaVv@V{
zL+SlW^7$z~V?utDbSD$aKiB79W28(;my1zRdz5y$v~9EFomJ5HyxOyu7Bf6$y7*hV
z^GmaAfv(az99MIc{)p<M<-FBq_OK%M{b#JqepcU#S1D~A_xuF@dCI@k=T9<yr}&xI
zEOLJQm@BxnZB`jjwrOkTH(_D=_ZZaI=Mzw)nGxCiVKu|dllLfn@)#Bt-*wh+3hT#{
z{`B=_DStV&)Y3BhElpMyfhj@y`jRZ4c}nE_OJ{`N$&bm8OZgZ2`X84c)_+u{ym3HP
z@6iQLin7h1r|OBFnO3lMUSmg^?=9bd2m<&Cp)Et!Wtu!6>C-8)&Nj5N{w3etSwT_2
z-=?Jqabcv4{3O!y7dR0$u><4OyQwytH?iZ`ZFEQ+_UH0!I-ZQDq9%Opo%`Wl8HT_5
I;r~1T1e%nlz5oCK

literal 0
HcmV?d00001

diff --git a/data/mllm_example_dataset/data/train-0.parquet b/data/mllm_example_dataset/data/train-0.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..42c20b192497168523c3d39447cdae4495085b84
GIT binary patch
literal 4580
zcmdTIO>84qc_wRnlaN&LHsc*{qws2Fr_nZRJ5HP?tA%tXA^uq>&A)#Tg6ElM&t%6l
zW@a46Sr)C3s!AL=RTT%s38AX0RKx*L#Q`A>sHgT&sS*OM5OAnaFCYOb-+N;_KV-Ko
zDiw^9H}8G_|8J5_d3#m}2xG!{K^PFYD;zgC!F3;j6XHT@MwkS2NC-`cgFVd2E+S4}
z00p|Mr68=q5`I~@OgC>ge)D7d89u~G5BUip5KVKTvFOoYgSalP6E|QQu6LQ3q(Ln2
zvT9o%yw3oGhNY1sVIVRYd6$oTz)OqL;FFjXodG{NVSs~W3|<@u=Z8bWuLlc)$UOfj
z@aNnpz>B(#utSiilg{!C0Cr*X902ZMiy&-iDB}?C_%22@$8I16cZg%5^_FF*dVHJ-
zz#a-K6G*cStG+xvnhjRwRdBBAU=JY3ws>P6xwfhj2h|K>YQv*1Yle$-vMhmsEP2jF
zySm-LF32@a{>IujI8F>Ma-E^YaJf)-?3Sf1CxBETDsH(1>HNlsYZ}_<gQO0E?plPn
z$g@%Sy9E-z285d?!9b@GS9Of)v*X_bDx(3hp9SQKZde4c*!3LS!YvPFS_aBu@Ce)d
zD}#08kk_U#a+`#JlhNS!$G}A$^B*iCtg3dS3Hj5Bw)|1>kMpPvX5(fPocD;ve`lr&
zf>9H9OdD%_PE-v{v(5kZ4?8-sj&-|*W*1Pya9zW;kYS;G3*wwgrcyJIgO-b`1T@gN
zK}H~4j_aY7(Vyz7acGzZwvb(ejM%Lvnxouq6RX!(#^;h~9e4QkvAH<P68^nahqz7K
za*e79LTm?w@i(S!+D_H*kliHDz8`?JYz?h>u4)iVC5Rj~iDS@$#AeTYHxA`<TlnHh
zS1g34#iHIpEPIeyW@I-(VxpfOno*h`KmF)Tj(Vj*;*g5TsguUxhi?Dn<G+3UkIx5!
zW2xQ4SAW8B((j&{5JsN7eF7di@>usg7>WF8Sb8`j{q@qsr)&I$Efy1`0_{8)E#Vgf
z;2<@Gy2y$TqA3YCPDPWOW|oRUh6b{yv;BC`rk%XZ2bDqT!z=$`6go}9qVNMg@+L1m
zxFWqBo>(}i^g=&w2=SgD!Y1_ty3QGbH-@@(J#2a43phIi)5M>bR4gSDhx#Ny9)9r>
zticbTqTYxBKKR37>D!UMH`$9vV!*LYYPcBr5g+*(FTE;A?~F6U&uh5Qc$j+yFpnmI
z&cAEI<P41Tu9+bKjwY!&A&#w=IOz@X#G(*O#n}-Ztad!&f<d9k*CI9P_gAG~U4?4m
zF4X=_p-=#IRQvL781!h^!wFw%Vm5xztcPz29GBV<IOtY5f{93&I%Og^9X`#&h{Q7>
zCI&>0-9F_=VBpdeC;jr7$NLg!VoztTyg3m0)`0Z-HR%^o8rw4p9;x#pm!p3lOLJn#
zRdv$9LV!yTi2c06JsT?A22;}kk=<}`83Dd``T5knEAW$uLvV{-AAoM5mm?>Pz@GKb
zb*@0~@h$+0{tSQ?Qx^c5yZQYjRRLd`pZTt28o92ZNGLiHHUD>R_y4a!`B@LN|MJNB
z6z9ih-@cKbKOG54gOS&+z{gy4M*N)Ks@LJ(u3?pkZw=gw8CK1X-9q<bHG;hsd{?kT
z?^6~IU%nUXe!}_()}4$<^Pm@D2Qs{c`PdAKeYFSK$7YMf?gNP4kY}mkvkVM)6uB?*
zUaT+t(d75P0`h75|GV-TgFmVnOV?$#@0*^Z-;vDpK4^!0y+yyGV)x<U#1VOW`=#P7
ze;|sl4=3oeR9}^4c?*8&b4QlvWu-2!$#Pv@sWX_OoWZv#@JoGsrVVI%hKEUE_?#@a
zWkA5234ymvg~H>)uVQOO&BDdBRcWjpRJP<}%1b)=X0=f{%pKTu*f%Q0wWL<cn`$Sw
zkjuhHV{!jjO|4m#(z@}Ip;Uq9pqff2E5&q@@)YM>17zN<l<E}!Lawo$Bqhb9xLm_*
zX{+UA&S=+j=F$n=tmxJ3zL~R<Zf+%?tdw*n3uNEbvSz!o1G-o8$6A)!rW%D6*(l9z
zr`60J=#Y*5#+s`Y_l(*O$hCU#bGogKqzd{gogCFsJ0-VRE*{NRcJfQLeWt%tO&u|N
zpICosR-?Az;;ddt?H`|yVfTDmcfMMLSc2~JKJBIUkF?@`tDFMAO0!F~EqR4z&ZCQ;
z<Z|LLk${7;s<5Q16hjN4TMB!02X!RN8Ph8lHB*JM*x7s`kx1@7pGYiHx{VAaVv@V{
zL+SlW^7$z~V?utDbSD$aKiB79W28(;my1zRdz5y$v~9EFomJ5HyxOyu7Bf6$y7*hV
z^GmaAfv(az99MIc{)p<M<-FBq_OK%M{b#JqepcU#S1D~A_xuF@dCI@k=T9<yr}&xI
zEOLJQm@BxnZB`jjwrOkTH(_D=_ZZaI=Mzw)nGxCiVKu|dllLfn@)#Bt-*wh+3hT#{
z{`B=_DStV&)Y3BhElpMyfhj@y`jRZ4c}nE_OJ{`N$&bm8OZgZ2`X84c)_+u{ym3HP
z@6iQLin7h1r|OBFnO3lMUSmg^?=9bd2m<&Cp)Et!Wtu!6>C-8)&Nj5N{w3etSwT_2
z-=?Jqabcv4{3O!y7dR0$u><4OyQwytH?iZ`ZFEQ+_UH0!I-ZQDq9%Opo%`Wl8HT_5
I;r~1T1e%nlz5oCK

literal 0
HcmV?d00001

diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh
index 92478500..b3923655 100644
--- a/examples/mllm/sft_instructblip.sh
+++ b/examples/mllm/sft_instructblip.sh
@@ -3,20 +3,20 @@
 CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --stage sft_mm \
     --do_train \
-    --model_name_or_path /home/LAB/fengzc/LLM/checkpoints/Salesforce/instructblip-vicuna-7b \
-    --dataset llava_instruct_100 \
+    --model_name_or_path Salesforce/instructblip-vicuna-7b \
+    --dataset mllm_instruct_example \
     --dataset_dir data \
     --template default \
     --finetuning_type lora \
-    --lora_target q_proj,k_proj \
+    --lora_target all \
     --output_dir saves/instructblip-vicuna-7b/lora/sft \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
     --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 4 \
+    --per_device_train_batch_size 3 \
     --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
+    --gradient_accumulation_steps 1 \
     --lr_scheduler_type cosine \
     --logging_steps 1 \
     --warmup_steps 20 \
@@ -25,10 +25,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --evaluation_strategy steps \
     --load_best_model_at_end \
     --learning_rate 1e-5 \
-    --num_train_epochs 3.0 \
+    --num_train_epochs 50 \
     --max_samples 3000 \
     --val_size 0.1 \
     --plot_loss \
-    --quantization_bit 8 \
-    --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 \
-    --use_qformer
\ No newline at end of file
+    --bf16
\ No newline at end of file
diff --git a/examples/mllm/sft_blip2.sh b/examples/mllm/sft_llava.sh
similarity index 58%
rename from examples/mllm/sft_blip2.sh
rename to examples/mllm/sft_llava.sh
index ac0a3f11..c1fce693 100644
--- a/examples/mllm/sft_blip2.sh
+++ b/examples/mllm/sft_llava.sh
@@ -3,20 +3,20 @@
 CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --stage sft_mm \
     --do_train \
-    --model_name_or_path /home/LAB/fengzc/LLM/checkpoints/Salesforce/blip2-opt-2.7b \
-    --dataset llava_instruct_100 \
+    --model_name_or_path llava-hf/llava-1.5-7b-hf \
+    --dataset mllm_instruct_example \
     --dataset_dir data \
     --template default \
     --finetuning_type lora \
-    --lora_target q_proj,k_proj \
-    --output_dir saves/blip2-opt-2.7b/lora/sft \
+    --lora_target all \
+    --output_dir saves/llava-1.5-7b/lora/sft \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
     --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 4 \
+    --per_device_train_batch_size 3 \
     --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
+    --gradient_accumulation_steps 1 \
     --lr_scheduler_type cosine \
     --logging_steps 1 \
     --warmup_steps 20 \
@@ -25,9 +25,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
     --evaluation_strategy steps \
     --load_best_model_at_end \
     --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
+    --num_train_epochs 100 \
     --max_samples 3000 \
     --val_size 0.1 \
     --plot_loss \
-    --quantization_bit 8 \
-    --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017
\ No newline at end of file
+    --bf16
\ No newline at end of file
diff --git a/scripts/make_mllm_instruct.py b/scripts/make_mllm_instruct.py
new file mode 100644
index 00000000..41e13b8e
--- /dev/null
+++ b/scripts/make_mllm_instruct.py
@@ -0,0 +1,95 @@
+import json
+import os.path
+
+import fire
+from datasets import Dataset, concatenate_datasets, load_dataset, Value, Image, Features, Sequence
+
+"""usage
+python3 scripts/make_mllm_instruct.py \
+--json_path data/llava_instruct_example.json \
+--image_path data/images \
+--output_path data/mllm_example_dataset
+"""
+
+
+def make_one_json(json_path, image_path) -> Dataset:
+    with open(json_path) as f:
+        raw_data_ls = json.loads(f.read())
+    data_ls = []
+    for i, data in enumerate(raw_data_ls):
+        for j, message in enumerate(data['messages']):
+            text = message['content']
+            message['content'] = [{'index': None, 'text': text, 'type': 'text'}]
+            if j == 0:
+                message['content'].append({'index': 0, 'text': None, 'type': 'image'})
+        image = data['image']
+        if image_path:
+            image = os.path.join(image_path, data['image'])
+        data['images'] = [image]
+        del data['image']
+        data_ls.append(data)
+
+    def gen():
+        for data in data_ls:
+            yield data
+
+    features = Features({'messages': [{'content': [
+        {'index': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None),
+         'type': Value(dtype='string', id=None)}], 'role': Value(dtype='string', id=None)}],
+        'images': Sequence(feature=Image(decode=True, id=None), length=-1, id=None)})
+    dataset = Dataset.from_generator(gen, features=features)
+    return dataset
+
+
+yaml_content = """---
+dataset_info:
+  features:
+  - name: messages
+    list:
+    - name: content
+      list:
+      - name: index
+        dtype: int64
+      - name: text
+        dtype: string
+      - name: type
+        dtype: string
+    - name: role
+      dtype: string
+  - name: images
+    sequence: image
+configs:
+- config_name: default
+  data_files:
+  - split: train
+    path: data/train-*
+  - split: test
+    path: data/test-*
+---"""
+
+
+def main(
+    json_path: str,
+    image_path: str,
+    output_path: str,
+):
+    json_path_list = json_path.split()
+    dataset_list = []
+    for json_path in json_path_list:
+        dataset = make_one_json(json_path, image_path)
+        dataset_list.append(dataset)
+    dataset = concatenate_datasets(dataset_list)
+    print(dataset[0])
+    data_path = os.path.join(output_path, "data")
+    os.makedirs(os.path.join(data_path), exist_ok=True)
+    parquet_path = os.path.join(data_path, "train-0.parquet")
+    dataset.to_parquet(parquet_path)
+    parquet_path = os.path.join(data_path, "test-0.parquet")
+    dataset.to_parquet(parquet_path)
+    readme_path = os.path.join(output_path, "README.md")
+    with open(readme_path, 'w') as f:
+        f.write(yaml_content)
+
+
+if __name__ == '__main__':
+    fire.Fire(main)
diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py
new file mode 100644
index 00000000..c03525b8
--- /dev/null
+++ b/scripts/test_mllm.py
@@ -0,0 +1,84 @@
+import os.path
+
+import fire
+import torch
+from datasets import load_dataset
+from peft import PeftModel
+from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
+
+"""usage
+python3 scripts/test_mllm.py \
+--base_model_path llava-hf/llava-1.5-7b-hf \
+--lora_model_path saves/llava-1.5-7b/lora/sft \
+--model_path saves/llava-1.5-7b/lora/merged \
+--dataset_name data/mllm_example_dataset \
+--do_merge 1
+"""
+
+
+def get_processor(model_path):
+    CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""
+    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+    tokenizer.chat_template = CHAT_TEMPLATE
+    processor = AutoProcessor.from_pretrained(model_path)
+    processor.tokenizer = tokenizer
+    return processor
+
+
+def apply_lora(base_model_path, model_path, lora_path):
+    print(f"Loading the base model from {base_model_path}")
+    base_model = AutoModelForVision2Seq.from_pretrained(
+        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="cuda",
+    )
+    processor = get_processor(base_model_path)
+    tokenizer = processor.tokenizer
+    print(f"Loading the LoRA adapter from {lora_path}")
+
+    lora_model = PeftModel.from_pretrained(
+        base_model,
+        lora_path,
+        torch_dtype=torch.float16,
+    )
+
+    print("Applying the LoRA")
+    model = lora_model.merge_and_unload()
+
+    print(f"Saving the target model to {model_path}")
+    model.save_pretrained(model_path)
+    tokenizer.save_pretrained(model_path)
+    processor.image_processor.save_pretrained(model_path)
+
+
+def main(
+        model_path: str,
+        dataset_name: str,
+        base_model_path: str = "",
+        lora_model_path: str = "",
+        do_merge: bool = False,
+):
+    if not os.path.exists(model_path) or do_merge:
+        apply_lora(base_model_path, model_path, lora_model_path)
+    model = AutoModelForVision2Seq.from_pretrained(
+        model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, device_map="cuda"
+    )
+    processor = get_processor(model_path)
+    train_dataset = load_dataset(dataset_name)['train']
+    # Cap the sample at the dataset size so a dataset with fewer than 3 rows does not break select().
+    examples = train_dataset.select(range(min(3, len(train_dataset))))
+    texts = []
+    images = []
+    for example in examples:
+        # Only the first message is used as the prompt (assumed to be the user turn), so ask the
+        # template to append "ASSISTANT: " and let generate() produce the reply.
+        text = processor.tokenizer.apply_chat_template(
+            example["messages"][:1], tokenize=False, add_generation_prompt=True
+        )
+        texts.append(text)
+        images.append(example["images"][0])
+    batch = processor(texts, images, return_tensors="pt", padding=True).to("cuda")
+    output = model.generate(**batch, max_new_tokens=100)
+    print(processor.batch_decode(output, skip_special_tokens=True))
+
+
+if __name__ == '__main__':
+    fire.Fire(main)
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index b3af434b..18665731 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -199,8 +199,7 @@ def get_mm_dataset(
     with training_args.main_process_first(desc="load dataset"):
         all_datasets = []
         for dataset_attr in get_dataset_list(data_args):
-            local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
-            all_datasets.append(load_dataset("json", data_files=local_path)['train'])
+            all_datasets.append(load_dataset(dataset_attr.dataset_name)['train'])
         dataset = merge_dataset(all_datasets, data_args, training_args)
 
     return dataset
diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index b8edfa10..8494ba7e 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -275,4 +275,4 @@ def get_preprocess_and_print_func(
         )
         print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
 
-    return preprocess_func, print_function
+    return preprocess_func, print_function
\ No newline at end of file
diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py
index 3b52f1ea..f5f75c77 100644
--- a/src/llmtuner/hparams/data_args.py
+++ b/src/llmtuner/hparams/data_args.py
@@ -88,10 +88,6 @@ class DataArguments:
         default=None,
         metadata={"help": "Path to save or load the tokenized datasets."},
     )
-    image_path: Optional[str] = field(
-        default=None,
-        metadata={"help": "Path to images."},
-    )
 
     def __post_init__(self):
         if self.reserved_label_len >= self.cutoff_len:
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index 32637f59..0e42033f 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -165,10 +165,6 @@ class ModelArguments:
         default=False,
         metadata={"help": "For debugging purposes, print the status of the parameters in the model."},
     )
-    use_qformer: bool = field(
-        default=False,
-        metadata={"help": "Whether use qformer for Multimodal LLM."},
-    )
 
     def __post_init__(self):
         self.compute_dtype = None
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 624d8a85..e66a984b 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -182,7 +182,8 @@ def init_adapter(
 def init_mm_adapter(
         model: "AutoModelForVision2Seq", model_args: "ModelArguments",
         finetuning_args: "FinetuningArguments",
-        is_trainable: bool
+        is_trainable: bool,
+        use_clm=True,
 ) -> "AutoModelForVision2Seq":
     if finetuning_args.finetuning_type == "lora":
         logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA"))
@@ -253,12 +254,19 @@ def init_mm_adapter(
                 }
                 model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
             else:
-                lora_config = LoraConfig(
-                    # task_type=TaskType.CAUSAL_LM,
-                    inference_mode=False,
-                    use_dora=finetuning_args.use_dora,
-                    **peft_kwargs,
-                )
+                if use_clm:
+                    lora_config = LoraConfig(
+                        task_type=TaskType.CAUSAL_LM,
+                        inference_mode=False,
+                        use_dora=finetuning_args.use_dora,
+                        **peft_kwargs,
+                    )
+                else:
+                    lora_config = LoraConfig(
+                        inference_mode=False,
+                        use_dora=finetuning_args.use_dora,
+                        **peft_kwargs,
+                    )
                 model = get_peft_model(model, lora_config)
 
         if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index eeee69a6..917f11c9 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -191,6 +191,7 @@ def load_mm_model(
         finetuning_args: "FinetuningArguments",
         is_trainable: bool = False,
         add_valuehead: bool = False,
+        use_clm=True,
 ) -> "AutoModelForVision2Seq":
     r"""
     Loads pretrained model. Must after load_tokenizer.
@@ -231,7 +232,7 @@ def load_mm_model(
     patch_model(model, tokenizer, model_args, is_trainable)
     register_autoclass(config, model, tokenizer)
 
-    model = init_mm_adapter(model, model_args, finetuning_args, is_trainable)
+    model = init_mm_adapter(model, model_args, finetuning_args, is_trainable, use_clm)
 
     if not is_trainable:
         model.requires_grad_(False)
diff --git a/src/llmtuner/train/sftmm/collator.py b/src/llmtuner/train/sftmm/collator.py
index e91374bc..95dbd939 100644
--- a/src/llmtuner/train/sftmm/collator.py
+++ b/src/llmtuner/train/sftmm/collator.py
@@ -1,69 +1,29 @@
-import json
-import os
 from dataclasses import dataclass
-
-import torch
-from torch.utils.data import Dataset as Dataset_torch
-from datasets import Dataset
-from PIL import Image
 from transformers import AutoProcessor
 
 
-class ImageCaptioningDataset(Dataset_torch):
-    def __init__(self, dataset: Dataset, image_path: str, processor: AutoProcessor):
-        self.processor = processor
-        self.dataset = dataset
-        self.image_path = image_path
-
-    def __len__(self):
-        return len(self.dataset)
-
-    def __getitem__(self, idx):
-        source = self.dataset[idx]
-        image_id = source['image']
-        image = Image.open(os.path.join(self.image_path, image_id))
-        convs = source['conversations']
-        prompt = convs[0]['value']
-        label = convs[1]['value']
-        image_inputs = self.processor(image, return_tensors="pt")
-        image_inputs = {k: v.squeeze() for k, v in image_inputs.items()}
-        inputs = {
-            "input_ids": prompt,
-            "labels": label,
-        }
-        for key in image_inputs:
-            inputs[key] = image_inputs[key]
-        return inputs
-
-
 @dataclass
 class DataCollatorForVis2Seq:
     processor: AutoProcessor
-    use_qformer: bool = False
 
-    def __call__(self, features, return_tensors=None):
-        processed_batch = {}
-        for key in features[0].keys():
-            if key == 'pixel_values':
-                processed_batch[key] = torch.stack([example[key] for example in features])
-            elif key == 'input_ids':
-                text_inputs = self.processor.tokenizer(
-                    [example[key] for example in features], padding="max_length", return_tensors="pt",
-                    max_length=512,
-                )
-                processed_batch["input_ids"] = text_inputs["input_ids"]
-                processed_batch["attention_mask"] = text_inputs["attention_mask"]
-                if self.use_qformer:
-                    qformer_text_inputs = self.processor.qformer_tokenizer(
-                        [example[key] for example in features], padding="max_length", return_tensors="pt",
-                        max_length=512,
-                    )
-                    processed_batch["qformer_input_ids"] = qformer_text_inputs["input_ids"]
-                    processed_batch["qformer_attention_mask"] = qformer_text_inputs["attention_mask"]
-            elif key == 'labels':
-                text_inputs = self.processor.tokenizer(
-                    [example[key] for example in features], padding="max_length", return_tensors="pt",
-                    max_length=512,
-                )
-                processed_batch["labels"] = text_inputs["input_ids"]
-        return processed_batch
+    def __call__(self, examples):
+        """Collate chat examples into one padded batch with labels (pad positions set to -100)."""
+        texts = []
+        images = []
+        for example in examples:
+            # Reject zero images too: indexing [0] below would otherwise raise a bare IndexError.
+            if len(example["images"]) != 1:
+                raise ValueError("This collator only supports exactly one image per example")
+            text = self.processor.tokenizer.apply_chat_template(
+                example["messages"], tokenize=False, add_generation_prompt=False
+            )
+            texts.append(text)
+            images.append(example["images"][0])
+
+        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)
+        # Labels mirror input_ids; padding is set to -100, the ignore index of the PyTorch/HF loss.
+        labels = batch["input_ids"].clone()
+        if self.processor.tokenizer.pad_token_id is not None:
+            labels[labels == self.processor.tokenizer.pad_token_id] = -100
+        batch["labels"] = labels
+        return batch
diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py
index 96b86b44..f094e609 100644
--- a/src/llmtuner/train/sftmm/trainer.py
+++ b/src/llmtuner/train/sftmm/trainer.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
-from transformers import Seq2SeqTrainer
+from transformers import Seq2SeqTrainer, Trainer
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
@@ -32,23 +32,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
 
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
-    # def compute_loss(self, model, inputs, return_outputs=False):
-    #     print(inputs.keys())
-    #     device = "cuda"
-    #     input_ids = inputs.get("input_ids").to(device)
-    #     pixel_values = inputs.get("pixel_values").to(device, torch.float16)
-    #     attention_mask = inputs.get("attention_mask").to(device)
-    #     labels = inputs.get("labels").to(device)
-    #
-    #     outputs = model(input_ids=input_ids,
-    #                     pixel_values=pixel_values,
-    #                     labels=labels,
-    #                     # attention_mask=attention_mask,
-    #                     )
-    #     loss = outputs.loss
-    #     print("Loss:", loss.item())
-    #     return (loss, outputs) if return_outputs else loss
-
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
@@ -59,79 +42,3 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
     ) -> "torch.optim.lr_scheduler.LRScheduler":
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
-
-    def prediction_step(
-            self,
-            model: "torch.nn.Module",
-            inputs: Dict[str, Union[torch.Tensor, Any]],
-            prediction_loss_only: bool,
-            ignore_keys: Optional[List[str]] = None,
-    ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
-        r"""
-        Removes the prompt part in the generated tokens.
-
-        Subclass and override to inject custom behavior.
-        """
-        labels = inputs["labels"].detach().clone() if "labels" in inputs else None  # backup labels
-        if self.args.predict_with_generate:
-            assert self.tokenizer.padding_side == "left", "This method only accepts left-padded tensor."
-            prompt_len, label_len = inputs["input_ids"].size(-1), inputs["labels"].size(-1)
-            if prompt_len > label_len:
-                inputs["labels"] = self._pad_tensors_to_target_len(inputs["labels"], inputs["input_ids"])
-            if label_len > prompt_len:  # truncate the labels instead of padding the inputs (llama2 fp16 compatibility)
-                inputs["labels"] = inputs["labels"][:, :prompt_len]
-
-        loss, generated_tokens, _ = super().prediction_step(  # ignore the returned labels (may be truncated)
-            model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
-        )
-        if generated_tokens is not None and self.args.predict_with_generate:
-            generated_tokens[:, :prompt_len] = self.tokenizer.pad_token_id
-            generated_tokens = generated_tokens.contiguous()
-
-        return loss, generated_tokens, labels
-
-    def _pad_tensors_to_target_len(self, src_tensor: torch.Tensor, tgt_tensor: torch.Tensor) -> torch.Tensor:
-        r"""
-        Pads the tensor to the same length as the target tensor.
-        """
-        assert self.tokenizer.pad_token_id is not None, "Pad token is required."
-        padded_tensor = self.tokenizer.pad_token_id * torch.ones_like(tgt_tensor)
-        padded_tensor[:, -src_tensor.shape[-1]:] = src_tensor  # adopt left-padding
-        return padded_tensor.contiguous()  # in contiguous memory
-
-    def save_predictions(self, predict_results: "PredictionOutput") -> None:
-        r"""
-        Saves model predictions to `output_dir`.
-
-        A custom behavior that not contained in Seq2SeqTrainer.
-        """
-        if not self.is_world_process_zero():
-            return
-
-        output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl")
-        logger.info(f"Saving prediction results to {output_prediction_file}")
-
-        labels = np.where(
-            predict_results.label_ids != IGNORE_INDEX, predict_results.label_ids, self.tokenizer.pad_token_id
-        )
-        preds = np.where(
-            predict_results.predictions != IGNORE_INDEX, predict_results.predictions, self.tokenizer.pad_token_id
-        )
-
-        for i in range(len(preds)):
-            pad_len = np.nonzero(preds[i] != self.tokenizer.pad_token_id)[0]
-            if len(pad_len):
-                preds[i] = np.concatenate(
-                    (preds[i][pad_len[0]:], preds[i][: pad_len[0]]), axis=-1
-                )  # move pad token to last
-
-        decoded_labels = self.tokenizer.batch_decode(
-            labels, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-
-        with open(output_prediction_file, "w", encoding="utf-8") as writer:
-            res: List[str] = []
-            for label, pred in zip(decoded_labels, decoded_preds):
-                res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False))
-            writer.write("\n".join(res))
diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py
index 9f952772..21f4aebf 100644
--- a/src/llmtuner/train/sftmm/workflow.py
+++ b/src/llmtuner/train/sftmm/workflow.py
@@ -1,21 +1,14 @@
 # Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py
 import os
 from typing import TYPE_CHECKING, List, Optional
-
-import torch
-from PIL import Image
-from torch.utils.data import Dataset
-from transformers import DataCollatorForSeq2Seq, LlavaNextForConditionalGeneration, AutoModelForVision2Seq
-
 from ...data import split_dataset, get_mm_dataset
-from ...extras.constants import IGNORE_INDEX
 from ...extras.misc import get_logits_processor
 from ...extras.ploting import plot_loss
-from ...model import load_model, load_tokenizer, load_processor, load_mm_model
+from ...model import load_tokenizer, load_processor, load_mm_model
 from ..utils import create_modelcard_and_push
 from .metric import ComputeMetrics
 from .trainer import CustomSeq2SeqTrainer
-from .collator import DataCollatorForVis2Seq, ImageCaptioningDataset
+from .collator import DataCollatorForVis2Seq
 
 if TYPE_CHECKING:
     from transformers import Seq2SeqTrainingArguments, TrainerCallback
@@ -32,28 +25,27 @@ def run_sft_mm(
         callbacks: Optional[List["TrainerCallback"]] = None,
 ):
     processor = load_processor(model_args)
-    tokenizer = processor.tokenizer
-    model = load_mm_model(processor, model_args, finetuning_args, training_args.do_train)
+    tokenizer = load_tokenizer(model_args)
+    CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""
+    tokenizer.chat_template = CHAT_TEMPLATE
+    processor.tokenizer = tokenizer
+    use_clm = True
+    if "blip" in model_args.model_name_or_path:
+        use_clm = False
+    model = load_mm_model(processor, model_args, finetuning_args, training_args.do_train, use_clm=use_clm)
     dataset = get_mm_dataset(processor, model_args, data_args, training_args, stage="sft")
-    if training_args.predict_with_generate:
-        tokenizer.padding_side = "left"  # use left-padding in generation
     if getattr(model, "is_quantized", False) and not training_args.do_train:
         setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
-    splited_dataset = split_dataset(dataset, data_args, training_args)
-    splited_dataset['train_dataset'].set_format(type=splited_dataset['train_dataset'].format["type"],
-                                                columns=list(splited_dataset['train_dataset'].features.keys()))
-    splited_dataset['eval_dataset'].set_format(type=splited_dataset['eval_dataset'].format["type"],
-                                               columns=list(splited_dataset['eval_dataset'].features.keys()))
-    train_dataset = ImageCaptioningDataset(splited_dataset['train_dataset'], data_args.image_path, processor)
-    eval_dataset = ImageCaptioningDataset(splited_dataset['eval_dataset'], data_args.image_path, processor)
+    train_dataset = dataset
+    eval_dataset = dataset
     data_collator = DataCollatorForVis2Seq(
         processor=processor,
-        use_qformer=model_args.use_qformer,
     )
 
     # Override the decoding parameters of Seq2SeqTrainer
     training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
     training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
+    training_args.remove_unused_columns = False
 
     # Initialize our Trainer
     trainer = CustomSeq2SeqTrainer(
@@ -67,7 +59,6 @@ def run_sft_mm(
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
     )
-
     # Keyword arguments for `model.generate`
     gen_kwargs = generating_args.to_dict()
     gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids

From 0e3cc523278b0d10133699d682ca85a2b1dbdc14 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 00:56:06 +0800
Subject: [PATCH 152/341] remove conflicts

Former-commit-id: e5750ee202eb67cf5fc54f464548e2eb43d00900
---
 scripts/test_mllm.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py
index 882bf032..961f02bf 100644
--- a/scripts/test_mllm.py
+++ b/scripts/test_mllm.py
@@ -5,6 +5,7 @@ import torch
 from datasets import load_dataset
 from peft import PeftModel
 from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
+import shutil
 
 """usage
 python3 scripts/test_mllm.py \
@@ -47,15 +48,14 @@ def apply_lora(base_model_path, model_path, lora_path):
     model.save_pretrained(model_path)
     tokenizer.save_pretrained(model_path)
     processor.image_processor.save_pretrained(model_path)
-    if 'instructblip' in model_path:
-        processor.qformer_tokenizer.save_pretrained(model_path)
+
 
 def main(
-        model_path: str,
-        dataset_name: str,
-        base_model_path: str = "",
-        lora_model_path: str = "",
-        do_merge: bool = False,
+    model_path: str,
+    dataset_name: str,
+    base_model_path: str = "",
+    lora_model_path: str = "",
+    do_merge: bool = False,
 ):
     if not os.path.exists(model_path) or do_merge:
         apply_lora(base_model_path, model_path, lora_model_path)

From 5142349661fc7b9cbe5e30001878e2d68fa9f678 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 01:01:59 +0800
Subject: [PATCH 153/341] remove error

Former-commit-id: 2bcd1c7dc3595f17ae4e2c4475196cc2d03d0e75
---
 src/llmtuner/model/loader.py | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index f3856da7..a6c37922 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -202,29 +202,6 @@ def load_mm_model(
     patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
 
     model = None
-    if is_trainable and model_args.use_unsloth:
-        from unsloth import FastLanguageModel  # type: ignore
-
-        unsloth_kwargs = {
-            "model_name": model_args.model_name_or_path,
-            "max_seq_length": model_args.model_max_length,
-            "dtype": model_args.compute_dtype,
-            "load_in_4bit": model_args.quantization_bit == 4,
-            "token": model_args.hf_hub_token,
-            "device_map": {"": get_current_device()},
-            "rope_scaling": getattr(config, "rope_scaling", None),
-            "fix_tokenizer": False,
-            "trust_remote_code": True,
-        }
-        try:
-            model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
-        except NotImplementedError:
-            logger.warning("Unsloth does not support model type {}.".format(getattr(config, "model_type", None)))
-            model_args.use_unsloth = False
-
-        if model_args.adapter_name_or_path:
-            model_args.adapter_name_or_path = None
-            logger.warning("Unsloth does not support loading adapters.")
     if model is None:
         init_kwargs["config"] = config
         init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path

From 00e2a272ef03fa33c6019f11fc0a588f6f0a82b9 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 08:20:41 +0800
Subject: [PATCH 154/341] merge model part to the text stream

Former-commit-id: b6fcb832ddaed4647d6f2b926f3dfccd47f3ea84
---
 src/llmtuner/hparams/model_args.py   |   4 +
 src/llmtuner/model/__init__.py       |   3 +-
 src/llmtuner/model/adapter.py        | 107 +--------------------------
 src/llmtuner/model/loader.py         |  75 ++++---------------
 src/llmtuner/train/sftmm/workflow.py |   7 +-
 5 files changed, 24 insertions(+), 172 deletions(-)

diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index b60492a0..a6e4b710 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -169,6 +169,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "For debugging purposes, print the status of the parameters in the model."},
     )
+    use_mllm: bool = field(
+        default=False,
+        metadata={"help": "Whether use Multimodal LLM."},
+    )
 
     def __post_init__(self):
         self.compute_dtype = None
diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index 2bd73365..f6be60d8 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,10 +1,9 @@
-from .loader import load_config, load_model, load_tokenizer, load_mm_model
+from .loader import load_config, load_model, load_tokenizer, load_processor
 from .utils.misc import find_all_linear_modules, load_valuehead_params
 
 __all__ = [
     "load_config",
     "load_model",
-    "load_mm_model",
     "load_tokenizer",
     "load_processor",
     "load_valuehead_params",
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 8079c028..bcefee92 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Union
 
 import torch
 from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model
@@ -21,11 +21,11 @@ logger = get_logger(__name__)
 
 def init_adapter(
     config: "PretrainedConfig",
-    model: "PreTrainedModel",
+    model: Union["PreTrainedModel","AutoModelForVision2Seq"],
     model_args: "ModelArguments",
     finetuning_args: "FinetuningArguments",
     is_trainable: bool,
-) -> "PreTrainedModel":
+) -> Union["PreTrainedModel","AutoModelForVision2Seq"]:
     r"""
     Initializes the adapters.
 
@@ -195,103 +195,4 @@ def init_adapter(
         if model_args.adapter_name_or_path is not None:
             logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
 
-    return model
-
-
-def init_mm_adapter(
-        model: "AutoModelForVision2Seq", model_args: "ModelArguments",
-        finetuning_args: "FinetuningArguments",
-        is_trainable: bool,
-        use_clm=True,
-) -> "AutoModelForVision2Seq":
-    if finetuning_args.finetuning_type == "lora":
-        logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA"))
-        adapter_to_resume = None
-
-        if model_args.adapter_name_or_path is not None:
-            is_mergeable = True
-            if getattr(model, "quantization_method", None):  # merge lora in quantized model is unstable
-                assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter."
-                is_mergeable = False
-
-            if is_deepspeed_zero3_enabled():
-                assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3."
-                is_mergeable = False
-
-            if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable):
-                adapter_to_merge = model_args.adapter_name_or_path[:-1]
-                adapter_to_resume = model_args.adapter_name_or_path[-1]
-            else:
-                adapter_to_merge = model_args.adapter_name_or_path
-
-            for adapter in adapter_to_merge:
-                model: "LoraModel" = PeftModel.from_pretrained(
-                    model, adapter, offload_folder=model_args.offload_folder
-                )
-                model = model.merge_and_unload()
-
-            if len(adapter_to_merge) > 0:
-                logger.info("Merged {} adapter(s).".format(len(adapter_to_merge)))
-
-            if adapter_to_resume is not None:  # resume lora training
-                model = PeftModel.from_pretrained(
-                    model, adapter_to_resume, is_trainable=is_trainable, offload_folder=model_args.offload_folder
-                )
-
-        if is_trainable and adapter_to_resume is None:  # create new lora weights while training
-            if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all":
-                target_modules = find_all_linear_modules(model)
-            else:
-                target_modules = finetuning_args.lora_target
-
-            if finetuning_args.use_llama_pro:
-                target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable)
-
-            if (
-                    finetuning_args.use_dora
-                    and getattr(model, "quantization_method", None) is not None
-                    and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
-            ):
-                raise ValueError("DoRA is not compatible with PTQ-quantized models.")
-
-            peft_kwargs = {
-                "r": finetuning_args.lora_rank,
-                "target_modules": target_modules,
-                "lora_alpha": finetuning_args.lora_alpha,
-                "lora_dropout": finetuning_args.lora_dropout,
-                "use_rslora": finetuning_args.use_rslora,
-                "modules_to_save": finetuning_args.additional_target,
-            }
-
-            if model_args.use_unsloth:
-                from unsloth import FastLanguageModel  # type: ignore
-
-                unsloth_peft_kwargs = {
-                    "model": model,
-                    "max_seq_length": model_args.model_max_length,
-                    "use_gradient_checkpointing": "unsloth",
-                }
-                model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
-            else:
-                if use_clm:
-                    lora_config = LoraConfig(
-                        task_type=TaskType.CAUSAL_LM,
-                        inference_mode=False,
-                        use_dora=finetuning_args.use_dora,
-                        **peft_kwargs,
-                    )
-                else:
-                    lora_config = LoraConfig(
-                        inference_mode=False,
-                        use_dora=finetuning_args.use_dora,
-                        **peft_kwargs,
-                    )
-                model = get_peft_model(model, lora_config)
-
-        if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
-            for param in filter(lambda p: p.requires_grad, model.parameters()):
-                param.data = param.data.to(torch.float32)
-
-        if model_args.adapter_name_or_path is not None:
-            logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
-    return model
+    return model
\ No newline at end of file
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index a6c37922..3712a592 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -1,11 +1,11 @@
-from typing import TYPE_CHECKING, Any, Dict
+from typing import TYPE_CHECKING, Any, Dict, Union
 
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModelForVision2Seq
 from trl import AutoModelForCausalLMWithValueHead
 
 from ..extras.logging import get_logger
 from ..extras.misc import count_parameters, try_download_model_from_ms
-from .adapter import init_adapter, init_mm_adapter
+from .adapter import init_adapter
 from .patcher import patch_config, patch_model, patch_tokenizer, patch_valuehead_model
 from .utils.misc import load_valuehead_params, register_autoclass
 from .utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
@@ -106,12 +106,12 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
 
 
 def load_model(
-        tokenizer: "PreTrainedTokenizer",
-        model_args: "ModelArguments",
-        finetuning_args: "FinetuningArguments",
-        is_trainable: bool = False,
-        add_valuehead: bool = False,
-) -> "PreTrainedModel":
+    tokenizer: "PreTrainedTokenizer",
+    model_args: "ModelArguments",
+    finetuning_args: "FinetuningArguments",
+    is_trainable: bool = False,
+    add_valuehead: bool = False,
+) -> Union["PreTrainedModel", "AutoModelForVision2Seq"]:
     r"""
     Loads pretrained model.
     """
@@ -134,7 +134,10 @@ def load_model(
         if model_args.mixture_of_depths == "load":
             model = load_mod_pretrained_model(**init_kwargs)
         else:
-            model = AutoModelForCausalLM.from_pretrained(**init_kwargs)
+            if model_args.use_mllm:
+                model = AutoModelForVision2Seq.from_pretrained(**init_kwargs)
+            else:
+                model = AutoModelForCausalLM.from_pretrained(**init_kwargs)
 
         if model_args.mixture_of_depths == "convert":
             model = convert_pretrained_model_to_mod(model, config, model_args)
@@ -182,56 +185,4 @@ def load_model(
                 )
             )
 
-    return model
-
-
-def load_mm_model(
-        processor: "AutoProcessor",
-        model_args: "ModelArguments",
-        finetuning_args: "FinetuningArguments",
-        is_trainable: bool = False,
-        add_valuehead: bool = False,
-        use_clm=True,
-) -> "AutoModelForVision2Seq":
-    r"""
-    Loads pretrained model. Must after load_tokenizer.
-    """
-    tokenizer = processor.tokenizer
-    init_kwargs = _get_init_kwargs(model_args)
-    config = AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)
-    patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
-
-    model = None
-    if model is None:
-        init_kwargs["config"] = config
-        init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path
-        model: "AutoModelForVision2Seq" = AutoModelForVision2Seq.from_pretrained(**init_kwargs)
-    patch_model(model, tokenizer, model_args, is_trainable)
-    register_autoclass(config, model, tokenizer)
-
-    model = init_mm_adapter(model, model_args, finetuning_args, is_trainable, use_clm)
-
-    if not is_trainable:
-        model.requires_grad_(False)
-        model.eval()
-    else:
-        model.train()
-
-    trainable_params, all_param = count_parameters(model)
-    if is_trainable:
-        param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
-            trainable_params, all_param, 100 * trainable_params / all_param
-        )
-    else:
-        param_stats = "all params: {:d}".format(all_param)
-    logger.info(param_stats)
-
-    if model_args.print_param_status:
-        for name, param in model.named_parameters():
-            print(
-                "name: {}, dtype: {}, device: {}, trainable: {}".format(
-                    name, param.dtype, param.device, param.requires_grad
-                )
-            )
-
-    return model
+    return model
\ No newline at end of file
diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py
index 21f4aebf..7afd8f6f 100644
--- a/src/llmtuner/train/sftmm/workflow.py
+++ b/src/llmtuner/train/sftmm/workflow.py
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, List, Optional
 from ...data import split_dataset, get_mm_dataset
 from ...extras.misc import get_logits_processor
 from ...extras.ploting import plot_loss
-from ...model import load_tokenizer, load_processor, load_mm_model
+from ...model import load_tokenizer, load_processor, load_model
 from ..utils import create_modelcard_and_push
 from .metric import ComputeMetrics
 from .trainer import CustomSeq2SeqTrainer
@@ -29,10 +29,7 @@ def run_sft_mm(
     CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""
     tokenizer.chat_template = CHAT_TEMPLATE
     processor.tokenizer = tokenizer
-    use_clm = True
-    if "blip" in model_args.model_name_or_path:
-        use_clm = False
-    model = load_mm_model(processor, model_args, finetuning_args, training_args.do_train, use_clm=use_clm)
+    model = load_model(processor.tokenizer, model_args, finetuning_args, training_args.do_train)
     dataset = get_mm_dataset(processor, model_args, data_args, training_args, stage="sft")
     if getattr(model, "is_quantized", False) and not training_args.do_train:
         setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction

From 9aeb88c426b745b66e1da70c6656a2a71fa35b3a Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 25 Apr 2024 19:02:32 +0800
Subject: [PATCH 155/341] add export_device in webui #3333

Former-commit-id: 30ebd3652809d73941e0a5e4a8be11d989faf98d
---
 examples/README.md                      |  6 +++---
 examples/merge_lora/merge.sh            |  1 +
 src/llmtuner/hparams/model_args.py      |  2 +-
 src/llmtuner/model/patcher.py           | 12 ++----------
 src/llmtuner/model/utils/moe.py         | 16 +++++++++++++++-
 src/llmtuner/webui/components/export.py | 19 ++++++++++++++-----
 src/llmtuner/webui/locales.py           | 23 +++++++++++++++++++++--
 7 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 111f50bd..cc01cf9f 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -28,9 +28,9 @@ examples/
 │   ├── merge.sh: Merge LoRA weights into the pre-trained models
 │   └── quantize.sh: Quantize the fine-tuned model with AutoGPTQ
 ├── inference/
-│   ├── cli_demo.sh: Launch a command line interface with LoRA adapters
-│   ├── api_demo.sh: Launch an OpenAI-style API with LoRA adapters
-│   ├── web_demo.sh: Launch a web interface with LoRA adapters
+│   ├── cli_demo.sh: Chat with fine-tuned model in the CLI with LoRA adapters
+│   ├── api_demo.sh: Chat with fine-tuned model in an OpenAI-style API with LoRA adapters
+│   ├── web_demo.sh: Chat with fine-tuned model in the Web browser with LoRA adapters
 │   └── evaluate.sh: Evaluate model on the MMLU/CMMLU/C-Eval benchmarks with LoRA adapters
 └── extras/
     ├── galore/
diff --git a/examples/merge_lora/merge.sh b/examples/merge_lora/merge.sh
index 8c50591e..c50bd6ad 100644
--- a/examples/merge_lora/merge.sh
+++ b/examples/merge_lora/merge.sh
@@ -8,4 +8,5 @@ CUDA_VISIBLE_DEVICES=0 python ../../src/export_model.py \
     --finetuning_type lora \
     --export_dir ../../models/llama2-7b-sft \
     --export_size 2 \
+    --export_device cpu \
     --export_legacy_format False
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index b60492a0..bb8a8193 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -139,7 +139,7 @@ class ModelArguments:
     )
     export_device: str = field(
         default="cpu",
-        metadata={"help": "The device used in model export."},
+        metadata={"help": "The device used in model export, use cuda to avoid addmm errors."},
     )
     export_quantization_bit: Optional[int] = field(
         default=None,
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index c0166a8a..5c3c31b3 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -12,7 +12,7 @@ from .utils.attention import configure_attn_implementation, print_attn_implement
 from .utils.checkpointing import prepare_model_for_training
 from .utils.embedding import resize_embedding_layer
 from .utils.longlora import configure_longlora
-from .utils.moe import add_z3_leaf_module
+from .utils.moe import add_z3_leaf_module, configure_moe
 from .utils.quantization import configure_quantization
 from .utils.rope import configure_rope
 
@@ -46,17 +46,12 @@ def patch_config(
     configure_rope(config, model_args, is_trainable)
     configure_longlora(config, model_args, is_trainable)
     configure_quantization(config, tokenizer, model_args, init_kwargs)
+    configure_moe(config, model_args, is_trainable)
 
     if model_args.use_cache and not is_trainable:
         setattr(config, "use_cache", True)
         logger.info("Using KV cache for faster generation.")
 
-    if model_args.moe_aux_loss_coef is not None:
-        if getattr(config, "model_type", None) in ["mixtral", "qwen2_moe"]:
-            setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)
-        elif getattr(config, "model_type", None) == "deepseek":
-            setattr(config, "aux_loss_alpha", model_args.moe_aux_loss_coef)
-
     if getattr(config, "model_type", None) == "qwen":
         setattr(config, "use_flash_attn", model_args.flash_attn)
         for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
@@ -65,9 +60,6 @@ def patch_config(
     if getattr(config, "model_type", None) == "qwen2" and is_trainable and model_args.flash_attn:
         setattr(config, "use_cache", False)  # qwen2 does not support use_cache when using flashattn
 
-    if getattr(config, "model_type", None) in ["mixtral", "qwen2_moe"] and is_trainable:
-        setattr(config, "output_router_logits", True)
-
     init_kwargs["torch_dtype"] = model_args.compute_dtype
     if not is_deepspeed_zero3_enabled():
         init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage
diff --git a/src/llmtuner/model/utils/moe.py b/src/llmtuner/model/utils/moe.py
index 020a8f55..64dcaba5 100644
--- a/src/llmtuner/model/utils/moe.py
+++ b/src/llmtuner/model/utils/moe.py
@@ -5,7 +5,9 @@ from transformers.utils.versions import require_version
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel
+    from transformers import PretrainedConfig, PreTrainedModel
+
+    from ...hparams import ModelArguments
 
 
 def add_z3_leaf_module(model: "PreTrainedModel") -> None:
@@ -37,3 +39,15 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None:
         from transformers.models.dbrx.modeling_dbrx import DbrxFFN
 
         set_z3_leaf_modules(model, [DbrxFFN])
+
+
+def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
+    if model_args.moe_aux_loss_coef is not None:
+        if getattr(config, "model_type", None) in ["jamba", "mixtral", "qwen2_moe"]:
+            setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)
+
+        elif getattr(config, "model_type", None) == "deepseek":
+            setattr(config, "aux_loss_alpha", model_args.moe_aux_loss_coef)
+
+    if getattr(config, "model_type", None) in ["dbrx", "jamba", "mixtral", "qwen2_moe"]:
+        setattr(config, "output_router_logits", is_trainable)
diff --git a/src/llmtuner/webui/components/export.py b/src/llmtuner/webui/components/export.py
index d9c2d8e4..ebccac25 100644
--- a/src/llmtuner/webui/components/export.py
+++ b/src/llmtuner/webui/components/export.py
@@ -1,5 +1,6 @@
 from typing import TYPE_CHECKING, Dict, Generator, List
 
+from ...extras.misc import torch_gc
 from ...extras.packages import is_gradio_available
 from ...train import export_model
 from ..common import get_save_dir
@@ -26,9 +27,10 @@ def save_model(
     adapter_path: List[str],
     finetuning_type: str,
     template: str,
-    max_shard_size: int,
+    export_size: int,
     export_quantization_bit: int,
     export_quantization_dataset: str,
+    export_device: str,
     export_legacy_format: bool,
     export_dir: str,
     export_hub_model_id: str,
@@ -44,6 +46,8 @@ def save_model(
         error = ALERTS["err_no_dataset"][lang]
     elif export_quantization_bit not in GPTQ_BITS and not adapter_path:
         error = ALERTS["err_no_adapter"][lang]
+    elif export_quantization_bit in GPTQ_BITS and adapter_path:
+        error = ALERTS["err_gptq_lora"][lang]
 
     if error:
         gr.Warning(error)
@@ -64,22 +68,25 @@ def save_model(
         template=template,
         export_dir=export_dir,
         export_hub_model_id=export_hub_model_id or None,
-        export_size=max_shard_size,
+        export_size=export_size,
         export_quantization_bit=int(export_quantization_bit) if export_quantization_bit in GPTQ_BITS else None,
         export_quantization_dataset=export_quantization_dataset,
+        export_device=export_device,
         export_legacy_format=export_legacy_format,
     )
 
     yield ALERTS["info_exporting"][lang]
     export_model(args)
+    torch_gc()
     yield ALERTS["info_exported"][lang]
 
 
 def create_export_tab(engine: "Engine") -> Dict[str, "Component"]:
     with gr.Row():
-        max_shard_size = gr.Slider(value=1, minimum=1, maximum=100, step=1)
+        export_size = gr.Slider(value=1, minimum=1, maximum=100, step=1)
         export_quantization_bit = gr.Dropdown(choices=["none", "8", "4", "3", "2"], value="none")
         export_quantization_dataset = gr.Textbox(value="data/c4_demo.json")
+        export_device = gr.Radio(choices=["cpu", "cuda"], value="cpu")
         export_legacy_format = gr.Checkbox()
 
     with gr.Row():
@@ -98,9 +105,10 @@ def create_export_tab(engine: "Engine") -> Dict[str, "Component"]:
             engine.manager.get_elem_by_id("top.adapter_path"),
             engine.manager.get_elem_by_id("top.finetuning_type"),
             engine.manager.get_elem_by_id("top.template"),
-            max_shard_size,
+            export_size,
             export_quantization_bit,
             export_quantization_dataset,
+            export_device,
             export_legacy_format,
             export_dir,
             export_hub_model_id,
@@ -109,9 +117,10 @@ def create_export_tab(engine: "Engine") -> Dict[str, "Component"]:
     )
 
     return dict(
-        max_shard_size=max_shard_size,
+        export_size=export_size,
         export_quantization_bit=export_quantization_bit,
         export_quantization_dataset=export_quantization_dataset,
+        export_device=export_device,
         export_legacy_format=export_legacy_format,
         export_dir=export_dir,
         export_hub_model_id=export_hub_model_id,
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index c3111e8f..3af9128f 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -1150,7 +1150,7 @@ LOCALES = {
             "value": "清空历史",
         },
     },
-    "max_shard_size": {
+    "export_size": {
         "en": {
             "label": "Max shard size (GB)",
             "info": "The maximum size for a model file.",
@@ -1192,6 +1192,20 @@ LOCALES = {
             "info": "量化过程中使用的校准数据集。",
         },
     },
+    "export_device": {
+        "en": {
+            "label": "Export device",
+            "info": "Which device should be used to export model.",
+        },
+        "ru": {
+            "label": "Экспорт устройство",
+            "info": "Какое устройство следует использовать для экспорта модели.",
+        },
+        "zh": {
+            "label": "导出设备",
+            "info": "导出模型使用的设备类型。",
+        },
+    },
     "export_legacy_format": {
         "en": {
             "label": "Export legacy format",
@@ -1287,7 +1301,12 @@ ALERTS = {
     "err_no_export_dir": {
         "en": "Please provide export dir.",
         "ru": "Пожалуйста, укажите каталог для экспорта.",
-        "zh": "请填写导出目录",
+        "zh": "请填写导出目录。",
+    },
+    "err_gptq_lora": {
+        "en": "Please merge adapters before quantizing the model.",
+        "ru": "Пожалуйста, объедините адаптеры перед квантованием модели.",
+        "zh": "量化模型前请先合并适配器。",
     },
     "err_failed": {
         "en": "Failed.",

From 3c792174dbc46f2b92f5fe5075317f9c1a1956e7 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 19:19:59 +0800
Subject: [PATCH 156/341] merge data part to the text stream

Former-commit-id: 7ee20286d9bcc2d5378bfd6bb02cd3648396d873
---
 examples/mllm/sft_instructblip.sh    |  32 ----
 scripts/test_mllm.py                 |  14 +-
 src/llmtuner/data/__init__.py        |   3 +-
 src/llmtuner/data/aligner.py         | 118 +++++++++++--
 src/llmtuner/data/loader.py          | 103 +++++------
 src/llmtuner/data/parser.py          |  37 +++-
 src/llmtuner/data/preprocess.py      | 185 +++++++++++++++++---
 src/llmtuner/data/template.py        | 248 +++++++++++++++++++++------
 src/llmtuner/hparams/model_args.py   | 115 ++++++++++---
 src/llmtuner/model/adapter.py        | 100 ++++++++---
 src/llmtuner/model/loader.py         |  26 ++-
 src/llmtuner/train/sftmm/collator.py |  15 +-
 src/llmtuner/train/sftmm/workflow.py |  90 +++++++---
 13 files changed, 802 insertions(+), 284 deletions(-)
 delete mode 100644 examples/mllm/sft_instructblip.sh

diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh
deleted file mode 100644
index b3923655..00000000
--- a/examples/mllm/sft_instructblip.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage sft_mm \
-    --do_train \
-    --model_name_or_path Salesforce/instructblip-vicuna-7b \
-    --dataset mllm_instruct_example \
-    --dataset_dir data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target all \
-    --output_dir saves/instructblip-vicuna-7b/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 3 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 1 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 1e-5 \
-    --num_train_epochs 50 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --bf16
\ No newline at end of file
diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py
index 961f02bf..94d8670b 100644
--- a/scripts/test_mllm.py
+++ b/scripts/test_mllm.py
@@ -29,7 +29,10 @@ def get_processor(model_path):
 def apply_lora(base_model_path, model_path, lora_path):
     print(f"Loading the base model from {base_model_path}")
     base_model = AutoModelForVision2Seq.from_pretrained(
-        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="cuda",
+        base_model_path,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        device_map="cuda",
     )
     processor = get_processor(base_model_path)
     tokenizer = processor.tokenizer
@@ -60,11 +63,14 @@ def main(
     if not os.path.exists(model_path) or do_merge:
         apply_lora(base_model_path, model_path, lora_model_path)
     model = AutoModelForVision2Seq.from_pretrained(
-        model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, device_map="cuda"
+        model_path,
+        torch_dtype=torch.bfloat16,
+        low_cpu_mem_usage=True,
+        device_map="cuda",
     )
     processor = get_processor(model_path)
     raw_datasets = load_dataset(dataset_name)
-    train_dataset = raw_datasets['train']
+    train_dataset = raw_datasets["train"]
     examples = train_dataset.select(range(3))
     texts = []
     images = []
@@ -81,5 +87,5 @@ def main(
     print(res)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     fire.Fire(main)
diff --git a/src/llmtuner/data/__init__.py b/src/llmtuner/data/__init__.py
index 27a2f3b8..00a82d73 100644
--- a/src/llmtuner/data/__init__.py
+++ b/src/llmtuner/data/__init__.py
@@ -1,12 +1,11 @@
 from .collator import PairwiseDataCollatorWithPadding
-from .loader import get_dataset, get_mm_dataset
+from .loader import get_dataset
 from .template import Template, get_template_and_fix_tokenizer, templates
 from .utils import Role, split_dataset
 
 __all__ = [
     "PairwiseDataCollatorWithPadding",
     "get_dataset",
-    "get_mm_dataset",
     "Template",
     "get_template_and_fix_tokenizer",
     "templates",
diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py
index 4de37e6d..85202ea8 100644
--- a/src/llmtuner/data/aligner.py
+++ b/src/llmtuner/data/aligner.py
@@ -13,7 +13,9 @@ if TYPE_CHECKING:
     from .parser import DatasetAttr
 
 
-def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
+def convert_alpaca(
+    examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
+) -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": []}
     for i in range(len(examples[dataset_attr.prompt])):
         prompt = []
@@ -31,24 +33,38 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr")
 
         prompt.append({"role": Role.USER.value, "content": "\n".join(content)})
 
-        if dataset_attr.response and isinstance(examples[dataset_attr.response][i], list):
+        if dataset_attr.response and isinstance(
+            examples[dataset_attr.response][i], list
+        ):
             response = [
-                {"role": Role.ASSISTANT.value, "content": content} for content in examples[dataset_attr.response][i]
+                {"role": Role.ASSISTANT.value, "content": content}
+                for content in examples[dataset_attr.response][i]
+            ]
+        elif dataset_attr.response and isinstance(
+            examples[dataset_attr.response][i], str
+        ):
+            response = [
+                {
+                    "role": Role.ASSISTANT.value,
+                    "content": examples[dataset_attr.response][i],
+                }
             ]
-        elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str):
-            response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}]
         else:
             response = []
 
         outputs["prompt"].append(prompt)
         outputs["response"].append(response)
-        outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
+        outputs["system"].append(
+            examples[dataset_attr.system][i] if dataset_attr.system else ""
+        )
         outputs["tools"].append("")
-
+        outputs["images"].append([])
     return outputs
 
 
-def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
+def convert_sharegpt(
+    examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
+) -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": []}
     tag_mapping = {
         dataset_attr.user_tag: Role.USER.value,
@@ -61,7 +77,10 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
     even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag)
     accept_tags = (odd_tags, even_tags)
     for i, messages in enumerate(examples[dataset_attr.messages]):
-        if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag:
+        if (
+            dataset_attr.system_tag
+            and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag
+        ):
             system = messages[0][dataset_attr.content_tag]
             messages = messages[1:]
         else:
@@ -77,19 +96,81 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
                 raise ValueError("Invalid role tag in {}.".format(messages))
 
             aligned_messages.append(
-                {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
+                {
+                    "role": tag_mapping[message[dataset_attr.role_tag]],
+                    "content": message[dataset_attr.content_tag],
+                }
             )
 
         outputs["prompt"].append(aligned_messages[:-1])
         outputs["response"].append(aligned_messages[-1:])
         outputs["system"].append(system)
-        outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
+        outputs["tools"].append(
+            examples[dataset_attr.tools][i] if dataset_attr.tools else ""
+        )
+        outputs["images"].append([])
+
+    return outputs
+
+
+def convert_llava(
+    examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
+) -> Dict[str, List[Any]]:
+    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
+    tag_mapping = {
+        dataset_attr.user_tag: Role.USER.value,
+        dataset_attr.assistant_tag: Role.ASSISTANT.value,
+        dataset_attr.observation_tag: Role.OBSERVATION.value,
+        dataset_attr.function_tag: Role.FUNCTION.value,
+        dataset_attr.system_tag: Role.SYSTEM.value,
+    }
+    odd_tags = (dataset_attr.user_tag, dataset_attr.observation_tag)
+    even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag)
+    accept_tags = (odd_tags, even_tags)
+    for i, messages in enumerate(examples[dataset_attr.messages]):
+        if (
+            dataset_attr.system_tag
+            and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag
+        ):
+            system = messages[0][dataset_attr.content_tag]
+            messages = messages[1:]
+        else:
+            system = examples[dataset_attr.system][i] if dataset_attr.system else ""
+
+        messages = messages[: len(messages) // 2 * 2]  # should be multiples of 2
+        if len(messages) == 0:
+            continue
+
+        aligned_messages = []
+        for turn_idx, message in enumerate(messages):
+            if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]:
+                raise ValueError("Invalid role tag in {}.".format(messages))
+
+            aligned_messages.append(
+                {
+                    "role": tag_mapping[message[dataset_attr.role_tag]],
+                    "content": message[dataset_attr.content_tag],
+                }
+            )
+
+        outputs["prompt"].append(aligned_messages[:-1])
+        outputs["response"].append(aligned_messages[-1:])
+        outputs["system"].append(system)
+        outputs["tools"].append(
+            examples[dataset_attr.tools][i] if dataset_attr.tools else ""
+        )
+        print(examples[dataset_attr.images][i])
+        outputs["images"].append(
+            examples[dataset_attr.images][i] if dataset_attr.images else []
+        )
 
     return outputs
 
 
 def align_dataset(
-    dataset: Union["Dataset", "IterableDataset"], dataset_attr: "DatasetAttr", data_args: "DataArguments"
+    dataset: Union["Dataset", "IterableDataset"],
+    dataset_attr: "DatasetAttr",
+    data_args: "DataArguments",
 ) -> Union["Dataset", "IterableDataset"]:
     r"""
     Aligned dataset:
@@ -100,6 +181,8 @@ def align_dataset(
     """
     if dataset_attr.formatting == "alpaca":
         convert_func = partial(convert_alpaca, dataset_attr=dataset_attr)
+    elif dataset_attr.formatting == "llava":
+        convert_func = partial(convert_llava, dataset_attr=dataset_attr)
     else:
         convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr)
 
@@ -107,13 +190,20 @@ def align_dataset(
     features = Features.from_dict(
         {
             "prompt": [
-                {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}}
+                {
+                    "role": {"dtype": "string", "_type": "Value"},
+                    "content": {"dtype": "string", "_type": "Value"},
+                }
             ],
             "response": [
-                {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}}
+                {
+                    "role": {"dtype": "string", "_type": "Value"},
+                    "content": {"dtype": "string", "_type": "Value"},
+                }
             ],
             "system": {"dtype": "string", "_type": "Value"},
             "tools": {"dtype": "string", "_type": "Value"},
+            "images": {"feature": {"_type": "Image"}, "_type": "Sequence"},
         }
     )
     kwargs = {}
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 18665731..c373e196 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -1,6 +1,6 @@
 import inspect
 import os
-from typing import TYPE_CHECKING, Literal, Union
+from typing import TYPE_CHECKING, Literal, Union, Optional
 
 from datasets import load_dataset, load_from_disk
 
@@ -25,9 +25,9 @@ logger = get_logger(__name__)
 
 
 def load_single_dataset(
-        dataset_attr: "DatasetAttr",
-        model_args: "ModelArguments",
-        data_args: "DataArguments",
+    dataset_attr: "DatasetAttr",
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
 ) -> Union["Dataset", "IterableDataset"]:
     logger.info("Loading dataset {}...".format(dataset_attr))
     data_path, data_name, data_dir, data_files = None, None, None, None
@@ -78,14 +78,20 @@ def load_single_dataset(
                 split=data_args.split,
                 cache_dir=cache_dir,
                 token=model_args.ms_hub_token,
-                use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
+                use_streaming=(
+                    data_args.streaming and (dataset_attr.load_from != "file")
+                ),
             )
             if isinstance(dataset, MsDataset):
                 dataset = dataset.to_hf_dataset()
         except ImportError:
-            raise ImportError("Please install modelscope via `pip install modelscope -U`")
+            raise ImportError(
+                "Please install modelscope via `pip install modelscope -U`"
+            )
     else:
-        if "trust_remote_code" in inspect.signature(load_dataset).parameters:  # for datasets==2.16.0
+        if (
+            "trust_remote_code" in inspect.signature(load_dataset).parameters
+        ):  # for datasets==2.16.0
             kwargs = {"trust_remote_code": True}
         else:
             kwargs = {}
@@ -102,7 +108,9 @@ def load_single_dataset(
             **kwargs,
         )
 
-    if data_args.streaming and (dataset_attr.load_from == "file"):  # faster than specifying streaming=True
+    if data_args.streaming and (
+        dataset_attr.load_from == "file"
+    ):  # faster than specifying streaming=True
         dataset = dataset.to_iterable_dataset()  # TODO: add num shards parameter
 
     if data_args.max_samples is not None:  # truncate dataset
@@ -113,11 +121,12 @@ def load_single_dataset(
 
 
 def get_dataset(
-        tokenizer: "PreTrainedTokenizer",
-        model_args: "ModelArguments",
-        data_args: "DataArguments",
-        training_args: "Seq2SeqTrainingArguments",
-        stage: Literal["pt", "sft", "rm", "ppo"],
+    tokenizer: "PreTrainedTokenizer",
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    stage: Literal["pt", "sft", "rm", "ppo"],
+    processor: Optional["AutoProcessor"] = None,
 ) -> Union["Dataset", "IterableDataset"]:
     template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
     if data_args.train_on_prompt and template.efficient_eos:
@@ -126,9 +135,13 @@ def get_dataset(
     # Load tokenized dataset
     if data_args.tokenized_path is not None:
         if has_tokenized_data(data_args.tokenized_path):
-            logger.warning("Loading dataset from disk will ignore other data arguments.")
+            logger.warning(
+                "Loading dataset from disk will ignore other data arguments."
+            )
             dataset = load_from_disk(data_args.tokenized_path)
-            logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path))
+            logger.info(
+                "Loaded tokenized dataset from {}.".format(data_args.tokenized_path)
+            )
             if data_args.streaming:
                 dataset = dataset.to_iterable_dataset()
             return dataset
@@ -139,15 +152,21 @@ def get_dataset(
     with training_args.main_process_first(desc="load dataset"):
         all_datasets = []
         for dataset_attr in get_dataset_list(data_args):
-            if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True):
-                raise ValueError("The dataset is not applicable in the current training stage.")
+            if (stage == "rm" and dataset_attr.ranking is False) or (
+                stage != "rm" and dataset_attr.ranking is True
+            ):
+                raise ValueError(
+                    "The dataset is not applicable in the current training stage."
+                )
 
-            all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args))
+            all_datasets.append(
+                load_single_dataset(dataset_attr, model_args, data_args)
+            )
         dataset = merge_dataset(all_datasets, data_args, training_args)
 
     with training_args.main_process_first(desc="pre-process dataset"):
         preprocess_func, print_function = get_preprocess_and_print_func(
-            tokenizer, template, data_args, training_args, stage
+            tokenizer, template, data_args, training_args, stage, processor
         )
         column_names = list(next(iter(dataset)).keys())
         kwargs = {}
@@ -158,13 +177,21 @@ def get_dataset(
                 desc="Running tokenizer on dataset",
             )
 
-        dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs)
+        dataset = dataset.map(
+            preprocess_func, batched=True, remove_columns=column_names, **kwargs
+        )
 
         if data_args.tokenized_path is not None:
             if training_args.should_save:
                 dataset.save_to_disk(data_args.tokenized_path)
-                logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path))
-                logger.info("Please restart the training with `--tokenized_path {}`.".format(data_args.tokenized_path))
+                logger.info(
+                    "Tokenized dataset saved at {}.".format(data_args.tokenized_path)
+                )
+                logger.info(
+                    "Please restart the training with `--tokenized_path {}`.".format(
+                        data_args.tokenized_path
+                    )
+                )
 
             exit(0)
 
@@ -172,34 +199,8 @@ def get_dataset(
             try:
                 print_function(next(iter(dataset)))
             except StopIteration:
-                raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
+                raise RuntimeError(
+                    "Cannot find valid samples, check `data/README.md` for the data format."
+                )
 
         return dataset
-
-
-def get_mm_dataset(
-        processor: "AutoProcessor",
-        model_args: "ModelArguments",
-        data_args: "DataArguments",
-        training_args: "Seq2SeqTrainingArguments",
-        stage: Literal["pt", "sft", "rm", "ppo"],
-) -> Union["Dataset", "IterableDataset"]:
-    if data_args.tokenized_path is not None:
-        if has_tokenized_data(data_args.tokenized_path):
-            logger.warning("Loading dataset from disk will ignore other data arguments.")
-            dataset = load_from_disk(data_args.tokenized_path)
-            logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path))
-            if data_args.streaming:
-                dataset = dataset.to_iterable_dataset()
-            return dataset
-
-        if data_args.streaming:
-            raise ValueError("Turn off `streaming` when saving dataset to disk.")
-
-    with training_args.main_process_first(desc="load dataset"):
-        all_datasets = []
-        for dataset_attr in get_dataset_list(data_args):
-            all_datasets.append(load_dataset(dataset_attr.dataset_name)['train'])
-        dataset = merge_dataset(all_datasets, data_args, training_args)
-
-    return dataset
diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py
index b9c8782a..79d6ed4e 100644
--- a/src/llmtuner/data/parser.py
+++ b/src/llmtuner/data/parser.py
@@ -25,7 +25,7 @@ class DatasetAttr:
     subset: Optional[str] = None
     folder: Optional[str] = None
     ranking: bool = False
-    formatting: Literal["alpaca", "sharegpt"] = "alpaca"
+    formatting: Literal["alpaca", "sharegpt", "llava"] = "alpaca"
     """ columns """
     system: Optional[str] = None
     """ columns for the alpaca format """
@@ -44,11 +44,15 @@ class DatasetAttr:
     observation_tag: Optional[str] = "observation"
     function_tag: Optional[str] = "function_call"
     system_tag: Optional[str] = "system"
+    """ columns for the mllm format """
+    images: Optional[str] = None
 
     def __repr__(self) -> str:
         return self.dataset_name
 
-    def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None:
+    def set_attr(
+        self, key: str, obj: Dict[str, Any], default: Optional[Any] = None
+    ) -> None:
         setattr(self, key, obj.get(key, default))
 
 
@@ -67,12 +71,16 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
         except Exception as err:
             if len(dataset_names) != 0:
                 raise ValueError(
-                    "Cannot open {} due to {}.".format(os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err))
+                    "Cannot open {} due to {}.".format(
+                        os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err)
+                    )
                 )
             dataset_info = None
 
     if data_args.interleave_probs is not None:
-        data_args.interleave_probs = [float(prob.strip()) for prob in data_args.interleave_probs.split(",")]
+        data_args.interleave_probs = [
+            float(prob.strip()) for prob in data_args.interleave_probs.split(",")
+        ]
 
     dataset_list: List[DatasetAttr] = []
     for name in dataset_names:
@@ -90,31 +98,42 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
 
         if has_hf_url or has_ms_url:
             if (use_modelscope() and has_ms_url) or (not has_hf_url):
-                dataset_attr = DatasetAttr("ms_hub", dataset_name=dataset_info[name]["ms_hub_url"])
+                dataset_attr = DatasetAttr(
+                    "ms_hub", dataset_name=dataset_info[name]["ms_hub_url"]
+                )
             else:
-                dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
+                dataset_attr = DatasetAttr(
+                    "hf_hub", dataset_name=dataset_info[name]["hf_hub_url"]
+                )
         elif "script_url" in dataset_info[name]:
-            dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
+            dataset_attr = DatasetAttr(
+                "script", dataset_name=dataset_info[name]["script_url"]
+            )
         else:
-            dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"])
+            dataset_attr = DatasetAttr(
+                "file", dataset_name=dataset_info[name]["file_name"]
+            )
 
         dataset_attr.set_attr("file_sha1", dataset_info[name])
         dataset_attr.set_attr("subset", dataset_info[name])
         dataset_attr.set_attr("folder", dataset_info[name])
         dataset_attr.set_attr("ranking", dataset_info[name], default=False)
         dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca")
+        dataset_attr.set_attr("images", dataset_info[name], default="")
 
         if "columns" in dataset_info[name]:
             column_names = ["system"]
             if dataset_attr.formatting == "alpaca":
                 column_names.extend(["prompt", "query", "response", "history"])
+            elif dataset_attr.formatting == "llava":
+                column_names.extend(["messages", "images"])
             else:
                 column_names.extend(["messages", "tools"])
 
             for column_name in column_names:
                 dataset_attr.set_attr(column_name, dataset_info[name]["columns"])
 
-        if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]:
+        if dataset_attr.formatting != "alpaca" and "tags" in dataset_info[name]:
             tag_names = (
                 "role_tag",
                 "content_tag",
diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 8494ba7e..dc72483f 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -1,6 +1,6 @@
 from functools import partial
 from itertools import chain
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Tuple
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Tuple, Optional
 
 from ..extras.constants import IGNORE_INDEX
 from ..extras.logging import get_logger
@@ -9,7 +9,7 @@ from .utils import Role
 
 if TYPE_CHECKING:
     from transformers import Seq2SeqTrainingArguments
-    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils import PreTrainedTokenizer, AutoProcessor
 
     from ..hparams import DataArguments
     from .template import Template
@@ -19,19 +19,27 @@ logger = get_logger(__name__)
 
 
 def preprocess_pretrain_dataset(
-    examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
+    examples: Dict[str, List[Any]],
+    tokenizer: "PreTrainedTokenizer",
+    data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
-    text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
+    text_examples = [
+        messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]
+    ]
 
     if not data_args.packing:
         if data_args.template == "gemma":
             text_examples = [tokenizer.bos_token + example for example in text_examples]
 
-        result = tokenizer(text_examples, add_special_tokens=False, max_length=data_args.cutoff_len)
+        result = tokenizer(
+            text_examples, add_special_tokens=False, max_length=data_args.cutoff_len
+        )
     else:
         tokenized_examples = tokenizer(text_examples, add_special_tokens=False)
-        concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
+        concatenated_examples = {
+            k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()
+        }
         total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
         block_size = data_args.cutoff_len
         total_length = (total_length // block_size) * block_size
@@ -54,7 +62,11 @@ def preprocess_supervised_dataset(
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
-    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
+    model_inputs = {
+        "input_ids": [],
+        "attention_mask": [],
+        "labels": [],
+    }
 
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
@@ -75,7 +87,9 @@ def preprocess_supervised_dataset(
             if data_args.train_on_prompt:
                 source_mask = source_ids
             elif turn_idx != 0 and template.efficient_eos:
-                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
+                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (
+                    len(source_ids) - 1
+                )
             else:
                 source_mask = [IGNORE_INDEX] * len(source_ids)
 
@@ -114,7 +128,9 @@ def preprocess_packed_supervised_dataset(
             if data_args.train_on_prompt:
                 source_mask = source_ids
             elif len(input_ids) != 0 and template.efficient_eos:
-                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
+                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (
+                    len(source_ids) - 1
+                )
             else:
                 source_mask = [IGNORE_INDEX] * len(source_ids)
 
@@ -139,6 +155,64 @@ def preprocess_packed_supervised_dataset(
     return model_inputs
 
 
+def preprocess_multimodal_supervised_dataset(
+    examples: Dict[str, List[Any]],
+    processor: "AutoProcessor",
+    template: "Template",
+    data_args: "DataArguments",
+) -> Dict[str, List[List[int]]]:
+    # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
+    # for multiturn examples, we only mask the prompt part in each prompt-response pair.
+    tokenizer = processor.tokenizer
+    model_inputs = {
+        "input_ids": [],
+        "attention_mask": [],
+        "labels": [],
+        "pixel_values": [],
+    }
+
+    for i in range(len(examples["prompt"])):
+        if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
+            continue
+
+        messages = examples["prompt"][i] + examples["response"][i]
+        input_ids, labels = [], []
+        for turn_idx, (source_ids, target_ids) in enumerate(
+            template.encode_multiturn(
+                tokenizer,
+                messages,
+                examples["system"][i],
+                examples["tools"][i],
+                data_args.cutoff_len,
+                data_args.reserved_label_len,
+            )
+        ):
+            if data_args.train_on_prompt:
+                source_mask = source_ids
+            elif turn_idx != 0 and template.efficient_eos:
+                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (
+                    len(source_ids) - 1
+                )
+            else:
+                source_mask = [IGNORE_INDEX] * len(source_ids)
+
+            input_ids += source_ids + target_ids
+            labels += source_mask + target_ids
+
+        if template.efficient_eos:
+            input_ids += [tokenizer.eos_token_id]
+            labels += [tokenizer.eos_token_id]
+
+        model_inputs["input_ids"].append(input_ids)
+        model_inputs["attention_mask"].append([1] * len(input_ids))
+        model_inputs["labels"].append(labels)
+        pixel_values = processor.image_processor(
+            examples["images"][0], return_tensors="pt"
+        )["pixel_values"][0]
+        model_inputs["pixel_values"].append(pixel_values)
+    return model_inputs
+
+
 def preprocess_unsupervised_dataset(
     examples: Dict[str, List[Any]],
     tokenizer: "PreTrainedTokenizer",
@@ -155,7 +229,9 @@ def preprocess_unsupervised_dataset(
         if len(examples["response"][i]) == 1:
             messages = examples["prompt"][i] + examples["response"][i]
         else:
-            messages = examples["prompt"][i] + [{"role": Role.ASSISTANT.value, "content": ""}]
+            messages = examples["prompt"][i] + [
+                {"role": Role.ASSISTANT.value, "content": ""}
+            ]
 
         input_ids, labels = template.encode_oneturn(
             tokenizer,
@@ -218,29 +294,58 @@ def preprocess_pairwise_dataset(
     return model_inputs
 
 
-def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
+def print_supervised_dataset_example(
+    example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer"
+) -> None:
     print("input_ids:\n{}".format(example["input_ids"]))
-    print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
+    print(
+        "inputs:\n{}".format(
+            tokenizer.decode(example["input_ids"], skip_special_tokens=False)
+        )
+    )
     print("label_ids:\n{}".format(example["labels"]))
     print(
         "labels:\n{}".format(
-            tokenizer.decode(list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), skip_special_tokens=False)
+            tokenizer.decode(
+                list(filter(lambda x: x != IGNORE_INDEX, example["labels"])),
+                skip_special_tokens=False,
+            )
         )
     )
 
 
-def print_pairwise_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
+def print_pairwise_dataset_example(
+    example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer"
+) -> None:
     print("prompt_ids:\n{}".format(example["prompt_ids"]))
-    print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)))
+    print(
+        "prompt:\n{}".format(
+            tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)
+        )
+    )
     print("chosen_ids:\n{}".format(example["chosen_ids"]))
-    print("chosen:\n{}".format(tokenizer.decode(example["chosen_ids"], skip_special_tokens=False)))
+    print(
+        "chosen:\n{}".format(
+            tokenizer.decode(example["chosen_ids"], skip_special_tokens=False)
+        )
+    )
     print("rejected_ids:\n{}".format(example["rejected_ids"]))
-    print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)))
+    print(
+        "rejected:\n{}".format(
+            tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)
+        )
+    )
 
 
-def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
+def print_unsupervised_dataset_example(
+    example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer"
+) -> None:
     print("input_ids:\n{}".format(example["input_ids"]))
-    print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
+    print(
+        "inputs:\n{}".format(
+            tokenizer.decode(example["input_ids"], skip_special_tokens=False)
+        )
+    )
 
 
 def get_preprocess_and_print_func(
@@ -249,30 +354,56 @@ def get_preprocess_and_print_func(
     data_args: "DataArguments",
     training_args: "Seq2SeqTrainingArguments",
     stage: Literal["pt", "sft", "rm", "ppo"],
+    processor: Optional["AutoProcessor"] = None,
 ) -> Tuple[Callable, Callable]:
     if stage == "pt":
-        preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args)
-        print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
+        preprocess_func = partial(
+            preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args
+        )
+        print_function = partial(
+            print_unsupervised_dataset_example, tokenizer=tokenizer
+        )
     elif stage == "sft" and not training_args.predict_with_generate:
         if data_args.packing:
             preprocess_func = partial(
-                preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
+                preprocess_packed_supervised_dataset,
+                tokenizer=tokenizer,
+                template=template,
+                data_args=data_args,
+            )
+        elif processor is not None:
+            preprocess_func = partial(
+                preprocess_multimodal_supervised_dataset,
+                processor=processor,
+                template=template,
+                data_args=data_args,
             )
         else:
             preprocess_func = partial(
-                preprocess_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
+                preprocess_supervised_dataset,
+                tokenizer=tokenizer,
+                template=template,
+                data_args=data_args,
             )
 
         print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
     elif stage == "rm":
         preprocess_func = partial(
-            preprocess_pairwise_dataset, tokenizer=tokenizer, template=template, data_args=data_args
+            preprocess_pairwise_dataset,
+            tokenizer=tokenizer,
+            template=template,
+            data_args=data_args,
         )
         print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer)
     else:
         preprocess_func = partial(
-            preprocess_unsupervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
+            preprocess_unsupervised_dataset,
+            tokenizer=tokenizer,
+            template=template,
+            data_args=data_args,
+        )
+        print_function = partial(
+            print_unsupervised_dataset_example, tokenizer=tokenizer
         )
-        print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
 
-    return preprocess_func, print_function
\ No newline at end of file
+    return preprocess_func, print_function
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 73b22eb7..311660aa 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -42,7 +42,9 @@ class Template:
         r"""
         Returns a single pair of token ids representing prompt and response respectively.
         """
-        encoded_pairs = self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len)
+        encoded_pairs = self._encode(
+            tokenizer, messages, system, tools, cutoff_len, reserved_label_len
+        )
         prompt_ids = []
         for query_ids, resp_ids in encoded_pairs[:-1]:
             prompt_ids += query_ids + resp_ids
@@ -62,7 +64,9 @@ class Template:
         r"""
         Returns multiple pairs of token ids representing prompts and responses respectively.
         """
-        return self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len)
+        return self._encode(
+            tokenizer, messages, system, tools, cutoff_len, reserved_label_len
+        )
 
     def _encode(
         self,
@@ -89,7 +93,9 @@ class Template:
                 elements += self.format_separator.apply()
 
             if message["role"] == Role.USER.value:
-                elements += self.format_user.apply(content=message["content"], idx=str(i // 2))
+                elements += self.format_user.apply(
+                    content=message["content"], idx=str(i // 2)
+                )
             elif message["role"] == Role.ASSISTANT.value:
                 elements += self.format_assistant.apply(content=message["content"])
             elif message["role"] == Role.OBSERVATION.value:
@@ -104,7 +110,9 @@ class Template:
         return self._make_pairs(encoded_messages, cutoff_len, reserved_label_len)
 
     def _convert_elements_to_ids(
-        self, tokenizer: "PreTrainedTokenizer", elements: List[Union[str, Dict[str, str]]]
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        elements: List[Union[str, Dict[str, str]]],
     ) -> List[int]:
         r"""
         Converts elements to token ids.
@@ -122,7 +130,11 @@ class Template:
                 elif "eos_token" in elem and tokenizer.eos_token_id is not None:
                     token_ids += [tokenizer.eos_token_id]
             else:
-                raise ValueError("Input must be string, set[str] or dict[str, str], got {}".format(type(elem)))
+                raise ValueError(
+                    "Input must be string, set[str] or dict[str, str], got {}".format(
+                        type(elem)
+                    )
+                )
 
         return token_ids
 
@@ -180,7 +192,9 @@ class Llama2Template(Template):
                 elements += self.format_separator.apply()
 
             if message["role"] == Role.USER.value:
-                elements += self.format_user.apply(content=system_text + message["content"])
+                elements += self.format_user.apply(
+                    content=system_text + message["content"]
+                )
             elif message["role"] == Role.ASSISTANT.value:
                 elements += self.format_assistant.apply(content=message["content"])
             elif message["role"] == Role.OBSERVATION.value:
@@ -243,7 +257,9 @@ def _register_template(
     template_class = Llama2Template if name.startswith("llama2") else Template
     default_user_formatter = StringFormatter(slots=["{{content}}"])
     default_assistant_formatter = StringFormatter(slots=["{{content}}"] + eos_slots)
-    default_function_formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots)
+    default_function_formatter = FunctionFormatter(
+        slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots
+    )
     default_tool_formatter = ToolFormatter(tool_format="default")
     default_separator_formatter = EmptyFormatter()
     templates[name] = template_class(
@@ -279,7 +295,9 @@ def _jinja_escape(content: str) -> str:
     return content.replace("\n", r"\n").replace("'", r"\'")
 
 
-def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str:
+def _convert_slots_to_jinja(
+    slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content"
+) -> str:
     slot_items = []
     for slot in slots:
         if isinstance(slot, str):
@@ -293,7 +311,9 @@ def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", pl
         elif isinstance(slot, set):
             if "bos_token" in slot:
                 slot_items.append("'" + tokenizer.bos_token + "'")
-            elif "eos_token" in slot:  # do not use {{ eos_token }} since it may be replaced
+            elif (
+                "eos_token" in slot
+            ):  # do not use {{ eos_token }} since it may be replaced
                 slot_items.append("'" + tokenizer.eos_token + "'")
         elif isinstance(slot, dict):
             raise ValueError("Dict is not supported.")
@@ -305,25 +325,37 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer")
     jinja_template = ""
 
     if template.default_system:
-        jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}"
+        jinja_template += (
+            "{% set system_message = '"
+            + _jinja_escape(template.default_system)
+            + "' %}"
+        )
 
     jinja_template += (
-        "{% if messages[0]['role'] == 'system' %}" "{% set system_message = messages[0]['content'] %}" "{% endif %}"
+        "{% if messages[0]['role'] == 'system' %}"
+        "{% set system_message = messages[0]['content'] %}"
+        "{% endif %}"
     )
 
-    system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message")
+    system_message = _convert_slots_to_jinja(
+        template.format_system.apply(), tokenizer, placeholder="system_message"
+    )
     if isinstance(template, Llama2Template):
         pass
     elif template.force_system:
         jinja_template += "{{ " + system_message + " }}"
     else:
-        jinja_template += "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}"
+        jinja_template += (
+            "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}"
+        )
 
     jinja_template += "{% for message in messages %}"
     jinja_template += "{% set content = message['content'] %}"
     if isinstance(template, Llama2Template):
         jinja_template += "{% if loop.index0 == 0 and system_message is defined %}"
-        jinja_template += "{% set content = " + system_message + " + message['content'] %}"
+        jinja_template += (
+            "{% set content = " + system_message + " + message['content'] %}"
+        )
         jinja_template += "{% endif %}"
     jinja_template += "{% if message['role'] == 'user' %}"
     user_message = _convert_slots_to_jinja(template.format_user.apply(), tokenizer)
@@ -366,11 +398,14 @@ def get_template_and_fix_tokenizer(
 
     if stop_words:
         num_added_tokens = tokenizer.add_special_tokens(
-            dict(additional_special_tokens=stop_words), replace_additional_special_tokens=False
+            dict(additional_special_tokens=stop_words),
+            replace_additional_special_tokens=False,
         )
         logger.info("Add {} to stop words.".format(",".join(stop_words)))
         if num_added_tokens > 0:
-            logger.warning("New tokens have been added, make sure `resize_vocab` is True.")
+            logger.warning(
+                "New tokens have been added, make sure `resize_vocab` is True."
+            )
 
     try:
         tokenizer.chat_template = _get_jinja_template(template, tokenizer)
@@ -382,7 +417,9 @@ def get_template_and_fix_tokenizer(
 
 _register_template(
     name="alpaca",
-    format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
+    format_user=StringFormatter(
+        slots=["### Instruction:\n{{content}}\n\n### Response:\n"]
+    ),
     format_separator=EmptyFormatter(slots=["\n\n"]),
     default_system=(
         "Below is an instruction that describes a task. "
@@ -407,7 +444,13 @@ _register_template(
 _register_template(
     name="atom",
     format_user=StringFormatter(
-        slots=[{"bos_token"}, "Human: {{content}}\n", {"eos_token"}, {"bos_token"}, "Assistant:"]
+        slots=[
+            {"bos_token"},
+            "Human: {{content}}\n",
+            {"eos_token"},
+            {"bos_token"},
+            "Assistant:",
+        ]
     ),
     format_assistant=StringFormatter(slots=["{{content}}\n", {"eos_token"}]),
 )
@@ -415,7 +458,9 @@ _register_template(
 
 _register_template(
     name="baichuan",
-    format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
+    format_user=StringFormatter(
+        slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]
+    ),
     efficient_eos=True,
 )
 
@@ -438,7 +483,9 @@ _register_template(
 
 _register_template(
     name="bluelm",
-    format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
+    format_user=StringFormatter(
+        slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]
+    ),
 )
 
 
@@ -457,7 +504,9 @@ _register_template(
 _register_template(
     name="chatglm2",
     format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问：{{content}}\n\n答："]),
-    format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
+    format_system=StringFormatter(
+        slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]
+    ),
     format_separator=EmptyFormatter(slots=["\n\n"]),
     efficient_eos=True,
     force_system=True,
@@ -466,12 +515,21 @@ _register_template(
 
 _register_template(
     name="chatglm3",
-    format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
+    format_user=StringFormatter(
+        slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
+    ),
     format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
-    format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
+    format_system=StringFormatter(
+        slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]
+    ),
     format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]),
     format_observation=StringFormatter(
-        slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
+        slots=[
+            {"token": "<|observation|>"},
+            "\n",
+            "{{content}}",
+            {"token": "<|assistant|>"},
+        ]
     ),
     stop_words=["<|user|>", "<|observation|>"],
     efficient_eos=True,
@@ -481,14 +539,27 @@ _register_template(
 
 _register_template(
     name="chatglm3_system",
-    format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
+    format_user=StringFormatter(
+        slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
+    ),
     format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
     format_system=StringFormatter(
-        slots=[{"token": "[gMASK]"}, {"token": "sop"}, {"token": "<|system|>"}, "\n", "{{content}}"]
+        slots=[
+            {"token": "[gMASK]"},
+            {"token": "sop"},
+            {"token": "<|system|>"},
+            "\n",
+            "{{content}}",
+        ]
     ),
     format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]),
     format_observation=StringFormatter(
-        slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
+        slots=[
+            {"token": "<|observation|>"},
+            "\n",
+            "{{content}}",
+            {"token": "<|assistant|>"},
+        ]
     ),
     default_system=(
         "You are ChatGLM3, a large language model trained by Zhipu.AI. "
@@ -501,9 +572,15 @@ _register_template(
 
 _register_template(
     name="chatml",
-    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
-    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
-    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_user=StringFormatter(
+        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_system=StringFormatter(
+        slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]
+    ),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
     format_separator=EmptyFormatter(slots=["\n"]),
     stop_words=["<|im_end|>", "<|im_start|>"],
     replace_eos=True,
@@ -512,9 +589,15 @@ _register_template(
 
 _register_template(
     name="chatml_de",
-    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
-    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
-    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_user=StringFormatter(
+        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_system=StringFormatter(
+        slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]
+    ),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
     stop_words=["<|im_end|>", "<|im_start|>"],
@@ -524,7 +607,9 @@ _register_template(
 
 _register_template(
     name="codegeex2",
-    format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
+    format_system=StringFormatter(
+        slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]
+    ),
     force_system=True,
 )
 
@@ -554,9 +639,15 @@ _register_template(
 
 _register_template(
     name="dbrx",
-    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
-    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
-    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_user=StringFormatter(
+        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_system=StringFormatter(
+        slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]
+    ),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system=(
         "You are DBRX, created by Databricks. You were last updated in December 2023. "
@@ -634,7 +725,9 @@ _register_template(
 
 _register_template(
     name="gemma",
-    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
+    format_user=StringFormatter(
+        slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
+    ),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
     format_observation=StringFormatter(
         slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
@@ -647,7 +740,9 @@ _register_template(
 
 _register_template(
     name="intern",
-    format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": "<eoh>"}, "\n<|Bot|>:"]),
+    format_user=StringFormatter(
+        slots=["<|User|>:{{content}}", {"token": "<eoh>"}, "\n<|Bot|>:"]
+    ),
     format_separator=EmptyFormatter(slots=[{"token": "<eoa>"}, "\n"]),
     stop_words=["<eoa>"],
     efficient_eos=True,
@@ -656,8 +751,12 @@ _register_template(
 
 _register_template(
     name="intern2",
-    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
-    format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_user=StringFormatter(
+        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_system=StringFormatter(
+        slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]
+    ),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system=(
         "You are an AI assistant whose name is InternLM (书生·浦语).\n"
@@ -707,7 +806,10 @@ _register_template(
         ]
     ),
     format_system=StringFormatter(
-        slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]
+        slots=[
+            {"bos_token"},
+            "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>",
+        ]
     ),
     format_observation=StringFormatter(
         slots=[
@@ -742,7 +844,13 @@ _register_template(
 
 _register_template(
     name="openchat",
-    format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
+    format_user=StringFormatter(
+        slots=[
+            "GPT4 Correct User: {{content}}",
+            {"eos_token"},
+            "GPT4 Correct Assistant:",
+        ]
+    ),
     format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
     force_system=True,
@@ -751,7 +859,9 @@ _register_template(
 
 _register_template(
     name="orion",
-    format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
+    format_user=StringFormatter(
+        slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]
+    ),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
     force_system=True,
 )
@@ -759,9 +869,15 @@ _register_template(
 
 _register_template(
     name="phi",
-    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
-    format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]),
-    format_observation=StringFormatter(slots=["<|function_output|>\n{{content}}<|end|>\n<|assistant|>\n"]),
+    format_user=StringFormatter(
+        slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]
+    ),
+    format_system=StringFormatter(
+        slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]
+    ),
+    format_observation=StringFormatter(
+        slots=["<|function_output|>\n{{content}}<|end|>\n<|assistant|>\n"]
+    ),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system="You are a helpful AI assistant.",
     stop_words=["<|end|>"],
@@ -771,9 +887,15 @@ _register_template(
 
 _register_template(
     name="qwen",
-    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
-    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
-    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_user=StringFormatter(
+        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
+    format_system=StringFormatter(
+        slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]
+    ),
+    format_observation=StringFormatter(
+        slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system="You are a helpful assistant.",
     stop_words=["<|im_end|>"],
@@ -829,8 +951,12 @@ _register_template(
 
 _register_template(
     name="yayi",
-    format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
-    format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]),
+    format_user=StringFormatter(
+        slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]
+    ),
+    format_system=StringFormatter(
+        slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]
+    ),
     format_separator=EmptyFormatter(slots=["\n\n"]),
     default_system=(
         "You are a helpful, respectful and honest assistant named YaYi "
@@ -849,7 +975,9 @@ _register_template(
 
 _register_template(
     name="yi",
-    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_user=StringFormatter(
+        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
+    ),
     format_separator=EmptyFormatter(slots=["\n"]),
     stop_words=["<|im_end|>"],
     replace_eos=True,
@@ -867,7 +995,9 @@ _register_template(
 
 _register_template(
     name="zephyr",
-    format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]),
+    format_user=StringFormatter(
+        slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]
+    ),
     format_assistant=StringFormatter(slots=["\n{{content}}", {"eos_token"}]),
     format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
     default_system="You are a friendly chatbot who always responds in the style of a pirate",
@@ -879,3 +1009,13 @@ _register_template(
     format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
     format_separator=EmptyFormatter(slots=["\n"]),
 )
+
+_register_template(
+    name="llava",
+    format_user=StringFormatter(slots=["USER: {{content}} "]),
+    format_assistant=StringFormatter(slots=["ASSISTANT: {{content}}"]),
+    default_system=(
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions."
+    ),
+)
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index a6e4b710..63fc7f02 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -15,23 +15,33 @@ class ModelArguments:
     )
     adapter_name_or_path: Optional[str] = field(
         default=None,
-        metadata={"help": "Path to the adapter weight or identifier from huggingface.co/models."},
+        metadata={
+            "help": "Path to the adapter weight or identifier from huggingface.co/models."
+        },
     )
     cache_dir: Optional[str] = field(
         default=None,
-        metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."},
+        metadata={
+            "help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."
+        },
     )
     use_fast_tokenizer: bool = field(
         default=True,
-        metadata={"help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."},
+        metadata={
+            "help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."
+        },
     )
     resize_vocab: bool = field(
         default=False,
-        metadata={"help": "Whether or not to resize the tokenizer vocab and the embedding layers."},
+        metadata={
+            "help": "Whether or not to resize the tokenizer vocab and the embedding layers."
+        },
     )
     split_special_tokens: bool = field(
         default=False,
-        metadata={"help": "Whether or not the special tokens should be split during the tokenization process."},
+        metadata={
+            "help": "Whether or not the special tokens should be split during the tokenization process."
+        },
     )
     new_special_tokens: Optional[str] = field(
         default=None,
@@ -39,7 +49,9 @@ class ModelArguments:
     )
     model_revision: str = field(
         default="main",
-        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+        metadata={
+            "help": "The specific model version to use (can be a branch name, tag name or commit id)."
+        },
     )
     low_cpu_mem_usage: bool = field(
         default=True,
@@ -47,7 +59,9 @@ class ModelArguments:
     )
     quantization_bit: Optional[int] = field(
         default=None,
-        metadata={"help": "The number of bits to quantize the model using bitsandbytes."},
+        metadata={
+            "help": "The number of bits to quantize the model using bitsandbytes."
+        },
     )
     quantization_type: Literal["fp4", "nf4"] = field(
         default="nf4",
@@ -55,15 +69,21 @@ class ModelArguments:
     )
     double_quantization: bool = field(
         default=True,
-        metadata={"help": "Whether or not to use double quantization in int4 training."},
+        metadata={
+            "help": "Whether or not to use double quantization in int4 training."
+        },
     )
     quantization_device_map: Optional[Literal["auto"]] = field(
         default=None,
-        metadata={"help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0."},
+        metadata={
+            "help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0."
+        },
     )
     rope_scaling: Optional[Literal["linear", "dynamic"]] = field(
         default=None,
-        metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."},
+        metadata={
+            "help": "Which scaling strategy should be adopted for the RoPE embeddings."
+        },
     )
     flash_attn: Literal["off", "sdpa", "fa2", "auto"] = field(
         default="auto",
@@ -71,19 +91,27 @@ class ModelArguments:
     )
     shift_attn: bool = field(
         default=False,
-        metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."},
+        metadata={
+            "help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."
+        },
     )
     mixture_of_depths: Optional[Literal["convert", "load"]] = field(
         default=None,
-        metadata={"help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."},
+        metadata={
+            "help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."
+        },
     )
     use_unsloth: bool = field(
         default=False,
-        metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."},
+        metadata={
+            "help": "Whether or not to use unsloth's optimization for the LoRA training."
+        },
     )
     moe_aux_loss_coef: Optional[float] = field(
         default=None,
-        metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."},
+        metadata={
+            "help": "Coefficient of the auxiliary router loss in mixture-of-experts model."
+        },
     )
     disable_gradient_checkpointing: bool = field(
         default=False,
@@ -107,7 +135,9 @@ class ModelArguments:
     )
     vllm_gpu_util: float = field(
         default=0.9,
-        metadata={"help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine."},
+        metadata={
+            "help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine."
+        },
     )
     vllm_enforce_eager: bool = field(
         default=False,
@@ -147,7 +177,9 @@ class ModelArguments:
     )
     export_quantization_dataset: Optional[str] = field(
         default=None,
-        metadata={"help": "Path to the dataset or dataset name to use in quantizing the exported model."},
+        metadata={
+            "help": "Path to the dataset or dataset name to use in quantizing the exported model."
+        },
     )
     export_quantization_nsamples: int = field(
         default=128,
@@ -155,19 +187,27 @@ class ModelArguments:
     )
     export_quantization_maxlen: int = field(
         default=1024,
-        metadata={"help": "The maximum length of the model inputs used for quantization."},
+        metadata={
+            "help": "The maximum length of the model inputs used for quantization."
+        },
     )
     export_legacy_format: bool = field(
         default=False,
-        metadata={"help": "Whether or not to save the `.bin` files instead of `.safetensors`."},
+        metadata={
+            "help": "Whether or not to save the `.bin` files instead of `.safetensors`."
+        },
     )
     export_hub_model_id: Optional[str] = field(
         default=None,
-        metadata={"help": "The name of the repository if push the model to the Hugging Face hub."},
+        metadata={
+            "help": "The name of the repository if push the model to the Hugging Face hub."
+        },
     )
     print_param_status: bool = field(
         default=False,
-        metadata={"help": "For debugging purposes, print the status of the parameters in the model."},
+        metadata={
+            "help": "For debugging purposes, print the status of the parameters in the model."
+        },
     )
     use_mllm: bool = field(
         default=False,
@@ -180,18 +220,39 @@ class ModelArguments:
         self.model_max_length = None
 
         if self.split_special_tokens and self.use_fast_tokenizer:
-            raise ValueError("`split_special_tokens` is only supported for slow tokenizers.")
+            raise ValueError(
+                "`split_special_tokens` is only supported for slow tokenizers."
+            )
 
-        if self.adapter_name_or_path is not None:  # support merging multiple lora weights
-            self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")]
+        if (
+            self.adapter_name_or_path is not None
+        ):  # support merging multiple lora weights
+            self.adapter_name_or_path = [
+                path.strip() for path in self.adapter_name_or_path.split(",")
+            ]
 
         if self.new_special_tokens is not None:  # support multiple special tokens
-            self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")]
+            self.new_special_tokens = [
+                token.strip() for token in self.new_special_tokens.split(",")
+            ]
 
-        assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
-        assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization."
+        assert self.quantization_bit in [
+            None,
+            8,
+            4,
+        ], "We only accept 4-bit or 8-bit quantization."
+        assert self.export_quantization_bit in [
+            None,
+            8,
+            4,
+            3,
+            2,
+        ], "We only accept 2/3/4/8-bit quantization."
 
-        if self.export_quantization_bit is not None and self.export_quantization_dataset is None:
+        if (
+            self.export_quantization_bit is not None
+            and self.export_quantization_dataset is None
+        ):
             raise ValueError("Quantization dataset is necessary for exporting.")
 
     def to_dict(self) -> Dict[str, Any]:
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index bcefee92..e65798b7 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -11,7 +11,7 @@ from .utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model
 
 
 if TYPE_CHECKING:
-    from transformers import PretrainedConfig, PreTrainedModel, AutoModelForVision2Seq
+    from transformers import PretrainedConfig, PreTrainedModel
 
     from ..hparams import FinetuningArguments, ModelArguments
 
@@ -21,11 +21,11 @@ logger = get_logger(__name__)
 
 def init_adapter(
     config: "PretrainedConfig",
-    model: Union["PreTrainedModel","AutoModelForVision2Seq"],
+    model: Union["PreTrainedModel"],
     model_args: "ModelArguments",
     finetuning_args: "FinetuningArguments",
     is_trainable: bool,
-) -> Union["PreTrainedModel","AutoModelForVision2Seq"]:
+) -> Union["PreTrainedModel"]:
     r"""
     Initializes the adapters.
 
@@ -38,7 +38,9 @@ def init_adapter(
         logger.info("Adapter is not found at evaluation, load the base model.")
         return model
 
-    if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None):
+    if finetuning_args.finetuning_type != "lora" and getattr(
+        model, "quantization_method", None
+    ):
         raise ValueError("You can only use lora for quantized models.")
 
     if finetuning_args.finetuning_type == "full" and is_trainable:
@@ -49,9 +51,9 @@ def init_adapter(
     if finetuning_args.finetuning_type == "freeze" and is_trainable:
         logger.info("Fine-tuning method: Freeze")
         num_layers = (
-                getattr(model.config, "num_hidden_layers", None)
-                or getattr(model.config, "num_layers", None)
-                or getattr(model.config, "n_layer", None)
+            getattr(model.config, "num_hidden_layers", None)
+            or getattr(model.config, "num_layers", None)
+            or getattr(model.config, "n_layer", None)
         )
         if not num_layers:
             raise ValueError("Current model does not support freeze tuning.")
@@ -66,8 +68,12 @@ def init_adapter(
 
             stride = num_layers // finetuning_args.num_layer_trainable
             trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride)
-        elif finetuning_args.num_layer_trainable > 0:  # fine-tuning the last n layers if num_layer_trainable > 0
-            trainable_layer_ids = range(num_layers - finetuning_args.num_layer_trainable, num_layers)
+        elif (
+            finetuning_args.num_layer_trainable > 0
+        ):  # fine-tuning the last n layers if num_layer_trainable > 0
+            trainable_layer_ids = range(
+                num_layers - finetuning_args.num_layer_trainable, num_layers
+            )
         else:  # fine-tuning the first n layers if num_layer_trainable < 0
             trainable_layer_ids = range(-finetuning_args.num_layer_trainable)
 
@@ -82,11 +88,15 @@ def init_adapter(
         for module_name in finetuning_args.name_module_trainable:
             if module_name not in freeze_modules:
                 raise ValueError(
-                    "Module {} is not found, please choose from {}".format(module_name, ", ".join(freeze_modules))
+                    "Module {} is not found, please choose from {}".format(
+                        module_name, ", ".join(freeze_modules)
+                    )
                 )
 
             for idx in trainable_layer_ids:
-                trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else ""))
+                trainable_layers.append(
+                    ".{:d}.{}".format(idx, module_name if module_name != "all" else "")
+                )
 
         for name, param in model.named_parameters():
             if any(trainable_layer in name for trainable_layer in trainable_layers):
@@ -95,27 +105,43 @@ def init_adapter(
             else:
                 param.requires_grad_(False)
 
-        logger.info("Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids))))
+        logger.info(
+            "Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids)))
+        )
 
     if finetuning_args.finetuning_type == "lora":
-        logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA"))
+        logger.info(
+            "Fine-tuning method: {}".format(
+                "DoRA" if finetuning_args.use_dora else "LoRA"
+            )
+        )
         adapter_to_resume = None
 
         if model_args.adapter_name_or_path is not None:
             is_mergeable = True
-            if getattr(model, "quantization_method", None):  # merge lora in quantized model is unstable
-                assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter."
+            if getattr(
+                model, "quantization_method", None
+            ):  # merge lora in quantized model is unstable
+                assert (
+                    len(model_args.adapter_name_or_path) == 1
+                ), "Quantized model only accepts a single adapter."
                 is_mergeable = False
 
             if is_deepspeed_zero3_enabled():
-                assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3."
+                assert (
+                    len(model_args.adapter_name_or_path) == 1
+                ), "Cannot use multiple adapters in DeepSpeed ZeRO-3."
                 is_mergeable = False
 
             if model_args.use_unsloth:
-                assert len(model_args.adapter_name_or_path) == 1, "Unsloth model only accepts a single adapter."
+                assert (
+                    len(model_args.adapter_name_or_path) == 1
+                ), "Unsloth model only accepts a single adapter."
                 is_mergeable = False
 
-            if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable):
+            if (is_trainable and not finetuning_args.create_new_adapter) or (
+                not is_mergeable
+            ):
                 adapter_to_merge = model_args.adapter_name_or_path[:-1]
                 adapter_to_resume = model_args.adapter_name_or_path[-1]
             else:
@@ -132,7 +158,9 @@ def init_adapter(
 
             if adapter_to_resume is not None:  # resume lora training
                 if model_args.use_unsloth:
-                    model = load_unsloth_peft_model(config, model_args, is_trainable=is_trainable)
+                    model = load_unsloth_peft_model(
+                        config, model_args, is_trainable=is_trainable
+                    )
                 else:
                     model = PeftModel.from_pretrained(
                         model,
@@ -141,19 +169,27 @@ def init_adapter(
                         offload_folder=model_args.offload_folder,
                     )
 
-        if is_trainable and adapter_to_resume is None:  # create new lora weights while training
-            if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all":
+        if (
+            is_trainable and adapter_to_resume is None
+        ):  # create new lora weights while training
+            if (
+                len(finetuning_args.lora_target) == 1
+                and finetuning_args.lora_target[0] == "all"
+            ):
                 target_modules = find_all_linear_modules(model)
             else:
                 target_modules = finetuning_args.lora_target
 
             if finetuning_args.use_llama_pro:
-                target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable)
+                target_modules = find_expanded_modules(
+                    model, target_modules, finetuning_args.num_layer_trainable
+                )
 
             if (
-                    finetuning_args.use_dora
-                    and getattr(model, "quantization_method", None) is not None
-                    and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
+                finetuning_args.use_dora
+                and getattr(model, "quantization_method", None) is not None
+                and getattr(model, "quantization_method", None)
+                != QuantizationMethod.BITS_AND_BYTES
             ):
                 raise ValueError("DoRA is not compatible with PTQ-quantized models.")
 
@@ -166,7 +202,11 @@ def init_adapter(
                         module_names.add(name.split(".")[-1])
 
                 finetuning_args.additional_target = module_names
-                logger.warning("Vocab has been resized, add {} to trainable params.".format(",".join(module_names)))
+                logger.warning(
+                    "Vocab has been resized, add {} to trainable params.".format(
+                        ",".join(module_names)
+                    )
+                )
 
             peft_kwargs = {
                 "r": finetuning_args.lora_rank,
@@ -193,6 +233,10 @@ def init_adapter(
                 param.data = param.data.to(torch.float32)
 
         if model_args.adapter_name_or_path is not None:
-            logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
+            logger.info(
+                "Loaded adapter(s): {}".format(
+                    ",".join(model_args.adapter_name_or_path)
+                )
+            )
 
-    return model
\ No newline at end of file
+    return model
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 3712a592..18b0cf79 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -1,6 +1,12 @@
 from typing import TYPE_CHECKING, Any, Dict, Union
 
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModelForVision2Seq
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    AutoProcessor,
+    AutoModelForVision2Seq,
+)
 from trl import AutoModelForCausalLMWithValueHead
 
 from ..extras.logging import get_logger
@@ -62,10 +68,14 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
             dict(additional_special_tokens=model_args.new_special_tokens),
             replace_additional_special_tokens=False,
         )
-        logger.info("Add {} to special tokens.".format(",".join(model_args.new_special_tokens)))
+        logger.info(
+            "Add {} to special tokens.".format(",".join(model_args.new_special_tokens))
+        )
         if num_added_tokens > 0 and not model_args.resize_vocab:
             model_args.resize_vocab = True
-            logger.warning("New tokens have been added, changed `resize_vocab` to True.")
+            logger.warning(
+                "New tokens have been added, changed `resize_vocab` to True."
+            )
 
     patch_tokenizer(tokenizer)
     return tokenizer
@@ -111,7 +121,7 @@ def load_model(
     finetuning_args: "FinetuningArguments",
     is_trainable: bool = False,
     add_valuehead: bool = False,
-) -> Union["PreTrainedModel", "AutoModelForVision2Seq"]:
+) -> Union["PreTrainedModel"]:
     r"""
     Loads pretrained model.
     """
@@ -170,8 +180,10 @@ def load_model(
 
     trainable_params, all_param = count_parameters(model)
     if is_trainable:
-        param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
-            trainable_params, all_param, 100 * trainable_params / all_param
+        param_stats = (
+            "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
+                trainable_params, all_param, 100 * trainable_params / all_param
+            )
         )
     else:
         param_stats = "all params: {:d}".format(all_param)
@@ -185,4 +197,4 @@ def load_model(
                 )
             )
 
-    return model
\ No newline at end of file
+    return model
diff --git a/src/llmtuner/train/sftmm/collator.py b/src/llmtuner/train/sftmm/collator.py
index 95dbd939..2931dd9c 100644
--- a/src/llmtuner/train/sftmm/collator.py
+++ b/src/llmtuner/train/sftmm/collator.py
@@ -19,7 +19,9 @@ class DataCollatorForVis2Seq:
             texts.append(text)
             images.append(example["images"][0])
 
-        batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True)
+        batch = self.processor(
+            text=texts, images=images, return_tensors="pt", padding=True
+        )
 
         labels = batch["input_ids"].clone()
         if self.processor.tokenizer.pad_token_id is not None:
@@ -27,3 +29,14 @@ class DataCollatorForVis2Seq:
         batch["labels"] = labels
 
         return batch
+
+
+@dataclass
+class DataCollatorForMLLM:
+    processor: AutoProcessor
+
+    def __call__(self, examples):
+        print(examples[0].keys())
+        print(examples[0]["input_ids"])
+        batch = {}
+        return batch
diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py
index 7afd8f6f..3849a563 100644
--- a/src/llmtuner/train/sftmm/workflow.py
+++ b/src/llmtuner/train/sftmm/workflow.py
@@ -1,47 +1,66 @@
 # Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py
 import os
 from typing import TYPE_CHECKING, List, Optional
-from ...data import split_dataset, get_mm_dataset
+from ...data import get_dataset
 from ...extras.misc import get_logits_processor
 from ...extras.ploting import plot_loss
-from ...model import load_tokenizer, load_processor, load_model
+from ...model import load_processor, load_model
 from ..utils import create_modelcard_and_push
 from .metric import ComputeMetrics
 from .trainer import CustomSeq2SeqTrainer
-from .collator import DataCollatorForVis2Seq
+from transformers import DataCollatorForSeq2Seq
+from ...extras.constants import IGNORE_INDEX
 
 if TYPE_CHECKING:
     from transformers import Seq2SeqTrainingArguments, TrainerCallback
 
-    from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+    from ...hparams import (
+        DataArguments,
+        FinetuningArguments,
+        GeneratingArguments,
+        ModelArguments,
+    )
 
 
 def run_sft_mm(
-        model_args: "ModelArguments",
-        data_args: "DataArguments",
-        training_args: "Seq2SeqTrainingArguments",
-        finetuning_args: "FinetuningArguments",
-        generating_args: "GeneratingArguments",
-        callbacks: Optional[List["TrainerCallback"]] = None,
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    generating_args: "GeneratingArguments",
+    callbacks: Optional[List["TrainerCallback"]] = None,
 ):
     processor = load_processor(model_args)
-    tokenizer = load_tokenizer(model_args)
-    CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""
-    tokenizer.chat_template = CHAT_TEMPLATE
-    processor.tokenizer = tokenizer
-    model = load_model(processor.tokenizer, model_args, finetuning_args, training_args.do_train)
-    dataset = get_mm_dataset(processor, model_args, data_args, training_args, stage="sft")
+    tokenizer = processor.tokenizer
+    dataset = get_dataset(
+        tokenizer, model_args, data_args, training_args, "sft", processor
+    )
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
     if getattr(model, "is_quantized", False) and not training_args.do_train:
-        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
+        setattr(
+            model, "_hf_peft_config_loaded", True
+        )  # hack here: make model compatible with prediction
     train_dataset = dataset
     eval_dataset = dataset
-    data_collator = DataCollatorForVis2Seq(
-        processor=processor,
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer=tokenizer,
+        pad_to_multiple_of=(
+            8 if tokenizer.padding_side == "right" else None
+        ),  # for shift short attention
+        label_pad_token_id=(
+            IGNORE_INDEX
+            if data_args.ignore_pad_token_for_loss
+            else tokenizer.pad_token_id
+        ),
     )
 
     # Override the decoding parameters of Seq2SeqTrainer
-    training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
-    training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
+    training_args.generation_max_length = (
+        training_args.generation_max_length or data_args.cutoff_len
+    )
+    training_args.generation_num_beams = (
+        data_args.eval_num_beams or training_args.generation_num_beams
+    )
     training_args.remove_unused_columns = False
 
     # Initialize our Trainer
@@ -52,19 +71,26 @@ def run_sft_mm(
         tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
-        compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None,
+        compute_metrics=(
+            ComputeMetrics(tokenizer) if training_args.predict_with_generate else None
+        ),
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
     )
+
     # Keyword arguments for `model.generate`
     gen_kwargs = generating_args.to_dict()
-    gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids
+    gen_kwargs["eos_token_id"] = [
+        tokenizer.eos_token_id
+    ] + tokenizer.additional_special_tokens_ids
     gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
     gen_kwargs["logits_processor"] = get_logits_processor()
 
     # Training
     if training_args.do_train:
-        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        train_result = trainer.train(
+            resume_from_checkpoint=training_args.resume_from_checkpoint
+        )
         trainer.save_model()
         trainer.log_metrics("train", train_result.metrics)
         trainer.save_metrics("train", train_result.metrics)
@@ -75,19 +101,27 @@ def run_sft_mm(
     # Evaluation
     if training_args.do_eval:
         metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
-        if training_args.predict_with_generate:  # eval_loss will be wrong if predict_with_generate is enabled
+        if (
+            training_args.predict_with_generate
+        ):  # eval_loss will be wrong if predict_with_generate is enabled
             metrics.pop("eval_loss", None)
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
 
     # Predict
     if training_args.do_predict:
-        predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs)
-        if training_args.predict_with_generate:  # predict_loss will be wrong if predict_with_generate is enabled
+        predict_results = trainer.predict(
+            dataset, metric_key_prefix="predict", **gen_kwargs
+        )
+        if (
+            training_args.predict_with_generate
+        ):  # predict_loss will be wrong if predict_with_generate is enabled
             predict_results.metrics.pop("predict_loss", None)
         trainer.log_metrics("predict", predict_results.metrics)
         trainer.save_metrics("predict", predict_results.metrics)
         trainer.save_predictions(predict_results)
 
     # Create model card
-    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
+    create_modelcard_and_push(
+        trainer, model_args, data_args, training_args, finetuning_args
+    )

From 6be321b5dae59435486e12904ff3fd3e79e00632 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 25 Apr 2024 19:56:49 +0800
Subject: [PATCH 157/341] fix #3374

Former-commit-id: 0097d7968b3b570e1705caff26f42d9ed71ad974
---
 src/llmtuner/data/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py
index 83ee0610..dc189609 100644
--- a/src/llmtuner/data/utils.py
+++ b/src/llmtuner/data/utils.py
@@ -78,9 +78,9 @@ def split_dataset(
     if training_args.do_train:
         if data_args.val_size > 1e-6:  # Split the dataset
             if data_args.streaming:
+                dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
                 val_set = dataset.take(int(data_args.val_size))
                 train_set = dataset.skip(int(data_args.val_size))
-                dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
                 return {"train_dataset": train_set, "eval_dataset": val_set}
             else:
                 val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size

From d1d08d066a7245e56d1d25c7dda5600b2d0c4e6b Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 19:58:47 +0800
Subject: [PATCH 158/341] merge data part to the text stream

Former-commit-id: 80537d580119d9d5a06ab236a5284aaae2f83b5b
---
 data/mllm_example_dataset/README.md           |  25 ------------------
 data/mllm_example_dataset/data/test-0.parquet | Bin 4580 -> 0 bytes
 .../mllm_example_dataset/data/train-0.parquet | Bin 4580 -> 0 bytes
 scripts/test_mllm.py                          |  24 +++++++++++------
 src/llmtuner/data/template.py                 |   4 +--
 5 files changed, 18 insertions(+), 35 deletions(-)
 delete mode 100644 data/mllm_example_dataset/README.md
 delete mode 100644 data/mllm_example_dataset/data/test-0.parquet
 delete mode 100644 data/mllm_example_dataset/data/train-0.parquet

diff --git a/data/mllm_example_dataset/README.md b/data/mllm_example_dataset/README.md
deleted file mode 100644
index d5c8c0e6..00000000
--- a/data/mllm_example_dataset/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
----
-dataset_info:
-  features:
-  - name: messages
-    list:
-    - name: content
-      list:
-      - name: index
-        dtype: int64
-      - name: text
-        dtype: string
-      - name: type
-        dtype: string
-    - name: role
-      dtype: string
-  - name: images
-    sequence: image
-configs:
-- config_name: default
-  data_files:
-  - split: train
-    path: data/train-*
-  - split: test
-    path: data/test-*
----
\ No newline at end of file
diff --git a/data/mllm_example_dataset/data/test-0.parquet b/data/mllm_example_dataset/data/test-0.parquet
deleted file mode 100644
index 42c20b192497168523c3d39447cdae4495085b84..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4580
zcmdTIO>84qc_wRnlaN&LHsc*{qws2Fr_nZRJ5HP?tA%tXA^uq>&A)#Tg6ElM&t%6l
zW@a46Sr)C3s!AL=RTT%s38AX0RKx*L#Q`A>sHgT&sS*OM5OAnaFCYOb-+N;_KV-Ko
zDiw^9H}8G_|8J5_d3#m}2xG!{K^PFYD;zgC!F3;j6XHT@MwkS2NC-`cgFVd2E+S4}
z00p|Mr68=q5`I~@OgC>ge)D7d89u~G5BUip5KVKTvFOoYgSalP6E|QQu6LQ3q(Ln2
zvT9o%yw3oGhNY1sVIVRYd6$oTz)OqL;FFjXodG{NVSs~W3|<@u=Z8bWuLlc)$UOfj
z@aNnpz>B(#utSiilg{!C0Cr*X902ZMiy&-iDB}?C_%22@$8I16cZg%5^_FF*dVHJ-
zz#a-K6G*cStG+xvnhjRwRdBBAU=JY3ws>P6xwfhj2h|K>YQv*1Yle$-vMhmsEP2jF
zySm-LF32@a{>IujI8F>Ma-E^YaJf)-?3Sf1CxBETDsH(1>HNlsYZ}_<gQO0E?plPn
z$g@%Sy9E-z285d?!9b@GS9Of)v*X_bDx(3hp9SQKZde4c*!3LS!YvPFS_aBu@Ce)d
zD}#08kk_U#a+`#JlhNS!$G}A$^B*iCtg3dS3Hj5Bw)|1>kMpPvX5(fPocD;ve`lr&
zf>9H9OdD%_PE-v{v(5kZ4?8-sj&-|*W*1Pya9zW;kYS;G3*wwgrcyJIgO-b`1T@gN
zK}H~4j_aY7(Vyz7acGzZwvb(ejM%Lvnxouq6RX!(#^;h~9e4QkvAH<P68^nahqz7K
za*e79LTm?w@i(S!+D_H*kliHDz8`?JYz?h>u4)iVC5Rj~iDS@$#AeTYHxA`<TlnHh
zS1g34#iHIpEPIeyW@I-(VxpfOno*h`KmF)Tj(Vj*;*g5TsguUxhi?Dn<G+3UkIx5!
zW2xQ4SAW8B((j&{5JsN7eF7di@>usg7>WF8Sb8`j{q@qsr)&I$Efy1`0_{8)E#Vgf
z;2<@Gy2y$TqA3YCPDPWOW|oRUh6b{yv;BC`rk%XZ2bDqT!z=$`6go}9qVNMg@+L1m
zxFWqBo>(}i^g=&w2=SgD!Y1_ty3QGbH-@@(J#2a43phIi)5M>bR4gSDhx#Ny9)9r>
zticbTqTYxBKKR37>D!UMH`$9vV!*LYYPcBr5g+*(FTE;A?~F6U&uh5Qc$j+yFpnmI
z&cAEI<P41Tu9+bKjwY!&A&#w=IOz@X#G(*O#n}-Ztad!&f<d9k*CI9P_gAG~U4?4m
zF4X=_p-=#IRQvL781!h^!wFw%Vm5xztcPz29GBV<IOtY5f{93&I%Og^9X`#&h{Q7>
zCI&>0-9F_=VBpdeC;jr7$NLg!VoztTyg3m0)`0Z-HR%^o8rw4p9;x#pm!p3lOLJn#
zRdv$9LV!yTi2c06JsT?A22;}kk=<}`83Dd``T5knEAW$uLvV{-AAoM5mm?>Pz@GKb
zb*@0~@h$+0{tSQ?Qx^c5yZQYjRRLd`pZTt28o92ZNGLiHHUD>R_y4a!`B@LN|MJNB
z6z9ih-@cKbKOG54gOS&+z{gy4M*N)Ks@LJ(u3?pkZw=gw8CK1X-9q<bHG;hsd{?kT
z?^6~IU%nUXe!}_()}4$<^Pm@D2Qs{c`PdAKeYFSK$7YMf?gNP4kY}mkvkVM)6uB?*
zUaT+t(d75P0`h75|GV-TgFmVnOV?$#@0*^Z-;vDpK4^!0y+yyGV)x<U#1VOW`=#P7
ze;|sl4=3oeR9}^4c?*8&b4QlvWu-2!$#Pv@sWX_OoWZv#@JoGsrVVI%hKEUE_?#@a
zWkA5234ymvg~H>)uVQOO&BDdBRcWjpRJP<}%1b)=X0=f{%pKTu*f%Q0wWL<cn`$Sw
zkjuhHV{!jjO|4m#(z@}Ip;Uq9pqff2E5&q@@)YM>17zN<l<E}!Lawo$Bqhb9xLm_*
zX{+UA&S=+j=F$n=tmxJ3zL~R<Zf+%?tdw*n3uNEbvSz!o1G-o8$6A)!rW%D6*(l9z
zr`60J=#Y*5#+s`Y_l(*O$hCU#bGogKqzd{gogCFsJ0-VRE*{NRcJfQLeWt%tO&u|N
zpICosR-?Az;;ddt?H`|yVfTDmcfMMLSc2~JKJBIUkF?@`tDFMAO0!F~EqR4z&ZCQ;
z<Z|LLk${7;s<5Q16hjN4TMB!02X!RN8Ph8lHB*JM*x7s`kx1@7pGYiHx{VAaVv@V{
zL+SlW^7$z~V?utDbSD$aKiB79W28(;my1zRdz5y$v~9EFomJ5HyxOyu7Bf6$y7*hV
z^GmaAfv(az99MIc{)p<M<-FBq_OK%M{b#JqepcU#S1D~A_xuF@dCI@k=T9<yr}&xI
zEOLJQm@BxnZB`jjwrOkTH(_D=_ZZaI=Mzw)nGxCiVKu|dllLfn@)#Bt-*wh+3hT#{
z{`B=_DStV&)Y3BhElpMyfhj@y`jRZ4c}nE_OJ{`N$&bm8OZgZ2`X84c)_+u{ym3HP
z@6iQLin7h1r|OBFnO3lMUSmg^?=9bd2m<&Cp)Et!Wtu!6>C-8)&Nj5N{w3etSwT_2
z-=?Jqabcv4{3O!y7dR0$u><4OyQwytH?iZ`ZFEQ+_UH0!I-ZQDq9%Opo%`Wl8HT_5
I;r~1T1e%nlz5oCK

diff --git a/data/mllm_example_dataset/data/train-0.parquet b/data/mllm_example_dataset/data/train-0.parquet
deleted file mode 100644
index 42c20b192497168523c3d39447cdae4495085b84..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4580
zcmdTIO>84qc_wRnlaN&LHsc*{qws2Fr_nZRJ5HP?tA%tXA^uq>&A)#Tg6ElM&t%6l
zW@a46Sr)C3s!AL=RTT%s38AX0RKx*L#Q`A>sHgT&sS*OM5OAnaFCYOb-+N;_KV-Ko
zDiw^9H}8G_|8J5_d3#m}2xG!{K^PFYD;zgC!F3;j6XHT@MwkS2NC-`cgFVd2E+S4}
z00p|Mr68=q5`I~@OgC>ge)D7d89u~G5BUip5KVKTvFOoYgSalP6E|QQu6LQ3q(Ln2
zvT9o%yw3oGhNY1sVIVRYd6$oTz)OqL;FFjXodG{NVSs~W3|<@u=Z8bWuLlc)$UOfj
z@aNnpz>B(#utSiilg{!C0Cr*X902ZMiy&-iDB}?C_%22@$8I16cZg%5^_FF*dVHJ-
zz#a-K6G*cStG+xvnhjRwRdBBAU=JY3ws>P6xwfhj2h|K>YQv*1Yle$-vMhmsEP2jF
zySm-LF32@a{>IujI8F>Ma-E^YaJf)-?3Sf1CxBETDsH(1>HNlsYZ}_<gQO0E?plPn
z$g@%Sy9E-z285d?!9b@GS9Of)v*X_bDx(3hp9SQKZde4c*!3LS!YvPFS_aBu@Ce)d
zD}#08kk_U#a+`#JlhNS!$G}A$^B*iCtg3dS3Hj5Bw)|1>kMpPvX5(fPocD;ve`lr&
zf>9H9OdD%_PE-v{v(5kZ4?8-sj&-|*W*1Pya9zW;kYS;G3*wwgrcyJIgO-b`1T@gN
zK}H~4j_aY7(Vyz7acGzZwvb(ejM%Lvnxouq6RX!(#^;h~9e4QkvAH<P68^nahqz7K
za*e79LTm?w@i(S!+D_H*kliHDz8`?JYz?h>u4)iVC5Rj~iDS@$#AeTYHxA`<TlnHh
zS1g34#iHIpEPIeyW@I-(VxpfOno*h`KmF)Tj(Vj*;*g5TsguUxhi?Dn<G+3UkIx5!
zW2xQ4SAW8B((j&{5JsN7eF7di@>usg7>WF8Sb8`j{q@qsr)&I$Efy1`0_{8)E#Vgf
z;2<@Gy2y$TqA3YCPDPWOW|oRUh6b{yv;BC`rk%XZ2bDqT!z=$`6go}9qVNMg@+L1m
zxFWqBo>(}i^g=&w2=SgD!Y1_ty3QGbH-@@(J#2a43phIi)5M>bR4gSDhx#Ny9)9r>
zticbTqTYxBKKR37>D!UMH`$9vV!*LYYPcBr5g+*(FTE;A?~F6U&uh5Qc$j+yFpnmI
z&cAEI<P41Tu9+bKjwY!&A&#w=IOz@X#G(*O#n}-Ztad!&f<d9k*CI9P_gAG~U4?4m
zF4X=_p-=#IRQvL781!h^!wFw%Vm5xztcPz29GBV<IOtY5f{93&I%Og^9X`#&h{Q7>
zCI&>0-9F_=VBpdeC;jr7$NLg!VoztTyg3m0)`0Z-HR%^o8rw4p9;x#pm!p3lOLJn#
zRdv$9LV!yTi2c06JsT?A22;}kk=<}`83Dd``T5knEAW$uLvV{-AAoM5mm?>Pz@GKb
zb*@0~@h$+0{tSQ?Qx^c5yZQYjRRLd`pZTt28o92ZNGLiHHUD>R_y4a!`B@LN|MJNB
z6z9ih-@cKbKOG54gOS&+z{gy4M*N)Ks@LJ(u3?pkZw=gw8CK1X-9q<bHG;hsd{?kT
z?^6~IU%nUXe!}_()}4$<^Pm@D2Qs{c`PdAKeYFSK$7YMf?gNP4kY}mkvkVM)6uB?*
zUaT+t(d75P0`h75|GV-TgFmVnOV?$#@0*^Z-;vDpK4^!0y+yyGV)x<U#1VOW`=#P7
ze;|sl4=3oeR9}^4c?*8&b4QlvWu-2!$#Pv@sWX_OoWZv#@JoGsrVVI%hKEUE_?#@a
zWkA5234ymvg~H>)uVQOO&BDdBRcWjpRJP<}%1b)=X0=f{%pKTu*f%Q0wWL<cn`$Sw
zkjuhHV{!jjO|4m#(z@}Ip;Uq9pqff2E5&q@@)YM>17zN<l<E}!Lawo$Bqhb9xLm_*
zX{+UA&S=+j=F$n=tmxJ3zL~R<Zf+%?tdw*n3uNEbvSz!o1G-o8$6A)!rW%D6*(l9z
zr`60J=#Y*5#+s`Y_l(*O$hCU#bGogKqzd{gogCFsJ0-VRE*{NRcJfQLeWt%tO&u|N
zpICosR-?Az;;ddt?H`|yVfTDmcfMMLSc2~JKJBIUkF?@`tDFMAO0!F~EqR4z&ZCQ;
z<Z|LLk${7;s<5Q16hjN4TMB!02X!RN8Ph8lHB*JM*x7s`kx1@7pGYiHx{VAaVv@V{
zL+SlW^7$z~V?utDbSD$aKiB79W28(;my1zRdz5y$v~9EFomJ5HyxOyu7Bf6$y7*hV
z^GmaAfv(az99MIc{)p<M<-FBq_OK%M{b#JqepcU#S1D~A_xuF@dCI@k=T9<yr}&xI
zEOLJQm@BxnZB`jjwrOkTH(_D=_ZZaI=Mzw)nGxCiVKu|dllLfn@)#Bt-*wh+3hT#{
z{`B=_DStV&)Y3BhElpMyfhj@y`jRZ4c}nE_OJ{`N$&bm8OZgZ2`X84c)_+u{ym3HP
z@6iQLin7h1r|OBFnO3lMUSmg^?=9bd2m<&Cp)Et!Wtu!6>C-8)&Nj5N{w3etSwT_2
z-=?Jqabcv4{3O!y7dR0$u><4OyQwytH?iZ`ZFEQ+_UH0!I-ZQDq9%Opo%`Wl8HT_5
I;r~1T1e%nlz5oCK

diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py
index 94d8670b..b8fe3e0f 100644
--- a/scripts/test_mllm.py
+++ b/scripts/test_mllm.py
@@ -6,22 +6,23 @@ from datasets import load_dataset
 from peft import PeftModel
 from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
 import shutil
+from PIL import Image
 
 """usage
 python3 scripts/test_mllm.py \
 --base_model_path llava-hf/llava-1.5-7b-hf \
 --lora_model_path saves/llava-1.5-7b/lora/sft \
 --model_path saves/llava-1.5-7b/lora/merged \
---dataset_name data/mllm_example_dataset \
+--dataset_name data/llava_instruct_example.json \
 --do_merge 1
 """
 
 
 def get_processor(model_path):
-    CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""
+    processor = AutoProcessor.from_pretrained(model_path)
+    CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {{ message['content'] }} ASSISTANT: {% else %}{{ message['content'] }}{% endif %} {% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""
     tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
     tokenizer.chat_template = CHAT_TEMPLATE
-    processor = AutoProcessor.from_pretrained(model_path)
     processor.tokenizer = tokenizer
     return processor
 
@@ -69,7 +70,7 @@ def main(
         device_map="cuda",
     )
     processor = get_processor(model_path)
-    raw_datasets = load_dataset(dataset_name)
+    raw_datasets = load_dataset("json", data_files=dataset_name)
     train_dataset = raw_datasets["train"]
     examples = train_dataset.select(range(3))
     texts = []
@@ -80,11 +81,18 @@ def main(
             messages, tokenize=False, add_generation_prompt=False
         )
         texts.append(text)
-        images.append(example["images"][0])
-    batch = processor(texts, images, return_tensors="pt", padding=True).to("cuda")
+        images.append(Image.open(example["images"][0]))
+    batch = processor(text=texts, images=images, return_tensors="pt", padding=True).to(
+        "cuda"
+    )
     output = model.generate(**batch, max_new_tokens=100)
-    res = processor.batch_decode(output, skip_special_tokens=True)
-    print(res)
+    res_list = processor.batch_decode(output, skip_special_tokens=True)
+    for i, prompt in enumerate(texts):
+        res = res_list[i]
+        print(f"#{i}")
+        print(f"prompt:{prompt}")
+        print(f"response:{res[len(prompt):].strip()}")
+        print()
 
 
 if __name__ == "__main__":
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 311660aa..e6cdadd6 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -1012,8 +1012,8 @@ _register_template(
 
 _register_template(
     name="llava",
-    format_user=StringFormatter(slots=["USER: {{content}} "]),
-    format_assistant=StringFormatter(slots=["ASSISTANT: {{content}}"]),
+    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT: "]),
+    format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]),
     default_system=(
         "A chat between a curious user and an artificial intelligence assistant. "
         "The assistant gives helpful, detailed, and polite answers to the user's questions."

From 9b210cf4b3bbd1faae0b36e3a4fd1587d1357057 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 20:09:43 +0800
Subject: [PATCH 159/341] rm some

Former-commit-id: 2c85b4fabbebd8b51eee53f5d29184d4a6e97569
---
 src/llmtuner/train/sftmm/collator.py | 42 ----------------------------
 1 file changed, 42 deletions(-)
 delete mode 100644 src/llmtuner/train/sftmm/collator.py

diff --git a/src/llmtuner/train/sftmm/collator.py b/src/llmtuner/train/sftmm/collator.py
deleted file mode 100644
index 2931dd9c..00000000
--- a/src/llmtuner/train/sftmm/collator.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from dataclasses import dataclass
-from transformers import AutoProcessor
-
-
-@dataclass
-class DataCollatorForVis2Seq:
-    processor: AutoProcessor
-
-    def __call__(self, examples):
-        texts = []
-        images = []
-        for example in examples:
-            if len(example["images"]) > 1:
-                raise ValueError("This collator only supports one image per example")
-            messages = example["messages"]
-            text = self.processor.tokenizer.apply_chat_template(
-                messages, tokenize=False, add_generation_prompt=False
-            )
-            texts.append(text)
-            images.append(example["images"][0])
-
-        batch = self.processor(
-            text=texts, images=images, return_tensors="pt", padding=True
-        )
-
-        labels = batch["input_ids"].clone()
-        if self.processor.tokenizer.pad_token_id is not None:
-            labels[labels == self.processor.tokenizer.pad_token_id] = -100
-        batch["labels"] = labels
-
-        return batch
-
-
-@dataclass
-class DataCollatorForMLLM:
-    processor: AutoProcessor
-
-    def __call__(self, examples):
-        print(examples[0].keys())
-        print(examples[0]["input_ids"])
-        batch = {}
-        return batch

From 7bfbcb1fe38a87fdcd685459234be6ea141b7518 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 25 Apr 2024 20:24:31 +0800
Subject: [PATCH 160/341] vllm + lora support

Former-commit-id: 8cb86ba355195f5d6dcb95ee6b6b7203463a34db
---
 src/llmtuner/chat/vllm_engine.py | 17 ++++++++++++++---
 src/llmtuner/hparams/parser.py   |  8 ++++----
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index 67a19b68..786e743d 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -10,6 +10,7 @@ from .base_engine import BaseEngine, Response
 
 if is_vllm_available():
     from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams
+    from vllm.lora.request import LoRARequest
 
 if TYPE_CHECKING:
     from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
@@ -24,7 +25,8 @@ class VllmEngine(BaseEngine):
         generating_args: "GeneratingArguments",
     ) -> None:
         config = load_config(model_args)  # may download model from ms hub
-        load_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
+        infer_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
+        infer_dtype = str(infer_dtype).split(".")[-1]
 
         self.can_generate = finetuning_args.stage == "sft"
         self.tokenizer = load_tokenizer(model_args)
@@ -36,15 +38,20 @@ class VllmEngine(BaseEngine):
             model=model_args.model_name_or_path,
             trust_remote_code=True,
             download_dir=model_args.cache_dir,
-            dtype=str(load_dtype).split(".")[-1],
+            dtype=infer_dtype,
             max_model_len=model_args.vllm_maxlen,
             tensor_parallel_size=get_device_count() or 1,
             gpu_memory_utilization=model_args.vllm_gpu_util,
             disable_log_stats=True,
             disable_log_requests=True,
             enforce_eager=model_args.vllm_enforce_eager,
+            enable_lora=model_args.adapter_name_or_path is not None,
         )
         self.model = AsyncLLMEngine.from_engine_args(engine_args)
+        if model_args.adapter_name_or_path is not None:
+            self.lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
+        else:
+            self.lora_request = None
 
     async def _generate(
         self,
@@ -98,7 +105,11 @@ class VllmEngine(BaseEngine):
             skip_special_tokens=True,
         )
         result_generator = self.model.generate(
-            prompt=None, sampling_params=sampling_params, request_id=request_id, prompt_token_ids=prompt_ids
+            prompt=None,
+            sampling_params=sampling_params,
+            request_id=request_id,
+            prompt_token_ids=prompt_ids,
+            lora_request=self.lora_request,
         )
         return result_generator
 
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index a7d0a17f..c922dc47 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -308,15 +308,15 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
         if finetuning_args.stage != "sft":
             raise ValueError("vLLM engine only supports auto-regressive models.")
 
-        if model_args.adapter_name_or_path is not None:
-            raise ValueError("vLLM engine does not support LoRA adapters. Merge them first.")
-
         if model_args.quantization_bit is not None:
-            raise ValueError("vLLM engine does not support quantization.")
+            raise ValueError("vLLM engine does not support bnb quantization (GPTQ and AWQ are supported).")
 
         if model_args.rope_scaling is not None:
             raise ValueError("vLLM engine does not support RoPE scaling.")
 
+        if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1:
+            raise ValueError("vLLM only accepts a single adapter. Merge them first.")
+
     _verify_model_args(model_args, finetuning_args)
     _check_extra_dependencies(model_args, finetuning_args)
 

From 29eebef696eb15039f77d27cfb8a22c45855eac6 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 25 Apr 2024 20:49:23 +0800
Subject: [PATCH 161/341] add webui backend option

Former-commit-id: 3764586cb3ed64fe376d0ae420ff5690c28459e2
---
 src/llmtuner/webui/chatter.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/llmtuner/webui/chatter.py b/src/llmtuner/webui/chatter.py
index ee28603e..82e7b7f1 100644
--- a/src/llmtuner/webui/chatter.py
+++ b/src/llmtuner/webui/chatter.py
@@ -31,7 +31,10 @@ class WebChatModel(ChatModel):
         if demo_mode and os.environ.get("DEMO_MODEL") and os.environ.get("DEMO_TEMPLATE"):  # load demo model
             model_name_or_path = os.environ.get("DEMO_MODEL")
             template = os.environ.get("DEMO_TEMPLATE")
-            super().__init__(dict(model_name_or_path=model_name_or_path, template=template))
+            infer_backend = os.environ.get("DEMO_BACKEND", "huggingface")
+            super().__init__(
+                dict(model_name_or_path=model_name_or_path, template=template, infer_backend=infer_backend)
+            )
 
     @property
     def loaded(self) -> bool:

From dbd905438b22bc74576900dc310ef5c506bd88ab Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 21:08:32 +0800
Subject: [PATCH 162/341] add some

Former-commit-id: 8d035a849c4a441d457791aab073861adf69a09f
---
 src/llmtuner/model/__init__.py       |   3 +-
 src/llmtuner/model/loader.py         |  50 +++++------
 src/llmtuner/train/sft/workflow.py   |  72 +++++++++++----
 src/llmtuner/train/sftmm/__init__.py |   3 -
 src/llmtuner/train/sftmm/metric.py   |  61 -------------
 src/llmtuner/train/sftmm/trainer.py  |  44 ----------
 src/llmtuner/train/sftmm/workflow.py | 127 ---------------------------
 src/llmtuner/train/tuner.py          |   3 -
 8 files changed, 80 insertions(+), 283 deletions(-)
 delete mode 100644 src/llmtuner/train/sftmm/__init__.py
 delete mode 100644 src/llmtuner/train/sftmm/metric.py
 delete mode 100644 src/llmtuner/train/sftmm/trainer.py
 delete mode 100644 src/llmtuner/train/sftmm/workflow.py

diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index f6be60d8..db81e1dc 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,11 +1,10 @@
-from .loader import load_config, load_model, load_tokenizer, load_processor
+from .loader import load_config, load_model, load_tokenizer
 from .utils.misc import find_all_linear_modules, load_valuehead_params
 
 __all__ = [
     "load_config",
     "load_model",
     "load_tokenizer",
-    "load_processor",
     "load_valuehead_params",
     "find_all_linear_modules",
 ]
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 18b0cf79..99ad9adc 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -40,7 +40,9 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
     }
 
 
-def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
+def load_tokenizer(
+    model_args: "ModelArguments",
+) -> Dict[str, Union["PreTrainedTokenizer", "AutoProcesser"]]:
     r"""
     Loads pretrained tokenizer.
 
@@ -78,33 +80,25 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
             )
 
     patch_tokenizer(tokenizer)
-    return tokenizer
-
-
-def load_processor(model_args: "ModelArguments") -> "AutoProcessor":
-    r"""
-    Loads processor. Must before load_model.
-
-    Note: including inplace operation of model_args.
-    """
-    init_kwargs = _get_init_kwargs(model_args)
-    try:
-        processor = AutoProcessor.from_pretrained(
-            model_args.model_name_or_path,
-            use_fast=model_args.use_fast_tokenizer,
-            split_special_tokens=model_args.split_special_tokens,
-            padding_side="right",
-            **init_kwargs,
-        )
-    except Exception:  # try the fast one
-        processor = AutoProcessor.from_pretrained(
-            model_args.model_name_or_path,
-            use_fast=True,
-            padding_side="right",
-            **init_kwargs,
-        )
-
-    return processor
+    tokenizer_modules = {"tokenizer": tokenizer, "processor": None}
+    if model_args.use_mllm:
+        try:
+            processor = AutoProcessor.from_pretrained(
+                model_args.model_name_or_path,
+                use_fast=model_args.use_fast_tokenizer,
+                split_special_tokens=model_args.split_special_tokens,
+                padding_side="right",
+                **init_kwargs,
+            )
+        except Exception:  # try the fast one
+            processor = AutoProcessor.from_pretrained(
+                model_args.model_name_or_path,
+                use_fast=True,
+                padding_side="right",
+                **init_kwargs,
+            )
+        tokenizer_modules["processor"] = processor
+    return tokenizer_modules
 
 
 def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py
index 9ab78850..6f887810 100644
--- a/src/llmtuner/train/sft/workflow.py
+++ b/src/llmtuner/train/sft/workflow.py
@@ -17,7 +17,12 @@ from .trainer import CustomSeq2SeqTrainer
 if TYPE_CHECKING:
     from transformers import Seq2SeqTrainingArguments, TrainerCallback
 
-    from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+    from ...hparams import (
+        DataArguments,
+        FinetuningArguments,
+        GeneratingArguments,
+        ModelArguments,
+    )
 
 
 def run_sft(
@@ -28,25 +33,48 @@ def run_sft(
     generating_args: "GeneratingArguments",
     callbacks: Optional[List["TrainerCallback"]] = None,
 ):
-    tokenizer = load_tokenizer(model_args)
-    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft")
+    tokenizer_modules = load_tokenizer(model_args)
+    tokenizer = tokenizer_modules["tokenizer"]
+    processor = tokenizer_modules["processor"]
+    dataset = get_dataset(
+        tokenizer,
+        model_args,
+        data_args,
+        training_args,
+        stage="sft",
+        processor=processor,
+    )
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
     if training_args.predict_with_generate:
         tokenizer.padding_side = "left"  # use left-padding in generation
 
     if getattr(model, "is_quantized", False) and not training_args.do_train:
-        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
+        setattr(
+            model, "_hf_peft_config_loaded", True
+        )  # hack here: make model compatible with prediction
 
     data_collator = DataCollatorForSeq2Seq(
         tokenizer=tokenizer,
-        pad_to_multiple_of=8 if tokenizer.padding_side == "right" else None,  # for shift short attention
-        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
+        pad_to_multiple_of=(
+            8 if tokenizer.padding_side == "right" else None
+        ),  # for shift short attention
+        label_pad_token_id=(
+            IGNORE_INDEX
+            if data_args.ignore_pad_token_for_loss
+            else tokenizer.pad_token_id
+        ),
     )
 
     # Override the decoding parameters of Seq2SeqTrainer
-    training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
-    training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
+    training_args.generation_max_length = (
+        training_args.generation_max_length or data_args.cutoff_len
+    )
+    training_args.generation_num_beams = (
+        data_args.eval_num_beams or training_args.generation_num_beams
+    )
+    if model_args.use_mllm:
+        training_args.remove_unused_columns = False
 
     # Initialize our Trainer
     trainer = CustomSeq2SeqTrainer(
@@ -56,19 +84,25 @@ def run_sft(
         tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
-        compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None,
+        compute_metrics=(
+            ComputeMetrics(tokenizer) if training_args.predict_with_generate else None
+        ),
         **split_dataset(dataset, data_args, training_args),
     )
 
     # Keyword arguments for `model.generate`
     gen_kwargs = generating_args.to_dict()
-    gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids
+    gen_kwargs["eos_token_id"] = [
+        tokenizer.eos_token_id
+    ] + tokenizer.additional_special_tokens_ids
     gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
     gen_kwargs["logits_processor"] = get_logits_processor()
 
     # Training
     if training_args.do_train:
-        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        train_result = trainer.train(
+            resume_from_checkpoint=training_args.resume_from_checkpoint
+        )
         trainer.save_model()
         trainer.log_metrics("train", train_result.metrics)
         trainer.save_metrics("train", train_result.metrics)
@@ -79,19 +113,27 @@ def run_sft(
     # Evaluation
     if training_args.do_eval:
         metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
-        if training_args.predict_with_generate:  # eval_loss will be wrong if predict_with_generate is enabled
+        if (
+            training_args.predict_with_generate
+        ):  # eval_loss will be wrong if predict_with_generate is enabled
             metrics.pop("eval_loss", None)
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
 
     # Predict
     if training_args.do_predict:
-        predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs)
-        if training_args.predict_with_generate:  # predict_loss will be wrong if predict_with_generate is enabled
+        predict_results = trainer.predict(
+            dataset, metric_key_prefix="predict", **gen_kwargs
+        )
+        if (
+            training_args.predict_with_generate
+        ):  # predict_loss will be wrong if predict_with_generate is enabled
             predict_results.metrics.pop("predict_loss", None)
         trainer.log_metrics("predict", predict_results.metrics)
         trainer.save_metrics("predict", predict_results.metrics)
         trainer.save_predictions(predict_results)
 
     # Create model card
-    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
+    create_modelcard_and_push(
+        trainer, model_args, data_args, training_args, finetuning_args
+    )
diff --git a/src/llmtuner/train/sftmm/__init__.py b/src/llmtuner/train/sftmm/__init__.py
deleted file mode 100644
index 3eb8b2e2..00000000
--- a/src/llmtuner/train/sftmm/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .workflow import run_sft_mm
-
-__all__ = ["run_sft_mm"]
diff --git a/src/llmtuner/train/sftmm/metric.py b/src/llmtuner/train/sftmm/metric.py
deleted file mode 100644
index d1af4c17..00000000
--- a/src/llmtuner/train/sftmm/metric.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union
-
-import numpy as np
-
-from ...extras.constants import IGNORE_INDEX
-from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available
-
-
-if TYPE_CHECKING:
-    from transformers.tokenization_utils import PreTrainedTokenizer
-
-if is_jieba_available():
-    import jieba  # type: ignore
-
-if is_nltk_available():
-    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-
-if is_rouge_available():
-    from rouge_chinese import Rouge
-
-
-@dataclass
-class ComputeMetrics:
-    r"""
-    Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer.
-    """
-
-    tokenizer: "PreTrainedTokenizer"
-
-    def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]:
-        r"""
-        Uses the model predictions to compute metrics.
-        """
-        preds, labels = eval_preds
-        score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []}
-
-        preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id)
-        labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id)
-
-        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
-        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
-
-        for pred, label in zip(decoded_preds, decoded_labels):
-            hypothesis = list(jieba.cut(pred))
-            reference = list(jieba.cut(label))
-
-            if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
-                result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
-            else:
-                rouge = Rouge()
-                scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
-                result = scores[0]
-
-            for k, v in result.items():
-                score_dict[k].append(round(v["f"] * 100, 4))
-
-            bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
-            score_dict["bleu-4"].append(round(bleu_score * 100, 4))
-
-        return {k: float(np.mean(v)) for k, v in score_dict.items()}
diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py
deleted file mode 100644
index f094e609..00000000
--- a/src/llmtuner/train/sftmm/trainer.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import json
-import os
-from types import MethodType
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import torch
-from transformers import Seq2SeqTrainer, Trainer
-
-from ...extras.constants import IGNORE_INDEX
-from ...extras.logging import get_logger
-from ..utils import create_custom_optimzer, create_custom_scheduler
-
-if TYPE_CHECKING:
-    from transformers.trainer import PredictionOutput
-    from peft import PeftModelForCausalLM
-    from ...hparams import FinetuningArguments
-
-logger = get_logger(__name__)
-
-
-class CustomSeq2SeqTrainer(Seq2SeqTrainer):
-    r"""
-    Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE.
-    """
-
-    def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
-        super().__init__(**kwargs)
-        self.finetuning_args = finetuning_args
-        if finetuning_args.use_badam:
-            from badam import clip_grad_norm_for_sparse_tensor
-
-            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
-
-    def create_optimizer(self) -> "torch.optim.Optimizer":
-        if self.optimizer is None:
-            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
-        return super().create_optimizer()
-
-    def create_scheduler(
-            self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
-    ) -> "torch.optim.lr_scheduler.LRScheduler":
-        create_custom_scheduler(self.args, num_training_steps, optimizer)
-        return super().create_scheduler(num_training_steps, optimizer)
diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py
deleted file mode 100644
index 3849a563..00000000
--- a/src/llmtuner/train/sftmm/workflow.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py
-import os
-from typing import TYPE_CHECKING, List, Optional
-from ...data import get_dataset
-from ...extras.misc import get_logits_processor
-from ...extras.ploting import plot_loss
-from ...model import load_processor, load_model
-from ..utils import create_modelcard_and_push
-from .metric import ComputeMetrics
-from .trainer import CustomSeq2SeqTrainer
-from transformers import DataCollatorForSeq2Seq
-from ...extras.constants import IGNORE_INDEX
-
-if TYPE_CHECKING:
-    from transformers import Seq2SeqTrainingArguments, TrainerCallback
-
-    from ...hparams import (
-        DataArguments,
-        FinetuningArguments,
-        GeneratingArguments,
-        ModelArguments,
-    )
-
-
-def run_sft_mm(
-    model_args: "ModelArguments",
-    data_args: "DataArguments",
-    training_args: "Seq2SeqTrainingArguments",
-    finetuning_args: "FinetuningArguments",
-    generating_args: "GeneratingArguments",
-    callbacks: Optional[List["TrainerCallback"]] = None,
-):
-    processor = load_processor(model_args)
-    tokenizer = processor.tokenizer
-    dataset = get_dataset(
-        tokenizer, model_args, data_args, training_args, "sft", processor
-    )
-    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
-    if getattr(model, "is_quantized", False) and not training_args.do_train:
-        setattr(
-            model, "_hf_peft_config_loaded", True
-        )  # hack here: make model compatible with prediction
-    train_dataset = dataset
-    eval_dataset = dataset
-    data_collator = DataCollatorForSeq2Seq(
-        tokenizer=tokenizer,
-        pad_to_multiple_of=(
-            8 if tokenizer.padding_side == "right" else None
-        ),  # for shift short attention
-        label_pad_token_id=(
-            IGNORE_INDEX
-            if data_args.ignore_pad_token_for_loss
-            else tokenizer.pad_token_id
-        ),
-    )
-
-    # Override the decoding parameters of Seq2SeqTrainer
-    training_args.generation_max_length = (
-        training_args.generation_max_length or data_args.cutoff_len
-    )
-    training_args.generation_num_beams = (
-        data_args.eval_num_beams or training_args.generation_num_beams
-    )
-    training_args.remove_unused_columns = False
-
-    # Initialize our Trainer
-    trainer = CustomSeq2SeqTrainer(
-        model=model,
-        args=training_args,
-        finetuning_args=finetuning_args,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        callbacks=callbacks,
-        compute_metrics=(
-            ComputeMetrics(tokenizer) if training_args.predict_with_generate else None
-        ),
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-    )
-
-    # Keyword arguments for `model.generate`
-    gen_kwargs = generating_args.to_dict()
-    gen_kwargs["eos_token_id"] = [
-        tokenizer.eos_token_id
-    ] + tokenizer.additional_special_tokens_ids
-    gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
-    gen_kwargs["logits_processor"] = get_logits_processor()
-
-    # Training
-    if training_args.do_train:
-        train_result = trainer.train(
-            resume_from_checkpoint=training_args.resume_from_checkpoint
-        )
-        trainer.save_model()
-        trainer.log_metrics("train", train_result.metrics)
-        trainer.save_metrics("train", train_result.metrics)
-        trainer.save_state()
-        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
-            plot_loss(training_args.output_dir, keys=["loss", "eval_loss"])
-
-    # Evaluation
-    if training_args.do_eval:
-        metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
-        if (
-            training_args.predict_with_generate
-        ):  # eval_loss will be wrong if predict_with_generate is enabled
-            metrics.pop("eval_loss", None)
-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
-
-    # Predict
-    if training_args.do_predict:
-        predict_results = trainer.predict(
-            dataset, metric_key_prefix="predict", **gen_kwargs
-        )
-        if (
-            training_args.predict_with_generate
-        ):  # predict_loss will be wrong if predict_with_generate is enabled
-            predict_results.metrics.pop("predict_loss", None)
-        trainer.log_metrics("predict", predict_results.metrics)
-        trainer.save_metrics("predict", predict_results.metrics)
-        trainer.save_predictions(predict_results)
-
-    # Create model card
-    create_modelcard_and_push(
-        trainer, model_args, data_args, training_args, finetuning_args
-    )
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index ac56289c..5f691225 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -14,7 +14,6 @@ from .ppo import run_ppo
 from .pt import run_pt
 from .rm import run_rm
 from .sft import run_sft
-from .sftmm import run_sft_mm
 
 if TYPE_CHECKING:
     from transformers import TrainerCallback
@@ -30,8 +29,6 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["Tra
         run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
     elif finetuning_args.stage == "sft":
         run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
-    elif finetuning_args.stage == "sft_mm":
-        run_sft_mm(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
     elif finetuning_args.stage == "rm":
         run_rm(model_args, data_args, training_args, finetuning_args, callbacks)
     elif finetuning_args.stage == "ppo":

From c425436676d19434a662b6c4780030ec4bb52b8a Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 21:15:16 +0800
Subject: [PATCH 163/341] modify style

Former-commit-id: 54b713d0c4ffdfc6a7faeb14471b58bb1cd8acf5
---
 src/llmtuner/data/__init__.py        |   1 +
 src/llmtuner/data/aligner.py         |  49 ++-----
 src/llmtuner/data/loader.py          |  59 +++------
 src/llmtuner/data/parser.py          |  28 +---
 src/llmtuner/data/preprocess.py      |  90 +++----------
 src/llmtuner/data/template.py        | 188 +++++++--------------------
 src/llmtuner/hparams/model_args.py   | 101 ++++----------
 src/llmtuner/model/__init__.py       |   1 +
 src/llmtuner/model/adapter.py        |  82 +++---------
 src/llmtuner/model/loader.py         |  21 ++-
 src/llmtuner/train/sft/workflow.py   |  50 ++-----
 src/llmtuner/train/sftmm/__init__.py |   4 +
 src/llmtuner/train/sftmm/metric.py   |  61 +++++++++
 src/llmtuner/train/sftmm/trainer.py  |  39 ++++++
 src/llmtuner/train/sftmm/workflow.py | 101 ++++++++++++++
 src/llmtuner/train/tuner.py          |   1 +
 16 files changed, 374 insertions(+), 502 deletions(-)
 create mode 100644 src/llmtuner/train/sftmm/__init__.py
 create mode 100644 src/llmtuner/train/sftmm/metric.py
 create mode 100644 src/llmtuner/train/sftmm/trainer.py
 create mode 100644 src/llmtuner/train/sftmm/workflow.py

diff --git a/src/llmtuner/data/__init__.py b/src/llmtuner/data/__init__.py
index 00a82d73..792e89d9 100644
--- a/src/llmtuner/data/__init__.py
+++ b/src/llmtuner/data/__init__.py
@@ -3,6 +3,7 @@ from .loader import get_dataset
 from .template import Template, get_template_and_fix_tokenizer, templates
 from .utils import Role, split_dataset
 
+
 __all__ = [
     "PairwiseDataCollatorWithPadding",
     "get_dataset",
diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py
index 85202ea8..9d440aff 100644
--- a/src/llmtuner/data/aligner.py
+++ b/src/llmtuner/data/aligner.py
@@ -13,9 +13,7 @@ if TYPE_CHECKING:
     from .parser import DatasetAttr
 
 
-def convert_alpaca(
-    examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
-) -> Dict[str, List[Any]]:
+def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": []}
     for i in range(len(examples[dataset_attr.prompt])):
         prompt = []
@@ -33,16 +31,11 @@ def convert_alpaca(
 
         prompt.append({"role": Role.USER.value, "content": "\n".join(content)})
 
-        if dataset_attr.response and isinstance(
-            examples[dataset_attr.response][i], list
-        ):
+        if dataset_attr.response and isinstance(examples[dataset_attr.response][i], list):
             response = [
-                {"role": Role.ASSISTANT.value, "content": content}
-                for content in examples[dataset_attr.response][i]
+                {"role": Role.ASSISTANT.value, "content": content} for content in examples[dataset_attr.response][i]
             ]
-        elif dataset_attr.response and isinstance(
-            examples[dataset_attr.response][i], str
-        ):
+        elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str):
             response = [
                 {
                     "role": Role.ASSISTANT.value,
@@ -54,17 +47,13 @@ def convert_alpaca(
 
         outputs["prompt"].append(prompt)
         outputs["response"].append(response)
-        outputs["system"].append(
-            examples[dataset_attr.system][i] if dataset_attr.system else ""
-        )
+        outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
         outputs["tools"].append("")
         outputs["images"].append([])
     return outputs
 
 
-def convert_sharegpt(
-    examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
-) -> Dict[str, List[Any]]:
+def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": []}
     tag_mapping = {
         dataset_attr.user_tag: Role.USER.value,
@@ -77,10 +66,7 @@ def convert_sharegpt(
     even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag)
     accept_tags = (odd_tags, even_tags)
     for i, messages in enumerate(examples[dataset_attr.messages]):
-        if (
-            dataset_attr.system_tag
-            and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag
-        ):
+        if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag:
             system = messages[0][dataset_attr.content_tag]
             messages = messages[1:]
         else:
@@ -105,17 +91,13 @@ def convert_sharegpt(
         outputs["prompt"].append(aligned_messages[:-1])
         outputs["response"].append(aligned_messages[-1:])
         outputs["system"].append(system)
-        outputs["tools"].append(
-            examples[dataset_attr.tools][i] if dataset_attr.tools else ""
-        )
+        outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
         outputs["images"].append([])
 
     return outputs
 
 
-def convert_llava(
-    examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
-) -> Dict[str, List[Any]]:
+def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
     tag_mapping = {
         dataset_attr.user_tag: Role.USER.value,
@@ -128,10 +110,7 @@ def convert_llava(
     even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag)
     accept_tags = (odd_tags, even_tags)
     for i, messages in enumerate(examples[dataset_attr.messages]):
-        if (
-            dataset_attr.system_tag
-            and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag
-        ):
+        if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag:
             system = messages[0][dataset_attr.content_tag]
             messages = messages[1:]
         else:
@@ -156,13 +135,9 @@ def convert_llava(
         outputs["prompt"].append(aligned_messages[:-1])
         outputs["response"].append(aligned_messages[-1:])
         outputs["system"].append(system)
-        outputs["tools"].append(
-            examples[dataset_attr.tools][i] if dataset_attr.tools else ""
-        )
+        outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
         print(examples[dataset_attr.images][i])
-        outputs["images"].append(
-            examples[dataset_attr.images][i] if dataset_attr.images else []
-        )
+        outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else [])
 
     return outputs
 
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index c373e196..fa4aa9c1 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -1,6 +1,6 @@
 import inspect
 import os
-from typing import TYPE_CHECKING, Literal, Union, Optional
+from typing import TYPE_CHECKING, Literal, Optional, Union
 
 from datasets import load_dataset, load_from_disk
 
@@ -13,9 +13,10 @@ from .preprocess import get_preprocess_and_print_func
 from .template import get_template_and_fix_tokenizer
 from .utils import checksum, merge_dataset
 
+
 if TYPE_CHECKING:
     from datasets import Dataset, IterableDataset
-    from transformers import Seq2SeqTrainingArguments, AutoProcessor
+    from transformers import AutoProcessor, Seq2SeqTrainingArguments
     from transformers.tokenization_utils import PreTrainedTokenizer
 
     from ..hparams import DataArguments, ModelArguments
@@ -78,20 +79,14 @@ def load_single_dataset(
                 split=data_args.split,
                 cache_dir=cache_dir,
                 token=model_args.ms_hub_token,
-                use_streaming=(
-                    data_args.streaming and (dataset_attr.load_from != "file")
-                ),
+                use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
             )
             if isinstance(dataset, MsDataset):
                 dataset = dataset.to_hf_dataset()
         except ImportError:
-            raise ImportError(
-                "Please install modelscope via `pip install modelscope -U`"
-            )
+            raise ImportError("Please install modelscope via `pip install modelscope -U`")
     else:
-        if (
-            "trust_remote_code" in inspect.signature(load_dataset).parameters
-        ):  # for datasets==2.16.0
+        if "trust_remote_code" in inspect.signature(load_dataset).parameters:  # for datasets==2.16.0
             kwargs = {"trust_remote_code": True}
         else:
             kwargs = {}
@@ -108,9 +103,7 @@ def load_single_dataset(
             **kwargs,
         )
 
-    if data_args.streaming and (
-        dataset_attr.load_from == "file"
-    ):  # faster than specifying streaming=True
+    if data_args.streaming and (dataset_attr.load_from == "file"):  # faster than specifying streaming=True
         dataset = dataset.to_iterable_dataset()  # TODO: add num shards parameter
 
     if data_args.max_samples is not None:  # truncate dataset
@@ -135,13 +128,9 @@ def get_dataset(
     # Load tokenized dataset
     if data_args.tokenized_path is not None:
         if has_tokenized_data(data_args.tokenized_path):
-            logger.warning(
-                "Loading dataset from disk will ignore other data arguments."
-            )
+            logger.warning("Loading dataset from disk will ignore other data arguments.")
             dataset = load_from_disk(data_args.tokenized_path)
-            logger.info(
-                "Loaded tokenized dataset from {}.".format(data_args.tokenized_path)
-            )
+            logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path))
             if data_args.streaming:
                 dataset = dataset.to_iterable_dataset()
             return dataset
@@ -152,16 +141,10 @@ def get_dataset(
     with training_args.main_process_first(desc="load dataset"):
         all_datasets = []
         for dataset_attr in get_dataset_list(data_args):
-            if (stage == "rm" and dataset_attr.ranking is False) or (
-                stage != "rm" and dataset_attr.ranking is True
-            ):
-                raise ValueError(
-                    "The dataset is not applicable in the current training stage."
-                )
+            if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True):
+                raise ValueError("The dataset is not applicable in the current training stage.")
 
-            all_datasets.append(
-                load_single_dataset(dataset_attr, model_args, data_args)
-            )
+            all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args))
         dataset = merge_dataset(all_datasets, data_args, training_args)
 
     with training_args.main_process_first(desc="pre-process dataset"):
@@ -177,21 +160,13 @@ def get_dataset(
                 desc="Running tokenizer on dataset",
             )
 
-        dataset = dataset.map(
-            preprocess_func, batched=True, remove_columns=column_names, **kwargs
-        )
+        dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs)
 
         if data_args.tokenized_path is not None:
             if training_args.should_save:
                 dataset.save_to_disk(data_args.tokenized_path)
-                logger.info(
-                    "Tokenized dataset saved at {}.".format(data_args.tokenized_path)
-                )
-                logger.info(
-                    "Please restart the training with `--tokenized_path {}`.".format(
-                        data_args.tokenized_path
-                    )
-                )
+                logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path))
+                logger.info("Please restart the training with `--tokenized_path {}`.".format(data_args.tokenized_path))
 
             exit(0)
 
@@ -199,8 +174,6 @@ def get_dataset(
             try:
                 print_function(next(iter(dataset)))
             except StopIteration:
-                raise RuntimeError(
-                    "Cannot find valid samples, check `data/README.md` for the data format."
-                )
+                raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
 
         return dataset
diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py
index 79d6ed4e..4d3d7741 100644
--- a/src/llmtuner/data/parser.py
+++ b/src/llmtuner/data/parser.py
@@ -50,9 +50,7 @@ class DatasetAttr:
     def __repr__(self) -> str:
         return self.dataset_name
 
-    def set_attr(
-        self, key: str, obj: Dict[str, Any], default: Optional[Any] = None
-    ) -> None:
+    def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None:
         setattr(self, key, obj.get(key, default))
 
 
@@ -71,16 +69,12 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
         except Exception as err:
             if len(dataset_names) != 0:
                 raise ValueError(
-                    "Cannot open {} due to {}.".format(
-                        os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err)
-                    )
+                    "Cannot open {} due to {}.".format(os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err))
                 )
             dataset_info = None
 
     if data_args.interleave_probs is not None:
-        data_args.interleave_probs = [
-            float(prob.strip()) for prob in data_args.interleave_probs.split(",")
-        ]
+        data_args.interleave_probs = [float(prob.strip()) for prob in data_args.interleave_probs.split(",")]
 
     dataset_list: List[DatasetAttr] = []
     for name in dataset_names:
@@ -98,21 +92,13 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
 
         if has_hf_url or has_ms_url:
             if (use_modelscope() and has_ms_url) or (not has_hf_url):
-                dataset_attr = DatasetAttr(
-                    "ms_hub", dataset_name=dataset_info[name]["ms_hub_url"]
-                )
+                dataset_attr = DatasetAttr("ms_hub", dataset_name=dataset_info[name]["ms_hub_url"])
             else:
-                dataset_attr = DatasetAttr(
-                    "hf_hub", dataset_name=dataset_info[name]["hf_hub_url"]
-                )
+                dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
         elif "script_url" in dataset_info[name]:
-            dataset_attr = DatasetAttr(
-                "script", dataset_name=dataset_info[name]["script_url"]
-            )
+            dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
         else:
-            dataset_attr = DatasetAttr(
-                "file", dataset_name=dataset_info[name]["file_name"]
-            )
+            dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"])
 
         dataset_attr.set_attr("file_sha1", dataset_info[name])
         dataset_attr.set_attr("subset", dataset_info[name])
diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index dc72483f..1c8c64a6 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -1,6 +1,6 @@
 from functools import partial
 from itertools import chain
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Tuple, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple
 
 from ..extras.constants import IGNORE_INDEX
 from ..extras.logging import get_logger
@@ -9,7 +9,7 @@ from .utils import Role
 
 if TYPE_CHECKING:
     from transformers import Seq2SeqTrainingArguments
-    from transformers.tokenization_utils import PreTrainedTokenizer, AutoProcessor
+    from transformers.tokenization_utils import AutoProcessor, PreTrainedTokenizer
 
     from ..hparams import DataArguments
     from .template import Template
@@ -24,22 +24,16 @@ def preprocess_pretrain_dataset(
     data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
-    text_examples = [
-        messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]
-    ]
+    text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
 
     if not data_args.packing:
         if data_args.template == "gemma":
             text_examples = [tokenizer.bos_token + example for example in text_examples]
 
-        result = tokenizer(
-            text_examples, add_special_tokens=False, max_length=data_args.cutoff_len
-        )
+        result = tokenizer(text_examples, add_special_tokens=False, max_length=data_args.cutoff_len)
     else:
         tokenized_examples = tokenizer(text_examples, add_special_tokens=False)
-        concatenated_examples = {
-            k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()
-        }
+        concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
         total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
         block_size = data_args.cutoff_len
         total_length = (total_length // block_size) * block_size
@@ -87,9 +81,7 @@ def preprocess_supervised_dataset(
             if data_args.train_on_prompt:
                 source_mask = source_ids
             elif turn_idx != 0 and template.efficient_eos:
-                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (
-                    len(source_ids) - 1
-                )
+                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
             else:
                 source_mask = [IGNORE_INDEX] * len(source_ids)
 
@@ -128,9 +120,7 @@ def preprocess_packed_supervised_dataset(
             if data_args.train_on_prompt:
                 source_mask = source_ids
             elif len(input_ids) != 0 and template.efficient_eos:
-                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (
-                    len(source_ids) - 1
-                )
+                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
             else:
                 source_mask = [IGNORE_INDEX] * len(source_ids)
 
@@ -190,9 +180,7 @@ def preprocess_multimodal_supervised_dataset(
             if data_args.train_on_prompt:
                 source_mask = source_ids
             elif turn_idx != 0 and template.efficient_eos:
-                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (
-                    len(source_ids) - 1
-                )
+                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
             else:
                 source_mask = [IGNORE_INDEX] * len(source_ids)
 
@@ -206,9 +194,7 @@ def preprocess_multimodal_supervised_dataset(
         model_inputs["input_ids"].append(input_ids)
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
-        pixel_values = processor.image_processor(
-            examples["images"][0], return_tensors="pt"
-        )["pixel_values"][0]
+        pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0]
         model_inputs["pixel_values"].append(pixel_values)
     return model_inputs
 
@@ -229,9 +215,7 @@ def preprocess_unsupervised_dataset(
         if len(examples["response"][i]) == 1:
             messages = examples["prompt"][i] + examples["response"][i]
         else:
-            messages = examples["prompt"][i] + [
-                {"role": Role.ASSISTANT.value, "content": ""}
-            ]
+            messages = examples["prompt"][i] + [{"role": Role.ASSISTANT.value, "content": ""}]
 
         input_ids, labels = template.encode_oneturn(
             tokenizer,
@@ -294,15 +278,9 @@ def preprocess_pairwise_dataset(
     return model_inputs
 
 
-def print_supervised_dataset_example(
-    example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer"
-) -> None:
+def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
     print("input_ids:\n{}".format(example["input_ids"]))
-    print(
-        "inputs:\n{}".format(
-            tokenizer.decode(example["input_ids"], skip_special_tokens=False)
-        )
-    )
+    print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
     print("label_ids:\n{}".format(example["labels"]))
     print(
         "labels:\n{}".format(
@@ -314,38 +292,18 @@ def print_supervised_dataset_example(
     )
 
 
-def print_pairwise_dataset_example(
-    example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer"
-) -> None:
+def print_pairwise_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
     print("prompt_ids:\n{}".format(example["prompt_ids"]))
-    print(
-        "prompt:\n{}".format(
-            tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)
-        )
-    )
+    print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)))
     print("chosen_ids:\n{}".format(example["chosen_ids"]))
-    print(
-        "chosen:\n{}".format(
-            tokenizer.decode(example["chosen_ids"], skip_special_tokens=False)
-        )
-    )
+    print("chosen:\n{}".format(tokenizer.decode(example["chosen_ids"], skip_special_tokens=False)))
     print("rejected_ids:\n{}".format(example["rejected_ids"]))
-    print(
-        "rejected:\n{}".format(
-            tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)
-        )
-    )
+    print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)))
 
 
-def print_unsupervised_dataset_example(
-    example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer"
-) -> None:
+def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
     print("input_ids:\n{}".format(example["input_ids"]))
-    print(
-        "inputs:\n{}".format(
-            tokenizer.decode(example["input_ids"], skip_special_tokens=False)
-        )
-    )
+    print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
 
 
 def get_preprocess_and_print_func(
@@ -357,12 +315,8 @@ def get_preprocess_and_print_func(
     processor: Optional["AutoProcessor"] = None,
 ) -> Tuple[Callable, Callable]:
     if stage == "pt":
-        preprocess_func = partial(
-            preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args
-        )
-        print_function = partial(
-            print_unsupervised_dataset_example, tokenizer=tokenizer
-        )
+        preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args)
+        print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
     elif stage == "sft" and not training_args.predict_with_generate:
         if data_args.packing:
             preprocess_func = partial(
@@ -402,8 +356,6 @@ def get_preprocess_and_print_func(
             template=template,
             data_args=data_args,
         )
-        print_function = partial(
-            print_unsupervised_dataset_example, tokenizer=tokenizer
-        )
+        print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
 
     return preprocess_func, print_function
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index e6cdadd6..cf21e932 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -42,9 +42,7 @@ class Template:
         r"""
         Returns a single pair of token ids representing prompt and response respectively.
         """
-        encoded_pairs = self._encode(
-            tokenizer, messages, system, tools, cutoff_len, reserved_label_len
-        )
+        encoded_pairs = self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len)
         prompt_ids = []
         for query_ids, resp_ids in encoded_pairs[:-1]:
             prompt_ids += query_ids + resp_ids
@@ -64,9 +62,7 @@ class Template:
         r"""
         Returns multiple pairs of token ids representing prompts and responses respectively.
         """
-        return self._encode(
-            tokenizer, messages, system, tools, cutoff_len, reserved_label_len
-        )
+        return self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len)
 
     def _encode(
         self,
@@ -93,9 +89,7 @@ class Template:
                 elements += self.format_separator.apply()
 
             if message["role"] == Role.USER.value:
-                elements += self.format_user.apply(
-                    content=message["content"], idx=str(i // 2)
-                )
+                elements += self.format_user.apply(content=message["content"], idx=str(i // 2))
             elif message["role"] == Role.ASSISTANT.value:
                 elements += self.format_assistant.apply(content=message["content"])
             elif message["role"] == Role.OBSERVATION.value:
@@ -130,11 +124,7 @@ class Template:
                 elif "eos_token" in elem and tokenizer.eos_token_id is not None:
                     token_ids += [tokenizer.eos_token_id]
             else:
-                raise ValueError(
-                    "Input must be string, set[str] or dict[str, str], got {}".format(
-                        type(elem)
-                    )
-                )
+                raise ValueError("Input must be string, set[str] or dict[str, str], got {}".format(type(elem)))
 
         return token_ids
 
@@ -192,9 +182,7 @@ class Llama2Template(Template):
                 elements += self.format_separator.apply()
 
             if message["role"] == Role.USER.value:
-                elements += self.format_user.apply(
-                    content=system_text + message["content"]
-                )
+                elements += self.format_user.apply(content=system_text + message["content"])
             elif message["role"] == Role.ASSISTANT.value:
                 elements += self.format_assistant.apply(content=message["content"])
             elif message["role"] == Role.OBSERVATION.value:
@@ -257,9 +245,7 @@ def _register_template(
     template_class = Llama2Template if name.startswith("llama2") else Template
     default_user_formatter = StringFormatter(slots=["{{content}}"])
     default_assistant_formatter = StringFormatter(slots=["{{content}}"] + eos_slots)
-    default_function_formatter = FunctionFormatter(
-        slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots
-    )
+    default_function_formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots)
     default_tool_formatter = ToolFormatter(tool_format="default")
     default_separator_formatter = EmptyFormatter()
     templates[name] = template_class(
@@ -295,9 +281,7 @@ def _jinja_escape(content: str) -> str:
     return content.replace("\n", r"\n").replace("'", r"\'")
 
 
-def _convert_slots_to_jinja(
-    slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content"
-) -> str:
+def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str:
     slot_items = []
     for slot in slots:
         if isinstance(slot, str):
@@ -311,9 +295,7 @@ def _convert_slots_to_jinja(
         elif isinstance(slot, set):
             if "bos_token" in slot:
                 slot_items.append("'" + tokenizer.bos_token + "'")
-            elif (
-                "eos_token" in slot
-            ):  # do not use {{ eos_token }} since it may be replaced
+            elif "eos_token" in slot:  # do not use {{ eos_token }} since it may be replaced
                 slot_items.append("'" + tokenizer.eos_token + "'")
         elif isinstance(slot, dict):
             raise ValueError("Dict is not supported.")
@@ -325,37 +307,25 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer")
     jinja_template = ""
 
     if template.default_system:
-        jinja_template += (
-            "{% set system_message = '"
-            + _jinja_escape(template.default_system)
-            + "' %}"
-        )
+        jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}"
 
     jinja_template += (
-        "{% if messages[0]['role'] == 'system' %}"
-        "{% set system_message = messages[0]['content'] %}"
-        "{% endif %}"
+        "{% if messages[0]['role'] == 'system' %}" "{% set system_message = messages[0]['content'] %}" "{% endif %}"
     )
 
-    system_message = _convert_slots_to_jinja(
-        template.format_system.apply(), tokenizer, placeholder="system_message"
-    )
+    system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message")
     if isinstance(template, Llama2Template):
         pass
     elif template.force_system:
         jinja_template += "{{ " + system_message + " }}"
     else:
-        jinja_template += (
-            "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}"
-        )
+        jinja_template += "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}"
 
     jinja_template += "{% for message in messages %}"
     jinja_template += "{% set content = message['content'] %}"
     if isinstance(template, Llama2Template):
         jinja_template += "{% if loop.index0 == 0 and system_message is defined %}"
-        jinja_template += (
-            "{% set content = " + system_message + " + message['content'] %}"
-        )
+        jinja_template += "{% set content = " + system_message + " + message['content'] %}"
         jinja_template += "{% endif %}"
     jinja_template += "{% if message['role'] == 'user' %}"
     user_message = _convert_slots_to_jinja(template.format_user.apply(), tokenizer)
@@ -403,9 +373,7 @@ def get_template_and_fix_tokenizer(
         )
         logger.info("Add {} to stop words.".format(",".join(stop_words)))
         if num_added_tokens > 0:
-            logger.warning(
-                "New tokens have been added, make sure `resize_vocab` is True."
-            )
+            logger.warning("New tokens have been added, make sure `resize_vocab` is True.")
 
     try:
         tokenizer.chat_template = _get_jinja_template(template, tokenizer)
@@ -417,9 +385,7 @@ def get_template_and_fix_tokenizer(
 
 _register_template(
     name="alpaca",
-    format_user=StringFormatter(
-        slots=["### Instruction:\n{{content}}\n\n### Response:\n"]
-    ),
+    format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
     format_separator=EmptyFormatter(slots=["\n\n"]),
     default_system=(
         "Below is an instruction that describes a task. "
@@ -458,9 +424,7 @@ _register_template(
 
 _register_template(
     name="baichuan",
-    format_user=StringFormatter(
-        slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]
-    ),
+    format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
     efficient_eos=True,
 )
 
@@ -483,9 +447,7 @@ _register_template(
 
 _register_template(
     name="bluelm",
-    format_user=StringFormatter(
-        slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]
-    ),
+    format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
 )
 
 
@@ -504,9 +466,7 @@ _register_template(
 _register_template(
     name="chatglm2",
     format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问：{{content}}\n\n答："]),
-    format_system=StringFormatter(
-        slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]
-    ),
+    format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
     format_separator=EmptyFormatter(slots=["\n\n"]),
     efficient_eos=True,
     force_system=True,
@@ -515,13 +475,9 @@ _register_template(
 
 _register_template(
     name="chatglm3",
-    format_user=StringFormatter(
-        slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
-    ),
+    format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
     format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
-    format_system=StringFormatter(
-        slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]
-    ),
+    format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
     format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]),
     format_observation=StringFormatter(
         slots=[
@@ -539,9 +495,7 @@ _register_template(
 
 _register_template(
     name="chatglm3_system",
-    format_user=StringFormatter(
-        slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
-    ),
+    format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
     format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
     format_system=StringFormatter(
         slots=[
@@ -572,15 +526,9 @@ _register_template(
 
 _register_template(
     name="chatml",
-    format_user=StringFormatter(
-        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
-    format_system=StringFormatter(
-        slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]
-    ),
-    format_observation=StringFormatter(
-        slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     stop_words=["<|im_end|>", "<|im_start|>"],
     replace_eos=True,
@@ -589,15 +537,9 @@ _register_template(
 
 _register_template(
     name="chatml_de",
-    format_user=StringFormatter(
-        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
-    format_system=StringFormatter(
-        slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]
-    ),
-    format_observation=StringFormatter(
-        slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
     stop_words=["<|im_end|>", "<|im_start|>"],
@@ -607,9 +549,7 @@ _register_template(
 
 _register_template(
     name="codegeex2",
-    format_system=StringFormatter(
-        slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]
-    ),
+    format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
     force_system=True,
 )
 
@@ -639,15 +579,9 @@ _register_template(
 
 _register_template(
     name="dbrx",
-    format_user=StringFormatter(
-        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
-    format_system=StringFormatter(
-        slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]
-    ),
-    format_observation=StringFormatter(
-        slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system=(
         "You are DBRX, created by Databricks. You were last updated in December 2023. "
@@ -725,9 +659,7 @@ _register_template(
 
 _register_template(
     name="gemma",
-    format_user=StringFormatter(
-        slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
-    ),
+    format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
     format_observation=StringFormatter(
         slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
@@ -740,9 +672,7 @@ _register_template(
 
 _register_template(
     name="intern",
-    format_user=StringFormatter(
-        slots=["<|User|>:{{content}}", {"token": "<eoh>"}, "\n<|Bot|>:"]
-    ),
+    format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": "<eoh>"}, "\n<|Bot|>:"]),
     format_separator=EmptyFormatter(slots=[{"token": "<eoa>"}, "\n"]),
     stop_words=["<eoa>"],
     efficient_eos=True,
@@ -751,12 +681,8 @@ _register_template(
 
 _register_template(
     name="intern2",
-    format_user=StringFormatter(
-        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
-    format_system=StringFormatter(
-        slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]
-    ),
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system=(
         "You are an AI assistant whose name is InternLM (书生·浦语).\n"
@@ -859,9 +785,7 @@ _register_template(
 
 _register_template(
     name="orion",
-    format_user=StringFormatter(
-        slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]
-    ),
+    format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
     force_system=True,
 )
@@ -869,15 +793,9 @@ _register_template(
 
 _register_template(
     name="phi",
-    format_user=StringFormatter(
-        slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]
-    ),
-    format_system=StringFormatter(
-        slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]
-    ),
-    format_observation=StringFormatter(
-        slots=["<|function_output|>\n{{content}}<|end|>\n<|assistant|>\n"]
-    ),
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
+    format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]),
+    format_observation=StringFormatter(slots=["<|function_output|>\n{{content}}<|end|>\n<|assistant|>\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system="You are a helpful AI assistant.",
     stop_words=["<|end|>"],
@@ -887,15 +805,9 @@ _register_template(
 
 _register_template(
     name="qwen",
-    format_user=StringFormatter(
-        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
-    format_system=StringFormatter(
-        slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]
-    ),
-    format_observation=StringFormatter(
-        slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     default_system="You are a helpful assistant.",
     stop_words=["<|im_end|>"],
@@ -951,12 +863,8 @@ _register_template(
 
 _register_template(
     name="yayi",
-    format_user=StringFormatter(
-        slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]
-    ),
-    format_system=StringFormatter(
-        slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]
-    ),
+    format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
+    format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]),
     format_separator=EmptyFormatter(slots=["\n\n"]),
     default_system=(
         "You are a helpful, respectful and honest assistant named YaYi "
@@ -975,9 +883,7 @@ _register_template(
 
 _register_template(
     name="yi",
-    format_user=StringFormatter(
-        slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
-    ),
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),
     stop_words=["<|im_end|>"],
     replace_eos=True,
@@ -995,9 +901,7 @@ _register_template(
 
 _register_template(
     name="zephyr",
-    format_user=StringFormatter(
-        slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]
-    ),
+    format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]),
     format_assistant=StringFormatter(slots=["\n{{content}}", {"eos_token"}]),
     format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
     default_system="You are a friendly chatbot who always responds in the style of a pirate",
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index 66ac93cf..df1a5ec0 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -15,33 +15,23 @@ class ModelArguments:
     )
     adapter_name_or_path: Optional[str] = field(
         default=None,
-        metadata={
-            "help": "Path to the adapter weight or identifier from huggingface.co/models."
-        },
+        metadata={"help": "Path to the adapter weight or identifier from huggingface.co/models."},
     )
     cache_dir: Optional[str] = field(
         default=None,
-        metadata={
-            "help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."
-        },
+        metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."},
     )
     use_fast_tokenizer: bool = field(
         default=True,
-        metadata={
-            "help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."
-        },
+        metadata={"help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."},
     )
     resize_vocab: bool = field(
         default=False,
-        metadata={
-            "help": "Whether or not to resize the tokenizer vocab and the embedding layers."
-        },
+        metadata={"help": "Whether or not to resize the tokenizer vocab and the embedding layers."},
     )
     split_special_tokens: bool = field(
         default=False,
-        metadata={
-            "help": "Whether or not the special tokens should be split during the tokenization process."
-        },
+        metadata={"help": "Whether or not the special tokens should be split during the tokenization process."},
     )
     new_special_tokens: Optional[str] = field(
         default=None,
@@ -49,9 +39,7 @@ class ModelArguments:
     )
     model_revision: str = field(
         default="main",
-        metadata={
-            "help": "The specific model version to use (can be a branch name, tag name or commit id)."
-        },
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
     low_cpu_mem_usage: bool = field(
         default=True,
@@ -59,9 +47,7 @@ class ModelArguments:
     )
     quantization_bit: Optional[int] = field(
         default=None,
-        metadata={
-            "help": "The number of bits to quantize the model using bitsandbytes."
-        },
+        metadata={"help": "The number of bits to quantize the model using bitsandbytes."},
     )
     quantization_type: Literal["fp4", "nf4"] = field(
         default="nf4",
@@ -69,21 +55,15 @@ class ModelArguments:
     )
     double_quantization: bool = field(
         default=True,
-        metadata={
-            "help": "Whether or not to use double quantization in int4 training."
-        },
+        metadata={"help": "Whether or not to use double quantization in int4 training."},
     )
     quantization_device_map: Optional[Literal["auto"]] = field(
         default=None,
-        metadata={
-            "help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0."
-        },
+        metadata={"help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0."},
     )
     rope_scaling: Optional[Literal["linear", "dynamic"]] = field(
         default=None,
-        metadata={
-            "help": "Which scaling strategy should be adopted for the RoPE embeddings."
-        },
+        metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."},
     )
     flash_attn: Literal["off", "sdpa", "fa2", "auto"] = field(
         default="auto",
@@ -91,27 +71,19 @@ class ModelArguments:
     )
     shift_attn: bool = field(
         default=False,
-        metadata={
-            "help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."
-        },
+        metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."},
     )
     mixture_of_depths: Optional[Literal["convert", "load"]] = field(
         default=None,
-        metadata={
-            "help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."
-        },
+        metadata={"help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."},
     )
     use_unsloth: bool = field(
         default=False,
-        metadata={
-            "help": "Whether or not to use unsloth's optimization for the LoRA training."
-        },
+        metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."},
     )
     moe_aux_loss_coef: Optional[float] = field(
         default=None,
-        metadata={
-            "help": "Coefficient of the auxiliary router loss in mixture-of-experts model."
-        },
+        metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."},
     )
     disable_gradient_checkpointing: bool = field(
         default=False,
@@ -135,9 +107,7 @@ class ModelArguments:
     )
     vllm_gpu_util: float = field(
         default=0.9,
-        metadata={
-            "help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine."
-        },
+        metadata={"help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine."},
     )
     vllm_enforce_eager: bool = field(
         default=False,
@@ -177,9 +147,7 @@ class ModelArguments:
     )
     export_quantization_dataset: Optional[str] = field(
         default=None,
-        metadata={
-            "help": "Path to the dataset or dataset name to use in quantizing the exported model."
-        },
+        metadata={"help": "Path to the dataset or dataset name to use in quantizing the exported model."},
     )
     export_quantization_nsamples: int = field(
         default=128,
@@ -187,27 +155,19 @@ class ModelArguments:
     )
     export_quantization_maxlen: int = field(
         default=1024,
-        metadata={
-            "help": "The maximum length of the model inputs used for quantization."
-        },
+        metadata={"help": "The maximum length of the model inputs used for quantization."},
     )
     export_legacy_format: bool = field(
         default=False,
-        metadata={
-            "help": "Whether or not to save the `.bin` files instead of `.safetensors`."
-        },
+        metadata={"help": "Whether or not to save the `.bin` files instead of `.safetensors`."},
     )
     export_hub_model_id: Optional[str] = field(
         default=None,
-        metadata={
-            "help": "The name of the repository if push the model to the Hugging Face hub."
-        },
+        metadata={"help": "The name of the repository if push the model to the Hugging Face hub."},
     )
     print_param_status: bool = field(
         default=False,
-        metadata={
-            "help": "For debugging purposes, print the status of the parameters in the model."
-        },
+        metadata={"help": "For debugging purposes, print the status of the parameters in the model."},
     )
     use_mllm: bool = field(
         default=False,
@@ -220,21 +180,13 @@ class ModelArguments:
         self.model_max_length = None
 
         if self.split_special_tokens and self.use_fast_tokenizer:
-            raise ValueError(
-                "`split_special_tokens` is only supported for slow tokenizers."
-            )
+            raise ValueError("`split_special_tokens` is only supported for slow tokenizers.")
 
-        if (
-            self.adapter_name_or_path is not None
-        ):  # support merging multiple lora weights
-            self.adapter_name_or_path = [
-                path.strip() for path in self.adapter_name_or_path.split(",")
-            ]
+        if self.adapter_name_or_path is not None:  # support merging multiple lora weights
+            self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")]
 
         if self.new_special_tokens is not None:  # support multiple special tokens
-            self.new_special_tokens = [
-                token.strip() for token in self.new_special_tokens.split(",")
-            ]
+            self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")]
 
         assert self.quantization_bit in [
             None,
@@ -249,10 +201,7 @@ class ModelArguments:
             2,
         ], "We only accept 2/3/4/8-bit quantization."
 
-        if (
-            self.export_quantization_bit is not None
-            and self.export_quantization_dataset is None
-        ):
+        if self.export_quantization_bit is not None and self.export_quantization_dataset is None:
             raise ValueError("Quantization dataset is necessary for exporting.")
 
     def to_dict(self) -> Dict[str, Any]:
diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index db81e1dc..1824f084 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,6 +1,7 @@
 from .loader import load_config, load_model, load_tokenizer
 from .utils.misc import find_all_linear_modules, load_valuehead_params
 
+
 __all__ = [
     "load_config",
     "load_model",
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index e65798b7..f3db4d1e 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -38,9 +38,7 @@ def init_adapter(
         logger.info("Adapter is not found at evaluation, load the base model.")
         return model
 
-    if finetuning_args.finetuning_type != "lora" and getattr(
-        model, "quantization_method", None
-    ):
+    if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None):
         raise ValueError("You can only use lora for quantized models.")
 
     if finetuning_args.finetuning_type == "full" and is_trainable:
@@ -68,12 +66,8 @@ def init_adapter(
 
             stride = num_layers // finetuning_args.num_layer_trainable
             trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride)
-        elif (
-            finetuning_args.num_layer_trainable > 0
-        ):  # fine-tuning the last n layers if num_layer_trainable > 0
-            trainable_layer_ids = range(
-                num_layers - finetuning_args.num_layer_trainable, num_layers
-            )
+        elif finetuning_args.num_layer_trainable > 0:  # fine-tuning the last n layers if num_layer_trainable > 0
+            trainable_layer_ids = range(num_layers - finetuning_args.num_layer_trainable, num_layers)
         else:  # fine-tuning the first n layers if num_layer_trainable < 0
             trainable_layer_ids = range(-finetuning_args.num_layer_trainable)
 
@@ -88,15 +82,11 @@ def init_adapter(
         for module_name in finetuning_args.name_module_trainable:
             if module_name not in freeze_modules:
                 raise ValueError(
-                    "Module {} is not found, please choose from {}".format(
-                        module_name, ", ".join(freeze_modules)
-                    )
+                    "Module {} is not found, please choose from {}".format(module_name, ", ".join(freeze_modules))
                 )
 
             for idx in trainable_layer_ids:
-                trainable_layers.append(
-                    ".{:d}.{}".format(idx, module_name if module_name != "all" else "")
-                )
+                trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else ""))
 
         for name, param in model.named_parameters():
             if any(trainable_layer in name for trainable_layer in trainable_layers):
@@ -105,43 +95,27 @@ def init_adapter(
             else:
                 param.requires_grad_(False)
 
-        logger.info(
-            "Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids)))
-        )
+        logger.info("Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids))))
 
     if finetuning_args.finetuning_type == "lora":
-        logger.info(
-            "Fine-tuning method: {}".format(
-                "DoRA" if finetuning_args.use_dora else "LoRA"
-            )
-        )
+        logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA"))
         adapter_to_resume = None
 
         if model_args.adapter_name_or_path is not None:
             is_mergeable = True
-            if getattr(
-                model, "quantization_method", None
-            ):  # merge lora in quantized model is unstable
-                assert (
-                    len(model_args.adapter_name_or_path) == 1
-                ), "Quantized model only accepts a single adapter."
+            if getattr(model, "quantization_method", None):  # merge lora in quantized model is unstable
+                assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter."
                 is_mergeable = False
 
             if is_deepspeed_zero3_enabled():
-                assert (
-                    len(model_args.adapter_name_or_path) == 1
-                ), "Cannot use multiple adapters in DeepSpeed ZeRO-3."
+                assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3."
                 is_mergeable = False
 
             if model_args.use_unsloth:
-                assert (
-                    len(model_args.adapter_name_or_path) == 1
-                ), "Unsloth model only accepts a single adapter."
+                assert len(model_args.adapter_name_or_path) == 1, "Unsloth model only accepts a single adapter."
                 is_mergeable = False
 
-            if (is_trainable and not finetuning_args.create_new_adapter) or (
-                not is_mergeable
-            ):
+            if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable):
                 adapter_to_merge = model_args.adapter_name_or_path[:-1]
                 adapter_to_resume = model_args.adapter_name_or_path[-1]
             else:
@@ -158,9 +132,7 @@ def init_adapter(
 
             if adapter_to_resume is not None:  # resume lora training
                 if model_args.use_unsloth:
-                    model = load_unsloth_peft_model(
-                        config, model_args, is_trainable=is_trainable
-                    )
+                    model = load_unsloth_peft_model(config, model_args, is_trainable=is_trainable)
                 else:
                     model = PeftModel.from_pretrained(
                         model,
@@ -169,27 +141,19 @@ def init_adapter(
                         offload_folder=model_args.offload_folder,
                     )
 
-        if (
-            is_trainable and adapter_to_resume is None
-        ):  # create new lora weights while training
-            if (
-                len(finetuning_args.lora_target) == 1
-                and finetuning_args.lora_target[0] == "all"
-            ):
+        if is_trainable and adapter_to_resume is None:  # create new lora weights while training
+            if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all":
                 target_modules = find_all_linear_modules(model)
             else:
                 target_modules = finetuning_args.lora_target
 
             if finetuning_args.use_llama_pro:
-                target_modules = find_expanded_modules(
-                    model, target_modules, finetuning_args.num_layer_trainable
-                )
+                target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable)
 
             if (
                 finetuning_args.use_dora
                 and getattr(model, "quantization_method", None) is not None
-                and getattr(model, "quantization_method", None)
-                != QuantizationMethod.BITS_AND_BYTES
+                and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
             ):
                 raise ValueError("DoRA is not compatible with PTQ-quantized models.")
 
@@ -202,11 +166,7 @@ def init_adapter(
                         module_names.add(name.split(".")[-1])
 
                 finetuning_args.additional_target = module_names
-                logger.warning(
-                    "Vocab has been resized, add {} to trainable params.".format(
-                        ",".join(module_names)
-                    )
-                )
+                logger.warning("Vocab has been resized, add {} to trainable params.".format(",".join(module_names)))
 
             peft_kwargs = {
                 "r": finetuning_args.lora_rank,
@@ -233,10 +193,6 @@ def init_adapter(
                 param.data = param.data.to(torch.float32)
 
         if model_args.adapter_name_or_path is not None:
-            logger.info(
-                "Loaded adapter(s): {}".format(
-                    ",".join(model_args.adapter_name_or_path)
-                )
-            )
+            logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
 
     return model
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 99ad9adc..47298673 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -3,9 +3,9 @@ from typing import TYPE_CHECKING, Any, Dict, Union
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
-    AutoTokenizer,
-    AutoProcessor,
     AutoModelForVision2Seq,
+    AutoProcessor,
+    AutoTokenizer,
 )
 from trl import AutoModelForCausalLMWithValueHead
 
@@ -17,6 +17,7 @@ from .utils.misc import load_valuehead_params, register_autoclass
 from .utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
 from .utils.unsloth import load_unsloth_pretrained_model
 
+
 if TYPE_CHECKING:
     from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
 
@@ -42,7 +43,7 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
 
 def load_tokenizer(
     model_args: "ModelArguments",
-) -> Dict[str, Union["PreTrainedTokenizer", "AutoProcesser"]]:
+) -> Dict[str, Union["PreTrainedTokenizer", "AutoProcessor"]]:
     r"""
     Loads pretrained tokenizer.
 
@@ -70,14 +71,10 @@ def load_tokenizer(
             dict(additional_special_tokens=model_args.new_special_tokens),
             replace_additional_special_tokens=False,
         )
-        logger.info(
-            "Add {} to special tokens.".format(",".join(model_args.new_special_tokens))
-        )
+        logger.info("Add {} to special tokens.".format(",".join(model_args.new_special_tokens)))
         if num_added_tokens > 0 and not model_args.resize_vocab:
             model_args.resize_vocab = True
-            logger.warning(
-                "New tokens have been added, changed `resize_vocab` to True."
-            )
+            logger.warning("New tokens have been added, changed `resize_vocab` to True.")
 
     patch_tokenizer(tokenizer)
     tokenizer_modules = {"tokenizer": tokenizer, "processor": None}
@@ -174,10 +171,8 @@ def load_model(
 
     trainable_params, all_param = count_parameters(model)
     if is_trainable:
-        param_stats = (
-            "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
-                trainable_params, all_param, 100 * trainable_params / all_param
-            )
+        param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
+            trainable_params, all_param, 100 * trainable_params / all_param
         )
     else:
         param_stats = "all params: {:d}".format(all_param)
diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py
index 6f887810..50833a99 100644
--- a/src/llmtuner/train/sft/workflow.py
+++ b/src/llmtuner/train/sft/workflow.py
@@ -50,29 +50,17 @@ def run_sft(
         tokenizer.padding_side = "left"  # use left-padding in generation
 
     if getattr(model, "is_quantized", False) and not training_args.do_train:
-        setattr(
-            model, "_hf_peft_config_loaded", True
-        )  # hack here: make model compatible with prediction
+        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
 
     data_collator = DataCollatorForSeq2Seq(
         tokenizer=tokenizer,
-        pad_to_multiple_of=(
-            8 if tokenizer.padding_side == "right" else None
-        ),  # for shift short attention
-        label_pad_token_id=(
-            IGNORE_INDEX
-            if data_args.ignore_pad_token_for_loss
-            else tokenizer.pad_token_id
-        ),
+        pad_to_multiple_of=(8 if tokenizer.padding_side == "right" else None),  # for shift short attention
+        label_pad_token_id=(IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id),
     )
 
     # Override the decoding parameters of Seq2SeqTrainer
-    training_args.generation_max_length = (
-        training_args.generation_max_length or data_args.cutoff_len
-    )
-    training_args.generation_num_beams = (
-        data_args.eval_num_beams or training_args.generation_num_beams
-    )
+    training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
+    training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
     if model_args.use_mllm:
         training_args.remove_unused_columns = False
 
@@ -84,25 +72,19 @@ def run_sft(
         tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
-        compute_metrics=(
-            ComputeMetrics(tokenizer) if training_args.predict_with_generate else None
-        ),
+        compute_metrics=(ComputeMetrics(tokenizer) if training_args.predict_with_generate else None),
         **split_dataset(dataset, data_args, training_args),
     )
 
     # Keyword arguments for `model.generate`
     gen_kwargs = generating_args.to_dict()
-    gen_kwargs["eos_token_id"] = [
-        tokenizer.eos_token_id
-    ] + tokenizer.additional_special_tokens_ids
+    gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids
     gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
     gen_kwargs["logits_processor"] = get_logits_processor()
 
     # Training
     if training_args.do_train:
-        train_result = trainer.train(
-            resume_from_checkpoint=training_args.resume_from_checkpoint
-        )
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
         trainer.save_model()
         trainer.log_metrics("train", train_result.metrics)
         trainer.save_metrics("train", train_result.metrics)
@@ -113,27 +95,19 @@ def run_sft(
     # Evaluation
     if training_args.do_eval:
         metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
-        if (
-            training_args.predict_with_generate
-        ):  # eval_loss will be wrong if predict_with_generate is enabled
+        if training_args.predict_with_generate:  # eval_loss will be wrong if predict_with_generate is enabled
             metrics.pop("eval_loss", None)
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
 
     # Predict
     if training_args.do_predict:
-        predict_results = trainer.predict(
-            dataset, metric_key_prefix="predict", **gen_kwargs
-        )
-        if (
-            training_args.predict_with_generate
-        ):  # predict_loss will be wrong if predict_with_generate is enabled
+        predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs)
+        if training_args.predict_with_generate:  # predict_loss will be wrong if predict_with_generate is enabled
             predict_results.metrics.pop("predict_loss", None)
         trainer.log_metrics("predict", predict_results.metrics)
         trainer.save_metrics("predict", predict_results.metrics)
         trainer.save_predictions(predict_results)
 
     # Create model card
-    create_modelcard_and_push(
-        trainer, model_args, data_args, training_args, finetuning_args
-    )
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/src/llmtuner/train/sftmm/__init__.py b/src/llmtuner/train/sftmm/__init__.py
new file mode 100644
index 00000000..9ebdf821
--- /dev/null
+++ b/src/llmtuner/train/sftmm/__init__.py
@@ -0,0 +1,4 @@
+from .workflow import run_sft_mm
+
+
+__all__ = ["run_sft_mm"]
diff --git a/src/llmtuner/train/sftmm/metric.py b/src/llmtuner/train/sftmm/metric.py
new file mode 100644
index 00000000..d1af4c17
--- /dev/null
+++ b/src/llmtuner/train/sftmm/metric.py
@@ -0,0 +1,61 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union
+
+import numpy as np
+
+from ...extras.constants import IGNORE_INDEX
+from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available
+
+
+if TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+if is_jieba_available():
+    import jieba  # type: ignore
+
+if is_nltk_available():
+    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+
+if is_rouge_available():
+    from rouge_chinese import Rouge
+
+
+@dataclass
+class ComputeMetrics:
+    r"""
+    Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer.
+    """
+
+    tokenizer: "PreTrainedTokenizer"
+
+    def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]:
+        r"""
+        Uses the model predictions to compute metrics.
+        """
+        preds, labels = eval_preds
+        score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []}
+
+        preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id)
+        labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id)
+
+        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
+        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+        for pred, label in zip(decoded_preds, decoded_labels):
+            hypothesis = list(jieba.cut(pred))
+            reference = list(jieba.cut(label))
+
+            if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
+                result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
+            else:
+                rouge = Rouge()
+                scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
+                result = scores[0]
+
+            for k, v in result.items():
+                score_dict[k].append(round(v["f"] * 100, 4))
+
+            bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
+            score_dict["bleu-4"].append(round(bleu_score * 100, 4))
+
+        return {k: float(np.mean(v)) for k, v in score_dict.items()}
diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py
new file mode 100644
index 00000000..270e7169
--- /dev/null
+++ b/src/llmtuner/train/sftmm/trainer.py
@@ -0,0 +1,39 @@
+from types import MethodType
+from typing import TYPE_CHECKING, Optional
+
+import torch
+from transformers import Seq2SeqTrainer
+
+from ...extras.logging import get_logger
+from ..utils import create_custom_optimzer, create_custom_scheduler
+
+
+if TYPE_CHECKING:
+    from ...hparams import FinetuningArguments
+
+logger = get_logger(__name__)
+
+
+class CustomSeq2SeqTrainer(Seq2SeqTrainer):
+    r"""
+    Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE.
+    """
+
+    def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.finetuning_args = finetuning_args
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+
+    def create_optimizer(self) -> "torch.optim.Optimizer":
+        if self.optimizer is None:
+            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
+
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)
+        return super().create_scheduler(num_training_steps, optimizer)
diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py
new file mode 100644
index 00000000..dbda2d05
--- /dev/null
+++ b/src/llmtuner/train/sftmm/workflow.py
@@ -0,0 +1,101 @@
+# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py
+from typing import TYPE_CHECKING, List, Optional
+
+from transformers import DataCollatorForSeq2Seq
+
+from ...data import get_dataset
+from ...extras.constants import IGNORE_INDEX
+from ...extras.misc import get_logits_processor
+from ...extras.ploting import plot_loss
+from ...model import load_model, load_processor
+from ..sft.metric import ComputeMetrics
+from ..utils import create_modelcard_and_push
+from .trainer import CustomSeq2SeqTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import (
+        DataArguments,
+        FinetuningArguments,
+        GeneratingArguments,
+        ModelArguments,
+    )
+
+
+def run_sft_mm(
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    generating_args: "GeneratingArguments",
+    callbacks: Optional[List["TrainerCallback"]] = None,
+):
+    processor = load_processor(model_args)
+    tokenizer = processor.tokenizer
+    dataset = get_dataset(tokenizer, model_args, data_args, training_args, "sft", processor)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+    if getattr(model, "is_quantized", False) and not training_args.do_train:
+        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
+    train_dataset = dataset
+    eval_dataset = dataset
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer=tokenizer,
+        pad_to_multiple_of=(8 if tokenizer.padding_side == "right" else None),  # for shift short attention
+        label_pad_token_id=(IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id),
+    )
+
+    # Override the decoding parameters of Seq2SeqTrainer
+    training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
+    training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
+    training_args.remove_unused_columns = False
+
+    # Initialize our Trainer
+    trainer = CustomSeq2SeqTrainer(
+        model=model,
+        args=training_args,
+        finetuning_args=finetuning_args,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        compute_metrics=(ComputeMetrics(tokenizer) if training_args.predict_with_generate else None),
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
+
+    # Keyword arguments for `model.generate`
+    gen_kwargs = generating_args.to_dict()
+    gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids
+    gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
+    gen_kwargs["logits_processor"] = get_logits_processor()
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        trainer.save_model()
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            plot_loss(training_args.output_dir, keys=["loss", "eval_loss"])
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
+        if training_args.predict_with_generate:  # eval_loss will be wrong if predict_with_generate is enabled
+            metrics.pop("eval_loss", None)
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Predict
+    if training_args.do_predict:
+        predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs)
+        if training_args.predict_with_generate:  # predict_loss will be wrong if predict_with_generate is enabled
+            predict_results.metrics.pop("predict_loss", None)
+        trainer.log_metrics("predict", predict_results.metrics)
+        trainer.save_metrics("predict", predict_results.metrics)
+        trainer.save_predictions(predict_results)
+
+    # Create model card
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index 5f691225..e1999946 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -15,6 +15,7 @@ from .pt import run_pt
 from .rm import run_rm
 from .sft import run_sft
 
+
 if TYPE_CHECKING:
     from transformers import TrainerCallback
 

From 110c2ce2a522c7225f952c3cae6a5035569c0a86 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 21:27:48 +0800
Subject: [PATCH 164/341] modify style

Former-commit-id: 3bffc1e1b8bcc4582cebea06d35e5146163c7bec
---
 src/llmtuner/hparams/model_args.py   |  14 +---
 src/llmtuner/train/sftmm/__init__.py |   4 --
 src/llmtuner/train/sftmm/metric.py   |  61 ----------------
 src/llmtuner/train/sftmm/trainer.py  |  39 -----------
 src/llmtuner/train/sftmm/workflow.py | 101 ---------------------------
 5 files changed, 2 insertions(+), 217 deletions(-)
 delete mode 100644 src/llmtuner/train/sftmm/__init__.py
 delete mode 100644 src/llmtuner/train/sftmm/metric.py
 delete mode 100644 src/llmtuner/train/sftmm/trainer.py
 delete mode 100644 src/llmtuner/train/sftmm/workflow.py

diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index df1a5ec0..97b908e4 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -188,18 +188,8 @@ class ModelArguments:
         if self.new_special_tokens is not None:  # support multiple special tokens
             self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")]
 
-        assert self.quantization_bit in [
-            None,
-            8,
-            4,
-        ], "We only accept 4-bit or 8-bit quantization."
-        assert self.export_quantization_bit in [
-            None,
-            8,
-            4,
-            3,
-            2,
-        ], "We only accept 2/3/4/8-bit quantization."
+        assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
+        assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization."
 
         if self.export_quantization_bit is not None and self.export_quantization_dataset is None:
             raise ValueError("Quantization dataset is necessary for exporting.")
diff --git a/src/llmtuner/train/sftmm/__init__.py b/src/llmtuner/train/sftmm/__init__.py
deleted file mode 100644
index 9ebdf821..00000000
--- a/src/llmtuner/train/sftmm/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .workflow import run_sft_mm
-
-
-__all__ = ["run_sft_mm"]
diff --git a/src/llmtuner/train/sftmm/metric.py b/src/llmtuner/train/sftmm/metric.py
deleted file mode 100644
index d1af4c17..00000000
--- a/src/llmtuner/train/sftmm/metric.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union
-
-import numpy as np
-
-from ...extras.constants import IGNORE_INDEX
-from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available
-
-
-if TYPE_CHECKING:
-    from transformers.tokenization_utils import PreTrainedTokenizer
-
-if is_jieba_available():
-    import jieba  # type: ignore
-
-if is_nltk_available():
-    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-
-if is_rouge_available():
-    from rouge_chinese import Rouge
-
-
-@dataclass
-class ComputeMetrics:
-    r"""
-    Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer.
-    """
-
-    tokenizer: "PreTrainedTokenizer"
-
-    def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]:
-        r"""
-        Uses the model predictions to compute metrics.
-        """
-        preds, labels = eval_preds
-        score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []}
-
-        preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id)
-        labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id)
-
-        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
-        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
-
-        for pred, label in zip(decoded_preds, decoded_labels):
-            hypothesis = list(jieba.cut(pred))
-            reference = list(jieba.cut(label))
-
-            if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
-                result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
-            else:
-                rouge = Rouge()
-                scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
-                result = scores[0]
-
-            for k, v in result.items():
-                score_dict[k].append(round(v["f"] * 100, 4))
-
-            bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3)
-            score_dict["bleu-4"].append(round(bleu_score * 100, 4))
-
-        return {k: float(np.mean(v)) for k, v in score_dict.items()}
diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py
deleted file mode 100644
index 270e7169..00000000
--- a/src/llmtuner/train/sftmm/trainer.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from types import MethodType
-from typing import TYPE_CHECKING, Optional
-
-import torch
-from transformers import Seq2SeqTrainer
-
-from ...extras.logging import get_logger
-from ..utils import create_custom_optimzer, create_custom_scheduler
-
-
-if TYPE_CHECKING:
-    from ...hparams import FinetuningArguments
-
-logger = get_logger(__name__)
-
-
-class CustomSeq2SeqTrainer(Seq2SeqTrainer):
-    r"""
-    Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE.
-    """
-
-    def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
-        super().__init__(**kwargs)
-        self.finetuning_args = finetuning_args
-        if finetuning_args.use_badam:
-            from badam import clip_grad_norm_for_sparse_tensor
-
-            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
-
-    def create_optimizer(self) -> "torch.optim.Optimizer":
-        if self.optimizer is None:
-            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
-        return super().create_optimizer()
-
-    def create_scheduler(
-        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
-    ) -> "torch.optim.lr_scheduler.LRScheduler":
-        create_custom_scheduler(self.args, num_training_steps, optimizer)
-        return super().create_scheduler(num_training_steps, optimizer)
diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py
deleted file mode 100644
index dbda2d05..00000000
--- a/src/llmtuner/train/sftmm/workflow.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py
-from typing import TYPE_CHECKING, List, Optional
-
-from transformers import DataCollatorForSeq2Seq
-
-from ...data import get_dataset
-from ...extras.constants import IGNORE_INDEX
-from ...extras.misc import get_logits_processor
-from ...extras.ploting import plot_loss
-from ...model import load_model, load_processor
-from ..sft.metric import ComputeMetrics
-from ..utils import create_modelcard_and_push
-from .trainer import CustomSeq2SeqTrainer
-
-
-if TYPE_CHECKING:
-    from transformers import Seq2SeqTrainingArguments, TrainerCallback
-
-    from ...hparams import (
-        DataArguments,
-        FinetuningArguments,
-        GeneratingArguments,
-        ModelArguments,
-    )
-
-
-def run_sft_mm(
-    model_args: "ModelArguments",
-    data_args: "DataArguments",
-    training_args: "Seq2SeqTrainingArguments",
-    finetuning_args: "FinetuningArguments",
-    generating_args: "GeneratingArguments",
-    callbacks: Optional[List["TrainerCallback"]] = None,
-):
-    processor = load_processor(model_args)
-    tokenizer = processor.tokenizer
-    dataset = get_dataset(tokenizer, model_args, data_args, training_args, "sft", processor)
-    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
-    if getattr(model, "is_quantized", False) and not training_args.do_train:
-        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
-    train_dataset = dataset
-    eval_dataset = dataset
-    data_collator = DataCollatorForSeq2Seq(
-        tokenizer=tokenizer,
-        pad_to_multiple_of=(8 if tokenizer.padding_side == "right" else None),  # for shift short attention
-        label_pad_token_id=(IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id),
-    )
-
-    # Override the decoding parameters of Seq2SeqTrainer
-    training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
-    training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
-    training_args.remove_unused_columns = False
-
-    # Initialize our Trainer
-    trainer = CustomSeq2SeqTrainer(
-        model=model,
-        args=training_args,
-        finetuning_args=finetuning_args,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        callbacks=callbacks,
-        compute_metrics=(ComputeMetrics(tokenizer) if training_args.predict_with_generate else None),
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-    )
-
-    # Keyword arguments for `model.generate`
-    gen_kwargs = generating_args.to_dict()
-    gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids
-    gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
-    gen_kwargs["logits_processor"] = get_logits_processor()
-
-    # Training
-    if training_args.do_train:
-        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
-        trainer.save_model()
-        trainer.log_metrics("train", train_result.metrics)
-        trainer.save_metrics("train", train_result.metrics)
-        trainer.save_state()
-        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
-            plot_loss(training_args.output_dir, keys=["loss", "eval_loss"])
-
-    # Evaluation
-    if training_args.do_eval:
-        metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
-        if training_args.predict_with_generate:  # eval_loss will be wrong if predict_with_generate is enabled
-            metrics.pop("eval_loss", None)
-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
-
-    # Predict
-    if training_args.do_predict:
-        predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs)
-        if training_args.predict_with_generate:  # predict_loss will be wrong if predict_with_generate is enabled
-            predict_results.metrics.pop("predict_loss", None)
-        trainer.log_metrics("predict", predict_results.metrics)
-        trainer.save_metrics("predict", predict_results.metrics)
-        trainer.save_predictions(predict_results)
-
-    # Create model card
-    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)

From 058ed5e607b4e6b6f24eddcb1b44de261a002ae9 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 21:29:50 +0800
Subject: [PATCH 165/341] modify style

Former-commit-id: c1f1df99e4dc3d0aadf1207b4e9a16218187fd5a
---
 src/llmtuner/model/adapter.py | 6 +++---
 src/llmtuner/model/loader.py  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index f3db4d1e..d43e00f0 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING
 
 import torch
 from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model
@@ -21,11 +21,11 @@ logger = get_logger(__name__)
 
 def init_adapter(
     config: "PretrainedConfig",
-    model: Union["PreTrainedModel"],
+    model: "PreTrainedModel",
     model_args: "ModelArguments",
     finetuning_args: "FinetuningArguments",
     is_trainable: bool,
-) -> Union["PreTrainedModel"]:
+) -> "PreTrainedModel":
     r"""
     Initializes the adapters.
 
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 47298673..dd7eb44c 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -112,7 +112,7 @@ def load_model(
     finetuning_args: "FinetuningArguments",
     is_trainable: bool = False,
     add_valuehead: bool = False,
-) -> Union["PreTrainedModel"]:
+) -> "PreTrainedModel":
     r"""
     Loads pretrained model.
     """

From 10d59e9e4a76d85e1fa9b401235dd8c6cc4e6e2e Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 21:32:01 +0800
Subject: [PATCH 166/341] make dataset script

Former-commit-id: 25892f958da14976025a775febf628cd0e0a3d85
---
 scripts/make_mllm_instruct.py | 95 -----------------------------------
 1 file changed, 95 deletions(-)
 delete mode 100644 scripts/make_mllm_instruct.py

diff --git a/scripts/make_mllm_instruct.py b/scripts/make_mllm_instruct.py
deleted file mode 100644
index 41e13b8e..00000000
--- a/scripts/make_mllm_instruct.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import json
-import os.path
-
-import fire
-from datasets import Dataset, concatenate_datasets, load_dataset, Value, Image, Features, Sequence
-
-"""usage
-python3 scripts/make_mllm_instruct.py \
---json_path data/llava_instruct_example.json \
---image_path data/images \
---output_path data/mllm_example_dataset
-"""
-
-
-def make_one_json(json_path, image_path) -> Dataset:
-    with open(json_path) as f:
-        raw_data_ls = json.loads(f.read())
-    data_ls = []
-    for i, data in enumerate(raw_data_ls):
-        for j, message in enumerate(data['messages']):
-            text = message['content']
-            message['content'] = [{'index': None, 'text': text, 'type': 'text'}]
-            if j == 0:
-                message['content'].append({'index': 0, 'text': None, 'type': 'image'})
-        image = data['image']
-        if image_path:
-            image = os.path.join(image_path, data['image'])
-        data['images'] = [image]
-        del data['image']
-        data_ls.append(data)
-
-    def gen():
-        for data in data_ls:
-            yield data
-
-    features = Features({'messages': [{'content': [
-        {'index': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None),
-         'type': Value(dtype='string', id=None)}], 'role': Value(dtype='string', id=None)}],
-        'images': Sequence(feature=Image(decode=True, id=None), length=-1, id=None)})
-    dataset = Dataset.from_generator(gen, features=features)
-    return dataset
-
-
-yaml_content = """---
-dataset_info:
-  features:
-  - name: messages
-    list:
-    - name: content
-      list:
-      - name: index
-        dtype: int64
-      - name: text
-        dtype: string
-      - name: type
-        dtype: string
-    - name: role
-      dtype: string
-  - name: images
-    sequence: image
-configs:
-- config_name: default
-  data_files:
-  - split: train
-    path: data/train-*
-  - split: test
-    path: data/test-*
----"""
-
-
-def main(
-    json_path: str,
-    image_path: str,
-    output_path: str,
-):
-    json_path_list = json_path.split()
-    dataset_list = []
-    for json_path in json_path_list:
-        dataset = make_one_json(json_path, image_path)
-        dataset_list.append(dataset)
-    dataset = concatenate_datasets(dataset_list)
-    print(dataset[0])
-    data_path = os.path.join(output_path, "data")
-    os.makedirs(os.path.join(data_path), exist_ok=True)
-    parquet_path = os.path.join(data_path, "train-0.parquet")
-    dataset.to_parquet(parquet_path)
-    parquet_path = os.path.join(data_path, "test-0.parquet")
-    dataset.to_parquet(parquet_path)
-    readme_path = os.path.join(output_path, "README.md")
-    with open(readme_path, 'w') as f:
-        f.write(yaml_content)
-
-
-if __name__ == '__main__':
-    fire.Fire(main)

From 8b2a735c14f3193de414f6660af0efd25c1b7bf7 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 21:58:18 +0800
Subject: [PATCH 167/341] modify some style

Former-commit-id: b016e6a671a2f228f0bdd9b8d5995b4669609655
---
 src/llmtuner/data/aligner.py            | 24 ++-----
 src/llmtuner/data/preprocess.py         | 43 +++---------
 src/llmtuner/data/template.py           | 91 ++-----------------------
 src/llmtuner/hparams/finetuning_args.py |  2 +-
 src/llmtuner/model/loader.py            |  4 +-
 src/llmtuner/train/sft/workflow.py      | 20 ++----
 6 files changed, 26 insertions(+), 158 deletions(-)

diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py
index 9d440aff..17b9fc6d 100644
--- a/src/llmtuner/data/aligner.py
+++ b/src/llmtuner/data/aligner.py
@@ -82,10 +82,7 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
                 raise ValueError("Invalid role tag in {}.".format(messages))
 
             aligned_messages.append(
-                {
-                    "role": tag_mapping[message[dataset_attr.role_tag]],
-                    "content": message[dataset_attr.content_tag],
-                }
+                {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
             )
 
         outputs["prompt"].append(aligned_messages[:-1])
@@ -126,10 +123,7 @@ def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -
                 raise ValueError("Invalid role tag in {}.".format(messages))
 
             aligned_messages.append(
-                {
-                    "role": tag_mapping[message[dataset_attr.role_tag]],
-                    "content": message[dataset_attr.content_tag],
-                }
+                {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
             )
 
         outputs["prompt"].append(aligned_messages[:-1])
@@ -143,9 +137,7 @@ def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -
 
 
 def align_dataset(
-    dataset: Union["Dataset", "IterableDataset"],
-    dataset_attr: "DatasetAttr",
-    data_args: "DataArguments",
+    dataset: Union["Dataset", "IterableDataset"], dataset_attr: "DatasetAttr", data_args: "DataArguments"
 ) -> Union["Dataset", "IterableDataset"]:
     r"""
     Aligned dataset:
@@ -165,16 +157,10 @@ def align_dataset(
     features = Features.from_dict(
         {
             "prompt": [
-                {
-                    "role": {"dtype": "string", "_type": "Value"},
-                    "content": {"dtype": "string", "_type": "Value"},
-                }
+                {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}}
             ],
             "response": [
-                {
-                    "role": {"dtype": "string", "_type": "Value"},
-                    "content": {"dtype": "string", "_type": "Value"},
-                }
+                {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}}
             ],
             "system": {"dtype": "string", "_type": "Value"},
             "tools": {"dtype": "string", "_type": "Value"},
diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 1c8c64a6..51af8060 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -14,14 +14,11 @@ if TYPE_CHECKING:
     from ..hparams import DataArguments
     from .template import Template
 
-
 logger = get_logger(__name__)
 
 
 def preprocess_pretrain_dataset(
-    examples: Dict[str, List[Any]],
-    tokenizer: "PreTrainedTokenizer",
-    data_args: "DataArguments",
+    examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
     text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
@@ -56,11 +53,7 @@ def preprocess_supervised_dataset(
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
-    model_inputs = {
-        "input_ids": [],
-        "attention_mask": [],
-        "labels": [],
-    }
+    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
 
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
@@ -154,12 +147,7 @@ def preprocess_multimodal_supervised_dataset(
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
     tokenizer = processor.tokenizer
-    model_inputs = {
-        "input_ids": [],
-        "attention_mask": [],
-        "labels": [],
-        "pixel_values": [],
-    }
+    model_inputs = {"input_ids": [], "attention_mask": [], "labels": [], "pixel_values": []}
 
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
@@ -284,10 +272,7 @@ def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: "
     print("label_ids:\n{}".format(example["labels"]))
     print(
         "labels:\n{}".format(
-            tokenizer.decode(
-                list(filter(lambda x: x != IGNORE_INDEX, example["labels"])),
-                skip_special_tokens=False,
-            )
+            tokenizer.decode(list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), skip_special_tokens=False)
         )
     )
 
@@ -320,33 +305,21 @@ def get_preprocess_and_print_func(
     elif stage == "sft" and not training_args.predict_with_generate:
         if data_args.packing:
             preprocess_func = partial(
-                preprocess_packed_supervised_dataset,
-                tokenizer=tokenizer,
-                template=template,
-                data_args=data_args,
+                preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
             )
         elif processor is not None:
             preprocess_func = partial(
-                preprocess_multimodal_supervised_dataset,
-                processor=processor,
-                template=template,
-                data_args=data_args,
+                preprocess_multimodal_supervised_dataset, processor=processor, template=template, data_args=data_args
             )
         else:
             preprocess_func = partial(
-                preprocess_supervised_dataset,
-                tokenizer=tokenizer,
-                template=template,
-                data_args=data_args,
+                preprocess_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
             )
 
         print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
     elif stage == "rm":
         preprocess_func = partial(
-            preprocess_pairwise_dataset,
-            tokenizer=tokenizer,
-            template=template,
-            data_args=data_args,
+            preprocess_pairwise_dataset, tokenizer=tokenizer, template=template, data_args=data_args
         )
         print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer)
     else:
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index cf21e932..f798ba5a 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -11,7 +11,6 @@ if TYPE_CHECKING:
 
     from .formatter import SLOTS, Formatter
 
-
 logger = get_logger(__name__)
 
 
@@ -368,8 +367,7 @@ def get_template_and_fix_tokenizer(
 
     if stop_words:
         num_added_tokens = tokenizer.add_special_tokens(
-            dict(additional_special_tokens=stop_words),
-            replace_additional_special_tokens=False,
+            dict(additional_special_tokens=stop_words), replace_additional_special_tokens=False
         )
         logger.info("Add {} to stop words.".format(",".join(stop_words)))
         if num_added_tokens > 0:
@@ -393,7 +391,6 @@ _register_template(
     ),
 )
 
-
 _register_template(
     name="aquila",
     format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),
@@ -406,36 +403,26 @@ _register_template(
     efficient_eos=True,
 )
 
-
 _register_template(
     name="atom",
     format_user=StringFormatter(
-        slots=[
-            {"bos_token"},
-            "Human: {{content}}\n",
-            {"eos_token"},
-            {"bos_token"},
-            "Assistant:",
-        ]
+        slots=[{"bos_token"}, "Human: {{content}}\n", {"eos_token"}, {"bos_token"}, "Assistant:"]
     ),
     format_assistant=StringFormatter(slots=["{{content}}\n", {"eos_token"}]),
 )
 
-
 _register_template(
     name="baichuan",
     format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
     efficient_eos=True,
 )
 
-
 _register_template(
     name="baichuan2",
     format_user=StringFormatter(slots=["<reserved_106>{{content}}<reserved_107>"]),
     efficient_eos=True,
 )
 
-
 _register_template(
     name="belle",
     format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),
@@ -444,13 +431,11 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="bluelm",
     format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
 )
 
-
 _register_template(
     name="breeze",
     format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
@@ -462,7 +447,6 @@ _register_template(
     efficient_eos=True,
 )
 
-
 _register_template(
     name="chatglm2",
     format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问：{{content}}\n\n答："]),
@@ -472,7 +456,6 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="chatglm3",
     format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
@@ -480,40 +463,23 @@ _register_template(
     format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
     format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]),
     format_observation=StringFormatter(
-        slots=[
-            {"token": "<|observation|>"},
-            "\n",
-            "{{content}}",
-            {"token": "<|assistant|>"},
-        ]
+        slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
     ),
     stop_words=["<|user|>", "<|observation|>"],
     efficient_eos=True,
     force_system=True,
 )
 
-
 _register_template(
     name="chatglm3_system",
     format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
     format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
     format_system=StringFormatter(
-        slots=[
-            {"token": "[gMASK]"},
-            {"token": "sop"},
-            {"token": "<|system|>"},
-            "\n",
-            "{{content}}",
-        ]
+        slots=[{"token": "[gMASK]"}, {"token": "sop"}, {"token": "<|system|>"}, "\n", "{{content}}"]
     ),
     format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]),
     format_observation=StringFormatter(
-        slots=[
-            {"token": "<|observation|>"},
-            "\n",
-            "{{content}}",
-            {"token": "<|assistant|>"},
-        ]
+        slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
     ),
     default_system=(
         "You are ChatGLM3, a large language model trained by Zhipu.AI. "
@@ -523,7 +489,6 @@ _register_template(
     efficient_eos=True,
 )
 
-
 _register_template(
     name="chatml",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -534,7 +499,6 @@ _register_template(
     replace_eos=True,
 )
 
-
 _register_template(
     name="chatml_de",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -546,14 +510,12 @@ _register_template(
     replace_eos=True,
 )
 
-
 _register_template(
     name="codegeex2",
     format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
     force_system=True,
 )
 
-
 _register_template(
     name="cohere",
     format_user=StringFormatter(
@@ -568,7 +530,6 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="cpm",
     format_user=StringFormatter(slots=["<用户>{{content}}<AI>"]),
@@ -576,7 +537,6 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="dbrx",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -602,7 +562,6 @@ _register_template(
     replace_eos=True,
 )
 
-
 _register_template(
     name="deepseek",
     format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
@@ -610,7 +569,6 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="deepseekcoder",
     format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
@@ -626,7 +584,6 @@ _register_template(
     efficient_eos=True,
 )
 
-
 _register_template(
     name="default",
     format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]),
@@ -634,14 +591,12 @@ _register_template(
     format_separator=EmptyFormatter(slots=["\n"]),
 )
 
-
 _register_template(
     name="empty",
     format_user=StringFormatter(slots=["{{content}}"]),
     format_assistant=StringFormatter(slots=["{{content}}"]),
 )
 
-
 _register_template(
     name="falcon",
     format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
@@ -649,14 +604,12 @@ _register_template(
     efficient_eos=True,
 )
 
-
 _register_template(
     name="fewshot",
     format_separator=EmptyFormatter(slots=["\n\n"]),
     efficient_eos=True,
 )
 
-
 _register_template(
     name="gemma",
     format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
@@ -669,7 +622,6 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="intern",
     format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": "<eoh>"}, "\n<|Bot|>:"]),
@@ -678,7 +630,6 @@ _register_template(
     efficient_eos=True,
 )
 
-
 _register_template(
     name="intern2",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -695,7 +646,6 @@ _register_template(
     efficient_eos=True,  # internlm2 tokenizer cannot set eos_token_id
 )
 
-
 _register_template(
     name="llama2",
     format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
@@ -712,7 +662,6 @@ _register_template(
     ),
 )
 
-
 _register_template(
     name="llama2_zh",
     format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
@@ -720,7 +669,6 @@ _register_template(
     default_system="You are a helpful assistant. 你是一个乐于助人的助手。",
 )
 
-
 _register_template(
     name="llama3",
     format_user=StringFormatter(
@@ -732,10 +680,7 @@ _register_template(
         ]
     ),
     format_system=StringFormatter(
-        slots=[
-            {"bos_token"},
-            "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>",
-        ]
+        slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]
     ),
     format_observation=StringFormatter(
         slots=[
@@ -750,7 +695,6 @@ _register_template(
     replace_eos=True,
 )
 
-
 _register_template(
     name="mistral",
     format_user=StringFormatter(slots=[" [INST] {{content}} [/INST]"]),
@@ -758,7 +702,6 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="olmo",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
@@ -767,22 +710,14 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="openchat",
-    format_user=StringFormatter(
-        slots=[
-            "GPT4 Correct User: {{content}}",
-            {"eos_token"},
-            "GPT4 Correct Assistant:",
-        ]
-    ),
+    format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
     format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
     force_system=True,
 )
 
-
 _register_template(
     name="orion",
     format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
@@ -790,7 +725,6 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="phi",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
@@ -802,7 +736,6 @@ _register_template(
     replace_eos=True,
 )
 
-
 _register_template(
     name="qwen",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -814,7 +747,6 @@ _register_template(
     replace_eos=True,
 )
 
-
 _register_template(
     name="solar",
     format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
@@ -822,7 +754,6 @@ _register_template(
     efficient_eos=True,
 )
 
-
 _register_template(
     name="starchat",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>"]),
@@ -833,7 +764,6 @@ _register_template(
     force_system=True,
 )
 
-
 _register_template(
     name="vicuna",
     format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
@@ -843,7 +773,6 @@ _register_template(
     ),
 )
 
-
 _register_template(
     name="xuanyuan",
     format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
@@ -854,13 +783,11 @@ _register_template(
     ),
 )
 
-
 _register_template(
     name="xverse",
     format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]),
 )
 
-
 _register_template(
     name="yayi",
     format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
@@ -880,7 +807,6 @@ _register_template(
     stop_words=["<|End|>"],
 )
 
-
 _register_template(
     name="yi",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -889,7 +815,6 @@ _register_template(
     replace_eos=True,
 )
 
-
 _register_template(
     name="yuan",
     format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
@@ -898,7 +823,6 @@ _register_template(
     replace_eos=True,
 )
 
-
 _register_template(
     name="zephyr",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]),
@@ -907,7 +831,6 @@ _register_template(
     default_system="You are a friendly chatbot who always responds in the style of a pirate",
 )
 
-
 _register_template(
     name="ziya",
     format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index cb525699..f4f71bc5 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -260,7 +260,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
         default=False,
         metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."},
     )
-    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo", "sft_mm"] = field(
+    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo"] = field(
         default="sft",
         metadata={"help": "Which stage will be performed in training."},
     )
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index dd7eb44c..5b5c0a4d 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -41,9 +41,7 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
     }
 
 
-def load_tokenizer(
-    model_args: "ModelArguments",
-) -> Dict[str, Union["PreTrainedTokenizer", "AutoProcessor"]]:
+def load_tokenizer(model_args: "ModelArguments") -> Dict[str, Union["PreTrainedTokenizer", "AutoProcessor"]]:
     r"""
     Loads pretrained tokenizer.
 
diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py
index 50833a99..205142e5 100644
--- a/src/llmtuner/train/sft/workflow.py
+++ b/src/llmtuner/train/sft/workflow.py
@@ -17,12 +17,7 @@ from .trainer import CustomSeq2SeqTrainer
 if TYPE_CHECKING:
     from transformers import Seq2SeqTrainingArguments, TrainerCallback
 
-    from ...hparams import (
-        DataArguments,
-        FinetuningArguments,
-        GeneratingArguments,
-        ModelArguments,
-    )
+    from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
 
 
 def run_sft(
@@ -36,14 +31,7 @@ def run_sft(
     tokenizer_modules = load_tokenizer(model_args)
     tokenizer = tokenizer_modules["tokenizer"]
     processor = tokenizer_modules["processor"]
-    dataset = get_dataset(
-        tokenizer,
-        model_args,
-        data_args,
-        training_args,
-        stage="sft",
-        processor=processor,
-    )
+    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft", processor=processor)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
     if training_args.predict_with_generate:
@@ -54,7 +42,7 @@ def run_sft(
 
     data_collator = DataCollatorForSeq2Seq(
         tokenizer=tokenizer,
-        pad_to_multiple_of=(8 if tokenizer.padding_side == "right" else None),  # for shift short attention
+        pad_to_multiple_of=8 if tokenizer.padding_side == "right" else None,  # for shift short attention
         label_pad_token_id=(IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id),
     )
 
@@ -72,7 +60,7 @@ def run_sft(
         tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
-        compute_metrics=(ComputeMetrics(tokenizer) if training_args.predict_with_generate else None),
+        compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None,
         **split_dataset(dataset, data_args, training_args),
     )
 

From 514ffafc126bcd0b07978555167fe3eb93090cc2 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 22:04:09 +0800
Subject: [PATCH 168/341] modify some style

Former-commit-id: 053062abc007014a7fde95c5ae9f4d859893d8ad
---
 src/llmtuner/data/preprocess.py    |  5 +--
 src/llmtuner/data/template.py      | 50 ++++++++++++++++++++++++++++--
 src/llmtuner/train/sft/workflow.py |  2 +-
 src/llmtuner/train/tuner.py        |  1 +
 4 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 51af8060..9cdcdfa2 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -324,10 +324,7 @@ def get_preprocess_and_print_func(
         print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer)
     else:
         preprocess_func = partial(
-            preprocess_unsupervised_dataset,
-            tokenizer=tokenizer,
-            template=template,
-            data_args=data_args,
+            preprocess_unsupervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
         )
         print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
 
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index f798ba5a..9a3673c3 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -11,6 +11,7 @@ if TYPE_CHECKING:
 
     from .formatter import SLOTS, Formatter
 
+
 logger = get_logger(__name__)
 
 
@@ -103,9 +104,7 @@ class Template:
         return self._make_pairs(encoded_messages, cutoff_len, reserved_label_len)
 
     def _convert_elements_to_ids(
-        self,
-        tokenizer: "PreTrainedTokenizer",
-        elements: List[Union[str, Dict[str, str]]],
+        self, tokenizer: "PreTrainedTokenizer", elements: List[Union[str, Dict[str, str]]]
     ) -> List[int]:
         r"""
         Converts elements to token ids.
@@ -391,6 +390,7 @@ _register_template(
     ),
 )
 
+
 _register_template(
     name="aquila",
     format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),
@@ -403,6 +403,7 @@ _register_template(
     efficient_eos=True,
 )
 
+
 _register_template(
     name="atom",
     format_user=StringFormatter(
@@ -411,18 +412,21 @@ _register_template(
     format_assistant=StringFormatter(slots=["{{content}}\n", {"eos_token"}]),
 )
 
+
 _register_template(
     name="baichuan",
     format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
     efficient_eos=True,
 )
 
+
 _register_template(
     name="baichuan2",
     format_user=StringFormatter(slots=["<reserved_106>{{content}}<reserved_107>"]),
     efficient_eos=True,
 )
 
+
 _register_template(
     name="belle",
     format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),
@@ -431,11 +435,13 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="bluelm",
     format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
 )
 
+
 _register_template(
     name="breeze",
     format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
@@ -447,6 +453,7 @@ _register_template(
     efficient_eos=True,
 )
 
+
 _register_template(
     name="chatglm2",
     format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问：{{content}}\n\n答："]),
@@ -456,6 +463,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="chatglm3",
     format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
@@ -470,6 +478,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="chatglm3_system",
     format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
@@ -489,6 +498,7 @@ _register_template(
     efficient_eos=True,
 )
 
+
 _register_template(
     name="chatml",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -499,6 +509,7 @@ _register_template(
     replace_eos=True,
 )
 
+
 _register_template(
     name="chatml_de",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -510,12 +521,14 @@ _register_template(
     replace_eos=True,
 )
 
+
 _register_template(
     name="codegeex2",
     format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
     force_system=True,
 )
 
+
 _register_template(
     name="cohere",
     format_user=StringFormatter(
@@ -530,6 +543,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="cpm",
     format_user=StringFormatter(slots=["<用户>{{content}}<AI>"]),
@@ -537,6 +551,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="dbrx",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -562,6 +577,7 @@ _register_template(
     replace_eos=True,
 )
 
+
 _register_template(
     name="deepseek",
     format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
@@ -569,6 +585,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="deepseekcoder",
     format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
@@ -584,6 +601,7 @@ _register_template(
     efficient_eos=True,
 )
 
+
 _register_template(
     name="default",
     format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]),
@@ -591,12 +609,14 @@ _register_template(
     format_separator=EmptyFormatter(slots=["\n"]),
 )
 
+
 _register_template(
     name="empty",
     format_user=StringFormatter(slots=["{{content}}"]),
     format_assistant=StringFormatter(slots=["{{content}}"]),
 )
 
+
 _register_template(
     name="falcon",
     format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
@@ -604,12 +624,14 @@ _register_template(
     efficient_eos=True,
 )
 
+
 _register_template(
     name="fewshot",
     format_separator=EmptyFormatter(slots=["\n\n"]),
     efficient_eos=True,
 )
 
+
 _register_template(
     name="gemma",
     format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
@@ -622,6 +644,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="intern",
     format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": "<eoh>"}, "\n<|Bot|>:"]),
@@ -630,6 +653,7 @@ _register_template(
     efficient_eos=True,
 )
 
+
 _register_template(
     name="intern2",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -646,6 +670,7 @@ _register_template(
     efficient_eos=True,  # internlm2 tokenizer cannot set eos_token_id
 )
 
+
 _register_template(
     name="llama2",
     format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
@@ -662,6 +687,7 @@ _register_template(
     ),
 )
 
+
 _register_template(
     name="llama2_zh",
     format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
@@ -669,6 +695,7 @@ _register_template(
     default_system="You are a helpful assistant. 你是一个乐于助人的助手。",
 )
 
+
 _register_template(
     name="llama3",
     format_user=StringFormatter(
@@ -695,6 +722,7 @@ _register_template(
     replace_eos=True,
 )
 
+
 _register_template(
     name="mistral",
     format_user=StringFormatter(slots=[" [INST] {{content}} [/INST]"]),
@@ -702,6 +730,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="olmo",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
@@ -710,6 +739,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="openchat",
     format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
@@ -718,6 +748,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="orion",
     format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
@@ -725,6 +756,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="phi",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
@@ -736,6 +768,7 @@ _register_template(
     replace_eos=True,
 )
 
+
 _register_template(
     name="qwen",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -747,6 +780,7 @@ _register_template(
     replace_eos=True,
 )
 
+
 _register_template(
     name="solar",
     format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
@@ -754,6 +788,7 @@ _register_template(
     efficient_eos=True,
 )
 
+
 _register_template(
     name="starchat",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>"]),
@@ -764,6 +799,7 @@ _register_template(
     force_system=True,
 )
 
+
 _register_template(
     name="vicuna",
     format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
@@ -773,6 +809,7 @@ _register_template(
     ),
 )
 
+
 _register_template(
     name="xuanyuan",
     format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
@@ -783,11 +820,13 @@ _register_template(
     ),
 )
 
+
 _register_template(
     name="xverse",
     format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]),
 )
 
+
 _register_template(
     name="yayi",
     format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
@@ -807,6 +846,7 @@ _register_template(
     stop_words=["<|End|>"],
 )
 
+
 _register_template(
     name="yi",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
@@ -815,6 +855,7 @@ _register_template(
     replace_eos=True,
 )
 
+
 _register_template(
     name="yuan",
     format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
@@ -823,6 +864,7 @@ _register_template(
     replace_eos=True,
 )
 
+
 _register_template(
     name="zephyr",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]),
@@ -831,12 +873,14 @@ _register_template(
     default_system="You are a friendly chatbot who always responds in the style of a pirate",
 )
 
+
 _register_template(
     name="ziya",
     format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
     format_separator=EmptyFormatter(slots=["\n"]),
 )
 
+
 _register_template(
     name="llava",
     format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT: "]),
diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py
index 205142e5..c5acb4bc 100644
--- a/src/llmtuner/train/sft/workflow.py
+++ b/src/llmtuner/train/sft/workflow.py
@@ -43,7 +43,7 @@ def run_sft(
     data_collator = DataCollatorForSeq2Seq(
         tokenizer=tokenizer,
         pad_to_multiple_of=8 if tokenizer.padding_side == "right" else None,  # for shift short attention
-        label_pad_token_id=(IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id),
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
     )
 
     # Override the decoding parameters of Seq2SeqTrainer
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index e1999946..a8a2b8e9 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -19,6 +19,7 @@ from .sft import run_sft
 if TYPE_CHECKING:
     from transformers import TrainerCallback
 
+
 logger = get_logger(__name__)
 
 
From 759bee48d23f9a1a3549057b64160bfaa2598aa6 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 22:35:17 +0800
Subject: [PATCH 169/341] merge some func

Former-commit-id: 3085107c44715e4b2ca96d73b20d90c172b95219
---
 scripts/test_mllm.py            | 99 ---------------------------------
 src/llmtuner/data/aligner.py    | 51 +----------------
 src/llmtuner/data/preprocess.py | 65 +++-------------------
 3 files changed, 10 insertions(+), 205 deletions(-)
 delete mode 100644 scripts/test_mllm.py

diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py
deleted file mode 100644
index b8fe3e0f..00000000
--- a/scripts/test_mllm.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import os.path
-
-import fire
-import torch
-from datasets import load_dataset
-from peft import PeftModel
-from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
-import shutil
-from PIL import Image
-
-"""usage
-python3 scripts/test_mllm.py \
---base_model_path llava-hf/llava-1.5-7b-hf \
---lora_model_path saves/llava-1.5-7b/lora/sft \
---model_path saves/llava-1.5-7b/lora/merged \
---dataset_name data/llava_instruct_example.json \
---do_merge 1
-"""
-
-
-def get_processor(model_path):
-    processor = AutoProcessor.from_pretrained(model_path)
-    CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {{ message['content'] }} ASSISTANT: {% else %}{{ message['content'] }}{% endif %} {% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""
-    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
-    tokenizer.chat_template = CHAT_TEMPLATE
-    processor.tokenizer = tokenizer
-    return processor
-
-
-def apply_lora(base_model_path, model_path, lora_path):
-    print(f"Loading the base model from {base_model_path}")
-    base_model = AutoModelForVision2Seq.from_pretrained(
-        base_model_path,
-        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True,
-        device_map="cuda",
-    )
-    processor = get_processor(base_model_path)
-    tokenizer = processor.tokenizer
-    print(f"Loading the LoRA adapter from {lora_path}")
-
-    lora_model = PeftModel.from_pretrained(
-        base_model,
-        lora_path,
-        torch_dtype=torch.float16,
-    )
-
-    print("Applying the LoRA")
-    model = lora_model.merge_and_unload()
-
-    print(f"Saving the target model to {model_path}")
-    model.save_pretrained(model_path)
-    tokenizer.save_pretrained(model_path)
-    processor.image_processor.save_pretrained(model_path)
-
-
-def main(
-    model_path: str,
-    dataset_name: str,
-    base_model_path: str = "",
-    lora_model_path: str = "",
-    do_merge: bool = False,
-):
-    if not os.path.exists(model_path) or do_merge:
-        apply_lora(base_model_path, model_path, lora_model_path)
-    model = AutoModelForVision2Seq.from_pretrained(
-        model_path,
-        torch_dtype=torch.bfloat16,
-        low_cpu_mem_usage=True,
-        device_map="cuda",
-    )
-    processor = get_processor(model_path)
-    raw_datasets = load_dataset("json", data_files=dataset_name)
-    train_dataset = raw_datasets["train"]
-    examples = train_dataset.select(range(3))
-    texts = []
-    images = []
-    for example in examples:
-        messages = example["messages"][:1]
-        text = processor.tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=False
-        )
-        texts.append(text)
-        images.append(Image.open(example["images"][0]))
-    batch = processor(text=texts, images=images, return_tensors="pt", padding=True).to(
-        "cuda"
-    )
-    output = model.generate(**batch, max_new_tokens=100)
-    res_list = processor.batch_decode(output, skip_special_tokens=True)
-    for i, prompt in enumerate(texts):
-        res = res_list[i]
-        print(f"#{i}")
-        print(f"prompt:{prompt}")
-        print(f"response:{res[len(prompt):].strip()}")
-        print()
-
-
-if __name__ == "__main__":
-    fire.Fire(main)
diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py
index 17b9fc6d..6fd6f404 100644
--- a/src/llmtuner/data/aligner.py
+++ b/src/llmtuner/data/aligner.py
@@ -36,12 +36,7 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr")
                 {"role": Role.ASSISTANT.value, "content": content} for content in examples[dataset_attr.response][i]
             ]
         elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str):
-            response = [
-                {
-                    "role": Role.ASSISTANT.value,
-                    "content": examples[dataset_attr.response][i],
-                }
-            ]
+            response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}]
         else:
             response = []
 
@@ -54,47 +49,6 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr")
 
 
 def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
-    outputs = {"prompt": [], "response": [], "system": [], "tools": []}
-    tag_mapping = {
-        dataset_attr.user_tag: Role.USER.value,
-        dataset_attr.assistant_tag: Role.ASSISTANT.value,
-        dataset_attr.observation_tag: Role.OBSERVATION.value,
-        dataset_attr.function_tag: Role.FUNCTION.value,
-        dataset_attr.system_tag: Role.SYSTEM.value,
-    }
-    odd_tags = (dataset_attr.user_tag, dataset_attr.observation_tag)
-    even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag)
-    accept_tags = (odd_tags, even_tags)
-    for i, messages in enumerate(examples[dataset_attr.messages]):
-        if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag:
-            system = messages[0][dataset_attr.content_tag]
-            messages = messages[1:]
-        else:
-            system = examples[dataset_attr.system][i] if dataset_attr.system else ""
-
-        messages = messages[: len(messages) // 2 * 2]  # should be multiples of 2
-        if len(messages) == 0:
-            continue
-
-        aligned_messages = []
-        for turn_idx, message in enumerate(messages):
-            if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]:
-                raise ValueError("Invalid role tag in {}.".format(messages))
-
-            aligned_messages.append(
-                {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
-            )
-
-        outputs["prompt"].append(aligned_messages[:-1])
-        outputs["response"].append(aligned_messages[-1:])
-        outputs["system"].append(system)
-        outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
-        outputs["images"].append([])
-
-    return outputs
-
-
-def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
     tag_mapping = {
         dataset_attr.user_tag: Role.USER.value,
@@ -130,7 +84,6 @@ def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -
         outputs["response"].append(aligned_messages[-1:])
         outputs["system"].append(system)
         outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
-        print(examples[dataset_attr.images][i])
         outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else [])
 
     return outputs
@@ -148,8 +101,6 @@ def align_dataset(
     """
     if dataset_attr.formatting == "alpaca":
         convert_func = partial(convert_alpaca, dataset_attr=dataset_attr)
-    elif dataset_attr.formatting == "llava":
-        convert_func = partial(convert_llava, dataset_attr=dataset_attr)
     else:
         convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr)
 
diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 9cdcdfa2..6108b245 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -50,6 +50,7 @@ def preprocess_supervised_dataset(
     tokenizer: "PreTrainedTokenizer",
     template: "Template",
     data_args: "DataArguments",
+    processor: "AutoProcessor" = None,
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
@@ -88,7 +89,9 @@ def preprocess_supervised_dataset(
         model_inputs["input_ids"].append(input_ids)
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
-
+        if processor is not None and "images" in examples:
+            pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0]
+            model_inputs["pixel_values"].append(pixel_values)
     return model_inputs
 
 
@@ -138,55 +141,6 @@ def preprocess_packed_supervised_dataset(
     return model_inputs
 
 
-def preprocess_multimodal_supervised_dataset(
-    examples: Dict[str, List[Any]],
-    processor: "AutoProcessor",
-    template: "Template",
-    data_args: "DataArguments",
-) -> Dict[str, List[List[int]]]:
-    # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
-    # for multiturn examples, we only mask the prompt part in each prompt-response pair.
-    tokenizer = processor.tokenizer
-    model_inputs = {"input_ids": [], "attention_mask": [], "labels": [], "pixel_values": []}
-
-    for i in range(len(examples["prompt"])):
-        if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
-            continue
-
-        messages = examples["prompt"][i] + examples["response"][i]
-        input_ids, labels = [], []
-        for turn_idx, (source_ids, target_ids) in enumerate(
-            template.encode_multiturn(
-                tokenizer,
-                messages,
-                examples["system"][i],
-                examples["tools"][i],
-                data_args.cutoff_len,
-                data_args.reserved_label_len,
-            )
-        ):
-            if data_args.train_on_prompt:
-                source_mask = source_ids
-            elif turn_idx != 0 and template.efficient_eos:
-                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
-            else:
-                source_mask = [IGNORE_INDEX] * len(source_ids)
-
-            input_ids += source_ids + target_ids
-            labels += source_mask + target_ids
-
-        if template.efficient_eos:
-            input_ids += [tokenizer.eos_token_id]
-            labels += [tokenizer.eos_token_id]
-
-        model_inputs["input_ids"].append(input_ids)
-        model_inputs["attention_mask"].append([1] * len(input_ids))
-        model_inputs["labels"].append(labels)
-        pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0]
-        model_inputs["pixel_values"].append(pixel_values)
-    return model_inputs
-
-
 def preprocess_unsupervised_dataset(
     examples: Dict[str, List[Any]],
     tokenizer: "PreTrainedTokenizer",
@@ -307,15 +261,14 @@ def get_preprocess_and_print_func(
             preprocess_func = partial(
                 preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
             )
-        elif processor is not None:
-            preprocess_func = partial(
-                preprocess_multimodal_supervised_dataset, processor=processor, template=template, data_args=data_args
-            )
         else:
             preprocess_func = partial(
-                preprocess_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
+                preprocess_supervised_dataset,
+                tokenizer=tokenizer,
+                template=template,
+                data_args=data_args,
+                processor=processor,
             )
-
         print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
     elif stage == "rm":
         preprocess_func = partial(

From 59817c27e33a10e44197438ff746c18f0c34e0f9 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 22:40:25 +0800
Subject: [PATCH 170/341] modify some style

Former-commit-id: d578a90cefa7ec813355795bdd6ead5ee558ce26
---
 src/llmtuner/data/preprocess.py | 76 ++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 6108b245..3487b761 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -6,7 +6,6 @@ from ..extras.constants import IGNORE_INDEX
 from ..extras.logging import get_logger
 from .utils import Role
 
-
 if TYPE_CHECKING:
     from transformers import Seq2SeqTrainingArguments
     from transformers.tokenization_utils import AutoProcessor, PreTrainedTokenizer
@@ -18,7 +17,7 @@ logger = get_logger(__name__)
 
 
 def preprocess_pretrain_dataset(
-    examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
+        examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
     text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
@@ -35,7 +34,7 @@ def preprocess_pretrain_dataset(
         block_size = data_args.cutoff_len
         total_length = (total_length // block_size) * block_size
         result = {
-            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+            k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
             for k, t in concatenated_examples.items()
         }
         if data_args.template == "gemma":
@@ -46,11 +45,11 @@ def preprocess_pretrain_dataset(
 
 
 def preprocess_supervised_dataset(
-    examples: Dict[str, List[Any]],
-    tokenizer: "PreTrainedTokenizer",
-    template: "Template",
-    data_args: "DataArguments",
-    processor: "AutoProcessor" = None,
+        examples: Dict[str, List[Any]],
+        tokenizer: "PreTrainedTokenizer",
+        template: "Template",
+        data_args: "DataArguments",
+        processor: "AutoProcessor" = None,
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
@@ -63,14 +62,14 @@ def preprocess_supervised_dataset(
         messages = examples["prompt"][i] + examples["response"][i]
         input_ids, labels = [], []
         for turn_idx, (source_ids, target_ids) in enumerate(
-            template.encode_multiturn(
-                tokenizer,
-                messages,
-                examples["system"][i],
-                examples["tools"][i],
-                data_args.cutoff_len,
-                data_args.reserved_label_len,
-            )
+                template.encode_multiturn(
+                    tokenizer,
+                    messages,
+                    examples["system"][i],
+                    examples["tools"][i],
+                    data_args.cutoff_len,
+                    data_args.reserved_label_len,
+                )
         ):
             if data_args.train_on_prompt:
                 source_mask = source_ids
@@ -96,10 +95,10 @@ def preprocess_supervised_dataset(
 
 
 def preprocess_packed_supervised_dataset(
-    examples: Dict[str, List[Any]],
-    tokenizer: "PreTrainedTokenizer",
-    template: "Template",
-    data_args: "DataArguments",
+        examples: Dict[str, List[Any]],
+        tokenizer: "PreTrainedTokenizer",
+        template: "Template",
+        data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X1 Y1 <eos> <bos> X2 Y2 <eos>`
     # and labels with format `<ignore> ... <ignore> Y1 <eos> <ignore> ... <ignore> Y2 <eos>`
@@ -111,7 +110,7 @@ def preprocess_packed_supervised_dataset(
 
         messages = examples["prompt"][i] + examples["response"][i]
         for source_ids, target_ids in template.encode_multiturn(
-            tokenizer, messages, examples["system"][i], examples["tools"][i]
+                tokenizer, messages, examples["system"][i], examples["tools"][i]
         ):
             if data_args.train_on_prompt:
                 source_mask = source_ids
@@ -133,19 +132,19 @@ def preprocess_packed_supervised_dataset(
     total_length = (total_length // block_size) * block_size
     # split by chunks of cutoff_len
     for i in range(0, total_length, block_size):
-        if not all(label == IGNORE_INDEX for label in labels[i : i + block_size]):
-            model_inputs["input_ids"].append(input_ids[i : i + block_size])
+        if not all(label == IGNORE_INDEX for label in labels[i: i + block_size]):
+            model_inputs["input_ids"].append(input_ids[i: i + block_size])
             model_inputs["attention_mask"].append([1] * block_size)
-            model_inputs["labels"].append(labels[i : i + block_size])
+            model_inputs["labels"].append(labels[i: i + block_size])
 
     return model_inputs
 
 
 def preprocess_unsupervised_dataset(
-    examples: Dict[str, List[Any]],
-    tokenizer: "PreTrainedTokenizer",
-    template: "Template",
-    data_args: "DataArguments",
+        examples: Dict[str, List[Any]],
+        tokenizer: "PreTrainedTokenizer",
+        template: "Template",
+        data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X` and labels with format `Y <eos>`
     model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -179,10 +178,10 @@ def preprocess_unsupervised_dataset(
 
 
 def preprocess_pairwise_dataset(
-    examples: Dict[str, List[Any]],
-    tokenizer: "PreTrainedTokenizer",
-    template: "Template",
-    data_args: "DataArguments",
+        examples: Dict[str, List[Any]],
+        tokenizer: "PreTrainedTokenizer",
+        template: "Template",
+        data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>`
     model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []}
@@ -246,12 +245,12 @@ def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer:
 
 
 def get_preprocess_and_print_func(
-    tokenizer: "PreTrainedTokenizer",
-    template: "Template",
-    data_args: "DataArguments",
-    training_args: "Seq2SeqTrainingArguments",
-    stage: Literal["pt", "sft", "rm", "ppo"],
-    processor: Optional["AutoProcessor"] = None,
+        tokenizer: "PreTrainedTokenizer",
+        template: "Template",
+        data_args: "DataArguments",
+        training_args: "Seq2SeqTrainingArguments",
+        stage: Literal["pt", "sft", "rm", "ppo"],
+        processor: Optional["AutoProcessor"] = None,
 ) -> Tuple[Callable, Callable]:
     if stage == "pt":
         preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args)
@@ -280,5 +279,4 @@ def get_preprocess_and_print_func(
             preprocess_unsupervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
         )
         print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
-
     return preprocess_func, print_function

From 5062ee547ea45f30faa3ee60048cbafec4ee458d Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 22:40:53 +0800
Subject: [PATCH 171/341] modify some style

Former-commit-id: 1291c7ee39361dd75247c67f04dcf20b472faf83
---
 src/llmtuner/data/preprocess.py | 75 +++++++++++++++++----------------
 1 file changed, 38 insertions(+), 37 deletions(-)

diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 3487b761..59b49b9d 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -6,6 +6,7 @@ from ..extras.constants import IGNORE_INDEX
 from ..extras.logging import get_logger
 from .utils import Role
 
+
 if TYPE_CHECKING:
     from transformers import Seq2SeqTrainingArguments
     from transformers.tokenization_utils import AutoProcessor, PreTrainedTokenizer
@@ -17,7 +18,7 @@ logger = get_logger(__name__)
 
 
 def preprocess_pretrain_dataset(
-        examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
+    examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
     text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
@@ -34,7 +35,7 @@ def preprocess_pretrain_dataset(
         block_size = data_args.cutoff_len
         total_length = (total_length // block_size) * block_size
         result = {
-            k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
+            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
             for k, t in concatenated_examples.items()
         }
         if data_args.template == "gemma":
@@ -45,11 +46,11 @@ def preprocess_pretrain_dataset(
 
 
 def preprocess_supervised_dataset(
-        examples: Dict[str, List[Any]],
-        tokenizer: "PreTrainedTokenizer",
-        template: "Template",
-        data_args: "DataArguments",
-        processor: "AutoProcessor" = None,
+    examples: Dict[str, List[Any]],
+    tokenizer: "PreTrainedTokenizer",
+    template: "Template",
+    data_args: "DataArguments",
+    processor: "AutoProcessor" = None,
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
@@ -62,14 +63,14 @@ def preprocess_supervised_dataset(
         messages = examples["prompt"][i] + examples["response"][i]
         input_ids, labels = [], []
         for turn_idx, (source_ids, target_ids) in enumerate(
-                template.encode_multiturn(
-                    tokenizer,
-                    messages,
-                    examples["system"][i],
-                    examples["tools"][i],
-                    data_args.cutoff_len,
-                    data_args.reserved_label_len,
-                )
+            template.encode_multiturn(
+                tokenizer,
+                messages,
+                examples["system"][i],
+                examples["tools"][i],
+                data_args.cutoff_len,
+                data_args.reserved_label_len,
+            )
         ):
             if data_args.train_on_prompt:
                 source_mask = source_ids
@@ -95,10 +96,10 @@ def preprocess_supervised_dataset(
 
 
 def preprocess_packed_supervised_dataset(
-        examples: Dict[str, List[Any]],
-        tokenizer: "PreTrainedTokenizer",
-        template: "Template",
-        data_args: "DataArguments",
+    examples: Dict[str, List[Any]],
+    tokenizer: "PreTrainedTokenizer",
+    template: "Template",
+    data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X1 Y1 <eos> <bos> X2 Y2 <eos>`
     # and labels with format `<ignore> ... <ignore> Y1 <eos> <ignore> ... <ignore> Y2 <eos>`
@@ -110,7 +111,7 @@ def preprocess_packed_supervised_dataset(
 
         messages = examples["prompt"][i] + examples["response"][i]
         for source_ids, target_ids in template.encode_multiturn(
-                tokenizer, messages, examples["system"][i], examples["tools"][i]
+            tokenizer, messages, examples["system"][i], examples["tools"][i]
         ):
             if data_args.train_on_prompt:
                 source_mask = source_ids
@@ -132,19 +133,19 @@ def preprocess_packed_supervised_dataset(
     total_length = (total_length // block_size) * block_size
     # split by chunks of cutoff_len
     for i in range(0, total_length, block_size):
-        if not all(label == IGNORE_INDEX for label in labels[i: i + block_size]):
-            model_inputs["input_ids"].append(input_ids[i: i + block_size])
+        if not all(label == IGNORE_INDEX for label in labels[i : i + block_size]):
+            model_inputs["input_ids"].append(input_ids[i : i + block_size])
             model_inputs["attention_mask"].append([1] * block_size)
-            model_inputs["labels"].append(labels[i: i + block_size])
+            model_inputs["labels"].append(labels[i : i + block_size])
 
     return model_inputs
 
 
 def preprocess_unsupervised_dataset(
-        examples: Dict[str, List[Any]],
-        tokenizer: "PreTrainedTokenizer",
-        template: "Template",
-        data_args: "DataArguments",
+    examples: Dict[str, List[Any]],
+    tokenizer: "PreTrainedTokenizer",
+    template: "Template",
+    data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X` and labels with format `Y <eos>`
     model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
@@ -178,10 +179,10 @@ def preprocess_unsupervised_dataset(
 
 
 def preprocess_pairwise_dataset(
-        examples: Dict[str, List[Any]],
-        tokenizer: "PreTrainedTokenizer",
-        template: "Template",
-        data_args: "DataArguments",
+    examples: Dict[str, List[Any]],
+    tokenizer: "PreTrainedTokenizer",
+    template: "Template",
+    data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>`
     model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []}
@@ -245,12 +246,12 @@ def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer:
 
 
 def get_preprocess_and_print_func(
-        tokenizer: "PreTrainedTokenizer",
-        template: "Template",
-        data_args: "DataArguments",
-        training_args: "Seq2SeqTrainingArguments",
-        stage: Literal["pt", "sft", "rm", "ppo"],
-        processor: Optional["AutoProcessor"] = None,
+    tokenizer: "PreTrainedTokenizer",
+    template: "Template",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    stage: Literal["pt", "sft", "rm", "ppo"],
+    processor: Optional["AutoProcessor"] = None,
 ) -> Tuple[Callable, Callable]:
     if stage == "pt":
         preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args)

From 5d03ac642d9498870974d5cab9c24319bf7eb3e7 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 25 Apr 2024 22:59:46 +0800
Subject: [PATCH 172/341] modify some bug

Former-commit-id: 593b7b004df74bd24361c9883401a656c08fb589
---
 src/llmtuner/data/preprocess.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 59b49b9d..be566a5b 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -91,6 +91,8 @@ def preprocess_supervised_dataset(
         model_inputs["labels"].append(labels)
         if processor is not None and "images" in examples:
             pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0]
+            if "pixel_values" not in model_inputs:
+                model_inputs["pixel_values"] = []
             model_inputs["pixel_values"].append(pixel_values)
     return model_inputs
 

From 13117b69d7e566c61ce277d949cd8c05c7258aa5 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 02:20:47 +0800
Subject: [PATCH 173/341] delete llava template (use vicuna)

Former-commit-id: 420e64970e5a0e45453041927e0366ee8beb73d5
---
 src/llmtuner/data/template.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 9a3673c3..73b22eb7 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -879,14 +879,3 @@ _register_template(
     format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
     format_separator=EmptyFormatter(slots=["\n"]),
 )
-
-
-_register_template(
-    name="llava",
-    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT: "]),
-    format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]),
-    default_system=(
-        "A chat between a curious user and an artificial intelligence assistant. "
-        "The assistant gives helpful, detailed, and polite answers to the user's questions."
-    ),
-)

From 279439abbeeea3891ae80f719ab86b440e22cc8c Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 02:49:39 +0800
Subject: [PATCH 174/341] update hparam name

Former-commit-id: 9941adfbf06db37f8ba32c4555f6e58e27188aaf
---
 src/llmtuner/hparams/model_args.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index 97b908e4..be65cd27 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -81,6 +81,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."},
     )
+    visual_inputs: bool = field(
+        default=False,
+        metadata={"help": "Whethor or not to use multimodal LLM that accepts visual inputs."},
+    )
     moe_aux_loss_coef: Optional[float] = field(
         default=None,
         metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."},
@@ -169,10 +173,6 @@ class ModelArguments:
         default=False,
         metadata={"help": "For debugging purposes, print the status of the parameters in the model."},
     )
-    use_mllm: bool = field(
-        default=False,
-        metadata={"help": "Whether use Multimodal LLM."},
-    )
 
     def __post_init__(self):
         self.compute_dtype = None

From a6f6b406b3856544a3fb8dcfef528b092e6bb967 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 03:22:40 +0800
Subject: [PATCH 175/341] Update loader.py

Former-commit-id: 72d4817a15f6916706828ea2a61d808183c23773
---
 src/llmtuner/model/loader.py | 54 ++++++++++++++----------------------
 1 file changed, 21 insertions(+), 33 deletions(-)

diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 5b5c0a4d..0ff7a350 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -1,12 +1,6 @@
-from typing import TYPE_CHECKING, Any, Dict, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional, TypedDict
 
-from transformers import (
-    AutoConfig,
-    AutoModelForCausalLM,
-    AutoModelForVision2Seq,
-    AutoProcessor,
-    AutoTokenizer,
-)
+from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer
 from trl import AutoModelForCausalLMWithValueHead
 
 from ..extras.logging import get_logger
@@ -19,13 +13,19 @@ from .utils.unsloth import load_unsloth_pretrained_model
 
 
 if TYPE_CHECKING:
-    from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
+    from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
 
     from ..hparams import FinetuningArguments, ModelArguments
 
+
 logger = get_logger(__name__)
 
 
+class TokenizerModule(TypedDict):
+    tokenizer: "PreTrainedTokenizer"
+    processor: Optional["ProcessorMixin"]
+
+
 def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
     r"""
     Gets arguments to load config/tokenizer/model.
@@ -41,7 +41,7 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
     }
 
 
-def load_tokenizer(model_args: "ModelArguments") -> Dict[str, Union["PreTrainedTokenizer", "AutoProcessor"]]:
+def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule":
     r"""
     Loads pretrained tokenizer.
 
@@ -75,25 +75,14 @@ def load_tokenizer(model_args: "ModelArguments") -> Dict[str, Union["PreTrainedT
             logger.warning("New tokens have been added, changed `resize_vocab` to True.")
 
     patch_tokenizer(tokenizer)
-    tokenizer_modules = {"tokenizer": tokenizer, "processor": None}
-    if model_args.use_mllm:
-        try:
-            processor = AutoProcessor.from_pretrained(
-                model_args.model_name_or_path,
-                use_fast=model_args.use_fast_tokenizer,
-                split_special_tokens=model_args.split_special_tokens,
-                padding_side="right",
-                **init_kwargs,
-            )
-        except Exception:  # try the fast one
-            processor = AutoProcessor.from_pretrained(
-                model_args.model_name_or_path,
-                use_fast=True,
-                padding_side="right",
-                **init_kwargs,
-            )
-        tokenizer_modules["processor"] = processor
-    return tokenizer_modules
+
+    if model_args.visual_inputs:
+        processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs)
+        setattr(processor, "tokenizer", tokenizer)
+    else:
+        processor = None
+
+    return {"tokenizer": tokenizer, "processor": processor}
 
 
 def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
@@ -132,11 +121,10 @@ def load_model(
 
         if model_args.mixture_of_depths == "load":
             model = load_mod_pretrained_model(**init_kwargs)
+        elif model_args.visual_inputs:
+            model = AutoModelForVision2Seq.from_pretrained(**init_kwargs)
         else:
-            if model_args.use_mllm:
-                model = AutoModelForVision2Seq.from_pretrained(**init_kwargs)
-            else:
-                model = AutoModelForCausalLM.from_pretrained(**init_kwargs)
+            model = AutoModelForCausalLM.from_pretrained(**init_kwargs)
 
         if model_args.mixture_of_depths == "convert":
             model = convert_pretrained_model_to_mod(model, config, model_args)

From 2eede9ffd643d6475bd3ec742d1fe8aeb46e0938 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 03:29:12 +0800
Subject: [PATCH 176/341] Update workflow.py

Former-commit-id: 5b8b5b975716d539ae2fae8536f79e106aa0b566
---
 src/llmtuner/train/sft/workflow.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py
index c5acb4bc..3ead9edf 100644
--- a/src/llmtuner/train/sft/workflow.py
+++ b/src/llmtuner/train/sft/workflow.py
@@ -28,11 +28,10 @@ def run_sft(
     generating_args: "GeneratingArguments",
     callbacks: Optional[List["TrainerCallback"]] = None,
 ):
-    tokenizer_modules = load_tokenizer(model_args)
-    tokenizer = tokenizer_modules["tokenizer"]
-    processor = tokenizer_modules["processor"]
-    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft", processor=processor)
-    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+    tokenizer_module = load_tokenizer(model_args)
+    dataset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
+    tokenizer = tokenizer_module["tokenizer"]
+    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=training_args.do_train)
 
     if training_args.predict_with_generate:
         tokenizer.padding_side = "left"  # use left-padding in generation
@@ -49,8 +48,7 @@ def run_sft(
     # Override the decoding parameters of Seq2SeqTrainer
     training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
     training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
-    if model_args.use_mllm:
-        training_args.remove_unused_columns = False
+    training_args.remove_unused_columns = False if model_args.visual_inputs else training_args.remove_unused_columns
 
     # Initialize our Trainer
     trainer = CustomSeq2SeqTrainer(

From e1838e76fe9f2f630b4899ee818c1ef5ee219bb0 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 03:33:07 +0800
Subject: [PATCH 177/341] Update loader.py

Former-commit-id: 6a5f2e2ab7304113ff71cb77aafff6a1f74831f8
---
 src/llmtuner/data/loader.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index fa4aa9c1..ca0d5407 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -16,12 +16,13 @@ from .utils import checksum, merge_dataset
 
 if TYPE_CHECKING:
     from datasets import Dataset, IterableDataset
-    from transformers import AutoProcessor, Seq2SeqTrainingArguments
+    from transformers import ProcessorMixin, Seq2SeqTrainingArguments
     from transformers.tokenization_utils import PreTrainedTokenizer
 
     from ..hparams import DataArguments, ModelArguments
     from .parser import DatasetAttr
 
+
 logger = get_logger(__name__)
 
 
@@ -114,12 +115,12 @@ def load_single_dataset(
 
 
 def get_dataset(
-    tokenizer: "PreTrainedTokenizer",
     model_args: "ModelArguments",
     data_args: "DataArguments",
     training_args: "Seq2SeqTrainingArguments",
     stage: Literal["pt", "sft", "rm", "ppo"],
-    processor: Optional["AutoProcessor"] = None,
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"] = None,
 ) -> Union["Dataset", "IterableDataset"]:
     template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
     if data_args.train_on_prompt and template.efficient_eos:
@@ -149,7 +150,7 @@ def get_dataset(
 
     with training_args.main_process_first(desc="pre-process dataset"):
         preprocess_func, print_function = get_preprocess_and_print_func(
-            tokenizer, template, data_args, training_args, stage, processor
+            data_args, training_args, stage, template, tokenizer, processor
         )
         column_names = list(next(iter(dataset)).keys())
         kwargs = {}

From ece67f8c7ff7f5e4eab2e5c4b4f269bdd61f1678 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 03:35:39 +0800
Subject: [PATCH 178/341] Update parser.py

Former-commit-id: 4df75e8a9a391565cc3eec69bc0ebf5d5192de61
---
 src/llmtuner/data/parser.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py
index 4d3d7741..01a417a9 100644
--- a/src/llmtuner/data/parser.py
+++ b/src/llmtuner/data/parser.py
@@ -25,9 +25,10 @@ class DatasetAttr:
     subset: Optional[str] = None
     folder: Optional[str] = None
     ranking: bool = False
-    formatting: Literal["alpaca", "sharegpt", "llava"] = "alpaca"
+    formatting: Literal["alpaca", "sharegpt"] = "alpaca"
     """ columns """
     system: Optional[str] = None
+    images: Optional[str] = None
     """ columns for the alpaca format """
     prompt: Optional[str] = "instruction"
     query: Optional[str] = "input"
@@ -44,8 +45,6 @@ class DatasetAttr:
     observation_tag: Optional[str] = "observation"
     function_tag: Optional[str] = "function_call"
     system_tag: Optional[str] = "system"
-    """ columns for the mllm format """
-    images: Optional[str] = None
 
     def __repr__(self) -> str:
         return self.dataset_name
@@ -105,21 +104,18 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
         dataset_attr.set_attr("folder", dataset_info[name])
         dataset_attr.set_attr("ranking", dataset_info[name], default=False)
         dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca")
-        dataset_attr.set_attr("images", dataset_info[name], default="")
 
         if "columns" in dataset_info[name]:
-            column_names = ["system"]
+            column_names = ["system", "images"]
             if dataset_attr.formatting == "alpaca":
                 column_names.extend(["prompt", "query", "response", "history"])
-            elif dataset_attr.formatting == "llava":
-                column_names.extend(["messages", "images"])
             else:
                 column_names.extend(["messages", "tools"])
 
             for column_name in column_names:
                 dataset_attr.set_attr(column_name, dataset_info[name]["columns"])
 
-        if dataset_attr.formatting != "alpaca" and "tags" in dataset_info[name]:
+        if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]:
             tag_names = (
                 "role_tag",
                 "content_tag",

From c37582af02f0e598545f18bf2659aabe996740f5 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 03:48:34 +0800
Subject: [PATCH 179/341] Update aligner.py

Former-commit-id: 855489074c469f47572153df0fa1e251b187b232
---
 src/llmtuner/data/aligner.py | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py
index 6fd6f404..dc1de865 100644
--- a/src/llmtuner/data/aligner.py
+++ b/src/llmtuner/data/aligner.py
@@ -1,3 +1,4 @@
+import os
 from functools import partial
 from typing import TYPE_CHECKING, Any, Dict, List, Union
 
@@ -13,8 +14,10 @@ if TYPE_CHECKING:
     from .parser import DatasetAttr
 
 
-def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
-    outputs = {"prompt": [], "response": [], "system": [], "tools": []}
+def convert_alpaca(
+    examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
+) -> Dict[str, List[Any]]:
+    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
     for i in range(len(examples[dataset_attr.prompt])):
         prompt = []
         if dataset_attr.history and isinstance(examples[dataset_attr.history][i], list):
@@ -44,11 +47,18 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr")
         outputs["response"].append(response)
         outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
         outputs["tools"].append("")
-        outputs["images"].append([])
+        outputs["images"].append(
+            [os.path.join(data_args.dataset_dir, path) for path in examples[dataset_attr.images][i]]
+            if dataset_attr.images
+            else []
+        )
+
     return outputs
 
 
-def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
+def convert_sharegpt(
+    examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
+) -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
     tag_mapping = {
         dataset_attr.user_tag: Role.USER.value,
@@ -84,7 +94,11 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
         outputs["response"].append(aligned_messages[-1:])
         outputs["system"].append(system)
         outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
-        outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else [])
+        outputs["images"].append(
+            [os.path.join(data_args.dataset_dir, path) for path in examples[dataset_attr.images][i]]
+            if dataset_attr.images
+            else []
+        )
 
     return outputs
 
@@ -97,12 +111,13 @@ def align_dataset(
         prompt: [{"role": "user", "content": "..."}] * (2T - 1)
         response: [{"role": "assistant", "content": "..."}] * N (N > 1 for ranking dataset)
         system: "..."
-        tools: "..."
+        tools: "...",
+        images: [],
     """
     if dataset_attr.formatting == "alpaca":
-        convert_func = partial(convert_alpaca, dataset_attr=dataset_attr)
+        convert_func = partial(convert_alpaca, dataset_attr=dataset_attr, data_args=data_args)
     else:
-        convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr)
+        convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr, data_args=data_args)
 
     column_names = list(next(iter(dataset)).keys())
     features = Features.from_dict(
@@ -115,7 +130,7 @@ def align_dataset(
             ],
             "system": {"dtype": "string", "_type": "Value"},
             "tools": {"dtype": "string", "_type": "Value"},
-            "images": {"feature": {"_type": "Image"}, "_type": "Sequence"},
+            "images": [{"_type": "Image"}],
         }
     )
     kwargs = {}

From f9a7732a1fe3dd06c3cbd6fc6b1e1be466ca2b30 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 04:10:28 +0800
Subject: [PATCH 180/341] Update preprocess.py

Former-commit-id: 0e376eab23d38b8fca05f054f3cde308756ee3b1
---
 src/llmtuner/data/preprocess.py | 66 +++++++++++++++++++++++----------
 1 file changed, 47 insertions(+), 19 deletions(-)

diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index be566a5b..0b467724 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -8,15 +8,26 @@ from .utils import Role
 
 
 if TYPE_CHECKING:
-    from transformers import Seq2SeqTrainingArguments
-    from transformers.tokenization_utils import AutoProcessor, PreTrainedTokenizer
+    from PIL import Image
+    from transformers import ProcessorMixin, Seq2SeqTrainingArguments
+    from transformers.image_processing_utils import BaseImageProcessor
+    from transformers.tokenization_utils import PreTrainedTokenizer
 
     from ..hparams import DataArguments
     from .template import Template
 
+
 logger = get_logger(__name__)
 
 
+def _preprocess_visual_inputs(model_inputs: Dict[str, Any], processor: "ProcessorMixin", image: "Image") -> None:
+    image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
+    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"][0]
+    if "pixel_values" not in model_inputs:
+        model_inputs["pixel_values"] = []
+    model_inputs["pixel_values"].append(pixel_values)
+
+
 def preprocess_pretrain_dataset(
     examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
 ) -> Dict[str, List[List[int]]]:
@@ -47,10 +58,10 @@ def preprocess_pretrain_dataset(
 
 def preprocess_supervised_dataset(
     examples: Dict[str, List[Any]],
-    tokenizer: "PreTrainedTokenizer",
     template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"],
     data_args: "DataArguments",
-    processor: "AutoProcessor" = None,
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
@@ -90,17 +101,15 @@ def preprocess_supervised_dataset(
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
         if processor is not None and "images" in examples:
-            pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0]
-            if "pixel_values" not in model_inputs:
-                model_inputs["pixel_values"] = []
-            model_inputs["pixel_values"].append(pixel_values)
+            _preprocess_visual_inputs(model_inputs, processor, examples["images"][i][0])
+
     return model_inputs
 
 
 def preprocess_packed_supervised_dataset(
     examples: Dict[str, List[Any]],
-    tokenizer: "PreTrainedTokenizer",
     template: "Template",
+    tokenizer: "PreTrainedTokenizer",
     data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X1 Y1 <eos> <bos> X2 Y2 <eos>`
@@ -145,8 +154,9 @@ def preprocess_packed_supervised_dataset(
 
 def preprocess_unsupervised_dataset(
     examples: Dict[str, List[Any]],
-    tokenizer: "PreTrainedTokenizer",
     template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"],
     data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X` and labels with format `Y <eos>`
@@ -176,14 +186,17 @@ def preprocess_unsupervised_dataset(
         model_inputs["input_ids"].append(input_ids)
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
+        if processor is not None and "images" in examples:
+            _preprocess_visual_inputs(model_inputs, processor, examples["images"][i][0])
 
     return model_inputs
 
 
 def preprocess_pairwise_dataset(
     examples: Dict[str, List[Any]],
-    tokenizer: "PreTrainedTokenizer",
     template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"],
     data_args: "DataArguments",
 ) -> Dict[str, List[List[int]]]:
     # build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>`
@@ -218,6 +231,8 @@ def preprocess_pairwise_dataset(
         model_inputs["prompt_ids"].append(prompt_ids)
         model_inputs["chosen_ids"].append(chosen_ids)
         model_inputs["rejected_ids"].append(rejected_ids)
+        if processor is not None and "images" in examples:
+            _preprocess_visual_inputs(model_inputs, processor, examples["images"][i][0])
 
     return model_inputs
 
@@ -248,12 +263,12 @@ def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer:
 
 
 def get_preprocess_and_print_func(
-    tokenizer: "PreTrainedTokenizer",
-    template: "Template",
     data_args: "DataArguments",
     training_args: "Seq2SeqTrainingArguments",
     stage: Literal["pt", "sft", "rm", "ppo"],
-    processor: Optional["AutoProcessor"] = None,
+    template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"],
 ) -> Tuple[Callable, Callable]:
     if stage == "pt":
         preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args)
@@ -261,25 +276,38 @@ def get_preprocess_and_print_func(
     elif stage == "sft" and not training_args.predict_with_generate:
         if data_args.packing:
             preprocess_func = partial(
-                preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
+                preprocess_packed_supervised_dataset,
+                template=template,
+                tokenizer=tokenizer,
+                data_args=data_args,
             )
         else:
             preprocess_func = partial(
                 preprocess_supervised_dataset,
-                tokenizer=tokenizer,
                 template=template,
-                data_args=data_args,
+                tokenizer=tokenizer,
                 processor=processor,
+                data_args=data_args,
             )
+
         print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
     elif stage == "rm":
         preprocess_func = partial(
-            preprocess_pairwise_dataset, tokenizer=tokenizer, template=template, data_args=data_args
+            preprocess_pairwise_dataset,
+            template=template,
+            tokenizer=tokenizer,
+            processor=processor,
+            data_args=data_args,
         )
         print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer)
     else:
         preprocess_func = partial(
-            preprocess_unsupervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
+            preprocess_unsupervised_dataset,
+            template=template,
+            tokenizer=tokenizer,
+            processor=processor,
+            data_args=data_args,
         )
         print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
+
     return preprocess_func, print_function

From 23b881bff178a4ab4638b6de2e8d48c6042067f2 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 05:34:58 +0800
Subject: [PATCH 181/341] support mllm hf inference

Former-commit-id: 2c7c01282acd7ddabbb17ce3246b8dae4bc4b8cf
---
 data/README.md                                |  3 +-
 data/README_zh.md                             |  3 +-
 examples/README.md                            |  1 +
 examples/README_zh.md                         |  1 +
 .../sft_mllm.sh}                              | 25 ++++++++-------
 src/llmtuner/chat/base_engine.py              |  3 ++
 src/llmtuner/chat/chat_model.py               | 14 +++++---
 src/llmtuner/chat/hf_engine.py                | 32 ++++++++++++++++---
 src/llmtuner/chat/vllm_engine.py              | 14 ++++++--
 src/llmtuner/data/preprocess.py               |  8 +++--
 src/llmtuner/eval/evaluator.py                |  2 +-
 src/llmtuner/hparams/parser.py                |  3 ++
 src/llmtuner/train/dpo/workflow.py            |  5 +--
 src/llmtuner/train/orpo/workflow.py           |  5 +--
 src/llmtuner/train/ppo/workflow.py            |  5 +--
 src/llmtuner/train/pt/workflow.py             |  5 +--
 src/llmtuner/train/rm/workflow.py             |  5 +--
 src/llmtuner/train/sft/workflow.py            |  4 +--
 src/llmtuner/train/tuner.py                   |  2 +-
 src/llmtuner/train/utils.py                   |  6 ++--
 src/llmtuner/webui/chatter.py                 |  5 ++-
 src/llmtuner/webui/components/chatbot.py      | 15 ++++++---
 src/llmtuner/webui/locales.py                 | 11 +++++++
 23 files changed, 128 insertions(+), 49 deletions(-)
 rename examples/{mllm/sft_llava.sh => lora_single_gpu/sft_mllm.sh} (56%)

diff --git a/data/README.md b/data/README.md
index 2ea0c117..6de0430f 100644
--- a/data/README.md
+++ b/data/README.md
@@ -18,7 +18,8 @@ If you are using a custom dataset, please provide your dataset definition in the
     "history": "the column name in the dataset containing the histories. (default: None)",
     "messages": "the column name in the dataset containing the messages. (default: conversations)",
     "system": "the column name in the dataset containing the system prompts. (default: None)",
-    "tools": "the column name in the dataset containing the tool description. (default: None)"
+    "tools": "the column name in the dataset containing the tool description. (default: None)",
+    "images": "the column name in the dataset containing the image inputs. (default: None)"
   },
   "tags (optional, used for the sharegpt format)": {
     "role_tag": "the key in the message represents the identity. (default: from)",
diff --git a/data/README_zh.md b/data/README_zh.md
index b00f81d9..fb6cb1d9 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -18,7 +18,8 @@
     "history": "数据集代表历史对话的表头名称（默认：None）",
     "messages": "数据集代表消息列表的表头名称（默认：conversations）",
     "system": "数据集代表系统提示的表头名称（默认：None）",
-    "tools": "数据集代表工具描述的表头名称（默认：None）"
+    "tools": "数据集代表工具描述的表头名称（默认：None）",
+    "images": "数据集代表图像输入的表头名称（默认：None）"
   },
   "tags（可选，用于 sharegpt 格式）": {
     "role_tag": "消息中代表发送者身份的键名（默认：from）",
diff --git a/examples/README.md b/examples/README.md
index cc01cf9f..895e9c72 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,6 +9,7 @@ examples/
 │   ├── ppo.sh: Do PPO training using LoRA
 │   ├── dpo.sh: Do DPO training using LoRA
 │   ├── orpo.sh: Do ORPO training using LoRA
+│   ├── sft_mllm.sh: Do supervised fine-tuning on multimodal data using LoRA
 │   ├── prepare.sh: Save tokenized dataset
 │   └── predict.sh: Do batch predict and compute BLEU and ROUGE scores after LoRA tuning
 ├── qlora_single_gpu/
diff --git a/examples/README_zh.md b/examples/README_zh.md
index fecbdb2f..091a877f 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -9,6 +9,7 @@ examples/
 │   ├── ppo.sh: 基于 LoRA 进行 PPO 训练
 │   ├── dpo.sh: 基于 LoRA 进行 DPO 训练
 │   ├── orpo.sh: 基于 LoRA 进行 ORPO 训练
+│   ├── sft_mllm.sh: 基于 LoRA 进行多模态指令监督微调
 │   ├── prepare.sh: 保存预处理后的数据集
 │   └── predict.sh: 基于 LoRA 进行批量预测并计算 BLEU 和 ROUGE 分数
 ├── qlora_single_gpu/
diff --git a/examples/mllm/sft_llava.sh b/examples/lora_single_gpu/sft_mllm.sh
similarity index 56%
rename from examples/mllm/sft_llava.sh
rename to examples/lora_single_gpu/sft_mllm.sh
index c1fce693..7e900918 100644
--- a/examples/mllm/sft_llava.sh
+++ b/examples/lora_single_gpu/sft_mllm.sh
@@ -1,32 +1,33 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
-    --stage sft_mm \
+CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+    --stage sft \
     --do_train \
     --model_name_or_path llava-hf/llava-1.5-7b-hf \
-    --dataset mllm_instruct_example \
-    --dataset_dir data \
-    --template default \
+    --visual_inputs \
+    --dataset mllm_demo \
+    --dataset_dir ../../data \
+    --template vicuna \
     --finetuning_type lora \
-    --lora_target all \
-    --output_dir saves/llava-1.5-7b/lora/sft \
+    --lora_target q_proj,v_proj \
+    --output_dir ../../saves/LLaMA2-7B/lora/sft_mllm \
     --overwrite_cache \
     --overwrite_output_dir \
     --cutoff_len 1024 \
     --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 3 \
+    --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
+    --gradient_accumulation_steps 8 \
     --lr_scheduler_type cosine \
-    --logging_steps 1 \
+    --logging_steps 10 \
     --warmup_steps 20 \
     --save_steps 100 \
     --eval_steps 100 \
     --evaluation_strategy steps \
     --load_best_model_at_end \
     --learning_rate 5e-5 \
-    --num_train_epochs 100 \
+    --num_train_epochs 100.0 \
     --max_samples 3000 \
     --val_size 0.1 \
     --plot_loss \
-    --bf16
\ No newline at end of file
+    --fp16
diff --git a/src/llmtuner/chat/base_engine.py b/src/llmtuner/chat/base_engine.py
index e19db676..65b6c59c 100644
--- a/src/llmtuner/chat/base_engine.py
+++ b/src/llmtuner/chat/base_engine.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Literal, Opti
 
 
 if TYPE_CHECKING:
+    from numpy.typing import NDArray
     from transformers import PreTrainedModel, PreTrainedTokenizer
     from vllm import AsyncLLMEngine
 
@@ -46,6 +47,7 @@ class BaseEngine(ABC):
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> List["Response"]: ...
 
@@ -55,6 +57,7 @@ class BaseEngine(ABC):
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> AsyncGenerator[str, None]: ...
 
diff --git a/src/llmtuner/chat/chat_model.py b/src/llmtuner/chat/chat_model.py
index c49d4d78..ba58dd2e 100644
--- a/src/llmtuner/chat/chat_model.py
+++ b/src/llmtuner/chat/chat_model.py
@@ -8,6 +8,8 @@ from .vllm_engine import VllmEngine
 
 
 if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
     from .base_engine import BaseEngine, Response
 
 
@@ -36,9 +38,10 @@ class ChatModel:
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> List["Response"]:
-        task = asyncio.run_coroutine_threadsafe(self.achat(messages, system, tools, **input_kwargs), self._loop)
+        task = asyncio.run_coroutine_threadsafe(self.achat(messages, system, tools, image, **input_kwargs), self._loop)
         return task.result()
 
     async def achat(
@@ -46,18 +49,20 @@ class ChatModel:
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> List["Response"]:
-        return await self.engine.chat(messages, system, tools, **input_kwargs)
+        return await self.engine.chat(messages, system, tools, image, **input_kwargs)
 
     def stream_chat(
         self,
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> Generator[str, None, None]:
-        generator = self.astream_chat(messages, system, tools, **input_kwargs)
+        generator = self.astream_chat(messages, system, tools, image, **input_kwargs)
         while True:
             try:
                 task = asyncio.run_coroutine_threadsafe(generator.__anext__(), self._loop)
@@ -70,9 +75,10 @@ class ChatModel:
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> AsyncGenerator[str, None]:
-        async for new_token in self.engine.stream_chat(messages, system, tools, **input_kwargs):
+        async for new_token in self.engine.stream_chat(messages, system, tools, image, **input_kwargs):
             yield new_token
 
     def get_scores(
diff --git a/src/llmtuner/chat/hf_engine.py b/src/llmtuner/chat/hf_engine.py
index ddb48e47..f6f51898 100644
--- a/src/llmtuner/chat/hf_engine.py
+++ b/src/llmtuner/chat/hf_engine.py
@@ -14,7 +14,9 @@ from .base_engine import BaseEngine, Response
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel, PreTrainedTokenizer
+    from numpy.typing import NDArray
+    from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
+    from transformers.image_processing_utils import BaseImageProcessor
     from trl import PreTrainedModelWrapper
 
     from ..data import Template
@@ -30,7 +32,9 @@ class HuggingfaceEngine(BaseEngine):
         generating_args: "GeneratingArguments",
     ) -> None:
         self.can_generate = finetuning_args.stage == "sft"
-        self.tokenizer = load_tokenizer(model_args)
+        tokenizer_module = load_tokenizer(model_args)
+        self.tokenizer = tokenizer_module["tokenizer"]
+        self.processor = tokenizer_module["processor"]
         self.tokenizer.padding_side = "left" if self.can_generate else "right"
         self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)
         self.model = load_model(
@@ -42,13 +46,18 @@ class HuggingfaceEngine(BaseEngine):
     def _process_args(
         model: "PreTrainedModel",
         tokenizer: "PreTrainedTokenizer",
+        processor: Optional["ProcessorMixin"],
         template: "Template",
         generating_args: Dict[str, Any],
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         input_kwargs: Optional[Dict[str, Any]] = {},
     ) -> Tuple[Dict[str, Any], int]:
+        if processor is not None and image is not None and "<image>" not in messages[0]["content"]:
+            messages[0]["content"] = messages[0]["content"] + "<image>"
+
         paired_messages = messages + [{"role": "assistant", "content": ""}]
         prompt_ids, _ = template.encode_oneturn(
             tokenizer=tokenizer, messages=paired_messages, system=system, tools=tools
@@ -95,6 +104,11 @@ class HuggingfaceEngine(BaseEngine):
             logits_processor=get_logits_processor(),
         )
 
+        if processor is not None and image is not None:
+            image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
+            pixel_values: "torch.Tensor" = image_processor(image, return_tensors="pt")["pixel_values"]
+            gen_kwargs["pixel_values"] = pixel_values.to(model.device)
+
         return gen_kwargs, prompt_length
 
     @staticmethod
@@ -102,15 +116,17 @@ class HuggingfaceEngine(BaseEngine):
     def _chat(
         model: "PreTrainedModel",
         tokenizer: "PreTrainedTokenizer",
+        processor: Optional["ProcessorMixin"],
         template: "Template",
         generating_args: Dict[str, Any],
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         input_kwargs: Optional[Dict[str, Any]] = {},
     ) -> List["Response"]:
         gen_kwargs, prompt_length = HuggingfaceEngine._process_args(
-            model, tokenizer, template, generating_args, messages, system, tools, input_kwargs
+            model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs
         )
         generate_output = model.generate(**gen_kwargs)
         response_ids = generate_output[:, prompt_length:]
@@ -135,15 +151,17 @@ class HuggingfaceEngine(BaseEngine):
     def _stream_chat(
         model: "PreTrainedModel",
         tokenizer: "PreTrainedTokenizer",
+        processor: Optional["ProcessorMixin"],
         template: "Template",
         generating_args: Dict[str, Any],
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         input_kwargs: Optional[Dict[str, Any]] = {},
     ) -> Callable[[], str]:
         gen_kwargs, _ = HuggingfaceEngine._process_args(
-            model, tokenizer, template, generating_args, messages, system, tools, input_kwargs
+            model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs
         )
         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         gen_kwargs["streamer"] = streamer
@@ -199,6 +217,7 @@ class HuggingfaceEngine(BaseEngine):
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> List["Response"]:
         if not self.can_generate:
@@ -208,11 +227,13 @@ class HuggingfaceEngine(BaseEngine):
         input_args = (
             self.model,
             self.tokenizer,
+            self.processor,
             self.template,
             self.generating_args,
             messages,
             system,
             tools,
+            image,
             input_kwargs,
         )
         async with self._semaphore:
@@ -224,6 +245,7 @@ class HuggingfaceEngine(BaseEngine):
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> AsyncGenerator[str, None]:
         if not self.can_generate:
@@ -233,11 +255,13 @@ class HuggingfaceEngine(BaseEngine):
         input_args = (
             self.model,
             self.tokenizer,
+            self.processor,
             self.template,
             self.generating_args,
             messages,
             system,
             tools,
+            image,
             input_kwargs,
         )
         async with self._semaphore:
diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index 786e743d..a4caa53b 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -12,7 +12,10 @@ if is_vllm_available():
     from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams
     from vllm.lora.request import LoRARequest
 
+
 if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
     from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
 
 
@@ -29,7 +32,9 @@ class VllmEngine(BaseEngine):
         infer_dtype = str(infer_dtype).split(".")[-1]
 
         self.can_generate = finetuning_args.stage == "sft"
-        self.tokenizer = load_tokenizer(model_args)
+        tokenizer_module = load_tokenizer(model_args)
+        self.tokenizer = tokenizer_module["tokenizer"]
+        self.processor = tokenizer_module["processor"]
         self.tokenizer.padding_side = "left"
         self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)
         self.generating_args = generating_args.to_dict()
@@ -58,6 +63,7 @@ class VllmEngine(BaseEngine):
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> AsyncIterator["RequestOutput"]:
         request_id = "chatcmpl-{}".format(uuid.uuid4().hex)
@@ -121,10 +127,11 @@ class VllmEngine(BaseEngine):
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> List["Response"]:
         final_output = None
-        generator = await self._generate(messages, system, tools, **input_kwargs)
+        generator = await self._generate(messages, system, tools, image, **input_kwargs)
         async for request_output in generator:
             final_output = request_output
 
@@ -146,10 +153,11 @@ class VllmEngine(BaseEngine):
         messages: Sequence[Dict[str, str]],
         system: Optional[str] = None,
         tools: Optional[str] = None,
+        image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> AsyncGenerator[str, None]:
         generated_text = ""
-        generator = await self._generate(messages, system, tools, **input_kwargs)
+        generator = await self._generate(messages, system, tools, image, **input_kwargs)
         async for result in generator:
             delta_text = result.outputs[0].text[len(generated_text) :]
             generated_text = result.outputs[0].text
diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 0b467724..18681872 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -8,7 +8,7 @@ from .utils import Role
 
 
 if TYPE_CHECKING:
-    from PIL import Image
+    from PIL.Image import Image
     from transformers import ProcessorMixin, Seq2SeqTrainingArguments
     from transformers.image_processing_utils import BaseImageProcessor
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -271,7 +271,11 @@ def get_preprocess_and_print_func(
     processor: Optional["ProcessorMixin"],
 ) -> Tuple[Callable, Callable]:
     if stage == "pt":
-        preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args)
+        preprocess_func = partial(
+            preprocess_pretrain_dataset,
+            tokenizer=tokenizer,
+            data_args=data_args,
+        )
         print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
     elif stage == "sft" and not training_args.predict_with_generate:
         if data_args.packing:
diff --git a/src/llmtuner/eval/evaluator.py b/src/llmtuner/eval/evaluator.py
index 2c039928..7446c6f5 100644
--- a/src/llmtuner/eval/evaluator.py
+++ b/src/llmtuner/eval/evaluator.py
@@ -21,7 +21,7 @@ from .template import get_eval_template
 class Evaluator:
     def __init__(self, args: Optional[Dict[str, Any]] = None) -> None:
         self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args)
-        self.tokenizer = load_tokenizer(self.model_args)
+        self.tokenizer = load_tokenizer(self.model_args)["tokenizer"]
         self.tokenizer.padding_side = "right"  # avoid overflow issue in batched inference for llama2
         self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template)
         self.model = load_model(self.tokenizer, self.model_args, finetuning_args)
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index c922dc47..9b02c93b 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -196,6 +196,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
 
+    if model_args.visual_inputs and data_args.packing:
+        raise ValueError("Cannot use packing in MLLM fine-tuning.")
+
     _verify_model_args(model_args, finetuning_args)
     _check_extra_dependencies(model_args, finetuning_args, training_args)
 
diff --git a/src/llmtuner/train/dpo/workflow.py b/src/llmtuner/train/dpo/workflow.py
index 929dd029..b19a643e 100644
--- a/src/llmtuner/train/dpo/workflow.py
+++ b/src/llmtuner/train/dpo/workflow.py
@@ -24,8 +24,9 @@ def run_dpo(
     finetuning_args: "FinetuningArguments",
     callbacks: Optional[List["TrainerCallback"]] = None,
 ):
-    tokenizer = load_tokenizer(model_args)
-    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="rm")
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    dataset = get_dataset(model_args, data_args, training_args, stage="rm", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
     data_collator = PairwiseDataCollatorWithPadding(
diff --git a/src/llmtuner/train/orpo/workflow.py b/src/llmtuner/train/orpo/workflow.py
index 5a2fd36c..9c870096 100644
--- a/src/llmtuner/train/orpo/workflow.py
+++ b/src/llmtuner/train/orpo/workflow.py
@@ -24,8 +24,9 @@ def run_orpo(
     finetuning_args: "FinetuningArguments",
     callbacks: Optional[List["TrainerCallback"]] = None,
 ):
-    tokenizer = load_tokenizer(model_args)
-    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="rm")
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    dataset = get_dataset(model_args, data_args, training_args, stage="rm", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
     data_collator = PairwiseDataCollatorWithPadding(
diff --git a/src/llmtuner/train/ppo/workflow.py b/src/llmtuner/train/ppo/workflow.py
index d5854073..8cd15932 100644
--- a/src/llmtuner/train/ppo/workflow.py
+++ b/src/llmtuner/train/ppo/workflow.py
@@ -27,8 +27,9 @@ def run_ppo(
     generating_args: "GeneratingArguments",
     callbacks: Optional[List["TrainerCallback"]] = None,
 ):
-    tokenizer = load_tokenizer(model_args)
-    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="ppo")
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    dataset = get_dataset(model_args, data_args, training_args, stage="ppo", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True)
 
     tokenizer.padding_side = "left"  # use left-padding in generation while using right-padding in training
diff --git a/src/llmtuner/train/pt/workflow.py b/src/llmtuner/train/pt/workflow.py
index f683f37a..3b127da4 100644
--- a/src/llmtuner/train/pt/workflow.py
+++ b/src/llmtuner/train/pt/workflow.py
@@ -25,8 +25,9 @@ def run_pt(
     finetuning_args: "FinetuningArguments",
     callbacks: Optional[List["TrainerCallback"]] = None,
 ):
-    tokenizer = load_tokenizer(model_args)
-    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="pt")
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    dataset = get_dataset(model_args, data_args, training_args, stage="pt", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 
diff --git a/src/llmtuner/train/rm/workflow.py b/src/llmtuner/train/rm/workflow.py
index 42bf1ce6..bd0a756c 100644
--- a/src/llmtuner/train/rm/workflow.py
+++ b/src/llmtuner/train/rm/workflow.py
@@ -25,8 +25,9 @@ def run_rm(
     finetuning_args: "FinetuningArguments",
     callbacks: Optional[List["TrainerCallback"]] = None,
 ):
-    tokenizer = load_tokenizer(model_args)
-    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="rm")
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    dataset = get_dataset(model_args, data_args, training_args, stage="rm", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True)
     data_collator = PairwiseDataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
 
diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py
index 3ead9edf..4a9775b4 100644
--- a/src/llmtuner/train/sft/workflow.py
+++ b/src/llmtuner/train/sft/workflow.py
@@ -29,9 +29,9 @@ def run_sft(
     callbacks: Optional[List["TrainerCallback"]] = None,
 ):
     tokenizer_module = load_tokenizer(model_args)
-    dataset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
     tokenizer = tokenizer_module["tokenizer"]
-    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=training_args.do_train)
+    dataset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
     if training_args.predict_with_generate:
         tokenizer.padding_side = "left"  # use left-padding in generation
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index a8a2b8e9..a2eb121f 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -52,7 +52,7 @@ def export_model(args: Optional[Dict[str, Any]] = None):
     if model_args.adapter_name_or_path is not None and model_args.export_quantization_bit is not None:
         raise ValueError("Please merge adapters before quantizing the model.")
 
-    tokenizer = load_tokenizer(model_args)
+    tokenizer = load_tokenizer(model_args)["tokenizer"]
     get_template_and_fix_tokenizer(tokenizer, data_args.template)
     model = load_model(tokenizer, model_args, finetuning_args)  # must after fixing tokenizer to resize vocab
 
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index 27dc8eb3..d9fc363d 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -91,7 +91,7 @@ def create_ref_model(
         )
         ref_model_args = ModelArguments(**ref_model_args_dict)
         ref_finetuning_args = FinetuningArguments(finetuning_type="lora")
-        tokenizer = load_tokenizer(ref_model_args)
+        tokenizer = load_tokenizer(ref_model_args)["tokenizer"]
         ref_model = load_model(
             tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
         )
@@ -100,7 +100,7 @@ def create_ref_model(
         if finetuning_args.finetuning_type == "lora":
             ref_model = None
         else:
-            tokenizer = load_tokenizer(model_args)
+            tokenizer = load_tokenizer(model_args)["tokenizer"]
             ref_model = load_model(
                 tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead
             )
@@ -147,7 +147,7 @@ def create_reward_model(
         )
         reward_model_args = ModelArguments(**reward_model_args_dict)
         reward_finetuning_args = FinetuningArguments(finetuning_type="lora")
-        tokenizer = load_tokenizer(reward_model_args)
+        tokenizer = load_tokenizer(reward_model_args)["tokenizer"]
         reward_model = load_model(
             tokenizer, reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True
         )
diff --git a/src/llmtuner/webui/chatter.py b/src/llmtuner/webui/chatter.py
index 82e7b7f1..5aa8f563 100644
--- a/src/llmtuner/webui/chatter.py
+++ b/src/llmtuner/webui/chatter.py
@@ -2,6 +2,8 @@ import json
 import os
 from typing import TYPE_CHECKING, Dict, Generator, List, Optional, Sequence, Tuple
 
+from numpy.typing import NDArray
+
 from ..chat import ChatModel
 from ..data import Role
 from ..extras.misc import torch_gc
@@ -112,6 +114,7 @@ class WebChatModel(ChatModel):
         messages: Sequence[Dict[str, str]],
         system: str,
         tools: str,
+        image: Optional[NDArray],
         max_new_tokens: int,
         top_p: float,
         temperature: float,
@@ -119,7 +122,7 @@ class WebChatModel(ChatModel):
         chatbot[-1][1] = ""
         response = ""
         for new_text in self.stream_chat(
-            messages, system, tools, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature
+            messages, system, tools, image, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature
         ):
             response += new_text
             if tools:
diff --git a/src/llmtuner/webui/components/chatbot.py b/src/llmtuner/webui/components/chatbot.py
index 82bc4f29..e1be1f7b 100644
--- a/src/llmtuner/webui/components/chatbot.py
+++ b/src/llmtuner/webui/components/chatbot.py
@@ -23,9 +23,15 @@ def create_chat_box(
         messages = gr.State([])
         with gr.Row():
             with gr.Column(scale=4):
-                role = gr.Dropdown(choices=[Role.USER.value, Role.OBSERVATION.value], value=Role.USER.value)
-                system = gr.Textbox(show_label=False)
-                tools = gr.Textbox(show_label=False, lines=2)
+                with gr.Row():
+                    with gr.Column():
+                        role = gr.Dropdown(choices=[Role.USER.value, Role.OBSERVATION.value], value=Role.USER.value)
+                        system = gr.Textbox(show_label=False)
+                        tools = gr.Textbox(show_label=False, lines=4)
+
+                    with gr.Column():
+                        image = gr.Image(type="numpy")
+
                 query = gr.Textbox(show_label=False, lines=8)
                 submit_btn = gr.Button(variant="primary")
 
@@ -43,7 +49,7 @@ def create_chat_box(
         [chatbot, messages, query],
     ).then(
         engine.chatter.stream,
-        [chatbot, messages, system, tools, max_new_tokens, top_p, temperature],
+        [chatbot, messages, system, tools, image, max_new_tokens, top_p, temperature],
         [chatbot, messages],
     )
     clear_btn.click(lambda: ([], []), outputs=[chatbot, messages])
@@ -56,6 +62,7 @@ def create_chat_box(
             role=role,
             system=system,
             tools=tools,
+            image=image,
             query=query,
             submit_btn=submit_btn,
             max_new_tokens=max_new_tokens,
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index 3af9128f..8e93efd6 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -1073,6 +1073,17 @@ LOCALES = {
             "placeholder": "工具列表（非必填）",
         },
     },
+    "image": {
+        "en": {
+            "label": "Image (optional)",
+        },
+        "ru": {
+            "label": "Изображение (по желанию)",
+        },
+        "zh": {
+            "label": "图像（非必填）",
+        },
+    },
     "query": {
         "en": {
             "placeholder": "Input...",

From 7773ac0ead2c83706128a8126690083f009ee644 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 05:44:30 +0800
Subject: [PATCH 182/341] update readme

Former-commit-id: 41728fd74de7bec0cc6135aef9dfa3ae9fe7af73
---
 README.md                      | 5 ++++-
 README_zh.md                   | 5 ++++-
 scripts/cal_lr.py              | 5 +++--
 scripts/length_cdf.py          | 4 ++--
 src/llmtuner/hparams/parser.py | 3 +++
 5 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 4e87e369..d74a3fb5 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,8 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See `examples/lora_single_gpu/sft_mllm.sh` for usage.
+
 [24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details.
 
 [24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See `examples/extras/mod` for usage.
@@ -148,6 +150,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
 | [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                      | q_proj,v_proj     | llama3    |
+| [LLaVA-1.5](https://huggingface.co/llava-hf)             | 7B/13B                      | q_proj,v_proj     | vicuna    |
 | [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | q_proj,v_proj     | -         |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
@@ -457,7 +460,7 @@ If you have a project that should be incorporated, please contact via email or c
 
 This repository is licensed under the [Apache-2.0 License](LICENSE).
 
-Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2/LLaVA-1.5](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## Citation
 
diff --git a/README_zh.md b/README_zh.md
index 599af301..ed19ff94 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -68,6 +68,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 `examples/lora_single_gpu/sft_mllm.sh`。
+
 [24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型，详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。
 
 [24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 `examples/extras/mod`。
@@ -148,6 +150,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
 | [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
 | [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                      | q_proj,v_proj     | llama3    |
+| [LLaVA-1.5](https://huggingface.co/llava-hf)             | 7B/13B                      | q_proj,v_proj     | vicuna    |
 | [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
 | [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | q_proj,v_proj     | -         |
 | [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
@@ -457,7 +460,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 
 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。
 
-使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2/LLaVA-1.5](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## 引用
 
diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py
index ffe47f28..c1c1f7a2 100644
--- a/scripts/cal_lr.py
+++ b/scripts/cal_lr.py
@@ -44,8 +44,9 @@ def calculate_lr(
             overwrite_cache=True,
         )
     )
-    tokenizer = load_tokenizer(model_args)
-    trainset = get_dataset(tokenizer, model_args, data_args, training_args, stage)
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)
     if stage == "pt":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif stage == "sft":
diff --git a/scripts/length_cdf.py b/scripts/length_cdf.py
index cf0698de..1446f77a 100644
--- a/scripts/length_cdf.py
+++ b/scripts/length_cdf.py
@@ -32,8 +32,8 @@ def length_cdf(
             overwrite_cache=True,
         )
     )
-    tokenizer = load_tokenizer(model_args)
-    trainset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft")
+    tokenizer_module = load_tokenizer(model_args)
+    trainset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
     total_num = len(trainset)
     length_dict = defaultdict(int)
     for sample in tqdm(trainset["input_ids"]):
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 9b02c93b..715b8f95 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -320,6 +320,9 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
         if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1:
             raise ValueError("vLLM only accepts a single adapter. Merge them first.")
 
+        if model_args.visual_inputs:
+            raise ValueError("vLLM engine does not support MLLM yet. Stay tuned.")
+
     _verify_model_args(model_args, finetuning_args)
     _check_extra_dependencies(model_args, finetuning_args)
 

From 1480e3a88fc7ed39fe9456b5999ccfe15bad45d4 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 05:49:26 +0800
Subject: [PATCH 183/341] update readme

Former-commit-id: df1155245d3f71ba4f3361d43aa662ab3b024de8
---
 README.md    | 4 ++--
 README_zh.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index d74a3fb5..4aae11a3 100644
--- a/README.md
+++ b/README.md
@@ -43,8 +43,8 @@ Choose your path:
 
 ## Features
 
-- **Various models**: LLaMA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
-- **Integrated methods**: (Continuous) pre-training, supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
+- **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
+- **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
 - **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
diff --git a/README_zh.md b/README_zh.md
index ed19ff94..64c71cd6 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -43,8 +43,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 项目特色
 
-- **多种模型**：LLaMA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
-- **集成方法**：（增量）预训练、指令监督微调、奖励模型训练、PPO 训练、DPO 训练和 ORPO 训练。
+- **多种模型**：LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
+- **集成方法**：（增量）预训练、（多模态）指令监督微调、奖励模型训练、PPO 训练、DPO 训练和 ORPO 训练。
 - **多种精度**：32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
 - **先进算法**：GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ 和 Agent 微调。
 - **实用技巧**：FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。

From 697bc20941531fa0397cf32dadd75cee323d4453 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 06:41:35 +0800
Subject: [PATCH 184/341] add llava to llamaboard

Former-commit-id: deaaff0a9de0eef9691991c99cd797461b1165cc
---
 examples/inference/web_demo.sh           |  1 +
 src/llmtuner/extras/constants.py         | 15 +++++++++++++++
 src/llmtuner/webui/chatter.py            |  1 +
 src/llmtuner/webui/common.py             |  5 +++++
 src/llmtuner/webui/components/chatbot.py |  7 ++++---
 src/llmtuner/webui/components/export.py  |  3 +++
 src/llmtuner/webui/components/infer.py   | 14 ++++++++++----
 src/llmtuner/webui/components/top.py     | 16 ++++++++++------
 src/llmtuner/webui/engine.py             |  1 +
 src/llmtuner/webui/interface.py          |  4 ++--
 src/llmtuner/webui/locales.py            | 11 +++++++++++
 src/llmtuner/webui/manager.py            |  1 +
 src/llmtuner/webui/runner.py             |  2 ++
 13 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/examples/inference/web_demo.sh b/examples/inference/web_demo.sh
index 201be2b4..8d6ed09d 100644
--- a/examples/inference/web_demo.sh
+++ b/examples/inference/web_demo.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# add `--visual_inputs True` to load MLLM
 
 CUDA_VISIBLE_DEVICES=0 python ../../src/web_demo.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 9f7d5c46..26990530 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -28,6 +28,8 @@ LOG_FILE_NAME = "trainer_log.jsonl"
 
 METHODS = ["full", "freeze", "lora"]
 
+MLLM_LIST = ["LLaVA1.5"]
+
 MOD_SUPPORTED_MODELS = ["bloom", "falcon", "gemma", "llama", "mistral", "mixtral", "phi", "starcoder2"]
 
 PEFT_METHODS = ["lora"]
@@ -566,6 +568,19 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "LLaVA1.5-7B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-1.5-7b-hf",
+        },
+        "LLaVA1.5-13B-Chat": {
+            DownloadSource.DEFAULT: "llava-hf/llava-1.5-13b-hf",
+        },
+    },
+    template="vicuna",
+)
+
+
 register_model_group(
     models={
         "Mistral-7B-v0.1": {
diff --git a/src/llmtuner/webui/chatter.py b/src/llmtuner/webui/chatter.py
index 5aa8f563..a92f6ef7 100644
--- a/src/llmtuner/webui/chatter.py
+++ b/src/llmtuner/webui/chatter.py
@@ -79,6 +79,7 @@ class WebChatModel(ChatModel):
             template=get("top.template"),
             flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
             use_unsloth=(get("top.booster") == "unsloth"),
+            visual_inputs=get("top.visual_inputs"),
             rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None,
             infer_backend=get("infer.infer_backend"),
         )
diff --git a/src/llmtuner/webui/common.py b/src/llmtuner/webui/common.py
index 659c35c3..9af4c439 100644
--- a/src/llmtuner/webui/common.py
+++ b/src/llmtuner/webui/common.py
@@ -9,6 +9,7 @@ from ..extras.constants import (
     DATA_CONFIG,
     DEFAULT_MODULE,
     DEFAULT_TEMPLATE,
+    MLLM_LIST,
     PEFT_METHODS,
     STAGES_USE_PAIR_DATA,
     SUPPORTED_MODELS,
@@ -105,6 +106,10 @@ def get_template(model_name: str) -> str:
     return "default"
 
 
+def get_visual(model_name: str) -> bool:
+    return get_prefix(model_name) in MLLM_LIST
+
+
 def list_adapters(model_name: str, finetuning_type: str) -> "gr.Dropdown":
     if finetuning_type not in PEFT_METHODS:
         return gr.Dropdown(value=[], choices=[], interactive=False)
diff --git a/src/llmtuner/webui/components/chatbot.py b/src/llmtuner/webui/components/chatbot.py
index e1be1f7b..15c1fc83 100644
--- a/src/llmtuner/webui/components/chatbot.py
+++ b/src/llmtuner/webui/components/chatbot.py
@@ -17,7 +17,7 @@ if TYPE_CHECKING:
 
 def create_chat_box(
     engine: "Engine", visible: bool = False
-) -> Tuple["gr.Column", "Component", "Component", Dict[str, "Component"]]:
+) -> Tuple["Component", "Component", Dict[str, "Component"]]:
     with gr.Column(visible=visible) as chat_box:
         chatbot = gr.Chatbot(show_copy_button=True)
         messages = gr.State([])
@@ -29,7 +29,7 @@ def create_chat_box(
                         system = gr.Textbox(show_label=False)
                         tools = gr.Textbox(show_label=False, lines=4)
 
-                    with gr.Column():
+                    with gr.Column() as image_box:
                         image = gr.Image(type="numpy")
 
                 query = gr.Textbox(show_label=False, lines=8)
@@ -55,13 +55,14 @@ def create_chat_box(
     clear_btn.click(lambda: ([], []), outputs=[chatbot, messages])
 
     return (
-        chat_box,
         chatbot,
         messages,
         dict(
+            chat_box=chat_box,
             role=role,
             system=system,
             tools=tools,
+            image_box=image_box,
             image=image,
             query=query,
             submit_btn=submit_btn,
diff --git a/src/llmtuner/webui/components/export.py b/src/llmtuner/webui/components/export.py
index ebccac25..4c224736 100644
--- a/src/llmtuner/webui/components/export.py
+++ b/src/llmtuner/webui/components/export.py
@@ -27,6 +27,7 @@ def save_model(
     adapter_path: List[str],
     finetuning_type: str,
     template: str,
+    visual_inputs: bool,
     export_size: int,
     export_quantization_bit: int,
     export_quantization_dataset: str,
@@ -66,6 +67,7 @@ def save_model(
         adapter_name_or_path=adapter_name_or_path,
         finetuning_type=finetuning_type,
         template=template,
+        visual_inputs=visual_inputs,
         export_dir=export_dir,
         export_hub_model_id=export_hub_model_id or None,
         export_size=export_size,
@@ -105,6 +107,7 @@ def create_export_tab(engine: "Engine") -> Dict[str, "Component"]:
             engine.manager.get_elem_by_id("top.adapter_path"),
             engine.manager.get_elem_by_id("top.finetuning_type"),
             engine.manager.get_elem_by_id("top.template"),
+            engine.manager.get_elem_by_id("top.visual_inputs"),
             export_size,
             export_quantization_bit,
             export_quantization_dataset,
diff --git a/src/llmtuner/webui/components/infer.py b/src/llmtuner/webui/components/infer.py
index d565347e..970f4629 100644
--- a/src/llmtuner/webui/components/infer.py
+++ b/src/llmtuner/webui/components/infer.py
@@ -28,15 +28,21 @@ def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]:
     input_elems.update({infer_backend})
     elem_dict.update(dict(infer_backend=infer_backend, load_btn=load_btn, unload_btn=unload_btn, info_box=info_box))
 
-    chat_box, chatbot, messages, chat_elems = create_chat_box(engine, visible=False)
-    elem_dict.update(dict(chat_box=chat_box, **chat_elems))
+    chatbot, messages, chat_elems = create_chat_box(engine, visible=False)
+    elem_dict.update(chat_elems)
 
     load_btn.click(engine.chatter.load_model, input_elems, [info_box]).then(
-        lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_box]
+        lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_elems["chat_box"]]
     )
 
     unload_btn.click(engine.chatter.unload_model, input_elems, [info_box]).then(
         lambda: ([], []), outputs=[chatbot, messages]
-    ).then(lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_box])
+    ).then(lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_elems["chat_box"]])
+
+    engine.manager.get_elem_by_id("top.visual_inputs").change(
+        lambda enabled: gr.Column(visible=enabled),
+        [engine.manager.get_elem_by_id("top.visual_inputs")],
+        [chat_elems["image_box"]],
+    )
 
     return elem_dict
diff --git a/src/llmtuner/webui/components/top.py b/src/llmtuner/webui/components/top.py
index c67d7cc5..a75a4d62 100644
--- a/src/llmtuner/webui/components/top.py
+++ b/src/llmtuner/webui/components/top.py
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Dict
 from ...data import templates
 from ...extras.constants import METHODS, SUPPORTED_MODELS
 from ...extras.packages import is_gradio_available
-from ..common import get_model_path, get_template, list_adapters, save_config
+from ..common import get_model_path, get_template, get_visual, list_adapters, save_config
 from ..utils import can_quantize
 
 
@@ -30,14 +30,17 @@ def create_top() -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as advanced_tab:
         with gr.Row():
-            quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none")
-            template = gr.Dropdown(choices=list(templates.keys()), value="default")
-            rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none")
-            booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none")
+            quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=2)
+            template = gr.Dropdown(choices=list(templates.keys()), value="default", scale=2)
+            rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=3)
+            booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none", scale=3)
+            visual_inputs = gr.Checkbox(scale=1)
 
     model_name.change(list_adapters, [model_name, finetuning_type], [adapter_path], queue=False).then(
         get_model_path, [model_name], [model_path], queue=False
-    ).then(get_template, [model_name], [template], queue=False)  # do not save config since the below line will save
+    ).then(get_template, [model_name], [template], queue=False).then(
+        get_visual, [model_name], [visual_inputs], queue=False
+    )  # do not save config since the below line will save
 
     model_path.change(save_config, inputs=[lang, model_name, model_path], queue=False)
 
@@ -59,4 +62,5 @@ def create_top() -> Dict[str, "Component"]:
         template=template,
         rope_scaling=rope_scaling,
         booster=booster,
+        visual_inputs=visual_inputs,
     )
diff --git a/src/llmtuner/webui/engine.py b/src/llmtuner/webui/engine.py
index b9ee61d2..cebac3b9 100644
--- a/src/llmtuner/webui/engine.py
+++ b/src/llmtuner/webui/engine.py
@@ -43,6 +43,7 @@ class Engine:
             init_dict["train.output_dir"] = {"value": "train_{}".format(get_time())}
             init_dict["train.config_path"] = {"value": "{}.json".format(get_time())}
             init_dict["eval.output_dir"] = {"value": "eval_{}".format(get_time())}
+            init_dict["infer.image_box"] = {"visible": False}
 
             if user_config.get("last_model", None):
                 init_dict["top.model_name"] = {"value": user_config["last_model"]}
diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index 0359d082..abca16c5 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -58,8 +58,8 @@ def create_web_demo() -> gr.Blocks:
         lang = gr.Dropdown(choices=["en", "zh"])
         engine.manager.add_elems("top", dict(lang=lang))
 
-        chat_box, _, _, chat_elems = create_chat_box(engine, visible=True)
-        engine.manager.add_elems("infer", dict(chat_box=chat_box, **chat_elems))
+        _, _, chat_elems = create_chat_box(engine, visible=True)
+        engine.manager.add_elems("infer", chat_elems)
 
         demo.load(engine.resume, outputs=engine.manager.get_elem_list(), concurrency_limit=None)
         lang.change(engine.change_lang, [lang], engine.manager.get_elem_list(), queue=False)
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index 8e93efd6..d341c7b6 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -129,6 +129,17 @@ LOCALES = {
             "label": "加速方式",
         },
     },
+    "visual_inputs": {
+        "en": {
+            "label": "Visual inputs",
+        },
+        "ru": {
+            "label": "визуальные входы",
+        },
+        "zh": {
+            "label": "图像输入",
+        },
+    },
     "training_stage": {
         "en": {
             "label": "Stage",
diff --git a/src/llmtuner/webui/manager.py b/src/llmtuner/webui/manager.py
index a67c0995..f65fa804 100644
--- a/src/llmtuner/webui/manager.py
+++ b/src/llmtuner/webui/manager.py
@@ -60,4 +60,5 @@ class Manager:
             self._id_to_elem["top.template"],
             self._id_to_elem["top.rope_scaling"],
             self._id_to_elem["top.booster"],
+            self._id_to_elem["top.visual_inputs"],
         }
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 77d5ea98..8054484f 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -124,6 +124,7 @@ class Runner:
             rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None,
             flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
             use_unsloth=(get("top.booster") == "unsloth"),
+            visual_inputs=get("top.visual_inputs"),
             dataset_dir=get("train.dataset_dir"),
             dataset=",".join(get("train.dataset")),
             cutoff_len=get("train.cutoff_len"),
@@ -224,6 +225,7 @@ class Runner:
             rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None,
             flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
             use_unsloth=(get("top.booster") == "unsloth"),
+            visual_inputs=get("top.visual_inputs"),
             dataset_dir=get("eval.dataset_dir"),
             dataset=",".join(get("eval.dataset")),
             cutoff_len=get("eval.cutoff_len"),

From 51f776ae2a30225b7ecb567f40675879f3cd49e9 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 18:00:23 +0800
Subject: [PATCH 185/341] fix llava qlora

Former-commit-id: 01c5a669f6fe598aac1758a700a7607da37db1bc
---
 examples/merge_lora/quantize.sh    |  1 +
 src/llmtuner/data/aligner.py       | 26 ++++++++++++++++----------
 src/llmtuner/hparams/model_args.py |  3 +++
 src/llmtuner/hparams/parser.py     |  3 +++
 src/llmtuner/model/patcher.py      |  4 ++++
 src/llmtuner/model/utils/visual.py | 28 ++++++++++++++++++++++++++++
 6 files changed, 55 insertions(+), 10 deletions(-)
 create mode 100644 src/llmtuner/model/utils/visual.py

diff --git a/examples/merge_lora/quantize.sh b/examples/merge_lora/quantize.sh
index 143bce50..aeedbe66 100644
--- a/examples/merge_lora/quantize.sh
+++ b/examples/merge_lora/quantize.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# NEED TO run `merge.sh` before using this script
 
 CUDA_VISIBLE_DEVICES=0 python ../../src/export_model.py \
     --model_name_or_path ../../models/llama2-7b-sft \
diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py
index dc1de865..6bd12aad 100644
--- a/src/llmtuner/data/aligner.py
+++ b/src/llmtuner/data/aligner.py
@@ -14,10 +14,23 @@ if TYPE_CHECKING:
     from .parser import DatasetAttr
 
 
+def _convert_images(images: List[Any], dataset_attr: "DatasetAttr", data_args: "DataArguments") -> List[Any]:
+    r"""Returns a copy of `images` where local relative paths found under `data_args.dataset_dir` are prefixed."""
+    outputs = [] if images is None else list(images)  # copy: entries of non-local datasets must not be dropped
+    if dataset_attr.load_from in ["script", "file"]:
+        for i, image in enumerate(outputs):
+            # non-string entries and paths that do not exist under dataset_dir are kept untouched
+            if isinstance(image, str) and os.path.isfile(os.path.join(data_args.dataset_dir, image)):
+                outputs[i] = os.path.join(data_args.dataset_dir, image)
+
+    return outputs
+
+
 def convert_alpaca(
     examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
 ) -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
+    convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args)
     for i in range(len(examples[dataset_attr.prompt])):
         prompt = []
         if dataset_attr.history and isinstance(examples[dataset_attr.history][i], list):
@@ -47,11 +60,7 @@ def convert_alpaca(
         outputs["response"].append(response)
         outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
         outputs["tools"].append("")
-        outputs["images"].append(
-            [os.path.join(data_args.dataset_dir, path) for path in examples[dataset_attr.images][i]]
-            if dataset_attr.images
-            else []
-        )
+        outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else [])
 
     return outputs
 
@@ -60,6 +69,7 @@ def convert_sharegpt(
     examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
 ) -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
+    convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args)
     tag_mapping = {
         dataset_attr.user_tag: Role.USER.value,
         dataset_attr.assistant_tag: Role.ASSISTANT.value,
@@ -94,11 +104,7 @@ def convert_sharegpt(
         outputs["response"].append(aligned_messages[-1:])
         outputs["system"].append(system)
         outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
-        outputs["images"].append(
-            [os.path.join(data_args.dataset_dir, path) for path in examples[dataset_attr.images][i]]
-            if dataset_attr.images
-            else []
-        )
+        outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else [])
 
     return outputs
 
diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index be65cd27..ac70bb3c 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -182,6 +182,9 @@ class ModelArguments:
         if self.split_special_tokens and self.use_fast_tokenizer:
             raise ValueError("`split_special_tokens` is only supported for slow tokenizers.")
 
+        if self.visual_inputs and self.use_unsloth:
+            raise ValueError("Unsloth does not support MLLM yet. Stay tuned.")
+
         if self.adapter_name_or_path is not None:  # support merging multiple lora weights
             self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")]
 
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 715b8f95..aa046837 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -323,6 +323,9 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
         if model_args.visual_inputs:
             raise ValueError("vLLM engine does not support MLLM yet. Stay tuned.")
 
+    if finetuning_args.stage == "rm" and model_args.visual_inputs:
+        raise ValueError("Reward server does not support MLLM yet. Stay tuned.")
+
     _verify_model_args(model_args, finetuning_args)
     _check_extra_dependencies(model_args, finetuning_args)
 
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 5c3c31b3..94d99644 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -15,6 +15,7 @@ from .utils.longlora import configure_longlora
 from .utils.moe import add_z3_leaf_module, configure_moe
 from .utils.quantization import configure_quantization
 from .utils.rope import configure_rope
+from .utils.visual import autocast_projector_dtype
 
 
 if TYPE_CHECKING:
@@ -92,6 +93,9 @@ def patch_model(
     if model_args.resize_vocab:
         resize_embedding_layer(model, tokenizer)
 
+    if model_args.visual_inputs:
+        autocast_projector_dtype(model, model_args)
+
     if is_trainable:
         prepare_model_for_training(model, model_args)
         add_z3_leaf_module(model)
diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
new file mode 100644
index 00000000..cb51301b
--- /dev/null
+++ b/src/llmtuner/model/utils/visual.py
@@ -0,0 +1,28 @@
+from typing import TYPE_CHECKING, Tuple
+
+import torch
+
+from ...extras.logging import get_logger
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+    from ...hparams import ModelArguments
+
+
+logger = get_logger(__name__)
+
+
+def autocast_projector_dtype(
+    model: "PreTrainedModel", model_args: "ModelArguments", mm_projector_name: str = "multi_modal_projector"
+) -> None:
+    r"""Hooks the submodule named `mm_projector_name` (if any) so its outputs are cast to the compute dtype."""
+
+    def _cast_output(module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor") -> "torch.Tensor":
+        return output.to(model_args.compute_dtype)  # the hook's return value replaces the projector output
+
+    if not hasattr(model, mm_projector_name):
+        return
+    logger.info("Casting multimodal projector outputs in {}.".format(model_args.compute_dtype))
+    getattr(model, mm_projector_name).register_forward_hook(_cast_output)

From 70bed8ad8f2303b76b44e9cb2a0e12704512c3f9 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 19:59:22 +0800
Subject: [PATCH 186/341] support Qwen1.5 110B

Former-commit-id: d6e5ecaf4109127bab24e39a0696076bceb0b37c
---
 README.md                        | 50 ++++++++++++++++----------------
 README_zh.md                     | 50 ++++++++++++++++----------------
 src/llmtuner/extras/constants.py |  8 +++++
 3 files changed, 58 insertions(+), 50 deletions(-)

diff --git a/README.md b/README.md
index 4aae11a3..d6f59989 100644
--- a/README.md
+++ b/README.md
@@ -136,31 +136,31 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Supported Models
 
-| Model                                                    | Model size                  | Default module    | Template  |
-| -------------------------------------------------------- | --------------------------- | ----------------- | --------- |
-| [Baichuan2](https://huggingface.co/baichuan-inc)         | 7B/13B                      | W_pack            | baichuan2 |
-| [BLOOM](https://huggingface.co/bigscience)               | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
-| [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
-| [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                          | query_key_value   | chatglm3  |
-| [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                    | q_proj,v_proj     | cohere    |
-| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                  | q_proj,v_proj     | deepseek  |
-| [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                 | query_key_value   | falcon    |
-| [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                       | q_proj,v_proj     | gemma     |
-| [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
-| [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
-| [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
-| [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                      | q_proj,v_proj     | llama3    |
-| [LLaVA-1.5](https://huggingface.co/llava-hf)             | 7B/13B                      | q_proj,v_proj     | vicuna    |
-| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
-| [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | q_proj,v_proj     | -         |
-| [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
-| [Phi-3](https://huggingface.co/microsoft)                | 3.8B                        | qkv_proj          | phi       |
-| [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
-| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
-| [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
-| [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                  | q_proj,v_proj     | xverse    |
-| [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                   | q_proj,v_proj     | yi        |
-| [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                 | q_proj,v_proj     | yuan      |
+| Model                                                    | Model size                       | Default module    | Template  |
+| -------------------------------------------------------- | -------------------------------- | ----------------- | --------- |
+| [Baichuan2](https://huggingface.co/baichuan-inc)         | 7B/13B                           | W_pack            | baichuan2 |
+| [BLOOM](https://huggingface.co/bigscience)               | 560M/1.1B/1.7B/3B/7.1B/176B      | query_key_value   | -         |
+| [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B      | query_key_value   | -         |
+| [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                               | query_key_value   | chatglm3  |
+| [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                         | q_proj,v_proj     | cohere    |
+| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                       | q_proj,v_proj     | deepseek  |
+| [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                      | query_key_value   | falcon    |
+| [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                            | q_proj,v_proj     | gemma     |
+| [InternLM2](https://huggingface.co/internlm)             | 7B/20B                           | wqkv              | intern2   |
+| [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B                   | q_proj,v_proj     | -         |
+| [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                       | q_proj,v_proj     | llama2    |
+| [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                           | q_proj,v_proj     | llama3    |
+| [LLaVA-1.5](https://huggingface.co/llava-hf)             | 7B/13B                           | q_proj,v_proj     | vicuna    |
+| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B                    | q_proj,v_proj     | mistral   |
+| [OLMo](https://huggingface.co/allenai)                   | 1B/7B                            | q_proj,v_proj     | -         |
+| [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                        | q_proj,v_proj     | -         |
+| [Phi-3](https://huggingface.co/microsoft)                | 3.8B                             | qkv_proj          | phi       |
+| [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B                  | c_attn            | qwen      |
+| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | q_proj,v_proj     | qwen      |
+| [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                        | q_proj,v_proj     | -         |
+| [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                       | q_proj,v_proj     | xverse    |
+| [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                        | q_proj,v_proj     | yi        |
+| [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
 > **Default module** is used for the `--lora_target` argument, you can use `--lora_target all` to specify all the available modules for better convergence.
diff --git a/README_zh.md b/README_zh.md
index 64c71cd6..d0534858 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -136,31 +136,31 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 模型
 
-| 模型名                                                   | 模型大小                     | 默认模块           | Template  |
-| -------------------------------------------------------- | --------------------------- | ----------------- | --------- |
-| [Baichuan2](https://huggingface.co/baichuan-inc)         | 7B/13B                      | W_pack            | baichuan2 |
-| [BLOOM](https://huggingface.co/bigscience)               | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
-| [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value   | -         |
-| [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                          | query_key_value   | chatglm3  |
-| [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                    | q_proj,v_proj     | cohere    |
-| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                  | q_proj,v_proj     | deepseek  |
-| [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                 | query_key_value   | falcon    |
-| [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                       | q_proj,v_proj     | gemma     |
-| [InternLM2](https://huggingface.co/internlm)             | 7B/20B                      | wqkv              | intern2   |
-| [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B              | q_proj,v_proj     | -         |
-| [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                  | q_proj,v_proj     | llama2    |
-| [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                      | q_proj,v_proj     | llama3    |
-| [LLaVA-1.5](https://huggingface.co/llava-hf)             | 7B/13B                      | q_proj,v_proj     | vicuna    |
-| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B               | q_proj,v_proj     | mistral   |
-| [OLMo](https://huggingface.co/allenai)                   | 1B/7B                       | q_proj,v_proj     | -         |
-| [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                   | q_proj,v_proj     | -         |
-| [Phi-3](https://huggingface.co/microsoft)                | 3.8B                        | qkv_proj          | phi       |
-| [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B             | c_attn            | qwen      |
-| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B | q_proj,v_proj     | qwen      |
-| [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                   | q_proj,v_proj     | -         |
-| [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                  | q_proj,v_proj     | xverse    |
-| [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                   | q_proj,v_proj     | yi        |
-| [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                 | q_proj,v_proj     | yuan      |
+| 模型名                                                   | 模型大小                          | 默认模块           | Template  |
+| -------------------------------------------------------- | -------------------------------- | ----------------- | --------- |
+| [Baichuan2](https://huggingface.co/baichuan-inc)         | 7B/13B                           | W_pack            | baichuan2 |
+| [BLOOM](https://huggingface.co/bigscience)               | 560M/1.1B/1.7B/3B/7.1B/176B      | query_key_value   | -         |
+| [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B      | query_key_value   | -         |
+| [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                               | query_key_value   | chatglm3  |
+| [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                         | q_proj,v_proj     | cohere    |
+| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                       | q_proj,v_proj     | deepseek  |
+| [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                      | query_key_value   | falcon    |
+| [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                            | q_proj,v_proj     | gemma     |
+| [InternLM2](https://huggingface.co/internlm)             | 7B/20B                           | wqkv              | intern2   |
+| [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B                   | q_proj,v_proj     | -         |
+| [LLaMA-2](https://huggingface.co/meta-llama)             | 7B/13B/70B                       | q_proj,v_proj     | llama2    |
+| [LLaMA-3](https://huggingface.co/meta-llama)             | 8B/70B                           | q_proj,v_proj     | llama3    |
+| [LLaVA-1.5](https://huggingface.co/llava-hf)             | 7B/13B                           | q_proj,v_proj     | vicuna    |
+| [Mistral/Mixtral](https://huggingface.co/mistralai)      | 7B/8x7B/8x22B                    | q_proj,v_proj     | mistral   |
+| [OLMo](https://huggingface.co/allenai)                   | 1B/7B                            | q_proj,v_proj     | -         |
+| [Phi-1.5/2](https://huggingface.co/microsoft)            | 1.3B/2.7B                        | q_proj,v_proj     | -         |
+| [Phi-3](https://huggingface.co/microsoft)                | 3.8B                             | qkv_proj          | phi       |
+| [Qwen](https://huggingface.co/Qwen)                      | 1.8B/7B/14B/72B                  | c_attn            | qwen      |
+| [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | q_proj,v_proj     | qwen      |
+| [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                        | q_proj,v_proj     | -         |
+| [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                       | q_proj,v_proj     | xverse    |
+| [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                        | q_proj,v_proj     | yi        |
+| [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
 > **默认模块**应作为 `--lora_target` 参数的默认值，可使用 `--lora_target all` 参数指定全部模块以得到更好的效果。
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 26990530..99544a88 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -808,6 +808,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B",
         },
+        "Qwen1.5-110B": {
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-110B",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-110B",
+        },
         "Qwen1.5-MoE-A2.7B": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B",
@@ -844,6 +848,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat",
         },
+        "Qwen1.5-110B-Chat": {
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-110B-Chat",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-110B-Chat",
+        },
         "Qwen1.5-MoE-A2.7B-Chat": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat",

From cb8b8f40cd0c37b944f7a304c6233da7987ef35d Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 20:09:14 +0800
Subject: [PATCH 187/341] update readme

Former-commit-id: f3d4b46338d4d484b205d0651a1fa7b2e77a1654
---
 README.md    | 18 +++++++++---------
 README_zh.md | 18 +++++++++---------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index d6f59989..eb28c89d 100644
--- a/README.md
+++ b/README.md
@@ -292,15 +292,15 @@ huggingface-cli login
 
 \* *estimated*
 
-| Method            | Bits |   7B  |  13B  |  30B  |   70B  |  8x7B |  8x22B |
-| ----------------- | ---- | ----- | ----- | ----- | ------ | ----- | ------ |
-| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 900GB | 2400GB |
-| Full              |  16  |  60GB | 120GB | 300GB |  600GB | 400GB | 1200GB |
-| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB | 160GB |  400GB |
-| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB | 120GB |  320GB |
-| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |  60GB |  160GB |
-| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |  30GB |   96GB |
-| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |  18GB |   48GB |
+| Method            | Bits |   7B  |  13B  |  30B  |   70B  |  110B  |  8x7B |  8x22B |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ | ----- | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 2000GB | 900GB | 2400GB |
+| Full              |  16  |  60GB | 120GB | 300GB |  600GB |  900GB | 400GB | 1200GB |
+| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB |  360GB | 160GB |  400GB |
+| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB |  240GB | 120GB |  320GB |
+| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |  140GB |  60GB |  160GB |
+| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |   72GB |  30GB |   96GB |
+| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |   48GB |  18GB |   48GB |
 
 ## Getting Started
 
diff --git a/README_zh.md b/README_zh.md
index d0534858..bfcf3870 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -292,15 +292,15 @@ huggingface-cli login
 
 \* *估算值*
 
-| 方法              | 精度 |   7B  |  13B  |  30B  |   70B  |  8x7B |  8x22B |
-| ----------------- | ---- | ----- | ----- | ----- | ------ | ----- | ------ |
-| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 900GB | 2400GB |
-| Full              |  16  |  60GB | 120GB | 300GB |  600GB | 400GB | 1200GB |
-| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB | 160GB |  400GB |
-| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB | 120GB |  320GB |
-| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |  60GB |  160GB |
-| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |  30GB |   96GB |
-| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |  18GB |   48GB |
+| 方法               | 精度 |   7B  |  13B  |  30B  |   70B  |  110B  |  8x7B |  8x22B |
+| ----------------- | ---- | ----- | ----- | ----- | ------ | ------ | ----- | ------ |
+| Full              | AMP  | 120GB | 240GB | 600GB | 1200GB | 2000GB | 900GB | 2400GB |
+| Full              |  16  |  60GB | 120GB | 300GB |  600GB |  900GB | 400GB | 1200GB |
+| Freeze            |  16  |  20GB |  40GB |  80GB |  200GB |  360GB | 160GB |  400GB |
+| LoRA/GaLore/BAdam |  16  |  16GB |  32GB |  64GB |  160GB |  240GB | 120GB |  320GB |
+| QLoRA             |   8  |  10GB |  20GB |  40GB |   80GB |  140GB |  60GB |  160GB |
+| QLoRA             |   4  |   6GB |  12GB |  24GB |   48GB |   72GB |  30GB |   96GB |
+| QLoRA             |   2  |   4GB |   8GB |  16GB |   24GB |   48GB |  18GB |   48GB |
 
 ## 如何使用
 

From c501f377dd44d3977fec396e8f259147822028a1 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 23:18:00 +0800
Subject: [PATCH 188/341] release v0.7.0

Former-commit-id: 45bb89cb4d26a6b3fb5360bc90ab950738fe4920
---
 setup.py                                 |  2 +-
 src/llmtuner/__init__.py                 |  2 +-
 src/llmtuner/chat/hf_engine.py           |  2 +-
 src/llmtuner/chat/vllm_engine.py         | 53 +++++++++++++++++-------
 src/llmtuner/data/preprocess.py          | 50 ++++++++++++++++------
 src/llmtuner/extras/packages.py          |  4 ++
 src/llmtuner/hparams/parser.py           |  5 +--
 src/llmtuner/webui/components/chatbot.py |  4 +-
 8 files changed, 86 insertions(+), 36 deletions(-)

diff --git a/setup.py b/setup.py
index 9ef881e2..7ff3185f 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ extra_require = {
     "unsloth": ["torch==2.2.0", "unsloth[cu121-ampere-torch220]"],
     "galore": ["galore-torch"],
     "badam": ["badam"],
-    "vllm": ["vllm>=0.3.3"],
+    "vllm": ["vllm>=0.4.0"],
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],
     "awq": ["autoawq"],
diff --git a/src/llmtuner/__init__.py b/src/llmtuner/__init__.py
index 6cb78806..b3a980a5 100644
--- a/src/llmtuner/__init__.py
+++ b/src/llmtuner/__init__.py
@@ -7,5 +7,5 @@ from .train import export_model, run_exp
 from .webui import create_ui, create_web_demo
 
 
-__version__ = "0.6.4.dev0"
+__version__ = "0.7.0"
 __all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"]
diff --git a/src/llmtuner/chat/hf_engine.py b/src/llmtuner/chat/hf_engine.py
index f6f51898..e8f06a73 100644
--- a/src/llmtuner/chat/hf_engine.py
+++ b/src/llmtuner/chat/hf_engine.py
@@ -56,7 +56,7 @@ class HuggingfaceEngine(BaseEngine):
         input_kwargs: Optional[Dict[str, Any]] = {},
     ) -> Tuple[Dict[str, Any], int]:
         if processor is not None and image is not None and "<image>" not in messages[0]["content"]:
-            messages[0]["content"] = messages[0]["content"] + "<image>"
+            messages[0]["content"] = "<image>" + messages[0]["content"]
 
         paired_messages = messages + [{"role": "assistant", "content": ""}]
         prompt_ids, _ = template.encode_oneturn(
diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index a4caa53b..0f0dc366 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -11,10 +11,13 @@ from .base_engine import BaseEngine, Response
 if is_vllm_available():
     from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams
     from vllm.lora.request import LoRARequest
+    from vllm.sequence import MultiModalData
 
 
 if TYPE_CHECKING:
+    import torch
     from numpy.typing import NDArray
+    from transformers.image_processing_utils import BaseImageProcessor
 
     from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
 
@@ -39,20 +42,30 @@ class VllmEngine(BaseEngine):
         self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)
         self.generating_args = generating_args.to_dict()
 
-        engine_args = AsyncEngineArgs(
-            model=model_args.model_name_or_path,
-            trust_remote_code=True,
-            download_dir=model_args.cache_dir,
-            dtype=infer_dtype,
-            max_model_len=model_args.vllm_maxlen,
-            tensor_parallel_size=get_device_count() or 1,
-            gpu_memory_utilization=model_args.vllm_gpu_util,
-            disable_log_stats=True,
-            disable_log_requests=True,
-            enforce_eager=model_args.vllm_enforce_eager,
-            enable_lora=model_args.adapter_name_or_path is not None,
-        )
-        self.model = AsyncLLMEngine.from_engine_args(engine_args)
+        engine_args = {
+            "model": model_args.model_name_or_path,
+            "trust_remote_code": True,
+            "download_dir": model_args.cache_dir,
+            "dtype": infer_dtype,
+            "max_model_len": model_args.vllm_maxlen,
+            "tensor_parallel_size": get_device_count() or 1,
+            "gpu_memory_utilization": model_args.vllm_gpu_util,
+            "disable_log_stats": True,
+            "disable_log_requests": True,
+            "enforce_eager": model_args.vllm_enforce_eager,
+            "enable_lora": model_args.adapter_name_or_path is not None,
+        }
+
+        if model_args.visual_inputs:
+            # TODO: auto derive from config
+            # https://github.com/vllm-project/vllm/pull/3042#issuecomment-1984893549
+            self.image_feature_size = 576
+            engine_args["image_input_type"] = "pixel_values"
+            engine_args["image_token_id"] = self.tokenizer.convert_tokens_to_ids("<image>")
+            engine_args["image_input_shape"] = "1,3,336,336"
+            engine_args["image_feature_size"] = self.image_feature_size
+
+        self.model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args))
         if model_args.adapter_name_or_path is not None:
             self.lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
         else:
@@ -67,6 +80,9 @@ class VllmEngine(BaseEngine):
         **input_kwargs,
     ) -> AsyncIterator["RequestOutput"]:
         request_id = "chatcmpl-{}".format(uuid.uuid4().hex)
+        if self.processor is not None and image is not None and "<image>" not in messages[0]["content"]:
+            messages[0]["content"] = "<image>" * self.image_feature_size + messages[0]["content"]
+
         paired_messages = messages + [{"role": "assistant", "content": ""}]
         prompt_ids, _ = self.template.encode_oneturn(
             tokenizer=self.tokenizer, messages=paired_messages, system=system, tools=tools
@@ -110,12 +126,21 @@ class VllmEngine(BaseEngine):
             max_tokens=generating_args["max_new_tokens"],
             skip_special_tokens=True,
         )
+
+        if self.processor is not None and image is not None:
+            image_processor: "BaseImageProcessor" = getattr(self.processor, "image_processor")
+            pixel_values: "torch.Tensor" = image_processor(image, return_tensors="pt")["pixel_values"]
+            multi_modal_data = MultiModalData(type=MultiModalData.Type.IMAGE, data=pixel_values)
+        else:
+            multi_modal_data = None
+
         result_generator = self.model.generate(
             prompt=None,
             sampling_params=sampling_params,
             request_id=request_id,
             prompt_token_ids=prompt_ids,
             lora_request=self.lora_request,
+            multi_modal_data=multi_modal_data,
         )
         return result_generator
 
diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 18681872..38211b0c 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -1,14 +1,20 @@
 from functools import partial
 from itertools import chain
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple
 
 from ..extras.constants import IGNORE_INDEX
 from ..extras.logging import get_logger
+from ..extras.packages import is_pillow_available
 from .utils import Role
 
 
+if is_pillow_available():
+    from PIL import Image
+
+
 if TYPE_CHECKING:
-    from PIL.Image import Image
+    from numpy.typing import NDArray
+    from PIL.Image import Image as ImageObject
     from transformers import ProcessorMixin, Seq2SeqTrainingArguments
     from transformers.image_processing_utils import BaseImageProcessor
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -20,12 +26,11 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
-def _preprocess_visual_inputs(model_inputs: Dict[str, Any], processor: "ProcessorMixin", image: "Image") -> None:
+def _preprocess_visual_inputs(images: Sequence["ImageObject"], processor: "ProcessorMixin") -> "NDArray":
+    # process visual inputs (currently only supports a single image)
     image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
-    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"][0]
-    if "pixel_values" not in model_inputs:
-        model_inputs["pixel_values"] = []
-    model_inputs["pixel_values"].append(pixel_values)
+    image = images[0] if len(images) != 0 else Image.new("RGB", (100, 100), (255, 255, 255))
+    return image_processor(image, return_tensors="pt")["pixel_values"][0]
 
 
 def preprocess_pretrain_dataset(
@@ -66,11 +71,17 @@ def preprocess_supervised_dataset(
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
     model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
+    if processor is not None:
+        model_inputs["pixel_values"] = []
+        preprocess_visual_inputs = partial(_preprocess_visual_inputs, processor=processor)
 
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
             continue
 
+        if processor is not None:
+            examples["prompt"][i][0]["content"] = "<image>" + examples["prompt"][i][0]["content"]
+
         messages = examples["prompt"][i] + examples["response"][i]
         input_ids, labels = [], []
         for turn_idx, (source_ids, target_ids) in enumerate(
@@ -100,8 +111,8 @@ def preprocess_supervised_dataset(
         model_inputs["input_ids"].append(input_ids)
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
-        if processor is not None and "images" in examples:
-            _preprocess_visual_inputs(model_inputs, processor, examples["images"][i][0])
+        if processor is not None:
+            model_inputs["pixel_values"].append(preprocess_visual_inputs(examples["images"][i]))
 
     return model_inputs
 
@@ -161,11 +172,17 @@ def preprocess_unsupervised_dataset(
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X` and labels with format `Y <eos>`
     model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
+    if processor is not None:
+        model_inputs["pixel_values"] = []
+        preprocess_visual_inputs = partial(_preprocess_visual_inputs, processor=processor)
 
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1:
             continue
 
+        if processor is not None:
+            examples["prompt"][i][0]["content"] = "<image>" + examples["prompt"][i][0]["content"]
+
         if len(examples["response"][i]) == 1:
             messages = examples["prompt"][i] + examples["response"][i]
         else:
@@ -186,8 +203,8 @@ def preprocess_unsupervised_dataset(
         model_inputs["input_ids"].append(input_ids)
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
-        if processor is not None and "images" in examples:
-            _preprocess_visual_inputs(model_inputs, processor, examples["images"][i][0])
+        if processor is not None:
+            model_inputs["pixel_values"].append(preprocess_visual_inputs(examples["images"][i]))
 
     return model_inputs
 
@@ -201,10 +218,17 @@ def preprocess_pairwise_dataset(
 ) -> Dict[str, List[List[int]]]:
     # build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>`
     model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []}
+    if processor is not None:
+        model_inputs["pixel_values"] = []
+        preprocess_visual_inputs = partial(_preprocess_visual_inputs, processor=processor)
+
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2:
             continue
 
+        if processor is not None:
+            examples["prompt"][i][0]["content"] = "<image>" + examples["prompt"][i][0]["content"]
+
         chosen_messages = examples["prompt"][i] + [examples["response"][i][0]]
         rejected_messages = examples["prompt"][i] + [examples["response"][i][1]]
         prompt_ids, chosen_ids = template.encode_oneturn(
@@ -231,8 +255,8 @@ def preprocess_pairwise_dataset(
         model_inputs["prompt_ids"].append(prompt_ids)
         model_inputs["chosen_ids"].append(chosen_ids)
         model_inputs["rejected_ids"].append(rejected_ids)
-        if processor is not None and "images" in examples:
-            _preprocess_visual_inputs(model_inputs, processor, examples["images"][i][0])
+        if processor is not None:
+            model_inputs["pixel_values"].append(preprocess_visual_inputs(examples["images"][i]))
 
     return model_inputs
 
diff --git a/src/llmtuner/extras/packages.py b/src/llmtuner/extras/packages.py
index aeeba084..a7317eec 100644
--- a/src/llmtuner/extras/packages.py
+++ b/src/llmtuner/extras/packages.py
@@ -48,6 +48,10 @@ def is_nltk_available():
     return _is_package_available("nltk")
 
 
+def is_pillow_available():
+    return _is_package_available("PIL")
+
+
 def is_requests_available():
     return _is_package_available("requests")
 
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index aa046837..977d7cf4 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -89,7 +89,7 @@ def _check_extra_dependencies(
         require_version("mixture-of-depth>=1.1.6", "To fix: pip install mixture-of-depth>=1.1.6")
 
     if model_args.infer_backend == "vllm":
-        require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3")
+        require_version("vllm>=0.4.0", "To fix: pip install vllm>=0.4.0")
 
     if finetuning_args.use_galore:
         require_version("galore_torch", "To fix: pip install galore_torch")
@@ -320,9 +320,6 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
         if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1:
             raise ValueError("vLLM only accepts a single adapter. Merge them first.")
 
-        if model_args.visual_inputs:
-            raise ValueError("vLLM engine does not support MLLM yet. Stay tuned.")
-
     if finetuning_args.stage == "rm" and model_args.visual_inputs:
         raise ValueError("Reward server does not support MLLM yet. Stay tuned.")
 
diff --git a/src/llmtuner/webui/components/chatbot.py b/src/llmtuner/webui/components/chatbot.py
index 15c1fc83..0a55460c 100644
--- a/src/llmtuner/webui/components/chatbot.py
+++ b/src/llmtuner/webui/components/chatbot.py
@@ -27,10 +27,10 @@ def create_chat_box(
                     with gr.Column():
                         role = gr.Dropdown(choices=[Role.USER.value, Role.OBSERVATION.value], value=Role.USER.value)
                         system = gr.Textbox(show_label=False)
-                        tools = gr.Textbox(show_label=False, lines=4)
+                        tools = gr.Textbox(show_label=False, lines=3)
 
                     with gr.Column() as image_box:
-                        image = gr.Image(type="numpy")
+                        image = gr.Image(sources=["upload"], type="numpy")
 
                 query = gr.Textbox(show_label=False, lines=8)
                 submit_btn = gr.Button(variant="primary")

From 544a6259b681658df5cf57b0eeca9083b8335ecf Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 26 Apr 2024 23:39:19 +0800
Subject: [PATCH 189/341] update readme

Former-commit-id: c9190fe36f511c3a5149d45c85a10b02a57fa88a
---
 README.md    | 3 ++-
 README_zh.md | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index eb28c89d..789c20ce 100644
--- a/README.md
+++ b/README.md
@@ -238,6 +238,7 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ
 - [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
 - [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
 - [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)
+- [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
 - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
 - [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
@@ -257,7 +258,7 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Orca DPO (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
-- [DPO mix (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
+- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
 
 </details>
diff --git a/README_zh.md b/README_zh.md
index bfcf3870..fb02aa95 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -238,6 +238,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
 - [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
 - [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)
+- [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
 - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
 - [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
@@ -257,7 +258,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Orca DPO (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
-- [DPO mix (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
+- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
 - [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
 
 </details>

From a412b4ed4a9c114667b0f942f2a8f441dfd2d6c7 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 28 Apr 2024 01:50:30 +0800
Subject: [PATCH 190/341] add models to 0.7.0

Former-commit-id: 436d3754452f839c617839ab3bbaacc4a8908e19
---
 src/llmtuner/data/template.py    |  2 +-
 src/llmtuner/extras/constants.py | 77 +++++++++++++++++++++++++++++++-
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 73b22eb7..ada6cfcd 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -870,7 +870,7 @@ _register_template(
     format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]),
     format_assistant=StringFormatter(slots=["\n{{content}}", {"eos_token"}]),
     format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
-    default_system="You are a friendly chatbot who always responds in the style of a pirate",
+    default_system="You are Zephyr, a helpful assistant.",
 )
 
 
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 99544a88..0329b374 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -306,9 +306,11 @@ register_model_group(
         },
         "DeepSeek-Math-7B-Base": {
             DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-base",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-math-7b-base",
         },
         "DeepSeek-Math-7B-Chat": {
             DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-instruct",
+            DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-math-7b-instruct",
         },
         "DeepSeek-MoE-16B-Base": {
             DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-base",
@@ -616,6 +618,7 @@ register_model_group(
         },
         "Mixtral-8x22B-v0.1": {
             DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-v0.1",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x22B-v0.1",
         },
         "Mixtral-8x22B-v0.1-Chat": {
             DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-Instruct-v0.1",
@@ -644,7 +647,7 @@ register_model_group(
     models={
         "OpenChat3.5-7B-Chat": {
             DownloadSource.DEFAULT: "openchat/openchat-3.5-0106",
-            DownloadSource.MODELSCOPE: "myxiongmodel/openchat_3.5",
+            DownloadSource.MODELSCOPE: "xcwzxcwz/openchat-3.5-0106",
         }
     },
     template="openchat",
@@ -696,9 +699,11 @@ register_model_group(
     models={
         "Phi3-3.8B-4k-Chat": {
             DownloadSource.DEFAULT: "microsoft/Phi-3-mini-4k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-4k-instruct",
         },
         "Phi3-3.8B-128k-Chat": {
             DownloadSource.DEFAULT: "microsoft/Phi-3-mini-128k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-128k-instruct",
         },
     },
     module="qkv_proj",
@@ -912,6 +917,10 @@ register_model_group(
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-AWQ",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-AWQ",
         },
+        "Qwen1.5-110B-int4-Chat": {
+            DownloadSource.DEFAULT: "Qwen/Qwen1.5-110B-Chat-AWQ",
+            DownloadSource.MODELSCOPE: "qwen/Qwen1.5-110B-Chat-AWQ",
+        },
         "Qwen1.5-MoE-A2.7B-int4-Chat": {
             DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
             DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
@@ -953,12 +962,15 @@ register_model_group(
     models={
         "StarCoder2-3B": {
             DownloadSource.DEFAULT: "bigcode/starcoder2-3b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-3b",
         },
         "StarCoder2-7B": {
             DownloadSource.DEFAULT: "bigcode/starcoder2-7b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-7b",
         },
         "StarCoder2-15B": {
             DownloadSource.DEFAULT: "bigcode/starcoder2-15b",
+            DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-15b",
         },
     }
 )
@@ -981,17 +993,53 @@ register_model_group(
 
 register_model_group(
     models={
+        "XuanYuan-6B": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B",
+        },
         "XuanYuan-70B": {
             DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B",
+        },
+        "XuanYuan-2-70B": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B",
+        },
+        "XuanYuan-6B-Chat": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat",
         },
         "XuanYuan-70B-Chat": {
             DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat",
+        },
+        "XuanYuan-2-70B-Chat": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat",
+        },
+        "XuanYuan-6B-int8-Chat": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat-8bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat-8bit",
+        },
+        "XuanYuan-6B-int4-Chat": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat-4bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat-4bit",
         },
         "XuanYuan-70B-int8-Chat": {
             DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit",
         },
         "XuanYuan-70B-int4-Chat": {
             DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit",
+        },
+        "XuanYuan-2-70B-int8-Chat": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit",
+        },
+        "XuanYuan-2-70B-int4-Chat": {
+            DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit",
+            DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit",
         },
     },
     template="xuanyuan",
@@ -1028,6 +1076,30 @@ register_model_group(
             DownloadSource.DEFAULT: "xverse/XVERSE-65B-Chat",
             DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-Chat",
         },
+        "XVERSE-MoE-A4.2B": {
+            DownloadSource.DEFAULT: "xverse/XVERSE-MoE-A4.2B",
+            DownloadSource.MODELSCOPE: "xverse/XVERSE-MoE-A4.2B",
+        },
+        "XVERSE-7B-int8-Chat": {
+            DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat-GPTQ-Int8",
+        },
+        "XVERSE-7B-int4-Chat": {
+            DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat-GPTQ-Int4",
+        },
+        "XVERSE-13B-int8-Chat": {
+            DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat-GPTQ-Int8",
+            DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat-GPTQ-Int8",
+        },
+        "XVERSE-13B-int4-Chat": {
+            DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat-GPTQ-Int4",
+        },
+        "XVERSE-65B-int4-Chat": {
+            DownloadSource.DEFAULT: "xverse/XVERSE-65B-Chat-GPTQ-Int4",
+            DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-Chat-GPTQ-Int4",
+        },
     },
     template="xverse",
 )
@@ -1120,6 +1192,9 @@ register_model_group(
             DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-beta",
             DownloadSource.MODELSCOPE: "modelscope/zephyr-7b-beta",
         },
+        "Zephyr-141B-ORPO-Chat": {
+            DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1",
+        },
     },
     template="zephyr",
 )

From 4dcd47100dfab919feb98cf5d7b214164588bf51 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 28 Apr 2024 03:01:49 +0800
Subject: [PATCH 191/341] fix llava rlhf

Former-commit-id: f6863cbbcbf960d6481296c6cae3e40fd70e4e14
---
 src/llmtuner/model/__init__.py        |  3 +-
 src/llmtuner/model/loader.py          |  7 ++--
 src/llmtuner/model/patcher.py         | 16 ++++++--
 src/llmtuner/model/utils/misc.py      | 37 +----------------
 src/llmtuner/model/utils/valuehead.py | 59 +++++++++++++++++++++++++++
 5 files changed, 79 insertions(+), 43 deletions(-)
 create mode 100644 src/llmtuner/model/utils/valuehead.py

diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index 1824f084..88f666c8 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,5 +1,6 @@
 from .loader import load_config, load_model, load_tokenizer
-from .utils.misc import find_all_linear_modules, load_valuehead_params
+from .utils.misc import find_all_linear_modules
+from .utils.valuehead import load_valuehead_params
 
 
 __all__ = [
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 0ff7a350..ead6178f 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -7,9 +7,10 @@ from ..extras.logging import get_logger
 from ..extras.misc import count_parameters, try_download_model_from_ms
 from .adapter import init_adapter
 from .patcher import patch_config, patch_model, patch_tokenizer, patch_valuehead_model
-from .utils.misc import load_valuehead_params, register_autoclass
+from .utils.misc import register_autoclass
 from .utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
 from .utils.unsloth import load_unsloth_pretrained_model
+from .utils.valuehead import load_valuehead_params
 
 
 if TYPE_CHECKING:
@@ -105,7 +106,7 @@ def load_model(
     """
     init_kwargs = _get_init_kwargs(model_args)
     config = load_config(model_args)
-    patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
+    patch_config(config, tokenizer, model_args, init_kwargs, is_trainable, add_valuehead)
 
     model = None
     lazy_load = False
@@ -130,7 +131,7 @@ def load_model(
             model = convert_pretrained_model_to_mod(model, config, model_args)
 
     if not lazy_load:
-        patch_model(model, tokenizer, model_args, is_trainable)
+        patch_model(model, tokenizer, model_args, is_trainable, add_valuehead)
         register_autoclass(config, model, tokenizer)
 
     model = init_adapter(config, model, model_args, finetuning_args, is_trainable)
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 94d99644..31cba492 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -15,6 +15,7 @@ from .utils.longlora import configure_longlora
 from .utils.moe import add_z3_leaf_module, configure_moe
 from .utils.quantization import configure_quantization
 from .utils.rope import configure_rope
+from .utils.valuehead import configure_valuehead, prepare_valuehead_model
 from .utils.visual import autocast_projector_dtype
 
 
@@ -39,6 +40,7 @@ def patch_config(
     model_args: "ModelArguments",
     init_kwargs: Dict[str, Any],
     is_trainable: bool,
+    add_valuehead: bool,
 ) -> None:
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
@@ -49,6 +51,9 @@ def patch_config(
     configure_quantization(config, tokenizer, model_args, init_kwargs)
     configure_moe(config, model_args, is_trainable)
 
+    if add_valuehead:
+        configure_valuehead(config)
+
     if model_args.use_cache and not is_trainable:
         setattr(config, "use_cache", True)
         logger.info("Using KV cache for faster generation.")
@@ -73,7 +78,11 @@ def patch_config(
 
 
 def patch_model(
-    model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments", is_trainable: bool
+    model: "PreTrainedModel",
+    tokenizer: "PreTrainedTokenizer",
+    model_args: "ModelArguments",
+    is_trainable: bool,
+    add_valuehead: bool,
 ) -> None:
     gen_config = model.generation_config  # check and fix generation config
     if not gen_config.do_sample and (
@@ -86,9 +95,8 @@ def patch_model(
     if "GenerationMixin" not in str(model.generate.__func__):
         model.generate = MethodType(PreTrainedModel.generate, model)
 
-    if is_trainable and getattr(model.config, "model_type", None) == "chatglm":
-        setattr(model, "lm_head", model.transformer.output_layer)
-        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
+    if add_valuehead:
+        prepare_valuehead_model(model)
 
     if model_args.resize_vocab:
         resize_embedding_layer(model, tokenizer)
diff --git a/src/llmtuner/model/utils/misc.py b/src/llmtuner/model/utils/misc.py
index 57e772f7..eca68866 100644
--- a/src/llmtuner/model/utils/misc.py
+++ b/src/llmtuner/model/utils/misc.py
@@ -1,18 +1,13 @@
-from typing import TYPE_CHECKING, Dict, List
+from typing import TYPE_CHECKING, List
 
 import torch
-from transformers import PreTrainedModel
-from transformers.utils import cached_file
 
-from ...extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
 from ...extras.logging import get_logger
 from .quantization import QuantizationMethod
 
 
 if TYPE_CHECKING:
-    from transformers import PretrainedConfig, PreTrainedTokenizer
-
-    from ...hparams import ModelArguments
+    from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
 
 
 logger = get_logger(__name__)
@@ -74,34 +69,6 @@ def find_expanded_modules(model: "PreTrainedModel", target_modules: List[str], n
     return module_names
 
 
-def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]:
-    r"""
-    Loads value head parameters from Hugging Face Hub or local disk.
-
-    Returns: dict with keys `v_head.summary.weight` and `v_head.summary.bias`.
-    """
-    kwargs = {"path_or_repo_id": path_or_repo_id, "cache_dir": model_args.cache_dir, "token": model_args.hf_hub_token}
-
-    try:
-        from safetensors import safe_open
-
-        vhead_file = cached_file(filename=V_HEAD_SAFE_WEIGHTS_NAME, **kwargs)
-        with safe_open(vhead_file, framework="pt", device="cpu") as f:
-            return {key: f.get_tensor(key) for key in f.keys()}
-    except Exception as err:
-        logger.info("Failed to load {}: {}".format(V_HEAD_SAFE_WEIGHTS_NAME, str(err)))
-
-    try:
-        vhead_file = cached_file(filename=V_HEAD_WEIGHTS_NAME, **kwargs)
-        return torch.load(vhead_file, map_location="cpu")
-    except Exception as err:
-        logger.info("Failed to load {}: {}".format(V_HEAD_WEIGHTS_NAME, str(err)))
-
-    logger.info("Provided path ({}) does not contain value head weights.".format(path_or_repo_id))
-    logger.info("Ignore these messages if you are not resuming the training of a value head model.")
-    return None
-
-
 def register_autoclass(config: "PretrainedConfig", model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer"):
     if "AutoConfig" in getattr(config, "auto_map", {}):
         config.__class__.register_for_auto_class()
diff --git a/src/llmtuner/model/utils/valuehead.py b/src/llmtuner/model/utils/valuehead.py
new file mode 100644
index 00000000..a192dcfa
--- /dev/null
+++ b/src/llmtuner/model/utils/valuehead.py
@@ -0,0 +1,59 @@
+from typing import TYPE_CHECKING, Dict
+
+import torch
+from transformers.utils import cached_file
+
+from ...extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
+from ...extras.logging import get_logger
+
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig, PreTrainedModel
+
+    from ...hparams import ModelArguments
+
+
+logger = get_logger(__name__)
+
+
+def configure_valuehead(config: "PretrainedConfig") -> None:
+    if getattr(config, "model_type", None) == "llava":
+        setattr(config, "hidden_size", getattr(config.vision_config, "intermediate_size", None))
+
+
+def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]:
+    r"""
+    Loads value head parameters from Hugging Face Hub or local disk.
+
+    Returns: dict with keys `v_head.summary.weight` and `v_head.summary.bias`.
+    """
+    kwargs = {"path_or_repo_id": path_or_repo_id, "cache_dir": model_args.cache_dir, "token": model_args.hf_hub_token}
+
+    try:
+        from safetensors import safe_open
+
+        vhead_file = cached_file(filename=V_HEAD_SAFE_WEIGHTS_NAME, **kwargs)
+        with safe_open(vhead_file, framework="pt", device="cpu") as f:
+            return {key: f.get_tensor(key) for key in f.keys()}
+    except Exception as err:
+        logger.info("Failed to load {}: {}".format(V_HEAD_SAFE_WEIGHTS_NAME, str(err)))
+
+    try:
+        vhead_file = cached_file(filename=V_HEAD_WEIGHTS_NAME, **kwargs)
+        return torch.load(vhead_file, map_location="cpu")
+    except Exception as err:
+        logger.info("Failed to load {}: {}".format(V_HEAD_WEIGHTS_NAME, str(err)))
+
+    logger.info("Provided path ({}) does not contain value head weights.".format(path_or_repo_id))
+    logger.info("Ignore these messages if you are not resuming the training of a value head model.")
+    return None
+
+
+def prepare_valuehead_model(model: "PreTrainedModel") -> None:
+    if getattr(model.config, "model_type", None) == "llava":
+        setattr(model, "lm_head", model.language_model.get_output_embeddings())
+        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
+
+    if getattr(model.config, "model_type", None) == "chatglm":
+        setattr(model, "lm_head", model.transformer.output_layer)
+        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])

From 3cef84407926d830753b430d59fd5363bf863e2b Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 28 Apr 2024 03:49:13 +0800
Subject: [PATCH 192/341] fix setup

Former-commit-id: 7d3e7db46a5f8672dd57fa5fcc03822e175047f9
---
 README.md    | 2 +-
 README_zh.md | 2 +-
 setup.py     | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 789c20ce..04e5aa5b 100644
--- a/README.md
+++ b/README.md
@@ -322,7 +322,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-Extra dependencies available: deepspeed, metrics, unsloth, galore, badam, vllm, bitsandbytes, gptq, awq, aqlm, qwen, modelscope, quality
+Extra dependencies available: deepspeed, metrics, galore, badam, vllm, bitsandbytes, gptq, awq, aqlm, qwen, modelscope, quality
 
 <details><summary>For Windows users</summary>
 
diff --git a/README_zh.md b/README_zh.md
index fb02aa95..2240c688 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -322,7 +322,7 @@ cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-可选的额外依赖项：deepspeed、metrics、unsloth、galore、badam、vllm、bitsandbytes、gptq、awq、aqlm、qwen、modelscope、quality
+可选的额外依赖项：deepspeed、metrics、galore、badam、vllm、bitsandbytes、gptq、awq、aqlm、qwen、modelscope、quality
 
 <details><summary>Windows 用户指南</summary>
 
diff --git a/setup.py b/setup.py
index 7ff3185f..6a03138d 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,6 @@ def get_requires():
 extra_require = {
     "deepspeed": ["deepspeed>=0.10.0"],
     "metrics": ["nltk", "jieba", "rouge-chinese"],
-    "unsloth": ["torch==2.2.0", "unsloth[cu121-ampere-torch220]"],
     "galore": ["galore-torch"],
     "badam": ["badam"],
     "vllm": ["vllm>=0.4.0"],

From 7641a214d81bc03844208bf48fa9374ef873a834 Mon Sep 17 00:00:00 2001
From: codingma <codingma@163.com>
Date: Sun, 28 Apr 2024 11:31:34 +0800
Subject: [PATCH 193/341] support BAdam in WebUI

Former-commit-id: 1247154dd7d5eba5d11c4bb8504bf551ab49eb72
---
 src/llmtuner/webui/components/train.py |  26 ++++++
 src/llmtuner/webui/locales.py          | 109 +++++++++++++++++++++++++
 src/llmtuner/webui/runner.py           |   9 ++
 3 files changed, 144 insertions(+)

diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 7dc324af..9d93a9b6 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -210,6 +210,32 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         )
     )
 
+    with gr.Accordion(open=False) as badam_tab:
+        with gr.Row():
+            use_badam = gr.Checkbox()
+            badam_mode = gr.Dropdown(choices=["layer", "ratio"], value="layer")
+            badam_mask_mode = gr.Dropdown(choices=["adjacent", "scatter"], value="adjacent")
+            badam_switch_mode = gr.Dropdown(choices=["ascending", "descending", "random", "fixed"], value="ascending")
+            badam_update_ratio = gr.Slider(value=0, minimum=0, maximum=1, step=0.01)
+            badam_switch_block_every = gr.Slider(value=50, minimum=-1, maximum=200, step=1)
+
+            badam_verbose = gr.Dropdown(choices=[0, 1, 2], value=0)
+
+    input_elems.update({use_badam, badam_mode, badam_switch_block_every, badam_switch_mode, badam_update_ratio,
+                        badam_mask_mode, badam_verbose})
+    elem_dict.update(
+        dict(
+            badam_tab=badam_tab,
+            use_badam=use_badam,
+            badam_mode=badam_mode,
+            badam_switch_block_every=badam_switch_block_every,
+            badam_switch_mode=badam_switch_mode,
+            badam_update_ratio=badam_update_ratio,
+            badam_mask_mode=badam_mask_mode,
+            badam_verbose=badam_verbose,
+        )
+    )
+
     with gr.Row():
         cmd_preview_btn = gr.Button()
         arg_save_btn = gr.Button()
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index d341c7b6..d3dd4dc2 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -891,6 +891,115 @@ LOCALES = {
             "info": "应用 GaLore 的模块名称。使用英文逗号分隔多个名称。",
         },
     },
+    "badam_tab": {
+        "en": {
+            "label": "BAdam configurations",
+        },
+        "ru": {
+            "label": "Конфигурации BAdam",
+        },
+        "zh": {
+            "label": "BAdam 参数设置",
+        },
+    },
+    "use_badam": {
+        "en": {
+            "label": "Use BAdam",
+            "info": "Enable the block coordinate optimization with Adam.",
+        },
+        "ru": {
+            "label": "Использовать BAdam",
+            "info": "Включите блочную оптимизацию координат с Adam.",
+        },
+        "zh": {
+            "label": "使用 BAdam",
+            "info": "使用多Block协同的Adam优化器。",
+        },
+    },
+    "badam_mode": {
+        "en": {
+            "label": "BAdam mode",
+            "info": "Whether to use layer-wise or ratio-wise BAdam optimizer.",
+        },
+        "ru": {
+            "label": "Режим BAdam",
+            "info": "Использовать оптимизатор BAdam с обработкой слоев или с обработкой коэффициентов.",
+        },
+        "zh": {
+            "label": "BAdam 模式",
+            "info": "使用layer或者ratio比例模式。",
+        },
+    },
+    "badam_switch_block_every": {
+        "en": {
+            "label": "Switch block frequency",
+            "info": "How often to switch model's block update. Set to -1 to disable the block update.",
+        },
+        "ru": {
+            "label": "Частота переключения",
+            "info": "Как часто переключать обновление блока модели. Установите -1, чтобы отключить обновление блока.",
+        },
+        "zh": {
+            "label": "切换block的频率",
+            "info": "控制切换block切换的频率，如果是-1,则不切换。",
+        },
+    },
+    "badam_switch_mode": {
+        "en": {
+            "label": "Switch mode",
+            "info": "The strategy of picking block to update for layer-wise BAdam.",
+        },
+        "ru": {
+            "label": "Переключить режим",
+            "info": "Стратегия выбора блока для обновления в методе BAdam по слоям.",
+        },
+        "zh": {
+            "label": "Block切换策略",
+            "info": "如果是layer类型的训练模式，如何切换block。",
+        },
+    },
+    "badam_update_ratio": {
+        "en": {
+            "label": "Update ratio",
+            "info": "The ratio of the update for ratio-wise BAdam.",
+        },
+        "ru": {
+            "label": "Коэффициент обновления",
+            "info": "Коэффициент обновления для метода BAdam, основанного на коэффициентах.",
+        },
+        "zh": {
+            "label": "Block更新比例",
+            "info": "如果是比例类型的训练模式，block每次更新的范围比例。",
+        },
+    },
+    "badam_mask_mode": {
+        "en": {
+            "label": "Mask mode",
+            "info": "The mode of the mask for BAdam optimizer.",
+        },
+        "ru": {
+            "label": "Режим маски",
+            "info": "Режим маски для оптимизатора BAdam.",
+        },
+        "zh": {
+            "label": "Mask模式",
+            "info": "BAdam优化器内训练参数的mask关系。",
+        },
+    },
+    "badam_verbose": {
+        "en": {
+            "label": "Verbosity level",
+            "info": "0 for no print, 1 for print the block prefix, 2 for print trainable parameters.",
+        },
+        "ru": {
+            "label": "Уровень многословности",
+            "info": "0 для отсутствия печати, 1 для печати префикса блока, 2 для печати обучаемых параметров.",
+        },
+        "zh": {
+            "label": "输出日志级别",
+            "info": "0：不输出，1：输出block前缀， 1：输出可训练的参数。",
+        },
+    },
     "cmd_preview_btn": {
         "en": {
             "value": "Preview command",
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 8054484f..52584f31 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -151,6 +151,7 @@ class Runner:
             fp16=(get("train.compute_type") == "fp16"),
             bf16=(get("train.compute_type") == "bf16"),
             pure_bf16=(get("train.compute_type") == "pure_bf16"),
+            use_badam=get("train.use_badam"),
         )
         args["disable_tqdm"] = True
 
@@ -198,6 +199,14 @@ class Runner:
             args["galore_scale"] = get("train.galore_scale")
             args["galore_target"] = get("train.galore_target")
 
+        if args["use_badam"]:
+            args["badam_mode"] = get("train.badam_mode")
+            args["badam_switch_block_every"] = get("train.badam_switch_block_every")
+            args["badam_switch_mode"] = get("train.badam_switch_mode")
+            args["badam_update_ratio"] = get("train.badam_update_ratio")
+            args["badam_mask_mode"] = get("train.badam_mask_mode")
+            args["badam_verbose"] = get("train.badam_verbose")
+
         return args
 
     def _parse_eval_args(self, data: Dict["Component", Any]) -> Dict[str, Any]:

From dfd153cc813fc4ed815018c26f852ef15883888c Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Sun, 28 Apr 2024 14:27:45 +0800
Subject: [PATCH 194/341] added the second sharegpt format

Former-commit-id: 6d140ac98a78ecc0a713842bb917dc8eb14450cb
---
 data/README.md    | 32 ++++++++++++++++++++++++++++----
 data/README_zh.md | 26 +++++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/data/README.md b/data/README.md
index 6de0430f..9158233f 100644
--- a/data/README.md
+++ b/data/README.md
@@ -94,20 +94,44 @@ Remember to set `"ranking": true` for the preference datasets.
 The dataset in sharegpt format should follow the below format:
 
 ```json
+# The first sharegpt format
 [
   {
     "conversations": [
       {
         "from": "human",
-        "value": "user instruction"
+        "value": "用户指令"
       },
       {
         "from": "gpt",
-        "value": "model response"
+        "value": "模型回答"
       }
     ],
-    "system": "system prompt (optional)",
-    "tools": "tool description (optional)"
+    "system": "系统提示词（选填）",
+    "tools": "工具描述（选填）"
+  }
+]
+
+# The second sharegpt format
+
+[
+  {
+    "type": "chatml",
+    "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "Tell me something about large language models."
+    },
+    {
+      "role": "assistant",
+      "content": "Large language models are a type of language model  ..."
+    }
+  ],
+  "source": "unknown"
   }
 ]
 ```
diff --git a/data/README_zh.md b/data/README_zh.md
index fb6cb1d9..9abef5b6 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -37,7 +37,7 @@
 
 ----
 
-该项目目前支持两种格式的数据集：**alpaca** 和 **sharegpt**，其中 alpaca 格式的数据集按照以下方式组织：
+该项目目前支持三种格式的数据集：**alpaca** 和 **sharegpt**，其中 alpaca 格式的数据集按照以下方式组织：
 
 ```json
 [
@@ -94,6 +94,7 @@
 而 sharegpt 格式的数据集按照以下方式组织：
 
 ```json
+# 第一种sharegpt格式
 [
   {
     "conversations": [
@@ -110,6 +111,29 @@
     "tools": "工具描述（选填）"
   }
 ]
+
+# 第二种sharegpt格式
+
+[
+  {
+    "type": "chatml",
+    "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "Tell me something about large language models."
+    },
+    {
+      "role": "assistant",
+      "content": "Large language models are a type of language model  ..."
+    }
+  ],
+  "source": "unknown"
+  }
+]
 ```
 
 对于上述格式的数据，`dataset_info.json` 中的 `columns` 应为：

From 3d88589c0f37990ea371b5fb18886dcbf48c92d1 Mon Sep 17 00:00:00 2001
From: khazic <khazzz1c@gmail.com>
Date: Sun, 28 Apr 2024 14:30:05 +0800
Subject: [PATCH 195/341] Upgrade the second sharegpt format

Former-commit-id: 057f992a666b029d207a3dc7dfc353f9abcf8316
---
 data/README_zh.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/data/README_zh.md b/data/README_zh.md
index 9abef5b6..5a9db167 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -120,15 +120,15 @@
     "messages": [
     {
       "role": "system",
-      "content": "You are a helpful assistant."
+      "content": "你是一个很有用的AI助手"
     },
     {
       "role": "user",
-      "content": "Tell me something about large language models."
+      "content": "告诉我一些关于大模型的一些信息"
     },
     {
       "role": "assistant",
-      "content": "Large language models are a type of language model  ..."
+      "content": "大模型是一种语言模型"
     }
   ],
   "source": "unknown"

From 57fcdca336d547a53b7a81ab592575243bddf0cf Mon Sep 17 00:00:00 2001
From: Lao <khazzz1c@gmail.com>
Date: Sun, 28 Apr 2024 23:31:37 +0800
Subject: [PATCH 196/341] Update README_zh.md

Former-commit-id: bacc8588dc7b0b43c240189ecf4336bedc299357
---
 data/README_zh.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/README_zh.md b/data/README_zh.md
index 5a9db167..1fe98a9e 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -37,7 +37,7 @@
 
 ----
 
-该项目目前支持三种格式的数据集：**alpaca** 和 **sharegpt**，其中 alpaca 格式的数据集按照以下方式组织：
+该项目目前支持二种格式的数据集：**alpaca** 和 **sharegpt**，其中 alpaca 格式的数据集按照以下方式组织：
 
 ```json
 [

From 2d95127c33a0321e952bc12650401d278f0351d5 Mon Sep 17 00:00:00 2001
From: zhaonx <953608703@qq.com>
Date: Tue, 30 Apr 2024 17:17:09 +0800
Subject: [PATCH 197/341] "add support for vllm api stop parameter"

Former-commit-id: b9f21fa639b66db09c79404d885661c96bdf9395
---
 src/llmtuner/api/app.py                 | 2 ++
 src/llmtuner/api/protocol.py            | 3 ++-
 src/llmtuner/chat/vllm_engine.py        | 3 +++
 src/llmtuner/hparams/generating_args.py | 7 +++++--
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py
index 3f06fef1..892bf901 100644
--- a/src/llmtuner/api/app.py
+++ b/src/llmtuner/api/app.py
@@ -141,6 +141,7 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
             top_p=request.top_p,
             max_new_tokens=request.max_tokens,
             num_return_sequences=request.n,
+            stop=request.stop
         )
 
         prompt_length, response_length = 0, 0
@@ -193,6 +194,7 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
             temperature=request.temperature,
             top_p=request.top_p,
             max_new_tokens=request.max_tokens,
+            stop=request.stop
         ):
             if len(new_token) == 0:
                 continue
diff --git a/src/llmtuner/api/protocol.py b/src/llmtuner/api/protocol.py
index ece2132b..8f1b7b4c 100644
--- a/src/llmtuner/api/protocol.py
+++ b/src/llmtuner/api/protocol.py
@@ -1,6 +1,6 @@
 import time
 from enum import Enum, unique
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field
 from typing_extensions import Literal
@@ -78,6 +78,7 @@ class ChatCompletionRequest(BaseModel):
     n: int = 1
     max_tokens: Optional[int] = None
     stream: bool = False
+    stop: Union[Optional[str], List[str]] = None
 
 
 class ChatCompletionResponseChoice(BaseModel):
diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index 0f0dc366..9863d635 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -96,6 +96,7 @@ class VllmEngine(BaseEngine):
         repetition_penalty = input_kwargs.pop("repetition_penalty", None)
         max_length = input_kwargs.pop("max_length", None)
         max_new_tokens = input_kwargs.pop("max_new_tokens", None)
+        stop = input_kwargs.pop("stop", None)
 
         generating_args = self.generating_args.copy()
         generating_args.update(
@@ -105,6 +106,7 @@ class VllmEngine(BaseEngine):
                 top_k=top_k or generating_args["top_k"],
                 num_return_sequences=num_return_sequences or 1,
                 repetition_penalty=repetition_penalty or generating_args["repetition_penalty"],
+                stop=stop or generating_args["stop"]
             )
         )
 
@@ -125,6 +127,7 @@ class VllmEngine(BaseEngine):
             stop_token_ids=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids,
             max_tokens=generating_args["max_new_tokens"],
             skip_special_tokens=True,
+            stop=generating_args["stop"],
         )
 
         if self.processor is not None and image is not None:
diff --git a/src/llmtuner/hparams/generating_args.py b/src/llmtuner/hparams/generating_args.py
index e792c003..03e760e7 100644
--- a/src/llmtuner/hparams/generating_args.py
+++ b/src/llmtuner/hparams/generating_args.py
@@ -1,5 +1,5 @@
 from dataclasses import asdict, dataclass, field
-from typing import Any, Dict
+from typing import Any, Dict, Union, Optional, List
 
 
 @dataclass
@@ -46,7 +46,10 @@ class GeneratingArguments:
         default=1.0,
         metadata={"help": "Exponential penalty to the length that is used with beam-based generation."},
     )
-
+    stop: Union[Optional[str], List[str]] = field(
+        default=None,
+        metadata={"help": "List of strings or string that stop the generation when they are generated. The returned output will not contain the stop strings."},
+    )
     def to_dict(self) -> Dict[str, Any]:
         args = asdict(self)
         if args.get("max_new_tokens", -1) > 0:

From ea58cf111e4bdb2773b666835a977e72de235ff1 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 2 May 2024 02:13:46 +0800
Subject: [PATCH 198/341] Update README.md

Former-commit-id: 4fb43b0c9aa48242126252ad755a2a1683b38d6a
---
 data/README.md | 145 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 99 insertions(+), 46 deletions(-)

diff --git a/data/README.md b/data/README.md
index 9158233f..012de4e7 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,4 +1,4 @@
-If you are using a custom dataset, please provide your dataset definition in the following format in `dataset_info.json`.
+If you are using a custom dataset, please add your **dataset description** to `dataset_info.json` according to the following format. We also provide several examples in the next section.
 
 ```json
 "dataset_name": {
@@ -33,7 +33,7 @@ If you are using a custom dataset, please provide your dataset definition in the
 }
 ```
 
-Given above, you can use the custom dataset via specifying `--dataset dataset_name`.
+After that, you can load the custom dataset by specifying `--dataset dataset_name`.
 
 ----
 
@@ -54,10 +54,11 @@ Currently we support dataset in **alpaca** or **sharegpt** format, the dataset i
 ]
 ```
 
-Regarding the above dataset, the `columns` in `dataset_info.json` should be:
+Regarding the above dataset, the description in `dataset_info.json` should be:
 
 ```json
 "dataset_name": {
+  "file_name": "data.json",
   "columns": {
     "prompt": "instruction",
     "query": "input",
@@ -70,76 +71,86 @@ Regarding the above dataset, the `columns` in `dataset_info.json` should be:
 
 The `query` column will be concatenated with the `prompt` column and used as the user prompt, then the user prompt would be `prompt\nquery`. The `response` column represents the model response.
 
-The `system` column will be used as the system prompt. The `history` column is a list consisting string tuples representing prompt-response pairs in the history. Note that the responses in the history **will also be used for training**.
+The `system` column will be used as the system prompt. The `history` column is a list consisting of string tuples representing prompt-response pairs in the history. Note that the responses in the history **will also be used for training** in supervised fine-tuning.
 
-For the pre-training datasets, only the `prompt` column will be used for training.
-
-For the preference datasets, the `response` column should be a string list whose length is 2, with the preferred answers appearing first, for example:
+For the **pre-training datasets**, only the `prompt` column will be used for training, for example:
 
 ```json
-{
-  "instruction": "user instruction",
-  "input": "user input",
-  "output": [
-    "chosen answer",
-    "rejected answer"
-  ]
+[
+  {"text": "document"},
+  {"text": "document"}
+]
+```
+
+Regarding the above dataset, the description in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "columns": {
+    "prompt": "text"
+  }
 }
 ```
 
-Remember to set `"ranking": true` for the preference datasets.
+For the **preference datasets**, the `response` column should be a string list whose length is 2, with the preferred answers appearing first, for example:
+
+```json
+[
+  {
+    "instruction": "user instruction",
+    "input": "user input",
+    "output": [
+      "chosen answer",
+      "rejected answer"
+    ]
+  }
+]
+```
+
+Regarding the above dataset, the description in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "ranking": true,
+  "columns": {
+    "prompt": "instruction",
+    "query": "input",
+    "response": "output"
+  }
+}
+```
 
 ----
 
-The dataset in sharegpt format should follow the below format:
+The dataset in **sharegpt** format should follow the below format:
 
 ```json
-# The first sharegpt format
 [
   {
     "conversations": [
       {
         "from": "human",
-        "value": "用户指令"
+        "value": "user instruction"
       },
       {
         "from": "gpt",
-        "value": "模型回答"
+        "value": "model response"
       }
     ],
-    "system": "系统提示词（选填）",
-    "tools": "工具描述（选填）"
-  }
-]
-
-# The second sharegpt format
-
-[
-  {
-    "type": "chatml",
-    "messages": [
-    {
-      "role": "system",
-      "content": "You are a helpful assistant."
-    },
-    {
-      "role": "user",
-      "content": "Tell me something about large language models."
-    },
-    {
-      "role": "assistant",
-      "content": "Large language models are a type of language model  ..."
-    }
-  ],
-  "source": "unknown"
+    "system": "system prompt (optional)",
+    "tools": "tool description (optional)"
   }
 ]
 ```
 
-Regarding the above dataset, the `columns` in `dataset_info.json` should be:
+Regarding the above dataset, the description in `dataset_info.json` should be:
 
 ```json
 "dataset_name": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
   "columns": {
     "messages": "conversations",
     "system": "system",
@@ -156,4 +167,46 @@ Regarding the above dataset, the `columns` in `dataset_info.json` should be:
 
 where the `messages` column should be a list following the `u/a/u/a/u/a` order.
 
-Pre-training datasets and preference datasets are incompatible with the sharegpt format yet.
+We also support datasets in the **openai** format:
+
+```json
+[
+  {
+    "messages": [
+      {
+        "role": "system",
+        "content": "system prompt (optional)"
+      },
+      {
+        "role": "user",
+        "content": "user instruction"
+      },
+      {
+        "role": "assistant",
+        "content": "model response"
+      }
+    ]
+  }
+]
+```
+
+Regarding the above dataset, the description in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
+  "columns": {
+    "messages": "messages"
+  },
+  "tags": {
+    "role_tag": "role",
+    "content_tag": "content",
+    "user_tag": "user",
+    "assistant_tag": "assistant",
+    "system_tag": "system"
+  }
+}
+```
+
+Pre-training datasets and preference datasets are **not yet compatible** with the sharegpt format.

From eb99999ca8be43bcd12eaf699f27b762952cb85f Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 2 May 2024 02:14:55 +0800
Subject: [PATCH 199/341] Update README_zh.md

Former-commit-id: 1c673d89faca3160627009fcd0a4aa39138570c0
---
 data/README_zh.md | 139 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 96 insertions(+), 43 deletions(-)

diff --git a/data/README_zh.md b/data/README_zh.md
index 1fe98a9e..6449c5d5 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -1,4 +1,4 @@
-如果您使用自定义数据集，请务必在 `dataset_info.json` 文件中按照以下格式提供数据集定义。
+如果您使用自定义数据集，请务必按照以下格式在 `dataset_info.json` 文件中添加**数据集描述**。我们在下面也提供了一些例子。
 
 ```json
 "数据集名称": {
@@ -33,11 +33,11 @@
 }
 ```
 
-添加后可通过指定 `--dataset 数据集名称` 参数使用自定义数据集。
+然后，可通过使用 `--dataset 数据集名称` 参数加载自定义数据集。
 
 ----
 
-该项目目前支持二种格式的数据集：**alpaca** 和 **sharegpt**，其中 alpaca 格式的数据集按照以下方式组织：
+该项目目前支持两种格式的数据集：**alpaca** 和 **sharegpt**，其中 alpaca 格式的数据集按照以下方式组织：
 
 ```json
 [
@@ -54,10 +54,11 @@
 ]
 ```
 
-对于上述格式的数据，`dataset_info.json` 中的 `columns` 应为：
+对于上述格式的数据，`dataset_info.json` 中的描述应为：
 
 ```json
 "数据集名称": {
+  "file_name": "data.json",
   "columns": {
     "prompt": "instruction",
     "query": "input",
@@ -70,31 +71,62 @@
 
 其中 `query` 列对应的内容会与 `prompt` 列对应的内容拼接后作为用户指令，即用户指令为 `prompt\nquery`。`response` 列对应的内容为模型回答。
 
-`system` 列对应的内容将被作为系统提示词。`history` 列是由多个字符串二元组构成的列表，分别代表历史消息中每轮的指令和回答。注意历史消息中的回答**也会被用于训练**。
+`system` 列对应的内容将被作为系统提示词。`history` 列是由多个字符串二元组构成的列表，分别代表历史消息中每轮的指令和回答。注意在指令监督学习时，历史消息中的回答**也会被用于训练**。
 
-对于预训练数据集，仅 `prompt` 列中的内容会用于模型训练。
-
-对于偏好数据集，`response` 列应当是一个长度为 2 的字符串列表，排在前面的代表更优的回答，例如：
+对于**预训练数据集**，仅 `prompt` 列中的内容会用于模型训练，例如：
 
 ```json
-{
-  "instruction": "用户指令",
-  "input": "用户输入",
-  "output": [
-    "优质回答",
-    "劣质回答"
-  ]
+[
+  {"text": "document"},
+  {"text": "document"}
+]
+```
+
+对于上述格式的数据，`dataset_info.json` 中的描述应为：
+
+```json
+"数据集名称": {
+  "file_name": "data.json",
+  "columns": {
+    "prompt": "text"
+  }
 }
 ```
 
-添加偏好数据集需要额外指定 `"ranking": true`。
+对于**偏好数据集**，`response` 列应当是一个长度为 2 的字符串列表，排在前面的代表更优的回答，例如：
+
+```json
+[
+  {
+    "instruction": "用户指令",
+    "input": "用户输入",
+    "output": [
+      "优质回答",
+      "劣质回答"
+    ]
+  }
+]
+```
+
+对于上述格式的数据，`dataset_info.json` 中的描述应为：
+
+```json
+"数据集名称": {
+  "file_name": "data.json",
+  "ranking": true,
+  "columns": {
+    "prompt": "instruction",
+    "query": "input",
+    "response": "output"
+  }
+}
+```
 
 ----
 
-而 sharegpt 格式的数据集按照以下方式组织：
+而 **sharegpt** 格式的数据集按照以下方式组织：
 
 ```json
-# 第一种sharegpt格式
 [
   {
     "conversations": [
@@ -111,35 +143,14 @@
     "tools": "工具描述（选填）"
   }
 ]
-
-# 第二种sharegpt格式
-
-[
-  {
-    "type": "chatml",
-    "messages": [
-    {
-      "role": "system",
-      "content": "你是一个很有用的AI助手"
-    },
-    {
-      "role": "user",
-      "content": "告诉我一些关于大模型的一些信息"
-    },
-    {
-      "role": "assistant",
-      "content": "大模型是一种语言模型"
-    }
-  ],
-  "source": "unknown"
-  }
-]
 ```
 
-对于上述格式的数据，`dataset_info.json` 中的 `columns` 应为：
+对于上述格式的数据，`dataset_info.json` 中的描述应为：
 
 ```json
 "数据集名称": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
   "columns": {
     "messages": "conversations",
     "system": "system",
@@ -156,4 +167,46 @@
 
 其中 `messages` 列应当是一个列表，且符合 `用户/模型/用户/模型/用户/模型` 的顺序。
 
-预训练数据集和偏好数据集尚不支持 sharegpt 格式。
+我们同样支持 **openai** 格式的数据集：
+
+```json
+[
+  {
+    "messages": [
+      {
+        "role": "system",
+        "content": "系统提示词（选填）"
+      },
+      {
+        "role": "user",
+        "content": "用户指令"
+      },
+      {
+        "role": "assistant",
+        "content": "模型回答"
+      }
+    ]
+  }
+]
+```
+
+对于上述格式的数据，`dataset_info.json` 中的描述应为：
+
+```json
+"数据集名称": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
+  "columns": {
+    "messages": "messages"
+  },
+  "tags": {
+    "role_tag": "role",
+    "content_tag": "content",
+    "user_tag": "user",
+    "assistant_tag": "assistant",
+    "system_tag": "system"
+  }
+}
+```
+
+预训练数据集和偏好数据集**尚不支持** sharegpt 格式。

From a11a04a24faeb63df01467dc4e6abe2192201b88 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 2 May 2024 02:21:27 +0800
Subject: [PATCH 200/341] Update train.py

Former-commit-id: 16f0d0056967872e02969fdd842a381f9484af8a
---
 src/llmtuner/webui/components/train.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 9d93a9b6..be070869 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -214,25 +214,19 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         with gr.Row():
             use_badam = gr.Checkbox()
             badam_mode = gr.Dropdown(choices=["layer", "ratio"], value="layer")
-            badam_mask_mode = gr.Dropdown(choices=["adjacent", "scatter"], value="adjacent")
             badam_switch_mode = gr.Dropdown(choices=["ascending", "descending", "random", "fixed"], value="ascending")
-            badam_update_ratio = gr.Slider(value=0, minimum=0, maximum=1, step=0.01)
             badam_switch_block_every = gr.Slider(value=50, minimum=-1, maximum=200, step=1)
+            badam_update_ratio = gr.Slider(value=0, minimum=0, maximum=1, step=0.01)
 
-            badam_verbose = gr.Dropdown(choices=[0, 1, 2], value=0)
-
-    input_elems.update({use_badam, badam_mode, badam_switch_block_every, badam_switch_mode, badam_update_ratio,
-                        badam_mask_mode, badam_verbose})
+    input_elems.update({use_badam, badam_mode, badam_switch_mode, badam_switch_block_every, badam_update_ratio})
     elem_dict.update(
         dict(
             badam_tab=badam_tab,
             use_badam=use_badam,
             badam_mode=badam_mode,
-            badam_switch_block_every=badam_switch_block_every,
             badam_switch_mode=badam_switch_mode,
+            badam_switch_block_every=badam_switch_block_every,
             badam_update_ratio=badam_update_ratio,
-            badam_mask_mode=badam_mask_mode,
-            badam_verbose=badam_verbose,
         )
     )
 

From dd0b85580e2682b8afadf30787a7cfc473478c10 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 2 May 2024 02:47:04 +0800
Subject: [PATCH 201/341] fix badam configs

Former-commit-id: 8a4e6a4c65a9a42e6501b0d3ce81d6220c287454
---
 src/llmtuner/hparams/finetuning_args.py | 15 +++--
 src/llmtuner/train/utils.py             |  4 +-
 src/llmtuner/webui/components/train.py  |  8 +--
 src/llmtuner/webui/locales.py           | 80 ++++++++-----------------
 src/llmtuner/webui/runner.py            |  6 +-
 5 files changed, 44 insertions(+), 69 deletions(-)

diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index f4f71bc5..03bf52af 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -221,16 +221,18 @@ class BAdamArgument:
         default=None,
         metadata={"help": "The starting block index for layer-wise BAdam."},
     )
-    badam_switch_block_every: Optional[int] = field(
-        default=50,
-        metadata={"help": "How often to switch model's block update. Set to -1 to disable the block update."},
-    )
     badam_switch_mode: Optional[Literal["ascending", "descending", "random", "fixed"]] = field(
         default="ascending",
         metadata={"help": "the strategy of picking block to update for layer-wise BAdam."},
     )
+    badam_switch_interval: Optional[int] = field(
+        default=50,
+        metadata={
+            "help": "Number of steps to update the block for layer-wise BAdam. Use -1 to disable the block update."
+        },
+    )
     badam_update_ratio: float = field(
-        default=0.0,
+        default=0.05,
         metadata={"help": "The ratio of the update for ratio-wise BAdam."},
     )
     badam_mask_mode: Literal["adjacent", "scatter"] = field(
@@ -308,6 +310,9 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
         if self.use_galore and self.finetuning_type == "lora":
             raise ValueError("Cannot use LoRA with GaLore together.")
 
+        if self.use_galore and self.use_badam:
+            raise ValueError("Cannot use GaLore with BAdam together.")
+
         if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora":
             raise ValueError("`loraplus_lr_ratio` is only valid for the LoRA training.")
 
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index d9fc363d..21dac461 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -317,14 +317,14 @@ def _create_badam_optimizer(
             base_optimizer=base_optimizer,
             named_parameters_list=list(model.named_parameters()),
             block_prefix_list=None,
-            switch_block_every=finetuning_args.badam_switch_block_every,
+            switch_block_every=finetuning_args.badam_switch_interval,
             start_block=finetuning_args.badam_start_block,
             switch_mode=finetuning_args.badam_switch_mode,
             verbose=finetuning_args.badam_verbose,
         )
         logger.info(
             f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
-            f"switch block every {finetuning_args.badam_switch_block_every} steps, "
+            f"switch block every {finetuning_args.badam_switch_interval} steps, "
             f"default start block is {finetuning_args.badam_start_block}"
         )
 
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index be070869..c9671289 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -215,17 +215,17 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             use_badam = gr.Checkbox()
             badam_mode = gr.Dropdown(choices=["layer", "ratio"], value="layer")
             badam_switch_mode = gr.Dropdown(choices=["ascending", "descending", "random", "fixed"], value="ascending")
-            badam_switch_block_every = gr.Slider(value=50, minimum=-1, maximum=200, step=1)
-            badam_update_ratio = gr.Slider(value=0, minimum=0, maximum=1, step=0.01)
+            badam_switch_interval = gr.Slider(value=50, minimum=1, maximum=1024, step=1)
+            badam_update_ratio = gr.Slider(value=0.05, minimum=0, maximum=1, step=0.01)
 
-    input_elems.update({use_badam, badam_mode, badam_switch_mode, badam_switch_block_every, badam_update_ratio})
+    input_elems.update({use_badam, badam_mode, badam_switch_mode, badam_switch_interval, badam_update_ratio})
     elem_dict.update(
         dict(
             badam_tab=badam_tab,
             use_badam=use_badam,
             badam_mode=badam_mode,
             badam_switch_mode=badam_switch_mode,
-            badam_switch_block_every=badam_switch_block_every,
+            badam_switch_interval=badam_switch_interval,
             badam_update_ratio=badam_update_ratio,
         )
     )
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index d3dd4dc2..1c474f34 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -905,15 +905,15 @@ LOCALES = {
     "use_badam": {
         "en": {
             "label": "Use BAdam",
-            "info": "Enable the block coordinate optimization with Adam.",
+            "info": "Enable the BAdam optimizer.",
         },
         "ru": {
             "label": "Использовать BAdam",
-            "info": "Включите блочную оптимизацию координат с Adam.",
+            "info": "Включите оптимизатор BAdam.",
         },
         "zh": {
             "label": "使用 BAdam",
-            "info": "使用多Block协同的Adam优化器。",
+            "info": "使用 BAdam 优化器。",
         },
     },
     "badam_mode": {
@@ -923,25 +923,11 @@ LOCALES = {
         },
         "ru": {
             "label": "Режим BAdam",
-            "info": "Использовать оптимизатор BAdam с обработкой слоев или с обработкой коэффициентов.",
+            "info": "Использовать ли оптимизатор BAdam с послоевой или пропорциональной настройкой.",
         },
         "zh": {
             "label": "BAdam 模式",
-            "info": "使用layer或者ratio比例模式。",
-        },
-    },
-    "badam_switch_block_every": {
-        "en": {
-            "label": "Switch block frequency",
-            "info": "How often to switch model's block update. Set to -1 to disable the block update.",
-        },
-        "ru": {
-            "label": "Частота переключения",
-            "info": "Как часто переключать обновление блока модели. Установите -1, чтобы отключить обновление блока.",
-        },
-        "zh": {
-            "label": "切换block的频率",
-            "info": "控制切换block切换的频率，如果是-1,则不切换。",
+            "info": "使用 layer-wise 或 ratio-wise BAdam 优化器。",
         },
     },
     "badam_switch_mode": {
@@ -950,12 +936,26 @@ LOCALES = {
             "info": "The strategy of picking block to update for layer-wise BAdam.",
         },
         "ru": {
-            "label": "Переключить режим",
-            "info": "Стратегия выбора блока для обновления в методе BAdam по слоям.",
+            "label": "Режим переключения",
+            "info": "Стратегия выбора блока для обновления для послойного BAdam.",
         },
         "zh": {
-            "label": "Block切换策略",
-            "info": "如果是layer类型的训练模式，如何切换block。",
+            "label": "切换策略",
+            "info": "Layer-wise BAdam 优化器的块切换策略。",
+        },
+    },
+    "badam_switch_interval": {
+        "en": {
+            "label": "Switch interval",
+            "info": "Number of steps to update the block for layer-wise BAdam.",
+        },
+        "ru": {
+            "label": "Интервал переключения",
+            "info": "Количество шагов для обновления блока для послойного BAdam.",
+        },
+        "zh": {
+            "label": "切换频率",
+            "info": "Layer-wise BAdam 优化器的块切换频率。",
         },
     },
     "badam_update_ratio": {
@@ -965,39 +965,11 @@ LOCALES = {
         },
         "ru": {
             "label": "Коэффициент обновления",
-            "info": "Коэффициент обновления для метода BAdam, основанного на коэффициентах.",
+            "info": "Коэффициент обновления для BAdam с учётом соотношений.",
         },
         "zh": {
-            "label": "Block更新比例",
-            "info": "如果是比例类型的训练模式，block每次更新的范围比例。",
-        },
-    },
-    "badam_mask_mode": {
-        "en": {
-            "label": "Mask mode",
-            "info": "The mode of the mask for BAdam optimizer.",
-        },
-        "ru": {
-            "label": "Режим маски",
-            "info": "Режим маски для оптимизатора BAdam.",
-        },
-        "zh": {
-            "label": "Mask模式",
-            "info": "BAdam优化器内训练参数的mask关系。",
-        },
-    },
-    "badam_verbose": {
-        "en": {
-            "label": "Verbosity level",
-            "info": "0 for no print, 1 for print the block prefix, 2 for print trainable parameters.",
-        },
-        "ru": {
-            "label": "Уровень многословности",
-            "info": "0 для отсутствия печати, 1 для печати префикса блока, 2 для печати обучаемых параметров.",
-        },
-        "zh": {
-            "label": "输出日志级别",
-            "info": "0：不输出，1：输出block前缀， 1：输出可训练的参数。",
+            "label": "Block 更新比例",
+            "info": "Ratio-wise BAdam 优化器的更新比例。",
         },
     },
     "cmd_preview_btn": {
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 52584f31..d53a4dfe 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -147,11 +147,11 @@ class Runner:
             shift_attn=get("train.shift_attn"),
             report_to="all" if get("train.report_to") else "none",
             use_galore=get("train.use_galore"),
+            use_badam=get("train.use_badam"),
             output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("train.output_dir")),
             fp16=(get("train.compute_type") == "fp16"),
             bf16=(get("train.compute_type") == "bf16"),
             pure_bf16=(get("train.compute_type") == "pure_bf16"),
-            use_badam=get("train.use_badam"),
         )
         args["disable_tqdm"] = True
 
@@ -201,11 +201,9 @@ class Runner:
 
         if args["use_badam"]:
             args["badam_mode"] = get("train.badam_mode")
-            args["badam_switch_block_every"] = get("train.badam_switch_block_every")
             args["badam_switch_mode"] = get("train.badam_switch_mode")
+            args["badam_switch_interval"] = get("train.badam_switch_interval")
             args["badam_update_ratio"] = get("train.badam_update_ratio")
-            args["badam_mask_mode"] = get("train.badam_mask_mode")
-            args["badam_verbose"] = get("train.badam_verbose")
 
         return args
 

From 2cedb59beea1ed815bfb30d2aad217116f893cdd Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 2 May 2024 17:16:02 +0800
Subject: [PATCH 202/341] Update prepare.sh

Former-commit-id: 5928b869251a984a085289ca6861a9731dc5b910
---
 examples/lora_single_gpu/prepare.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/lora_single_gpu/prepare.sh b/examples/lora_single_gpu/prepare.sh
index 3652cea4..e86de636 100644
--- a/examples/lora_single_gpu/prepare.sh
+++ b/examples/lora_single_gpu/prepare.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# use `--tokenized_path` in training script to load data
 
 CUDA_VISIBLE_DEVICES= python ../../src/train_bash.py \
     --stage sft \

From ce8200ad98b1742e966f4b71c4b4bf22dd8028c6 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 3 May 2024 02:58:23 +0800
Subject: [PATCH 203/341] update webui and add CLIs

Former-commit-id: 1368dda22ab875914c9dd86ee5146a4f6a4736ad
---
 Dockerfile                                |   2 +-
 README.md                                 |   6 +-
 README_zh.md                              |   6 +-
 examples/extras/badam/sft.sh              |   2 +-
 examples/extras/fsdp_qlora/sft.sh         |   2 +-
 examples/extras/galore/sft.sh             |   2 +-
 examples/extras/llama_pro/sft.sh          |   2 +-
 examples/extras/loraplus/sft.sh           |   2 +-
 examples/extras/mod/sft.sh                |   2 +-
 examples/full_multi_gpu/multi_node.sh     |   2 +-
 examples/full_multi_gpu/predict.sh        |   2 +-
 examples/full_multi_gpu/single_node.sh    |   2 +-
 examples/inference/api_demo.sh            |   2 +-
 examples/inference/cli_demo.sh            |   2 +-
 examples/inference/evaluate.sh            |   2 +-
 examples/inference/web_demo.sh            |   2 +-
 examples/lora_multi_gpu/ds_zero3.sh       |   3 +-
 examples/lora_multi_gpu/multi_node.sh     |   2 +-
 examples/lora_multi_gpu/single_node.sh    |   2 +-
 examples/lora_single_gpu/dpo.sh           |   2 +-
 examples/lora_single_gpu/orpo.sh          |   2 +-
 examples/lora_single_gpu/ppo.sh           |   2 +-
 examples/lora_single_gpu/predict.sh       |   2 +-
 examples/lora_single_gpu/prepare.sh       |   2 +-
 examples/lora_single_gpu/pretrain.sh      |   2 +-
 examples/lora_single_gpu/reward.sh        |   2 +-
 examples/lora_single_gpu/sft.sh           |   2 +-
 examples/lora_single_gpu/sft_mllm.sh      |   2 +-
 examples/merge_lora/merge.sh              |   2 +-
 examples/merge_lora/quantize.sh           |   2 +-
 examples/qlora_single_gpu/aqlm.sh         |   2 +-
 examples/qlora_single_gpu/awq.sh          |   2 +-
 examples/qlora_single_gpu/bitsandbytes.sh |   2 +-
 examples/qlora_single_gpu/gptq.sh         |   2 +-
 requirements.txt                          |   1 +
 setup.py                                  |   1 +
 src/api_demo.py                           |  16 ---
 src/cli_demo.py                           |  49 --------
 src/evaluate.py                           |   9 --
 src/export_model.py                       |   9 --
 src/llmtuner/__init__.py                  |  10 +-
 src/llmtuner/api/__init__.py              |   4 -
 src/llmtuner/api/app.py                   |   3 +-
 src/llmtuner/chat/chat_model.py           |  43 +++++++
 src/llmtuner/cli.py                       |  39 ++++++
 src/llmtuner/eval/__init__.py             |   4 -
 src/llmtuner/eval/evaluator.py            |   2 +-
 src/llmtuner/extras/callbacks.py          | 145 ++++++++++------------
 src/llmtuner/extras/constants.py          |   8 +-
 src/llmtuner/extras/logging.py            |  34 +++--
 src/llmtuner/extras/ploting.py            |  25 +++-
 src/llmtuner/hparams/parser.py            |   4 +-
 src/llmtuner/train/__init__.py            |   4 -
 src/llmtuner/train/tuner.py               |   8 +-
 src/llmtuner/webui/__init__.py            |   4 -
 src/llmtuner/webui/common.py              |  11 +-
 src/llmtuner/webui/components/export.py   |   2 +-
 src/llmtuner/webui/components/train.py    |   6 +-
 src/llmtuner/webui/engine.py              |   4 +-
 src/llmtuner/webui/interface.py           |   6 +-
 src/llmtuner/webui/runner.py              |  77 +++++-------
 src/llmtuner/webui/utils.py               | 108 ++++++++--------
 src/{train_bash.py => train.py}           |   4 +-
 src/train_web.py                          |   9 --
 src/web_demo.py                           |   9 --
 65 files changed, 363 insertions(+), 372 deletions(-)
 delete mode 100644 src/api_demo.py
 delete mode 100644 src/cli_demo.py
 delete mode 100644 src/evaluate.py
 delete mode 100644 src/export_model.py
 create mode 100644 src/llmtuner/cli.py
 rename src/{train_bash.py => train.py} (67%)
 delete mode 100644 src/train_web.py
 delete mode 100644 src/web_demo.py

diff --git a/Dockerfile b/Dockerfile
index c3d231b5..4b8bb084 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,4 +11,4 @@ RUN pip install -e .[deepspeed,metrics,bitsandbytes,qwen]
 VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ]
 EXPOSE 7860
 
-CMD [ "python", "src/train_web.py" ]
+CMD [ "llamafactory-cli", "webui" ]
diff --git a/README.md b/README.md
index 04e5aa5b..8caac93f 100644
--- a/README.md
+++ b/README.md
@@ -346,7 +346,7 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 ```bash
 export CUDA_VISIBLE_DEVICES=0 # `set CUDA_VISIBLE_DEVICES=0` for Windows
 export GRADIO_SERVER_PORT=7860 # `set GRADIO_SERVER_PORT=7860` for Windows
-python src/train_web.py # or python -m llmtuner.webui.interface
+llamafactory-cli webui
 ```
 
 <details><summary>For Alibaba Cloud users</summary>
@@ -392,12 +392,12 @@ docker compose -f ./docker-compose.yml up -d
 
 See [examples/README.md](examples/README.md) for usage.
 
-Use `python src/train_bash.py -h` to display arguments description.
+Use `llamafactory-cli train -h` to display arguments description.
 
 ### Deploy with OpenAI-style API and vLLM
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
+CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api \
     --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
     --template llama3 \
     --infer_backend vllm \
diff --git a/README_zh.md b/README_zh.md
index 2240c688..27522232 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -346,7 +346,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 ```bash
 export CUDA_VISIBLE_DEVICES=0 # Windows 使用 `set CUDA_VISIBLE_DEVICES=0`
 export GRADIO_SERVER_PORT=7860 # Windows 使用 `set GRADIO_SERVER_PORT=7860`
-python src/train_web.py # 或 python -m llmtuner.webui.interface
+llamafactory-cli webui
 ```
 
 <details><summary>阿里云用户指南</summary>
@@ -392,12 +392,12 @@ docker compose -f ./docker-compose.yml up -d
 
 使用方法请参考 [examples/README_zh.md](examples/README_zh.md)。
 
-您可以执行 `python src/train_bash.py -h` 来查看参数文档。
+您可以执行 `llamafactory-cli train -h` 来查看参数文档。
 
 ### 利用 vLLM 部署 OpenAI API
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 python src/api_demo.py \
+CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api \
     --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
     --template llama3 \
     --infer_backend vllm \
diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
index c2319caa..61167dad 100644
--- a/examples/extras/badam/sft.sh
+++ b/examples/extras/badam/sft.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/extras/fsdp_qlora/sft.sh b/examples/extras/fsdp_qlora/sft.sh
index e8b9ece7..9eb70a53 100644
--- a/examples/extras/fsdp_qlora/sft.sh
+++ b/examples/extras/fsdp_qlora/sft.sh
@@ -7,7 +7,7 @@ pip install "bitsandbytes>=0.43.0"
 
 CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
     --config_file ../../accelerate/fsdp_config.yaml \
-    ../../../src/train_bash.py \
+    ../../../src/train.py \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-70b-hf \
diff --git a/examples/extras/galore/sft.sh b/examples/extras/galore/sft.sh
index da1779ed..283673e7 100644
--- a/examples/extras/galore/sft.sh
+++ b/examples/extras/galore/sft.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/extras/llama_pro/sft.sh b/examples/extras/llama_pro/sft.sh
index 573078ff..3e26e0a6 100644
--- a/examples/extras/llama_pro/sft.sh
+++ b/examples/extras/llama_pro/sft.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path ../../../models/llama2-7b-pro \
diff --git a/examples/extras/loraplus/sft.sh b/examples/extras/loraplus/sft.sh
index cb334e7d..8d152d9e 100644
--- a/examples/extras/loraplus/sft.sh
+++ b/examples/extras/loraplus/sft.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/extras/mod/sft.sh b/examples/extras/mod/sft.sh
index 2c8f04a3..5219751f 100644
--- a/examples/extras/mod/sft.sh
+++ b/examples/extras/mod/sft.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index d1382bc2..a1ffc0ee 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -6,7 +6,7 @@ python -m torch.distributed.run \
     --node_rank $RANK \
     --master_addr $MASTER_ADDR \
     --master_port $MASTER_PORT \
-    ../../src/train_bash.py \
+    ../../src/train.py \
     --deepspeed ../deepspeed/ds_z3_config.json \
     --stage sft \
     --do_train \
diff --git a/examples/full_multi_gpu/predict.sh b/examples/full_multi_gpu/predict.sh
index 801df85a..7c2e458f 100644
--- a/examples/full_multi_gpu/predict.sh
+++ b/examples/full_multi_gpu/predict.sh
@@ -2,7 +2,7 @@
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
     --config_file ../accelerate/single_config.yaml \
-    ../../src/train_bash.py \
+    ../../src/train.py \
     --stage sft \
     --do_predict \
     --model_name_or_path ../../saves/LLaMA2-7B/full/sft \
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index ea4acf90..73c7662d 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-deepspeed --num_gpus 4 ../../src/train_bash.py \
+deepspeed --num_gpus 4 ../../src/train.py \
     --deepspeed ../deepspeed/ds_z3_config.json \
     --stage sft \
     --do_train \
diff --git a/examples/inference/api_demo.sh b/examples/inference/api_demo.sh
index aee86595..6f0f1b2e 100644
--- a/examples/inference/api_demo.sh
+++ b/examples/inference/api_demo.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python ../../src/api_demo.py \
+CUDA_VISIBLE_DEVICES=0 API_PORT=8000 llamafactory-cli api \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template default \
diff --git a/examples/inference/cli_demo.sh b/examples/inference/cli_demo.sh
index 3e4a1e4e..bc762411 100644
--- a/examples/inference/cli_demo.sh
+++ b/examples/inference/cli_demo.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/cli_demo.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template default \
diff --git a/examples/inference/evaluate.sh b/examples/inference/evaluate.sh
index 1fc6ccf8..5030329d 100644
--- a/examples/inference/evaluate.sh
+++ b/examples/inference/evaluate.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/evaluate.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template fewshot \
diff --git a/examples/inference/web_demo.sh b/examples/inference/web_demo.sh
index 8d6ed09d..a58cd2a0 100644
--- a/examples/inference/web_demo.sh
+++ b/examples/inference/web_demo.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # add `--visual_inputs True` to load MLLM
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/web_demo.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template default \
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
index f429d15b..bc74a6de 100644
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
+# ZeRO-3 enables weight sharding on multiple GPUs
 
-deepspeed --num_gpus 4 ../../src/train_bash.py \
+deepspeed --num_gpus 4 ../../src/train.py \
     --deepspeed ../deepspeed/ds_z3_config.json \
     --stage sft \
     --do_train \
diff --git a/examples/lora_multi_gpu/multi_node.sh b/examples/lora_multi_gpu/multi_node.sh
index 85a3e026..a58cac20 100644
--- a/examples/lora_multi_gpu/multi_node.sh
+++ b/examples/lora_multi_gpu/multi_node.sh
@@ -3,7 +3,7 @@
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
     --config_file ../accelerate/master_config.yaml \
-    ../../src/train_bash.py \
+    ../../src/train.py \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh
index 04529cf0..c0719c04 100644
--- a/examples/lora_multi_gpu/single_node.sh
+++ b/examples/lora_multi_gpu/single_node.sh
@@ -2,7 +2,7 @@
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
     --config_file ../accelerate/single_config.yaml \
-    ../../src/train_bash.py \
+    ../../src/train.py \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_single_gpu/dpo.sh b/examples/lora_single_gpu/dpo.sh
index 56a2dfc3..2cb6cb01 100644
--- a/examples/lora_single_gpu/dpo.sh
+++ b/examples/lora_single_gpu/dpo.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage dpo \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_single_gpu/orpo.sh b/examples/lora_single_gpu/orpo.sh
index 407907b1..335707bf 100644
--- a/examples/lora_single_gpu/orpo.sh
+++ b/examples/lora_single_gpu/orpo.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage orpo \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_single_gpu/ppo.sh b/examples/lora_single_gpu/ppo.sh
index 6a5b770e..9eccb05e 100644
--- a/examples/lora_single_gpu/ppo.sh
+++ b/examples/lora_single_gpu/ppo.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage ppo \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_single_gpu/predict.sh b/examples/lora_single_gpu/predict.sh
index eb9a18c0..250efed1 100644
--- a/examples/lora_single_gpu/predict.sh
+++ b/examples/lora_single_gpu/predict.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_predict \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_single_gpu/prepare.sh b/examples/lora_single_gpu/prepare.sh
index e86de636..277f9b7a 100644
--- a/examples/lora_single_gpu/prepare.sh
+++ b/examples/lora_single_gpu/prepare.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # use `--tokenized_path` in training script to load data
 
-CUDA_VISIBLE_DEVICES= python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES= llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_single_gpu/pretrain.sh b/examples/lora_single_gpu/pretrain.sh
index 59bdfe62..0782f00c 100644
--- a/examples/lora_single_gpu/pretrain.sh
+++ b/examples/lora_single_gpu/pretrain.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage pt \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_single_gpu/reward.sh b/examples/lora_single_gpu/reward.sh
index 1212d082..678809fd 100644
--- a/examples/lora_single_gpu/reward.sh
+++ b/examples/lora_single_gpu/reward.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage rm \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_single_gpu/sft.sh b/examples/lora_single_gpu/sft.sh
index 3bfbc9b8..2047e21f 100644
--- a/examples/lora_single_gpu/sft.sh
+++ b/examples/lora_single_gpu/sft.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/lora_single_gpu/sft_mllm.sh b/examples/lora_single_gpu/sft_mllm.sh
index 7e900918..53e37262 100644
--- a/examples/lora_single_gpu/sft_mllm.sh
+++ b/examples/lora_single_gpu/sft_mllm.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path llava-hf/llava-1.5-7b-hf \
diff --git a/examples/merge_lora/merge.sh b/examples/merge_lora/merge.sh
index c50bd6ad..186e64a4 100644
--- a/examples/merge_lora/merge.sh
+++ b/examples/merge_lora/merge.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # DO NOT use quantized model or quantization_bit when merging lora weights
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/export_model.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
     --template default \
diff --git a/examples/merge_lora/quantize.sh b/examples/merge_lora/quantize.sh
index aeedbe66..4a104645 100644
--- a/examples/merge_lora/quantize.sh
+++ b/examples/merge_lora/quantize.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # NEED TO run `merge.sh` before using this script
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/export_model.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export \
     --model_name_or_path ../../models/llama2-7b-sft \
     --template default \
     --export_dir ../../models/llama2-7b-sft-int4 \
diff --git a/examples/qlora_single_gpu/aqlm.sh b/examples/qlora_single_gpu/aqlm.sh
index 68eb4482..1e0a71ca 100644
--- a/examples/qlora_single_gpu/aqlm.sh
+++ b/examples/qlora_single_gpu/aqlm.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf \
diff --git a/examples/qlora_single_gpu/awq.sh b/examples/qlora_single_gpu/awq.sh
index b0f1f46b..c13c8134 100644
--- a/examples/qlora_single_gpu/awq.sh
+++ b/examples/qlora_single_gpu/awq.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path TheBloke/Llama-2-7B-AWQ \
diff --git a/examples/qlora_single_gpu/bitsandbytes.sh b/examples/qlora_single_gpu/bitsandbytes.sh
index 84bbb426..27f48d41 100644
--- a/examples/qlora_single_gpu/bitsandbytes.sh
+++ b/examples/qlora_single_gpu/bitsandbytes.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
diff --git a/examples/qlora_single_gpu/gptq.sh b/examples/qlora_single_gpu/gptq.sh
index a971b09f..5b1b80e1 100644
--- a/examples/qlora_single_gpu/gptq.sh
+++ b/examples/qlora_single_gpu/gptq.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --stage sft \
     --do_train \
     --model_name_or_path TheBloke/Llama-2-7B-GPTQ \
diff --git a/requirements.txt b/requirements.txt
index ecba3ce1..f4818ed2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ sse-starlette
 matplotlib
 fire
 packaging
+pyyaml
diff --git a/setup.py b/setup.py
index 6a03138d..f7589eb8 100644
--- a/setup.py
+++ b/setup.py
@@ -52,6 +52,7 @@ def main():
         python_requires=">=3.8.0",
         install_requires=get_requires(),
         extras_require=extra_require,
+        entry_points={"console_scripts": ["llamafactory-cli = llmtuner.cli:main"]},
         classifiers=[
             "Development Status :: 4 - Beta",
             "Intended Audience :: Developers",
diff --git a/src/api_demo.py b/src/api_demo.py
deleted file mode 100644
index a7140675..00000000
--- a/src/api_demo.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import os
-
-import uvicorn
-
-from llmtuner import ChatModel, create_app
-
-
-def main():
-    chat_model = ChatModel()
-    app = create_app(chat_model)
-    print("Visit http://localhost:{}/docs for API document.".format(os.environ.get("API_PORT", 8000)))
-    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("API_PORT", 8000)), workers=1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/cli_demo.py b/src/cli_demo.py
deleted file mode 100644
index ba828f51..00000000
--- a/src/cli_demo.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from llmtuner import ChatModel
-from llmtuner.extras.misc import torch_gc
-
-
-try:
-    import platform
-
-    if platform.system() != "Windows":
-        import readline  # noqa: F401
-except ImportError:
-    print("Install `readline` for a better experience.")
-
-
-def main():
-    chat_model = ChatModel()
-    messages = []
-    print("Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.")
-
-    while True:
-        try:
-            query = input("\nUser: ")
-        except UnicodeDecodeError:
-            print("Detected decoding error at the inputs, please set the terminal encoding to utf-8.")
-            continue
-        except Exception:
-            raise
-
-        if query.strip() == "exit":
-            break
-
-        if query.strip() == "clear":
-            messages = []
-            torch_gc()
-            print("History has been removed.")
-            continue
-
-        messages.append({"role": "user", "content": query})
-        print("Assistant: ", end="", flush=True)
-
-        response = ""
-        for new_text in chat_model.stream_chat(messages):
-            print(new_text, end="", flush=True)
-            response += new_text
-        print()
-        messages.append({"role": "assistant", "content": response})
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/evaluate.py b/src/evaluate.py
deleted file mode 100644
index 705a6e42..00000000
--- a/src/evaluate.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from llmtuner import Evaluator
-
-
-def main():
-    Evaluator().eval()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/export_model.py b/src/export_model.py
deleted file mode 100644
index 4baeb2c3..00000000
--- a/src/export_model.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from llmtuner import export_model
-
-
-def main():
-    export_model()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/llmtuner/__init__.py b/src/llmtuner/__init__.py
index b3a980a5..a3a97450 100644
--- a/src/llmtuner/__init__.py
+++ b/src/llmtuner/__init__.py
@@ -1,11 +1,3 @@
 # Level: api, webui > chat, eval, train > data, model > extras, hparams
 
-from .api import create_app
-from .chat import ChatModel
-from .eval import Evaluator
-from .train import export_model, run_exp
-from .webui import create_ui, create_web_demo
-
-
-__version__ = "0.7.0"
-__all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"]
+__version__ = "0.7.1.dev0"
diff --git a/src/llmtuner/api/__init__.py b/src/llmtuner/api/__init__.py
index d7059fbd..e69de29b 100644
--- a/src/llmtuner/api/__init__.py
+++ b/src/llmtuner/api/__init__.py
@@ -1,4 +0,0 @@
-from .app import create_app
-
-
-__all__ = ["create_app"]
diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py
index 3f06fef1..36918d1b 100644
--- a/src/llmtuner/api/app.py
+++ b/src/llmtuner/api/app.py
@@ -224,7 +224,8 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
     return app
 
 
-if __name__ == "__main__":
+def run_api():
     chat_model = ChatModel()
     app = create_app(chat_model)
+    print("Visit http://localhost:{}/docs for API document.".format(os.environ.get("API_PORT", 8000)))
     uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("API_PORT", 8000)), workers=1)
diff --git a/src/llmtuner/chat/chat_model.py b/src/llmtuner/chat/chat_model.py
index ba58dd2e..97ae87d7 100644
--- a/src/llmtuner/chat/chat_model.py
+++ b/src/llmtuner/chat/chat_model.py
@@ -2,6 +2,7 @@ import asyncio
 from threading import Thread
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, Generator, List, Optional, Sequence
 
+from ..extras.misc import torch_gc
 from ..hparams import get_infer_args
 from .hf_engine import HuggingfaceEngine
 from .vllm_engine import VllmEngine
@@ -95,3 +96,45 @@ class ChatModel:
         **input_kwargs,
     ) -> List[float]:
         return await self.engine.get_scores(batch_input, **input_kwargs)
+
+
+def run_chat():
+    try:
+        import platform
+
+        if platform.system() != "Windows":
+            import readline  # noqa: F401
+    except ImportError:
+        print("Install `readline` for a better experience.")
+
+    chat_model = ChatModel()
+    messages = []
+    print("Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.")
+
+    while True:
+        try:
+            query = input("\nUser: ")
+        except UnicodeDecodeError:
+            print("Detected decoding error at the inputs, please set the terminal encoding to utf-8.")
+            continue
+        except Exception:
+            raise
+
+        if query.strip() == "exit":
+            break
+
+        if query.strip() == "clear":
+            messages = []
+            torch_gc()
+            print("History has been removed.")
+            continue
+
+        messages.append({"role": "user", "content": query})
+        print("Assistant: ", end="", flush=True)
+
+        response = ""
+        for new_text in chat_model.stream_chat(messages):
+            print(new_text, end="", flush=True)
+            response += new_text
+        print()
+        messages.append({"role": "assistant", "content": response})
diff --git a/src/llmtuner/cli.py b/src/llmtuner/cli.py
new file mode 100644
index 00000000..1b5bd658
--- /dev/null
+++ b/src/llmtuner/cli.py
@@ -0,0 +1,45 @@
+import sys
+from enum import Enum, unique
+
+from .api.app import run_api
+from .chat.chat_model import run_chat
+from .eval.evaluator import run_eval
+from .train.tuner import export_model, run_exp
+from .webui.interface import run_web_demo, run_web_ui
+
+
+@unique
+class Command(str, Enum):
+    API = "api"
+    CHAT = "chat"
+    EVAL = "eval"
+    EXPORT = "export"
+    TRAIN = "train"
+    WEBDEMO = "webchat"
+    WEBUI = "webui"
+
+
+def main():
+    r"""
+    Entry point of `llamafactory-cli`: pops the sub-command from `sys.argv` and calls the matching runner.
+    """
+    if len(sys.argv) < 2:  # no sub-command given, `sys.argv.pop(1)` would raise a bare IndexError
+        sys.exit("Usage: llamafactory-cli {{{}}} [args]".format(",".join(cmd.value for cmd in Command)))
+
+    command = sys.argv.pop(1)
+    if command == Command.API:
+        run_api()
+    elif command == Command.CHAT:
+        run_chat()
+    elif command == Command.EVAL:
+        run_eval()
+    elif command == Command.EXPORT:
+        export_model()
+    elif command == Command.TRAIN:
+        run_exp()
+    elif command == Command.WEBDEMO:
+        run_web_demo()
+    elif command == Command.WEBUI:
+        run_web_ui()
+    else:
+        raise NotImplementedError("Unknown command: {}".format(command))
diff --git a/src/llmtuner/eval/__init__.py b/src/llmtuner/eval/__init__.py
index 95ce0377..e69de29b 100644
--- a/src/llmtuner/eval/__init__.py
+++ b/src/llmtuner/eval/__init__.py
@@ -1,4 +0,0 @@
-from .evaluator import Evaluator
-
-
-__all__ = ["Evaluator"]
diff --git a/src/llmtuner/eval/evaluator.py b/src/llmtuner/eval/evaluator.py
index 7446c6f5..4ea134c6 100644
--- a/src/llmtuner/eval/evaluator.py
+++ b/src/llmtuner/eval/evaluator.py
@@ -118,6 +118,6 @@ class Evaluator:
                 f.write(score_info)
 
 
-if __name__ == "__main__":
+def run_eval():
     evaluator = Evaluator()
     evaluator.eval()
diff --git a/src/llmtuner/extras/callbacks.py b/src/llmtuner/extras/callbacks.py
index 6e347c3c..fbe6f373 100644
--- a/src/llmtuner/extras/callbacks.py
+++ b/src/llmtuner/extras/callbacks.py
@@ -1,14 +1,18 @@
 import json
+import logging
 import os
+import signal
 import time
+from concurrent.futures import ThreadPoolExecutor
 from datetime import timedelta
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Dict
 
+import transformers
 from transformers import TrainerCallback
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length
 
-from .constants import LOG_FILE_NAME
-from .logging import get_logger
+from .constants import TRAINER_LOG
+from .logging import LoggerHandler, get_logger
 from .misc import fix_valuehead_checkpoint
 
 
@@ -33,20 +37,38 @@ class FixValueHeadModelCallback(TrainerCallback):
 
 
 class LogCallback(TrainerCallback):
-    def __init__(self, runner=None):
-        self.runner = runner
-        self.in_training = False
+    def __init__(self, output_dir: str) -> None:
+        r"""
+        Sets up the progress state. If `LLAMABOARD_ENABLED` is set, also registers the SIGABRT handler
+        and attaches a `LoggerHandler` writing to `output_dir`.
+        """
+        self.aborted = False
+        self.do_train = False
+        self.thread_pool = None  # created in `on_train_begin` when `args.should_save` is true
+        self._reset()  # `on_prediction_step` and `on_log` read these counters even if `on_train_begin` never ran
+        self.webui_mode = bool(int(os.environ.get("LLAMABOARD_ENABLED", "0")))
+        if self.webui_mode:
+            signal.signal(signal.SIGABRT, self._set_abort)
+            self.logger_handler = LoggerHandler(output_dir)
+            logging.root.addHandler(self.logger_handler)
+            transformers.logging.add_handler(self.logger_handler)
+
+    def _set_abort(self, signum, frame) -> None:
+        self.aborted = True
+
+    def _reset(self, max_steps: int = 0) -> None:
         self.start_time = time.time()
         self.cur_steps = 0
-        self.max_steps = 0
+        self.max_steps = max_steps
         self.elapsed_time = ""
         self.remaining_time = ""
 
-    def timing(self):
+    def _timing(self, cur_steps: int) -> None:
         cur_time = time.time()
         elapsed_time = cur_time - self.start_time
-        avg_time_per_step = elapsed_time / self.cur_steps if self.cur_steps != 0 else 0
-        remaining_time = (self.max_steps - self.cur_steps) * avg_time_per_step
+        avg_time_per_step = elapsed_time / cur_steps if cur_steps != 0 else 0
+        remaining_time = (self.max_steps - cur_steps) * avg_time_per_step
+        self.cur_steps = cur_steps
         self.elapsed_time = str(timedelta(seconds=int(elapsed_time)))
         self.remaining_time = str(timedelta(seconds=int(remaining_time)))
 
@@ -54,36 +70,27 @@ class LogCallback(TrainerCallback):
         r"""
         Event called at the beginning of training.
         """
-        if state.is_local_process_zero:
-            self.in_training = True
-            self.start_time = time.time()
-            self.max_steps = state.max_steps
+        if args.should_log:
+            self.do_train = True
+            self._reset(max_steps=state.max_steps)
 
-        if args.save_on_each_node:
-            if not state.is_local_process_zero:
-                return
-        else:
-            if not state.is_world_process_zero:
-                return
+        if args.should_save:
+            os.makedirs(args.output_dir, exist_ok=True)
+            self.thread_pool = ThreadPoolExecutor(max_workers=1)
 
-        if os.path.exists(os.path.join(args.output_dir, LOG_FILE_NAME)) and args.overwrite_output_dir:
-            logger.warning("Previous log file in this folder will be deleted.")
-            os.remove(os.path.join(args.output_dir, LOG_FILE_NAME))
-
-    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
-        r"""
-        Event called at the end of training.
-        """
-        if state.is_local_process_zero:
-            self.in_training = False
-            self.cur_steps = 0
-            self.max_steps = 0
+        if (
+            args.should_save
+            and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG))
+            and args.overwrite_output_dir
+        ):
+            logger.warning("Previous trainer log in this folder will be deleted.")
+            os.remove(os.path.join(args.output_dir, TRAINER_LOG))
 
     def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
         Event called at the end of an substep during gradient accumulation.
         """
-        if state.is_local_process_zero and self.runner is not None and self.runner.aborted:
+        if self.aborted:
             control.should_epoch_stop = True
             control.should_training_stop = True
 
@@ -91,42 +98,44 @@ class LogCallback(TrainerCallback):
         r"""
         Event called at the end of a training step.
         """
-        if state.is_local_process_zero:
-            self.cur_steps = state.global_step
-            self.timing()
-            if self.runner is not None and self.runner.aborted:
-                control.should_epoch_stop = True
-                control.should_training_stop = True
+        if args.should_log:
+            self._timing(cur_steps=state.global_step)
 
-    def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        if self.aborted:
+            control.should_epoch_stop = True
+            control.should_training_stop = True
+
+    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
-        Event called after an evaluation phase.
+        Event called at the end of training.
         """
-        if state.is_local_process_zero and not self.in_training:
-            self.cur_steps = 0
-            self.max_steps = 0
+        # The pool exists only on processes that created it in `on_train_begin` (`args.should_save`).
+        thread_pool = getattr(self, "thread_pool", None)
+        if thread_pool is not None:
+            thread_pool.shutdown(wait=True)  # wait for pending log writes to finish
+        self.thread_pool = None
 
-    def on_predict(
-        self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", *other, **kwargs
+    def on_prediction_step(
+        self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs
     ):
         r"""
-        Event called after a successful prediction.
+        Event called after a prediction step.
         """
-        if state.is_local_process_zero and not self.in_training:
-            self.cur_steps = 0
-            self.max_steps = 0
+        eval_dataloader = kwargs.pop("eval_dataloader", None)
+        if args.should_log and has_length(eval_dataloader) and not self.do_train:
+            if self.max_steps == 0:
+                self.max_steps = len(eval_dataloader)
+
+            self._timing(cur_steps=self.cur_steps + 1)
+
+    def _write_log(self, output_dir: str, logs: Dict[str, Any]):
+        with open(os.path.join(output_dir, TRAINER_LOG), "a", encoding="utf-8") as f:
+            f.write(json.dumps(logs) + "\n")
 
     def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs) -> None:
         r"""
-        Event called after logging the last logs.
+        Event called after logging the last logs, `args.should_log` has been applied.
         """
-        if args.save_on_each_node:
-            if not state.is_local_process_zero:
-                return
-        else:
-            if not state.is_world_process_zero:
-                return
-
         logs = dict(
             current_steps=self.cur_steps,
             total_steps=self.max_steps,
@@ -141,26 +147,13 @@ class LogCallback(TrainerCallback):
             elapsed_time=self.elapsed_time,
             remaining_time=self.remaining_time,
         )
-        if self.runner is not None:
+        logs = {k: v for k, v in logs.items() if v is not None}
+        if self.webui_mode and "loss" in logs and "learning_rate" in logs and "epoch" in logs:
             logger.info(
                 "{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}}}".format(
-                    logs["loss"] or 0, logs["learning_rate"] or 0, logs["epoch"] or 0
+                    logs["loss"], logs["learning_rate"], logs["epoch"]
                 )
             )
 
-        os.makedirs(args.output_dir, exist_ok=True)
-        with open(os.path.join(args.output_dir, "trainer_log.jsonl"), "a", encoding="utf-8") as f:
-            f.write(json.dumps(logs) + "\n")
-
-    def on_prediction_step(
-        self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs
-    ):
-        r"""
-        Event called after a prediction step.
-        """
-        eval_dataloader = kwargs.pop("eval_dataloader", None)
-        if state.is_local_process_zero and has_length(eval_dataloader) and not self.in_training:
-            if self.max_steps == 0:
-                self.max_steps = len(eval_dataloader)
-            self.cur_steps += 1
-            self.timing()
+        if args.should_save and self.thread_pool is not None:
+            self.thread_pool.submit(self._write_log, args.output_dir, logs)
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 0329b374..bf542e69 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -24,8 +24,6 @@ IGNORE_INDEX = -100
 
 LAYERNORM_NAMES = {"norm", "ln"}
 
-LOG_FILE_NAME = "trainer_log.jsonl"
-
 METHODS = ["full", "freeze", "lora"]
 
 MLLM_LIST = ["LLaVA1.5"]
@@ -34,10 +32,16 @@ MOD_SUPPORTED_MODELS = ["bloom", "falcon", "gemma", "llama", "mistral", "mixtral
 
 PEFT_METHODS = ["lora"]
 
+RUNNING_LOG = "running_log.txt"
+
 SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"]
 
 SUPPORTED_MODELS = OrderedDict()
 
+TRAINER_CONFIG = "trainer_config.yaml"
+
+TRAINER_LOG = "trainer_log.jsonl"
+
 TRAINING_STAGES = {
     "Supervised Fine-Tuning": "sft",
     "Reward Modeling": "rm",
diff --git a/src/llmtuner/extras/logging.py b/src/llmtuner/extras/logging.py
index bb270776..430b8a48 100644
--- a/src/llmtuner/extras/logging.py
+++ b/src/llmtuner/extras/logging.py
@@ -1,5 +1,9 @@
 import logging
+import os
 import sys
+from concurrent.futures import ThreadPoolExecutor
+
+from .constants import RUNNING_LOG
 
 
 class LoggerHandler(logging.Handler):
@@ -7,19 +11,35 @@ class LoggerHandler(logging.Handler):
     Logger handler used in Web UI.
     """
 
-    def __init__(self):
+    def __init__(self, output_dir: str) -> None:
         super().__init__()
-        self.log = ""
+        formatter = logging.Formatter(
+            fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S"
+        )
+        self.setLevel(logging.INFO)
+        self.setFormatter(formatter)
 
-    def reset(self):
-        self.log = ""
+        os.makedirs(output_dir, exist_ok=True)
+        self.running_log = os.path.join(output_dir, RUNNING_LOG)
+        if os.path.exists(self.running_log):
+            os.remove(self.running_log)
 
-    def emit(self, record):
+        self.thread_pool = ThreadPoolExecutor(max_workers=1)
+
+    def _write_log(self, log_entry: str) -> None:
+        with open(self.running_log, "a", encoding="utf-8") as f:
+            f.write(log_entry + "\n\n")
+
+    def emit(self, record) -> None:
         if record.name == "httpx":
             return
+
         log_entry = self.format(record)
-        self.log += log_entry
-        self.log += "\n\n"
+        self.thread_pool.submit(self._write_log, log_entry)
+
+    def close(self) -> None:
+        self.thread_pool.shutdown(wait=True)
+        return super().close()
 
 
 def get_logger(name: str) -> logging.Logger:
diff --git a/src/llmtuner/extras/ploting.py b/src/llmtuner/extras/ploting.py
index fd3cb8a3..e53f1f89 100644
--- a/src/llmtuner/extras/ploting.py
+++ b/src/llmtuner/extras/ploting.py
@@ -1,7 +1,7 @@
 import json
 import math
 import os
-from typing import List
+from typing import Any, Dict, List
 
 from transformers.trainer import TRAINER_STATE_NAME
 
@@ -10,6 +10,7 @@ from .packages import is_matplotlib_available
 
 
 if is_matplotlib_available():
+    import matplotlib.figure
     import matplotlib.pyplot as plt
 
 
@@ -21,7 +22,7 @@ def smooth(scalars: List[float]) -> List[float]:
     EMA implementation according to TensorBoard.
     """
     last = scalars[0]
-    smoothed = list()
+    smoothed = []
     weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5)  # a sigmoid function
     for next_val in scalars:
         smoothed_val = last * weight + (1 - weight) * next_val
@@ -30,7 +31,34 @@ def smooth(scalars: List[float]) -> List[float]:
     return smoothed
 
 
+def gen_loss_plot(trainer_log: List[Dict[str, Any]]) -> "matplotlib.figure.Figure":
+    r"""
+    Plots the original and the smoothed loss curves from the entries of the trainer log.
+
+    Entries without a `loss` value are skipped; if none has one, the figure is returned without curves.
+    """
+    plt.close("all")
+    plt.switch_backend("agg")
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    steps, losses = [], []
+    for log in trainer_log:
+        if log.get("loss", None) is not None:  # a loss of exactly 0.0 is still a data point
+            steps.append(log["current_steps"])
+            losses.append(log["loss"])
+
+    if losses:  # `smooth` reads `scalars[0]` and would raise IndexError on an empty list
+        ax.plot(steps, losses, color="#1f77b4", alpha=0.4, label="original")
+        ax.plot(steps, smooth(losses), color="#1f77b4", label="smoothed")
+        ax.legend()
+
+    ax.set_xlabel("step")
+    ax.set_ylabel("loss")
+    return fig
+
+
 def plot_loss(save_dictionary: os.PathLike, keys: List[str] = ["loss"]) -> None:
+    plt.switch_backend("agg")
     with open(os.path.join(save_dictionary, TRAINER_STATE_NAME), "r", encoding="utf-8") as f:
         data = json.load(f)
 
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 977d7cf4..7fdd3234 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -10,6 +10,7 @@ from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import is_torch_bf16_gpu_available
 from transformers.utils.versions import require_version
 
+from ..extras.constants import TRAINER_CONFIG
 from ..extras.logging import get_logger
 from ..extras.misc import check_dependencies, get_current_device
 from .data_args import DataArguments
@@ -251,7 +252,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
         and can_resume_from_checkpoint
     ):
         last_checkpoint = get_last_checkpoint(training_args.output_dir)
-        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+        files = os.listdir(training_args.output_dir)
+        if last_checkpoint is None and len(files) > 0 and (len(files) != 1 or files[0] != TRAINER_CONFIG):
             raise ValueError("Output directory already exists and is not empty. Please set `overwrite_output_dir`.")
 
         if last_checkpoint is not None:
diff --git a/src/llmtuner/train/__init__.py b/src/llmtuner/train/__init__.py
index 6c22bc15..e69de29b 100644
--- a/src/llmtuner/train/__init__.py
+++ b/src/llmtuner/train/__init__.py
@@ -1,4 +0,0 @@
-from .tuner import export_model, run_exp
-
-
-__all__ = ["export_model", "run_exp"]
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index a2eb121f..6822ffb5 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -23,9 +23,9 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
-def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["TrainerCallback"]] = None):
+def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallback"] = []):
     model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
-    callbacks = [LogCallback()] if callbacks is None else callbacks
+    callbacks.append(LogCallback(training_args.output_dir))
 
     if finetuning_args.stage == "pt":
         run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
@@ -88,7 +88,3 @@ def export_model(args: Optional[Dict[str, Any]] = None):
             tokenizer.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token)
     except Exception:
         logger.warning("Cannot save tokenizer, please copy the files manually.")
-
-
-if __name__ == "__main__":
-    run_exp()
diff --git a/src/llmtuner/webui/__init__.py b/src/llmtuner/webui/__init__.py
index 3e82dd69..e69de29b 100644
--- a/src/llmtuner/webui/__init__.py
+++ b/src/llmtuner/webui/__init__.py
@@ -1,4 +0,0 @@
-from .interface import create_ui, create_web_demo
-
-
-__all__ = ["create_ui", "create_web_demo"]
diff --git a/src/llmtuner/webui/common.py b/src/llmtuner/webui/common.py
index 9af4c439..a33e3db7 100644
--- a/src/llmtuner/webui/common.py
+++ b/src/llmtuner/webui/common.py
@@ -4,6 +4,7 @@ from collections import defaultdict
 from typing import Any, Dict, Optional
 
 from peft.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME
+from yaml import safe_dump, safe_load
 
 from ..extras.constants import (
     DATA_CONFIG,
@@ -29,7 +30,7 @@ DEFAULT_CACHE_DIR = "cache"
 DEFAULT_CONFIG_DIR = "config"
 DEFAULT_DATA_DIR = "data"
 DEFAULT_SAVE_DIR = "saves"
-USER_CONFIG = "user.config"
+USER_CONFIG = "user_config.yaml"
 
 
 def get_save_dir(*args) -> os.PathLike:
@@ -47,7 +48,7 @@ def get_save_path(config_path: str) -> os.PathLike:
 def load_config() -> Dict[str, Any]:
     try:
         with open(get_config_path(), "r", encoding="utf-8") as f:
-            return json.load(f)
+            return safe_load(f)
     except Exception:
         return {"lang": None, "last_model": None, "path_dict": {}, "cache_dir": None}
 
@@ -60,13 +61,13 @@ def save_config(lang: str, model_name: Optional[str] = None, model_path: Optiona
         user_config["last_model"] = model_name
         user_config["path_dict"][model_name] = model_path
     with open(get_config_path(), "w", encoding="utf-8") as f:
-        json.dump(user_config, f, indent=2, ensure_ascii=False)
+        safe_dump(user_config, f)
 
 
 def load_args(config_path: str) -> Optional[Dict[str, Any]]:
     try:
         with open(get_save_path(config_path), "r", encoding="utf-8") as f:
-            return json.load(f)
+            return safe_load(f)
     except Exception:
         return None
 
@@ -74,7 +75,7 @@ def load_args(config_path: str) -> Optional[Dict[str, Any]]:
 def save_args(config_path: str, config_dict: Dict[str, Any]) -> str:
     os.makedirs(DEFAULT_CONFIG_DIR, exist_ok=True)
     with open(get_save_path(config_path), "w", encoding="utf-8") as f:
-        json.dump(config_dict, f, indent=2, ensure_ascii=False)
+        safe_dump(config_dict, f)
 
     return str(get_save_path(config_path))
 
diff --git a/src/llmtuner/webui/components/export.py b/src/llmtuner/webui/components/export.py
index 4c224736..64273882 100644
--- a/src/llmtuner/webui/components/export.py
+++ b/src/llmtuner/webui/components/export.py
@@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Dict, Generator, List
 
 from ...extras.misc import torch_gc
 from ...extras.packages import is_gradio_available
-from ...train import export_model
+from ...train.tuner import export_model
 from ..common import get_save_dir
 from ..locales import ALERTS
 
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index c9671289..c709b916 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -245,7 +245,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
             with gr.Row():
                 resume_btn = gr.Checkbox(visible=False, interactive=False)
-                process_bar = gr.Slider(visible=False, interactive=False)
+                progress_bar = gr.Slider(visible=False, interactive=False)
 
             with gr.Row():
                 output_box = gr.Markdown()
@@ -263,14 +263,14 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             output_dir=output_dir,
             config_path=config_path,
             resume_btn=resume_btn,
-            process_bar=process_bar,
+            progress_bar=progress_bar,
             output_box=output_box,
             loss_viewer=loss_viewer,
         )
     )
 
     input_elems.update({output_dir, config_path})
-    output_elems = [output_box, process_bar, loss_viewer]
+    output_elems = [output_box, progress_bar, loss_viewer]
 
     cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems, concurrency_limit=None)
     arg_save_btn.click(engine.runner.save_args, input_elems, output_elems, concurrency_limit=None)
diff --git a/src/llmtuner/webui/engine.py b/src/llmtuner/webui/engine.py
index cebac3b9..964d65a2 100644
--- a/src/llmtuner/webui/engine.py
+++ b/src/llmtuner/webui/engine.py
@@ -41,7 +41,7 @@ class Engine:
             init_dict["train.dataset"] = {"choices": list_dataset().choices}
             init_dict["eval.dataset"] = {"choices": list_dataset().choices}
             init_dict["train.output_dir"] = {"value": "train_{}".format(get_time())}
-            init_dict["train.config_path"] = {"value": "{}.json".format(get_time())}
+            init_dict["train.config_path"] = {"value": "{}.yaml".format(get_time())}
             init_dict["eval.output_dir"] = {"value": "eval_{}".format(get_time())}
             init_dict["infer.image_box"] = {"visible": False}
 
@@ -51,7 +51,7 @@ class Engine:
 
         yield self._update_component(init_dict)
 
-        if self.runner.alive and not self.demo_mode and not self.pure_chat:
+        if self.runner.running and not self.demo_mode and not self.pure_chat:
             yield {elem: elem.__class__(value=value) for elem, value in self.runner.running_data.items()}
             if self.runner.do_train:
                 yield self._update_component({"train.resume_btn": {"value": True}})
diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index abca16c5..feb2a20a 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -68,5 +68,9 @@ def create_web_demo() -> gr.Blocks:
     return demo
 
 
-if __name__ == "__main__":
+def run_web_ui():
     create_ui().queue().launch(server_name="0.0.0.0", server_port=None, share=False, inbrowser=True)
+
+
+def run_web_demo():
+    create_web_demo().queue().launch(server_name="0.0.0.0", server_port=None, share=False, inbrowser=True)
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index d53a4dfe..b14271b7 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -1,22 +1,19 @@
-import logging
 import os
-import time
-from threading import Thread
-from typing import TYPE_CHECKING, Any, Dict, Generator
+import signal
+from copy import deepcopy
+from subprocess import Popen, TimeoutExpired
+from typing import TYPE_CHECKING, Any, Dict, Generator, Optional
 
-import transformers
+import psutil
 from transformers.trainer import TRAINING_ARGS_NAME
 from transformers.utils import is_torch_cuda_available
 
-from ..extras.callbacks import LogCallback
 from ..extras.constants import TRAINING_STAGES
-from ..extras.logging import LoggerHandler
 from ..extras.misc import get_device_count, torch_gc
 from ..extras.packages import is_gradio_available
-from ..train import run_exp
 from .common import get_module, get_save_dir, load_args, load_config, save_args
 from .locales import ALERTS
-from .utils import gen_cmd, gen_plot, get_eval_results, update_process_bar
+from .utils import gen_cmd, get_eval_results, get_trainer_info, save_cmd
 
 
 if is_gradio_available():
@@ -34,24 +31,18 @@ class Runner:
         self.manager = manager
         self.demo_mode = demo_mode
         """ Resume """
-        self.thread: "Thread" = None
+        self.trainer: Optional["Popen"] = None
         self.do_train = True
         self.running_data: Dict["Component", Any] = None
         """ State """
         self.aborted = False
         self.running = False
-        """ Handler """
-        self.logger_handler = LoggerHandler()
-        self.logger_handler.setLevel(logging.INFO)
-        logging.root.addHandler(self.logger_handler)
-        transformers.logging.add_handler(self.logger_handler)
-
-    @property
-    def alive(self) -> bool:
-        return self.thread is not None
 
     def set_abort(self) -> None:
         self.aborted = True
+        if self.trainer is not None:
+            for children in psutil.Process(self.trainer.pid).children():  # abort the child process
+                os.kill(children.pid, signal.SIGABRT)
 
     def _initialize(self, data: Dict["Component", Any], do_train: bool, from_preview: bool) -> str:
         get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
@@ -85,13 +76,11 @@ class Runner:
         if not from_preview and not is_torch_cuda_available():
             gr.Warning(ALERTS["warn_no_cuda"][lang])
 
-        self.logger_handler.reset()
-        self.trainer_callback = LogCallback(self)
         return ""
 
     def _finalize(self, lang: str, finish_info: str) -> str:
         finish_info = ALERTS["info_aborted"][lang] if self.aborted else finish_info
-        self.thread = None
+        self.trainer = None
         self.aborted = False
         self.running = False
         self.running_data = None
@@ -270,11 +259,12 @@ class Runner:
             gr.Warning(error)
             yield {output_box: error}
         else:
-            args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
-            run_kwargs = dict(args=args, callbacks=[self.trainer_callback])
             self.do_train, self.running_data = do_train, data
-            self.thread = Thread(target=run_exp, kwargs=run_kwargs)
-            self.thread.start()
+            args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
+            env = deepcopy(os.environ)
+            env["CUDA_VISIBLE_DEVICES"] = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
+            env["LLAMABOARD_ENABLED"] = "1"
+            self.trainer = Popen("llamafactory-cli train {}".format(save_cmd(args)), env=env, shell=True)
             yield from self.monitor()
 
     def preview_train(self, data):
@@ -291,9 +281,6 @@ class Runner:
 
     def monitor(self):
         get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)]
-        self.aborted = False
-        self.running = True
-
         lang = get("top.lang")
         model_name = get("top.model_name")
         finetuning_type = get("top.finetuning_type")
@@ -301,28 +288,31 @@ class Runner:
         output_path = get_save_dir(model_name, finetuning_type, output_dir)
 
         output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if self.do_train else "eval"))
-        process_bar = self.manager.get_elem_by_id("{}.process_bar".format("train" if self.do_train else "eval"))
+        progress_bar = self.manager.get_elem_by_id("{}.progress_bar".format("train" if self.do_train else "eval"))
         loss_viewer = self.manager.get_elem_by_id("train.loss_viewer") if self.do_train else None
 
-        while self.thread is not None and self.thread.is_alive():
+        while self.trainer is not None:
             if self.aborted:
                 yield {
                     output_box: ALERTS["info_aborting"][lang],
-                    process_bar: gr.Slider(visible=False),
+                    progress_bar: gr.Slider(visible=False),
                 }
             else:
+                running_log, running_progress, running_loss = get_trainer_info(output_path)
                 return_dict = {
-                    output_box: self.logger_handler.log,
-                    process_bar: update_process_bar(self.trainer_callback),
+                    output_box: running_log,
+                    progress_bar: running_progress,
                 }
-                if self.do_train:
-                    plot = gen_plot(output_path)
-                    if plot is not None:
-                        return_dict[loss_viewer] = plot
+                if self.do_train and running_loss is not None:
+                    return_dict[loss_viewer] = running_loss
 
                 yield return_dict
 
-            time.sleep(2)
+            try:
+                self.trainer.wait(2)
+                self.trainer = None
+            except TimeoutExpired:
+                continue
 
         if self.do_train:
             if os.path.exists(os.path.join(output_path, TRAINING_ARGS_NAME)):
@@ -337,16 +327,11 @@ class Runner:
 
         return_dict = {
             output_box: self._finalize(lang, finish_info),
-            process_bar: gr.Slider(visible=False),
+            progress_bar: gr.Slider(visible=False),
         }
-        if self.do_train:
-            plot = gen_plot(output_path)
-            if plot is not None:
-                return_dict[loss_viewer] = plot
-
         yield return_dict
 
-    def save_args(self, data):
+    def save_args(self, data: dict):
         output_box = self.manager.get_elem_by_id("train.output_box")
         error = self._initialize(data, do_train=True, from_preview=True)
         if error:
diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py
index 74f74e6a..2ad1e62c 100644
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -1,10 +1,13 @@
 import json
 import os
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
+from yaml import safe_dump
+
+from ..extras.constants import RUNNING_LOG, TRAINER_CONFIG, TRAINER_LOG
 from ..extras.packages import is_gradio_available, is_matplotlib_available
-from ..extras.ploting import smooth
+from ..extras.ploting import gen_loss_plot
 from .locales import ALERTS
 
 
@@ -12,30 +15,6 @@ if is_gradio_available():
     import gradio as gr
 
 
-if is_matplotlib_available():
-    import matplotlib.figure
-    import matplotlib.pyplot as plt
-
-
-if TYPE_CHECKING:
-    from ..extras.callbacks import LogCallback
-
-
-def update_process_bar(callback: "LogCallback") -> "gr.Slider":
-    if not callback.max_steps:
-        return gr.Slider(visible=False)
-
-    percentage = round(100 * callback.cur_steps / callback.max_steps, 0) if callback.max_steps != 0 else 100.0
-    label = "Running {:d}/{:d}: {} < {}".format(
-        callback.cur_steps, callback.max_steps, callback.elapsed_time, callback.remaining_time
-    )
-    return gr.Slider(label=label, value=percentage, visible=True)
-
-
-def get_time() -> str:
-    return datetime.now().strftime(r"%Y-%m-%d-%H-%M-%S")
-
-
 def can_quantize(finetuning_type: str) -> "gr.Dropdown":
     if finetuning_type != "lora":
         return gr.Dropdown(value="none", interactive=False)
@@ -57,14 +36,19 @@ def check_json_schema(text: str, lang: str) -> None:
         gr.Warning(ALERTS["err_json_schema"][lang])
 
 
+def clean_cmd(args: Dict[str, Any]) -> Dict[str, Any]:
+    no_skip_keys = ["packing"]
+    return {k: v for k, v in args.items() if (k in no_skip_keys) or (v is not None and v is not False and v != "")}
+
+
 def gen_cmd(args: Dict[str, Any]) -> str:
     args.pop("disable_tqdm", None)
     args["plot_loss"] = args.get("do_train", None)
     current_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
     cmd_lines = ["CUDA_VISIBLE_DEVICES={} python src/train_bash.py ".format(current_devices)]
-    for k, v in args.items():
-        if v is not None and v is not False and v != "":
-            cmd_lines.append("    --{} {} ".format(k, str(v)))
+    for k, v in clean_cmd(args).items():
+        cmd_lines.append("    --{} {} ".format(k, str(v)))
+
     cmd_text = "\\\n".join(cmd_lines)
     cmd_text = "```bash\n{}\n```".format(cmd_text)
     return cmd_text
@@ -76,29 +60,49 @@ def get_eval_results(path: os.PathLike) -> str:
     return "```json\n{}\n```\n".format(result)
 
 
-def gen_plot(output_path: str) -> Optional["matplotlib.figure.Figure"]:
-    log_file = os.path.join(output_path, "trainer_log.jsonl")
-    if not os.path.isfile(log_file) or not is_matplotlib_available():
-        return
+def get_time() -> str:
+    return datetime.now().strftime(r"%Y-%m-%d-%H-%M-%S")
 
-    plt.close("all")
-    plt.switch_backend("agg")
-    fig = plt.figure()
-    ax = fig.add_subplot(111)
-    steps, losses = [], []
-    with open(log_file, "r", encoding="utf-8") as f:
-        for line in f:
-            log_info: Dict[str, Any] = json.loads(line)
-            if log_info.get("loss", None):
-                steps.append(log_info["current_steps"])
-                losses.append(log_info["loss"])
 
-    if len(losses) == 0:
-        return
+def get_trainer_info(output_path: os.PathLike) -> Tuple[str, "gr.Slider", Optional["gr.Plot"]]:
+    running_log = ""
+    running_progress = gr.Slider(visible=False)
+    running_loss = None
 
-    ax.plot(steps, losses, color="#1f77b4", alpha=0.4, label="original")
-    ax.plot(steps, smooth(losses), color="#1f77b4", label="smoothed")
-    ax.legend()
-    ax.set_xlabel("step")
-    ax.set_ylabel("loss")
-    return fig
+    running_log_path = os.path.join(output_path, RUNNING_LOG)
+    if os.path.isfile(running_log_path):
+        with open(running_log_path, "r", encoding="utf-8") as f:
+            running_log = f.read()
+
+    trainer_log_path = os.path.join(output_path, TRAINER_LOG)
+    if os.path.isfile(trainer_log_path):
+        trainer_log: List[Dict[str, Any]] = []
+        with open(trainer_log_path, "r", encoding="utf-8") as f:
+            for line in f:
+                trainer_log.append(json.loads(line))
+
+        if len(trainer_log) != 0:
+            latest_log = trainer_log[-1]
+            percentage = latest_log["percentage"]
+            label = "Running {:d}/{:d}: {} < {}".format(
+                latest_log["current_steps"],
+                latest_log["total_steps"],
+                latest_log["elapsed_time"],
+                latest_log["remaining_time"],
+            )
+            running_progress = gr.Slider(label=label, value=percentage, visible=True)
+
+            if is_matplotlib_available():
+                running_loss = gr.Plot(gen_loss_plot(trainer_log))
+
+    return running_log, running_progress, running_loss
+
+
+def save_cmd(args: Dict[str, Any]) -> str:
+    output_dir = args["output_dir"]
+    os.makedirs(output_dir, exist_ok=True)
+
+    with open(os.path.join(output_dir, TRAINER_CONFIG), "w", encoding="utf-8") as f:
+        safe_dump(clean_cmd(args), f)
+
+    return os.path.join(output_dir, TRAINER_CONFIG)
diff --git a/src/train_bash.py b/src/train.py
similarity index 67%
rename from src/train_bash.py
rename to src/train.py
index 9ddd0586..6a3212cb 100644
--- a/src/train_bash.py
+++ b/src/train.py
@@ -1,4 +1,4 @@
-from llmtuner import run_exp
+from llmtuner.train.tuner import run_exp
 
 
 def main():
@@ -7,7 +7,7 @@ def main():
 
 def _mp_fn(index):
     # For xla_spawn (TPUs)
-    main()
+    run_exp()
 
 
 if __name__ == "__main__":
diff --git a/src/train_web.py b/src/train_web.py
deleted file mode 100644
index 8327f4dd..00000000
--- a/src/train_web.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from llmtuner import create_ui
-
-
-def main():
-    create_ui().queue().launch(server_name="0.0.0.0", server_port=None, share=False, inbrowser=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/web_demo.py b/src/web_demo.py
deleted file mode 100644
index 3b57ee73..00000000
--- a/src/web_demo.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from llmtuner import create_web_demo
-
-
-def main():
-    create_web_demo().queue().launch(server_name="0.0.0.0", server_port=None, share=False, inbrowser=True)
-
-
-if __name__ == "__main__":
-    main()

From 33d440b57795e95203d58f31f95946ecbbee700e Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 3 May 2024 03:54:46 +0800
Subject: [PATCH 204/341] fix colab gradio

Former-commit-id: 26179a29d3400d1fea155e325a79473a8bc12f04
---
 src/llmtuner/webui/interface.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index feb2a20a..5f17d76d 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -69,8 +69,8 @@ def create_web_demo() -> gr.Blocks:
 
 
 def run_web_ui():
-    create_ui().queue().launch(server_name="0.0.0.0", server_port=None, share=False, inbrowser=True)
+    create_ui().queue().launch(server_name="0.0.0.0")
 
 
 def run_web_demo():
-    create_web_demo().queue().launch(server_name="0.0.0.0", server_port=None, share=False, inbrowser=True)
+    create_web_demo().queue().launch(server_name="0.0.0.0")

From 57c6eabf83de7bc092ca7d7739443c2aacd1afb9 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 3 May 2024 04:24:50 +0800
Subject: [PATCH 205/341] fix gen_args

Former-commit-id: c3e2f4f07b7fb3b1d7d2b44451660f082a467aed
---
 src/llmtuner/webui/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py
index 2ad1e62c..74683cb9 100644
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -45,7 +45,7 @@ def gen_cmd(args: Dict[str, Any]) -> str:
     args.pop("disable_tqdm", None)
     args["plot_loss"] = args.get("do_train", None)
     current_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
-    cmd_lines = ["CUDA_VISIBLE_DEVICES={} python src/train_bash.py ".format(current_devices)]
+    cmd_lines = ["CUDA_VISIBLE_DEVICES={} llamafactory-cli train ".format(current_devices)]
     for k, v in clean_cmd(args).items():
         cmd_lines.append("    --{} {} ".format(k, str(v)))
 

From 09d9fb28f91e1517a5c1c66316279b983c9f538f Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 3 May 2024 04:42:50 +0800
Subject: [PATCH 206/341] enable tqdm in webui

Former-commit-id: 1737bff64799047a5b715fd979b4c038ae213bb3
---
 src/llmtuner/webui/runner.py | 2 --
 src/llmtuner/webui/utils.py  | 1 -
 2 files changed, 3 deletions(-)

diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index b14271b7..4ea08348 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -142,7 +142,6 @@ class Runner:
             bf16=(get("train.compute_type") == "bf16"),
             pure_bf16=(get("train.compute_type") == "pure_bf16"),
         )
-        args["disable_tqdm"] = True
 
         if args["finetuning_type"] == "freeze":
             args["num_layer_trainable"] = get("train.num_layer_trainable")
@@ -233,7 +232,6 @@ class Runner:
             temperature=get("eval.temperature"),
             output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("eval.output_dir")),
         )
-        args["disable_tqdm"] = True
 
         if get("eval.predict"):
             args["do_predict"] = True
diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py
index 74683cb9..c8729d36 100644
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -42,7 +42,6 @@ def clean_cmd(args: Dict[str, Any]) -> Dict[str, Any]:
 
 
 def gen_cmd(args: Dict[str, Any]) -> str:
-    args.pop("disable_tqdm", None)
     args["plot_loss"] = args.get("do_train", None)
     current_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
     cmd_lines = ["CUDA_VISIBLE_DEVICES={} llamafactory-cli train ".format(current_devices)]

From 1fea91736a35759471fde6e37763e6a19431516e Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 3 May 2024 21:24:27 +0800
Subject: [PATCH 207/341] fix callback log multigpu #3559

Former-commit-id: 1f105f1551b12675ca7d339ef5f91333f0371987
---
 src/llmtuner/extras/callbacks.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/llmtuner/extras/callbacks.py b/src/llmtuner/extras/callbacks.py
index fbe6f373..76f07a42 100644
--- a/src/llmtuner/extras/callbacks.py
+++ b/src/llmtuner/extras/callbacks.py
@@ -70,11 +70,9 @@ class LogCallback(TrainerCallback):
         r"""
         Event called at the beginning of training.
         """
-        if args.should_log:
+        if args.should_save:
             self.do_train = True
             self._reset(max_steps=state.max_steps)
-
-        if args.should_save:
             os.makedirs(args.output_dir, exist_ok=True)
             self.thread_pool = ThreadPoolExecutor(max_workers=1)
 
@@ -98,7 +96,7 @@ class LogCallback(TrainerCallback):
         r"""
         Event called at the end of a training step.
         """
-        if args.should_log:
+        if args.should_save:
             self._timing(cur_steps=state.global_step)
 
         if self.aborted:
@@ -119,7 +117,7 @@ class LogCallback(TrainerCallback):
         Event called after a prediction step.
         """
         eval_dataloader = kwargs.pop("eval_dataloader", None)
-        if args.should_log and has_length(eval_dataloader) and not self.do_train:
+        if args.should_save and has_length(eval_dataloader) and not self.do_train:
             if self.max_steps == 0:
                 self.max_steps = len(eval_dataloader)
 
@@ -131,8 +129,11 @@ class LogCallback(TrainerCallback):
 
     def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs) -> None:
         r"""
-        Event called after logging the last logs, `args.should_log` has been applied.
+        Event called after logging the last logs.
         """
+        if not args.should_save:
+            return
+
         logs = dict(
             current_steps=self.cur_steps,
             total_steps=self.max_steps,
@@ -148,12 +149,12 @@ class LogCallback(TrainerCallback):
             remaining_time=self.remaining_time,
         )
         logs = {k: v for k, v in logs.items() if v is not None}
-        if self.webui_mode and "loss" in logs and "learning_rate" in logs and "epoch" in logs:
+        if self.webui_mode and all(key in logs for key in ["loss", "learning_rate", "epoch"]):
             logger.info(
                 "{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}}}".format(
                     logs["loss"], logs["learning_rate"], logs["epoch"]
                 )
             )
 
-        if args.should_save and self.thread_pool is not None:
+        if self.thread_pool is not None:
             self.thread_pool.submit(self._write_log, args.output_dir, logs)

From 2383e5440cd28e916fb3cb025de708acfec31e4d Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 3 May 2024 23:06:52 +0800
Subject: [PATCH 208/341] fix slow op in dpo/orpo trainer

Former-commit-id: 38cad0896ea0516de6d4b2759ec9d45ee67d339b
---
 src/llmtuner/train/dpo/trainer.py  | 16 ++++++++--------
 src/llmtuner/train/orpo/trainer.py | 20 ++++++++++----------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index 35dcd8db..b144d561 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -165,13 +165,13 @@ class CustomDPOTrainer(DPOTrainer):
         reward_accuracies = (chosen_rewards > rejected_rewards).float()
 
         prefix = "eval_" if train_eval == "eval" else ""
-        metrics["{}rewards/chosen".format(prefix)] = chosen_rewards.cpu().mean()
-        metrics["{}rewards/rejected".format(prefix)] = rejected_rewards.cpu().mean()
-        metrics["{}rewards/accuracies".format(prefix)] = reward_accuracies.cpu().mean()
-        metrics["{}rewards/margins".format(prefix)] = (chosen_rewards - rejected_rewards).cpu().mean()
-        metrics["{}logps/rejected".format(prefix)] = policy_rejected_logps.detach().cpu().mean()
-        metrics["{}logps/chosen".format(prefix)] = policy_chosen_logps.detach().cpu().mean()
-        metrics["{}logits/rejected".format(prefix)] = policy_rejected_logits.detach().cpu().mean()
-        metrics["{}logits/chosen".format(prefix)] = policy_chosen_logits.detach().cpu().mean()
+        metrics["{}rewards/chosen".format(prefix)] = chosen_rewards.mean().cpu()
+        metrics["{}rewards/rejected".format(prefix)] = rejected_rewards.mean().cpu()
+        metrics["{}rewards/accuracies".format(prefix)] = reward_accuracies.mean().cpu()
+        metrics["{}rewards/margins".format(prefix)] = (chosen_rewards - rejected_rewards).mean().cpu()
+        metrics["{}logps/rejected".format(prefix)] = policy_rejected_logps.detach().mean().cpu()
+        metrics["{}logps/chosen".format(prefix)] = policy_chosen_logps.detach().mean().cpu()
+        metrics["{}logits/rejected".format(prefix)] = policy_rejected_logits.detach().mean().cpu()
+        metrics["{}logits/chosen".format(prefix)] = policy_chosen_logits.detach().mean().cpu()
 
         return losses.mean(), metrics
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index 5e0d70d9..88090a9e 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -113,15 +113,15 @@ class CustomORPOTrainer(DPOTrainer):
         reward_accuracies = (chosen_rewards > rejected_rewards).float()
 
         prefix = "eval_" if train_eval == "eval" else ""
-        metrics["{}rewards/chosen".format(prefix)] = chosen_rewards.cpu().mean()
-        metrics["{}rewards/rejected".format(prefix)] = rejected_rewards.cpu().mean()
-        metrics["{}rewards/accuracies".format(prefix)] = reward_accuracies.cpu().mean()
-        metrics["{}rewards/margins".format(prefix)] = (chosen_rewards - rejected_rewards).cpu().mean()
-        metrics["{}logps/rejected".format(prefix)] = rejected_logps.detach().cpu().mean()
-        metrics["{}logps/chosen".format(prefix)] = chosen_logps.detach().cpu().mean()
-        metrics["{}logits/rejected".format(prefix)] = rejected_logits.detach().cpu().mean()
-        metrics["{}logits/chosen".format(prefix)] = chosen_logits.detach().cpu().mean()
-        metrics["{}sft_loss".format(prefix)] = sft_loss.detach().cpu().mean()
-        metrics["{}odds_ratio_loss".format(prefix)] = odds_ratio_loss.detach().cpu().mean()
+        metrics["{}rewards/chosen".format(prefix)] = chosen_rewards.mean().cpu()
+        metrics["{}rewards/rejected".format(prefix)] = rejected_rewards.mean().cpu()
+        metrics["{}rewards/accuracies".format(prefix)] = reward_accuracies.mean().cpu()
+        metrics["{}rewards/margins".format(prefix)] = (chosen_rewards - rejected_rewards).mean().cpu()
+        metrics["{}logps/rejected".format(prefix)] = rejected_logps.detach().mean().cpu()
+        metrics["{}logps/chosen".format(prefix)] = chosen_logps.detach().mean().cpu()
+        metrics["{}logits/rejected".format(prefix)] = rejected_logits.detach().mean().cpu()
+        metrics["{}logits/chosen".format(prefix)] = chosen_logits.detach().mean().cpu()
+        metrics["{}sft_loss".format(prefix)] = sft_loss.detach().mean().cpu()
+        metrics["{}odds_ratio_loss".format(prefix)] = odds_ratio_loss.detach().mean().cpu()
 
         return batch_loss, metrics

From 7a4a6a55227176bbfa77b063c95296ded6effc94 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 3 May 2024 23:15:19 +0800
Subject: [PATCH 209/341] fix webui resume

Former-commit-id: c2f6582ddd365bb64b72e8057cc4ecd7884d2480
---
 src/llmtuner/webui/runner.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 4ea08348..b04c9b00 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -278,6 +278,9 @@ class Runner:
         yield from self._launch(data, do_train=False)
 
     def monitor(self):
+        self.aborted = False
+        self.running = True
+
         get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)]
         lang = get("top.lang")
         model_name = get("top.model_name")

From 182b9747862a7735c799bc7c6115d8e225c2bd63 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 00:19:19 +0800
Subject: [PATCH 210/341] fix eval in webui

Former-commit-id: 774ef2bf5823d68b9cc254a676f5adb4af533d75
---
 src/llmtuner/extras/callbacks.py       | 85 +++++++++++++++++++-------
 src/llmtuner/webui/common.py           | 10 ++-
 src/llmtuner/webui/components/eval.py  | 15 ++---
 src/llmtuner/webui/components/train.py |  4 +-
 src/llmtuner/webui/runner.py           |  4 +-
 src/llmtuner/webui/utils.py            |  4 +-
 6 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/src/llmtuner/extras/callbacks.py b/src/llmtuner/extras/callbacks.py
index 76f07a42..a07c7059 100644
--- a/src/llmtuner/extras/callbacks.py
+++ b/src/llmtuner/extras/callbacks.py
@@ -5,7 +5,7 @@ import signal
 import time
 from concurrent.futures import ThreadPoolExecutor
 from datetime import timedelta
-from typing import TYPE_CHECKING, Any, Dict
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import transformers
 from transformers import TrainerCallback
@@ -38,8 +38,20 @@ class FixValueHeadModelCallback(TrainerCallback):
 
 class LogCallback(TrainerCallback):
     def __init__(self, output_dir: str) -> None:
+        r"""
+        Initializes a callback for logging training and evaluation status.
+        """
+        """ Progress """
+        self.start_time = 0
+        self.cur_steps = 0
+        self.max_steps = 0
+        self.elapsed_time = ""
+        self.remaining_time = ""
+        self.thread_pool: Optional["ThreadPoolExecutor"] = None
+        """ Status """
         self.aborted = False
         self.do_train = False
+        """ Web UI """
         self.webui_mode = bool(int(os.environ.get("LLAMABOARD_ENABLED", "0")))
         if self.webui_mode:
             signal.signal(signal.SIGABRT, self._set_abort)
@@ -66,6 +78,19 @@ class LogCallback(TrainerCallback):
         self.elapsed_time = str(timedelta(seconds=int(elapsed_time)))
         self.remaining_time = str(timedelta(seconds=int(remaining_time)))
 
+    def _write_log(self, output_dir: str, logs: Dict[str, Any]) -> None:
+        with open(os.path.join(output_dir, TRAINER_LOG), "a", encoding="utf-8") as f:
+            f.write(json.dumps(logs) + "\n")
+
+    def _create_thread_pool(self, output_dir: str) -> None:
+        os.makedirs(output_dir, exist_ok=True)
+        self.thread_pool = ThreadPoolExecutor(max_workers=1)
+
+    def _close_thread_pool(self) -> None:
+        if self.thread_pool is not None:
+            self.thread_pool.shutdown(wait=True)
+            self.thread_pool = None
+
     def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
         Event called at the beginning of training.
@@ -73,8 +98,7 @@ class LogCallback(TrainerCallback):
         if args.should_save:
             self.do_train = True
             self._reset(max_steps=state.max_steps)
-            os.makedirs(args.output_dir, exist_ok=True)
-            self.thread_pool = ThreadPoolExecutor(max_workers=1)
+            self._create_thread_pool(output_dir=args.output_dir)
 
         if (
             args.should_save
@@ -84,6 +108,12 @@ class LogCallback(TrainerCallback):
             logger.warning("Previous trainer log in this folder will be deleted.")
             os.remove(os.path.join(args.output_dir, TRAINER_LOG))
 
+    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        r"""
+        Event called at the end of training.
+        """
+        self._close_thread_pool()
+
     def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
         Event called at the end of an substep during gradient accumulation.
@@ -103,31 +133,19 @@ class LogCallback(TrainerCallback):
             control.should_epoch_stop = True
             control.should_training_stop = True
 
-    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+    def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
-        Event called at the end of training.
+        Event called after an evaluation phase.
         """
-        self.thread_pool.shutdown(wait=True)
-        self.thread_pool = None
+        self._close_thread_pool()
 
-    def on_prediction_step(
-        self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs
-    ):
+    def on_predict(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
-        Event called after a prediction step.
+        Event called after a successful prediction.
         """
-        eval_dataloader = kwargs.pop("eval_dataloader", None)
-        if args.should_save and has_length(eval_dataloader) and not self.do_train:
-            if self.max_steps == 0:
-                self.max_steps = len(eval_dataloader)
+        self._close_thread_pool()
 
-            self._timing(cur_steps=self.cur_steps + 1)
-
-    def _write_log(self, output_dir: str, logs: Dict[str, Any]):
-        with open(os.path.join(output_dir, TRAINER_LOG), "a", encoding="utf-8") as f:
-            f.write(json.dumps(logs) + "\n")
-
-    def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs) -> None:
+    def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
         Event called after logging the last logs.
         """
@@ -158,3 +176,26 @@ class LogCallback(TrainerCallback):
 
         if self.thread_pool is not None:
             self.thread_pool.submit(self._write_log, args.output_dir, logs)
+
+    def on_prediction_step(
+        self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs
+    ):
+        r"""
+        Event called after a prediction step.
+        """
+        eval_dataloader = kwargs.pop("eval_dataloader", None)
+        if args.should_save and has_length(eval_dataloader) and not self.do_train:
+            if self.max_steps == 0:
+                self._reset(max_steps=len(eval_dataloader))
+                self._create_thread_pool(output_dir=args.output_dir)
+
+            self._timing(cur_steps=self.cur_steps + 1)
+            if self.cur_steps % 5 == 0 and self.thread_pool is not None:
+                logs = dict(
+                    current_steps=self.cur_steps,
+                    total_steps=self.max_steps,
+                    percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
+                    elapsed_time=self.elapsed_time,
+                    remaining_time=self.remaining_time,
+                )
+                self.thread_pool.submit(self._write_log, args.output_dir, logs)
diff --git a/src/llmtuner/webui/common.py b/src/llmtuner/webui/common.py
index a33e3db7..d569f1fa 100644
--- a/src/llmtuner/webui/common.py
+++ b/src/llmtuner/webui/common.py
@@ -17,6 +17,7 @@ from ..extras.constants import (
     TRAINING_STAGES,
     DownloadSource,
 )
+from ..extras.logging import get_logger
 from ..extras.misc import use_modelscope
 from ..extras.packages import is_gradio_available
 
@@ -25,6 +26,9 @@ if is_gradio_available():
     import gradio as gr
 
 
+logger = get_logger(__name__)
+
+
 ADAPTER_NAMES = {WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME}
 DEFAULT_CACHE_DIR = "cache"
 DEFAULT_CONFIG_DIR = "config"
@@ -128,11 +132,15 @@ def list_adapters(model_name: str, finetuning_type: str) -> "gr.Dropdown":
 
 
 def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]:
+    if dataset_dir == "ONLINE":
+        logger.info("dataset_dir is ONLINE, using online dataset.")
+        return {}
+
     try:
         with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f:
             return json.load(f)
     except Exception as err:
-        print("Cannot open {} due to {}.".format(os.path.join(dataset_dir, DATA_CONFIG), str(err)))
+        logger.warning("Cannot open {} due to {}.".format(os.path.join(dataset_dir, DATA_CONFIG), str(err)))
         return {}
 
 
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index 3910a746..222f9314 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -21,16 +21,16 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Row():
         dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=2)
-        dataset = gr.Dropdown(multiselect=True, scale=4)
+        dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
     input_elems.update({dataset_dir, dataset})
     elem_dict.update(dict(dataset_dir=dataset_dir, dataset=dataset, **preview_elems))
 
     with gr.Row():
-        cutoff_len = gr.Slider(value=1024, minimum=4, maximum=8192, step=1)
+        cutoff_len = gr.Slider(value=1024, minimum=4, maximum=65536, step=1)
         max_samples = gr.Textbox(value="100000")
-        batch_size = gr.Slider(value=8, minimum=1, maximum=512, step=1)
+        batch_size = gr.Slider(value=2, minimum=1, maximum=1024, step=1)
         predict = gr.Checkbox(value=True)
 
     input_elems.update({cutoff_len, max_samples, batch_size, predict})
@@ -48,30 +48,27 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
     with gr.Row():
         cmd_preview_btn = gr.Button()
         start_btn = gr.Button(variant="primary")
-        stop_btn = gr.Button(variant="stop")
 
     with gr.Row():
         resume_btn = gr.Checkbox(visible=False, interactive=False)
-        process_bar = gr.Slider(visible=False, interactive=False)
+        progress_bar = gr.Slider(visible=False, interactive=False)
 
     with gr.Row():
         output_box = gr.Markdown()
 
-    output_elems = [output_box, process_bar]
+    output_elems = [output_box, progress_bar]
     elem_dict.update(
         dict(
             cmd_preview_btn=cmd_preview_btn,
             start_btn=start_btn,
-            stop_btn=stop_btn,
             resume_btn=resume_btn,
-            process_bar=process_bar,
+            progress_bar=progress_bar,
             output_box=output_box,
         )
     )
 
     cmd_preview_btn.click(engine.runner.preview_eval, input_elems, output_elems, concurrency_limit=None)
     start_btn.click(engine.runner.run_eval, input_elems, output_elems)
-    stop_btn.click(engine.runner.set_abort)
     resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
 
     dataset_dir.change(list_dataset, [dataset_dir], [dataset], queue=False)
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index c709b916..857c56ac 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -27,7 +27,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=1
         )
         dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=1)
-        dataset = gr.Dropdown(multiselect=True, scale=4)
+        dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4)
         preview_elems = create_preview_box(dataset_dir, dataset)
 
     input_elems.update({training_stage, dataset_dir, dataset})
@@ -52,7 +52,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
     )
 
     with gr.Row():
-        cutoff_len = gr.Slider(value=1024, minimum=4, maximum=16384, step=1)
+        cutoff_len = gr.Slider(value=1024, minimum=4, maximum=65536, step=1)
         batch_size = gr.Slider(value=2, minimum=1, maximum=1024, step=1)
         gradient_accumulation_steps = gr.Slider(value=8, minimum=1, maximum=1024, step=1)
         val_size = gr.Slider(value=0, minimum=0, maximum=1, step=0.001)
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index b04c9b00..59515a62 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -299,12 +299,12 @@ class Runner:
                     progress_bar: gr.Slider(visible=False),
                 }
             else:
-                running_log, running_progress, running_loss = get_trainer_info(output_path)
+                running_log, running_progress, running_loss = get_trainer_info(output_path, self.do_train)
                 return_dict = {
                     output_box: running_log,
                     progress_bar: running_progress,
                 }
-                if self.do_train and running_loss is not None:
+                if running_loss is not None:
                     return_dict[loss_viewer] = running_loss
 
                 yield return_dict
diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py
index c8729d36..1f2b0591 100644
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -63,7 +63,7 @@ def get_time() -> str:
     return datetime.now().strftime(r"%Y-%m-%d-%H-%M-%S")
 
 
-def get_trainer_info(output_path: os.PathLike) -> Tuple[str, "gr.Slider", Optional["gr.Plot"]]:
+def get_trainer_info(output_path: os.PathLike, do_train: bool) -> Tuple[str, "gr.Slider", Optional["gr.Plot"]]:
     running_log = ""
     running_progress = gr.Slider(visible=False)
     running_loss = None
@@ -91,7 +91,7 @@ def get_trainer_info(output_path: os.PathLike) -> Tuple[str, "gr.Slider", Option
             )
             running_progress = gr.Slider(label=label, value=percentage, visible=True)
 
-            if is_matplotlib_available():
+            if do_train and is_matplotlib_available():
                 running_loss = gr.Plot(gen_loss_plot(trainer_log))
 
     return running_log, running_progress, running_loss

From 99125c882582d417de74c45b2bdc3b19aef7b496 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 00:31:02 +0800
Subject: [PATCH 211/341] update readme

Former-commit-id: 012e5b9625682a628a0b7fb5879097be7166c7be
---
 README.md    | 10 +++++++++-
 README_zh.md | 10 +++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8caac93f..5e7d61ea 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-34-green)](#projects-using-llama-factory)
+[![Citation](https://img.shields.io/badge/citation-42-green)](#projects-using-llama-factory)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -441,6 +441,7 @@ If you have a project that should be incorporated, please contact via email or c
 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
+1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073)
 1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541)
 1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246)
 1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
@@ -448,7 +449,14 @@ If you have a project that should be incorporated, please contact via email or c
 1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604)
 1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827)
 1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167)
+1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. 2024. [[arxiv]](https://arxiv.org/abs/2404.04316)
 1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084)
+1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836)
+1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581)
+1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215)
+1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621)
+1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2404.17140)
+1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B.
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge.
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.
diff --git a/README_zh.md b/README_zh.md
index 27522232..bfb9feaa 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-34-green)](#使用了-llama-factory-的项目)
+[![Citation](https://img.shields.io/badge/citation-42-green)](#使用了-llama-factory-的项目)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -441,6 +441,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
+1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073)
 1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541)
 1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246)
 1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
@@ -448,7 +449,14 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604)
 1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827)
 1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167)
+1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. 2024. [[arxiv]](https://arxiv.org/abs/2404.04316)
 1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084)
+1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836)
+1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581)
+1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215)
+1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621)
+1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2404.17140)
+1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper，基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM，基于 Baichuan-13B 微调而得，具有法律推理和知识检索能力。
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao，基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。

From 37bcbf72b45c14eb70f0f14e808f47c9101ebb67 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 00:43:02 +0800
Subject: [PATCH 212/341] update readme and webui launch

Former-commit-id: c66ffa57323ef6ea78a9b75ec5122d9ea25fd420
---
 README.md                       | 8 +++++---
 README_zh.md                    | 8 +++++---
 src/llmtuner/webui/interface.py | 4 ++--
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 5e7d61ea..4f363099 100644
--- a/README.md
+++ b/README.md
@@ -344,11 +344,12 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 #### Use local environment
 
 ```bash
-export CUDA_VISIBLE_DEVICES=0 # `set CUDA_VISIBLE_DEVICES=0` for Windows
-export GRADIO_SERVER_PORT=7860 # `set GRADIO_SERVER_PORT=7860` for Windows
 llamafactory-cli webui
 ```
 
+> [!TIPS]
+> To modify the default setting in the LLaMA Board GUI, you can use environment variables, e.g., `export CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False` (use `set` command on Windows OS).
+
 <details><summary>For Alibaba Cloud users</summary>
 
 If you encountered display problems in LLaMA Board on Alibaba Cloud, try using the following command to set environment variables before starting LLaMA Board:
@@ -392,7 +393,8 @@ docker compose -f ./docker-compose.yml up -d
 
 See [examples/README.md](examples/README.md) for usage.
 
-Use `llamafactory-cli train -h` to display arguments description.
+> [!TIPS]
+> Use `llamafactory-cli train -h` to display arguments description.
 
 ### Deploy with OpenAI-style API and vLLM
 
diff --git a/README_zh.md b/README_zh.md
index bfb9feaa..8f9d5513 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -344,11 +344,12 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 #### 使用本地环境
 
 ```bash
-export CUDA_VISIBLE_DEVICES=0 # Windows 使用 `set CUDA_VISIBLE_DEVICES=0`
-export GRADIO_SERVER_PORT=7860 # Windows 使用 `set GRADIO_SERVER_PORT=7860`
 llamafactory-cli webui
 ```
 
+> [!TIPS]
+> 您可以使用环境变量来修改 LLaMA Board 可视化界面的默认设置，例如 `export CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False`（Windows 系统可使用 `set` 指令）。
+
 <details><summary>阿里云用户指南</summary>
 
 如果您在阿里云上使用 LLaMA Board 时遇到显示问题，请尝试在启动前使用以下命令设置环境变量：
@@ -392,7 +393,8 @@ docker compose -f ./docker-compose.yml up -d
 
 使用方法请参考 [examples/README_zh.md](examples/README_zh.md)。
 
-您可以执行 `llamafactory-cli train -h` 来查看参数文档。
+> [!TIPS]
+> 您可以执行 `llamafactory-cli train -h` 来查看参数文档。
 
 ### 利用 vLLM 部署 OpenAI API
 
diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index 5f17d76d..459802f2 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -69,8 +69,8 @@ def create_web_demo() -> gr.Blocks:
 
 
 def run_web_ui():
-    create_ui().queue().launch(server_name="0.0.0.0")
+    create_ui().queue().launch()
 
 
 def run_web_demo():
-    create_web_demo().queue().launch(server_name="0.0.0.0")
+    create_web_demo().queue().launch()

From b1b18b2c5ade54fa8d1ea34f822b881f00417874 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 00:43:53 +0800
Subject: [PATCH 213/341] update readme

Former-commit-id: 5061f7196a3278af5ebce77249d9c3c0f8a55b34
---
 README.md    | 4 ++--
 README_zh.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 4f363099..45732220 100644
--- a/README.md
+++ b/README.md
@@ -347,7 +347,7 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 llamafactory-cli webui
 ```
 
-> [!TIPS]
+> [!TIP]
 > To modify the default setting in the LLaMA Board GUI, you can use environment variables, e.g., `export CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False` (use `set` command on Windows OS).
 
 <details><summary>For Alibaba Cloud users</summary>
@@ -393,7 +393,7 @@ docker compose -f ./docker-compose.yml up -d
 
 See [examples/README.md](examples/README.md) for usage.
 
-> [!TIPS]
+> [!TIP]
 > Use `llamafactory-cli train -h` to display arguments description.
 
 ### Deploy with OpenAI-style API and vLLM
diff --git a/README_zh.md b/README_zh.md
index 8f9d5513..4db1f843 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -347,7 +347,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 llamafactory-cli webui
 ```
 
-> [!TIPS]
+> [!TIP]
 > 您可以使用环境变量来修改 LLaMA Board 可视化界面的默认设置，例如 `export CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False`（Windows 系统可使用 `set` 指令）。
 
 <details><summary>阿里云用户指南</summary>
@@ -393,7 +393,7 @@ docker compose -f ./docker-compose.yml up -d
 
 使用方法请参考 [examples/README_zh.md](examples/README_zh.md)。
 
-> [!TIPS]
+> [!TIP]
 > 您可以执行 `llamafactory-cli train -h` 来查看参数文档。
 
 ### 利用 vLLM 部署 OpenAI API

From efa9140577995a5d899a570e39785c75ab2b84e8 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 15:59:15 +0800
Subject: [PATCH 214/341] update api and support abort eval in webui

Former-commit-id: 8661bed68812e9ded9439e8a821b1d7716bc797b
---
 src/llmtuner/api/app.py               | 205 +++++---------------------
 src/llmtuner/api/chat.py              | 176 ++++++++++++++++++++++
 src/llmtuner/api/common.py            |  20 +++
 src/llmtuner/api/protocol.py          |  12 +-
 src/llmtuner/chat/chat_model.py       |   2 +-
 src/llmtuner/eval/evaluator.py        |   5 +-
 src/llmtuner/extras/callbacks.py      |  36 +++--
 src/llmtuner/train/tuner.py           |   4 +-
 src/llmtuner/webui/components/eval.py |   3 +
 src/llmtuner/webui/interface.py       |   4 +-
 src/llmtuner/webui/locales.py         |   2 +-
 11 files changed, 277 insertions(+), 192 deletions(-)
 create mode 100644 src/llmtuner/api/chat.py
 create mode 100644 src/llmtuner/api/common.py

diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py
index 36918d1b..375ee61f 100644
--- a/src/llmtuner/api/app.py
+++ b/src/llmtuner/api/app.py
@@ -1,36 +1,29 @@
-import json
 import os
 from contextlib import asynccontextmanager
-from typing import Any, Dict, Sequence
-
-from pydantic import BaseModel
+from typing import Annotated, Optional
 
 from ..chat import ChatModel
-from ..data import Role as DataRole
 from ..extras.misc import torch_gc
 from ..extras.packages import is_fastapi_availble, is_starlette_available, is_uvicorn_available
+from .chat import (
+    create_chat_completion_response,
+    create_score_evaluation_response,
+    create_stream_chat_completion_response,
+)
 from .protocol import (
-    ChatCompletionMessage,
     ChatCompletionRequest,
     ChatCompletionResponse,
-    ChatCompletionResponseChoice,
-    ChatCompletionResponseStreamChoice,
-    ChatCompletionResponseUsage,
-    ChatCompletionStreamResponse,
-    Finish,
-    Function,
-    FunctionCall,
     ModelCard,
     ModelList,
-    Role,
     ScoreEvaluationRequest,
     ScoreEvaluationResponse,
 )
 
 
 if is_fastapi_availble():
-    from fastapi import FastAPI, HTTPException, status
+    from fastapi import Depends, FastAPI, HTTPException, status
     from fastapi.middleware.cors import CORSMiddleware
+    from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer
 
 
 if is_starlette_available():
@@ -47,23 +40,8 @@ async def lifespan(app: "FastAPI"):  # collects GPU memory
     torch_gc()
 
 
-def dictify(data: "BaseModel") -> Dict[str, Any]:
-    try:  # pydantic v2
-        return data.model_dump(exclude_unset=True)
-    except AttributeError:  # pydantic v1
-        return data.dict(exclude_unset=True)
-
-
-def jsonify(data: "BaseModel") -> str:
-    try:  # pydantic v2
-        return json.dumps(data.model_dump(exclude_unset=True), ensure_ascii=False)
-    except AttributeError:  # pydantic v1
-        return data.json(exclude_unset=True, ensure_ascii=False)
-
-
 def create_app(chat_model: "ChatModel") -> "FastAPI":
     app = FastAPI(lifespan=lifespan)
-
     app.add_middleware(
         CORSMiddleware,
         allow_origins=["*"],
@@ -71,161 +49,58 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
         allow_methods=["*"],
         allow_headers=["*"],
     )
+    api_key = os.environ.get("API_KEY", None)
+    security = HTTPBearer(auto_error=False)
 
-    role_mapping = {
-        Role.USER: DataRole.USER.value,
-        Role.ASSISTANT: DataRole.ASSISTANT.value,
-        Role.SYSTEM: DataRole.SYSTEM.value,
-        Role.FUNCTION: DataRole.FUNCTION.value,
-        Role.TOOL: DataRole.OBSERVATION.value,
-    }
+    async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]):
+        if api_key and (auth is None or auth.credentials != api_key):
+            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.")
 
-    @app.get("/v1/models", response_model=ModelList)
+    @app.get(
+        "/v1/models",
+        response_model=ModelList,
+        status_code=status.HTTP_200_OK,
+        dependencies=[Depends(verify_api_key)],
+    )
     async def list_models():
         model_card = ModelCard(id="gpt-3.5-turbo")
         return ModelList(data=[model_card])
 
-    @app.post("/v1/chat/completions", response_model=ChatCompletionResponse, status_code=status.HTTP_200_OK)
+    @app.post(
+        "/v1/chat/completions",
+        response_model=ChatCompletionResponse,
+        status_code=status.HTTP_200_OK,
+        dependencies=[Depends(verify_api_key)],
+    )
     async def create_chat_completion(request: ChatCompletionRequest):
         if not chat_model.engine.can_generate:
             raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
 
-        if len(request.messages) == 0:
-            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
-
-        if request.messages[0].role == Role.SYSTEM:
-            system = request.messages.pop(0).content
-        else:
-            system = ""
-
-        if len(request.messages) % 2 == 0:
-            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...")
-
-        input_messages = []
-        for i, message in enumerate(request.messages):
-            if i % 2 == 0 and message.role not in [Role.USER, Role.TOOL]:
-                raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
-            elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]:
-                raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
-
-            if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls):
-                name = message.tool_calls[0].function.name
-                arguments = message.tool_calls[0].function.arguments
-                content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False)
-                input_messages.append({"role": role_mapping[Role.FUNCTION], "content": content})
-            else:
-                input_messages.append({"role": role_mapping[message.role], "content": message.content})
-
-        tool_list = request.tools
-        if isinstance(tool_list, list) and len(tool_list):
-            try:
-                tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False)
-            except Exception:
-                raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools")
-        else:
-            tools = ""
-
         if request.stream:
-            if tools:
-                raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.")
-
-            generate = stream_chat_completion(input_messages, system, tools, request)
+            generate = create_stream_chat_completion_response(request, chat_model)
             return EventSourceResponse(generate, media_type="text/event-stream")
+        else:
+            return await create_chat_completion_response(request, chat_model)
 
-        responses = await chat_model.achat(
-            input_messages,
-            system,
-            tools,
-            do_sample=request.do_sample,
-            temperature=request.temperature,
-            top_p=request.top_p,
-            max_new_tokens=request.max_tokens,
-            num_return_sequences=request.n,
-        )
-
-        prompt_length, response_length = 0, 0
-        choices = []
-        for i, response in enumerate(responses):
-            if tools:
-                result = chat_model.engine.template.format_tools.extract(response.response_text)
-            else:
-                result = response.response_text
-
-            if isinstance(result, tuple):
-                name, arguments = result
-                function = Function(name=name, arguments=arguments)
-                response_message = ChatCompletionMessage(
-                    role=Role.ASSISTANT, tool_calls=[FunctionCall(function=function)]
-                )
-                finish_reason = Finish.TOOL
-            else:
-                response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result)
-                finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH
-
-            choices.append(
-                ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason)
-            )
-            prompt_length = response.prompt_length
-            response_length += response.response_length
-
-        usage = ChatCompletionResponseUsage(
-            prompt_tokens=prompt_length,
-            completion_tokens=response_length,
-            total_tokens=prompt_length + response_length,
-        )
-
-        return ChatCompletionResponse(model=request.model, choices=choices, usage=usage)
-
-    async def stream_chat_completion(
-        messages: Sequence[Dict[str, str]], system: str, tools: str, request: ChatCompletionRequest
-    ):
-        choice_data = ChatCompletionResponseStreamChoice(
-            index=0, delta=ChatCompletionMessage(role=Role.ASSISTANT, content=""), finish_reason=None
-        )
-        chunk = ChatCompletionStreamResponse(model=request.model, choices=[choice_data])
-        yield jsonify(chunk)
-
-        async for new_token in chat_model.astream_chat(
-            messages,
-            system,
-            tools,
-            do_sample=request.do_sample,
-            temperature=request.temperature,
-            top_p=request.top_p,
-            max_new_tokens=request.max_tokens,
-        ):
-            if len(new_token) == 0:
-                continue
-
-            choice_data = ChatCompletionResponseStreamChoice(
-                index=0, delta=ChatCompletionMessage(content=new_token), finish_reason=None
-            )
-            chunk = ChatCompletionStreamResponse(model=request.model, choices=[choice_data])
-            yield jsonify(chunk)
-
-        choice_data = ChatCompletionResponseStreamChoice(
-            index=0, delta=ChatCompletionMessage(), finish_reason=Finish.STOP
-        )
-        chunk = ChatCompletionStreamResponse(model=request.model, choices=[choice_data])
-        yield jsonify(chunk)
-        yield "[DONE]"
-
-    @app.post("/v1/score/evaluation", response_model=ScoreEvaluationResponse, status_code=status.HTTP_200_OK)
+    @app.post(
+        "/v1/score/evaluation",
+        response_model=ScoreEvaluationResponse,
+        status_code=status.HTTP_200_OK,
+        dependencies=[Depends(verify_api_key)],
+    )
     async def create_score_evaluation(request: ScoreEvaluationRequest):
         if chat_model.engine.can_generate:
             raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
 
-        if len(request.messages) == 0:
-            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request")
-
-        scores = await chat_model.aget_scores(request.messages, max_length=request.max_length)
-        return ScoreEvaluationResponse(model=request.model, scores=scores)
+        return await create_score_evaluation_response(request, chat_model)
 
     return app
 
 
-def run_api():
+def run_api() -> None:
     chat_model = ChatModel()
     app = create_app(chat_model)
-    print("Visit http://localhost:{}/docs for API document.".format(os.environ.get("API_PORT", 8000)))
-    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("API_PORT", 8000)), workers=1)
+    api_host = os.environ.get("API_HOST", "0.0.0.0")
+    api_port = int(os.environ.get("API_PORT", "8000"))
+    print("Visit http://localhost:{}/docs for API document.".format(api_port))
+    uvicorn.run(app, host=api_host, port=api_port)
diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py
new file mode 100644
index 00000000..c9c00f16
--- /dev/null
+++ b/src/llmtuner/api/chat.py
@@ -0,0 +1,176 @@
+import json
+import uuid
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Optional, Tuple
+
+from ..data import Role as DataRole
+from ..extras.packages import is_fastapi_availble
+from .common import dictify, jsonify
+from .protocol import (
+    ChatCompletionMessage,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseUsage,
+    ChatCompletionStreamResponse,
+    ChatCompletionStreamResponseChoice,
+    Finish,
+    Function,
+    FunctionCall,
+    Role,
+    ScoreEvaluationResponse,
+)
+
+
+if is_fastapi_availble():
+    from fastapi import HTTPException, status
+
+
+if TYPE_CHECKING:
+    from ..chat import ChatModel
+    from .protocol import ChatCompletionRequest, ScoreEvaluationRequest
+
+
+ROLE_MAPPING = {
+    Role.USER: DataRole.USER.value,
+    Role.ASSISTANT: DataRole.ASSISTANT.value,
+    Role.SYSTEM: DataRole.SYSTEM.value,
+    Role.FUNCTION: DataRole.FUNCTION.value,
+    Role.TOOL: DataRole.OBSERVATION.value,
+}
+
+
+async def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, str]], str, str]:
+    if len(request.messages) == 0:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
+
+    if request.messages[0].role == Role.SYSTEM:
+        system = request.messages.pop(0).content
+    else:
+        system = ""
+
+    if len(request.messages) % 2 == 0:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...")
+
+    input_messages = []
+    for i, message in enumerate(request.messages):
+        if i % 2 == 0 and message.role not in [Role.USER, Role.TOOL]:
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
+        elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]:
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
+
+        if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls):
+            name = message.tool_calls[0].function.name
+            arguments = message.tool_calls[0].function.arguments
+            content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False)
+            input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content})
+        else:
+            input_messages.append({"role": ROLE_MAPPING[message.role], "content": message.content})
+
+    tool_list = request.tools
+    if isinstance(tool_list, list) and len(tool_list):
+        try:
+            tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False)
+        except Exception:
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools")
+    else:
+        tools = ""
+
+    return input_messages, system, tools
+
+
+async def create_chat_completion_response(
+    request: "ChatCompletionRequest", chat_model: "ChatModel"
+) -> "ChatCompletionResponse":
+    completion_id = "chatcmpl-{}".format(uuid.uuid4().hex)
+    input_messages, system, tools = await _process_request(request)
+    responses = await chat_model.achat(
+        input_messages,
+        system,
+        tools,
+        do_sample=request.do_sample,
+        temperature=request.temperature,
+        top_p=request.top_p,
+        max_new_tokens=request.max_tokens,
+        num_return_sequences=request.n,
+    )
+
+    prompt_length, response_length = 0, 0
+    choices = []
+    for i, response in enumerate(responses):
+        if tools:
+            result = chat_model.engine.template.format_tools.extract(response.response_text)
+        else:
+            result = response.response_text
+
+        if isinstance(result, tuple):
+            name, arguments = result
+            function = Function(name=name, arguments=arguments)
+            tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function)
+            response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=[tool_call])
+            finish_reason = Finish.TOOL
+        else:
+            response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result)
+            finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH
+
+        choices.append(ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason))
+        prompt_length = response.prompt_length
+        response_length += response.response_length
+
+    usage = ChatCompletionResponseUsage(
+        prompt_tokens=prompt_length,
+        completion_tokens=response_length,
+        total_tokens=prompt_length + response_length,
+    )
+
+    return ChatCompletionResponse(id=completion_id, model=request.model, choices=choices, usage=usage)
+
+
+async def _create_stream_chat_completion_chunk(
+    completion_id: str,
+    model: str,
+    delta: "ChatCompletionMessage",
+    index: Optional[int] = 0,
+    finish_reason: Optional["Finish"] = None,
+) -> str:
+    choice_data = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason)
+    chunk = ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice_data])
+    return jsonify(chunk)
+
+
+async def create_stream_chat_completion_response(
+    request: "ChatCompletionRequest", chat_model: "ChatModel"
+) -> AsyncGenerator[str, None]:
+    completion_id = "chatcmpl-{}".format(uuid.uuid4().hex)
+    input_messages, system, tools = await _process_request(request)
+    if tools:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.")
+
+    yield _create_stream_chat_completion_chunk(
+        completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(role=Role.ASSISTANT, content="")
+    )
+    async for new_token in chat_model.astream_chat(
+        input_messages,
+        system,
+        tools,
+        do_sample=request.do_sample,
+        temperature=request.temperature,
+        top_p=request.top_p,
+        max_new_tokens=request.max_tokens,
+    ):
+        yield _create_stream_chat_completion_chunk(
+            completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(content=new_token)
+        )
+
+    yield _create_stream_chat_completion_chunk(
+        completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(), finish_reason=Finish.STOP
+    )
+    yield "[DONE]"
+
+
+async def create_score_evaluation_response(
+    request: "ScoreEvaluationRequest", chat_model: "ChatModel"
+) -> "ScoreEvaluationResponse":
+    if len(request.messages) == 0:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request")
+
+    scores = await chat_model.aget_scores(request.messages, max_length=request.max_length)
+    return ScoreEvaluationResponse(model=request.model, scores=scores)
diff --git a/src/llmtuner/api/common.py b/src/llmtuner/api/common.py
new file mode 100644
index 00000000..5ad9a071
--- /dev/null
+++ b/src/llmtuner/api/common.py
@@ -0,0 +1,20 @@
+import json
+from typing import TYPE_CHECKING, Any, Dict
+
+
+if TYPE_CHECKING:
+    from pydantic import BaseModel
+
+
+def dictify(data: "BaseModel") -> Dict[str, Any]:
+    try:  # pydantic v2
+        return data.model_dump(exclude_unset=True)
+    except AttributeError:  # pydantic v1
+        return data.dict(exclude_unset=True)
+
+
+def jsonify(data: "BaseModel") -> str:
+    try:  # pydantic v2
+        return json.dumps(data.model_dump(exclude_unset=True), ensure_ascii=False)
+    except AttributeError:  # pydantic v1
+        return data.json(exclude_unset=True, ensure_ascii=False)
diff --git a/src/llmtuner/api/protocol.py b/src/llmtuner/api/protocol.py
index ece2132b..ae6e2e9b 100644
--- a/src/llmtuner/api/protocol.py
+++ b/src/llmtuner/api/protocol.py
@@ -51,7 +51,7 @@ class FunctionAvailable(BaseModel):
 
 
 class FunctionCall(BaseModel):
-    id: Literal["call_default"] = "call_default"
+    id: str
     type: Literal["function"] = "function"
     function: Function
 
@@ -86,7 +86,7 @@ class ChatCompletionResponseChoice(BaseModel):
     finish_reason: Finish
 
 
-class ChatCompletionResponseStreamChoice(BaseModel):
+class ChatCompletionStreamResponseChoice(BaseModel):
     index: int
     delta: ChatCompletionMessage
     finish_reason: Optional[Finish] = None
@@ -99,7 +99,7 @@ class ChatCompletionResponseUsage(BaseModel):
 
 
 class ChatCompletionResponse(BaseModel):
-    id: Literal["chatcmpl-default"] = "chatcmpl-default"
+    id: str
     object: Literal["chat.completion"] = "chat.completion"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
@@ -108,11 +108,11 @@ class ChatCompletionResponse(BaseModel):
 
 
 class ChatCompletionStreamResponse(BaseModel):
-    id: Literal["chatcmpl-default"] = "chatcmpl-default"
+    id: str
     object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
-    choices: List[ChatCompletionResponseStreamChoice]
+    choices: List[ChatCompletionStreamResponseChoice]
 
 
 class ScoreEvaluationRequest(BaseModel):
@@ -122,7 +122,7 @@ class ScoreEvaluationRequest(BaseModel):
 
 
 class ScoreEvaluationResponse(BaseModel):
-    id: Literal["scoreeval-default"] = "scoreeval-default"
+    id: str
     object: Literal["score.evaluation"] = "score.evaluation"
     model: str
     scores: List[float]
diff --git a/src/llmtuner/chat/chat_model.py b/src/llmtuner/chat/chat_model.py
index 97ae87d7..281ef0c1 100644
--- a/src/llmtuner/chat/chat_model.py
+++ b/src/llmtuner/chat/chat_model.py
@@ -98,7 +98,7 @@ class ChatModel:
         return await self.engine.get_scores(batch_input, **input_kwargs)
 
 
-def run_chat():
+def run_chat() -> None:
     try:
         import platform
 
diff --git a/src/llmtuner/eval/evaluator.py b/src/llmtuner/eval/evaluator.py
index 4ea134c6..192f4815 100644
--- a/src/llmtuner/eval/evaluator.py
+++ b/src/llmtuner/eval/evaluator.py
@@ -118,6 +118,5 @@ class Evaluator:
                 f.write(score_info)
 
 
-def run_eval():
-    evaluator = Evaluator()
-    evaluator.eval()
+def run_eval() -> None:
+    Evaluator().eval()
diff --git a/src/llmtuner/extras/callbacks.py b/src/llmtuner/extras/callbacks.py
index a07c7059..a142928a 100644
--- a/src/llmtuner/extras/callbacks.py
+++ b/src/llmtuner/extras/callbacks.py
@@ -2,6 +2,7 @@ import json
 import logging
 import os
 import signal
+import sys
 import time
 from concurrent.futures import ThreadPoolExecutor
 from datetime import timedelta
@@ -91,6 +92,18 @@ class LogCallback(TrainerCallback):
             self.thread_pool.shutdown(wait=True)
             self.thread_pool = None
 
+    def on_init_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        r"""
+        Event called at the end of the initialization of the `Trainer`.
+        """
+        if (
+            args.should_save
+            and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG))
+            and args.overwrite_output_dir
+        ):
+            logger.warning("Previous trainer log in this folder will be deleted.")
+            os.remove(os.path.join(args.output_dir, TRAINER_LOG))
+
     def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
         Event called at the beginning of training.
@@ -100,14 +113,6 @@ class LogCallback(TrainerCallback):
             self._reset(max_steps=state.max_steps)
             self._create_thread_pool(output_dir=args.output_dir)
 
-        if (
-            args.should_save
-            and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG))
-            and args.overwrite_output_dir
-        ):
-            logger.warning("Previous trainer log in this folder will be deleted.")
-            os.remove(os.path.join(args.output_dir, TRAINER_LOG))
-
     def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
         Event called at the end of training.
@@ -126,9 +131,6 @@ class LogCallback(TrainerCallback):
         r"""
         Event called at the end of a training step.
         """
-        if args.should_save:
-            self._timing(cur_steps=state.global_step)
-
         if self.aborted:
             control.should_epoch_stop = True
             control.should_training_stop = True
@@ -152,6 +154,7 @@ class LogCallback(TrainerCallback):
         if not args.should_save:
             return
 
+        self._timing(cur_steps=state.global_step)
         logs = dict(
             current_steps=self.cur_steps,
             total_steps=self.max_steps,
@@ -183,8 +186,17 @@ class LogCallback(TrainerCallback):
         r"""
         Event called after a prediction step.
         """
+        if self.do_train:
+            return
+
+        if self.aborted:
+            sys.exit(0)
+
+        if not args.should_save:
+            return
+
         eval_dataloader = kwargs.pop("eval_dataloader", None)
-        if args.should_save and has_length(eval_dataloader) and not self.do_train:
+        if has_length(eval_dataloader):
             if self.max_steps == 0:
                 self._reset(max_steps=len(eval_dataloader))
                 self._create_thread_pool(output_dir=args.output_dir)
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index 6822ffb5..e1a997c1 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
-def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallback"] = []):
+def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallback"] = []) -> None:
     model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
     callbacks.append(LogCallback(training_args.output_dir))
 
@@ -43,7 +43,7 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallb
         raise ValueError("Unknown task.")
 
 
-def export_model(args: Optional[Dict[str, Any]] = None):
+def export_model(args: Optional[Dict[str, Any]] = None) -> None:
     model_args, data_args, finetuning_args, _ = get_infer_args(args)
 
     if model_args.export_dir is None:
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index 222f9314..60e22bb7 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -48,6 +48,7 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
     with gr.Row():
         cmd_preview_btn = gr.Button()
         start_btn = gr.Button(variant="primary")
+        stop_btn = gr.Button(variant="stop")
 
     with gr.Row():
         resume_btn = gr.Checkbox(visible=False, interactive=False)
@@ -61,6 +62,7 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
         dict(
             cmd_preview_btn=cmd_preview_btn,
             start_btn=start_btn,
+            stop_btn=stop_btn,
             resume_btn=resume_btn,
             progress_bar=progress_bar,
             output_box=output_box,
@@ -69,6 +71,7 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     cmd_preview_btn.click(engine.runner.preview_eval, input_elems, output_elems, concurrency_limit=None)
     start_btn.click(engine.runner.run_eval, input_elems, output_elems)
+    stop_btn.click(engine.runner.set_abort)
     resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None)
 
     dataset_dir.change(list_dataset, [dataset_dir], [dataset], queue=False)
diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index 459802f2..b293db90 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -68,9 +68,9 @@ def create_web_demo() -> gr.Blocks:
     return demo
 
 
-def run_web_ui():
+def run_web_ui() -> None:
     create_ui().queue().launch()
 
 
-def run_web_demo():
+def run_web_demo() -> None:
     create_web_demo().queue().launch()
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index 1c474f34..5bf925b7 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -1449,7 +1449,7 @@ ALERTS = {
     "info_aborting": {
         "en": "Aborted, wait for terminating...",
         "ru": "Прервано, ожидание завершения...",
-        "zh": "训练中断，正在等待线程结束……",
+        "zh": "训练中断，正在等待进程结束……",
     },
     "info_aborted": {
         "en": "Ready.",

From 9381fecca708eb7b251e59204d8471bf82542885 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 16:11:18 +0800
Subject: [PATCH 215/341] fix async stream api response

Former-commit-id: d70bbcae6513e50aa6094f2d98c4aa5c6641ea02
---
 src/llmtuner/api/chat.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py
index c9c00f16..716dec56 100644
--- a/src/llmtuner/api/chat.py
+++ b/src/llmtuner/api/chat.py
@@ -38,7 +38,7 @@ ROLE_MAPPING = {
 }
 
 
-async def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, str]], str, str]:
+def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, str]], str, str]:
     if len(request.messages) == 0:
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
 
@@ -77,11 +77,23 @@ async def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[
     return input_messages, system, tools
 
 
+def _create_stream_chat_completion_chunk(
+    completion_id: str,
+    model: str,
+    delta: "ChatCompletionMessage",
+    index: Optional[int] = 0,
+    finish_reason: Optional["Finish"] = None,
+) -> str:
+    choice_data = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason)
+    chunk = ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice_data])
+    return jsonify(chunk)
+
+
 async def create_chat_completion_response(
     request: "ChatCompletionRequest", chat_model: "ChatModel"
 ) -> "ChatCompletionResponse":
     completion_id = "chatcmpl-{}".format(uuid.uuid4().hex)
-    input_messages, system, tools = await _process_request(request)
+    input_messages, system, tools = _process_request(request)
     responses = await chat_model.achat(
         input_messages,
         system,
@@ -124,23 +136,11 @@ async def create_chat_completion_response(
     return ChatCompletionResponse(id=completion_id, model=request.model, choices=choices, usage=usage)
 
 
-async def _create_stream_chat_completion_chunk(
-    completion_id: str,
-    model: str,
-    delta: "ChatCompletionMessage",
-    index: Optional[int] = 0,
-    finish_reason: Optional["Finish"] = None,
-) -> str:
-    choice_data = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason)
-    chunk = ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice_data])
-    return jsonify(chunk)
-
-
 async def create_stream_chat_completion_response(
     request: "ChatCompletionRequest", chat_model: "ChatModel"
 ) -> AsyncGenerator[str, None]:
     completion_id = "chatcmpl-{}".format(uuid.uuid4().hex)
-    input_messages, system, tools = await _process_request(request)
+    input_messages, system, tools = _process_request(request)
     if tools:
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.")
 

From e9fe8815bece9384ae13b0a2f4916368ef3e8aa8 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 16:13:52 +0800
Subject: [PATCH 216/341] remove empty stream response

Former-commit-id: 070d0da928b1e974a094279a2782201016d2a3ab
---
 src/llmtuner/api/chat.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py
index 716dec56..fa2f0d03 100644
--- a/src/llmtuner/api/chat.py
+++ b/src/llmtuner/api/chat.py
@@ -156,9 +156,10 @@ async def create_stream_chat_completion_response(
         top_p=request.top_p,
         max_new_tokens=request.max_tokens,
     ):
-        yield _create_stream_chat_completion_chunk(
-            completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(content=new_token)
-        )
+        if len(new_token) != 0:
+            yield _create_stream_chat_completion_chunk(
+                completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(content=new_token)
+            )
 
     yield _create_stream_chat_completion_chunk(
         completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(), finish_reason=Finish.STOP

From 6eda42eb7c0904b7bead943a50d24e5fa026fa91 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 17:01:21 +0800
Subject: [PATCH 217/341] update readme

Former-commit-id: eaf83847ef6d89d8b70429138e73b04fd2aa3ef8
---
 README.md    | 2 +-
 README_zh.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 45732220..347ebe7e 100644
--- a/README.md
+++ b/README.md
@@ -339,7 +339,7 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 ### Train with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
 
 > [!IMPORTANT]
-> LLaMA Board GUI only supports training on a single GPU, please use [CLI](#command-line-interface) for distributed training.
+> LLaMA Board GUI only supports training on a single GPU, please use [CLI](#train-with-command-line-interface) for distributed training.
 
 #### Use local environment
 
diff --git a/README_zh.md b/README_zh.md
index 4db1f843..8a2fb79b 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -339,7 +339,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 ### 利用 LLaMA Board 可视化界面训练（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
 
 > [!IMPORTANT]
-> LLaMA Board 可视化界面目前仅支持单 GPU 训练，请使用[命令行接口](#命令行接口)来进行多 GPU 分布式训练。
+> LLaMA Board 可视化界面目前仅支持单 GPU 训练，请使用[命令行接口](#利用命令行接口训练)来进行多 GPU 分布式训练。
 
 #### 使用本地环境
 

From 342d7da8d73bfddef5a60e9a6236db9fc6c0c28c Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 22:02:25 +0800
Subject: [PATCH 218/341] add cal_ppl script

Former-commit-id: 947068c11c0be00db2cecddb2c5842a0d6e2c321
---
 scripts/cal_flops.py  | 12 +++----
 scripts/cal_lr.py     | 17 ++++------
 scripts/cal_ppl.py    | 79 +++++++++++++++++++++++++++++++++++++++++++
 scripts/length_cdf.py |  9 +++--
 4 files changed, 95 insertions(+), 22 deletions(-)
 create mode 100644 scripts/cal_ppl.py

diff --git a/scripts/cal_flops.py b/scripts/cal_flops.py
index 35d98254..19414ce5 100644
--- a/scripts/cal_flops.py
+++ b/scripts/cal_flops.py
@@ -3,24 +3,22 @@
 # Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
 # Inspired by: https://www.deepspeed.ai/tutorials/flops-profiler/
 
-from typing import Optional
-
 import fire
 import torch
 from deepspeed.accelerator import get_accelerator  # type: ignore
 from deepspeed.profiling.flops_profiler import get_model_profile  # type: ignore
 
-from llmtuner import ChatModel
+from llmtuner.chat import ChatModel
 
 
 def calculate_flops(
     model_name_or_path: str,
-    batch_size: Optional[int] = 1,
-    seq_length: Optional[int] = 256,
-    flash_attn: Optional[bool] = False,
+    batch_size: int = 1,
+    seq_length: int = 256,
+    flash_attn: str = "auto",
 ):
     with get_accelerator().device(0):
-        chat_model = ChatModel(dict(model_name_or_path=model_name_or_path, template="vanilla", flash_attn=flash_attn))
+        chat_model = ChatModel(dict(model_name_or_path=model_name_or_path, template="empty", flash_attn=flash_attn))
         fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.model.device)
         input_dict = {"input_ids": fake_input, "labels": fake_input.clone()}
         flops, macs, params = get_model_profile(chat_model.model, kwargs=input_dict, print_profile=True, detailed=True)
diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py
index c1c1f7a2..7bf8839d 100644
--- a/scripts/cal_lr.py
+++ b/scripts/cal_lr.py
@@ -4,7 +4,6 @@
 # Inspired by: https://github.com/imoneoi/openchat/blob/master/ochat/training_deepspeed/train.py
 
 import math
-from typing import Optional
 
 import fire
 import torch
@@ -25,12 +24,12 @@ BASE_BS = 4_000_000  # from llama paper
 def calculate_lr(
     model_name_or_path: str,
     batch_size: int,  # total batch size, namely (batch size * gradient accumulation * world size)
-    stage: Optional[str] = "sft",
-    dataset: Optional[str] = "alpaca_en",
-    dataset_dir: Optional[str] = "data",
-    template: Optional[str] = "default",
-    cutoff_len: Optional[int] = 1024,  # i.e. maximum input length during training
-    is_mistral: Optional[bool] = False,  # mistral model uses a smaller learning rate,
+    stage: str = "sft",
+    dataset: str = "alpaca_en",
+    dataset_dir: str = "data",
+    template: str = "default",
+    cutoff_len: int = 1024,  # i.e. maximum input length during training
+    is_mistral: bool = False,  # mistral model uses a smaller learning rate,
 ):
     model_args, data_args, training_args, _, _ = get_train_args(
         dict(
@@ -54,9 +53,7 @@ def calculate_lr(
     else:
         raise NotImplementedError
 
-    dataloader = DataLoader(
-        dataset=trainset, batch_size=batch_size, shuffle=True, collate_fn=data_collator, pin_memory=True
-    )
+    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
     valid_tokens, total_tokens = 0, 0
     for batch in tqdm(dataloader):
         valid_tokens += torch.sum(batch["labels"] != IGNORE_INDEX).item()
diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py
new file mode 100644
index 00000000..bdfc210b
--- /dev/null
+++ b/scripts/cal_ppl.py
@@ -0,0 +1,79 @@
+# coding=utf-8
+# Calculates the ppl of pre-trained models.
+# Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
+
+import json
+from typing import Dict
+
+import fire
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
+
+from llmtuner.data import get_dataset
+from llmtuner.extras.constants import IGNORE_INDEX
+from llmtuner.hparams import get_train_args
+from llmtuner.model import load_model, load_tokenizer
+
+
+def cal_ppl(
+    model_name_or_path: str,
+    batch_size: int = 4,
+    stage: str = "sft",
+    dataset: str = "alpaca_en",
+    dataset_dir: str = "data",
+    template: str = "default",
+    cutoff_len: int = 1024,
+    train_on_prompt: bool = False,
+):
+    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
+        dict(
+            stage=stage,
+            model_name_or_path=model_name_or_path,
+            dataset=dataset,
+            dataset_dir=dataset_dir,
+            template=template,
+            cutoff_len=cutoff_len,
+            train_on_prompt=train_on_prompt,
+            output_dir="dummy_dir",
+            overwrite_cache=True,
+        )
+    )
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False)
+    if stage == "pt":
+        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+    elif stage == "sft":
+        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
+    else:
+        raise NotImplementedError
+
+    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
+    criterion = torch.nn.CrossEntropyLoss(reduction="none")
+    perplexities = []
+    batch: Dict[str, "torch.Tensor"]
+    with torch.no_grad():
+        for batch in tqdm(dataloader):
+            batch = batch.to(model.device)
+            outputs = model(**batch)
+            shift_logits: "torch.Tensor" = outputs["logits"][..., :-1, :]
+            shift_labels: "torch.Tensor" = batch["labels"][..., 1:]
+            loss_mask = shift_labels != IGNORE_INDEX
+            flatten_logits = shift_logits.contiguous().view(shift_labels.size(0) * shift_labels.size(1), -1)
+            flatten_labels = shift_labels.contiguous().view(-1)
+            token_logps: "torch.Tensor" = criterion(flatten_logits, flatten_labels)
+            token_logps = token_logps.contiguous().view(shift_logits.size(0), -1)
+            sentence_logps = (token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+            perplexities.extend(sentence_logps.exp().tolist())
+
+    with open("ppl.json", "w", encoding="utf-8") as f:
+        json.dump(perplexities, f, indent=2)
+
+    print("Perplexities have been saved at ppl.json.")
+
+
+if __name__ == "__main__":
+    fire.Fire(cal_ppl)
diff --git a/scripts/length_cdf.py b/scripts/length_cdf.py
index 1446f77a..da41a942 100644
--- a/scripts/length_cdf.py
+++ b/scripts/length_cdf.py
@@ -3,7 +3,6 @@
 # Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en --template default
 
 from collections import defaultdict
-from typing import Optional
 
 import fire
 from tqdm import tqdm
@@ -15,10 +14,10 @@ from llmtuner.model import load_tokenizer
 
 def length_cdf(
     model_name_or_path: str,
-    dataset: Optional[str] = "alpaca_en",
-    dataset_dir: Optional[str] = "data",
-    template: Optional[str] = "default",
-    interval: Optional[int] = 1000,
+    dataset: str = "alpaca_en",
+    dataset_dir: str = "data",
+    template: str = "default",
+    interval: int = 1000,
 ):
     model_args, data_args, training_args, _, _ = get_train_args(
         dict(

From 68ed89f351568ea5c13db47ff0d7e25f13afc9fb Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 22:13:14 +0800
Subject: [PATCH 219/341] update ppl script

Former-commit-id: 07606fa4ab303f088170a569c1f86141a1b496c5
---
 scripts/cal_ppl.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py
index bdfc210b..6c8c6174 100644
--- a/scripts/cal_ppl.py
+++ b/scripts/cal_ppl.py
@@ -1,6 +1,6 @@
 # coding=utf-8
-# Calculates the ppl of pre-trained models.
-# Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
+# Calculates the ppl on the dataset of the pre-trained models.
+# Usage: python cal_ppl.py --model_name_or_path path_to_model --save_name ppl.json
 
 import json
 from typing import Dict
@@ -19,6 +19,7 @@ from llmtuner.model import load_model, load_tokenizer
 
 def cal_ppl(
     model_name_or_path: str,
+    save_name: str,
     batch_size: int = 4,
     stage: str = "sft",
     dataset: str = "alpaca_en",
@@ -69,10 +70,10 @@ def cal_ppl(
             sentence_logps = (token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
             perplexities.extend(sentence_logps.exp().tolist())
 
-    with open("ppl.json", "w", encoding="utf-8") as f:
+    with open(save_name, "w", encoding="utf-8") as f:
         json.dump(perplexities, f, indent=2)
 
-    print("Perplexities have been saved at ppl.json.")
+    print("Perplexities have been saved at {}.".format(save_name))
 
 
 if __name__ == "__main__":

From 9b187b274c39aa5cea47fd99fcb9d6919c530309 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 22:35:31 +0800
Subject: [PATCH 220/341] add avg ppl

Former-commit-id: 40caeb6f0fdf76a1e2c9ca3761299d087fc643e0
---
 scripts/cal_ppl.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py
index 6c8c6174..06c2a43b 100644
--- a/scripts/cal_ppl.py
+++ b/scripts/cal_ppl.py
@@ -54,6 +54,7 @@ def cal_ppl(
 
     dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
     criterion = torch.nn.CrossEntropyLoss(reduction="none")
+    total_ppl = 0
     perplexities = []
     batch: Dict[str, "torch.Tensor"]
     with torch.no_grad():
@@ -68,11 +69,13 @@ def cal_ppl(
             token_logps: "torch.Tensor" = criterion(flatten_logits, flatten_labels)
             token_logps = token_logps.contiguous().view(shift_logits.size(0), -1)
             sentence_logps = (token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+            total_ppl += sentence_logps.exp().sum().item()
             perplexities.extend(sentence_logps.exp().tolist())
 
     with open(save_name, "w", encoding="utf-8") as f:
         json.dump(perplexities, f, indent=2)
 
+    print("Average perplexity is {:.2f}".format(total_ppl / len(perplexities)))
     print("Perplexities have been saved at {}.".format(save_name))
 
 
From f9aa74715aa0876cedf0a6825cdc2f6de9e74de3 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 4 May 2024 23:05:17 +0800
Subject: [PATCH 221/341] update scripts

Former-commit-id: 1c07648c4bb4bb0c46bc0240547b46bd2835dce1
---
 scripts/cal_lr.py  |  3 ++-
 scripts/cal_ppl.py | 35 +++++++++++++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py
index 7bf8839d..dd864162 100644
--- a/scripts/cal_lr.py
+++ b/scripts/cal_lr.py
@@ -4,6 +4,7 @@
 # Inspired by: https://github.com/imoneoi/openchat/blob/master/ochat/training_deepspeed/train.py
 
 import math
+from typing import Literal
 
 import fire
 import torch
@@ -24,7 +25,7 @@ BASE_BS = 4_000_000  # from llama paper
 def calculate_lr(
     model_name_or_path: str,
     batch_size: int,  # total batch size, namely (batch size * gradient accumulation * world size)
-    stage: str = "sft",
+    stage: Literal["pt", "sft"] = "sft",
     dataset: str = "alpaca_en",
     dataset_dir: str = "data",
     template: str = "default",
diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py
index 06c2a43b..2e74c70a 100644
--- a/scripts/cal_ppl.py
+++ b/scripts/cal_ppl.py
@@ -3,7 +3,8 @@
 # Usage: python cal_ppl.py --model_name_or_path path_to_model --save_name ppl.json
 
 import json
-from typing import Dict
+from dataclasses import dataclass
+from typing import Any, Dict, Literal, Sequence
 
 import fire
 import torch
@@ -17,11 +18,37 @@ from llmtuner.hparams import get_train_args
 from llmtuner.model import load_model, load_tokenizer
 
 
+@dataclass
+class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
+    r"""
+    Data collator for pairwise data.
+    """
+
+    train_on_prompt: bool = False
+
+    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
+        r"""
+        Pads batched data to the longest sequence in the batch.
+
+        We generate 2 * n examples where the first n examples represent chosen examples and
+        the last n examples represent rejected examples.
+        """
+        chosen_features = []
+        for feature in features:
+            prompt_len, answer_len = len(feature["prompt_ids"]), len(feature["chosen_ids"])
+            input_ids = feature["prompt_ids"] + feature["chosen_ids"]
+            attention_mask = [1] * (prompt_len + answer_len)
+            labels = input_ids if self.train_on_prompt else [IGNORE_INDEX] * prompt_len + feature["chosen_ids"]
+            chosen_features.append({"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels})
+
+        return super().__call__(chosen_features)
+
+
 def cal_ppl(
     model_name_or_path: str,
     save_name: str,
     batch_size: int = 4,
-    stage: str = "sft",
+    stage: Literal["pt", "sft", "rm"] = "sft",
     dataset: str = "alpaca_en",
     dataset_dir: str = "data",
     template: str = "default",
@@ -49,6 +76,10 @@ def cal_ppl(
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif stage == "sft":
         data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
+    elif stage == "rm":
+        data_collator = PairwiseDataCollatorWithPadding(
+            tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX, train_on_prompt=train_on_prompt
+        )
     else:
         raise NotImplementedError
 

From 7ef3788ff4ebea4df3ef4424e65ac0f6e51fe77a Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 5 May 2024 00:17:54 +0800
Subject: [PATCH 222/341] update webui

Former-commit-id: 17a53d25cdadd2df70a8afa0488f75bbf1918b89
---
 scripts/cal_ppl.py                       |  4 ++-
 src/llmtuner/webui/components/chatbot.py |  6 ++--
 src/llmtuner/webui/components/eval.py    | 10 +++---
 src/llmtuner/webui/components/export.py  |  2 +-
 src/llmtuner/webui/components/train.py   | 42 ++++++++++++------------
 5 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py
index 2e74c70a..9eebc57d 100644
--- a/scripts/cal_ppl.py
+++ b/scripts/cal_ppl.py
@@ -4,7 +4,7 @@
 
 import json
 from dataclasses import dataclass
-from typing import Any, Dict, Literal, Sequence
+from typing import Any, Dict, Literal, Optional, Sequence
 
 import fire
 import torch
@@ -53,6 +53,7 @@ def cal_ppl(
     dataset_dir: str = "data",
     template: str = "default",
     cutoff_len: int = 1024,
+    max_samples: Optional[int] = None,
     train_on_prompt: bool = False,
 ):
     model_args, data_args, training_args, finetuning_args, _ = get_train_args(
@@ -63,6 +64,7 @@ def cal_ppl(
             dataset_dir=dataset_dir,
             template=template,
             cutoff_len=cutoff_len,
+            max_samples=max_samples,
             train_on_prompt=train_on_prompt,
             output_dir="dummy_dir",
             overwrite_cache=True,
diff --git a/src/llmtuner/webui/components/chatbot.py b/src/llmtuner/webui/components/chatbot.py
index 0a55460c..f83694b1 100644
--- a/src/llmtuner/webui/components/chatbot.py
+++ b/src/llmtuner/webui/components/chatbot.py
@@ -36,9 +36,9 @@ def create_chat_box(
                 submit_btn = gr.Button(variant="primary")
 
             with gr.Column(scale=1):
-                max_new_tokens = gr.Slider(8, 4096, value=512, step=1)
-                top_p = gr.Slider(0.01, 1.0, value=0.7, step=0.01)
-                temperature = gr.Slider(0.01, 1.5, value=0.95, step=0.01)
+                max_new_tokens = gr.Slider(minimum=8, maximum=4096, value=512, step=1)
+                top_p = gr.Slider(minimum=0.01, maximum=1.0, value=0.7, step=0.01)
+                temperature = gr.Slider(minimum=0.01, maximum=1.5, value=0.95, step=0.01)
                 clear_btn = gr.Button()
 
     tools.input(check_json_schema, inputs=[tools, engine.manager.get_elem_by_id("top.lang")])
diff --git a/src/llmtuner/webui/components/eval.py b/src/llmtuner/webui/components/eval.py
index 60e22bb7..8b70283b 100644
--- a/src/llmtuner/webui/components/eval.py
+++ b/src/llmtuner/webui/components/eval.py
@@ -28,18 +28,18 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]:
     elem_dict.update(dict(dataset_dir=dataset_dir, dataset=dataset, **preview_elems))
 
     with gr.Row():
-        cutoff_len = gr.Slider(value=1024, minimum=4, maximum=65536, step=1)
+        cutoff_len = gr.Slider(minimum=4, maximum=65536, value=1024, step=1)
         max_samples = gr.Textbox(value="100000")
-        batch_size = gr.Slider(value=2, minimum=1, maximum=1024, step=1)
+        batch_size = gr.Slider(minimum=1, maximum=1024, value=2, step=1)
         predict = gr.Checkbox(value=True)
 
     input_elems.update({cutoff_len, max_samples, batch_size, predict})
     elem_dict.update(dict(cutoff_len=cutoff_len, max_samples=max_samples, batch_size=batch_size, predict=predict))
 
     with gr.Row():
-        max_new_tokens = gr.Slider(10, 2048, value=128, step=1)
-        top_p = gr.Slider(0.01, 1, value=0.7, step=0.01)
-        temperature = gr.Slider(0.01, 1.5, value=0.95, step=0.01)
+        max_new_tokens = gr.Slider(minimum=8, maximum=4096, value=512, step=1)
+        top_p = gr.Slider(minimum=0.01, maximum=1, value=0.7, step=0.01)
+        temperature = gr.Slider(minimum=0.01, maximum=1.5, value=0.95, step=0.01)
         output_dir = gr.Textbox()
 
     input_elems.update({max_new_tokens, top_p, temperature, output_dir})
diff --git a/src/llmtuner/webui/components/export.py b/src/llmtuner/webui/components/export.py
index 64273882..134b77e0 100644
--- a/src/llmtuner/webui/components/export.py
+++ b/src/llmtuner/webui/components/export.py
@@ -85,7 +85,7 @@ def save_model(
 
 def create_export_tab(engine: "Engine") -> Dict[str, "Component"]:
     with gr.Row():
-        export_size = gr.Slider(value=1, minimum=1, maximum=100, step=1)
+        export_size = gr.Slider(minimum=1, maximum=100, value=1, step=1)
         export_quantization_bit = gr.Dropdown(choices=["none", "8", "4", "3", "2"], value="none")
         export_quantization_dataset = gr.Textbox(value="data/c4_demo.json")
         export_device = gr.Radio(choices=["cpu", "cuda"], value="cpu")
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 857c56ac..5cde660c 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -52,10 +52,10 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
     )
 
     with gr.Row():
-        cutoff_len = gr.Slider(value=1024, minimum=4, maximum=65536, step=1)
-        batch_size = gr.Slider(value=2, minimum=1, maximum=1024, step=1)
-        gradient_accumulation_steps = gr.Slider(value=8, minimum=1, maximum=1024, step=1)
-        val_size = gr.Slider(value=0, minimum=0, maximum=1, step=0.001)
+        cutoff_len = gr.Slider(minimum=4, maximum=65536, value=1024, step=1)
+        batch_size = gr.Slider(minimum=1, maximum=1024, value=2, step=1)
+        gradient_accumulation_steps = gr.Slider(minimum=1, maximum=1024, value=8, step=1)
+        val_size = gr.Slider(minimum=0, maximum=1, value=0, step=0.001)
         lr_scheduler_type = gr.Dropdown(choices=[scheduler.value for scheduler in SchedulerType], value="cosine")
 
     input_elems.update({cutoff_len, batch_size, gradient_accumulation_steps, val_size, lr_scheduler_type})
@@ -71,10 +71,10 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as extra_tab:
         with gr.Row():
-            logging_steps = gr.Slider(value=5, minimum=5, maximum=1000, step=5)
-            save_steps = gr.Slider(value=100, minimum=10, maximum=5000, step=10)
-            warmup_steps = gr.Slider(value=0, minimum=0, maximum=5000, step=1)
-            neftune_alpha = gr.Slider(value=0, minimum=0, maximum=10, step=0.1)
+            logging_steps = gr.Slider(minimum=1, maximum=1000, value=5, step=5)
+            save_steps = gr.Slider(minimum=10, maximum=5000, value=100, step=10)
+            warmup_steps = gr.Slider(minimum=0, maximum=5000, value=0, step=1)
+            neftune_alpha = gr.Slider(minimum=0, maximum=10, value=0, step=0.1)
             optim = gr.Textbox(value="adamw_torch")
 
         with gr.Row():
@@ -124,7 +124,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as freeze_tab:
         with gr.Row():
-            num_layer_trainable = gr.Slider(value=3, minimum=1, maximum=128, step=1)
+            num_layer_trainable = gr.Slider(minimum=1, maximum=128, value=2, step=1)
             name_module_trainable = gr.Textbox(value="all")
 
     input_elems.update({num_layer_trainable, name_module_trainable})
@@ -136,10 +136,10 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as lora_tab:
         with gr.Row():
-            lora_rank = gr.Slider(value=8, minimum=1, maximum=1024, step=1)
-            lora_alpha = gr.Slider(value=16, minimum=1, maximum=2048, step=1)
-            lora_dropout = gr.Slider(value=0, minimum=0, maximum=1, step=0.01)
-            loraplus_lr_ratio = gr.Slider(value=0, minimum=0, maximum=64, step=0.01)
+            lora_rank = gr.Slider(minimum=1, maximum=1024, value=8, step=1)
+            lora_alpha = gr.Slider(minimum=1, maximum=2048, value=16, step=1)
+            lora_dropout = gr.Slider(minimum=0, maximum=1, value=0, step=0.01)
+            loraplus_lr_ratio = gr.Slider(minimum=0, maximum=64, value=0, step=0.01)
             create_new_adapter = gr.Checkbox()
 
         with gr.Row():
@@ -180,9 +180,9 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as rlhf_tab:
         with gr.Row():
-            dpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01)
-            dpo_ftx = gr.Slider(value=0, minimum=0, maximum=10, step=0.01)
-            orpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01)
+            dpo_beta = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.01)
+            dpo_ftx = gr.Slider(minimum=0, maximum=10, value=0, step=0.01)
+            orpo_beta = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.01)
             reward_model = gr.Dropdown(multiselect=True, allow_custom_value=True)
 
     input_elems.update({dpo_beta, dpo_ftx, orpo_beta, reward_model})
@@ -193,9 +193,9 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
     with gr.Accordion(open=False) as galore_tab:
         with gr.Row():
             use_galore = gr.Checkbox()
-            galore_rank = gr.Slider(value=16, minimum=1, maximum=1024, step=1)
-            galore_update_interval = gr.Slider(value=200, minimum=1, maximum=1024, step=1)
-            galore_scale = gr.Slider(value=0.25, minimum=0, maximum=1, step=0.01)
+            galore_rank = gr.Slider(minimum=1, maximum=1024, value=16, step=1)
+            galore_update_interval = gr.Slider(minimum=1, maximum=1024, value=200, step=1)
+            galore_scale = gr.Slider(minimum=0, maximum=1, value=0.25, step=0.01)
             galore_target = gr.Textbox(value="all")
 
     input_elems.update({use_galore, galore_rank, galore_update_interval, galore_scale, galore_target})
@@ -215,8 +215,8 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             use_badam = gr.Checkbox()
             badam_mode = gr.Dropdown(choices=["layer", "ratio"], value="layer")
             badam_switch_mode = gr.Dropdown(choices=["ascending", "descending", "random", "fixed"], value="ascending")
-            badam_switch_interval = gr.Slider(value=50, minimum=1, maximum=1024, step=1)
-            badam_update_ratio = gr.Slider(value=0.05, minimum=0, maximum=1, step=0.01)
+            badam_switch_interval = gr.Slider(minimum=1, maximum=1024, value=50, step=1)
+            badam_update_ratio = gr.Slider(minimum=0, maximum=1, value=0.05, step=0.01)
 
     input_elems.update({use_badam, badam_mode, badam_switch_mode, badam_switch_interval, badam_update_ratio})
     elem_dict.update(

From 2f5f6722cf138c72e69acc73c966ca9c5c200a3e Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 5 May 2024 00:53:07 +0800
Subject: [PATCH 223/341] fix eval scripts

Former-commit-id: fc3743d0b82c28fbff1170761139e4fa5d2a8939
---
 evaluation/ceval/ceval.py |  14 ++--
 evaluation/cmmlu/cmmlu.py | 134 +++++++++++++++++++-------------------
 evaluation/mmlu/mmlu.py   |  12 +---
 3 files changed, 74 insertions(+), 86 deletions(-)

diff --git a/evaluation/ceval/ceval.py b/evaluation/ceval/ceval.py
index 33005de3..4111d6b4 100644
--- a/evaluation/ceval/ceval.py
+++ b/evaluation/ceval/ceval.py
@@ -19,7 +19,7 @@ import pandas as pd
 
 _CITATION = """\
 @article{huang2023ceval,
-  title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models}, 
+  title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
   author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
   journal={arXiv preprint arXiv:2305.08322},
   year={2023}
@@ -133,25 +133,19 @@ class Ceval(datasets.GeneratorBasedBuilder):
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "test", f"{task_name}_test.csv"
-                    ),
+                    "filepath": os.path.join(data_dir, "test", f"{task_name}_test.csv"),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "val", f"{task_name}_val.csv"
-                    ),
+                    "filepath": os.path.join(data_dir, "val", f"{task_name}_val.csv"),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "dev", f"{task_name}_dev.csv"
-                    ),
+                    "filepath": os.path.join(data_dir, "dev", f"{task_name}_dev.csv"),
                 },
             ),
         ]
diff --git a/evaluation/cmmlu/cmmlu.py b/evaluation/cmmlu/cmmlu.py
index 62096203..37efb328 100644
--- a/evaluation/cmmlu/cmmlu.py
+++ b/evaluation/cmmlu/cmmlu.py
@@ -37,73 +37,73 @@ _LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 Internatio
 _URL = "cmmlu.zip"
 
 task_list = [
-     'agronomy',
-     'anatomy',
-     'ancient_chinese',
-     'arts',
-     'astronomy',
-     'business_ethics',
-     'chinese_civil_service_exam',
-     'chinese_driving_rule',
-     'chinese_food_culture',
-     'chinese_foreign_policy',
-     'chinese_history',
-     'chinese_literature',
-     'chinese_teacher_qualification',
-     'clinical_knowledge',
-     'college_actuarial_science',
-     'college_education',
-     'college_engineering_hydrology',
-     'college_law',
-     'college_mathematics',
-     'college_medical_statistics',
-     'college_medicine',
-     'computer_science',
-     'computer_security',
-     'conceptual_physics',
-     'construction_project_management',
-     'economics',
-     'education',
-     'electrical_engineering',
-     'elementary_chinese',
-     'elementary_commonsense',
-     'elementary_information_and_technology',
-     'elementary_mathematics',
-     'ethnology',
-     'food_science',
-     'genetics',
-     'global_facts',
-     'high_school_biology',
-     'high_school_chemistry',
-     'high_school_geography',
-     'high_school_mathematics',
-     'high_school_physics',
-     'high_school_politics',
-     'human_sexuality',
-     'international_law',
-     'journalism',
-     'jurisprudence',
-     'legal_and_moral_basis',
-     'logical',
-     'machine_learning',
-     'management',
-     'marketing',
-     'marxist_theory',
-     'modern_chinese',
-     'nutrition',
-     'philosophy',
-     'professional_accounting',
-     'professional_law',
-     'professional_medicine',
-     'professional_psychology',
-     'public_relations',
-     'security_study',
-     'sociology',
-     'sports_science',
-     'traditional_chinese_medicine',
-     'virology',
-     'world_history',
-     'world_religions',
+    "agronomy",
+    "anatomy",
+    "ancient_chinese",
+    "arts",
+    "astronomy",
+    "business_ethics",
+    "chinese_civil_service_exam",
+    "chinese_driving_rule",
+    "chinese_food_culture",
+    "chinese_foreign_policy",
+    "chinese_history",
+    "chinese_literature",
+    "chinese_teacher_qualification",
+    "clinical_knowledge",
+    "college_actuarial_science",
+    "college_education",
+    "college_engineering_hydrology",
+    "college_law",
+    "college_mathematics",
+    "college_medical_statistics",
+    "college_medicine",
+    "computer_science",
+    "computer_security",
+    "conceptual_physics",
+    "construction_project_management",
+    "economics",
+    "education",
+    "electrical_engineering",
+    "elementary_chinese",
+    "elementary_commonsense",
+    "elementary_information_and_technology",
+    "elementary_mathematics",
+    "ethnology",
+    "food_science",
+    "genetics",
+    "global_facts",
+    "high_school_biology",
+    "high_school_chemistry",
+    "high_school_geography",
+    "high_school_mathematics",
+    "high_school_physics",
+    "high_school_politics",
+    "human_sexuality",
+    "international_law",
+    "journalism",
+    "jurisprudence",
+    "legal_and_moral_basis",
+    "logical",
+    "machine_learning",
+    "management",
+    "marketing",
+    "marxist_theory",
+    "modern_chinese",
+    "nutrition",
+    "philosophy",
+    "professional_accounting",
+    "professional_law",
+    "professional_medicine",
+    "professional_psychology",
+    "public_relations",
+    "security_study",
+    "sociology",
+    "sports_science",
+    "traditional_chinese_medicine",
+    "virology",
+    "world_history",
+    "world_religions",
 ]
 
 
diff --git a/evaluation/mmlu/mmlu.py b/evaluation/mmlu/mmlu.py
index 9f1bd101..f3218c38 100644
--- a/evaluation/mmlu/mmlu.py
+++ b/evaluation/mmlu/mmlu.py
@@ -136,25 +136,19 @@ class MMLU(datasets.GeneratorBasedBuilder):
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "data", "test", f"{task_name}_test.csv"
-                    ),
+                    "filepath": os.path.join(data_dir, "data", "test", f"{task_name}_test.csv"),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "data", "val", f"{task_name}_val.csv"
-                    ),
+                    "filepath": os.path.join(data_dir, "data", "val", f"{task_name}_val.csv"),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "data", "dev", f"{task_name}_dev.csv"
-                    ),
+                    "filepath": os.path.join(data_dir, "data", "dev", f"{task_name}_dev.csv"),
                 },
             ),
         ]

From 4674f3baa7d456abe8386f2e7fbd5144acc34b28 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 5 May 2024 02:44:35 +0800
Subject: [PATCH 224/341] add version and help to cli

Former-commit-id: f762f2215169b9fe55564d5600b758ddc66f9c9c
---
 src/llmtuner/cli.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/llmtuner/cli.py b/src/llmtuner/cli.py
index 1b5bd658..f2619ab9 100644
--- a/src/llmtuner/cli.py
+++ b/src/llmtuner/cli.py
@@ -1,6 +1,7 @@
 import sys
 from enum import Enum, unique
 
+from . import __version__
 from .api.app import run_api
 from .chat.chat_model import run_chat
 from .eval.evaluator import run_eval
@@ -8,6 +9,19 @@ from .train.tuner import export_model, run_exp
 from .webui.interface import run_web_demo, run_web_ui
 
 
+USAGE = """
+Usage:
+    llamafactory-cli api -h: launch an API server
+    llamafactory-cli chat -h: launch a chat interface in CLI
+    llamafactory-cli eval -h: do evaluation
+    llamafactory-cli export -h: merge LoRA adapters and export model
+    llamafactory-cli train -h: do training
+    llamafactory-cli webchat -h: launch a chat interface in Web UI
+    llamafactory-cli webui: launch LlamaBoard
+    llamafactory-cli version: show version info
+"""
+
+
 @unique
 class Command(str, Enum):
     API = "api"
@@ -17,6 +31,8 @@ class Command(str, Enum):
     TRAIN = "train"
     WEBDEMO = "webchat"
     WEBUI = "webui"
+    VERSION = "version"
+    HELP = "help"
 
 
 def main():
@@ -35,5 +51,9 @@ def main():
         run_web_demo()
     elif command == Command.WEBUI:
         run_web_ui()
+    elif command == Command.VERSION:
+        print("Welcome to LLaMA Factory, version {}".format(__version__))
+    elif command == Command.HELP:
+        print(USAGE)
     else:
         raise NotImplementedError("Unknown command: {}".format(command))

From d0597897bf88a123a2fb833368eca1a027d7b70a Mon Sep 17 00:00:00 2001
From: Oscar <pha123661@gmail.com>
Date: Sun, 5 May 2024 23:35:19 +0800
Subject: [PATCH 225/341] Fix badam example outdated argument

Former-commit-id: 29aa188cc774cb72367f706f1cd4c07bc5a9f241
---
 examples/extras/badam/sft.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
index 61167dad..4bcfe9d2 100644
--- a/examples/extras/badam/sft.sh
+++ b/examples/extras/badam/sft.sh
@@ -10,7 +10,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --finetuning_type full \
     --use_badam \
     --badam_switch_mode descending \
-    --badam_switch_block_every 50 \
+    --badam_switch_interval 50 \
     --badam_verbose 2 \
     --output_dir ../../../saves/LLaMA2-7B/badam/sft \
     --overwrite_cache \

From 45becd2a45c9f2f34d4c0c1d71bf06ed70edab15 Mon Sep 17 00:00:00 2001
From: zhaonx96 <953608703@qq.com>
Date: Mon, 6 May 2024 10:10:00 +0800
Subject: [PATCH 226/341] =?UTF-8?q?=E2=80=9Dadd=20stop=20parameter=20in=20?=
 =?UTF-8?q?chat.py=E2=80=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Former-commit-id: e529bf5bc14c72558d26f73c42076eaa9684205c
---
 src/llmtuner/api/chat.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py
index fa2f0d03..972ee906 100644
--- a/src/llmtuner/api/chat.py
+++ b/src/llmtuner/api/chat.py
@@ -103,6 +103,7 @@ async def create_chat_completion_response(
         top_p=request.top_p,
         max_new_tokens=request.max_tokens,
         num_return_sequences=request.n,
+        stop=request.stop
     )
 
     prompt_length, response_length = 0, 0
@@ -155,6 +156,7 @@ async def create_stream_chat_completion_response(
         temperature=request.temperature,
         top_p=request.top_p,
         max_new_tokens=request.max_tokens,
+        stop=request.stop
     ):
         if len(new_token) != 0:
             yield _create_stream_chat_completion_chunk(

From 3d1b0e1864cf6a149cc33ef073ac12d3d9335137 Mon Sep 17 00:00:00 2001
From: zhouwei <363232733@qq.com>
Date: Mon, 6 May 2024 13:29:59 +0800
Subject: [PATCH 227/341] The training efficiency of the Ascend 910A has been
 significantly enhanced, leveraging the full computational power of the NPU
 (Neural Processing Unit) and the capabilities of torch_npu, a PyTorch library
 optimized for NPUs. This improvement has resulted in a remarkable tenfold
 increase in efficiency.

Former-commit-id: 90980b626d3408b3e2ee32a02456c20881318be7
---
 src/train.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/train.py b/src/train.py
index 6a3212cb..e2609b66 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,3 +1,7 @@
+import os
+import torch
+import torch_npu
+from torch_npu.contrib import transfer_to_npu
 from llmtuner.train.tuner import run_exp
 
 
@@ -11,4 +15,6 @@ def _mp_fn(index):
 
 
 if __name__ == "__main__":
+    use_jit_compile = os.getenv('JIT_COMPILE', 'False').lower() in ['true', '1']
+    torch.npu.set_compile_mode(jit_compile=use_jit_compile)
     main()

From 5c9da798b543dd613eeffa16af58e39227cf617f Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 6 May 2024 21:47:00 +0800
Subject: [PATCH 228/341] update docs

Former-commit-id: a4a2e94241bea6f96590f6cb8ca8b5cddee1917e
---
 README.md                                     | 69 +++++++++---------
 README_zh.md                                  | 71 ++++++++++---------
 examples/README.md                            |  9 ++-
 examples/extras/badam/sft.sh                  |  2 +-
 examples/inference/api_demo.sh                |  7 --
 examples/inference/cli_demo.sh                |  7 --
 examples/inference/evaluate.sh                | 12 ----
 examples/inference/llama3.yaml                |  2 +
 examples/inference/llama3_lora_sft.yaml       |  4 ++
 examples/inference/llama3_vllm.yaml           |  4 ++
 examples/inference/web_demo.sh                |  8 ---
 examples/lora_single_gpu/dpo.sh               | 35 ---------
 examples/lora_single_gpu/llama3_lora_dpo.yaml | 39 ++++++++++
 .../lora_single_gpu/llama3_lora_eval.yaml     | 19 +++++
 .../lora_single_gpu/llama3_lora_orpo.yaml     | 38 ++++++++++
 examples/lora_single_gpu/llama3_lora_ppo.yaml | 38 ++++++++++
 .../lora_single_gpu/llama3_lora_predict.yaml  | 24 +++++++
 .../lora_single_gpu/llama3_lora_pretrain.yaml | 37 ++++++++++
 .../lora_single_gpu/llama3_lora_reward.yaml   | 38 ++++++++++
 examples/lora_single_gpu/llama3_lora_sft.yaml | 38 ++++++++++
 .../lora_single_gpu/llama3_preprocess.yaml    | 22 ++++++
 .../lora_single_gpu/llava1_5_lora_sft.yaml    | 39 ++++++++++
 examples/lora_single_gpu/orpo.sh              | 32 ---------
 examples/lora_single_gpu/ppo.sh               | 32 ---------
 examples/lora_single_gpu/predict.sh           | 19 -----
 examples/lora_single_gpu/prepare.sh           | 19 -----
 examples/lora_single_gpu/pretrain.sh          | 31 --------
 examples/lora_single_gpu/reward.sh            | 33 ---------
 examples/lora_single_gpu/sft.sh               | 32 ---------
 examples/lora_single_gpu/sft_mllm.sh          | 33 ---------
 examples/merge_lora/llama3_gptq.yaml          | 11 +++
 examples/merge_lora/llama3_lora_sft.yaml      | 13 ++++
 examples/merge_lora/merge.sh                  | 12 ----
 examples/merge_lora/quantize.sh               | 11 ---
 examples/qlora_single_gpu/aqlm.sh             | 30 --------
 examples/qlora_single_gpu/awq.sh              | 30 --------
 examples/qlora_single_gpu/bitsandbytes.sh     | 31 --------
 examples/qlora_single_gpu/gptq.sh             | 30 --------
 .../llama3_lora_sft_aqlm.yaml                 | 27 +++++++
 .../qlora_single_gpu/llama3_lora_sft_awq.yaml |  0
 .../llama3_lora_sft_bitsandbytes.yaml         |  0
 .../llama3_lora_sft_gptq.yaml                 |  0
 setup.py                                      |  6 +-
 src/webui.py                                  |  9 +++
 44 files changed, 487 insertions(+), 516 deletions(-)
 delete mode 100644 examples/inference/api_demo.sh
 delete mode 100644 examples/inference/cli_demo.sh
 delete mode 100644 examples/inference/evaluate.sh
 create mode 100644 examples/inference/llama3.yaml
 create mode 100644 examples/inference/llama3_lora_sft.yaml
 create mode 100644 examples/inference/llama3_vllm.yaml
 delete mode 100644 examples/inference/web_demo.sh
 delete mode 100644 examples/lora_single_gpu/dpo.sh
 create mode 100644 examples/lora_single_gpu/llama3_lora_dpo.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_eval.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_orpo.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_ppo.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_predict.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_pretrain.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_reward.yaml
 create mode 100644 examples/lora_single_gpu/llama3_lora_sft.yaml
 create mode 100644 examples/lora_single_gpu/llama3_preprocess.yaml
 create mode 100644 examples/lora_single_gpu/llava1_5_lora_sft.yaml
 delete mode 100644 examples/lora_single_gpu/orpo.sh
 delete mode 100644 examples/lora_single_gpu/ppo.sh
 delete mode 100644 examples/lora_single_gpu/predict.sh
 delete mode 100644 examples/lora_single_gpu/prepare.sh
 delete mode 100644 examples/lora_single_gpu/pretrain.sh
 delete mode 100644 examples/lora_single_gpu/reward.sh
 delete mode 100644 examples/lora_single_gpu/sft.sh
 delete mode 100644 examples/lora_single_gpu/sft_mllm.sh
 create mode 100644 examples/merge_lora/llama3_gptq.yaml
 create mode 100644 examples/merge_lora/llama3_lora_sft.yaml
 delete mode 100644 examples/merge_lora/merge.sh
 delete mode 100644 examples/merge_lora/quantize.sh
 delete mode 100644 examples/qlora_single_gpu/aqlm.sh
 delete mode 100644 examples/qlora_single_gpu/awq.sh
 delete mode 100644 examples/qlora_single_gpu/bitsandbytes.sh
 delete mode 100644 examples/qlora_single_gpu/gptq.sh
 create mode 100644 examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
 create mode 100644 examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
 create mode 100644 examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
 create mode 100644 examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
 create mode 100644 src/webui.py

diff --git a/README.md b/README.md
index 347ebe7e..d10ef982 100644
--- a/README.md
+++ b/README.md
@@ -276,18 +276,19 @@ huggingface-cli login
 | ------------ | ------- | --------- |
 | python       | 3.8     | 3.10      |
 | torch        | 1.13.1  | 2.2.0     |
-| transformers | 4.37.2  | 4.39.3    |
-| datasets     | 2.14.3  | 2.18.0    |
-| accelerate   | 0.27.2  | 0.28.0    |
+| transformers | 4.37.2  | 4.40.1    |
+| datasets     | 2.14.3  | 2.19.1    |
+| accelerate   | 0.27.2  | 0.30.0    |
 | peft         | 0.9.0   | 0.10.0    |
-| trl          | 0.8.1   | 0.8.1     |
+| trl          | 0.8.1   | 0.8.6     |
 
 | Optional     | Minimum | Recommend |
 | ------------ | ------- | --------- |
 | CUDA         | 11.6    | 12.2      |
 | deepspeed    | 0.10.0  | 0.14.0    |
-| bitsandbytes | 0.39.0  | 0.43.0    |
-| flash-attn   | 2.3.0   | 2.5.6     |
+| bitsandbytes | 0.39.0  | 0.43.1    |
+| vllm         | 0.4.0   | 0.4.2     |
+| flash-attn   | 2.3.0   | 2.5.8     |
 
 ### Hardware Requirement
 
@@ -305,24 +306,15 @@ huggingface-cli login
 
 ## Getting Started
 
-### Data Preparation
-
-Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use datasets on HuggingFace / ModelScope hub or load the dataset in local disk.
-
-> [!NOTE]
-> Please update `data/dataset_info.json` to use your custom dataset.
-
-### Dependence Installation
+### Installation
 
 ```bash
 git clone https://github.com/hiyouga/LLaMA-Factory.git
-conda create -n llama_factory python=3.10
-conda activate llama_factory
 cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-Extra dependencies available: deepspeed, metrics, galore, badam, vllm, bitsandbytes, gptq, awq, aqlm, qwen, modelscope, quality
+Extra dependencies available: metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality
 
 <details><summary>For Windows users</summary>
 
@@ -336,19 +328,41 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 
 </details>
 
-### Train with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
+### Data Preparation
+
+Please refer to [data/README.md](data/README.md) for details about the format of dataset files. You can either use datasets from the HuggingFace / ModelScope hub or load a dataset from local disk.
+
+> [!NOTE]
+> Please update `data/dataset_info.json` to use your custom dataset.
+
+### Quickstart
+
+The following 3 commands conduct LoRA fine-tuning, inference and merging for the Llama3-8B-Instruct model, respectively.
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+See [examples/README.md](examples/README.md) for advanced usage.
+
+> [!TIP]
+> Use `llamafactory-cli help` to show help information.
+
+### Use LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
 
 > [!IMPORTANT]
-> LLaMA Board GUI only supports training on a single GPU, please use [CLI](#train-with-command-line-interface) for distributed training.
+> LLaMA Board GUI only supports training on a single GPU.
 
 #### Use local environment
 
 ```bash
-llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webui
 ```
 
 > [!TIP]
-> To modify the default setting in the LLaMA Board GUI, you can use environment variables, e.g., `export CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False` (use `set` command on Windows OS).
+> To modify the default setting in the LLaMA Board GUI, you can use environment variables, e.g., `export GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False` (use `set` command on Windows OS).
 
 <details><summary>For Alibaba Cloud users</summary>
 
@@ -389,21 +403,10 @@ docker compose -f ./docker-compose.yml up -d
 
 </details>
 
-### Train with Command Line Interface
-
-See [examples/README.md](examples/README.md) for usage.
-
-> [!TIP]
-> Use `llamafactory-cli train -h` to display arguments description.
-
 ### Deploy with OpenAI-style API and vLLM
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api \
-    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
-    --template llama3 \
-    --infer_backend vllm \
-    --vllm_enforce_eager
+CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml
 ```
 
 ### Download from ModelScope Hub
diff --git a/README_zh.md b/README_zh.md
index 8a2fb79b..9c639f2c 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -163,7 +163,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
-> **默认模块**应作为 `--lora_target` 参数的默认值，可使用 `--lora_target all` 参数指定全部模块以得到更好的效果。
+> **默认模块**应作为 `--lora_target` 参数的默认值，可使用 `--lora_target all` 参数指定全部模块以取得更好的效果。
 >
 > 对于所有“基座”（Base）模型，`--template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”（Instruct/Chat）模型请务必使用**对应的模板**。
 >
@@ -276,18 +276,19 @@ huggingface-cli login
 | ------------ | ------- | --------- |
 | python       | 3.8     | 3.10      |
 | torch        | 1.13.1  | 2.2.0     |
-| transformers | 4.37.2  | 4.39.3    |
-| datasets     | 2.14.3  | 2.18.0    |
-| accelerate   | 0.27.2  | 0.28.0    |
+| transformers | 4.37.2  | 4.40.1    |
+| datasets     | 2.14.3  | 2.19.1    |
+| accelerate   | 0.27.2  | 0.30.0    |
 | peft         | 0.9.0   | 0.10.0    |
-| trl          | 0.8.1   | 0.8.1     |
+| trl          | 0.8.1   | 0.8.6     |
 
 | 可选项       | 至少     | 推荐      |
 | ------------ | ------- | --------- |
 | CUDA         | 11.6    | 12.2      |
 | deepspeed    | 0.10.0  | 0.14.0    |
-| bitsandbytes | 0.39.0  | 0.43.0    |
-| flash-attn   | 2.3.0   | 2.5.6     |
+| bitsandbytes | 0.39.0  | 0.43.1    |
+| vllm         | 0.4.0   | 0.4.2     |
+| flash-attn   | 2.3.0   | 2.5.8     |
 
 ### 硬件依赖
 
@@ -305,24 +306,15 @@ huggingface-cli login
 
 ## 如何使用
 
-### 数据准备
-
-关于数据集文件的格式，请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope 上的数据集或加载本地数据集。
-
-> [!NOTE]
-> 使用自定义数据集时，请更新 `data/dataset_info.json` 文件。
-
-### 安装依赖
+### 安装 LLaMA Factory
 
 ```bash
 git clone https://github.com/hiyouga/LLaMA-Factory.git
-conda create -n llama_factory python=3.10
-conda activate llama_factory
 cd LLaMA-Factory
 pip install -e .[metrics]
 ```
 
-可选的额外依赖项：deepspeed、metrics、galore、badam、vllm、bitsandbytes、gptq、awq、aqlm、qwen、modelscope、quality
+可选的额外依赖项：metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality
 
 <details><summary>Windows 用户指南</summary>
 
@@ -336,19 +328,41 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 </details>
 
-### 利用 LLaMA Board 可视化界面训练（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
+### 数据准备
+
+关于数据集文件的格式，请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope 上的数据集或加载本地数据集。
+
+> [!NOTE]
+> 使用自定义数据集时，请更新 `data/dataset_info.json` 文件。
+
+### 快速开始
+
+下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA 微调、推理和合并。
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+高级用法请参考 [examples/README_zh.md](examples/README_zh.md)。
+
+> [!TIP]
+> 使用 `llamafactory-cli help` 显示使用帮助。
+
+### 使用 LLaMA Board 可视化界面（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
 
 > [!IMPORTANT]
-> LLaMA Board 可视化界面目前仅支持单 GPU 训练，请使用[命令行接口](#利用命令行接口训练)来进行多 GPU 分布式训练。
+> LLaMA Board 可视化界面目前仅支持单 GPU 训练。
 
 #### 使用本地环境
 
 ```bash
-llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webui
 ```
 
 > [!TIP]
-> 您可以使用环境变量来修改 LLaMA Board 可视化界面的默认设置，例如 `export CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False`（Windows 系统可使用 `set` 指令）。
+> 您可以使用环境变量来修改 LLaMA Board 可视化界面的默认设置，例如 `export GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False`（Windows 系统可使用 `set` 指令）。
 
 <details><summary>阿里云用户指南</summary>
 
@@ -389,21 +403,10 @@ docker compose -f ./docker-compose.yml up -d
 
 </details>
 
-### 利用命令行接口训练
-
-使用方法请参考 [examples/README_zh.md](examples/README_zh.md)。
-
-> [!TIP]
-> 您可以执行 `llamafactory-cli train -h` 来查看参数文档。
-
 ### 利用 vLLM 部署 OpenAI API
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api \
-    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
-    --template llama3 \
-    --infer_backend vllm \
-    --vllm_enforce_eager
+CUDA_VISIBLE_DEVICES=0,1 API_PORT=8000 llamafactory-cli api examples/inference/llama3_vllm.yaml
 ```
 
 ### 从魔搭社区下载
diff --git a/examples/README.md b/examples/README.md
index 895e9c72..0a14c5bd 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,9 +1,16 @@
 We provide diverse examples about fine-tuning LLMs.
 
+```bash
+export CUDA_VISIBLE_DEVICES=0
+cd examples/lora_single_gpu
+llamafactory-cli train llama3_lora_pretrain.yaml # Do continuous pre-training using LoRA
+
+```
+
 ```
 examples/
 ├── lora_single_gpu/
-│   ├── pretrain.sh: Do continuous pre-training using LoRA
+│   ├── llama3_lora_pretrain.yaml: Do continuous pre-training using LoRA
 │   ├── sft.sh: Do supervised fine-tuning using LoRA
 │   ├── reward.sh: Do reward modeling using LoRA
 │   ├── ppo.sh: Do PPO training using LoRA
diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
index 4bcfe9d2..61167dad 100644
--- a/examples/extras/badam/sft.sh
+++ b/examples/extras/badam/sft.sh
@@ -10,7 +10,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
     --finetuning_type full \
     --use_badam \
     --badam_switch_mode descending \
-    --badam_switch_interval 50 \
+    --badam_switch_block_every 50 \
     --badam_verbose 2 \
     --output_dir ../../../saves/LLaMA2-7B/badam/sft \
     --overwrite_cache \
diff --git a/examples/inference/api_demo.sh b/examples/inference/api_demo.sh
deleted file mode 100644
index 6f0f1b2e..00000000
--- a/examples/inference/api_demo.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 API_PORT=8000 llamafactory-cli api \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template default \
-    --finetuning_type lora
diff --git a/examples/inference/cli_demo.sh b/examples/inference/cli_demo.sh
deleted file mode 100644
index bc762411..00000000
--- a/examples/inference/cli_demo.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template default \
-    --finetuning_type lora
diff --git a/examples/inference/evaluate.sh b/examples/inference/evaluate.sh
deleted file mode 100644
index 5030329d..00000000
--- a/examples/inference/evaluate.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template fewshot \
-    --finetuning_type lora \
-    --task mmlu \
-    --split test \
-    --lang en \
-    --n_shot 5 \
-    --batch_size 4
diff --git a/examples/inference/llama3.yaml b/examples/inference/llama3.yaml
new file mode 100644
index 00000000..ffc5be82
--- /dev/null
+++ b/examples/inference/llama3.yaml
@@ -0,0 +1,2 @@
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
diff --git a/examples/inference/llama3_lora_sft.yaml b/examples/inference/llama3_lora_sft.yaml
new file mode 100644
index 00000000..262f4445
--- /dev/null
+++ b/examples/inference/llama3_lora_sft.yaml
@@ -0,0 +1,4 @@
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+template: llama3
+finetuning_type: lora
diff --git a/examples/inference/llama3_vllm.yaml b/examples/inference/llama3_vllm.yaml
new file mode 100644
index 00000000..8dd3b61a
--- /dev/null
+++ b/examples/inference/llama3_vllm.yaml
@@ -0,0 +1,4 @@
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+infer_backend: vllm
+vllm_enforce_eager: true
diff --git a/examples/inference/web_demo.sh b/examples/inference/web_demo.sh
deleted file mode 100644
index a58cd2a0..00000000
--- a/examples/inference/web_demo.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-# add `--visual_inputs True` to load MLLM
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template default \
-    --finetuning_type lora
diff --git a/examples/lora_single_gpu/dpo.sh b/examples/lora_single_gpu/dpo.sh
deleted file mode 100644
index 2cb6cb01..00000000
--- a/examples/lora_single_gpu/dpo.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage dpo \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --create_new_adapter \
-    --dataset orca_rlhf \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/dpo \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --max_samples 1000 \
-    --val_size 0.1 \
-    --dpo_ftx 1.0 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/lora_single_gpu/llama3_lora_dpo.yaml
new file mode 100644
index 00000000..f71f752d
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: dpo
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+dpo_ftx: 1.0
+
+# dataset
+dataset: orca_rlhf
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/dpo
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.00001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_eval.yaml b/examples/lora_single_gpu/llama3_lora_eval.yaml
new file mode 100644
index 00000000..5808a47a
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_eval.yaml
@@ -0,0 +1,19 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+
+# method
+finetuning_type: lora
+
+# dataset
+task: mmlu
+split: test
+template: fewshot
+lang: en
+n_shot: 5
+
+# output
+save_dir: saves/llama3-8b/lora/eval
+
+# eval
+batch_size: 4
diff --git a/examples/lora_single_gpu/llama3_lora_orpo.yaml b/examples/lora_single_gpu/llama3_lora_orpo.yaml
new file mode 100644
index 00000000..5d78d260
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_orpo.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: orpo
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: orca_rlhf
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/orpo
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.00001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_ppo.yaml b/examples/lora_single_gpu/llama3_lora_ppo.yaml
new file mode 100644
index 00000000..8d78d20d
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_ppo.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+reward_model: saves/llama3-8b/lora/reward
+
+# method
+stage: ppo
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/ppo
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.00001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# generate
+max_new_tokens: 512
+top_k: 0
+top_p: 0.9
diff --git a/examples/lora_single_gpu/llama3_lora_predict.yaml b/examples/lora_single_gpu/llama3_lora_predict.yaml
new file mode 100644
index 00000000..5a9de686
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_predict.yaml
@@ -0,0 +1,24 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+
+# method
+stage: sft
+do_predict: true
+finetuning_type: lora
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 50
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/predict
+overwrite_output_dir: true
+
+# eval
+per_device_eval_batch_size: 1
+predict_with_generate: true
diff --git a/examples/lora_single_gpu/llama3_lora_pretrain.yaml b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
new file mode 100644
index 00000000..64245b71
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
@@ -0,0 +1,37 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: pt
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: c4_demo
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/lora_single_gpu/llama3_lora_reward.yaml
new file mode 100644
index 00000000..f190f4ac
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_reward.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: rm
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: orca_rlhf
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/reward
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.00001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/lora_single_gpu/llama3_lora_sft.yaml
new file mode 100644
index 00000000..f99df305
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/lora_single_gpu/llama3_preprocess.yaml
new file mode 100644
index 00000000..04df9631
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_preprocess.yaml
@@ -0,0 +1,22 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+tokenized_path: saves/llama3-8b/dataset/sft # use `tokenized_path` in config to load data
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+overwrite_output_dir: true
diff --git a/examples/lora_single_gpu/llava1_5_lora_sft.yaml b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
new file mode 100644
index 00000000..96c2701a
--- /dev/null
+++ b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: llava-hf/llava-1.5-7b-hf
+visual_inputs: true
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: mllm_demo
+template: vicuna
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llava1_5-7b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/orpo.sh b/examples/lora_single_gpu/orpo.sh
deleted file mode 100644
index 335707bf..00000000
--- a/examples/lora_single_gpu/orpo.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage orpo \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset orca_rlhf \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/orpo \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --max_samples 1000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/ppo.sh b/examples/lora_single_gpu/ppo.sh
deleted file mode 100644
index 9eccb05e..00000000
--- a/examples/lora_single_gpu/ppo.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage ppo \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --create_new_adapter \
-    --dataset alpaca_gpt4_en \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --reward_model ../../saves/LLaMA2-7B/lora/reward \
-    --output_dir ../../saves/LLaMA2-7B/lora/ppo \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 512 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --max_samples 1000 \
-    --top_k 0 \
-    --top_p 0.9 \
-    --max_new_tokens 256 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/predict.sh b/examples/lora_single_gpu/predict.sh
deleted file mode 100644
index 250efed1..00000000
--- a/examples/lora_single_gpu/predict.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_predict \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft,../../saves/LLaMA2-7B/lora/dpo \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --output_dir ../../saves/LLaMA2-7B/lora/predict \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_eval_batch_size 1 \
-    --max_samples 20 \
-    --predict_with_generate
diff --git a/examples/lora_single_gpu/prepare.sh b/examples/lora_single_gpu/prepare.sh
deleted file mode 100644
index 277f9b7a..00000000
--- a/examples/lora_single_gpu/prepare.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-# use `--tokenized_path` in training script to load data
-
-CUDA_VISIBLE_DEVICES= llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --max_samples 3000 \
-    --tokenized_path ../../saves/datasets/sft
diff --git a/examples/lora_single_gpu/pretrain.sh b/examples/lora_single_gpu/pretrain.sh
deleted file mode 100644
index 0782f00c..00000000
--- a/examples/lora_single_gpu/pretrain.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage pt \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset c4_demo \
-    --dataset_dir ../../data \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/pretrain \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 10000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/reward.sh b/examples/lora_single_gpu/reward.sh
deleted file mode 100644
index 678809fd..00000000
--- a/examples/lora_single_gpu/reward.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage rm \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --create_new_adapter \
-    --dataset orca_rlhf \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/reward \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --learning_rate 1e-5 \
-    --num_train_epochs 1.0 \
-    --max_samples 5000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/sft.sh b/examples/lora_single_gpu/sft.sh
deleted file mode 100644
index 2047e21f..00000000
--- a/examples/lora_single_gpu/sft.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/lora_single_gpu/sft_mllm.sh b/examples/lora_single_gpu/sft_mllm.sh
deleted file mode 100644
index 53e37262..00000000
--- a/examples/lora_single_gpu/sft_mllm.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path llava-hf/llava-1.5-7b-hf \
-    --visual_inputs \
-    --dataset mllm_demo \
-    --dataset_dir ../../data \
-    --template vicuna \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft_mllm \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 100.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/merge_lora/llama3_gptq.yaml b/examples/merge_lora/llama3_gptq.yaml
new file mode 100644
index 00000000..eac12f90
--- /dev/null
+++ b/examples/merge_lora/llama3_gptq.yaml
@@ -0,0 +1,11 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+
+# export
+export_dir: models/llama3_gptq
+export_quantization_bit: 4
+export_quantization_dataset: data/c4_demo.json
+export_size: 2
+export_device: cpu
+export_legacy_format: false
diff --git a/examples/merge_lora/llama3_lora_sft.yaml b/examples/merge_lora/llama3_lora_sft.yaml
new file mode 100644
index 00000000..508a0b8c
--- /dev/null
+++ b/examples/merge_lora/llama3_lora_sft.yaml
@@ -0,0 +1,13 @@
+# Note: DO NOT use a quantized model or quantization_bit when merging LoRA weights
+
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+template: llama3
+finetuning_type: lora
+
+# export
+export_dir: models/llama3_lora_sft
+export_size: 2
+export_device: cpu
+export_legacy_format: false
diff --git a/examples/merge_lora/merge.sh b/examples/merge_lora/merge.sh
deleted file mode 100644
index 186e64a4..00000000
--- a/examples/merge_lora/merge.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-# DO NOT use quantized model or quantization_bit when merging lora weights
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli export \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \
-    --template default \
-    --finetuning_type lora \
-    --export_dir ../../models/llama2-7b-sft \
-    --export_size 2 \
-    --export_device cpu \
-    --export_legacy_format False
diff --git a/examples/merge_lora/quantize.sh b/examples/merge_lora/quantize.sh
deleted file mode 100644
index 4a104645..00000000
--- a/examples/merge_lora/quantize.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-# NEED TO run `merge.sh` before using this script
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli export \
-    --model_name_or_path ../../models/llama2-7b-sft \
-    --template default \
-    --export_dir ../../models/llama2-7b-sft-int4 \
-    --export_quantization_bit 4 \
-    --export_quantization_dataset ../../data/c4_demo.json \
-    --export_size 2 \
-    --export_legacy_format False
diff --git a/examples/qlora_single_gpu/aqlm.sh b/examples/qlora_single_gpu/aqlm.sh
deleted file mode 100644
index 1e0a71ca..00000000
--- a/examples/qlora_single_gpu/aqlm.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/qlora_single_gpu/awq.sh b/examples/qlora_single_gpu/awq.sh
deleted file mode 100644
index c13c8134..00000000
--- a/examples/qlora_single_gpu/awq.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path TheBloke/Llama-2-7B-AWQ \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/qlora_single_gpu/bitsandbytes.sh b/examples/qlora_single_gpu/bitsandbytes.sh
deleted file mode 100644
index 27f48d41..00000000
--- a/examples/qlora_single_gpu/bitsandbytes.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --quantization_bit 4 \
-    --plot_loss \
-    --fp16
diff --git a/examples/qlora_single_gpu/gptq.sh b/examples/qlora_single_gpu/gptq.sh
deleted file mode 100644
index 5b1b80e1..00000000
--- a/examples/qlora_single_gpu/gptq.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path TheBloke/Llama-2-7B-GPTQ \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
new file mode 100644
index 00000000..2bd99740
--- /dev/null
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
@@ -0,0 +1,27 @@
+stage: sft
+do_train: true
+model_name_or_path: BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf
+dataset: alpaca_gpt4_en,glaive_toolcall
+dataset_dir: data
+template: default
+finetuning_type: lora
+lora_target: q_proj,v_proj
+output_dir: ../../saves/LLaMA2-7B/lora/sft
+overwrite_cache: true
+overwrite_output_dir: true
+cutoff_len: 1024
+per_device_train_batch_size: 1
+per_device_eval_batch_size: 1
+gradient_accumulation_steps: 8
+lr_scheduler_type: cosine
+logging_steps: 10
+save_steps: 100
+eval_steps: 100
+evaluation_strategy: steps
+load_best_model_at_end: true
+learning_rate: 5e-5
+num_train_epochs: 3.0
+max_samples: 3000
+val_size: 0.1
+plot_loss: true
+fp16: true
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
new file mode 100644
index 00000000..e69de29b
diff --git a/setup.py b/setup.py
index f7589eb8..7b849942 100644
--- a/setup.py
+++ b/setup.py
@@ -20,12 +20,12 @@ def get_requires():
 
 
 extra_require = {
-    "deepspeed": ["deepspeed>=0.10.0"],
     "metrics": ["nltk", "jieba", "rouge-chinese"],
+    "deepspeed": ["deepspeed>=0.10.0"],
+    "bitsandbytes": ["bitsandbytes>=0.39.0"],
+    "vllm": ["vllm>=0.4.0"],
     "galore": ["galore-torch"],
     "badam": ["badam"],
-    "vllm": ["vllm>=0.4.0"],
-    "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],
     "awq": ["autoawq"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
diff --git a/src/webui.py b/src/webui.py
new file mode 100644
index 00000000..c225c710
--- /dev/null
+++ b/src/webui.py
@@ -0,0 +1,9 @@
+from llmtuner.webui.interface import create_ui
+
+
+def main():
+    create_ui().queue().launch(server_name="0.0.0.0", server_port=None, share=False)
+
+
+if __name__ == "__main__":
+    main()

From 50c71dd29fe735dde7c96d9bea18e86ae15f1384 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 6 May 2024 22:51:02 +0800
Subject: [PATCH 229/341] update example docs

Former-commit-id: 102cd42768d9eb2cf1219309a25b41e26149067e
---
 README.md                                     |   4 +-
 README_zh.md                                  |   6 +-
 examples/README.md                            | 247 ++++++++++++++----
 examples/README_zh.md                         | 246 +++++++++++++----
 examples/extras/badam/llama3_lora_sft.yaml    |  41 +++
 examples/extras/badam/sft.sh                  |  35 ---
 .../extras/fsdp_qlora/llama3_lora_sft.yaml    |  39 +++
 examples/extras/fsdp_qlora/sft.sh             |  41 ---
 examples/extras/fsdp_qlora/single_node.sh     |  10 +
 examples/extras/galore/llama3_full_sft.yaml   |  42 +++
 examples/extras/galore/sft.sh                 |  36 ---
 examples/extras/llama_pro/expand.sh           |   6 +-
 .../extras/llama_pro/llama3_freeze_sft.yaml   |  40 +++
 examples/extras/llama_pro/sft.sh              |  34 ---
 examples/extras/loraplus/llama3_lora_sft.yaml |  39 +++
 examples/extras/loraplus/sft.sh               |  33 ---
 examples/extras/mod/llama3_full_sft.yaml      |  39 +++
 examples/extras/mod/sft.sh                    |  33 ---
 .../full_multi_gpu/llama3_full_predict.yaml   |  23 ++
 examples/full_multi_gpu/llama3_full_sft.yaml  |  41 +++
 examples/full_multi_gpu/multi_node.sh         |  31 +--
 examples/full_multi_gpu/predict.sh            |  19 +-
 examples/full_multi_gpu/single_node.sh        |  32 +--
 examples/lora_multi_gpu/ds_zero3.sh           |  33 +--
 examples/lora_multi_gpu/llama3_lora_sft.yaml  |  41 +++
 .../lora_multi_gpu/llama3_lora_sft_ds.yaml    |  42 +++
 examples/lora_multi_gpu/multi_node.sh         |  34 +--
 examples/lora_multi_gpu/single_node.sh        |  34 +--
 .../lora_single_gpu/llama3_preprocess.yaml    |   2 +-
 .../llama3_lora_sft_aqlm.yaml                 |  49 ++--
 .../qlora_single_gpu/llama3_lora_sft_awq.yaml |  38 +++
 .../llama3_lora_sft_bitsandbytes.yaml         |  42 +++
 .../llama3_lora_sft_gptq.yaml                 |  38 +++
 33 files changed, 962 insertions(+), 508 deletions(-)
 create mode 100644 examples/extras/badam/llama3_lora_sft.yaml
 delete mode 100644 examples/extras/badam/sft.sh
 create mode 100644 examples/extras/fsdp_qlora/llama3_lora_sft.yaml
 delete mode 100644 examples/extras/fsdp_qlora/sft.sh
 create mode 100644 examples/extras/fsdp_qlora/single_node.sh
 create mode 100644 examples/extras/galore/llama3_full_sft.yaml
 delete mode 100644 examples/extras/galore/sft.sh
 create mode 100644 examples/extras/llama_pro/llama3_freeze_sft.yaml
 delete mode 100644 examples/extras/llama_pro/sft.sh
 create mode 100644 examples/extras/loraplus/llama3_lora_sft.yaml
 delete mode 100644 examples/extras/loraplus/sft.sh
 create mode 100644 examples/extras/mod/llama3_full_sft.yaml
 delete mode 100644 examples/extras/mod/sft.sh
 create mode 100644 examples/full_multi_gpu/llama3_full_predict.yaml
 create mode 100644 examples/full_multi_gpu/llama3_full_sft.yaml
 create mode 100644 examples/lora_multi_gpu/llama3_lora_sft.yaml
 create mode 100644 examples/lora_multi_gpu/llama3_lora_sft_ds.yaml

diff --git a/README.md b/README.md
index d10ef982..14a2084d 100644
--- a/README.md
+++ b/README.md
@@ -337,7 +337,7 @@ Please refer to [data/README.md](data/README.md) for checking the details about
 
 ### Quickstart
 
-The following 3 commands conduct LoRA fine-tuning, inference and merging for Llama3-8B-Instruct model, respectively.
+Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** for the Llama3-8B-Instruct model, respectively.
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -345,7 +345,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
 
-See [examples/README.md](examples/README.md) for advanced usage.
+See [examples/README.md](examples/README.md) for advanced usage (including distributed training).
 
 > [!TIP]
 > Use `llamafactory-cli help` to show help information.
diff --git a/README_zh.md b/README_zh.md
index 9c639f2c..daf5f2e8 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -337,7 +337,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 ### 快速开始
 
-下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA 微调、推理和合并。
+下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -345,10 +345,10 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
 
-高级用法请参考 [examples/README_zh.md](examples/README_zh.md)。
+高级用法请参考 [examples/README_zh.md](examples/README_zh.md)（包括多 GPU 微调）。
 
 > [!TIP]
-> 使用 `llamafactory-cli help` 显示使用帮助。
+> 使用 `llamafactory-cli help` 显示帮助信息。
 
 ### 使用 LLaMA Board 可视化界面（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
 
diff --git a/examples/README.md b/examples/README.md
index 0a14c5bd..922f9c7b 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,57 +1,204 @@
 We provide diverse examples about fine-tuning LLMs.
 
+### LoRA Fine-Tuning on a Single GPU
+
+#### (Continuous) Pre-Training
+
 ```bash
-export CUDA_VISIBLE_DEVICES=0
-cd examples/lora_single_gpu
-llamafactory-cli train llama3_lora_pretrain.yaml # Do continuous pre-training using LoRA
-
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml
 ```
 
+#### Supervised Fine-Tuning
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
 ```
-examples/
-├── lora_single_gpu/
-│   ├── `
-│   ├── sft.sh: Do supervised fine-tuning using LoRA
-│   ├── reward.sh: Do reward modeling using LoRA
-│   ├── ppo.sh: Do PPO training using LoRA
-│   ├── dpo.sh: Do DPO training using LoRA
-│   ├── orpo.sh: Do ORPO training using LoRA
-│   ├── sft_mllm.sh: Do supervised fine-tuning on multimodal data using LoRA
-│   ├── prepare.sh: Save tokenized dataset
-│   └── predict.sh: Do batch predict and compute BLEU and ROUGE scores after LoRA tuning
-├── qlora_single_gpu/
-│   ├── bitsandbytes.sh: Fine-tune 4/8-bit BNB models using QLoRA
-│   ├── gptq.sh: Fine-tune 4/8-bit GPTQ models using QLoRA
-│   ├── awq.sh: Fine-tune 4-bit AWQ models using QLoRA
-│   └── aqlm.sh: Fine-tune 2-bit AQLM models using QLoRA
-├── lora_multi_gpu/
-│   ├── single_node.sh: Fine-tune model with Accelerate on single node using LoRA
-│   ├── multi_node.sh: Fine-tune model with Accelerate on multiple nodes using LoRA
-│   └── ds_zero3.sh: Fine-tune model with DeepSpeed ZeRO-3 using LoRA (weight sharding)
-├── full_multi_gpu/
-│   ├── single_node.sh: Full fine-tune model with DeepSpeed on single node
-│   ├── multi_node.sh: Full fine-tune model with DeepSpeed on multiple nodes
-│   └── predict.sh: Do parallel batch predict and compute BLEU and ROUGE scores after full tuning
-├── merge_lora/
-│   ├── merge.sh: Merge LoRA weights into the pre-trained models
-│   └── quantize.sh: Quantize the fine-tuned model with AutoGPTQ
-├── inference/
-│   ├── cli_demo.sh: Chat with fine-tuned model in the CLI with LoRA adapters
-│   ├── api_demo.sh: Chat with fine-tuned model in an OpenAI-style API with LoRA adapters
-│   ├── web_demo.sh: Chat with fine-tuned model in the Web browser with LoRA adapters
-│   └── evaluate.sh: Evaluate model on the MMLU/CMMLU/C-Eval benchmarks with LoRA adapters
-└── extras/
-    ├── galore/
-    │   └── sft.sh: Fine-tune model with GaLore
-    ├── badam/
-    │   └── sft.sh: Fine-tune model with BAdam
-    ├── loraplus/
-    │   └── sft.sh: Fine-tune model using LoRA+
-    ├── mod/
-    │   └── sft.sh: Fine-tune model using Mixture-of-Depths
-    ├── llama_pro/
-    │   ├── expand.sh: Expand layers in the model
-    │   └── sft.sh: Fine-tune the expanded model
-    └── fsdp_qlora/
-        └── sft.sh: Fine-tune quantized model with FSDP+QLoRA
+
+#### Reward Modeling
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml
+```
+
+#### PPO Training
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml
+```
+
+#### DPO Training
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml
+```
+
+#### ORPO Training
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml
+```
+
+#### Multimodal Supervised Fine-Tuning
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
+```
+
+#### Preprocess Dataset
+
+This is useful for large datasets; use `tokenized_path` in the config to load the preprocessed dataset.
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml
+```
+
+#### Evaluating on MMLU/CMMLU/C-Eval Benchmarks
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml
+```
+
+#### Batch Predicting and Computing BLEU and ROUGE Scores
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml
+```
+
+### QLoRA Fine-Tuning on a Single GPU
+
+#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes Quantization (Recommended)
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+```
+
+#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+```
+
+#### Supervised Fine-Tuning with 4-bit AWQ Quantization
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+```
+
+#### Supervised Fine-Tuning with 2-bit AQLM Quantization
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+```
+
+### LoRA Fine-Tuning on Multiple GPUs
+
+#### Supervised Fine-Tuning with Accelerate on Single Node
+
+```bash
+bash examples/lora_multi_gpu/single_node.sh
+```
+
+#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
+
+```bash
+bash examples/lora_multi_gpu/multi_node.sh
+```
+
+#### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
+
+```bash
+bash examples/lora_multi_gpu/ds_zero3.sh
+```
+
+### Full-Parameter Fine-Tuning on Multiple GPUs
+
+#### Supervised Fine-Tuning with DeepSpeed on Single Node
+
+```bash
+bash examples/full_multi_gpu/single_node.sh
+```
+
+#### Supervised Fine-Tuning with DeepSpeed on Multiple Nodes
+
+```bash
+bash examples/full_multi_gpu/multi_node.sh
+```
+
+#### Batch Predicting and Computing BLEU and ROUGE Scores
+
+```bash
+bash examples/full_multi_gpu/predict.sh
+```
+
+### Merging LoRA Adapters and Quantization
+
+#### Merge LoRA Adapters
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+#### Quantizing Model using AutoGPTQ
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+```
+
+### Inferring LoRA Fine-Tuned Models
+
+#### Use CLI
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+```
+
+#### Use Web UI
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+```
+
+#### Launch OpenAI-style API
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+```
+
+### Extras
+
+#### Full-Parameter Fine-Tuning using GaLore
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
+```
+
+#### Full-Parameter Fine-Tuning using BAdam
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
+```
+
+#### LoRA+ Fine-Tuning
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
+```
+
+#### Mixture-of-Depths Fine-Tuning
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
+```
+
+#### LLaMA-Pro Fine-Tuning
+
+```bash
+bash examples/extras/llama_pro/expand.sh
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
+```
+
+#### FSDP+QLoRA Fine-Tuning
+
+```bash
+bash examples/extras/fsdp_qlora/single_node.sh
 ```
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 091a877f..14d72c10 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -1,50 +1,204 @@
 我们提供了多样化的大模型微调示例脚本。
 
+### 单 GPU LoRA 微调
+
+#### （增量）预训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_pretrain.yaml
 ```
-examples/
-├── lora_single_gpu/
-│   ├── pretrain.sh: 基于 LoRA 进行增量预训练
-│   ├── sft.sh: 基于 LoRA 进行指令监督微调
-│   ├── reward.sh: 基于 LoRA 进行奖励模型训练
-│   ├── ppo.sh: 基于 LoRA 进行 PPO 训练
-│   ├── dpo.sh: 基于 LoRA 进行 DPO 训练
-│   ├── orpo.sh: 基于 LoRA 进行 ORPO 训练
-│   ├── sft_mllm.sh: 基于 LoRA 进行多模态指令监督微调
-│   ├── prepare.sh: 保存预处理后的数据集
-│   └── predict.sh: 基于 LoRA 进行批量预测并计算 BLEU 和 ROUGE 分数
-├── qlora_single_gpu/
-│   ├── bitsandbytes.sh: 基于 QLoRA 微调 4/8 比特 BNB 模型
-│   ├── gptq.sh: 基于 QLoRA 微调 4/8 比特 GPTQ 模型
-│   ├── awq.sh: 基于 QLoRA 微调 4 比特 AWQ 模型
-│   └── aqlm.sh: 基于 QLoRA 微调 2 比特 AQLM 模型
-├── lora_multi_gpu/
-│   ├── single_node.sh: 使用 Accelerate 进行单节点 LoRA 训练
-│   ├── multi_node.sh: 使用 Accelerate 进行多节点 LoRA 训练
-│   └── ds_zero3.sh: 使用 DeepSpeed ZeRO-3 进行 LoRA 训练（拆分权重）
-├── full_multi_gpu/
-│   ├── single_node.sh: 使用 DeepSpeed 进行单节点全量训练
-│   ├── multi_node.sh: 使用 DeepSpeed 进行多节点全量训练
-│   └── predict.sh: 基于全量训练进行多卡批量预测并计算 BLEU 和 ROUGE 分数
-├── merge_lora/
-│   ├── merge.sh: 将 LoRA 权重合并到预训练模型中
-│   └── quantize.sh: 使用 AutoGPTQ 量化微调后的模型
-├── inference/
-│   ├── cli_demo.sh: 启动 LoRA 模型的命令行推理接口
-│   ├── api_demo.sh: 启动 LoRA 模型的 OpenAI 风格 API
-│   ├── web_demo.sh: 启动 LoRA 模型的浏览器推理接口
-│   └── evaluate.sh: 在 MMLU/CMMLU/C-Eval 数据集上评测 LoRA 模型
-└── extras/
-    ├── galore/
-    │   └── sft.sh: 使用 GaLore 训练模型
-    ├── badam/
-    │   └── sft.sh: 使用 BAdam 训练模型
-    ├── loraplus/
-    │   └── sft.sh: 使用 LoRA+ 训练模型
-    ├── mod/
-    │   └── sft.sh: 使用深度混合训练模型
-    ├── llama_pro/
-    │   ├── expand.sh: 扩展模型中的层
-    │   └── sft.sh: 训练扩展后的模型
-    └── fsdp_qlora/
-        └── sft.sh: 使用 FSDP+QLoRA 微调量化模型
+
+#### 指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
+```
+
+#### 奖励模型训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_reward.yaml
+```
+
+#### PPO 训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml
+```
+
+#### DPO 训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml
+```
+
+#### ORPO 训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml
+```
+
+#### 多模态指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
+```
+
+#### 预处理数据集
+
+对于大数据集有帮助，在配置中使用 `tokenized_path` 以加载预处理后的数据集。
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_preprocess.yaml
+```
+
+#### 在 MMLU/CMMLU/C-Eval 上评估
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli eval examples/lora_single_gpu/llama3_lora_eval.yaml
+```
+
+#### 批量预测并计算 BLEU 和 ROUGE 分数
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_predict.yaml
+```
+
+### 单 GPU QLoRA 微调
+
+#### 基于 4/8 比特 Bitsandbytes 量化进行指令监督微调（推荐）
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+```
+
+#### 基于 4/8 比特 GPTQ 量化进行指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+```
+
+#### 基于 4 比特 AWQ 量化进行指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+```
+
+#### 基于 2 比特 AQLM 量化进行指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+```
+
+### 多 GPU LoRA 微调
+
+#### 使用 Accelerate 进行单节点训练
+
+```bash
+bash examples/lora_multi_gpu/single_node.sh
+```
+
+#### 使用 Accelerate 进行多节点训练
+
+```bash
+bash examples/lora_multi_gpu/multi_node.sh
+```
+
+#### 使用 DeepSpeed ZeRO-3 平均分配显存
+
+```bash
+bash examples/lora_multi_gpu/ds_zero3.sh
+```
+
+### 多 GPU 全参数微调
+
+#### 使用 DeepSpeed 进行单节点训练
+
+```bash
+bash examples/full_multi_gpu/single_node.sh
+```
+
+#### 使用 DeepSpeed 进行多节点训练
+
+```bash
+bash examples/full_multi_gpu/multi_node.sh
+```
+
+#### 批量预测并计算 BLEU 和 ROUGE 分数
+
+```bash
+bash examples/full_multi_gpu/predict.sh
+```
+
+### 合并 LoRA 适配器与模型量化
+
+#### 合并 LoRA 适配器
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+#### 使用 AutoGPTQ 量化模型
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+```
+
+### 推理 LoRA 模型
+
+#### 使用命令行接口
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+```
+
+#### 使用浏览器界面
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+```
+
+#### 启动 OpenAI 风格 API
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+```
+
+### 杂项
+
+#### 使用 GaLore 进行全参数训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
+```
+
+#### 使用 BAdam 进行全参数训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
+```
+
+#### LoRA+ 微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
+```
+
+#### 深度混合微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
+```
+
+#### LLaMA-Pro 微调
+
+```bash
+bash examples/extras/llama_pro/expand.sh
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
+```
+
+#### FSDP+QLoRA 微调
+
+```bash
+bash examples/extras/fsdp_qlora/single_node.sh
 ```
diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_lora_sft.yaml
new file mode 100644
index 00000000..9f1f1976
--- /dev/null
+++ b/examples/extras/badam/llama3_lora_sft.yaml
@@ -0,0 +1,41 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+use_badam: true
+badam_switch_mode: descending
+badam_switch_interval: 50
+badam_verbose: 2
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh
deleted file mode 100644
index 61167dad..00000000
--- a/examples/extras/badam/sft.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --use_badam \
-    --badam_switch_mode descending \
-    --badam_switch_block_every 50 \
-    --badam_verbose 2 \
-    --output_dir ../../../saves/LLaMA2-7B/badam/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
new file mode 100644
index 00000000..64bf1356
--- /dev/null
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+quantization_bit: 4
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/fsdp_qlora/sft.sh b/examples/extras/fsdp_qlora/sft.sh
deleted file mode 100644
index 9eb70a53..00000000
--- a/examples/extras/fsdp_qlora/sft.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-# DO NOT use GPTQ/AWQ model in FSDP+QLoRA
-
-pip install "transformers>=4.39.1"
-pip install "accelerate>=0.28.0"
-pip install "bitsandbytes>=0.43.0"
-
-CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
-    --config_file ../../accelerate/fsdp_config.yaml \
-    ../../../src/train.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-70b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../../saves/LLaMA2-70B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 4 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --quantization_bit 4 \
-    --plot_loss \
-    --fp16
diff --git a/examples/extras/fsdp_qlora/single_node.sh b/examples/extras/fsdp_qlora/single_node.sh
new file mode 100644
index 00000000..54ec2bd2
--- /dev/null
+++ b/examples/extras/fsdp_qlora/single_node.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# DO NOT use GPTQ/AWQ models with FSDP+QLoRA
+
+pip install "transformers>=4.39.1"
+pip install "accelerate>=0.28.0"
+pip install "bitsandbytes>=0.43.0"
+
+CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
+    --config_file examples/accelerate/fsdp_config.yaml \
+    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
diff --git a/examples/extras/galore/llama3_full_sft.yaml b/examples/extras/galore/llama3_full_sft.yaml
new file mode 100644
index 00000000..5aec8af9
--- /dev/null
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+use_galore: true
+galore_layerwise: true
+galore_target: mlp,self_attn
+galore_rank: 128
+galore_scale: 2.0
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 1
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/galore/sft.sh b/examples/extras/galore/sft.sh
deleted file mode 100644
index 283673e7..00000000
--- a/examples/extras/galore/sft.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --use_galore \
-    --galore_layerwise \
-    --galore_target mlp,self_attn \
-    --galore_rank 128 \
-    --galore_scale 2.0 \
-    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/extras/llama_pro/expand.sh b/examples/extras/llama_pro/expand.sh
index b260902c..e0d41c7b 100644
--- a/examples/extras/llama_pro/expand.sh
+++ b/examples/extras/llama_pro/expand.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-python ../../../scripts/llama_pro.py \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --output_dir ../../../models/llama2-7b-pro \
+python scripts/llama_pro.py \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --output_dir models/llama3-8b-instruct-pro \
     --num_expand 8
diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
new file mode 100644
index 00000000..a54be8b8
--- /dev/null
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -0,0 +1,40 @@
+# model
+model_name_or_path: models/llama3-8b-instruct-pro
+
+# method
+stage: sft
+do_train: true
+finetuning_type: freeze
+name_module_trainable: all
+num_layer_trainable: 8
+use_llama_pro: true
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b-instruct-pro/freeze/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/llama_pro/sft.sh b/examples/extras/llama_pro/sft.sh
deleted file mode 100644
index 3e26e0a6..00000000
--- a/examples/extras/llama_pro/sft.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path ../../../models/llama2-7b-pro \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type freeze \
-    --name_module_trainable all \
-    --num_layer_trainable 8 \
-    --use_llama_pro \
-    --output_dir ../../../saves/LLaMA2-7B-Pro/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml
new file mode 100644
index 00000000..dfb7058b
--- /dev/null
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+loraplus_lr_ratio: 16.0
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/loraplus/sft.sh b/examples/extras/loraplus/sft.sh
deleted file mode 100644
index 8d152d9e..00000000
--- a/examples/extras/loraplus/sft.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --loraplus_lr_ratio 16.0 \
-    --output_dir ../../saves/LLaMA2-7B/loraplus/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml
new file mode 100644
index 00000000..5f80521d
--- /dev/null
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -0,0 +1,39 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+mixture_of_depths: convert
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b-mod/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+optim: paged_adamw_8bit
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+pure_bf16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/mod/sft.sh b/examples/extras/mod/sft.sh
deleted file mode 100644
index 5219751f..00000000
--- a/examples/extras/mod/sft.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --mixture_of_depths convert \
-    --output_dir ../../../saves/LLaMA2-7B/mod/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --optim paged_adamw_8bit \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
diff --git a/examples/full_multi_gpu/llama3_full_predict.yaml b/examples/full_multi_gpu/llama3_full_predict.yaml
new file mode 100644
index 00000000..5b9b680b
--- /dev/null
+++ b/examples/full_multi_gpu/llama3_full_predict.yaml
@@ -0,0 +1,23 @@
+# model
+model_name_or_path: saves/llama3-8b/full/sft
+
+# method
+stage: sft
+do_predict: true
+finetuning_type: full
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 50
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/full/predict
+overwrite_output_dir: true
+
+# eval
+per_device_eval_batch_size: 1
+predict_with_generate: true
diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/full_multi_gpu/llama3_full_sft.yaml
new file mode 100644
index 00000000..ef35e441
--- /dev/null
+++ b/examples/full_multi_gpu/llama3_full_sft.yaml
@@ -0,0 +1,41 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z3_config.json
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index a1ffc0ee..9c2508b6 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -6,33 +6,4 @@ python -m torch.distributed.run \
     --node_rank $RANK \
     --master_addr $MASTER_ADDR \
     --master_port $MASTER_PORT \
-    ../../src/train.py \
-    --deepspeed ../deepspeed/ds_z3_config.json \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type full \
-    --output_dir ../../saves/LLaMA2-7B/full/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/full_multi_gpu/predict.sh b/examples/full_multi_gpu/predict.sh
index 7c2e458f..2445f444 100644
--- a/examples/full_multi_gpu/predict.sh
+++ b/examples/full_multi_gpu/predict.sh
@@ -1,20 +1,5 @@
 #!/bin/bash
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
-    --config_file ../accelerate/single_config.yaml \
-    ../../src/train.py \
-    --stage sft \
-    --do_predict \
-    --model_name_or_path ../../saves/LLaMA2-7B/full/sft \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type full \
-    --output_dir ../../saves/LLaMA2-7B/full/predict \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_eval_batch_size 1 \
-    --max_samples 20 \
-    --predict_with_generate
+    --config_file examples/accelerate/single_config.yaml \
+    src/train.py examples/full_multi_gpu/llama3_full_predict.yaml
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index 73c7662d..f391166a 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,32 +1,4 @@
 #!/bin/bash
 
-deepspeed --num_gpus 4 ../../src/train.py \
-    --deepspeed ../deepspeed/ds_z3_config.json \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type full \
-    --output_dir ../../saves/LLaMA2-7B/full/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+deepspeed --include "localhost:0,1,2,3" \
+    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
index bc74a6de..304f3780 100644
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -1,34 +1,5 @@
 #!/bin/bash
 # ZeRO-3 enables weight sharding on multiple GPUs
 
-deepspeed --num_gpus 4 ../../src/train.py \
-    --deepspeed ../deepspeed/ds_z3_config.json \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+deepspeed --include "localhost:0,1,2,3" \
+    src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml
new file mode 100644
index 00000000..d9690679
--- /dev/null
+++ b/examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -0,0 +1,41 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
new file mode 100644
index 00000000..26955167
--- /dev/null
+++ b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z3_config.json
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_multi_gpu/multi_node.sh b/examples/lora_multi_gpu/multi_node.sh
index a58cac20..401fac5f 100644
--- a/examples/lora_multi_gpu/multi_node.sh
+++ b/examples/lora_multi_gpu/multi_node.sh
@@ -2,35 +2,5 @@
 # also launch it on slave machine using slave_config.yaml
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
-    --config_file ../accelerate/master_config.yaml \
-    ../../src/train.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+    --config_file examples/accelerate/master_config.yaml \
+    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh
index c0719c04..885a0e8c 100644
--- a/examples/lora_multi_gpu/single_node.sh
+++ b/examples/lora_multi_gpu/single_node.sh
@@ -1,35 +1,5 @@
 #!/bin/bash
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
-    --config_file ../accelerate/single_config.yaml \
-    ../../src/train.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../data \
-    --template default \
-    --finetuning_type lora \
-    --lora_target q_proj,v_proj \
-    --output_dir ../../saves/LLaMA2-7B/lora/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --ddp_timeout 180000000 \
-    --plot_loss \
-    --fp16
+    --config_file examples/accelerate/single_config.yaml \
+    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/lora_single_gpu/llama3_preprocess.yaml
index 04df9631..0b3dc599 100644
--- a/examples/lora_single_gpu/llama3_preprocess.yaml
+++ b/examples/lora_single_gpu/llama3_preprocess.yaml
@@ -15,7 +15,7 @@ max_samples: 1000
 val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
-tokenized_path: saves/llama3-8b/dataset/sft # use `tokenized_path` in config to load data
+tokenized_path: saves/llama3-8b/dataset/sft
 
 # output
 output_dir: saves/llama3-8b/lora/sft
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
index 2bd99740..11f1d277 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
@@ -1,27 +1,38 @@
+# model
+model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16
+
+# method
 stage: sft
 do_train: true
-model_name_or_path: BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf
-dataset: alpaca_gpt4_en,glaive_toolcall
-dataset_dir: data
-template: default
 finetuning_type: lora
 lora_target: q_proj,v_proj
-output_dir: ../../saves/LLaMA2-7B/lora/sft
-overwrite_cache: true
-overwrite_output_dir: true
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
 cutoff_len: 1024
-per_device_train_batch_size: 1
-per_device_eval_batch_size: 1
-gradient_accumulation_steps: 8
-lr_scheduler_type: cosine
-logging_steps: 10
-save_steps: 100
-eval_steps: 100
-evaluation_strategy: steps
-load_best_model_at_end: true
-learning_rate: 5e-5
-num_train_epochs: 3.0
-max_samples: 3000
+max_samples: 1000
 val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
 plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
 fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
index e69de29b..4b070d45 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
index e69de29b..7bc31bde 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+quantization_bit: 4
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
index e69de29b..2f8cfe45 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
@@ -0,0 +1,38 @@
+# model
+model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+val_size: 0.1
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500

From 6aec4469406654d6ab54d8fc471e9804438dd0ff Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 6 May 2024 23:07:55 +0800
Subject: [PATCH 230/341] update examples

Former-commit-id: cca50b627c85e0a777717d609377406cc7fd579f
---
 examples/README.md    | 14 ++++++++++++++
 examples/README_zh.md | 14 ++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index 922f9c7b..ba993b99 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,5 +1,19 @@
 We provide diverse examples about fine-tuning LLMs.
 
+Make sure to execute these commands in the `LLaMA-Factory` directory.
+
+## Table of Contents
+
+- [LoRA Fine-Tuning on A Single GPU](#lora-fine-tuning-on-a-single-gpu)
+- [QLoRA Fine-Tuning on a Single GPU](#qlora-fine-tuning-on-a-single-gpu)
+- [LoRA Fine-Tuning on Multiple GPUs](#lora-fine-tuning-on-multiple-gpus)
+- [Full-Parameter Fine-Tuning on Multiple GPUs](#full-parameter-fine-tuning-on-multiple-gpus)
+- [Merging LoRA Adapters and Quantization](#merging-lora-adapters-and-quantization)
+- [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models)
+- [Extras](#extras)
+
+## Examples
+
 ### LoRA Fine-Tuning on A Single GPU
 
 #### (Continuous) Pre-Training
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 14d72c10..491ec688 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -1,5 +1,19 @@
 我们提供了多样化的大模型微调示例脚本。
 
+请确保在 `LLaMA-Factory` 目录下执行下述命令。
+
+## 目录
+
+- [单 GPU LoRA 微调](#单-gpu-lora-微调)
+- [单 GPU QLoRA 微调](#单-gpu-qlora-微调)
+- [多 GPU LoRA 微调](#多-gpu-lora-微调)
+- [多 GPU 全参数微调](#多-gpu-全参数微调)
+- [合并 LoRA 适配器与模型量化](#合并-lora-适配器与模型量化)
+- [推理 LoRA 模型](#推理-lora-模型)
+- [杂项](#杂项)
+
+## 示例
+
 ### 单 GPU LoRA 微调
 
 #### （增量）预训练

From da2295f8c862cb3f5445ffa4aa24450d267b19c7 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 6 May 2024 23:33:06 +0800
Subject: [PATCH 231/341] fix gradio args

Former-commit-id: 7767c1ad4b2b638b558f941ba1f0d05d4a049507
---
 src/llmtuner/webui/interface.py | 12 ++++++++++--
 src/webui.py                    |  7 ++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index b293db90..969ce6bd 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -1,3 +1,5 @@
+import os
+
 from ..extras.packages import is_gradio_available
 from .common import save_config
 from .components import (
@@ -69,8 +71,14 @@ def create_web_demo() -> gr.Blocks:
 
 
 def run_web_ui() -> None:
-    create_ui().queue().launch()
+    server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
+    server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
+    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
+    create_ui().queue().launch(share=gradio_share, server_name=server_name, server_port=server_port)
 
 
 def run_web_demo() -> None:
-    create_web_demo().queue().launch()
+    server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
+    server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
+    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
+    create_web_demo().queue().launch(share=gradio_share, server_name=server_name, server_port=server_port)
diff --git a/src/webui.py b/src/webui.py
index c225c710..b9385259 100644
--- a/src/webui.py
+++ b/src/webui.py
@@ -1,8 +1,13 @@
+import os
+
 from llmtuner.webui.interface import create_ui
 
 
 def main():
-    create_ui().queue().launch(server_name="0.0.0.0", server_port=None, share=False)
+    server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
+    server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
+    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
+    create_ui().queue().launch(share=gradio_share, server_name=server_name, server_port=server_port)
 
 
 if __name__ == "__main__":

From 196068fa19779a70acbe4d0604fcf521797b1ea3 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 6 May 2024 23:34:59 +0800
Subject: [PATCH 232/341] update readme

Former-commit-id: 1c67708291195825e8356d5862d22cbee9566233
---
 README.md    | 5 +----
 README_zh.md | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 14a2084d..38e68ba8 100644
--- a/README.md
+++ b/README.md
@@ -358,12 +358,9 @@ See [examples/README.md](examples/README.md) for advanced usage (including distr
 #### Use local environment
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui
 ```
 
-> [!TIP]
-> To modify the default setting in the LLaMA Board GUI, you can use environment variables, e.g., `export GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False` (use `set` command on Windows OS).
-
 <details><summary>For Alibaba Cloud users</summary>
 
 If you encountered display problems in LLaMA Board on Alibaba Cloud, try using the following command to set environment variables before starting LLaMA Board:
diff --git a/README_zh.md b/README_zh.md
index daf5f2e8..826bea47 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -358,12 +358,9 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_s
 #### 使用本地环境
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui
 ```
 
-> [!TIP]
-> 您可以使用环境变量来修改 LLaMA Board 可视化界面的默认设置，例如 `export GRADIO_SERVER_NAME=0.0.0.0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=False`（Windows 系统可使用 `set` 指令）。
-
 <details><summary>阿里云用户指南</summary>
 
 如果您在阿里云上使用 LLaMA Board 时遇到显示问题，请尝试在启动前使用以下命令设置环境变量：

From 8e4ab2f7d0979a611dda7eb305a7cfe71f4dfb88 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 00:27:56 +0800
Subject: [PATCH 233/341] Update generating_args.py

Former-commit-id: 7a9fb56786f4c40856211009656a983be1e42cb7
---
 src/llmtuner/hparams/generating_args.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/llmtuner/hparams/generating_args.py b/src/llmtuner/hparams/generating_args.py
index 03e760e7..e3e196e9 100644
--- a/src/llmtuner/hparams/generating_args.py
+++ b/src/llmtuner/hparams/generating_args.py
@@ -1,5 +1,5 @@
 from dataclasses import asdict, dataclass, field
-from typing import Any, Dict, Union, Optional, List
+from typing import Any, Dict
 
 
 @dataclass
@@ -46,10 +46,6 @@ class GeneratingArguments:
         default=1.0,
         metadata={"help": "Exponential penalty to the length that is used with beam-based generation."},
     )
-    stop: Union[Optional[str], List[str]] = field(
-        default=None,
-        metadata={"help": "List of strings or string that stop the generation when they are generated. The returned output will not contain the stop strings."},
-    )
     def to_dict(self) -> Dict[str, Any]:
         args = asdict(self)
         if args.get("max_new_tokens", -1) > 0:

From 14316f65831186e62ebd5a9af11adf9acf05e1f2 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 00:28:16 +0800
Subject: [PATCH 234/341] Update generating_args.py

Former-commit-id: 714957ba0159919a89fc1659a7a7b4b6bd82eead
---
 src/llmtuner/hparams/generating_args.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llmtuner/hparams/generating_args.py b/src/llmtuner/hparams/generating_args.py
index e3e196e9..e792c003 100644
--- a/src/llmtuner/hparams/generating_args.py
+++ b/src/llmtuner/hparams/generating_args.py
@@ -46,6 +46,7 @@ class GeneratingArguments:
         default=1.0,
         metadata={"help": "Exponential penalty to the length that is used with beam-based generation."},
     )
+
     def to_dict(self) -> Dict[str, Any]:
         args = asdict(self)
         if args.get("max_new_tokens", -1) > 0:

From 1ebd1e50e742f5cbeff912a93ff1bf6d25d42a06 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 00:37:05 +0800
Subject: [PATCH 235/341] Update vllm_engine.py

Former-commit-id: fa2410de07150a82082ab5b88baf56aa891db870
---
 src/llmtuner/chat/vllm_engine.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index 9863d635..d50e41aa 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -106,7 +106,6 @@ class VllmEngine(BaseEngine):
                 top_k=top_k or generating_args["top_k"],
                 num_return_sequences=num_return_sequences or 1,
                 repetition_penalty=repetition_penalty or generating_args["repetition_penalty"],
-                stop=stop or generating_args["stop"]
             )
         )
 
@@ -124,10 +123,10 @@ class VllmEngine(BaseEngine):
             top_k=generating_args["top_k"],
             use_beam_search=generating_args["num_beams"] > 1,
             length_penalty=generating_args["length_penalty"],
+            stop=stop,
             stop_token_ids=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids,
             max_tokens=generating_args["max_new_tokens"],
             skip_special_tokens=True,
-            stop=generating_args["stop"],
         )
 
         if self.processor is not None and image is not None:

From e3b3a722dec0a29f2e98654abbe5c67d43a698f2 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 00:41:04 +0800
Subject: [PATCH 236/341] fix stop param

Former-commit-id: f0a850c25211b72eddbb357c81679db9b0930d44
---
 src/llmtuner/api/chat.py       | 7 +++++--
 src/llmtuner/api/protocol.py   | 2 +-
 src/llmtuner/chat/hf_engine.py | 4 ++++
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py
index 972ee906..2a703877 100644
--- a/src/llmtuner/api/chat.py
+++ b/src/llmtuner/api/chat.py
@@ -103,7 +103,7 @@ async def create_chat_completion_response(
         top_p=request.top_p,
         max_new_tokens=request.max_tokens,
         num_return_sequences=request.n,
-        stop=request.stop
+        stop=request.stop,
     )
 
     prompt_length, response_length = 0, 0
@@ -145,6 +145,9 @@ async def create_stream_chat_completion_response(
     if tools:
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.")
 
+    if request.n > 1:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream multiple responses.")
+
     yield _create_stream_chat_completion_chunk(
         completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(role=Role.ASSISTANT, content="")
     )
@@ -156,7 +159,7 @@ async def create_stream_chat_completion_response(
         temperature=request.temperature,
         top_p=request.top_p,
         max_new_tokens=request.max_tokens,
-        stop=request.stop
+        stop=request.stop,
     ):
         if len(new_token) != 0:
             yield _create_stream_chat_completion_chunk(
diff --git a/src/llmtuner/api/protocol.py b/src/llmtuner/api/protocol.py
index f526c813..525fa6a7 100644
--- a/src/llmtuner/api/protocol.py
+++ b/src/llmtuner/api/protocol.py
@@ -77,8 +77,8 @@ class ChatCompletionRequest(BaseModel):
     top_p: Optional[float] = None
     n: int = 1
     max_tokens: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = None
     stream: bool = False
-    stop: Union[Optional[str], List[str]] = None
 
 
 class ChatCompletionResponseChoice(BaseModel):
diff --git a/src/llmtuner/chat/hf_engine.py b/src/llmtuner/chat/hf_engine.py
index e8f06a73..97160d57 100644
--- a/src/llmtuner/chat/hf_engine.py
+++ b/src/llmtuner/chat/hf_engine.py
@@ -73,6 +73,10 @@ class HuggingfaceEngine(BaseEngine):
         repetition_penalty = input_kwargs.pop("repetition_penalty", None)
         max_length = input_kwargs.pop("max_length", None)
         max_new_tokens = input_kwargs.pop("max_new_tokens", None)
+        stop = input_kwargs.pop("stop", None)
+
+        if stop is not None:
+            raise ValueError("Stop parameter is not supported in Huggingface engine yet.")
 
         generating_args.update(
             dict(

From 4bde37e7c8242ea27f6bd6e701ae8cc3f975d2b9 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 06:19:29 +0800
Subject: [PATCH 237/341] update readme

Former-commit-id: 3fdc72b9aad9e129f74417cbbf25e841d28e3737
---
 README.md    | 2 ++
 README_zh.md | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 38e68ba8..611ddc81 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,8 @@
 [![Studios](https://img.shields.io/badge/ModelScope-Open%20in%20Studios-blue)](https://modelscope.cn/studios/hiyouga/LLaMA-Board)
 [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)
 
+[![GitHub Tread](https://trendshift.io/api/badge/repositories/4535)](https://trendshift.io/repositories/4535)
+
 👋 Join our [WeChat](assets/wechat.jpg).
 
 \[ English | [中文](README_zh.md) \]
diff --git a/README_zh.md b/README_zh.md
index 826bea47..ecbc15f0 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -13,6 +13,8 @@
 [![Studios](https://img.shields.io/badge/ModelScope-Open%20in%20Studios-blue)](https://modelscope.cn/studios/hiyouga/LLaMA-Board)
 [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)
 
+[![GitHub Tread](https://trendshift.io/api/badge/repositories/4535)](https://trendshift.io/repositories/4535)
+
 👋 加入我们的[微信群](assets/wechat.jpg)。
 
 \[ [English](README.md) | 中文 \]

From 5100c290c416fc4621500d717e1eae0fe6e614af Mon Sep 17 00:00:00 2001
From: Katehuuh <133996730+Katehuuh@users.noreply.github.com>
Date: Tue, 7 May 2024 06:23:36 +0200
Subject: [PATCH 238/341] Update README.md

Add Projects Nekochu/Luminia-13B-v3

Former-commit-id: 3d2cd743c2c8830e8b131d1192f1549fa557762d
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 611ddc81..ee4adce8 100644
--- a/README.md
+++ b/README.md
@@ -466,6 +466,7 @@ If you have a project that should be incorporated, please contact via email or c
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: A series of large language models for Chinese medical domain, based on LLaMA2-7B and Baichuan-13B.
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods.
+1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: [🤗Demo](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) A large language model specialized in generate metadata for stable diffusion.
 
 </details>
 

From 0f626a2145ff1324ffb6d613b75f5e66544e2cf3 Mon Sep 17 00:00:00 2001
From: Katehuuh <133996730+Katehuuh@users.noreply.github.com>
Date: Tue, 7 May 2024 06:28:48 +0200
Subject: [PATCH 239/341] Update README_zh.md

Add Projects Nekochu/Luminia-13B-v3

Former-commit-id: 88d01e831bd511daec30a94817f06e07b8406b18
---
 README_zh.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README_zh.md b/README_zh.md
index ecbc15f0..6019f765 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -466,6 +466,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao，基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: 医疗大模型项目 CareGPT，基于 LLaMA2-7B 和 Baichuan-13B 在中文医疗数据上微调而得。
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**：MBTI性格大模型项目，根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。
+1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: [🤗Demo](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) 一个专门用于生成稳定扩散元数据的大型语言模型。
 
 </details>
 

From 8493753fab3bde19f0726b1e8c9d73b7e1ea38f8 Mon Sep 17 00:00:00 2001
From: "junwooo.lee" <junwooo.lee@navercorp.com>
Date: Tue, 7 May 2024 15:09:48 +0900
Subject: [PATCH 240/341] fix: split Dockerfile's CMD

Former-commit-id: d8032550c7e084648fbf24da5abbac6432b54f26
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 4b8bb084..d757d618 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,4 +11,4 @@ RUN pip install -e .[deepspeed,metrics,bitsandbytes,qwen]
 VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ]
 EXPOSE 7860
 
-CMD [ "llamafactory-cli webui" ]
+CMD [ "llamafactory-cli", "webui" ]

From ebab655683d78f8039cf924ebe7c0cbebe68ab33 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 17:50:27 +0800
Subject: [PATCH 241/341] fix #3602

Former-commit-id: 1518b45490606ea200482da4737113c46985e8c5
---
 examples/README.md                       |  2 ++
 examples/README_zh.md                    |  2 ++
 examples/full_multi_gpu/multi_node.sh    |  8 +++++++-
 examples/full_multi_gpu/single_node.sh   |  7 ++++++-
 examples/lora_multi_gpu/ds_zero3.sh      |  8 ++++++--
 examples/merge_lora/llama3_lora_sft.yaml |  2 +-
 src/api.py                               | 19 +++++++++++++++++++
 7 files changed, 43 insertions(+), 5 deletions(-)
 create mode 100644 src/api.py

diff --git a/examples/README.md b/examples/README.md
index ba993b99..ce19f9d1 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -148,6 +148,8 @@ bash examples/full_multi_gpu/predict.sh
 
 #### Merge LoRA Adapters
 
+Note: DO NOT use quantized model or `quantization_bit` when merging LoRA adapters.
+
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 491ec688..91bdcda9 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -148,6 +148,8 @@ bash examples/full_multi_gpu/predict.sh
 
 #### 合并 LoRA 适配器
 
+注：请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。
+
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index 9c2508b6..962409a1 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -1,6 +1,12 @@
 #!/bin/bash
 
-python -m torch.distributed.run \
+NPROC_PER_NODE=4
+NNODES=2
+RANK=0
+MASTER_ADDR=192.168.0.1
+MASTER_PORT=29500
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
     --nproc_per_node $NPROC_PER_NODE \
     --nnodes $NNODES \
     --node_rank $RANK \
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index f391166a..97f7af64 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
 
-deepspeed --include "localhost:0,1,2,3" \
+NPROC_PER_NODE=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes 1 \
+    --standalone \
     src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
index 304f3780..b8fd2640 100644
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
-# ZeRO-3 enables weight sharding on multiple GPUs
 
-deepspeed --include "localhost:0,1,2,3" \
+NPROC_PER_NODE=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes 1 \
+    --standalone \
     src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/merge_lora/llama3_lora_sft.yaml b/examples/merge_lora/llama3_lora_sft.yaml
index 508a0b8c..de41d48b 100644
--- a/examples/merge_lora/llama3_lora_sft.yaml
+++ b/examples/merge_lora/llama3_lora_sft.yaml
@@ -1,4 +1,4 @@
-# Note: DO NOT use quantized model or quantization_bit when merging lora weights
+# Note: DO NOT use quantized model or quantization_bit when merging lora adapters
 
 # model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
diff --git a/src/api.py b/src/api.py
new file mode 100644
index 00000000..277920ac
--- /dev/null
+++ b/src/api.py
@@ -0,0 +1,19 @@
+import os
+
+import uvicorn
+
+from llmtuner.api.app import create_app
+from llmtuner.chat import ChatModel
+
+
+def main():
+    chat_model = ChatModel()
+    app = create_app(chat_model)
+    api_host = os.environ.get("API_HOST", "0.0.0.0")
+    api_port = int(os.environ.get("API_PORT", "8000"))
+    print("Visit http://localhost:{}/docs for API document.".format(api_port))
+    uvicorn.run(app, host=api_host, port=api_port)
+
+
+if __name__ == "__main__":
+    main()

From f6ac3796ca6faec281867077c2fc12445b558dcb Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 19:03:35 +0800
Subject: [PATCH 242/341] fix #3560

Former-commit-id: ea69cbe903a301df1bcc4b63cdc5bd4c6e3a8255
---
 src/llmtuner/model/utils/longlora.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/model/utils/longlora.py b/src/llmtuner/model/utils/longlora.py
index c3740a73..c91febdd 100644
--- a/src/llmtuner/model/utils/longlora.py
+++ b/src/llmtuner/model/utils/longlora.py
@@ -15,6 +15,7 @@ from transformers.utils import logging
 from transformers.utils.versions import require_version
 
 from ...extras.constants import SUPPORTED_CLASS_FOR_S2ATTN
+from ...extras.logging import get_logger
 
 
 if TYPE_CHECKING:
@@ -179,10 +180,12 @@ def llama_flash_attention_2_forward(
 
         query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
         if attention_mask is not None:
-            attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1)
+            attention_mask = attention_mask[:, :groupsz].repeat(num_groups, 1)
+    else:
+        groupsz = q_len
 
     attn_output: torch.Tensor = self._flash_attention_forward(
-        query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
+        query_states, key_states, value_states, attention_mask, groupsz, dropout=dropout_rate
     )
 
     if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
@@ -299,7 +302,7 @@ def llama_sdpa_attention_forward(
 
 
 def _apply_llama_patch() -> None:
-    require_version("transformers==4.40.0", "To fix: pip install transformers==4.40.0")
+    require_version("transformers==4.40.1", "To fix: pip install transformers==4.40.1")
     LlamaAttention.forward = llama_attention_forward
     LlamaFlashAttention2.forward = llama_flash_attention_2_forward
     LlamaSdpaAttention.forward = llama_sdpa_attention_forward
@@ -309,6 +312,8 @@ def configure_longlora(config: "PretrainedConfig", model_args: "ModelArguments",
     if not is_trainable or not model_args.shift_attn:
         return
 
+    logger = get_logger(__name__)
+
     if getattr(config, "model_type", None) in SUPPORTED_CLASS_FOR_S2ATTN:
         setattr(config, "group_size_ratio", 0.25)
         _apply_llama_patch()

From 0c811a7653681548c44049c3a6ab16213c2ccf7b Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 19:03:47 +0800
Subject: [PATCH 243/341] update readme

Former-commit-id: 730ea71584debc5784d68eeadceb42f7e827447f
---
 README.md    | 4 ++--
 README_zh.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index ee4adce8..628f6f65 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-42-green)](#projects-using-llama-factory)
+[![Citation](https://img.shields.io/badge/citation-43-green)](#projects-using-llama-factory)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -466,7 +466,7 @@ If you have a project that should be incorporated, please contact via email or c
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: A series of large language models for Chinese medical domain, based on LLaMA2-7B and Baichuan-13B.
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods.
-1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: [🤗Demo](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) A large language model specialized in generate metadata for stable diffusion.
+1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: A large language model specialized in generate metadata for stable diffusion. [[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
 
 </details>
 
diff --git a/README_zh.md b/README_zh.md
index 6019f765..c41d126c 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-42-green)](#使用了-llama-factory-的项目)
+[![Citation](https://img.shields.io/badge/citation-43-green)](#使用了-llama-factory-的项目)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -466,7 +466,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao，基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: 医疗大模型项目 CareGPT，基于 LLaMA2-7B 和 Baichuan-13B 在中文医疗数据上微调而得。
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**：MBTI性格大模型项目，根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。
-1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: [🤗Demo](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) 一个专门用于生成稳定扩散元数据的大型语言模型。
+1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**：一个用于生成 Stable Diffusion 提示词的大型语言模型。[[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
 
 </details>
 

From 8061e92d07015c0ed3f9454701e28bb9f28c3a2b Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 21:17:31 +0800
Subject: [PATCH 244/341] update readme

Former-commit-id: ecefcb2e891e75d37df5ebfc616cfdb2106bcfd6
---
 README.md    | 4 ++--
 README_zh.md | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 628f6f65..92caee38 100644
--- a/README.md
+++ b/README.md
@@ -339,7 +339,7 @@ Please refer to [data/README.md](data/README.md) for checking the details about
 
 ### Quickstart
 
-Use the following 3 commands to conduct LoRA **fine-tuning**, **inference** and **merging** for Llama3-8B-Instruct model, respectively.
+Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Llama3-8B-Instruct model, respectively.
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -352,7 +352,7 @@ See [examples/README.md](examples/README.md) for advanced usage (including distr
 > [!TIP]
 > Use `llamafactory-cli help` to show help information.
 
-### Use LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
+### Fine-Tuning with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
 
 > [!IMPORTANT]
 > LLaMA Board GUI only supports training on a single GPU.
diff --git a/README_zh.md b/README_zh.md
index c41d126c..ff64097d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -352,7 +352,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_s
 > [!TIP]
 > 使用 `llamafactory-cli help` 显示帮助信息。
 
-### 使用 LLaMA Board 可视化界面（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
+### LLaMA Board 可视化微调（由 [Gradio](https://github.com/gradio-app/gradio) 驱动）
 
 > [!IMPORTANT]
 > LLaMA Board 可视化界面目前仅支持单 GPU 训练。

From 3f7f1daa3341a5add1ab497d8d64a275549845a8 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 22:14:06 +0800
Subject: [PATCH 245/341] remove big file

Former-commit-id: 8a05242787f810ec25d1b33358257d2867c45497
---
 data/oaast_rm.json.REMOVED.git-id  | 1 -
 data/oaast_sft.json.REMOVED.git-id | 1 -
 2 files changed, 2 deletions(-)
 delete mode 100644 data/oaast_rm.json.REMOVED.git-id
 delete mode 100644 data/oaast_sft.json.REMOVED.git-id

diff --git a/data/oaast_rm.json.REMOVED.git-id b/data/oaast_rm.json.REMOVED.git-id
deleted file mode 100644
index 41e74ec9..00000000
--- a/data/oaast_rm.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-274079ea921762be356de85b18f13fa60b7ba8cb
\ No newline at end of file
diff --git a/data/oaast_sft.json.REMOVED.git-id b/data/oaast_sft.json.REMOVED.git-id
deleted file mode 100644
index fd29e313..00000000
--- a/data/oaast_sft.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-57fd080be5bffe4153fe3ee26a175e3d56da30f3
\ No newline at end of file

From 6d9d8b92ca95ca9b3c5f7b44899be7933265146d Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 7 May 2024 22:17:04 +0800
Subject: [PATCH 246/341] update readme

Former-commit-id: bcc3d3b95609555e5e9a4deb68e65391c5b465bd
---
 README.md    | 6 +++---
 README_zh.md | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 92caee38..e9d93daf 100644
--- a/README.md
+++ b/README.md
@@ -207,8 +207,8 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ
 - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
 - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
 - [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-- [Self Cognition (zh)](data/self_cognition.json)
-- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [Identity (en&zh)](data/identity.json)
+- [Open Assistant (zh)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
 - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
@@ -256,11 +256,11 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ
 <details><summary>Preference datasets</summary>
 
 - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
-- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Orca DPO (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
+- [Open Assistant (zh)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
 
 </details>
diff --git a/README_zh.md b/README_zh.md
index ff64097d..15758ae4 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -207,8 +207,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
 - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
 - [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-- [Self Cognition (zh)](data/self_cognition.json)
-- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [Identity (en&zh)](data/identity.json)
+- [Open Assistant (zh)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
 - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
@@ -256,11 +256,11 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 <details><summary>偏好数据集</summary>
 
 - [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
-- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Orca DPO (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
+- [Open Assistant (zh)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
 
 </details>

From 05ef89cfccb85e56382c616dbbde1345c4c0ac2a Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Wed, 8 May 2024 10:36:36 +0800
Subject: [PATCH 247/341] modify export model

Former-commit-id: c7051edae4ce23f85daf204a2aaac134b1f29c3d
---
 src/llmtuner/train/tuner.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index e1a997c1..6973a4e5 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -15,11 +15,9 @@ from .pt import run_pt
 from .rm import run_rm
 from .sft import run_sft
 
-
 if TYPE_CHECKING:
     from transformers import TrainerCallback
 
-
 logger = get_logger(__name__)
 
 
@@ -52,7 +50,9 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
     if model_args.adapter_name_or_path is not None and model_args.export_quantization_bit is not None:
         raise ValueError("Please merge adapters before quantizing the model.")
 
-    tokenizer = load_tokenizer(model_args)["tokenizer"]
+    tokenizer_module = load_tokenizer(model_args)["tokenizer"]
+    tokenizer = tokenizer_module['tokenizer']
+    processor = tokenizer_module['processor']
     get_template_and_fix_tokenizer(tokenizer, data_args.template)
     model = load_model(tokenizer, model_args, finetuning_args)  # must after fixing tokenizer to resize vocab
 
@@ -88,3 +88,6 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
             tokenizer.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token)
     except Exception:
         logger.warning("Cannot save tokenizer, please copy the files manually.")
+
+    if model_args.visual_inputs:
+        processor.image_processor.save_pretrained(model_args.export_dir)

From 0a1b6ca5a7aa99e371eea762800741c26bc9d381 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 8 May 2024 16:37:54 +0800
Subject: [PATCH 248/341] add deepseek moe 236B

Former-commit-id: 30c10e2dc41b5d64191a91ad2d61f3b5c440b1d5
---
 README.md                        | 2 +-
 README_zh.md                     | 2 +-
 requirements.txt                 | 2 +-
 setup.py                         | 2 +-
 src/llmtuner/extras/constants.py | 8 ++++++++
 5 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index e9d93daf..798b7bd4 100644
--- a/README.md
+++ b/README.md
@@ -145,7 +145,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B      | query_key_value   | -         |
 | [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                               | query_key_value   | chatglm3  |
 | [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                         | q_proj,v_proj     | cohere    |
-| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                       | q_proj,v_proj     | deepseek  |
+| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B/236B                  | q_proj,v_proj     | deepseek  |
 | [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                      | query_key_value   | falcon    |
 | [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                            | q_proj,v_proj     | gemma     |
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                           | wqkv              | intern2   |
diff --git a/README_zh.md b/README_zh.md
index 15758ae4..2c5b1aa1 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -145,7 +145,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [BLOOMZ](https://huggingface.co/bigscience)              | 560M/1.1B/1.7B/3B/7.1B/176B      | query_key_value   | -         |
 | [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                               | query_key_value   | chatglm3  |
 | [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                         | q_proj,v_proj     | cohere    |
-| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B                       | q_proj,v_proj     | deepseek  |
+| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B/236B                  | q_proj,v_proj     | deepseek  |
 | [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                      | query_key_value   | falcon    |
 | [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                            | q_proj,v_proj     | gemma     |
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                           | wqkv              | intern2   |
diff --git a/requirements.txt b/requirements.txt
index f4818ed2..67bd7033 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ uvicorn
 pydantic
 fastapi
 sse-starlette
-matplotlib
+matplotlib>=3.7.0
 fire
 packaging
 pyyaml
diff --git a/setup.py b/setup.py
index 7b849942..ddc3a594 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@ def get_requires():
 
 extra_require = {
     "metrics": ["nltk", "jieba", "rouge-chinese"],
-    "deepspeed": ["deepspeed>=0.10.0"],
+    "deepspeed": ["deepspeed>=0.10.0,<=0.14.0"],
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "vllm": ["vllm>=0.4.0"],
     "galore": ["galore-torch"],
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index bf542e69..b620bed4 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -324,6 +324,14 @@ register_model_group(
             DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat",
             DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat",
         },
+        "DeepSeek-MoE-236B": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2",
+        },
+        "DeepSeek-MoE-236B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat",
+        },
     },
     template="deepseek",
 )

From 400ae144a449e0a8a33002821a480b486e2c9c0e Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 8 May 2024 17:10:03 +0800
Subject: [PATCH 249/341] add llama3 chinese chat

Former-commit-id: ee3e5920f2f28567259693cb106e884a90cb02a2
---
 src/llmtuner/extras/constants.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index b620bed4..e055f1f3 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -577,6 +577,10 @@ register_model_group(
             DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B-Instruct",
             DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B-Instruct",
         },
+        "LLaMA3-8B-Chinese-Chat": {
+            DownloadSource.DEFAULT: "shenzhi-wang/Llama3-8B-Chinese-Chat",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama3-8B-Chinese-Chat",
+        },
     },
     template="llama3",
 )

From 4ce4172c87803de377af89fc2a1150da9d565529 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 8 May 2024 17:12:56 +0800
Subject: [PATCH 250/341] fix #3625

Former-commit-id: 8c0f5d1db29862277d84aa128b424b7d0f2b187f
---
 src/llmtuner/model/utils/valuehead.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llmtuner/model/utils/valuehead.py b/src/llmtuner/model/utils/valuehead.py
index a192dcfa..a6180753 100644
--- a/src/llmtuner/model/utils/valuehead.py
+++ b/src/llmtuner/model/utils/valuehead.py
@@ -57,3 +57,7 @@ def prepare_valuehead_model(model: "PreTrainedModel") -> None:
     if getattr(model.config, "model_type", None) == "chatglm":
         setattr(model, "lm_head", model.transformer.output_layer)
         setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
+
+    if getattr(model.config, "model_type", None) == "internlm2":
+        setattr(model, "lm_head", model.output)
+        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])

From 172600d4326a444b854059500515958d78d09f95 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Wed, 8 May 2024 22:50:42 +0800
Subject: [PATCH 251/341] add mllm export

Former-commit-id: ce4770d33f6761d3b1d60661efcb0be34a036154
---
 src/llmtuner/train/tuner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index 6973a4e5..00349e09 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -50,7 +50,7 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
     if model_args.adapter_name_or_path is not None and model_args.export_quantization_bit is not None:
         raise ValueError("Please merge adapters before quantizing the model.")
 
-    tokenizer_module = load_tokenizer(model_args)["tokenizer"]
+    tokenizer_module = load_tokenizer(model_args)
     tokenizer = tokenizer_module['tokenizer']
     processor = tokenizer_module['processor']
     get_template_and_fix_tokenizer(tokenizer, data_args.template)

From 333f4a69bb88ef598f5a0c105be8885d61f066f1 Mon Sep 17 00:00:00 2001
From: cocktailpeanut <cocktailpeanuts@proton.me>
Date: Thu, 9 May 2024 01:26:15 -0400
Subject: [PATCH 252/341] remove unnecessary environment variable usage

Former-commit-id: 4be1d832cb269a07987f5cab5d5f949e269087da
---
 src/webui.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/webui.py b/src/webui.py
index b9385259..000098d1 100644
--- a/src/webui.py
+++ b/src/webui.py
@@ -5,9 +5,7 @@ from llmtuner.webui.interface import create_ui
 
 def main():
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
-    server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
-    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
-    create_ui().queue().launch(share=gradio_share, server_name=server_name, server_port=server_port)
+    create_ui().queue().launch(server_name=server_name)
 
 
 if __name__ == "__main__":

From c213f2a9a90d52096a717ebdc8ec65399c5f4559 Mon Sep 17 00:00:00 2001
From: cocktailpeanut <cocktailpeanuts@proton.me>
Date: Thu, 9 May 2024 01:32:00 -0400
Subject: [PATCH 253/341] more removal of unnecessary environment variables

Former-commit-id: 59ef1a6e0d81585a6c010143d05fcfae26d40c00
---
 src/llmtuner/webui/interface.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index 969ce6bd..6cfce8aa 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -79,6 +79,4 @@ def run_web_ui() -> None:
 
 def run_web_demo() -> None:
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
-    server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
-    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
-    create_web_demo().queue().launch(share=gradio_share, server_name=server_name, server_port=server_port)
+    create_web_demo().queue().launch(server_name=server_name)

From 2aeae4b88b3d58285e518ebdee68ef57595f979a Mon Sep 17 00:00:00 2001
From: cocktailpeanut <cocktailpeanuts@proton.me>
Date: Thu, 9 May 2024 01:33:20 -0400
Subject: [PATCH 254/341] yet another removal of unnecessary environment
 variables

Former-commit-id: a07726028f0287de28e4751672b27efe0efc6477
---
 src/llmtuner/webui/interface.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index 6cfce8aa..bbd91bb7 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -72,9 +72,7 @@ def create_web_demo() -> gr.Blocks:
 
 def run_web_ui() -> None:
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
-    server_port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
-    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
-    create_ui().queue().launch(share=gradio_share, server_name=server_name, server_port=server_port)
+    create_ui().queue().launch(server_name=server_name)
 
 
 def run_web_demo() -> None:

From e508519e0a7a1abad11857f7bfa2c3b5c33c5d08 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 9 May 2024 13:53:39 +0800
Subject: [PATCH 255/341] add mllm processor save and Chinese-LLaVA-Med show

Former-commit-id: 110c49fbf79fe0625f091e63746bfabde00add99
---
 README.md    | 1 +
 README_zh.md | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 798b7bd4..e71ee552 100644
--- a/README.md
+++ b/README.md
@@ -467,6 +467,7 @@ If you have a project that should be incorporated, please contact via email or c
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: A series of large language models for Chinese medical domain, based on LLaMA2-7B and Baichuan-13B.
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods.
 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: A large language model specialized in generate metadata for stable diffusion. [[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
+1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**: A multimodal large language model specialized in Chinese medical domain, based on LLaVA-1.5-7B.
 
 </details>
 
diff --git a/README_zh.md b/README_zh.md
index 2c5b1aa1..7c0497c2 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -467,6 +467,8 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: 医疗大模型项目 CareGPT，基于 LLaMA2-7B 和 Baichuan-13B 在中文医疗数据上微调而得。
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**：MBTI性格大模型项目，根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。
 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**：一个用于生成 Stable Diffusion 提示词的大型语言模型。[[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
+1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**: 中文多模态医学大模型，基于 LLaVA-1.5-7B 在中文多模态医疗数据上微调而得
+
 
 </details>
 

From 827a929f1d1d4cb037f756ff9cfe60e3353c5edd Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Thu, 9 May 2024 14:05:19 +0800
Subject: [PATCH 256/341] add push processor to hub

Former-commit-id: 7a05a965311edfdfafa57af8342875860d341f27
---
 src/llmtuner/train/tuner.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index 00349e09..11509c20 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -91,3 +91,5 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
 
     if model_args.visual_inputs:
         processor.image_processor.save_pretrained(model_args.export_dir)
+        if model_args.export_hub_model_id is not None:
+            processor.image_processor.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token)
\ No newline at end of file

From 9dadff90bb2a5875fb5c60aff85ddde5518d33f2 Mon Sep 17 00:00:00 2001
From: Tendo33 <sjf1998112@gmail.com>
Date: Thu, 9 May 2024 14:28:01 +0800
Subject: [PATCH 257/341] 1. Rename is_fastapi_availble to is_fastapi_available
 2. Log incoming requests when deploying using vllm

Former-commit-id: 530d4f5d51c13c71d99de5fe2d23805b0aa875a2
---
 src/llmtuner/api/app.py         | 16 +++++---
 src/llmtuner/api/chat.py        | 73 ++++++++++++++++++++++-----------
 src/llmtuner/api/common.py      |  6 +--
 src/llmtuner/extras/packages.py |  2 +-
 4 files changed, 64 insertions(+), 33 deletions(-)

diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py
index 375ee61f..2d93312d 100644
--- a/src/llmtuner/api/app.py
+++ b/src/llmtuner/api/app.py
@@ -4,7 +4,7 @@ from typing import Annotated, Optional
 
 from ..chat import ChatModel
 from ..extras.misc import torch_gc
-from ..extras.packages import is_fastapi_availble, is_starlette_available, is_uvicorn_available
+from ..extras.packages import is_fastapi_available, is_starlette_available, is_uvicorn_available
 from .chat import (
     create_chat_completion_response,
     create_score_evaluation_response,
@@ -20,7 +20,7 @@ from .protocol import (
 )
 
 
-if is_fastapi_availble():
+if is_fastapi_available():
     from fastapi import Depends, FastAPI, HTTPException, status
     from fastapi.middleware.cors import CORSMiddleware
     from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer
@@ -54,7 +54,8 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
 
     async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]):
         if api_key and (auth is None or auth.credentials != api_key):
-            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.")
+            raise HTTPException(
+                status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.")
 
     @app.get(
         "/v1/models",
@@ -74,10 +75,12 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
     )
     async def create_chat_completion(request: ChatCompletionRequest):
         if not chat_model.engine.can_generate:
-            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+            raise HTTPException(
+                status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
 
         if request.stream:
-            generate = create_stream_chat_completion_response(request, chat_model)
+            generate = create_stream_chat_completion_response(
+                request, chat_model)
             return EventSourceResponse(generate, media_type="text/event-stream")
         else:
             return await create_chat_completion_response(request, chat_model)
@@ -90,7 +93,8 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
     )
     async def create_score_evaluation(request: ScoreEvaluationRequest):
         if chat_model.engine.can_generate:
-            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+            raise HTTPException(
+                status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
 
         return await create_score_evaluation_response(request, chat_model)
 
diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py
index 2a703877..3ab473d1 100644
--- a/src/llmtuner/api/chat.py
+++ b/src/llmtuner/api/chat.py
@@ -3,7 +3,8 @@ import uuid
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Optional, Tuple
 
 from ..data import Role as DataRole
-from ..extras.packages import is_fastapi_availble
+from ..extras.packages import is_fastapi_available
+from ..extras.logging import get_logger
 from .common import dictify, jsonify
 from .protocol import (
     ChatCompletionMessage,
@@ -19,8 +20,9 @@ from .protocol import (
     ScoreEvaluationResponse,
 )
 
+logger = get_logger(__name__)
 
-if is_fastapi_availble():
+if is_fastapi_available():
     from fastapi import HTTPException, status
 
 
@@ -39,8 +41,13 @@ ROLE_MAPPING = {
 
 
 def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, str]], str, str]:
+
+    params = dictify(request)
+    logger.info(f"==== request ====\n{params}")
+
     if len(request.messages) == 0:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
 
     if request.messages[0].role == Role.SYSTEM:
         system = request.messages.pop(0).content
@@ -48,29 +55,37 @@ def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, s
         system = ""
 
     if len(request.messages) % 2 == 0:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...")
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST,
+                            detail="Only supports u/a/u/a/u...")
 
     input_messages = []
     for i, message in enumerate(request.messages):
         if i % 2 == 0 and message.role not in [Role.USER, Role.TOOL]:
-            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
         elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]:
-            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
 
         if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls):
             name = message.tool_calls[0].function.name
             arguments = message.tool_calls[0].function.arguments
-            content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False)
-            input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content})
+            content = json.dumps(
+                {"name": name, "argument": arguments}, ensure_ascii=False)
+            input_messages.append(
+                {"role": ROLE_MAPPING[Role.FUNCTION], "content": content})
         else:
-            input_messages.append({"role": ROLE_MAPPING[message.role], "content": message.content})
+            input_messages.append(
+                {"role": ROLE_MAPPING[message.role], "content": message.content})
 
     tool_list = request.tools
     if isinstance(tool_list, list) and len(tool_list):
         try:
-            tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False)
+            tools = json.dumps([dictify(tool.function)
+                               for tool in tool_list], ensure_ascii=False)
         except Exception:
-            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools")
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools")
     else:
         tools = ""
 
@@ -84,8 +99,10 @@ def _create_stream_chat_completion_chunk(
     index: Optional[int] = 0,
     finish_reason: Optional["Finish"] = None,
 ) -> str:
-    choice_data = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason)
-    chunk = ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice_data])
+    choice_data = ChatCompletionStreamResponseChoice(
+        index=index, delta=delta, finish_reason=finish_reason)
+    chunk = ChatCompletionStreamResponse(
+        id=completion_id, model=model, choices=[choice_data])
     return jsonify(chunk)
 
 
@@ -110,21 +127,26 @@ async def create_chat_completion_response(
     choices = []
     for i, response in enumerate(responses):
         if tools:
-            result = chat_model.engine.template.format_tools.extract(response.response_text)
+            result = chat_model.engine.template.format_tools.extract(
+                response.response_text)
         else:
             result = response.response_text
 
         if isinstance(result, tuple):
             name, arguments = result
             function = Function(name=name, arguments=arguments)
-            tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function)
-            response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=[tool_call])
+            tool_call = FunctionCall(id="call_{}".format(
+                uuid.uuid4().hex), function=function)
+            response_message = ChatCompletionMessage(
+                role=Role.ASSISTANT, tool_calls=[tool_call])
             finish_reason = Finish.TOOL
         else:
-            response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result)
+            response_message = ChatCompletionMessage(
+                role=Role.ASSISTANT, content=result)
             finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH
 
-        choices.append(ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason))
+        choices.append(ChatCompletionResponseChoice(
+            index=i, message=response_message, finish_reason=finish_reason))
         prompt_length = response.prompt_length
         response_length += response.response_length
 
@@ -143,13 +165,16 @@ async def create_stream_chat_completion_response(
     completion_id = "chatcmpl-{}".format(uuid.uuid4().hex)
     input_messages, system, tools = _process_request(request)
     if tools:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.")
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST,
+                            detail="Cannot stream function calls.")
 
     if request.n > 1:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream multiple responses.")
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST,
+                            detail="Cannot stream multiple responses.")
 
     yield _create_stream_chat_completion_chunk(
-        completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(role=Role.ASSISTANT, content="")
+        completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(
+            role=Role.ASSISTANT, content="")
     )
     async for new_token in chat_model.astream_chat(
         input_messages,
@@ -163,7 +188,8 @@ async def create_stream_chat_completion_response(
     ):
         if len(new_token) != 0:
             yield _create_stream_chat_completion_chunk(
-                completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(content=new_token)
+                completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(
+                    content=new_token)
             )
 
     yield _create_stream_chat_completion_chunk(
@@ -176,7 +202,8 @@ async def create_score_evaluation_response(
     request: "ScoreEvaluationRequest", chat_model: "ChatModel"
 ) -> "ScoreEvaluationResponse":
     if len(request.messages) == 0:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request")
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request")
 
     scores = await chat_model.aget_scores(request.messages, max_length=request.max_length)
     return ScoreEvaluationResponse(model=request.model, scores=scores)
diff --git a/src/llmtuner/api/common.py b/src/llmtuner/api/common.py
index 5ad9a071..3e95d211 100644
--- a/src/llmtuner/api/common.py
+++ b/src/llmtuner/api/common.py
@@ -6,11 +6,11 @@ if TYPE_CHECKING:
     from pydantic import BaseModel
 
 
-def dictify(data: "BaseModel") -> Dict[str, Any]:
+def dictify(data: "BaseModel", **kwargs) -> Dict[str, Any]:
     try:  # pydantic v2
-        return data.model_dump(exclude_unset=True)
+        return data.model_dump(**kwargs)
     except AttributeError:  # pydantic v1
-        return data.dict(exclude_unset=True)
+        return data.dict(**kwargs)
 
 
 def jsonify(data: "BaseModel") -> str:
diff --git a/src/llmtuner/extras/packages.py b/src/llmtuner/extras/packages.py
index a7317eec..4c9e6492 100644
--- a/src/llmtuner/extras/packages.py
+++ b/src/llmtuner/extras/packages.py
@@ -20,7 +20,7 @@ def _get_package_version(name: str) -> "Version":
         return version.parse("0.0.0")
 
 
-def is_fastapi_availble():
+def is_fastapi_available():
     return _is_package_available("fastapi")
 
 
From 223bbd993021a76b2801e0f04b93670dd82327fd Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 9 May 2024 16:52:27 +0800
Subject: [PATCH 258/341] resolve python 3.8 package

Former-commit-id: 5eee4ec7016846356715a4fa1ad58e3cbb1cac6e
---
 README.md               | 10 ++++++++--
 README_zh.md            | 10 ++++++++--
 requirements.txt        |  1 -
 setup.py                |  1 +
 src/llmtuner/api/app.py |  4 +++-
 5 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 798b7bd4..474e536d 100644
--- a/README.md
+++ b/README.md
@@ -310,13 +310,19 @@ huggingface-cli login
 
 ### Installation
 
+> [!IMPORTANT]
+> Installation is mandatory.
+
 ```bash
 git clone https://github.com/hiyouga/LLaMA-Factory.git
 cd LLaMA-Factory
-pip install -e .[metrics]
+pip install -e .[torch,metrics]
 ```
 
-Extra dependencies available: metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality
+Extra dependencies available: torch, metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality
+
+> [!TIP]
+> Use `pip install --no-deps -e .` to resolve package conflicts.
 
 <details><summary>For Windows users</summary>
 
diff --git a/README_zh.md b/README_zh.md
index 2c5b1aa1..1df9bcba 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -310,13 +310,19 @@ huggingface-cli login
 
 ### 安装 LLaMA Factory
 
+> [!IMPORTANT]
+> 此步骤为必需。
+
 ```bash
 git clone https://github.com/hiyouga/LLaMA-Factory.git
 cd LLaMA-Factory
-pip install -e .[metrics]
+pip install -e .[torch,metrics]
 ```
 
-可选的额外依赖项：metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality
+可选的额外依赖项：torch、metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality
+
+> [!TIP]
+> 遇到包冲突时，可使用 `pip install --no-deps -e .` 解决。
 
 <details><summary>Windows 用户指南</summary>
 
diff --git a/requirements.txt b/requirements.txt
index 67bd7033..f4a942e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-torch>=1.13.1
 transformers>=4.37.2
 datasets>=2.14.3
 accelerate>=0.27.2
diff --git a/setup.py b/setup.py
index ddc3a594..2f684753 100644
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,7 @@ def get_requires():
 
 
 extra_require = {
+    "torch": ["torch>=1.13.1"],
     "metrics": ["nltk", "jieba", "rouge-chinese"],
     "deepspeed": ["deepspeed>=0.10.0,<=0.14.0"],
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py
index 375ee61f..6d3d5afc 100644
--- a/src/llmtuner/api/app.py
+++ b/src/llmtuner/api/app.py
@@ -1,6 +1,8 @@
 import os
 from contextlib import asynccontextmanager
-from typing import Annotated, Optional
+from typing import Optional
+
+from typing_extensions import Annotated
 
 from ..chat import ChatModel
 from ..extras.misc import torch_gc

From 9b1008912c318e7e5fa512603d891e79b3ba2ed6 Mon Sep 17 00:00:00 2001
From: kkkl <57311960+YUUUCC@users.noreply.github.com>
Date: Sat, 11 May 2024 00:22:40 +0800
Subject: [PATCH 259/341] Update constants.py

Fix the download issue of the Phi3 model

Former-commit-id: 8978e80914ac6db1ed1b79641b20c84087dd4341
---
 src/llmtuner/extras/constants.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index e055f1f3..50c78b3f 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -715,11 +715,11 @@ register_model_group(
     models={
         "Phi3-3.8B-4k-Chat": {
             DownloadSource.DEFAULT: "microsoft/Phi-3-mini-4k-instruct",
-            DownloadSource.DEFAULT: "LLM-Research/Phi-3-mini-4k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-4k-instruct",
         },
         "Phi3-3.8B-128k-Chat": {
             DownloadSource.DEFAULT: "microsoft/Phi-3-mini-128k-instruct",
-            DownloadSource.DEFAULT: "LLM-Research/Phi-3-mini-128k-instruct",
+            DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-128k-instruct",
         },
     },
     module="qkv_proj",

From 1a78b675beb4a77643631f9e8e6874368da013bc Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Sat, 11 May 2024 13:11:00 +0800
Subject: [PATCH 260/341] add full parameter finetuning of mllm

Former-commit-id: f90c1da5636ac3cb8112c5081a3b56b09a17fcf8
---
 src/llmtuner/hparams/model_args.py | 4 ++++
 src/llmtuner/model/loader.py       | 3 ++-
 src/llmtuner/model/patcher.py      | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index ac70bb3c..996eabae 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -85,6 +85,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "Whethor or not to use multimodal LLM that accepts visual inputs."},
     )
+    autocast_projector: bool = field(
+        default=True,
+        metadata={"help": "Whethor or not to autocast projector."},
+    )
     moe_aux_loss_coef: Optional[float] = field(
         default=None,
         metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."},
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index ead6178f..1dca84a1 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -155,7 +155,8 @@ def load_model(
         model.eval()
     else:
         model.train()
-
+        if model_args.visual_inputs:
+            model.vision_tower.requires_grad_(False)
     trainable_params, all_param = count_parameters(model)
     if is_trainable:
         param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 31cba492..6ca6f2e5 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -101,7 +101,7 @@ def patch_model(
     if model_args.resize_vocab:
         resize_embedding_layer(model, tokenizer)
 
-    if model_args.visual_inputs:
+    if model_args.visual_inputs and model_args.autocast_projector:
         autocast_projector_dtype(model, model_args)
 
     if is_trainable:

From b530a798c14bec4911c2ecf84be5163eeefc0426 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 11 May 2024 22:43:04 +0800
Subject: [PATCH 261/341] Update README.md

Former-commit-id: d24c83bb30e2829ba78db90c4c4975788f2eed25
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5880dbea..90c66caf 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-43-green)](#projects-using-llama-factory)
+[![Citation](https://img.shields.io/badge/citation-44-green)](#projects-using-llama-factory)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)

From e2cfcb0a5f71ee86a640a6cdd026307dda4415b5 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 11 May 2024 22:44:51 +0800
Subject: [PATCH 262/341] Update README_zh.md

Former-commit-id: 1a205478403b5852fac0aa8418cdb8995fbe40e3
---
 README_zh.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README_zh.md b/README_zh.md
index 330a012e..0aba9043 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -5,7 +5,7 @@
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
 [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
 [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Citation](https://img.shields.io/badge/citation-43-green)](#使用了-llama-factory-的项目)
+[![Citation](https://img.shields.io/badge/citation-44-green)](#使用了-llama-factory-的项目)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
 [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
@@ -473,7 +473,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: 医疗大模型项目 CareGPT，基于 LLaMA2-7B 和 Baichuan-13B 在中文医疗数据上微调而得。
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**：MBTI性格大模型项目，根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。
 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**：一个用于生成 Stable Diffusion 提示词的大型语言模型。[[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
-1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**: 中文多模态医学大模型，基于 LLaVA-1.5-7B 在中文多模态医疗数据上微调而得
+1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**：中文多模态医学大模型，基于 LLaVA-1.5-7B 在中文多模态医疗数据上微调而得。
 
 
 </details>

From 0ccc76392e22fbe8608182ec6752070043a7298e Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 11 May 2024 23:54:53 +0800
Subject: [PATCH 263/341] Update tuner.py

Former-commit-id: 22afcbdb25160583e5ece28fad0585c7bc70f41a
---
 src/llmtuner/train/tuner.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index 11509c20..cf44aa8c 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -15,9 +15,11 @@ from .pt import run_pt
 from .rm import run_rm
 from .sft import run_sft
 
+
 if TYPE_CHECKING:
     from transformers import TrainerCallback
 
+
 logger = get_logger(__name__)
 
 
@@ -51,8 +53,8 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
         raise ValueError("Please merge adapters before quantizing the model.")
 
     tokenizer_module = load_tokenizer(model_args)
-    tokenizer = tokenizer_module['tokenizer']
-    processor = tokenizer_module['processor']
+    tokenizer = tokenizer_module["tokenizer"]
+    processor = tokenizer_module["processor"]
     get_template_and_fix_tokenizer(tokenizer, data_args.template)
     model = load_model(tokenizer, model_args, finetuning_args)  # must after fixing tokenizer to resize vocab
 
@@ -63,7 +65,7 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
         raise ValueError("The model is not a `PreTrainedModel`, export aborted.")
 
     if getattr(model, "quantization_method", None) is None:  # cannot convert dtype of a quantized model
-        output_dtype = getattr(model.config, "torch_dtype", torch.float16)
+        output_dtype = torch.float16
         setattr(model.config, "torch_dtype", output_dtype)
         model = model.to(output_dtype)
 
@@ -86,10 +88,12 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
         tokenizer.save_pretrained(model_args.export_dir)
         if model_args.export_hub_model_id is not None:
             tokenizer.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token)
+
+        if model_args.visual_inputs and processor is not None:
+            getattr(processor, "image_processor").save_pretrained(model_args.export_dir)
+            if model_args.export_hub_model_id is not None:
+                getattr(processor, "image_processor").push_to_hub(
+                    model_args.export_hub_model_id, token=model_args.hf_hub_token
+                )
     except Exception:
         logger.warning("Cannot save tokenizer, please copy the files manually.")
-
-    if model_args.visual_inputs:
-        processor.image_processor.save_pretrained(model_args.export_dir)
-        if model_args.export_hub_model_id is not None:
-            processor.image_processor.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token)
\ No newline at end of file

From 382f0964756f5b8f6adc65641e9e0d303facf6fa Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 11 May 2024 23:55:59 +0800
Subject: [PATCH 264/341] Update tuner.py

Former-commit-id: ccd1eb2c0992f75440c0e1c5cd3f02d03aacb085
---
 src/llmtuner/train/tuner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index cf44aa8c..ffdc3e60 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -65,7 +65,7 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
         raise ValueError("The model is not a `PreTrainedModel`, export aborted.")
 
     if getattr(model, "quantization_method", None) is None:  # cannot convert dtype of a quantized model
-        output_dtype = torch.float16
+        output_dtype = getattr(model.config, "torch_dtype", torch.float16)
         setattr(model.config, "torch_dtype", output_dtype)
         model = model.to(output_dtype)
 

From d54313fcf932eeaffaa151ca08fc9729be614740 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 11 May 2024 23:56:40 +0800
Subject: [PATCH 265/341] Update patcher.py

Former-commit-id: 2c88d394d29c6e98ac3a6860848855722614ca52
---
 src/llmtuner/model/patcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 6ca6f2e5..31cba492 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -101,7 +101,7 @@ def patch_model(
     if model_args.resize_vocab:
         resize_embedding_layer(model, tokenizer)
 
-    if model_args.visual_inputs and model_args.autocast_projector:
+    if model_args.visual_inputs:
         autocast_projector_dtype(model, model_args)
 
     if is_trainable:

From 4efe56fd68852681c6f18d5b99c08da1b5bec435 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 11 May 2024 23:57:05 +0800
Subject: [PATCH 266/341] Update model_args.py

Former-commit-id: c4114add4c42c1d7723f7270451a6c9fc656ecd1
---
 src/llmtuner/hparams/model_args.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py
index 996eabae..ac70bb3c 100644
--- a/src/llmtuner/hparams/model_args.py
+++ b/src/llmtuner/hparams/model_args.py
@@ -85,10 +85,6 @@ class ModelArguments:
         default=False,
         metadata={"help": "Whethor or not to use multimodal LLM that accepts visual inputs."},
     )
-    autocast_projector: bool = field(
-        default=True,
-        metadata={"help": "Whethor or not to autocast projector."},
-    )
     moe_aux_loss_coef: Optional[float] = field(
         default=None,
         metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."},

From db47c53486ea3f16da44838ebc790169fe2c90ba Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Sat, 11 May 2024 23:58:47 +0800
Subject: [PATCH 267/341] Update loader.py

Former-commit-id: 2fc12790414677bb82736208fb9547640780af2e
---
 src/llmtuner/model/loader.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 1dca84a1..ead6178f 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -155,8 +155,7 @@ def load_model(
         model.eval()
     else:
         model.train()
-        if model_args.visual_inputs:
-            model.vision_tower.requires_grad_(False)
+
     trainable_params, all_param = count_parameters(model)
     if is_trainable:
         param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(

From 2bcd5b2b73e1a23c2e29c4889e3bc90acf4a1afe Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 12 May 2024 00:02:49 +0800
Subject: [PATCH 268/341] fix llava config

Former-commit-id: b13d032325e45d401a9dbc64d4c73e308eff3288
---
 src/llmtuner/model/adapter.py         | 3 +++
 src/llmtuner/model/loader.py          | 2 +-
 src/llmtuner/model/patcher.py         | 9 +++------
 src/llmtuner/model/utils/valuehead.py | 7 +------
 src/llmtuner/model/utils/visual.py    | 9 +++++++--
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index d43e00f0..0ffb91c1 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -46,6 +46,9 @@ def init_adapter(
         if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
             model = model.float()
 
+        if model_args.visual_inputs and hasattr(model, "vision_tower"):  # freeze vision model
+            model.vision_tower.requires_grad_(False)
+
     if finetuning_args.finetuning_type == "freeze" and is_trainable:
         logger.info("Fine-tuning method: Freeze")
         num_layers = (
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index ead6178f..ea55de27 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -106,7 +106,7 @@ def load_model(
     """
     init_kwargs = _get_init_kwargs(model_args)
     config = load_config(model_args)
-    patch_config(config, tokenizer, model_args, init_kwargs, is_trainable, add_valuehead)
+    patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
 
     model = None
     lazy_load = False
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index 31cba492..fd99bd3b 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -15,8 +15,8 @@ from .utils.longlora import configure_longlora
 from .utils.moe import add_z3_leaf_module, configure_moe
 from .utils.quantization import configure_quantization
 from .utils.rope import configure_rope
-from .utils.valuehead import configure_valuehead, prepare_valuehead_model
-from .utils.visual import autocast_projector_dtype
+from .utils.valuehead import prepare_valuehead_model
+from .utils.visual import autocast_projector_dtype, configure_hidden_size
 
 
 if TYPE_CHECKING:
@@ -40,7 +40,6 @@ def patch_config(
     model_args: "ModelArguments",
     init_kwargs: Dict[str, Any],
     is_trainable: bool,
-    add_valuehead: bool,
 ) -> None:
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
@@ -50,9 +49,7 @@ def patch_config(
     configure_longlora(config, model_args, is_trainable)
     configure_quantization(config, tokenizer, model_args, init_kwargs)
     configure_moe(config, model_args, is_trainable)
-
-    if add_valuehead:
-        configure_valuehead(config)
+    configure_hidden_size(config)
 
     if model_args.use_cache and not is_trainable:
         setattr(config, "use_cache", True)
diff --git a/src/llmtuner/model/utils/valuehead.py b/src/llmtuner/model/utils/valuehead.py
index a6180753..d813729e 100644
--- a/src/llmtuner/model/utils/valuehead.py
+++ b/src/llmtuner/model/utils/valuehead.py
@@ -8,7 +8,7 @@ from ...extras.logging import get_logger
 
 
 if TYPE_CHECKING:
-    from transformers import PretrainedConfig, PreTrainedModel
+    from transformers import PreTrainedModel
 
     from ...hparams import ModelArguments
 
@@ -16,11 +16,6 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
-def configure_valuehead(config: "PretrainedConfig") -> None:
-    if getattr(config, "model_type", None) == "llava":
-        setattr(config, "hidden_size", getattr(config.vision_config, "intermediate_size", None))
-
-
 def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]:
     r"""
     Loads value head parameters from Hugging Face Hub or local disk.
diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index cb51301b..b29a9ba5 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -6,7 +6,7 @@ from ...extras.logging import get_logger
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel
+    from transformers import PretrainedConfig, PreTrainedModel
 
     from ...hparams import ModelArguments
 
@@ -14,6 +14,11 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
+def configure_hidden_size(config: "PretrainedConfig") -> None:
+    if getattr(config, "model_type", None) == "llava":
+        setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))
+
+
 def autocast_projector_dtype(
     model: "PreTrainedModel", model_args: "ModelArguments", mm_projector_name: str = "multi_modal_projector"
 ) -> None:
@@ -22,7 +27,7 @@ def autocast_projector_dtype(
     ) -> "torch.Tensor":
         return output.to(model_args.compute_dtype)
 
-    if hasattr(model, mm_projector_name):
+    if hasattr(model, mm_projector_name) and getattr(model.config, "quantization_method", None):
         logger.info("Casting multimodal projector outputs in {}.".format(model_args.compute_dtype))
         mm_projector: "torch.nn.Module" = getattr(model, mm_projector_name)
         mm_projector.register_forward_hook(_mm_projector_forward_post_hook)

From 25d316b1a0ffe8fa3d0e639e06030d85e6e82331 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 12 May 2024 00:03:59 +0800
Subject: [PATCH 269/341] fix #3674

Former-commit-id: 6bad2eafef75ec697477e1f2ce739006042fb4c7
---
 src/llmtuner/train/tuner.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index ffdc3e60..8f103ca1 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -68,6 +68,8 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None:
         output_dtype = getattr(model.config, "torch_dtype", torch.float16)
         setattr(model.config, "torch_dtype", output_dtype)
         model = model.to(output_dtype)
+    else:
+        setattr(model.config, "torch_dtype", torch.float16)
 
     model.save_pretrained(
         save_directory=model_args.export_dir,

From 6bc0ec63c74fe44432eefe3ec65cdcf63a1f626e Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 12 May 2024 00:33:49 +0800
Subject: [PATCH 270/341] update readme

Former-commit-id: d57ca8a865b46588f65b2cc15073c5fcc4e4cebc
---
 README.md    | 12 +++++++++---
 README_zh.md | 13 +++++++++----
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 90c66caf..80154cae 100644
--- a/README.md
+++ b/README.md
@@ -366,17 +366,23 @@ See [examples/README.md](examples/README.md) for advanced usage (including distr
 #### Use local environment
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui
 ```
 
-<details><summary>For Alibaba Cloud users</summary>
+<details><summary>For Alibaba Cloud PAI or AutoDL users</summary>
 
-If you encountered display problems in LLaMA Board on Alibaba Cloud, try using the following command to set environment variables before starting LLaMA Board:
+If you encountered display problems in LLaMA Board on Alibaba Cloud PAI, try using the following command to set environment variables before starting LLaMA Board:
 
 ```bash
 export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
 ```
 
+If you are using AutoDL, please install a specific version of Gradio:
+
+```bash
+pip install gradio==4.10.0
+```
+
 </details>
 
 #### Use Docker
diff --git a/README_zh.md b/README_zh.md
index 0aba9043..5656fb4a 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -366,17 +366,23 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_s
 #### 使用本地环境
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui
 ```
 
-<details><summary>阿里云用户指南</summary>
+<details><summary>阿里云 PAI 和 AutoDL 用户指南</summary>
 
-如果您在阿里云上使用 LLaMA Board 时遇到显示问题，请尝试在启动前使用以下命令设置环境变量：
+如果您在阿里云 PAI 上使用 LLaMA Board 时遇到显示问题，请尝试在启动前使用以下命令设置环境变量：
 
 ```bash
 export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
 ```
 
+如果您正在使用 AutoDL，请安装下述 Gradio 版本：
+
+```bash
+pip install gradio==4.10.0
+```
+
 </details>
 
 #### 使用 Docker
@@ -475,7 +481,6 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**：一个用于生成 Stable Diffusion 提示词的大型语言模型。[[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
 1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**：中文多模态医学大模型，基于 LLaVA-1.5-7B 在中文多模态医疗数据上微调而得。
 
-
 </details>
 
 ## 协议

From 8f1ba07b302a78479bf2029c0a4d708b982563fc Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 12 May 2024 01:10:30 +0800
Subject: [PATCH 271/341] remove checksum and fix ui args

Former-commit-id: 0cfdeb1d30efb63211434bc4656bceb59e666289
---
 README.md                       |  4 ++--
 README_zh.md                    |  4 ++--
 src/llmtuner/data/loader.py     |  4 +---
 src/llmtuner/data/parser.py     |  2 --
 src/llmtuner/data/utils.py      | 15 ---------------
 src/llmtuner/webui/interface.py |  6 ++++--
 src/webui.py                    |  3 ++-
 7 files changed, 11 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 80154cae..57a34dab 100644
--- a/README.md
+++ b/README.md
@@ -366,7 +366,7 @@ See [examples/README.md](examples/README.md) for advanced usage (including distr
 #### Use local environment
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui
 ```
 
 <details><summary>For Alibaba Cloud PAI or AutoDL users</summary>
@@ -374,7 +374,7 @@ CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli w
 If you encountered display problems in LLaMA Board on Alibaba Cloud PAI, try using the following command to set environment variables before starting LLaMA Board:
 
 ```bash
-export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
+export GRADIO_SERVER_PORT=7860 GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
 ```
 
 If you are using AutoDL, please install a specific version of Gradio:
diff --git a/README_zh.md b/README_zh.md
index 5656fb4a..047b1645 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -366,7 +366,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_s
 #### 使用本地环境
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli webui
+CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=1 llamafactory-cli webui
 ```
 
 <details><summary>阿里云 PAI 和 AutoDL 用户指南</summary>
@@ -374,7 +374,7 @@ CUDA_VISIBLE_DEVICES=0 GRADIO_SERVER_PORT=7860 GRADIO_SHARE=1 llamafactory-cli w
 如果您在阿里云 PAI 上使用 LLaMA Board 时遇到显示问题，请尝试在启动前使用以下命令设置环境变量：
 
 ```bash
-export GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
+export GRADIO_SERVER_PORT=7860 GRADIO_ROOT_PATH=/${JUPYTER_NAME}/proxy/7860/
 ```
 
 如果您正在使用 AutoDL，请安装下述 Gradio 版本：
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index ca0d5407..3cc01b0d 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -11,7 +11,7 @@ from .aligner import align_dataset
 from .parser import get_dataset_list
 from .preprocess import get_preprocess_and_print_func
 from .template import get_template_and_fix_tokenizer
-from .utils import checksum, merge_dataset
+from .utils import merge_dataset
 
 
 if TYPE_CHECKING:
@@ -61,8 +61,6 @@ def load_single_dataset(
 
         if data_path is None:
             raise ValueError("File extension must be txt, csv, json or jsonl.")
-
-        checksum(data_files, dataset_attr.file_sha1)
     else:
         raise NotImplementedError
 
diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py
index 01a417a9..3170fd8a 100644
--- a/src/llmtuner/data/parser.py
+++ b/src/llmtuner/data/parser.py
@@ -21,7 +21,6 @@ class DatasetAttr:
     load_from: Literal["hf_hub", "ms_hub", "script", "file"]
     dataset_name: str
     """ extra configs """
-    file_sha1: Optional[str] = None
     subset: Optional[str] = None
     folder: Optional[str] = None
     ranking: bool = False
@@ -99,7 +98,6 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
         else:
             dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"])
 
-        dataset_attr.set_attr("file_sha1", dataset_info[name])
         dataset_attr.set_attr("subset", dataset_info[name])
         dataset_attr.set_attr("folder", dataset_info[name])
         dataset_attr.set_attr("ranking", dataset_info[name], default=False)
diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py
index dc189609..29fd4ad4 100644
--- a/src/llmtuner/data/utils.py
+++ b/src/llmtuner/data/utils.py
@@ -26,21 +26,6 @@ class Role(str, Enum):
     OBSERVATION = "observation"
 
 
-def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
-    if file_sha1 is None:
-        logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.")
-        return
-
-    if len(data_files) != 1:
-        logger.warning("Checksum failed: too many files.")
-        return
-
-    with open(data_files[0], "rb") as f:
-        sha1 = hashlib.sha1(f.read()).hexdigest()
-        if sha1 != file_sha1:
-            logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0]))
-
-
 def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]:
     max_target_len = int(max_len * (target_len / (source_len + target_len)))
     max_target_len = max(max_target_len, reserved_label_len)
diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index bbd91bb7..91709d40 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -71,10 +71,12 @@ def create_web_demo() -> gr.Blocks:
 
 
 def run_web_ui() -> None:
+    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
-    create_ui().queue().launch(server_name=server_name)
+    create_ui().queue().launch(share=gradio_share, server_name=server_name)
 
 
 def run_web_demo() -> None:
+    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
-    create_web_demo().queue().launch(server_name=server_name)
+    create_web_demo().queue().launch(share=gradio_share, server_name=server_name)
diff --git a/src/webui.py b/src/webui.py
index 000098d1..3f8690d0 100644
--- a/src/webui.py
+++ b/src/webui.py
@@ -4,8 +4,9 @@ from llmtuner.webui.interface import create_ui
 
 
 def main():
+    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
-    create_ui().queue().launch(server_name=server_name)
+    create_ui().queue().launch(share=gradio_share, server_name=server_name)
 
 
 if __name__ == "__main__":

From 7b02c8339948ca9229ab027c761c4034de377de6 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 12 May 2024 01:25:16 +0800
Subject: [PATCH 272/341] fix #3658

Former-commit-id: 37799a62d4431d1d8c02fee6c23d607a65723c1a
---
 src/llmtuner/extras/callbacks.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/extras/callbacks.py b/src/llmtuner/extras/callbacks.py
index a142928a..6d24b244 100644
--- a/src/llmtuner/extras/callbacks.py
+++ b/src/llmtuner/extras/callbacks.py
@@ -139,13 +139,15 @@ class LogCallback(TrainerCallback):
         r"""
         Event called after an evaluation phase.
         """
-        self._close_thread_pool()
+        if not self.do_train:
+            self._close_thread_pool()
 
     def on_predict(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""
         Event called after a successful prediction.
         """
-        self._close_thread_pool()
+        if not self.do_train:
+            self._close_thread_pool()
 
     def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
         r"""

From 1e1b8899f58ef8c4765d021464235357bf3567c2 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Sun, 12 May 2024 01:28:51 +0800
Subject: [PATCH 273/341] lint

Former-commit-id: cb72eb6ab24615ce492ca2945f29daa34c0c52d4
---
 src/llmtuner/data/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llmtuner/data/utils.py b/src/llmtuner/data/utils.py
index 29fd4ad4..aaa5bdc0 100644
--- a/src/llmtuner/data/utils.py
+++ b/src/llmtuner/data/utils.py
@@ -1,6 +1,5 @@
-import hashlib
 from enum import Enum, unique
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Tuple, Union
 
 from datasets import concatenate_datasets, interleave_datasets
 

From 75891234651c919af57a7e958b970e29ebb90f11 Mon Sep 17 00:00:00 2001
From: Tendo33 <sjf1998112@gamil.com>
Date: Mon, 13 May 2024 09:40:33 +0800
Subject: [PATCH 274/341] ruff check scripts src tests --fix

Former-commit-id: da5277b6a1cff40d59df8f1835d9514b2a51be34
---
 src/llmtuner/api/app.py  | 12 +++----
 src/llmtuner/api/chat.py | 67 ++++++++++++++--------------------------
 2 files changed, 27 insertions(+), 52 deletions(-)

diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py
index 5936955b..6d06d1d0 100644
--- a/src/llmtuner/api/app.py
+++ b/src/llmtuner/api/app.py
@@ -56,8 +56,7 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
 
     async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]):
         if api_key and (auth is None or auth.credentials != api_key):
-            raise HTTPException(
-                status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.")
+            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.")
 
     @app.get(
         "/v1/models",
@@ -77,12 +76,10 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
     )
     async def create_chat_completion(request: ChatCompletionRequest):
         if not chat_model.engine.can_generate:
-            raise HTTPException(
-                status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
 
         if request.stream:
-            generate = create_stream_chat_completion_response(
-                request, chat_model)
+            generate = create_stream_chat_completion_response(request, chat_model)
             return EventSourceResponse(generate, media_type="text/event-stream")
         else:
             return await create_chat_completion_response(request, chat_model)
@@ -95,8 +92,7 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
     )
     async def create_score_evaluation(request: ScoreEvaluationRequest):
         if chat_model.engine.can_generate:
-            raise HTTPException(
-                status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
 
         return await create_score_evaluation_response(request, chat_model)
 
diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py
index 3ab473d1..76ddc88d 100644
--- a/src/llmtuner/api/chat.py
+++ b/src/llmtuner/api/chat.py
@@ -3,8 +3,8 @@ import uuid
 from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Optional, Tuple
 
 from ..data import Role as DataRole
-from ..extras.packages import is_fastapi_available
 from ..extras.logging import get_logger
+from ..extras.packages import is_fastapi_available
 from .common import dictify, jsonify
 from .protocol import (
     ChatCompletionMessage,
@@ -20,6 +20,7 @@ from .protocol import (
     ScoreEvaluationResponse,
 )
 
+
 logger = get_logger(__name__)
 
 if is_fastapi_available():
@@ -41,13 +42,11 @@ ROLE_MAPPING = {
 
 
 def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, str]], str, str]:
-
     params = dictify(request)
     logger.info(f"==== request ====\n{params}")
 
     if len(request.messages) == 0:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
 
     if request.messages[0].role == Role.SYSTEM:
         system = request.messages.pop(0).content
@@ -55,37 +54,29 @@ def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, s
         system = ""
 
     if len(request.messages) % 2 == 0:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST,
-                            detail="Only supports u/a/u/a/u...")
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...")
 
     input_messages = []
     for i, message in enumerate(request.messages):
         if i % 2 == 0 and message.role not in [Role.USER, Role.TOOL]:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
         elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
 
         if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls):
             name = message.tool_calls[0].function.name
             arguments = message.tool_calls[0].function.arguments
-            content = json.dumps(
-                {"name": name, "argument": arguments}, ensure_ascii=False)
-            input_messages.append(
-                {"role": ROLE_MAPPING[Role.FUNCTION], "content": content})
+            content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False)
+            input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content})
         else:
-            input_messages.append(
-                {"role": ROLE_MAPPING[message.role], "content": message.content})
+            input_messages.append({"role": ROLE_MAPPING[message.role], "content": message.content})
 
     tool_list = request.tools
     if isinstance(tool_list, list) and len(tool_list):
         try:
-            tools = json.dumps([dictify(tool.function)
-                               for tool in tool_list], ensure_ascii=False)
+            tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False)
         except Exception:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools")
+            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools")
     else:
         tools = ""
 
@@ -99,10 +90,8 @@ def _create_stream_chat_completion_chunk(
     index: Optional[int] = 0,
     finish_reason: Optional["Finish"] = None,
 ) -> str:
-    choice_data = ChatCompletionStreamResponseChoice(
-        index=index, delta=delta, finish_reason=finish_reason)
-    chunk = ChatCompletionStreamResponse(
-        id=completion_id, model=model, choices=[choice_data])
+    choice_data = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason)
+    chunk = ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice_data])
     return jsonify(chunk)
 
 
@@ -127,26 +116,21 @@ async def create_chat_completion_response(
     choices = []
     for i, response in enumerate(responses):
         if tools:
-            result = chat_model.engine.template.format_tools.extract(
-                response.response_text)
+            result = chat_model.engine.template.format_tools.extract(response.response_text)
         else:
             result = response.response_text
 
         if isinstance(result, tuple):
             name, arguments = result
             function = Function(name=name, arguments=arguments)
-            tool_call = FunctionCall(id="call_{}".format(
-                uuid.uuid4().hex), function=function)
-            response_message = ChatCompletionMessage(
-                role=Role.ASSISTANT, tool_calls=[tool_call])
+            tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function)
+            response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=[tool_call])
             finish_reason = Finish.TOOL
         else:
-            response_message = ChatCompletionMessage(
-                role=Role.ASSISTANT, content=result)
+            response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result)
             finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH
 
-        choices.append(ChatCompletionResponseChoice(
-            index=i, message=response_message, finish_reason=finish_reason))
+        choices.append(ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason))
         prompt_length = response.prompt_length
         response_length += response.response_length
 
@@ -165,16 +149,13 @@ async def create_stream_chat_completion_response(
     completion_id = "chatcmpl-{}".format(uuid.uuid4().hex)
     input_messages, system, tools = _process_request(request)
     if tools:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST,
-                            detail="Cannot stream function calls.")
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.")
 
     if request.n > 1:
-        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST,
-                            detail="Cannot stream multiple responses.")
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream multiple responses.")
 
     yield _create_stream_chat_completion_chunk(
-        completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(
-            role=Role.ASSISTANT, content="")
+        completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(role=Role.ASSISTANT, content="")
     )
     async for new_token in chat_model.astream_chat(
         input_messages,
@@ -188,8 +169,7 @@ async def create_stream_chat_completion_response(
     ):
         if len(new_token) != 0:
             yield _create_stream_chat_completion_chunk(
-                completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(
-                    content=new_token)
+                completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(content=new_token)
             )
 
     yield _create_stream_chat_completion_chunk(
@@ -202,8 +182,7 @@ async def create_score_evaluation_response(
     request: "ScoreEvaluationRequest", chat_model: "ChatModel"
 ) -> "ScoreEvaluationResponse":
     if len(request.messages) == 0:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request")
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request")
 
     scores = await chat_model.aget_scores(request.messages, max_length=request.max_length)
     return ScoreEvaluationResponse(model=request.model, scores=scores)

From b1c791fb0d79094fff3c746d50debd4daebf5974 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 13 May 2024 16:51:20 +0800
Subject: [PATCH 275/341] support Yi 1.5

Former-commit-id: e580823676cbb83ddb9a0f685992e6054ae5ffaa
---
 README.md                            |  4 +--
 README_zh.md                         |  4 +--
 src/llmtuner/extras/constants.py     | 39 +++++++++++++++++++++++-----
 src/llmtuner/model/utils/longlora.py |  2 +-
 4 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 57a34dab..d260ad36 100644
--- a/README.md
+++ b/README.md
@@ -161,7 +161,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                        | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                       | q_proj,v_proj     | xverse    |
-| [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                        | q_proj,v_proj     | yi        |
+| [Yi (1/1.5)](https://huggingface.co/01-ai)               | 6B/9B/34B                        | q_proj,v_proj     | yi        |
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
@@ -487,7 +487,7 @@ If you have a project that should be incorporated, please contact via email or c
 
 This repository is licensed under the [Apache-2.0 License](LICENSE).
 
-Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2/LLaVA-1.5](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+Please follow the model licenses to use the corresponding model weights: [Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## Citation
 
diff --git a/README_zh.md b/README_zh.md
index 047b1645..8912d5e1 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -161,7 +161,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [Qwen1.5 (Code/MoE)](https://huggingface.co/Qwen)        | 0.5B/1.8B/4B/7B/14B/32B/72B/110B | q_proj,v_proj     | qwen      |
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                        | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                       | q_proj,v_proj     | xverse    |
-| [Yi](https://huggingface.co/01-ai)                       | 6B/9B/34B                        | q_proj,v_proj     | yi        |
+| [Yi (1/1.5)](https://huggingface.co/01-ai)               | 6B/9B/34B                        | q_proj,v_proj     | yi        |
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
@@ -487,7 +487,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 
 本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。
 
-使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2/LLaVA-1.5](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+使用模型权重时，请遵循对应的模型协议：[Baichuan2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command-R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [InternLM2](https://github.com/InternLM/InternLM#license) / [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [LLaMA-2 (LLaVA-1.5)](https://ai.meta.com/llama/license/) / [LLaMA-3](https://llama.meta.com/llama3/license/) / [Mistral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [StarCoder2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
 
 ## 引用
 
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index 50c78b3f..ff52f29a 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -320,14 +320,14 @@ register_model_group(
             DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-base",
             DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-base",
         },
+        "DeepSeek-MoE-236B-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2",
+        },
         "DeepSeek-MoE-16B-Chat": {
             DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat",
             DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat",
         },
-        "DeepSeek-MoE-236B": {
-            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2",
-            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2",
-        },
         "DeepSeek-MoE-236B-Chat": {
             DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat",
             DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat",
@@ -424,13 +424,13 @@ register_model_group(
 register_model_group(
     models={
         "CodeGemma-2B": {
-            DownloadSource.DEFAULT: "google/codegemma-2b",
+            DownloadSource.DEFAULT: "google/codegemma-1.1-2b",
         },
         "CodeGemma-7B": {
             DownloadSource.DEFAULT: "google/codegemma-7b",
         },
         "CodeGemma-7B-Chat": {
-            DownloadSource.DEFAULT: "google/codegemma-7b-it",
+            DownloadSource.DEFAULT: "google/codegemma-1.1-7b-it",
             DownloadSource.MODELSCOPE: "AI-ModelScope/codegemma-7b-it",
         },
     },
@@ -581,6 +581,9 @@ register_model_group(
             DownloadSource.DEFAULT: "shenzhi-wang/Llama3-8B-Chinese-Chat",
             DownloadSource.MODELSCOPE: "LLM-Research/Llama3-8B-Chinese-Chat",
         },
+        "LLaMA3-70B-Chinese-Chat": {
+            DownloadSource.DEFAULT: "shenzhi-wang/Llama3-70B-Chinese-Chat",
+        },
     },
     template="llama3",
 )
@@ -1174,6 +1177,30 @@ register_model_group(
             DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-4bits",
             DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-4bits",
         },
+        "Yi-1.5-6B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B",
+        },
+        "Yi-1.5-9B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B",
+        },
+        "Yi-1.5-34B": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B",
+        },
+        "Yi-1.5-6B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B-Chat",
+        },
+        "Yi-1.5-9B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B-Chat",
+        },
+        "Yi-1.5-34B-Chat": {
+            DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B-Chat",
+            DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B-Chat",
+        },
     },
     template="yi",
 )
diff --git a/src/llmtuner/model/utils/longlora.py b/src/llmtuner/model/utils/longlora.py
index c91febdd..a11351f1 100644
--- a/src/llmtuner/model/utils/longlora.py
+++ b/src/llmtuner/model/utils/longlora.py
@@ -302,7 +302,7 @@ def llama_sdpa_attention_forward(
 
 
 def _apply_llama_patch() -> None:
-    require_version("transformers==4.40.1", "To fix: pip install transformers==4.40.1")
+    require_version("transformers==4.40.2", "To fix: pip install transformers==4.40.2")
     LlamaAttention.forward = llama_attention_forward
     LlamaFlashAttention2.forward = llama_flash_attention_2_forward
     LlamaSdpaAttention.forward = llama_sdpa_attention_forward

From 2892e5d42a9e03931f10acfa8dc0b35d07fbc0d9 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 13 May 2024 18:24:35 +0800
Subject: [PATCH 276/341] fix #3702

Former-commit-id: 55755786f21050b9efc127c391509ba5d9ea8982
---
 README.md                     | 48 ++++++++++++++++++-----------------
 README_zh.md                  | 48 ++++++++++++++++++-----------------
 src/llmtuner/api/chat.py      |  6 ++---
 src/llmtuner/api/common.py    |  6 ++---
 src/llmtuner/data/template.py |  2 +-
 5 files changed, 56 insertions(+), 54 deletions(-)

diff --git a/README.md b/README.md
index d260ad36..90fcb295 100644
--- a/README.md
+++ b/README.md
@@ -70,57 +70,59 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
-[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See `examples/lora_single_gpu/sft_mllm.sh` for usage.
+[24/05/13] We supported fine-tuning the **Yi-1.5** series models.
+
+[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See [examples](examples/README.md) for usage.
 
 [24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details.
 
-[24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See `examples/extras/mod` for usage.
+<details><summary>Full Changelog</summary>
 
-[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See `examples/extras/badam` for usage.
+[24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See [examples](examples/README.md) for usage.
+
+[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See [examples](examples/README.md) for usage.
 
 [24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
 
-<details><summary>Full Changelog</summary>
-
-[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See `examples/lora_single_gpu` for usage.
+[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See [examples](examples/README.md) for usage.
 
 [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
 
-[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
+[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See [examples](examples/README.md) for usage.
 
-[24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See `examples/extras/loraplus` for usage.
+[24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See [examples](examples/README.md) for usage.
 
-[24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. See `examples/extras/galore` for usage.
+[24/03/07] We supported gradient low-rank projection (**[GaLore](https://arxiv.org/abs/2403.03507)**) algorithm. See [examples](examples/README.md) for usage.
 
-[24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `--infer_backend vllm` to enjoy **270%** inference speed. (LoRA is not yet supported, merge it first.)
+[24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `infer_backend: vllm` to enjoy **270%** inference speed.
 
-[24/02/28] We supported weight-decomposed LoRA (**[DoRA](https://arxiv.org/abs/2402.09353)**). Try `--use_dora` to activate DoRA training.
+[24/02/28] We supported weight-decomposed LoRA (**[DoRA](https://arxiv.org/abs/2402.09353)**). Try `use_dora: true` to activate DoRA training.
 
-[24/02/15] We supported **block expansion** proposed by [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro). See `examples/extras/llama_pro` for usage.
+[24/02/15] We supported **block expansion** proposed by [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro). See [examples](examples/README.md) for usage.
 
 [24/02/05] Qwen1.5 (Qwen2 beta version) series models are supported in LLaMA-Factory. Check this [blog post](https://qwenlm.github.io/blog/qwen1.5/) for details.
 
-[24/01/18] We supported **agent tuning** for most models, equipping model with tool using abilities by fine-tuning with `--dataset glaive_toolcall`.
+[24/01/18] We supported **agent tuning** for most models, equipping model with tool using abilities by fine-tuning with `dataset: glaive_toolcall`.
 
-[23/12/23] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s implementation to boost LoRA tuning for the LLaMA, Mistral and Yi models. Try `--use_unsloth` argument to activate unsloth patch. It achieves **170%** speed in our benchmark, check [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison) for details.
+[23/12/23] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s implementation to boost LoRA tuning for the LLaMA, Mistral and Yi models. Try `use_unsloth: true` argument to activate unsloth patch. It achieves **170%** speed in our benchmark, check [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison) for details.
 
 [23/12/12] We supported fine-tuning the latest MoE model **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)** in our framework. See hardware requirement [here](#hardware-requirement).
 
-[23/12/01] We supported downloading pre-trained models and datasets from the **[ModelScope Hub](https://modelscope.cn/models)** for Chinese mainland users. See [this tutorial](#use-modelscope-hub-optional) for usage.
+[23/12/01] We supported downloading pre-trained models and datasets from the **[ModelScope Hub](https://modelscope.cn/models)** for Chinese mainland users. See [this tutorial](#download-from-modelscope-hub) for usage.
 
-[23/10/21] We supported **[NEFTune](https://arxiv.org/abs/2310.05914)** trick for fine-tuning. Try `--neftune_noise_alpha` argument to activate NEFTune, e.g., `--neftune_noise_alpha 5`.
+[23/10/21] We supported **[NEFTune](https://arxiv.org/abs/2310.05914)** trick for fine-tuning. Try `neftune_noise_alpha: 5` argument to activate NEFTune.
 
-[23/09/27] We supported **$S^2$-Attn** proposed by [LongLoRA](https://github.com/dvlab-research/LongLoRA) for the LLaMA models. Try `--shift_attn` argument to enable shift short attention.
+[23/09/27] We supported **$S^2$-Attn** proposed by [LongLoRA](https://github.com/dvlab-research/LongLoRA) for the LLaMA models. Try `shift_attn: true` argument to enable shift short attention.
 
-[23/09/23] We integrated MMLU, C-Eval and CMMLU benchmarks in this repo. See [this example](#evaluation) to evaluate your models.
+[23/09/23] We integrated MMLU, C-Eval and CMMLU benchmarks in this repo. See [examples](examples/README.md) for usage.
 
-[23/09/10] We supported **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**. Try `--flash_attn fa2` argument to enable FlashAttention-2 if you are using RTX4090, A100 or H100 GPUs.
+[23/09/10] We supported **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**. Try `flash_attn: fa2` argument to enable FlashAttention-2 if you are using RTX4090, A100 or H100 GPUs.
 
-[23/08/12] We supported **RoPE scaling** to extend the context length of the LLaMA models. Try `--rope_scaling linear` argument in training and `--rope_scaling dynamic` argument at inference to extrapolate the position embeddings.
+[23/08/12] We supported **RoPE scaling** to extend the context length of the LLaMA models. Try `rope_scaling: linear` argument in training and `rope_scaling: dynamic` argument at inference to extrapolate the position embeddings.
 
-[23/08/11] We supported **[DPO training](https://arxiv.org/abs/2305.18290)** for instruction-tuned models. See [this example](#dpo-training) to train your models.
+[23/08/11] We supported **[DPO training](https://arxiv.org/abs/2305.18290)** for instruction-tuned models. See [examples](examples/README.md) for usage.
 
-[23/07/31] We supported **dataset streaming**. Try `--streaming` and `--max_steps 10000` arguments to load your dataset in streaming mode.
+[23/07/31] We supported **dataset streaming**. Try `streaming: true` and `max_steps: 10000` arguments to load your dataset in streaming mode.
 
 [23/07/29] We released two instruction-tuned 13B models at Hugging Face. See these Hugging Face Repos ([LLaMA-2](https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat) / [Baichuan](https://huggingface.co/hiyouga/Baichuan-13B-sft)) for details.
 
@@ -132,7 +134,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 [23/06/22] We aligned the [demo API](src/api_demo.py) with the [OpenAI's](https://platform.openai.com/docs/api-reference/chat) format where you can insert the fine-tuned model in **arbitrary ChatGPT-based applications**.
 
-[23/06/03] We supported quantized training and inference (aka **[QLoRA](https://github.com/artidoro/qlora)**). Try `--quantization_bit 4/8` argument to work with quantized models.
+[23/06/03] We supported quantized training and inference (aka **[QLoRA](https://github.com/artidoro/qlora)**). See [examples](examples/README.md) for usage.
 
 </details>
 
diff --git a/README_zh.md b/README_zh.md
index 8912d5e1..1d15515e 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -70,57 +70,59 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
-[24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 `examples/lora_single_gpu/sft_mllm.sh`。
+[24/05/13] 我们支持了 Yi-1.5 系列模型的微调。
+
+[24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 [examples](examples/README_zh.md)。
 
 [24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型，详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。
 
-[24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 `examples/extras/mod`。
+<details><summary>展开日志</summary>
 
-[24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 `examples/extras/badam`。
+[24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 [examples](examples/README_zh.md)。
+
+[24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 [examples](examples/README_zh.md)。
 
 [24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练（24GB 可训练 Llama-2-7B-56k）。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
 
-<details><summary>展开日志</summary>
-
-[24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 `examples/lora_single_gpu`。
+[24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 [examples](examples/README_zh.md)。
 
 [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看！
 
-[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
+[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 [examples](examples/README_zh.md)。
 
-[24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 `examples/extras/loraplus`。
+[24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 [examples](examples/README_zh.md)。
 
-[24/03/07] 我们支持了梯度低秩投影（**[GaLore](https://arxiv.org/abs/2403.03507)**）算法。详细用法请参照 `examples/extras/galore`。
+[24/03/07] 我们支持了梯度低秩投影（**[GaLore](https://arxiv.org/abs/2403.03507)**）算法。详细用法请参照 [examples](examples/README_zh.md)。
 
-[24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `--infer_backend vllm` 来获得 **270%** 的推理速度。（尚不支持 LoRA，请先合并权重。）
+[24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `infer_backend: vllm` 来获得 **270%** 的推理速度。
 
-[24/02/28] 我们支持了 **[DoRA](https://arxiv.org/abs/2402.09353)** 微调。请使用 `--use_dora` 参数进行 DoRA 微调。
+[24/02/28] 我们支持了 **[DoRA](https://arxiv.org/abs/2402.09353)** 微调。请使用 `use_dora: true` 参数进行 DoRA 微调。
 
-[24/02/15] 我们支持了 [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro) 提出的**块扩展**方法。详细用法请参照 `examples/extras/llama_pro`。
+[24/02/15] 我们支持了 [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro) 提出的**块扩展**方法。详细用法请参照 [examples](examples/README_zh.md)。
 
 [24/02/05] Qwen1.5（Qwen2 测试版）系列模型已在 LLaMA-Factory 中实现微调支持。详情请查阅该[博客页面](https://qwenlm.github.io/zh/blog/qwen1.5/)。
 
-[24/01/18] 我们针对绝大多数模型实现了 **Agent 微调**，微调时指定 `--dataset glaive_toolcall` 即可使模型获得工具调用能力。
+[24/01/18] 我们针对绝大多数模型实现了 **Agent 微调**，微调时指定 `dataset: glaive_toolcall` 即可使模型获得工具调用能力。
 
-[23/12/23] 我们针对 LLaMA, Mistral 和 Yi 模型支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的 LoRA 训练加速。请使用 `--use_unsloth` 参数启用 unsloth 优化。该方法可提供 **170%** 的训练速度，详情请查阅[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
+[23/12/23] 我们针对 LLaMA, Mistral 和 Yi 模型支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的 LoRA 训练加速。请使用 `use_unsloth: true` 参数启用 unsloth 优化。该方法可提供 **170%** 的训练速度，详情请查阅[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。
 
 [23/12/12] 我们支持了微调最新的混合专家模型 **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)**。硬件需求请查阅[此处](#硬件依赖)。
 
-[23/12/01] 我们支持了从 **[魔搭社区](https://modelscope.cn/models)** 下载预训练模型和数据集。详细用法请参照 [此教程](#使用魔搭社区可跳过)。
+[23/12/01] 我们支持了从 **[魔搭社区](https://modelscope.cn/models)** 下载预训练模型和数据集。详细用法请参照 [此教程](#从魔搭社区下载)。
 
-[23/10/21] 我们支持了 **[NEFTune](https://arxiv.org/abs/2310.05914)** 训练技巧。请使用 `--neftune_noise_alpha` 参数启用 NEFTune，例如 `--neftune_noise_alpha 5`。
+[23/10/21] 我们支持了 **[NEFTune](https://arxiv.org/abs/2310.05914)** 训练技巧。请使用 `neftune_noise_alpha: 5` 参数启用 NEFTune。
 
-[23/09/27] 我们针对 LLaMA 模型支持了 [LongLoRA](https://github.com/dvlab-research/LongLoRA) 提出的 **$S^2$-Attn**。请使用 `--shift_attn` 参数以启用该功能。
+[23/09/27] 我们针对 LLaMA 模型支持了 [LongLoRA](https://github.com/dvlab-research/LongLoRA) 提出的 **$S^2$-Attn**。请使用 `shift_attn: true` 参数以启用该功能。
 
-[23/09/23] 我们在项目中集成了 MMLU、C-Eval 和 CMMLU 评估集。使用方法请参阅[此示例](#模型评估)。
+[23/09/23] 我们在项目中集成了 MMLU、C-Eval 和 CMMLU 评估集。详细用法请参照 [examples](examples/README_zh.md)。
 
-[23/09/10] 我们支持了 **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**。如果您使用的是 RTX4090、A100 或 H100 GPU，请使用 `--flash_attn fa2` 参数以启用 FlashAttention-2。
+[23/09/10] 我们支持了 **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**。如果您使用的是 RTX4090、A100 或 H100 GPU，请使用 `flash_attn: fa2` 参数以启用 FlashAttention-2。
 
-[23/08/12] 我们支持了 **RoPE 插值**来扩展 LLaMA 模型的上下文长度。请使用 `--rope_scaling linear` 参数训练模型或使用 `--rope_scaling dynamic` 参数评估模型。
+[23/08/12] 我们支持了 **RoPE 插值**来扩展 LLaMA 模型的上下文长度。请使用 `rope_scaling: linear` 参数训练模型或使用 `rope_scaling: dynamic` 参数评估模型。
 
-[23/08/11] 我们支持了指令模型的 **[DPO 训练](https://arxiv.org/abs/2305.18290)**。使用方法请参阅[此示例](#dpo-训练)。
+[23/08/11] 我们支持了指令模型的 **[DPO 训练](https://arxiv.org/abs/2305.18290)**。详细用法请参照 [examples](examples/README_zh.md)。
 
-[23/07/31] 我们支持了**数据流式加载**。请使用 `--streaming` 和 `--max_steps 10000` 参数来流式加载数据集。
+[23/07/31] 我们支持了**数据流式加载**。请使用 `streaming: true` 和 `max_steps: 10000` 参数来流式加载数据集。
 
 [23/07/29] 我们在 Hugging Face 发布了两个 13B 指令微调模型。详细内容请查阅我们的 Hugging Face 项目（[LLaMA-2](https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat) / [Baichuan](https://huggingface.co/hiyouga/Baichuan-13B-sft)）。
 
@@ -132,7 +134,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 [23/06/22] 我们对齐了[示例 API](src/api_demo.py) 与 [OpenAI API](https://platform.openai.com/docs/api-reference/chat) 的格式，您可以将微调模型接入**任意基于 ChatGPT 的应用**中。
 
-[23/06/03] 我们实现了 4 比特的 LoRA 训练（也称 **[QLoRA](https://github.com/artidoro/qlora)**）。请使用 `--quantization_bit 4` 参数进行 4 比特量化微调。
+[23/06/03] 我们实现了 4 比特的 LoRA 训练（也称 **[QLoRA](https://github.com/artidoro/qlora)**）。详细用法请参照 [examples](examples/README_zh.md)。
 
 </details>
 
diff --git a/src/llmtuner/api/chat.py b/src/llmtuner/api/chat.py
index 76ddc88d..b7a08f0b 100644
--- a/src/llmtuner/api/chat.py
+++ b/src/llmtuner/api/chat.py
@@ -21,8 +21,6 @@ from .protocol import (
 )
 
 
-logger = get_logger(__name__)
-
 if is_fastapi_available():
     from fastapi import HTTPException, status
 
@@ -32,6 +30,7 @@ if TYPE_CHECKING:
     from .protocol import ChatCompletionRequest, ScoreEvaluationRequest
 
 
+logger = get_logger(__name__)
 ROLE_MAPPING = {
     Role.USER: DataRole.USER.value,
     Role.ASSISTANT: DataRole.ASSISTANT.value,
@@ -42,8 +41,7 @@ ROLE_MAPPING = {
 
 
 def _process_request(request: "ChatCompletionRequest") -> Tuple[List[Dict[str, str]], str, str]:
-    params = dictify(request)
-    logger.info(f"==== request ====\n{params}")
+    logger.info("==== request ====\n{}".format(json.dumps(dictify(request), indent=2, ensure_ascii=False)))
 
     if len(request.messages) == 0:
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
diff --git a/src/llmtuner/api/common.py b/src/llmtuner/api/common.py
index 3e95d211..5ad9a071 100644
--- a/src/llmtuner/api/common.py
+++ b/src/llmtuner/api/common.py
@@ -6,11 +6,11 @@ if TYPE_CHECKING:
     from pydantic import BaseModel
 
 
-def dictify(data: "BaseModel", **kwargs) -> Dict[str, Any]:
+def dictify(data: "BaseModel") -> Dict[str, Any]:
     try:  # pydantic v2
-        return data.model_dump(**kwargs)
+        return data.model_dump(exclude_unset=True)
     except AttributeError:  # pydantic v1
-        return data.dict(**kwargs)
+        return data.dict(exclude_unset=True)
 
 
 def jsonify(data: "BaseModel") -> str:
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index ada6cfcd..f716102f 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -308,7 +308,7 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer")
         jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}"
 
     jinja_template += (
-        "{% if messages[0]['role'] == 'system' %}" "{% set system_message = messages[0]['content'] %}" "{% endif %}"
+        "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}"
     )
 
     system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message")

From 5f5f948806ad63a5c847f79256d3534554a48fa9 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 13 May 2024 20:09:09 +0800
Subject: [PATCH 277/341] fix #3724

Former-commit-id: 62f5999d79834d6cbc4129eda387a317665d6099
---
 src/llmtuner/model/utils/longlora.py | 29 ++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/llmtuner/model/utils/longlora.py b/src/llmtuner/model/utils/longlora.py
index a11351f1..c8dc52f5 100644
--- a/src/llmtuner/model/utils/longlora.py
+++ b/src/llmtuner/model/utils/longlora.py
@@ -41,9 +41,9 @@ def llama_attention_forward(
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     bsz, q_len, _ = hidden_states.size()
 
-    query_states = self.q_proj(hidden_states)
-    key_states = self.k_proj(hidden_states)
-    value_states = self.v_proj(hidden_states)
+    query_states: "torch.Tensor" = self.q_proj(hidden_states)
+    key_states: "torch.Tensor" = self.k_proj(hidden_states)
+    value_states: "torch.Tensor" = self.v_proj(hidden_states)
 
     query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
     key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -87,7 +87,7 @@ def llama_attention_forward(
     # upcast attention to fp32
     attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
     attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
-    attn_output = torch.matmul(attn_weights, value_states)  # (bsz, :, seq_len, :) or (bsz*n_group, :, groupsz, :)
+    attn_output = torch.matmul(attn_weights, value_states)  # (bsz, :, seq_len, :) or (bsz * n_group, :, groupsz, :)
     attn_output = attn_output.transpose(1, 2).contiguous()
 
     if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
@@ -125,9 +125,9 @@ def llama_flash_attention_2_forward(
 
     bsz, q_len, _ = hidden_states.size()
 
-    query_states = self.q_proj(hidden_states)
-    key_states = self.k_proj(hidden_states)
-    value_states = self.v_proj(hidden_states)
+    query_states: "torch.Tensor" = self.q_proj(hidden_states)
+    key_states: "torch.Tensor" = self.k_proj(hidden_states)
+    value_states: "torch.Tensor" = self.v_proj(hidden_states)
 
     query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
     key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -233,9 +233,9 @@ def llama_sdpa_attention_forward(
 
     bsz, q_len, _ = hidden_states.size()
 
-    query_states = self.q_proj(hidden_states)
-    key_states = self.k_proj(hidden_states)
-    value_states = self.v_proj(hidden_states)
+    query_states: "torch.Tensor" = self.q_proj(hidden_states)
+    key_states: "torch.Tensor" = self.k_proj(hidden_states)
+    value_states: "torch.Tensor" = self.v_proj(hidden_states)
 
     query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
     key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -270,11 +270,12 @@ def llama_sdpa_attention_forward(
 
     causal_mask = attention_mask
     if attention_mask is not None:
-        causal_mask = causal_mask[:, :, :, :groupsz]
+        causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
 
-    query_states = query_states.contiguous()
-    key_states = key_states.contiguous()
-    value_states = value_states.contiguous()
+    if query_states.device.type == "cuda" and causal_mask is not None:
+        query_states = query_states.contiguous()
+        key_states = key_states.contiguous()
+        value_states = value_states.contiguous()
 
     attn_output = torch.nn.functional.scaled_dot_product_attention(
         query_states,

From e4972c8fc472b1561f53ba2669fede97147a5a01 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 13 May 2024 20:39:36 +0800
Subject: [PATCH 278/341] update examples

Former-commit-id: 779603055ae9216ff549f5285caac8c0c0a1e9fb
---
 examples/README.md                                   | 12 ++++++------
 examples/README_zh.md                                | 12 ++++++------
 examples/extras/badam/llama3_lora_sft.yaml           |  2 +-
 examples/extras/fsdp_qlora/llama3_lora_sft.yaml      |  5 ++++-
 examples/extras/galore/llama3_full_sft.yaml          |  2 +-
 examples/extras/llama_pro/llama3_freeze_sft.yaml     |  4 ++--
 examples/extras/loraplus/llama3_lora_sft.yaml        |  4 ++--
 examples/extras/mod/llama3_full_sft.yaml             |  2 +-
 examples/full_multi_gpu/llama3_full_sft.yaml         |  2 +-
 examples/lora_multi_gpu/llama3_lora_sft.yaml         |  2 +-
 examples/lora_multi_gpu/llama3_lora_sft_ds.yaml      |  2 +-
 examples/lora_single_gpu/llama3_lora_dpo.yaml        |  2 +-
 examples/lora_single_gpu/llama3_lora_orpo.yaml       |  2 +-
 examples/lora_single_gpu/llama3_lora_pretrain.yaml   |  2 +-
 examples/lora_single_gpu/llama3_lora_reward.yaml     |  2 +-
 examples/lora_single_gpu/llama3_lora_sft.yaml        |  2 +-
 examples/lora_single_gpu/llama3_preprocess.yaml      |  1 -
 examples/lora_single_gpu/llava1_5_lora_sft.yaml      |  2 +-
 examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml  |  2 +-
 examples/qlora_single_gpu/llama3_lora_sft_awq.yaml   |  2 +-
 .../llama3_lora_sft_bitsandbytes.yaml                |  5 +----
 examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml  |  2 +-
 22 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index ce19f9d1..0838314a 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -28,6 +28,12 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
 ```
 
+#### Multimodal Supervised Fine-Tuning
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
+```
+
 #### Reward Modeling
 
 ```bash
@@ -52,12 +58,6 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml
 ```
 
-#### Multimodal Supervised Fine-Tuning
-
-```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
-```
-
 #### Preprocess Dataset
 
 It is useful for large dataset, use `tokenized_path` in config to load the preprocessed dataset.
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 91bdcda9..7fe43954 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -28,6 +28,12 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_sft.yaml
 ```
 
+#### 多模态指令监督微调
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
+```
+
 #### 奖励模型训练
 
 ```bash
@@ -52,12 +58,6 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml
 ```
 
-#### 多模态指令监督微调
-
-```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llava1_5_lora_sft.yaml
-```
-
 #### 预处理数据集
 
 对于大数据集有帮助，在配置中使用 `tokenized_path` 以加载预处理后的数据集。
diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_lora_sft.yaml
index 9f1f1976..5e8994bc 100644
--- a/examples/extras/badam/llama3_lora_sft.yaml
+++ b/examples/extras/badam/llama3_lora_sft.yaml
@@ -15,7 +15,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -36,6 +35,7 @@ warmup_steps: 0.1
 pure_bf16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
index 64bf1356..1fd8f16a 100644
--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -8,12 +8,14 @@ do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
+# ddp
+ddp_timeout: 180000000
+
 # dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -34,6 +36,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/extras/galore/llama3_full_sft.yaml b/examples/extras/galore/llama3_full_sft.yaml
index 5aec8af9..3bc074c5 100644
--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -16,7 +16,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -37,6 +36,7 @@ warmup_steps: 0.1
 pure_bf16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
index a54be8b8..4d92cdad 100644
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -14,7 +14,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -32,9 +31,10 @@ learning_rate: 0.0001
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_steps: 0.1
-pure_bf16: true
+fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml
index dfb7058b..0956aa71 100644
--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -13,7 +13,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -31,9 +30,10 @@ learning_rate: 0.0001
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_steps: 0.1
-pure_bf16: true
+fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml
index 5f80521d..5dc8c061 100644
--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -34,6 +33,7 @@ warmup_steps: 0.1
 pure_bf16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/full_multi_gpu/llama3_full_sft.yaml
index ef35e441..2d8031f1 100644
--- a/examples/full_multi_gpu/llama3_full_sft.yaml
+++ b/examples/full_multi_gpu/llama3_full_sft.yaml
@@ -15,7 +15,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -36,6 +35,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml
index d9690679..6cc06f8a 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -15,7 +15,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -36,6 +35,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
index 26955167..5a7348c1 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
@@ -16,7 +16,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -37,6 +36,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/lora_single_gpu/llama3_lora_dpo.yaml
index f71f752d..16c6d0c9 100644
--- a/examples/lora_single_gpu/llama3_lora_dpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml
@@ -13,7 +13,6 @@ dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -34,6 +33,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_orpo.yaml b/examples/lora_single_gpu/llama3_lora_orpo.yaml
index 5d78d260..bc42bdd4 100644
--- a/examples/lora_single_gpu/llama3_lora_orpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_orpo.yaml
@@ -12,7 +12,6 @@ dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -33,6 +32,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_pretrain.yaml b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
index 64245b71..48425b15 100644
--- a/examples/lora_single_gpu/llama3_lora_pretrain.yaml
+++ b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
@@ -11,7 +11,6 @@ lora_target: q_proj,v_proj
 dataset: c4_demo
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -32,6 +31,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/lora_single_gpu/llama3_lora_reward.yaml
index f190f4ac..ecaf8d72 100644
--- a/examples/lora_single_gpu/llama3_lora_reward.yaml
+++ b/examples/lora_single_gpu/llama3_lora_reward.yaml
@@ -12,7 +12,6 @@ dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -33,6 +32,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/lora_single_gpu/llama3_lora_sft.yaml
index f99df305..0e5e30b3 100644
--- a/examples/lora_single_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -33,6 +32,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/lora_single_gpu/llama3_preprocess.yaml
index 0b3dc599..4c45c1cd 100644
--- a/examples/lora_single_gpu/llama3_preprocess.yaml
+++ b/examples/lora_single_gpu/llama3_preprocess.yaml
@@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 tokenized_path: saves/llama3-8b/dataset/sft
diff --git a/examples/lora_single_gpu/llava1_5_lora_sft.yaml b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
index 96c2701a..84d2a672 100644
--- a/examples/lora_single_gpu/llava1_5_lora_sft.yaml
+++ b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
@@ -13,7 +13,6 @@ dataset: mllm_demo
 template: vicuna
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -34,6 +33,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
index 11f1d277..a1d5f95d 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
@@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -33,6 +32,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
index 4b070d45..8941d6b2 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
@@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -33,6 +32,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
index 7bc31bde..885fcd83 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
@@ -8,15 +8,11 @@ do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# ddp
-ddp_timeout: 180000000
-
 # dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -37,6 +33,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
index 2f8cfe45..87a404a0 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
@@ -12,7 +12,6 @@ dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
-val_size: 0.1
 overwrite_cache: true
 preprocessing_num_workers: 16
 
@@ -33,6 +32,7 @@ warmup_steps: 0.1
 fp16: true
 
 # eval
+val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
 eval_steps: 500

From ba72e089015eb0b51bcef9545b46fdbb902f8d0a Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Mon, 13 May 2024 23:28:28 +0800
Subject: [PATCH 279/341] add yi-vl

Former-commit-id: 891b25cb3d709ea82182ca90496034360e1cd5d8
---
 src/llmtuner/data/template.py      | 14 ++++++++++++++
 src/llmtuner/model/patcher.py      |  3 ++-
 src/llmtuner/model/utils/visual.py | 26 +++++++++++++++++++++++++-
 src/llmtuner/train/sft/trainer.py  | 12 ++++++++++--
 src/llmtuner/train/sft/workflow.py |  2 ++
 5 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index ada6cfcd..7ab31147 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -856,6 +856,20 @@ _register_template(
 )
 
 
+_register_template(
+    name="yi-vl",
+    format_user=StringFormatter(slots=["### Human:\n{{content}}\n### Assistant: "]),
+    stop_words=["###"],
+    default_system=(
+        "This is a chat between an inquisitive human and an AI assistant. "
+        "Assume the role of the AI assistant. "
+        "Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
+        "这是一个好奇的人类和一个人工智能助手之间的对话。"
+        "假设你扮演这个AI助手的角色。仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。"
+    ),
+)
+
+
 _register_template(
     name="yuan",
     format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index fd99bd3b..b7cad67c 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -16,7 +16,7 @@ from .utils.moe import add_z3_leaf_module, configure_moe
 from .utils.quantization import configure_quantization
 from .utils.rope import configure_rope
 from .utils.valuehead import prepare_valuehead_model
-from .utils.visual import autocast_projector_dtype, configure_hidden_size
+from .utils.visual import autocast_projector_dtype, configure_hidden_size, configure_visual
 
 
 if TYPE_CHECKING:
@@ -50,6 +50,7 @@ def patch_config(
     configure_quantization(config, tokenizer, model_args, init_kwargs)
     configure_moe(config, model_args, is_trainable)
     configure_hidden_size(config)
+    configure_visual(config, model_args)
 
     if model_args.use_cache and not is_trainable:
         setattr(config, "use_cache", True)
diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index b29a9ba5..d1556bb3 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -1,12 +1,14 @@
 from typing import TYPE_CHECKING, Tuple
 
 import torch
+import transformers
+from torch import nn
 
 from ...extras.logging import get_logger
 
 
 if TYPE_CHECKING:
-    from transformers import PretrainedConfig, PreTrainedModel
+    from transformers import PretrainedConfig, PreTrainedModel, LlavaConfig
 
     from ...hparams import ModelArguments
 
@@ -31,3 +33,25 @@ def autocast_projector_dtype(
         logger.info("Casting multimodal projector outputs in {}.".format(model_args.compute_dtype))
         mm_projector: "torch.nn.Module" = getattr(model, mm_projector_name)
         mm_projector.register_forward_hook(_mm_projector_forward_post_hook)
+
+
+class LlavaMultiModalProjectorYiVL(nn.Module):
+    def __init__(self, config: "LlavaConfig"):
+        super().__init__()
+        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
+        self.linear_3 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_4 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
+        self.act = nn.GELU()
+        self.proj = nn.Sequential(*[self.linear_1, self.linear_2, self.act, self.linear_3, self.linear_4])
+
+    def forward(self, image_features):
+        hidden_states = self.proj(image_features)
+        return hidden_states
+
+
+def configure_visual(config: "PretrainedConfig", model_args: "ModelArguments") -> None:
+    logger = get_logger(__name__)  # NOTE: the "" fallback below keeps the `in` test from raising TypeError
+    if model_args.visual_inputs and "Yi" in getattr(config.text_config, "_name_or_path", ""):
+        transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorYiVL
+        logger.info("Patched Multimodal Projector for Yi-VL.")
diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py
index def427fd..1b456e50 100644
--- a/src/llmtuner/train/sft/trainer.py
+++ b/src/llmtuner/train/sft/trainer.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
-from transformers import Seq2SeqTrainer
+from transformers import Seq2SeqTrainer, ProcessorMixin
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
@@ -26,9 +26,10 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
     Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE.
     """
 
-    def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
+    def __init__(self, finetuning_args: "FinetuningArguments", processor: "ProcessorMixin", **kwargs) -> None:
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
+        self.processor = processor
         if finetuning_args.use_badam:
             from badam import clip_grad_norm_for_sparse_tensor
 
@@ -120,3 +121,10 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
             for label, pred in zip(decoded_labels, decoded_preds):
                 res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False))
             writer.write("\n".join(res))
+
+    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
+        super().save_model(output_dir, _internal_call)
+        if self.processor is not None:
+            if output_dir is None:
+                output_dir = self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
\ No newline at end of file
diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py
index 4a9775b4..3b7b909a 100644
--- a/src/llmtuner/train/sft/workflow.py
+++ b/src/llmtuner/train/sft/workflow.py
@@ -30,6 +30,7 @@ def run_sft(
 ):
     tokenizer_module = load_tokenizer(model_args)
     tokenizer = tokenizer_module["tokenizer"]
+    processor = tokenizer_module["processor"]
     dataset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
@@ -55,6 +56,7 @@ def run_sft(
         model=model,
         args=training_args,
         finetuning_args=finetuning_args,
+        processor=processor,
         tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,

From dd4619e9f30cae0f894cf6b205b13beeaebd3331 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Tue, 14 May 2024 14:03:19 +0800
Subject: [PATCH 280/341] add support for Yi-VL

Former-commit-id: d7834ca92d3048949caa48f8635cfbcea2c85771
---
 src/llmtuner/model/utils/visual.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index d1556bb3..79a6570e 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -43,10 +43,13 @@ class LlavaMultiModalProjectorYiVL(nn.Module):
         self.linear_3 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
         self.linear_4 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
         self.act = nn.GELU()
-        self.proj = nn.Sequential(*[self.linear_1, self.linear_2, self.act, self.linear_3, self.linear_4])
 
     def forward(self, image_features):
-        hidden_states = self.proj(image_features)
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.linear_2(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_3(hidden_states)
+        hidden_states = self.linear_4(hidden_states)
         return hidden_states
 
 
From 483ed64b43ef3605b8de928238f0a591fc225e10 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Tue, 14 May 2024 16:45:28 +0800
Subject: [PATCH 281/341] modify yi-vl template

Former-commit-id: f113975b425e70bed2588ca55a2c62594fbf2283
---
 src/llmtuner/data/template.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 7fa2ccce..631c79c1 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -857,15 +857,16 @@ _register_template(
 
 
 _register_template(
-    name="yi-vl",
-    format_user=StringFormatter(slots=["### Human:\n{{content}}\n### Assistant: "]),
+    name="yivl",
+    format_user=StringFormatter(slots=["### Human: {{content}}\n### Assistant:"]),
+    format_assistant=StringFormatter(slots=[" {{content}}"]),
     stop_words=["###"],
     default_system=(
         "This is a chat between an inquisitive human and an AI assistant. "
         "Assume the role of the AI assistant. "
         "Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
         "这是一个好奇的人类和一个人工智能助手之间的对话。"
-        "假设你扮演这个AI助手的角色。仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。"
+        "假设你扮演这个AI助手的角色。仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n"
     ),
 )
 

From 84ff56c3a0959a0f0cee0161210fc9586daf86de Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 14 May 2024 20:37:21 +0800
Subject: [PATCH 282/341] fix #3728

Former-commit-id: ea3e32a27f7f7dce75a708f8a6f376b5d3e8059a
---
 src/llmtuner/extras/ploting.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/llmtuner/extras/ploting.py b/src/llmtuner/extras/ploting.py
index e53f1f89..dea23bbe 100644
--- a/src/llmtuner/extras/ploting.py
+++ b/src/llmtuner/extras/ploting.py
@@ -21,6 +21,9 @@ def smooth(scalars: List[float]) -> List[float]:
     r"""
     EMA implementation according to TensorBoard.
     """
+    if len(scalars) == 0:
+        return []
+
     last = scalars[0]
     smoothed = []
     weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5)  # a sigmoid function
@@ -32,6 +35,9 @@ def smooth(scalars: List[float]) -> List[float]:
 
 
 def gen_loss_plot(trainer_log: List[Dict[str, Any]]) -> "matplotlib.figure.Figure":
+    r"""
+    Plots loss curves in LlamaBoard.
+    """
     plt.close("all")
     plt.switch_backend("agg")
     fig = plt.figure()
@@ -51,6 +57,9 @@ def gen_loss_plot(trainer_log: List[Dict[str, Any]]) -> "matplotlib.figure.Figur
 
 
 def plot_loss(save_dictionary: os.PathLike, keys: List[str] = ["loss"]) -> None:
+    r"""
+    Plots loss curves and saves the image.
+    """
     plt.switch_backend("agg")
     with open(os.path.join(save_dictionary, TRAINER_STATE_NAME), "r", encoding="utf-8") as f:
         data = json.load(f)

From fe1a3b1367b2f4c52dab3a05be73bdd8990167c6 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 14 May 2024 20:44:04 +0800
Subject: [PATCH 283/341] Apply suggestions from code review

Co-authored-by: Huazhong Ji <hzji210@gmail.com>
Former-commit-id: a435e5a0bdd7268c4f1204f99f289ee0b36fd930
---
 src/train.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/train.py b/src/train.py
index e2609b66..098ec1b5 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,6 +1,4 @@
 import os
-import torch
-import torch_npu
 from torch_npu.contrib import transfer_to_npu
 from llmtuner.train.tuner import run_exp
 
@@ -15,6 +13,7 @@ def _mp_fn(index):
 
 
 if __name__ == "__main__":
-    use_jit_compile = os.getenv('JIT_COMPILE', 'False').lower() in ['true', '1']
-    torch.npu.set_compile_mode(jit_compile=use_jit_compile)
+    if is_torch_npu_available():
+        use_jit_compile = os.getenv('JIT_COMPILE', 'False').lower() in ['true', '1']
+        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
     main()

From 86a048128bcc8123ce97ff792b938887fefcfd36 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 14 May 2024 20:44:21 +0800
Subject: [PATCH 284/341] Apply suggestions from code review

Co-authored-by: Huazhong Ji <hzji210@gmail.com>
Former-commit-id: abef48c17ee795eae984fcc89019c2c4859108c1
---
 src/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/train.py b/src/train.py
index 098ec1b5..00a7fa26 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,5 +1,5 @@
 import os
-from torch_npu.contrib import transfer_to_npu
+from transformers import is_torch_npu_available
 from llmtuner.train.tuner import run_exp
 
 
From 7a1d13aae2b840a90b331534b40fd326f2b1f686 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 14 May 2024 20:47:52 +0800
Subject: [PATCH 285/341] Update train.py

Former-commit-id: da1e6f0d9c2eff64f92da1f6ada3aa44ef6d6a7e
---
 src/train.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/train.py b/src/train.py
index 00a7fa26..4cc21194 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,5 +1,8 @@
 import os
+
+import torch
 from transformers import is_torch_npu_available
+
 from llmtuner.train.tuner import run_exp
 
 
From 6670b36c49381b6f4144ba48ddfc51ff28aadcd5 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 14 May 2024 21:36:42 +0800
Subject: [PATCH 286/341] use robust envs

Former-commit-id: f3e194c3b3c40a3e6c3c5397ec0d859e6db614b5
---
 src/llmtuner/api/app.py          | 2 +-
 src/llmtuner/extras/callbacks.py | 2 +-
 src/llmtuner/extras/misc.py      | 2 +-
 src/llmtuner/webui/interface.py  | 4 ++--
 src/webui.py                     | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/llmtuner/api/app.py b/src/llmtuner/api/app.py
index 6d06d1d0..21edab2f 100644
--- a/src/llmtuner/api/app.py
+++ b/src/llmtuner/api/app.py
@@ -51,7 +51,7 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
         allow_methods=["*"],
         allow_headers=["*"],
     )
-    api_key = os.environ.get("API_KEY", None)
+    api_key = os.environ.get("API_KEY")
     security = HTTPBearer(auto_error=False)
 
     async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]):
diff --git a/src/llmtuner/extras/callbacks.py b/src/llmtuner/extras/callbacks.py
index 6d24b244..637b786d 100644
--- a/src/llmtuner/extras/callbacks.py
+++ b/src/llmtuner/extras/callbacks.py
@@ -53,7 +53,7 @@ class LogCallback(TrainerCallback):
         self.aborted = False
         self.do_train = False
         """ Web UI """
-        self.webui_mode = bool(int(os.environ.get("LLAMABOARD_ENABLED", "0")))
+        self.webui_mode = os.environ.get("LLAMABOARD_ENABLED", "0").lower() in ["true", "1"]
         if self.webui_mode:
             signal.signal(signal.SIGABRT, self._set_abort)
             self.logger_handler = LoggerHandler(output_dir)
diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index 8ce25d18..53140efa 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -58,7 +58,7 @@ class AverageMeter:
 
 
 def check_dependencies() -> None:
-    if int(os.environ.get("DISABLE_VERSION_CHECK", "0")):
+    if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]:
         logger.warning("Version checking has been disabled, may lead to unexpected behaviors.")
     else:
         require_version("transformers>=4.37.2", "To fix: pip install transformers>=4.37.2")
diff --git a/src/llmtuner/webui/interface.py b/src/llmtuner/webui/interface.py
index 91709d40..c5a30113 100644
--- a/src/llmtuner/webui/interface.py
+++ b/src/llmtuner/webui/interface.py
@@ -71,12 +71,12 @@ def create_web_demo() -> gr.Blocks:
 
 
 def run_web_ui() -> None:
-    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
+    gradio_share = os.environ.get("GRADIO_SHARE", "0").lower() in ["true", "1"]
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
     create_ui().queue().launch(share=gradio_share, server_name=server_name)
 
 
 def run_web_demo() -> None:
-    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
+    gradio_share = os.environ.get("GRADIO_SHARE", "0").lower() in ["true", "1"]
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
     create_web_demo().queue().launch(share=gradio_share, server_name=server_name)
diff --git a/src/webui.py b/src/webui.py
index 3f8690d0..7a43039d 100644
--- a/src/webui.py
+++ b/src/webui.py
@@ -4,7 +4,7 @@ from llmtuner.webui.interface import create_ui
 
 
 def main():
-    gradio_share = bool(int(os.environ.get("GRADIO_SHARE", "0")))
+    gradio_share = os.environ.get("GRADIO_SHARE", "0").lower() in ["true", "1"]
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
     create_ui().queue().launch(share=gradio_share, server_name=server_name)
 

From ba0da830319bc5e7988e19870ac93597e3965abf Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 14 May 2024 23:32:53 +0800
Subject: [PATCH 287/341] add npu examples

Former-commit-id: 0f21e68e2dbd84c820d66d5c6d980004efc51d51
---
 examples/full_multi_gpu/multi_node.sh         |  2 +-
 examples/full_multi_gpu/single_node.sh        | 12 ++++--
 examples/lora_multi_gpu/ds_zero3.sh           | 12 ++++--
 examples/lora_multi_npu/ds_zero0.sh           | 15 +++++++
 .../lora_multi_npu/llama3_lora_sft_ds.yaml    | 42 +++++++++++++++++++
 src/llmtuner/model/patcher.py                 |  9 +++-
 src/llmtuner/model/utils/attention.py         |  4 +-
 src/train.py                                  |  8 ----
 8 files changed, 85 insertions(+), 19 deletions(-)
 create mode 100644 examples/lora_multi_npu/ds_zero0.sh
 create mode 100644 examples/lora_multi_npu/llama3_lora_sft_ds.yaml

diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index 962409a1..34c038d4 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -6,7 +6,7 @@ RANK=0
 MASTER_ADDR=192.168.0.1
 MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
     --nnodes $NNODES \
     --node_rank $RANK \
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index 97f7af64..ac29c097 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,9 +1,15 @@
 #!/bin/bash
 
 NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
index b8fd2640..90ea00dd 100644
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -1,9 +1,15 @@
 #!/bin/bash
 
 NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/lora_multi_npu/ds_zero0.sh b/examples/lora_multi_npu/ds_zero0.sh
new file mode 100644
index 00000000..f849c5c9
--- /dev/null
+++ b/examples/lora_multi_npu/ds_zero0.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
+
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
new file mode 100644
index 00000000..2e9c0558
--- /dev/null
+++ b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z0_config.json
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+fp16: true
+
+# eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index fd99bd3b..b28a23d0 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -1,9 +1,10 @@
+import os
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict
 
 import torch
 from peft import PeftModel
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
 from transformers.integrations import is_deepspeed_zero3_enabled
 
 from ..extras.logging import get_logger
@@ -44,6 +45,10 @@ def patch_config(
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
+    if is_torch_npu_available():
+        use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]
+        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
+
     configure_attn_implementation(config, model_args)
     configure_rope(config, model_args, is_trainable)
     configure_longlora(config, model_args, is_trainable)
@@ -56,7 +61,7 @@ def patch_config(
         logger.info("Using KV cache for faster generation.")
 
     if getattr(config, "model_type", None) == "qwen":
-        setattr(config, "use_flash_attn", model_args.flash_attn)
+        setattr(config, "use_flash_attn", model_args.flash_attn == "fa2")
         for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
             setattr(config, dtype_name, model_args.compute_dtype == dtype)
 
diff --git a/src/llmtuner/model/utils/attention.py b/src/llmtuner/model/utils/attention.py
index f4686489..b52ddc86 100644
--- a/src/llmtuner/model/utils/attention.py
+++ b/src/llmtuner/model/utils/attention.py
@@ -22,7 +22,7 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "Model
 
     elif model_args.flash_attn == "sdpa":
         if not is_sdpa_available():
-            logger.warning("Torch>=2.1.1 is required for SDPA attention.")
+            logger.warning("torch>=2.1.1 is required for SDPA attention.")
             return
 
         requested_attn_implementation = "sdpa"
@@ -52,4 +52,4 @@ def print_attn_implementation(config: "PretrainedConfig") -> None:
     elif attn_implementation == "sdpa":
         logger.info("Using torch SDPA for faster training and inference.")
     else:
-        logger.info("Using vanilla Attention implementation.")
+        logger.info("Using vanilla attention implementation.")
diff --git a/src/train.py b/src/train.py
index 4cc21194..6a3212cb 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,8 +1,3 @@
-import os
-
-import torch
-from transformers import is_torch_npu_available
-
 from llmtuner.train.tuner import run_exp
 
 
@@ -16,7 +11,4 @@ def _mp_fn(index):
 
 
 if __name__ == "__main__":
-    if is_torch_npu_available():
-        use_jit_compile = os.getenv('JIT_COMPILE', 'False').lower() in ['true', '1']
-        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
     main()

From 322331df51d493891fc6df3bf0fdc3fc06dd18cb Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 14 May 2024 23:55:49 +0800
Subject: [PATCH 288/341] update readme

Former-commit-id: f315a545d85a661746ad304b5a688d1fad9eaea1
---
 README.md    | 26 +++++++++++++++++++++++---
 README_zh.md | 24 ++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 90fcb295..a138d646 100644
--- a/README.md
+++ b/README.md
@@ -70,14 +70,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/05/14] We supported training and inference on the Ascend NPU devices. Check [installation](#installation) section for details.
+
 [24/05/13] We supported fine-tuning the **Yi-1.5** series models.
 
 [24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See [examples](examples/README.md) for usage.
 
-[24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details.
-
 <details><summary>Full Changelog</summary>
 
+[24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details.
+
 [24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See [examples](examples/README.md) for usage.
 
 [24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)**. See [examples](examples/README.md) for usage.
@@ -328,7 +330,7 @@ Extra dependencies available: torch, metrics, deepspeed, bitsandbytes, vllm, gal
 
 <details><summary>For Windows users</summary>
 
-If you want to enable the quantized LoRA (QLoRA) on the Windows platform, you will be required to install a pre-built version of `bitsandbytes` library, which supports CUDA 11.1 to 12.2, please select the appropriate [release version](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) based on your CUDA version.
+If you want to enable the quantized LoRA (QLoRA) on the Windows platform, you need to install a pre-built version of `bitsandbytes` library, which supports CUDA 11.1 to 12.2, please select the appropriate [release version](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) based on your CUDA version.
 
 ```bash
 pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl
@@ -338,6 +340,24 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 
 </details>
 
+<details><summary>For Ascend NPU users</summary>
+
+To utilize Ascend NPU devices for (distributed) training and inference, you need to install the **[torch-npu](https://gitee.com/ascend/pytorch)** package and the **[Ascend CANN Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**.
+
+| Requirement  | Minimum | Recommend |
+| ------------ | ------- | --------- |
+| CANN         | 8.0.RC1 | 8.0.RC1   |
+| torch        | 2.2.0   | 2.2.0     |
+| torch-npu    | 2.2.0   | 2.2.0     |
+| deepspeed    | 0.13.2  | 0.13.2    |
+
+> [!NOTE]
+> Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use.
+>
+> If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations.
+
+</details>
+
 ### Data Preparation
 
 Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use datasets on HuggingFace / ModelScope hub or load the dataset in local disk.
diff --git a/README_zh.md b/README_zh.md
index 1d15515e..a0373711 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -70,14 +70,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/05/14] 我们支持了昇腾 NPU 设备的训练和推理。详情请查阅[安装](#安装-llama-factory)部分。
+
 [24/05/13] 我们支持了 Yi-1.5 系列模型的微调。
 
 [24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 [examples](examples/README_zh.md)。
 
-[24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型，详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。
-
 <details><summary>展开日志</summary>
 
+[24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型，详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。
+
 [24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 [examples](examples/README_zh.md)。
 
 [24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)**。详细用法请参照 [examples](examples/README_zh.md)。
@@ -338,6 +340,24 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 
 </details>
 
+<details><summary>昇腾 NPU 用户指南</summary>
+
+如果使用昇腾 NPU 设备进行（分布式）训练或推理，需要安装 **[torch-npu](https://gitee.com/ascend/pytorch)** 库和 **[Ascend CANN Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**。
+
+| 依赖项       | 至少     | 推荐      |
+| ------------ | ------- | --------- |
+| CANN         | 8.0.RC1 | 8.0.RC1   |
+| torch        | 2.2.0   | 2.2.0     |
+| torch-npu    | 2.2.0   | 2.2.0     |
+| deepspeed    | 0.13.2  | 0.13.2    |
+
+> [!NOTE]
+> 请记得使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定您使用的设备。
+>
+> 如果遇到无法正常推理的情况，请尝试设置 `do_sample: false`。
+
+</details>
+
 ### 数据准备
 
 关于数据集文件的格式，请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope 上的数据集或加载本地数据集。

From 304a2efec8e0cd0d2f274fc75a3043234ff08333 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 14 May 2024 23:57:08 +0800
Subject: [PATCH 289/341] update readme

Former-commit-id: 568cc1d33c3d202e6430b68e0bcb2772aa6b0aa2
---
 README.md    | 7 +++----
 README_zh.md | 7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index a138d646..826512c6 100644
--- a/README.md
+++ b/README.md
@@ -351,10 +351,9 @@ To utilize Ascend NPU devices for (distributed) training and inference, you need
 | torch-npu    | 2.2.0   | 2.2.0     |
 | deepspeed    | 0.13.2  | 0.13.2    |
 
-> [!NOTE]
-> Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use.
->
-> If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations.
+Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use.
+
+If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations.
 
 </details>
 
diff --git a/README_zh.md b/README_zh.md
index a0373711..d41ff13a 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -351,10 +351,9 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 | torch-npu    | 2.2.0   | 2.2.0     |
 | deepspeed    | 0.13.2  | 0.13.2    |
 
-> [!NOTE]
-> 请记得使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定您使用的设备。
->
-> 如果遇到无法正常推理的情况，请尝试设置 `do_sample: false`。
+请记得使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定您使用的设备。
+
+如果遇到无法正常推理的情况，请尝试设置 `do_sample: false`。
 
 </details>
 

From 12d666a63ccbb9fa38e800e830271c8f892fa9dd Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 00:05:17 +0800
Subject: [PATCH 290/341] update examples

Former-commit-id: 09269c59427e8a007c1c1b6f9d2014b4c0d0a328
---
 examples/README.md    | 9 +++++++++
 examples/README_zh.md | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index 0838314a..4b4a8248 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -7,6 +7,7 @@ Make sure to execute these commands in the `LLaMA-Factory` directory.
 - [LoRA Fine-Tuning on A Single GPU](#lora-fine-tuning-on-a-single-gpu)
 - [QLoRA Fine-Tuning on a Single GPU](#qlora-fine-tuning-on-a-single-gpu)
 - [LoRA Fine-Tuning on Multiple GPUs](#lora-fine-tuning-on-multiple-gpus)
+- [LoRA Fine-Tuning on Multiple NPUs](#lora-fine-tuning-on-multiple-npus)
 - [Full-Parameter Fine-Tuning on Multiple GPUs](#full-parameter-fine-tuning-on-multiple-gpus)
 - [Merging LoRA Adapters and Quantization](#merging-lora-adapters-and-quantization)
 - [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models)
@@ -124,6 +125,14 @@ bash examples/lora_multi_gpu/multi_node.sh
 bash examples/lora_multi_gpu/ds_zero3.sh
 ```
 
+### LoRA Fine-Tuning on Multiple NPUs
+
+#### Supervised Fine-Tuning with DeepSpeed ZeRO-0
+
+```bash
+bash examples/lora_multi_npu/ds_zero0.sh
+```
+
 ### Full-Parameter Fine-Tuning on Multiple GPUs
 
 #### Supervised Fine-Tuning with Accelerate on Single Node
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 7fe43954..3b5b2dee 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -7,6 +7,7 @@
 - [单 GPU LoRA 微调](#单-gpu-lora-微调)
 - [单 GPU QLoRA 微调](#单-gpu-qlora-微调)
 - [多 GPU LoRA 微调](#多-gpu-lora-微调)
+- [多 NPU LoRA 微调](#多-npu-lora-微调)
 - [多 GPU 全参数微调](#多-gpu-全参数微调)
 - [合并 LoRA 适配器与模型量化](#合并-lora-适配器与模型量化)
 - [推理 LoRA 模型](#推理-lora-模型)
@@ -124,6 +125,14 @@ bash examples/lora_multi_gpu/multi_node.sh
 bash examples/lora_multi_gpu/ds_zero3.sh
 ```
 
+### 多 NPU LoRA 微调
+
+#### 使用 DeepSpeed ZeRO-0 训练
+
+```bash
+bash examples/lora_multi_npu/ds_zero0.sh
+```
+
 ### 多 GPU 全参数微调
 
 #### 使用 DeepSpeed 进行单节点训练

From 88c34d26a8903194ba2531167357a419dc3ccdd7 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 00:26:10 +0800
Subject: [PATCH 291/341] fix examples

Former-commit-id: 910ffaf46e3dde87d2dbb48b82a59a9898a90847
---
 examples/lora_multi_npu/ds_zero0.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lora_multi_npu/ds_zero0.sh b/examples/lora_multi_npu/ds_zero0.sh
index f849c5c9..4ffaa1b0 100644
--- a/examples/lora_multi_npu/ds_zero0.sh
+++ b/examples/lora_multi_npu/ds_zero0.sh
@@ -12,4 +12,4 @@ ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --node_rank $RANK \
     --master_addr $MASTER_ADDR \
     --master_port $MASTER_PORT \
-    src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
+    src/train.py examples/lora_multi_npu/llama3_lora_sft_ds.yaml

From a8c753125068ae61591a5dfe77b46888d399be1a Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 01:49:05 +0800
Subject: [PATCH 292/341] fix gen args

Former-commit-id: d79f91f87106ba1bc3c0ea08da5898aad59566a7
---
 src/llmtuner/chat/hf_engine.py   | 31 ++++++++++++++---------
 src/llmtuner/chat/vllm_engine.py | 43 +++++++++++++-------------------
 2 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/src/llmtuner/chat/hf_engine.py b/src/llmtuner/chat/hf_engine.py
index 97160d57..5cb8bfe4 100644
--- a/src/llmtuner/chat/hf_engine.py
+++ b/src/llmtuner/chat/hf_engine.py
@@ -65,12 +65,13 @@ class HuggingfaceEngine(BaseEngine):
         prompt_length = len(prompt_ids)
         inputs = torch.tensor([prompt_ids], device=model.device)
 
-        do_sample = input_kwargs.pop("do_sample", None)
-        temperature = input_kwargs.pop("temperature", None)
-        top_p = input_kwargs.pop("top_p", None)
-        top_k = input_kwargs.pop("top_k", None)
-        num_return_sequences = input_kwargs.pop("num_return_sequences", None)
-        repetition_penalty = input_kwargs.pop("repetition_penalty", None)
+        do_sample = input_kwargs.pop("do_sample", generating_args["do_sample"])
+        temperature = input_kwargs.pop("temperature", generating_args["temperature"])
+        top_p = input_kwargs.pop("top_p", generating_args["top_p"])
+        top_k = input_kwargs.pop("top_k", generating_args["top_k"])
+        num_return_sequences = input_kwargs.pop("num_return_sequences", 1)
+        repetition_penalty = input_kwargs.pop("repetition_penalty", generating_args["repetition_penalty"])
+        length_penalty = input_kwargs.pop("length_penalty", generating_args["length_penalty"])
         max_length = input_kwargs.pop("max_length", None)
         max_new_tokens = input_kwargs.pop("max_new_tokens", None)
         stop = input_kwargs.pop("stop", None)
@@ -78,14 +79,16 @@ class HuggingfaceEngine(BaseEngine):
         if stop is not None:
             raise ValueError("Stop parameter is not supported in Huggingface engine yet.")
 
+        generating_args = generating_args.copy()
         generating_args.update(
             dict(
-                do_sample=do_sample if do_sample is not None else generating_args["do_sample"],
-                temperature=temperature or generating_args["temperature"],
-                top_p=top_p or generating_args["top_p"],
-                top_k=top_k or generating_args["top_k"],
-                num_return_sequences=num_return_sequences or 1,
-                repetition_penalty=repetition_penalty or generating_args["repetition_penalty"],
+                do_sample=do_sample,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                num_return_sequences=num_return_sequences,
+                repetition_penalty=repetition_penalty,
+                length_penalty=length_penalty,
                 eos_token_id=[tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids,
                 pad_token_id=tokenizer.pad_token_id,
             )
@@ -94,6 +97,10 @@ class HuggingfaceEngine(BaseEngine):
         if isinstance(num_return_sequences, int) and num_return_sequences > 1:
             generating_args["do_sample"] = True
 
+        if not generating_args["do_sample"]:
+            generating_args.pop("temperature", None)
+            generating_args.pop("top_p", None)
+
         if max_length:
             generating_args.pop("max_new_tokens", None)
             generating_args["max_length"] = max_length
diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index d50e41aa..faf8c9fe 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -89,43 +89,34 @@ class VllmEngine(BaseEngine):
         )
         prompt_length = len(prompt_ids)
 
-        temperature = input_kwargs.pop("temperature", None)
-        top_p = input_kwargs.pop("top_p", None)
-        top_k = input_kwargs.pop("top_k", None)
-        num_return_sequences = input_kwargs.pop("num_return_sequences", None)
-        repetition_penalty = input_kwargs.pop("repetition_penalty", None)
+        use_beam_search = self.generating_args["num_beams"] > 1
+        temperature = input_kwargs.pop("temperature", self.generating_args["temperature"])
+        top_p = input_kwargs.pop("top_p", self.generating_args["top_p"])
+        top_k = input_kwargs.pop("top_k", self.generating_args["top_k"])
+        num_return_sequences = input_kwargs.pop("num_return_sequences", 1)
+        repetition_penalty = input_kwargs.pop("repetition_penalty", self.generating_args["repetition_penalty"])
+        length_penalty = input_kwargs.pop("length_penalty", self.generating_args["length_penalty"])
         max_length = input_kwargs.pop("max_length", None)
         max_new_tokens = input_kwargs.pop("max_new_tokens", None)
         stop = input_kwargs.pop("stop", None)
 
-        generating_args = self.generating_args.copy()
-        generating_args.update(
-            dict(
-                temperature=temperature or generating_args["temperature"],
-                top_p=top_p or generating_args["top_p"],
-                top_k=top_k or generating_args["top_k"],
-                num_return_sequences=num_return_sequences or 1,
-                repetition_penalty=repetition_penalty or generating_args["repetition_penalty"],
-            )
-        )
-
         if max_length:
-            generating_args["max_new_tokens"] = max_length - prompt_length
+            max_tokens = max_length - prompt_length
 
         if max_new_tokens:
-            generating_args["max_new_tokens"] = max_new_tokens
+            max_tokens = max_new_tokens
 
         sampling_params = SamplingParams(
-            n=generating_args["num_return_sequences"],
-            repetition_penalty=generating_args["repetition_penalty"],
-            temperature=generating_args["temperature"],
-            top_p=generating_args["top_p"],
-            top_k=generating_args["top_k"],
-            use_beam_search=generating_args["num_beams"] > 1,
-            length_penalty=generating_args["length_penalty"],
+            n=num_return_sequences,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            use_beam_search=use_beam_search,
+            length_penalty=length_penalty,
             stop=stop,
             stop_token_ids=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids,
-            max_tokens=generating_args["max_new_tokens"],
+            max_tokens=max_tokens,
             skip_special_tokens=True,
         )
 

From 2c011060b172d361283f7866b7000f8692f82cd1 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 02:17:54 +0800
Subject: [PATCH 293/341] fix bug in vllm engine

Former-commit-id: 38f02a2c5b52cba6908c2d3c2a455677f8574faf
---
 src/llmtuner/chat/vllm_engine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index faf8c9fe..aaaad2f1 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -100,8 +100,9 @@ class VllmEngine(BaseEngine):
         max_new_tokens = input_kwargs.pop("max_new_tokens", None)
         stop = input_kwargs.pop("stop", None)
 
+        max_tokens = self.generating_args["max_new_tokens"] or self.generating_args["max_length"]
         if max_length:
-            max_tokens = max_length - prompt_length
+            max_tokens = max_length - prompt_length if max_length > prompt_length else 1
 
         if max_new_tokens:
             max_tokens = max_new_tokens

From 51795e8db11ed7067c473a2e4b34ea5636cd0c24 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Wed, 15 May 2024 09:54:00 +0800
Subject: [PATCH 294/341] add yivl and save processor to model_dir

Former-commit-id: ae72f745cb4f7713c3b835d11202aec19c3c5093
---
 src/llmtuner/data/template.py      | 2 +-
 src/llmtuner/model/utils/visual.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 631c79c1..0b2ca0e6 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -859,7 +859,7 @@ _register_template(
 _register_template(
     name="yivl",
     format_user=StringFormatter(slots=["### Human: {{content}}\n### Assistant:"]),
-    format_assistant=StringFormatter(slots=[" {{content}}"]),
+    format_assistant=StringFormatter(slots=[" {{content}}\n"]),
     stop_words=["###"],
     default_system=(
         "This is a chat between an inquisitive human and an AI assistant. "
diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index 79a6570e..8553cf86 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -29,7 +29,8 @@ def autocast_projector_dtype(
     ) -> "torch.Tensor":
         return output.to(model_args.compute_dtype)
 
-    if hasattr(model, mm_projector_name) and getattr(model.config, "quantization_method", None):
+    if hasattr(model, mm_projector_name) and (getattr(model.config, "quantization_method", None)
+                                              or "Yi" in getattr(model.config.text_config, "_name_or_path", None)):
         logger.info("Casting multimodal projector outputs in {}.".format(model_args.compute_dtype))
         mm_projector: "torch.nn.Module" = getattr(model, mm_projector_name)
         mm_projector.register_forward_hook(_mm_projector_forward_post_hook)

From 8061cb567101f97fa138dc79c253811bf4a374e2 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Wed, 15 May 2024 10:18:10 +0800
Subject: [PATCH 295/341] modify style

Former-commit-id: 823af88c3201412da7ef734d34198424e09b2d51
---
 src/llmtuner/model/utils/visual.py | 8 +++++---
 src/llmtuner/train/sft/trainer.py  | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index 8553cf86..0dc844f5 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -8,7 +8,7 @@ from ...extras.logging import get_logger
 
 
 if TYPE_CHECKING:
-    from transformers import PretrainedConfig, PreTrainedModel, LlavaConfig
+    from transformers import LlavaConfig, PretrainedConfig, PreTrainedModel
 
     from ...hparams import ModelArguments
 
@@ -29,8 +29,10 @@ def autocast_projector_dtype(
     ) -> "torch.Tensor":
         return output.to(model_args.compute_dtype)
 
-    if hasattr(model, mm_projector_name) and (getattr(model.config, "quantization_method", None)
-                                              or "Yi" in getattr(model.config.text_config, "_name_or_path", None)):
+    if hasattr(model, mm_projector_name) and (
+        getattr(model.config, "quantization_method", None)
+        or "Yi" in getattr(model.config.text_config, "_name_or_path", None)
+    ):
         logger.info("Casting multimodal projector outputs in {}.".format(model_args.compute_dtype))
         mm_projector: "torch.nn.Module" = getattr(model, mm_projector_name)
         mm_projector.register_forward_hook(_mm_projector_forward_post_hook)
diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py
index 1b456e50..5f187375 100644
--- a/src/llmtuner/train/sft/trainer.py
+++ b/src/llmtuner/train/sft/trainer.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
-from transformers import Seq2SeqTrainer, ProcessorMixin
+from transformers import ProcessorMixin, Seq2SeqTrainer
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
@@ -127,4 +127,4 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
         if self.processor is not None:
             if output_dir is None:
                 output_dir = self.args.output_dir
-            getattr(self.processor, "image_processor").save_pretrained(output_dir)
\ No newline at end of file
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)

From b12679ad59dbdd62e9ee494d896561c4e337aff5 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Wed, 15 May 2024 11:22:15 +0800
Subject: [PATCH 296/341] cast dtype in mm_proj

Former-commit-id: e0ab22648fe8b65055b5986258cc2800438dc60c
---
 src/llmtuner/model/utils/visual.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index 0dc844f5..b8696096 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -8,7 +8,7 @@ from ...extras.logging import get_logger
 
 
 if TYPE_CHECKING:
-    from transformers import LlavaConfig, PretrainedConfig, PreTrainedModel
+    from transformers import LlavaConfig, PretrainedConfig, PreTrainedModel, LlavaForConditionalGeneration
 
     from ...hparams import ModelArguments
 
@@ -29,10 +29,7 @@ def autocast_projector_dtype(
     ) -> "torch.Tensor":
         return output.to(model_args.compute_dtype)
 
-    if hasattr(model, mm_projector_name) and (
-        getattr(model.config, "quantization_method", None)
-        or "Yi" in getattr(model.config.text_config, "_name_or_path", None)
-    ):
+    if hasattr(model, mm_projector_name) and getattr(model.config, "quantization_method", None):
         logger.info("Casting multimodal projector outputs in {}.".format(model_args.compute_dtype))
         mm_projector: "torch.nn.Module" = getattr(model, mm_projector_name)
         mm_projector.register_forward_hook(_mm_projector_forward_post_hook)
@@ -48,11 +45,13 @@ class LlavaMultiModalProjectorYiVL(nn.Module):
         self.act = nn.GELU()
 
     def forward(self, image_features):
+        dtype_ = self.linear_1.weight.dtype
         hidden_states = self.linear_1(image_features)
         hidden_states = self.linear_2(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states = self.linear_3(hidden_states)
         hidden_states = self.linear_4(hidden_states)
+        hidden_states = hidden_states.to(dtype_)
         return hidden_states
 
 
From aead3ca8e59bac4aeeeb1e7fda7a98bd5ce9fc60 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Wed, 15 May 2024 12:48:18 +0800
Subject: [PATCH 297/341] rm extra import

Former-commit-id: 031215019e3d7727b1c7cc87a44e1cf1eb2853ec
---
 src/llmtuner/model/utils/visual.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index b8696096..1f770861 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -8,7 +8,7 @@ from ...extras.logging import get_logger
 
 
 if TYPE_CHECKING:
-    from transformers import LlavaConfig, PretrainedConfig, PreTrainedModel, LlavaForConditionalGeneration
+    from transformers import LlavaConfig, PretrainedConfig, PreTrainedModel
 
     from ...hparams import ModelArguments
 

From fc82acbbd89cbaa8ad708aecfffc6bcb5a1639e4 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 14:13:01 +0800
Subject: [PATCH 298/341] Update workflow.py

Former-commit-id: 97cfb44bced18b721166ccb5f260098645fc5318
---
 src/llmtuner/train/sft/workflow.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py
index 3b7b909a..d9d7c8e9 100644
--- a/src/llmtuner/train/sft/workflow.py
+++ b/src/llmtuner/train/sft/workflow.py
@@ -30,7 +30,6 @@ def run_sft(
 ):
     tokenizer_module = load_tokenizer(model_args)
     tokenizer = tokenizer_module["tokenizer"]
-    processor = tokenizer_module["processor"]
     dataset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
@@ -56,11 +55,10 @@ def run_sft(
         model=model,
         args=training_args,
         finetuning_args=finetuning_args,
-        processor=processor,
-        tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
         compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None,
+        **tokenizer_module,
         **split_dataset(dataset, data_args, training_args),
     )
 

From 79165100e5806f2369dee37aab7647ed1bfd1a7d Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 14:13:26 +0800
Subject: [PATCH 299/341] Update trainer.py

Former-commit-id: dd767b20635bb549ce14f9556e1c4fb44b3662c5
---
 src/llmtuner/train/sft/trainer.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py
index 5f187375..35671e1b 100644
--- a/src/llmtuner/train/sft/trainer.py
+++ b/src/llmtuner/train/sft/trainer.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
-from transformers import ProcessorMixin, Seq2SeqTrainer
+from transformers import Seq2SeqTrainer
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
@@ -13,6 +13,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
+    from transformers import ProcessorMixin
     from transformers.trainer import PredictionOutput
 
     from ...hparams import FinetuningArguments
@@ -26,7 +27,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
     Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE.
     """
 
-    def __init__(self, finetuning_args: "FinetuningArguments", processor: "ProcessorMixin", **kwargs) -> None:
+    def __init__(
+        self, finetuning_args: "FinetuningArguments", processor: Optional["ProcessorMixin"], **kwargs
+    ) -> None:
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
         self.processor = processor
@@ -46,6 +49,12 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
 
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
+
     def prediction_step(
         self,
         model: "torch.nn.Module",
@@ -121,10 +130,3 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
             for label, pred in zip(decoded_labels, decoded_preds):
                 res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False))
             writer.write("\n".join(res))
-
-    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
-        super().save_model(output_dir, _internal_call)
-        if self.processor is not None:
-            if output_dir is None:
-                output_dir = self.args.output_dir
-            getattr(self.processor, "image_processor").save_pretrained(output_dir)

From 8e518d6c62237046cb251f2a53877f7d6737a399 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 14:20:39 +0800
Subject: [PATCH 300/341] Update template.py

Former-commit-id: a13022166ba691c03f4fea7e9e2927fa446cf681
---
 src/llmtuner/data/template.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 0b2ca0e6..b20c9203 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -857,17 +857,17 @@ _register_template(
 
 
 _register_template(
-    name="yivl",
+    name="yi_vl",
     format_user=StringFormatter(slots=["### Human: {{content}}\n### Assistant:"]),
-    format_assistant=StringFormatter(slots=[" {{content}}\n"]),
-    stop_words=["###"],
+    format_separator=EmptyFormatter(slots=["\n"]),
     default_system=(
         "This is a chat between an inquisitive human and an AI assistant. "
-        "Assume the role of the AI assistant. "
-        "Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
-        "这是一个好奇的人类和一个人工智能助手之间的对话。"
-        "假设你扮演这个AI助手的角色。仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n"
+        "Assume the role of the AI assistant. Read all the images carefully, "
+        "and respond to the human's questions with informative, helpful, detailed and polite answers. "
+        "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。"
+        "仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n"
     ),
+    stop_words=["###"],
 )
 
 
From c4f5e49d0dfcca62de1b35535cb5ed24c244d3ba Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 15:37:07 +0800
Subject: [PATCH 301/341] Update patcher.py

Former-commit-id: 4c31a21f2106adcdad100119bad83ecaef0be3f3
---
 src/llmtuner/model/patcher.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index bddea594..70aed709 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -17,7 +17,7 @@ from .utils.moe import add_z3_leaf_module, configure_moe
 from .utils.quantization import configure_quantization
 from .utils.rope import configure_rope
 from .utils.valuehead import prepare_valuehead_model
-from .utils.visual import autocast_projector_dtype, configure_hidden_size, configure_visual
+from .utils.visual import autocast_projector_dtype, configure_visual_model
 
 
 if TYPE_CHECKING:
@@ -54,8 +54,7 @@ def patch_config(
     configure_longlora(config, model_args, is_trainable)
     configure_quantization(config, tokenizer, model_args, init_kwargs)
     configure_moe(config, model_args, is_trainable)
-    configure_hidden_size(config)
-    configure_visual(config, model_args)
+    configure_visual_model(config)
 
     if model_args.use_cache and not is_trainable:
         setattr(config, "use_cache", True)

From d4d36e157c8c3fd08a12ee4889edf57e13cf4335 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 16:32:28 +0800
Subject: [PATCH 302/341] fix fsdp model loading

Former-commit-id: fc6fe23cc9ae4a920a17e8268a85c1aa4ad16d3b
---
 src/llmtuner/model/patcher.py            | 3 ++-
 src/llmtuner/model/utils/quantization.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index b28a23d0..8625f3e1 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -6,6 +6,7 @@ import torch
 from peft import PeftModel
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
 from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.modeling_utils import is_fsdp_enabled
 
 from ..extras.logging import get_logger
 from ..extras.misc import infer_optim_dtype
@@ -69,7 +70,7 @@ def patch_config(
         setattr(config, "use_cache", False)  # qwen2 does not support use_cache when using flashattn
 
     init_kwargs["torch_dtype"] = model_args.compute_dtype
-    if not is_deepspeed_zero3_enabled():
+    if not is_deepspeed_zero3_enabled() and not is_fsdp_enabled():
         init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage
         if init_kwargs["low_cpu_mem_usage"]:
             if "device_map" not in init_kwargs and model_args.device_map:
diff --git a/src/llmtuner/model/utils/quantization.py b/src/llmtuner/model/utils/quantization.py
index 3cf159c1..95412e7c 100644
--- a/src/llmtuner/model/utils/quantization.py
+++ b/src/llmtuner/model/utils/quantization.py
@@ -7,6 +7,7 @@ import torch
 from datasets import load_dataset
 from transformers import BitsAndBytesConfig, GPTQConfig
 from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.modeling_utils import is_fsdp_enabled
 from transformers.utils.versions import require_version
 
 from ...extras.constants import FILEEXT2TYPE
@@ -133,7 +134,7 @@ def configure_quantization(
                 bnb_4bit_quant_storage=model_args.compute_dtype,  # crucial for fsdp qlora
             )
 
-        if is_deepspeed_zero3_enabled() or model_args.quantization_device_map == "auto":
+        if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto":
             if model_args.quantization_bit != 4:
                 raise ValueError("Only 4-bit quantized model can use auto device map.")
 

From 36a4224bf584a3f002a5b1c2aa1ea19d28c6990b Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 16:39:57 +0800
Subject: [PATCH 303/341] Update visual.py

Former-commit-id: f5f13a995c64fc374ad05e26cde8efa6651aefa1
---
 src/llmtuner/model/utils/visual.py | 53 ++++++++++++++----------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index 1f770861..9a5134ff 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -1,8 +1,8 @@
 from typing import TYPE_CHECKING, Tuple
 
 import torch
-import transformers
-from torch import nn
+import transformers.models
+from transformers.activations import ACT2FN
 
 from ...extras.logging import get_logger
 
@@ -16,9 +16,23 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
-def configure_hidden_size(config: "PretrainedConfig") -> None:
-    if getattr(config, "model_type", None) == "llava":
-        setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))
+class LlavaMultiModalProjector(torch.nn.Module):
+    def __init__(self, config: "LlavaConfig"):
+        super().__init__()
+
+        self.linear_1 = torch.nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
+        self.linear_3 = torch.nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_4 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
+        self.act = ACT2FN[config.projector_hidden_act]
+
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.linear_2(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_3(hidden_states)
+        hidden_states = self.linear_4(hidden_states)
+        return hidden_states
 
 
 def autocast_projector_dtype(
@@ -35,28 +49,9 @@ def autocast_projector_dtype(
         mm_projector.register_forward_hook(_mm_projector_forward_post_hook)
 
 
-class LlavaMultiModalProjectorYiVL(nn.Module):
-    def __init__(self, config: "LlavaConfig"):
-        super().__init__()
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
-        self.linear_2 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
-        self.linear_3 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
-        self.linear_4 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
-        self.act = nn.GELU()
+def configure_visual_model(config: "PretrainedConfig") -> None:
+    if getattr(config, "model_type", None) == "llava":
+        setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))
 
-    def forward(self, image_features):
-        dtype_ = self.linear_1.weight.dtype
-        hidden_states = self.linear_1(image_features)
-        hidden_states = self.linear_2(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.linear_3(hidden_states)
-        hidden_states = self.linear_4(hidden_states)
-        hidden_states = hidden_states.to(dtype_)
-        return hidden_states
-
-
-def configure_visual(config: "PretrainedConfig", model_args: "ModelArguments") -> None:
-    logger = get_logger(__name__)
-    if model_args.visual_inputs and "Yi" in getattr(config.text_config, "_name_or_path", None):
-        transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorYiVL
-        logger.info("Patched Multimodal Projector for Yi-VL.")
+        if getattr(config, "is_yi_vl_derived_model", None):
+            transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjector

From 096677b989d54431af5875c8e5d4e17649cd66b4 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 19:20:11 +0800
Subject: [PATCH 304/341] add NPU docker images

Former-commit-id: 3b3257962c52f5d1f15ce245fee402c5baddb774
---
 README.md    | 7 ++++++-
 README_zh.md | 5 +++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 826512c6..0643e8ef 100644
--- a/README.md
+++ b/README.md
@@ -342,7 +342,7 @@ To enable FlashAttention-2 on the Windows platform, you need to install the prec
 
 <details><summary>For Ascend NPU users</summary>
 
-To utilize Ascend NPU devices for (distributed) training and inference, you need to install the **[torch-npu](https://gitee.com/ascend/pytorch)** package and the **[Ascend CANN Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**.
+To utilize Ascend NPU devices for (distributed) training and inference, you need to install the **[torch-npu](https://gitee.com/ascend/pytorch)** library and the **[Ascend CANN Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**.
 
 | Requirement  | Minimum | Recommend |
 | ------------ | ------- | --------- |
@@ -351,6 +351,11 @@ To utilize Ascend NPU devices for (distributed) training and inference, you need
 | torch-npu    | 2.2.0   | 2.2.0     |
 | deepspeed    | 0.13.2  | 0.13.2    |
 
+Docker image:
+
+- 32GB: [Download page](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html)
+- 64GB: Coming soon
+
 Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use.
 
 If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations.
diff --git a/README_zh.md b/README_zh.md
index d41ff13a..47f7e111 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -351,6 +351,11 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 | torch-npu    | 2.2.0   | 2.2.0     |
 | deepspeed    | 0.13.2  | 0.13.2    |
 
+Docker 镜像：
+
+- 32GB：[下载地址](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html)
+- 64GB：敬请期待
+
 请记得使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定您使用的设备。
 
 如果遇到无法正常推理的情况，请尝试设置 `do_sample: false`。

From 40e3d3fbddde1ced4865c79f038c873dd703560c Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 19:25:48 +0800
Subject: [PATCH 305/341] fix yi vl vllm infer

Former-commit-id: de54e5d7ec06dd7c20ec82c9ff032fc16cd50244
---
 src/llmtuner/chat/vllm_engine.py    | 19 +++++++++++----
 src/llmtuner/data/template.py       |  2 +-
 src/llmtuner/hparams/parser.py      |  2 +-
 src/llmtuner/model/utils/visual.py  | 37 +++++++++++++++++++++++++----
 src/llmtuner/train/dpo/trainer.py   | 10 +++++++-
 src/llmtuner/train/dpo/workflow.py  |  2 +-
 src/llmtuner/train/orpo/trainer.py  | 10 +++++++-
 src/llmtuner/train/orpo/workflow.py |  2 +-
 src/llmtuner/train/ppo/trainer.py   | 16 ++++++++++++-
 src/llmtuner/train/ppo/workflow.py  |  2 +-
 src/llmtuner/train/pt/trainer.py    | 14 +++++++++--
 src/llmtuner/train/pt/workflow.py   |  2 +-
 src/llmtuner/train/rm/trainer.py    | 13 ++++++++--
 src/llmtuner/train/rm/workflow.py   |  2 +-
 src/llmtuner/webui/runner.py        |  3 +++
 src/llmtuner/webui/utils.py         |  1 -
 16 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index aaaad2f1..8d602655 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -2,9 +2,11 @@ import uuid
 from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence
 
 from ..data import get_template_and_fix_tokenizer
+from ..extras.logging import get_logger
 from ..extras.misc import get_device_count, infer_optim_dtype
 from ..extras.packages import is_vllm_available
 from ..model import load_config, load_tokenizer
+from ..model.utils.visual import LlavaMultiModalProjectorForYiVLForVLLM
 from .base_engine import BaseEngine, Response
 
 
@@ -22,6 +24,9 @@ if TYPE_CHECKING:
     from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
 
 
+logger = get_logger(__name__)
+
+
 class VllmEngine(BaseEngine):
     def __init__(
         self,
@@ -57,13 +62,19 @@ class VllmEngine(BaseEngine):
         }
 
         if model_args.visual_inputs:
-            # TODO: auto derive from config
-            # https://github.com/vllm-project/vllm/pull/3042#issuecomment-1984893549
-            self.image_feature_size = 576
+            image_size = config.vision_config.image_size
+            patch_size = config.vision_config.patch_size
+            self.image_feature_size = (image_size // patch_size) ** 2
             engine_args["image_input_type"] = "pixel_values"
             engine_args["image_token_id"] = self.tokenizer.convert_tokens_to_ids("<image>")
-            engine_args["image_input_shape"] = "1,3,336,336"
+            engine_args["image_input_shape"] = "1,3,{},{}".format(image_size, image_size)
             engine_args["image_feature_size"] = self.image_feature_size
+            if getattr(config, "is_yi_vl_derived_model", None):
+                # bug in vllm 0.4.2, see: https://github.com/vllm-project/vllm/pull/4828
+                import vllm.model_executor.models.llava
+
+                logger.info("Detected Yi-VL model, applying projector patch.")
+                vllm.model_executor.models.llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVLForVLLM
 
         self.model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args))
         if model_args.adapter_name_or_path is not None:
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index b20c9203..b7a34b59 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -865,7 +865,7 @@ _register_template(
         "Assume the role of the AI assistant. Read all the images carefully, "
         "and respond to the human's questions with informative, helpful, detailed and polite answers. "
         "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。"
-        "仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n"
+        "仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n"
     ),
     stop_words=["###"],
 )
diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py
index 7fdd3234..20f9a003 100644
--- a/src/llmtuner/hparams/parser.py
+++ b/src/llmtuner/hparams/parser.py
@@ -285,7 +285,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     model_args.model_max_length = data_args.cutoff_len
     data_args.packing = data_args.packing if data_args.packing is not None else finetuning_args.stage == "pt"
 
-    # Log on each process the small summary:
+    # Log on each process the small summary
     logger.info(
         "Process rank: {}, device: {}, n_gpu: {}, distributed training: {}, compute dtype: {}".format(
             training_args.local_rank,
diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index 9a5134ff..33fb394d 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -16,25 +16,51 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
-class LlavaMultiModalProjector(torch.nn.Module):
-    def __init__(self, config: "LlavaConfig"):
+class LlavaMultiModalProjectorForYiVL(torch.nn.Module):
+    def __init__(self, config: "LlavaConfig") -> None:
         super().__init__()
 
+        self.config = config
+        if config is None:
+            return
+
         self.linear_1 = torch.nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
         self.linear_2 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
         self.linear_3 = torch.nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
         self.linear_4 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
         self.act = ACT2FN[config.projector_hidden_act]
 
-    def forward(self, image_features):
+    def forward(self, image_features: "torch.Tensor") -> "torch.Tensor":
         hidden_states = self.linear_1(image_features)
         hidden_states = self.linear_2(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states = self.linear_3(hidden_states)
         hidden_states = self.linear_4(hidden_states)
+        if hidden_states.dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.linear_1.weight.dtype
+
+            logger.warning_once("The hidden states seems to be silently casted in float32.")
+            hidden_states = hidden_states.to(target_dtype)
+
         return hidden_states
 
 
+class LlavaMultiModalProjectorForYiVLForVLLM(LlavaMultiModalProjectorForYiVL):
+    def __init__(self, vision_hidden_size: int, text_hidden_size: int, projector_hidden_act: str) -> None:
+        super().__init__(config=None)
+
+        self.linear_1 = torch.nn.Linear(vision_hidden_size, text_hidden_size, bias=True)
+        self.linear_2 = torch.nn.LayerNorm(text_hidden_size, bias=True)
+        self.linear_3 = torch.nn.Linear(text_hidden_size, text_hidden_size, bias=True)
+        self.linear_4 = torch.nn.LayerNorm(text_hidden_size, bias=True)
+        self.act = torch.nn.GELU()
+
+
 def autocast_projector_dtype(
     model: "PreTrainedModel", model_args: "ModelArguments", mm_projector_name: str = "multi_modal_projector"
 ) -> None:
@@ -53,5 +79,6 @@ def configure_visual_model(config: "PretrainedConfig") -> None:
     if getattr(config, "model_type", None) == "llava":
         setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))
 
-        if getattr(config, "is_yi_vl_derived_model", None):
-            transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjector
+    if getattr(config, "is_yi_vl_derived_model", None):
+        logger.info("Detected Yi-VL model, applying projector patch.")
+        transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVL
diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index b144d561..3c0b0276 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -13,7 +13,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel
+    from transformers import PreTrainedModel, ProcessorMixin
 
     from ...hparams import FinetuningArguments
 
@@ -24,6 +24,7 @@ class CustomDPOTrainer(DPOTrainer):
         model: Union["PreTrainedModel", torch.nn.Module],
         ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]],
         finetuning_args: "FinetuningArguments",
+        processor: Optional["ProcessorMixin"],
         disable_dropout: bool = True,
         **kwargs,
     ):
@@ -33,6 +34,7 @@ class CustomDPOTrainer(DPOTrainer):
                 disable_dropout_in_model(ref_model)
 
         self.finetuning_args = finetuning_args
+        self.processor = processor
         self.reference_free = False
         self.use_dpo_data_collator = True  # hack to avoid warning
         self.generate_during_eval = False  # disable at evaluation
@@ -80,6 +82,12 @@ class CustomDPOTrainer(DPOTrainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
 
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
+
     def sft_loss(self, chosen_logits: "torch.FloatTensor", chosen_labels: "torch.LongTensor") -> "torch.Tensor":
         r"""
         Computes supervised cross-entropy loss of given labels under the given logits.
diff --git a/src/llmtuner/train/dpo/workflow.py b/src/llmtuner/train/dpo/workflow.py
index b19a643e..8ac4952a 100644
--- a/src/llmtuner/train/dpo/workflow.py
+++ b/src/llmtuner/train/dpo/workflow.py
@@ -50,9 +50,9 @@ def run_dpo(
         ref_model=ref_model,
         args=training_args,
         finetuning_args=finetuning_args,
-        tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
+        **tokenizer_module,
         **split_dataset(dataset, data_args, training_args),
     )
 
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index 88090a9e..1b743647 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -13,7 +13,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel
+    from transformers import PreTrainedModel, ProcessorMixin
 
     from ...hparams import FinetuningArguments
 
@@ -23,6 +23,7 @@ class CustomORPOTrainer(DPOTrainer):
         self,
         model: Union["PreTrainedModel", "torch.nn.Module"],
         finetuning_args: "FinetuningArguments",
+        processor: Optional["ProcessorMixin"],
         disable_dropout: bool = True,
         **kwargs,
     ):
@@ -30,6 +31,7 @@ class CustomORPOTrainer(DPOTrainer):
             disable_dropout_in_model(model)
 
         self.finetuning_args = finetuning_args
+        self.processor = processor
         self.reference_free = False
         self.use_dpo_data_collator = True  # hack to avoid warning
         self.generate_during_eval = False  # disable at evaluation
@@ -61,6 +63,12 @@ class CustomORPOTrainer(DPOTrainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
 
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
+
     def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor":
         r"""
         Computes ORPO's odds ratio (OR) loss.
diff --git a/src/llmtuner/train/orpo/workflow.py b/src/llmtuner/train/orpo/workflow.py
index 9c870096..6ea18dae 100644
--- a/src/llmtuner/train/orpo/workflow.py
+++ b/src/llmtuner/train/orpo/workflow.py
@@ -43,9 +43,9 @@ def run_orpo(
         model=model,
         args=training_args,
         finetuning_args=finetuning_args,
-        tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
+        **tokenizer_module,
         **split_dataset(dataset, data_args, training_args),
     )
 
diff --git a/src/llmtuner/train/ppo/trainer.py b/src/llmtuner/train/ppo/trainer.py
index ef769968..985664b7 100644
--- a/src/llmtuner/train/ppo/trainer.py
+++ b/src/llmtuner/train/ppo/trainer.py
@@ -23,7 +23,13 @@ from .utils import dump_layernorm, get_rewards_from_server, replace_model, resto
 
 if TYPE_CHECKING:
     from datasets import Dataset
-    from transformers import DataCollatorWithPadding, PreTrainedTokenizer, Seq2SeqTrainingArguments, TrainerCallback
+    from transformers import (
+        DataCollatorWithPadding,
+        PreTrainedTokenizer,
+        ProcessorMixin,
+        Seq2SeqTrainingArguments,
+        TrainerCallback,
+    )
     from trl import AutoModelForCausalLMWithValueHead
 
     from ...hparams import FinetuningArguments, GeneratingArguments, ModelArguments
@@ -48,6 +54,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         reward_model: Optional["AutoModelForCausalLMWithValueHead"],
         ref_model: Optional["AutoModelForCausalLMWithValueHead"],
         tokenizer: "PreTrainedTokenizer",
+        processor: Optional["ProcessorMixin"],
         dataset: "Dataset",
         data_collator: "DataCollatorWithPadding",
     ):
@@ -97,6 +104,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         self.finetuning_args = finetuning_args
         self.reward_model = reward_model
         self.current_device = get_current_device()  # patch for deepspeed training
+        self.processor = processor
 
         self.generation_config = GenerationConfig(
             pad_token_id=self.tokenizer.pad_token_id,
@@ -295,6 +303,12 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         )
         return lr_scheduler
 
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
+
     @torch.no_grad()
     def get_inputs(self, batch: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
         r"""
diff --git a/src/llmtuner/train/ppo/workflow.py b/src/llmtuner/train/ppo/workflow.py
index 8cd15932..4383bcdc 100644
--- a/src/llmtuner/train/ppo/workflow.py
+++ b/src/llmtuner/train/ppo/workflow.py
@@ -49,9 +49,9 @@ def run_ppo(
         model=model,
         reward_model=reward_model,
         ref_model=ref_model,
-        tokenizer=tokenizer,
         dataset=dataset,
         data_collator=data_collator,
+        **tokenizer_module,
     )
 
     # Training
diff --git a/src/llmtuner/train/pt/trainer.py b/src/llmtuner/train/pt/trainer.py
index 969ebf04..b7b80f88 100644
--- a/src/llmtuner/train/pt/trainer.py
+++ b/src/llmtuner/train/pt/trainer.py
@@ -1,5 +1,5 @@
 from types import MethodType
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Dict, Optional
 
 from transformers import Trainer
 
@@ -9,6 +9,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler
 
 if TYPE_CHECKING:
     import torch
+    from transformers import ProcessorMixin
 
     from ...hparams import FinetuningArguments
 
@@ -21,9 +22,12 @@ class CustomTrainer(Trainer):
     Inherits Trainer for custom optimizer.
     """
 
-    def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
+    def __init__(
+        self, finetuning_args: "FinetuningArguments", processor: Optional["ProcessorMixin"], **kwargs
+    ) -> None:
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
+        self.processor = processor
         if finetuning_args.use_badam:
             from badam import clip_grad_norm_for_sparse_tensor
 
@@ -39,3 +43,9 @@ class CustomTrainer(Trainer):
     ) -> "torch.optim.lr_scheduler.LRScheduler":
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
+
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
diff --git a/src/llmtuner/train/pt/workflow.py b/src/llmtuner/train/pt/workflow.py
index 3b127da4..9f945901 100644
--- a/src/llmtuner/train/pt/workflow.py
+++ b/src/llmtuner/train/pt/workflow.py
@@ -36,9 +36,9 @@ def run_pt(
         model=model,
         args=training_args,
         finetuning_args=finetuning_args,
-        tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
+        **tokenizer_module,
         **split_dataset(dataset, data_args, training_args),
     )
 
diff --git a/src/llmtuner/train/rm/trainer.py b/src/llmtuner/train/rm/trainer.py
index 0f5d88d3..d49dd67b 100644
--- a/src/llmtuner/train/rm/trainer.py
+++ b/src/llmtuner/train/rm/trainer.py
@@ -11,7 +11,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
-    from transformers.modeling_utils import PreTrainedModel
+    from transformers import PreTrainedModel, ProcessorMixin
     from transformers.trainer import PredictionOutput
 
     from ...hparams import FinetuningArguments
@@ -25,9 +25,12 @@ class PairwiseTrainer(Trainer):
     Inherits Trainer to compute pairwise loss.
     """
 
-    def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
+    def __init__(
+        self, finetuning_args: "FinetuningArguments", processor: Optional["ProcessorMixin"], **kwargs
+    ) -> None:
         super().__init__(**kwargs)
         self.finetuning_args = finetuning_args
+        self.processor = processor
         self.can_return_loss = True  # override property to return eval_loss
         if finetuning_args.use_badam:
             from badam import clip_grad_norm_for_sparse_tensor
@@ -45,6 +48,12 @@ class PairwiseTrainer(Trainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
 
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
+
     def compute_loss(
         self, model: "PreTrainedModel", inputs: Dict[str, torch.Tensor], return_outputs: bool = False
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]:
diff --git a/src/llmtuner/train/rm/workflow.py b/src/llmtuner/train/rm/workflow.py
index bd0a756c..621d03b7 100644
--- a/src/llmtuner/train/rm/workflow.py
+++ b/src/llmtuner/train/rm/workflow.py
@@ -39,10 +39,10 @@ def run_rm(
         model=model,
         args=training_args,
         finetuning_args=finetuning_args,
-        tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks + [FixValueHeadModelCallback()],
         compute_metrics=compute_accuracy,
+        **tokenizer_module,
         **split_dataset(dataset, data_args, training_args),
     )
 
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 59515a62..168abd86 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -107,6 +107,7 @@ class Runner:
             model_name_or_path=get("top.model_path"),
             adapter_name_or_path=adapter_name_or_path,
             cache_dir=user_config.get("cache_dir", None),
+            preprocessing_num_workers=16,
             finetuning_type=get("top.finetuning_type"),
             quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None,
             template=get("top.template"),
@@ -141,6 +142,7 @@ class Runner:
             fp16=(get("train.compute_type") == "fp16"),
             bf16=(get("train.compute_type") == "bf16"),
             pure_bf16=(get("train.compute_type") == "pure_bf16"),
+            plot_loss=True,
         )
 
         if args["finetuning_type"] == "freeze":
@@ -214,6 +216,7 @@ class Runner:
             model_name_or_path=get("top.model_path"),
             adapter_name_or_path=adapter_name_or_path,
             cache_dir=user_config.get("cache_dir", None),
+            preprocessing_num_workers=16,
             finetuning_type=get("top.finetuning_type"),
             quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None,
             template=get("top.template"),
diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py
index 1f2b0591..3d34f0d2 100644
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -42,7 +42,6 @@ def clean_cmd(args: Dict[str, Any]) -> Dict[str, Any]:
 
 
 def gen_cmd(args: Dict[str, Any]) -> str:
-    args["plot_loss"] = args.get("do_train", None)
     current_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
     cmd_lines = ["CUDA_VISIBLE_DEVICES={} llamafactory-cli train ".format(current_devices)]
     for k, v in clean_cmd(args).items():

From 4d7f0fbb7a5528144cdc650a9d56fa2e48a0a8b8 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 20:02:41 +0800
Subject: [PATCH 306/341] add yi-vl 6b model

Former-commit-id: 35f4041b13a593a6cf1ec6686fa18b38911ad6a4
---
 README.md                        |  1 +
 README_zh.md                     |  1 +
 src/llmtuner/data/template.py    |  1 +
 src/llmtuner/extras/constants.py | 19 +++++++++++++++++--
 src/llmtuner/webui/common.py     |  4 ++--
 5 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 0643e8ef..d613fcd1 100644
--- a/README.md
+++ b/README.md
@@ -166,6 +166,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                        | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                       | q_proj,v_proj     | xverse    |
 | [Yi (1/1.5)](https://huggingface.co/01-ai)               | 6B/9B/34B                        | q_proj,v_proj     | yi        |
+| [Yi-VL](https://huggingface.co/01-ai)                    | 6B                               | q_proj,v_proj     | yi_vl     |
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
diff --git a/README_zh.md b/README_zh.md
index 47f7e111..0d91b2f4 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -166,6 +166,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                        | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                       | q_proj,v_proj     | xverse    |
 | [Yi (1/1.5)](https://huggingface.co/01-ai)               | 6B/9B/34B                        | q_proj,v_proj     | yi        |
+| [Yi-VL](https://huggingface.co/01-ai)                    | 6B                               | q_proj,v_proj     | yi_vl     |
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index b7a34b59..66f6f651 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -868,6 +868,7 @@ _register_template(
         "仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n"
     ),
     stop_words=["###"],
+    efficient_eos=True,
 )
 
 
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index ff52f29a..c427411a 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -26,8 +26,6 @@ LAYERNORM_NAMES = {"norm", "ln"}
 
 METHODS = ["full", "freeze", "lora"]
 
-MLLM_LIST = ["LLaVA1.5"]
-
 MOD_SUPPORTED_MODELS = ["bloom", "falcon", "gemma", "llama", "mistral", "mixtral", "phi", "starcoder2"]
 
 PEFT_METHODS = ["lora"]
@@ -59,6 +57,8 @@ V_HEAD_WEIGHTS_NAME = "value_head.bin"
 
 V_HEAD_SAFE_WEIGHTS_NAME = "value_head.safetensors"
 
+VISION_MODELS = set()
+
 
 class DownloadSource(str, Enum):
     DEFAULT = "hf"
@@ -69,6 +69,7 @@ def register_model_group(
     models: Dict[str, Dict[DownloadSource, str]],
     module: Optional[str] = None,
     template: Optional[str] = None,
+    vision: bool = False,
 ) -> None:
     prefix = None
     for name, path in models.items():
@@ -81,6 +82,8 @@ def register_model_group(
         DEFAULT_MODULE[prefix] = module
     if template is not None:
         DEFAULT_TEMPLATE[prefix] = template
+    if vision:
+        VISION_MODELS.add(prefix)
 
 
 register_model_group(
@@ -599,6 +602,7 @@ register_model_group(
         },
     },
     template="vicuna",
+    vision=True,
 )
 
 
@@ -1206,6 +1210,17 @@ register_model_group(
 )
 
 
+register_model_group(
+    models={
+        "YiVL-6B-Chat": {
+            DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-6B-hf",
+        },
+    },
+    template="yi_vl",
+    vision=True,
+)
+
+
 register_model_group(
     models={
         "Yuan2-2B-Chat": {
diff --git a/src/llmtuner/webui/common.py b/src/llmtuner/webui/common.py
index d569f1fa..c63e9d74 100644
--- a/src/llmtuner/webui/common.py
+++ b/src/llmtuner/webui/common.py
@@ -10,11 +10,11 @@ from ..extras.constants import (
     DATA_CONFIG,
     DEFAULT_MODULE,
     DEFAULT_TEMPLATE,
-    MLLM_LIST,
     PEFT_METHODS,
     STAGES_USE_PAIR_DATA,
     SUPPORTED_MODELS,
     TRAINING_STAGES,
+    VISION_MODELS,
     DownloadSource,
 )
 from ..extras.logging import get_logger
@@ -112,7 +112,7 @@ def get_template(model_name: str) -> str:
 
 
 def get_visual(model_name: str) -> bool:
-    return get_prefix(model_name) in MLLM_LIST
+    return get_prefix(model_name) in VISION_MODELS
 
 
 def list_adapters(model_name: str, finetuning_type: str) -> "gr.Dropdown":

From 2ac972d6e7a4266d1b5163db2bc02d844542ea7f Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 22:58:19 +0800
Subject: [PATCH 307/341] add Yi-VL-34B model

Former-commit-id: 8b3d8a7e3bd8dff27cc72edba1b8a042f6d1929c
---
 README.md                          |  2 +-
 README_zh.md                       |  2 +-
 src/llmtuner/extras/constants.py   |  3 +++
 src/llmtuner/model/loader.py       | 11 +++++++++--
 src/llmtuner/model/utils/visual.py |  2 +-
 5 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index d613fcd1..11700e93 100644
--- a/README.md
+++ b/README.md
@@ -166,7 +166,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                        | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                       | q_proj,v_proj     | xverse    |
 | [Yi (1/1.5)](https://huggingface.co/01-ai)               | 6B/9B/34B                        | q_proj,v_proj     | yi        |
-| [Yi-VL](https://huggingface.co/01-ai)                    | 6B                               | q_proj,v_proj     | yi_vl     |
+| [Yi-VL](https://huggingface.co/01-ai)                    | 6B/34B                           | q_proj,v_proj     | yi_vl     |
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
diff --git a/README_zh.md b/README_zh.md
index 0d91b2f4..146d046d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -166,7 +166,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [StarCoder2](https://huggingface.co/bigcode)             | 3B/7B/15B                        | q_proj,v_proj     | -         |
 | [XVERSE](https://huggingface.co/xverse)                  | 7B/13B/65B                       | q_proj,v_proj     | xverse    |
 | [Yi (1/1.5)](https://huggingface.co/01-ai)               | 6B/9B/34B                        | q_proj,v_proj     | yi        |
-| [Yi-VL](https://huggingface.co/01-ai)                    | 6B                               | q_proj,v_proj     | yi_vl     |
+| [Yi-VL](https://huggingface.co/01-ai)                    | 6B/34B                           | q_proj,v_proj     | yi_vl     |
 | [Yuan](https://huggingface.co/IEITYuan)                  | 2B/51B/102B                      | q_proj,v_proj     | yuan      |
 
 > [!NOTE]
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index c427411a..f1ee55a0 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -1215,6 +1215,9 @@ register_model_group(
         "YiVL-6B-Chat": {
             DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-6B-hf",
         },
+        "YiVL-34B-Chat": {
+            DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-34B-hf",
+        },
     },
     template="yi_vl",
     vision=True,
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index ea55de27..08cdf17f 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -78,8 +78,15 @@ def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule":
     patch_tokenizer(tokenizer)
 
     if model_args.visual_inputs:
-        processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs)
-        setattr(processor, "tokenizer", tokenizer)
+        try:
+            processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs)
+            setattr(processor, "tokenizer", tokenizer)
+        except Exception:
+            raise ValueError(
+                "This multimodal LLM is not supported.\n"
+                "Download LLaVA-1.5 models from: https://huggingface.co/llava-hf\n"
+                "Download Yi-VL models from: https://huggingface.co/BUAADreamer"
+            )
     else:
         processor = None
 
diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index 33fb394d..50f92d22 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -58,7 +58,7 @@ class LlavaMultiModalProjectorForYiVLForVLLM(LlavaMultiModalProjectorForYiVL):
         self.linear_2 = torch.nn.LayerNorm(text_hidden_size, bias=True)
         self.linear_3 = torch.nn.Linear(text_hidden_size, text_hidden_size, bias=True)
         self.linear_4 = torch.nn.LayerNorm(text_hidden_size, bias=True)
-        self.act = torch.nn.GELU()
+        self.act = ACT2FN[projector_hidden_act]
 
 
 def autocast_projector_dtype(

From 437cc20be6b1d149a9978466ff305c9a39931b50 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Wed, 15 May 2024 23:05:02 +0800
Subject: [PATCH 308/341] fix #3606

https://github.com/huggingface/peft/pull/1706

Former-commit-id: bf2783e1b6bc207375974c48736d6f82dd293f02
---
 src/llmtuner/model/adapter.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 0ffb91c1..83f9a2d2 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -2,7 +2,8 @@ from typing import TYPE_CHECKING
 
 import torch
 from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model
-from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.integrations import deepspeed_config, is_deepspeed_zero3_enabled
+from transformers.modeling_utils import is_fsdp_enabled
 
 from ..extras.logging import get_logger
 from .utils.misc import find_all_linear_modules, find_expanded_modules
@@ -41,9 +42,16 @@ def init_adapter(
     if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None):
         raise ValueError("You can only use lora for quantized models.")
 
+    if deepspeed_config() is not None or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam:
+        logger.info("DeepSpeed/FSDP/PureBF16/BAdam detected, remaining trainable params in half precision.")
+        cast_trainable_params_to_fp32 = False
+    else:
+        logger.info("Upcasting trainable params to float32.")
+        cast_trainable_params_to_fp32 = True
+
     if finetuning_args.finetuning_type == "full" and is_trainable:
         logger.info("Fine-tuning method: Full")
-        if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
+        if cast_trainable_params_to_fp32:
             model = model.float()
 
         if model_args.visual_inputs and hasattr(model, "vision_tower"):  # freeze vision model
@@ -93,7 +101,7 @@ def init_adapter(
 
         for name, param in model.named_parameters():
             if any(trainable_layer in name for trainable_layer in trainable_layers):
-                if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
+                if cast_trainable_params_to_fp32:
                     param.data = param.data.to(torch.float32)
             else:
                 param.requires_grad_(False)
@@ -191,7 +199,7 @@ def init_adapter(
                 )
                 model = get_peft_model(model, lora_config)
 
-        if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam):
+        if cast_trainable_params_to_fp32:
             for param in filter(lambda p: p.requires_grad, model.parameters()):
                 param.data = param.data.to(torch.float32)
 

From 538c79fd8fec4a231e86caa393f7f244105fc43d Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 16 May 2024 00:35:28 +0800
Subject: [PATCH 309/341] fix #3694

Former-commit-id: 3d1b818cb6a77b7603724fbeb756b468aa74e7ea
---
 .../extras/llama_pro/llama3_freeze_sft.yaml   |   4 +-
 scripts/llama_pro.py                          |   5 +-
 src/llmtuner/hparams/finetuning_args.py       | 110 ++++++++++--------
 src/llmtuner/model/adapter.py                 |  42 ++++---
 src/llmtuner/webui/components/train.py        |  12 +-
 src/llmtuner/webui/locales.py                 |  32 ++++-
 src/llmtuner/webui/runner.py                  |   5 +-
 7 files changed, 133 insertions(+), 77 deletions(-)

diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
index 4d92cdad..0ffcb5e8 100644
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -5,8 +5,8 @@ model_name_or_path: models/llama3-8b-instruct-pro
 stage: sft
 do_train: true
 finetuning_type: freeze
-name_module_trainable: all
-num_layer_trainable: 8
+freeze_trainable_layers: 8
+freeze_trainable_modules: all
 use_llama_pro: true
 
 # dataset
diff --git a/scripts/llama_pro.py b/scripts/llama_pro.py
index 8a4294a2..997b3496 100644
--- a/scripts/llama_pro.py
+++ b/scripts/llama_pro.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Performs block expansion for LLaMA, Mistral or Qwen1.5 models.
+# Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models.
 # Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8
 # Inspired by: https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py
 
@@ -106,8 +106,7 @@ def block_expansion(
     print("Fine-tune this model with:")
     print("  --model_name_or_path {} \\".format(output_dir))
     print("  --finetuning_type freeze \\")
-    print("  --name_module_trainable all \\")
-    print("  --num_layer_trainable {} \\".format(num_expand))
+    print("  --freeze_trainable_layers {} \\".format(num_expand))
     print("  --use_llama_pro")
 
 
diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index 03bf52af..e728c30a 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -1,5 +1,4 @@
-import json
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from typing import Literal, Optional
 
 
@@ -9,22 +8,40 @@ class FreezeArguments:
     Arguments pertaining to the freeze (partial-parameter) training.
     """
 
-    name_module_trainable: str = field(
-        default="all",
+    freeze_trainable_layers: int = field(
+        default=2,
         metadata={
-            "help": """Name of trainable modules for partial-parameter (freeze) fine-tuning. \
-                    Use commas to separate multiple modules. \
-                    Use "all" to specify all the available modules. \
-                    LLaMA choices: ["mlp", "self_attn"], \
-                    BLOOM & Falcon & ChatGLM choices: ["mlp", "self_attention"], \
-                    Qwen choices: ["mlp", "attn"], \
-                    InternLM2 choices: ["feed_forward", "attention"], \
-                    Others choices: the same as LLaMA."""
+            "help": (
+                "The number of trainable layers for freeze (partial-parameter) fine-tuning. "
+                "Positive numbers mean the last n layers are set as trainable, "
+                "negative numbers mean the first n layers are set as trainable."
+            )
         },
     )
-    num_layer_trainable: int = field(
-        default=2,
-        metadata={"help": "The number of trainable layers for partial-parameter (freeze) fine-tuning."},
+    freeze_trainable_modules: str = field(
+        default="all",
+        metadata={
+            "help": (
+                "Name(s) of trainable modules for freeze (partial-parameter) fine-tuning. "
+                "Use commas to separate multiple modules. "
+                "Use `all` to specify all the available modules. "
+                "LLaMA choices: [`mlp`, `self_attn`], "
+                "BLOOM & Falcon & ChatGLM choices: [`mlp`, `self_attention`], "
+                "Qwen choices: [`mlp`, `attn`], "
+                "InternLM2 choices: [`feed_forward`, `attention`], "
+                "Others choices: the same as LLaMA."
+            )
+        },
+    )
+    freeze_extra_modules: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Name(s) of modules apart from hidden layers to be set as trainable "
+                "for freeze (partial-parameter) fine-tuning. "
+                "Use commas to separate multiple modules."
+            )
+        },
     )
 
 
@@ -37,7 +54,11 @@ class LoraArguments:
     additional_target: Optional[str] = field(
         default=None,
         metadata={
-            "help": "Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint."
+            "help": (
+                "Name(s) of modules apart from LoRA layers to be set as trainable "
+                "and saved in the final checkpoint. "
+                "Use commas to separate multiple modules."
+            )
         },
     )
     lora_alpha: Optional[int] = field(
@@ -55,15 +76,17 @@ class LoraArguments:
     lora_target: str = field(
         default="all",
         metadata={
-            "help": """Name(s) of target modules to apply LoRA. \
-                    Use commas to separate multiple modules. \
-                    Use "all" to specify all the linear modules. \
-                    LLaMA choices: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], \
-                    BLOOM & Falcon & ChatGLM choices: ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], \
-                    Baichuan choices: ["W_pack", "o_proj", "gate_proj", "up_proj", "down_proj"], \
-                    Qwen choices: ["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"], \
-                    InternLM2 choices: ["wqkv", "wo", "w1", "w2", "w3"], \
-                    Others choices: the same as LLaMA."""
+            "help": (
+                "Name(s) of target modules to apply LoRA. "
+                "Use commas to separate multiple modules. "
+                "Use `all` to specify all the linear modules. "
+                "LLaMA choices: [`q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, `down_proj`], "
+                "BLOOM & Falcon & ChatGLM choices: [`query_key_value`, `dense`, `dense_h_to_4h`, `dense_4h_to_h`], "
+                "Baichuan choices: [`W_pack`, `o_proj`, `gate_proj`, `up_proj`, `down_proj`], "
+                "Qwen choices: [`c_attn`, `attn.c_proj`, `w1`, `w2`, `mlp.c_proj`], "
+                "InternLM2 choices: [`wqkv`, `wo`, `w1`, `w2`, `w3`], "
+                "Others choices: the same as LLaMA."
+            )
         },
     )
     loraplus_lr_ratio: Optional[float] = field(
@@ -177,8 +200,10 @@ class GaloreArguments:
     galore_target: str = field(
         default="all",
         metadata={
-            "help": """Name(s) of modules to apply GaLore. Use commas to separate multiple modules. \
-                    Use "all" to specify all the linear modules."""
+            "help": (
+                "Name(s) of modules to apply GaLore. Use commas to separate multiple modules. "
+                "Use `all` to specify all the linear modules."
+            )
         },
     )
     galore_rank: int = field(
@@ -238,16 +263,20 @@ class BAdamArgument:
     badam_mask_mode: Literal["adjacent", "scatter"] = field(
         default="adjacent",
         metadata={
-            "help": """The mode of the mask for BAdam optimizer. \
-                    `adjacent` means that the trainable parameters are adjacent to each other, \
-                    `scatter` means that trainable parameters are randomly choosed from the weight."""
+            "help": (
+                "The mode of the mask for BAdam optimizer. "
+                "`adjacent` means that the trainable parameters are adjacent to each other, "
+                "`scatter` means that trainable parameters are randomly choosed from the weight."
+            )
         },
     )
     badam_verbose: int = field(
         default=0,
         metadata={
-            "help": """The verbosity level of BAdam optimizer. \
-                    0 for no print, 1 for print the block prefix, 2 for print trainable parameters"""
+            "help": (
+                "The verbosity level of BAdam optimizer. "
+                "0 for no print, 1 for print the block prefix, 2 for print trainable parameters."
+            )
         },
     )
 
@@ -285,7 +314,8 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
                 return [item.strip() for item in arg.split(",")]
             return arg
 
-        self.name_module_trainable = split_arg(self.name_module_trainable)
+        self.freeze_trainable_modules = split_arg(self.freeze_trainable_modules)
+        self.freeze_extra_modules = split_arg(self.freeze_extra_modules)
         self.lora_alpha = self.lora_alpha or self.lora_rank * 2
         self.lora_target = split_arg(self.lora_target)
         self.additional_target = split_arg(self.additional_target)
@@ -315,17 +345,3 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
 
         if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora":
             raise ValueError("`loraplus_lr_ratio` is only valid for the LoRA training.")
-
-    def save_to_json(self, json_path: str):
-        r"""Saves the content of this instance in JSON format inside `json_path`."""
-        json_string = json.dumps(asdict(self), indent=2, sort_keys=True) + "\n"
-        with open(json_path, "w", encoding="utf-8") as f:
-            f.write(json_string)
-
-    @classmethod
-    def load_from_json(cls, json_path: str):
-        r"""Creates an instance from the content of `json_path`."""
-        with open(json_path, "r", encoding="utf-8") as f:
-            text = f.read()
-
-        return cls(**json.loads(text))
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 83f9a2d2..4ae95a62 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -1,3 +1,4 @@
+import re
 from typing import TYPE_CHECKING
 
 import torch
@@ -68,37 +69,52 @@ def init_adapter(
             raise ValueError("Current model does not support freeze tuning.")
 
         if finetuning_args.use_llama_pro:
-            if num_layers % finetuning_args.num_layer_trainable != 0:
+            if num_layers % finetuning_args.freeze_trainable_layers != 0:
                 raise ValueError(
                     "`num_layers` {} should be divisible by `num_layer_trainable` {}.".format(
-                        num_layers, finetuning_args.num_layer_trainable
+                        num_layers, finetuning_args.freeze_trainable_layers
                     )
                 )
 
-            stride = num_layers // finetuning_args.num_layer_trainable
+            stride = num_layers // finetuning_args.freeze_trainable_layers
             trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride)
-        elif finetuning_args.num_layer_trainable > 0:  # fine-tuning the last n layers if num_layer_trainable > 0
-            trainable_layer_ids = range(num_layers - finetuning_args.num_layer_trainable, num_layers)
+        elif finetuning_args.freeze_trainable_layers > 0:  # fine-tuning the last n layers if num_layer_trainable > 0
+            trainable_layer_ids = range(max(0, num_layers - finetuning_args.freeze_trainable_layers), num_layers)
         else:  # fine-tuning the first n layers if num_layer_trainable < 0
-            trainable_layer_ids = range(-finetuning_args.num_layer_trainable)
+            trainable_layer_ids = range(min(-finetuning_args.freeze_trainable_layers, num_layers))
 
-        freeze_modules = {"all"}
-        for name, _ in model.named_modules():
+        hidden_modules = set()
+        non_hidden_modules = set()
+        for name, _ in model.named_parameters():
             if ".0." in name:
-                freeze_modules.add(name.split(".0.")[-1].split(".")[0])
+                hidden_modules.add(name.split(".0.")[-1].split(".")[0])
             elif ".1." in name:  # MoD starts from layer 1
-                freeze_modules.add(name.split(".1.")[-1].split(".")[0])
+                hidden_modules.add(name.split(".1.")[-1].split(".")[0])
+
+            if re.search(r"\.\d+\.", name) is None:
+                non_hidden_modules.add(name.split(".")[-2])
 
         trainable_layers = []
-        for module_name in finetuning_args.name_module_trainable:
-            if module_name not in freeze_modules:
+        for module_name in finetuning_args.freeze_trainable_modules:
+            if module_name != "all" and module_name not in hidden_modules:
                 raise ValueError(
-                    "Module {} is not found, please choose from {}".format(module_name, ", ".join(freeze_modules))
+                    "Module {} is not found, please choose from {}".format(module_name, ", ".join(hidden_modules))
                 )
 
             for idx in trainable_layer_ids:
                 trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else ""))
 
+        if finetuning_args.freeze_extra_modules:
+            for module_name in finetuning_args.freeze_extra_modules:
+                if module_name not in non_hidden_modules:
+                    raise ValueError(
+                        "Module {} is not found, please choose from {}".format(
+                            module_name, ", ".join(non_hidden_modules)
+                        )
+                    )
+
+                trainable_layers.append(module_name)
+
         for name, param in model.named_parameters():
             if any(trainable_layer in name for trainable_layer in trainable_layers):
                 if cast_trainable_params_to_fp32:
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 5cde660c..be853604 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -124,13 +124,17 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
     with gr.Accordion(open=False) as freeze_tab:
         with gr.Row():
-            num_layer_trainable = gr.Slider(minimum=1, maximum=128, value=2, step=1)
-            name_module_trainable = gr.Textbox(value="all")
+            freeze_trainable_layers = gr.Slider(minimum=-128, maximum=128, value=2, step=1)
+            freeze_trainable_modules = gr.Textbox(value="all")
+            freeze_extra_modules = gr.Textbox()
 
-    input_elems.update({num_layer_trainable, name_module_trainable})
+    input_elems.update({freeze_trainable_layers, freeze_trainable_modules, freeze_extra_modules})
     elem_dict.update(
         dict(
-            freeze_tab=freeze_tab, num_layer_trainable=num_layer_trainable, name_module_trainable=name_module_trainable
+            freeze_tab=freeze_tab,
+            freeze_trainable_layers=freeze_trainable_layers,
+            freeze_trainable_modules=freeze_trainable_modules,
+            freeze_extra_modules=freeze_extra_modules,
         )
     )
 
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index 5bf925b7..7afe6ec3 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -572,24 +572,24 @@ LOCALES = {
             "label": "部分参数微调设置",
         },
     },
-    "num_layer_trainable": {
+    "freeze_trainable_layers": {
         "en": {
             "label": "Trainable layers",
-            "info": "The number of trainable layers.",
+            "info": "Number of the last(+)/first(-) hidden layers to be set as trainable.",
         },
         "ru": {
             "label": "Обучаемые слои",
-            "info": "Количество обучаемых слоев.",
+            "info": "Количество последних (+)/первых (-) скрытых слоев, которые будут установлены как обучаемые.",
         },
         "zh": {
             "label": "可训练层数",
-            "info": "可训练模型层的数量。",
+            "info": "最末尾（+）/最前端（-）可训练隐藏层的数量。",
         },
     },
-    "name_module_trainable": {
+    "freeze_trainable_modules": {
         "en": {
             "label": "Trainable modules",
-            "info": "The name of trainable modules. Use commas to separate multiple modules.",
+            "info": "Name(s) of trainable modules. Use commas to separate multiple modules.",
         },
         "ru": {
             "label": "Обучаемые модули",
@@ -600,6 +600,26 @@ LOCALES = {
             "info": "可训练模块的名称。使用英文逗号分隔多个名称。",
         },
     },
+    "freeze_extra_modules": {
+        "en": {
+            "label": "Extra modules (optional)",
+            "info": (
+                "Name(s) of modules apart from hidden layers to be set as trainable. "
+                "Use commas to separate multiple modules."
+            ),
+        },
+        "ru": {
+            "label": "Дополнительные модули (опционально)",
+            "info": (
+                "Имена модулей, кроме скрытых слоев, которые следует установить в качестве обучаемых. "
+                "Используйте запятые для разделения нескольких модулей."
+            ),
+        },
+        "zh": {
+            "label": "额外模块（非必填）",
+            "info": "除隐藏层以外的可训练模块名称。使用英文逗号分隔多个名称。",
+        },
+    },
     "lora_tab": {
         "en": {
             "label": "LoRA configurations",
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 168abd86..ef911a16 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -146,8 +146,9 @@ class Runner:
         )
 
         if args["finetuning_type"] == "freeze":
-            args["num_layer_trainable"] = get("train.num_layer_trainable")
-            args["name_module_trainable"] = get("train.name_module_trainable")
+            args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")
+            args["freeze_trainable_modules"] = get("train.freeze_trainable_modules")
+            args["freeze_extra_modules"] = get("train.freeze_extra_modules") or None
         elif args["finetuning_type"] == "lora":
             args["lora_rank"] = get("train.lora_rank")
             args["lora_alpha"] = get("train.lora_alpha")

From b2949b88e9e45d6f36e30a5ca2c3eef237f1d540 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 16 May 2024 00:57:16 +0800
Subject: [PATCH 310/341] release v0.7.1

Former-commit-id: a4f8adb021b6218d624303b51cd5e93ffa3111a1
---
 setup.py                 |  4 ++--
 src/llmtuner/__init__.py |  5 ++++-
 src/llmtuner/cli.py      | 46 +++++++++++++++++++++++++++-------------
 3 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/setup.py b/setup.py
index 2f684753..1b83c373 100644
--- a/setup.py
+++ b/setup.py
@@ -5,9 +5,9 @@ from setuptools import find_packages, setup
 
 
 def get_version():
-    with open(os.path.join("src", "llmtuner", "__init__.py"), "r", encoding="utf-8") as f:
+    with open(os.path.join("src", "llmtuner", "cli.py"), "r", encoding="utf-8") as f:
         file_content = f.read()
-        pattern = r"{0}\W*=\W*\"([^\"]+)\"".format("__version__")
+        pattern = r"{}\W*=\W*\"([^\"]+)\"".format("VERSION")
         (version,) = re.findall(pattern, file_content)
         return version
 
diff --git a/src/llmtuner/__init__.py b/src/llmtuner/__init__.py
index a3a97450..b889e268 100644
--- a/src/llmtuner/__init__.py
+++ b/src/llmtuner/__init__.py
@@ -1,3 +1,6 @@
 # Level: api, webui > chat, eval, train > data, model > extras, hparams
 
-__version__ = "0.7.1.dev0"
+from .cli import VERSION
+
+
+__version__ = VERSION
diff --git a/src/llmtuner/cli.py b/src/llmtuner/cli.py
index f2619ab9..e4d6847a 100644
--- a/src/llmtuner/cli.py
+++ b/src/llmtuner/cli.py
@@ -1,7 +1,6 @@
 import sys
 from enum import Enum, unique
 
-from . import __version__
 from .api.app import run_api
 from .chat.chat_model import run_chat
 from .eval.evaluator import run_eval
@@ -9,17 +8,34 @@ from .train.tuner import export_model, run_exp
 from .webui.interface import run_web_demo, run_web_ui
 
 
-USAGE = """
-Usage:
-    llamafactory-cli api -h: launch an API server
-    llamafactory-cli chat -h: launch a chat interface in CLI
-    llamafactory-cli eval -h: do evaluation
-    llamafactory-cli export -h: merge LoRA adapters and export model
-    llamafactory-cli train -h: do training
-    llamafactory-cli webchat -h: launch a chat interface in Web UI
-    llamafactory-cli webui: launch LlamaBoard
-    llamafactory-cli version: show version info
-"""
+USAGE = (
+    "-" * 70
+    + "\n"
+    + "| Usage:                                                             |\n"
+    + "|   llamafactory-cli api -h: launch an OpenAI-style API server       |\n"
+    + "|   llamafactory-cli chat -h: launch a chat interface in CLI         |\n"
+    + "|   llamafactory-cli eval -h: evaluate models                        |\n"
+    + "|   llamafactory-cli export -h: merge LoRA adapters and export model |\n"
+    + "|   llamafactory-cli train -h: train models                          |\n"
+    + "|   llamafactory-cli webchat -h: launch a chat interface in Web UI   |\n"
+    + "|   llamafactory-cli webui: launch LlamaBoard                        |\n"
+    + "|   llamafactory-cli version: show version info                      |\n"
+    + "-" * 70
+)
+
+VERSION = "0.7.1"
+
+WELCOME = (
+    "-" * 58
+    + "\n"
+    + "| Welcome to LLaMA Factory, version {}".format(VERSION)
+    + " " * (21 - len(VERSION))
+    + "|\n|"
+    + " " * 56
+    + "|\n"
+    + "| Project page: https://github.com/hiyouga/LLaMA-Factory |\n"
+    + "-" * 58
+)
 
 
 @unique
@@ -31,7 +47,7 @@ class Command(str, Enum):
     TRAIN = "train"
     WEBDEMO = "webchat"
     WEBUI = "webui"
-    VERSION = "version"
+    VER = "version"
     HELP = "help"
 
 
@@ -51,8 +67,8 @@ def main():
         run_web_demo()
     elif command == Command.WEBUI:
         run_web_ui()
-    elif command == Command.VERSION:
-        print("Welcome to LLaMA Factory, version {}".format(__version__))
+    elif command == Command.VER:
+        print(WELCOME)
     elif command == Command.HELP:
         print(USAGE)
     else:

From fe638cf11fc50a6e0c11d90b074a1c276db90dc7 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 16 May 2024 02:17:31 +0800
Subject: [PATCH 311/341] set dev version

Former-commit-id: 5e9c72d07c3793cdccbdb8a9f95f1bb5d714e0a3
---
 README.md                          | 2 +-
 README_zh.md                       | 2 +-
 src/llmtuner/cli.py                | 2 +-
 src/llmtuner/model/utils/visual.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 11700e93..dfbaa92c 100644
--- a/README.md
+++ b/README.md
@@ -502,7 +502,7 @@ If you have a project that should be incorporated, please contact via email or c
 1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B.
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge.
-1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.
+1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: A series of large language models for Chinese medical domain, based on LLaMA2-7B and Baichuan-13B.
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods.
 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: A large language model specialized in generate metadata for stable diffusion. [[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
diff --git a/README_zh.md b/README_zh.md
index 146d046d..4e03f3c6 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -502,7 +502,7 @@ export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1`
 1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2404.18585)
 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper，基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。
 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM，基于 Baichuan-13B 微调而得，具有法律推理和知识检索能力。
-1. **[Sunsimiao](https://github.com/thomas-yanxin/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao，基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。
+1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: 孙思邈中文医疗大模型 Sumsimiao，基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。
 1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: 医疗大模型项目 CareGPT，基于 LLaMA2-7B 和 Baichuan-13B 在中文医疗数据上微调而得。
 1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**：MBTI性格大模型项目，根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。
 1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**：一个用于生成 Stable Diffusion 提示词的大型语言模型。[[🤗Demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
diff --git a/src/llmtuner/cli.py b/src/llmtuner/cli.py
index e4d6847a..f9b63ded 100644
--- a/src/llmtuner/cli.py
+++ b/src/llmtuner/cli.py
@@ -23,7 +23,7 @@ USAGE = (
     + "-" * 70
 )
 
-VERSION = "0.7.1"
+VERSION = "0.7.2.dev0"
 
 WELCOME = (
     "-" * 58
diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py
index 50f92d22..e3b81bd1 100644
--- a/src/llmtuner/model/utils/visual.py
+++ b/src/llmtuner/model/utils/visual.py
@@ -76,7 +76,7 @@ def autocast_projector_dtype(
 
 
 def configure_visual_model(config: "PretrainedConfig") -> None:
-    if getattr(config, "model_type", None) == "llava":
+    if getattr(config, "model_type", None) == "llava":  # required for ds zero3 and valuehead models
         setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))
 
     if getattr(config, "is_yi_vl_derived_model", None):

From dfa686b6173469dea001f3bb4236a2c0eaf398cd Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 16 May 2024 18:39:08 +0800
Subject: [PATCH 312/341] rename package

Former-commit-id: a07ff0c083558cfe6f474d13027642d3052fee08
---
 Dockerfile                                                | 2 +-
 README.md                                                 | 7 +++----
 README_zh.md                                              | 7 +++----
 pyproject.toml                                            | 2 +-
 scripts/cal_flops.py                                      | 2 +-
 scripts/cal_lr.py                                         | 8 ++++----
 scripts/cal_ppl.py                                        | 8 ++++----
 scripts/length_cdf.py                                     | 6 +++---
 setup.py                                                  | 6 +++---
 src/api.py                                                | 4 ++--
 src/{llmtuner => llamafactory}/__init__.py                | 0
 src/{llmtuner => llamafactory}/api/__init__.py            | 0
 src/{llmtuner => llamafactory}/api/app.py                 | 0
 src/{llmtuner => llamafactory}/api/chat.py                | 0
 src/{llmtuner => llamafactory}/api/common.py              | 0
 src/{llmtuner => llamafactory}/api/protocol.py            | 0
 src/{llmtuner => llamafactory}/chat/__init__.py           | 0
 src/{llmtuner => llamafactory}/chat/base_engine.py        | 0
 src/{llmtuner => llamafactory}/chat/chat_model.py         | 0
 src/{llmtuner => llamafactory}/chat/hf_engine.py          | 0
 src/{llmtuner => llamafactory}/chat/vllm_engine.py        | 0
 src/{llmtuner => llamafactory}/cli.py                     | 0
 src/{llmtuner => llamafactory}/data/__init__.py           | 0
 src/{llmtuner => llamafactory}/data/aligner.py            | 0
 src/{llmtuner => llamafactory}/data/collator.py           | 0
 src/{llmtuner => llamafactory}/data/formatter.py          | 0
 src/{llmtuner => llamafactory}/data/loader.py             | 0
 src/{llmtuner => llamafactory}/data/parser.py             | 2 ++
 src/{llmtuner => llamafactory}/data/preprocess.py         | 0
 src/{llmtuner => llamafactory}/data/template.py           | 0
 src/{llmtuner => llamafactory}/data/utils.py              | 2 +-
 src/{llmtuner => llamafactory}/eval/__init__.py           | 0
 src/{llmtuner => llamafactory}/eval/evaluator.py          | 0
 src/{llmtuner => llamafactory}/eval/template.py           | 0
 src/{llmtuner => llamafactory}/extras/__init__.py         | 0
 src/{llmtuner => llamafactory}/extras/callbacks.py        | 0
 src/{llmtuner => llamafactory}/extras/constants.py        | 0
 src/{llmtuner => llamafactory}/extras/logging.py          | 0
 src/{llmtuner => llamafactory}/extras/misc.py             | 2 +-
 src/{llmtuner => llamafactory}/extras/packages.py         | 0
 src/{llmtuner => llamafactory}/extras/ploting.py          | 0
 src/{llmtuner => llamafactory}/hparams/__init__.py        | 0
 src/{llmtuner => llamafactory}/hparams/data_args.py       | 0
 src/{llmtuner => llamafactory}/hparams/evaluation_args.py | 0
 src/{llmtuner => llamafactory}/hparams/finetuning_args.py | 0
 src/{llmtuner => llamafactory}/hparams/generating_args.py | 0
 src/{llmtuner => llamafactory}/hparams/model_args.py      | 0
 src/{llmtuner => llamafactory}/hparams/parser.py          | 0
 src/{llmtuner => llamafactory}/model/__init__.py          | 0
 src/{llmtuner => llamafactory}/model/adapter.py           | 0
 src/{llmtuner => llamafactory}/model/loader.py            | 0
 src/{llmtuner => llamafactory}/model/patcher.py           | 0
 src/{llmtuner => llamafactory}/model/utils/__init__.py    | 0
 src/{llmtuner => llamafactory}/model/utils/attention.py   | 0
 .../model/utils/checkpointing.py                          | 0
 src/{llmtuner => llamafactory}/model/utils/embedding.py   | 0
 src/{llmtuner => llamafactory}/model/utils/longlora.py    | 0
 src/{llmtuner => llamafactory}/model/utils/misc.py        | 0
 src/{llmtuner => llamafactory}/model/utils/mod.py         | 0
 src/{llmtuner => llamafactory}/model/utils/moe.py         | 0
 .../model/utils/quantization.py                           | 0
 src/{llmtuner => llamafactory}/model/utils/rope.py        | 0
 src/{llmtuner => llamafactory}/model/utils/unsloth.py     | 0
 src/{llmtuner => llamafactory}/model/utils/valuehead.py   | 0
 src/{llmtuner => llamafactory}/model/utils/visual.py      | 0
 src/{llmtuner => llamafactory}/train/__init__.py          | 0
 src/{llmtuner => llamafactory}/train/dpo/__init__.py      | 0
 src/{llmtuner => llamafactory}/train/dpo/trainer.py       | 0
 src/{llmtuner => llamafactory}/train/dpo/workflow.py      | 0
 src/{llmtuner => llamafactory}/train/orpo/__init__.py     | 0
 src/{llmtuner => llamafactory}/train/orpo/trainer.py      | 0
 src/{llmtuner => llamafactory}/train/orpo/workflow.py     | 0
 src/{llmtuner => llamafactory}/train/ppo/__init__.py      | 0
 src/{llmtuner => llamafactory}/train/ppo/trainer.py       | 0
 src/{llmtuner => llamafactory}/train/ppo/utils.py         | 0
 src/{llmtuner => llamafactory}/train/ppo/workflow.py      | 0
 src/{llmtuner => llamafactory}/train/pt/__init__.py       | 0
 src/{llmtuner => llamafactory}/train/pt/trainer.py        | 0
 src/{llmtuner => llamafactory}/train/pt/workflow.py       | 0
 src/{llmtuner => llamafactory}/train/rm/__init__.py       | 0
 src/{llmtuner => llamafactory}/train/rm/metric.py         | 0
 src/{llmtuner => llamafactory}/train/rm/trainer.py        | 0
 src/{llmtuner => llamafactory}/train/rm/workflow.py       | 0
 src/{llmtuner => llamafactory}/train/sft/__init__.py      | 0
 src/{llmtuner => llamafactory}/train/sft/metric.py        | 0
 src/{llmtuner => llamafactory}/train/sft/trainer.py       | 0
 src/{llmtuner => llamafactory}/train/sft/workflow.py      | 0
 src/{llmtuner => llamafactory}/train/tuner.py             | 0
 src/{llmtuner => llamafactory}/train/utils.py             | 0
 src/{llmtuner => llamafactory}/webui/__init__.py          | 0
 src/{llmtuner => llamafactory}/webui/chatter.py           | 0
 src/{llmtuner => llamafactory}/webui/common.py            | 0
 .../webui/components/__init__.py                          | 0
 .../webui/components/chatbot.py                           | 0
 src/{llmtuner => llamafactory}/webui/components/data.py   | 0
 src/{llmtuner => llamafactory}/webui/components/eval.py   | 0
 src/{llmtuner => llamafactory}/webui/components/export.py | 0
 src/{llmtuner => llamafactory}/webui/components/infer.py  | 0
 src/{llmtuner => llamafactory}/webui/components/top.py    | 0
 src/{llmtuner => llamafactory}/webui/components/train.py  | 0
 src/{llmtuner => llamafactory}/webui/css.py               | 0
 src/{llmtuner => llamafactory}/webui/engine.py            | 0
 src/{llmtuner => llamafactory}/webui/interface.py         | 0
 src/{llmtuner => llamafactory}/webui/locales.py           | 0
 src/{llmtuner => llamafactory}/webui/manager.py           | 0
 src/{llmtuner => llamafactory}/webui/runner.py            | 0
 src/{llmtuner => llamafactory}/webui/utils.py             | 0
 src/train.py                                              | 2 +-
 src/webui.py                                              | 2 +-
 109 files changed, 31 insertions(+), 31 deletions(-)
 rename src/{llmtuner => llamafactory}/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/api/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/api/app.py (100%)
 rename src/{llmtuner => llamafactory}/api/chat.py (100%)
 rename src/{llmtuner => llamafactory}/api/common.py (100%)
 rename src/{llmtuner => llamafactory}/api/protocol.py (100%)
 rename src/{llmtuner => llamafactory}/chat/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/chat/base_engine.py (100%)
 rename src/{llmtuner => llamafactory}/chat/chat_model.py (100%)
 rename src/{llmtuner => llamafactory}/chat/hf_engine.py (100%)
 rename src/{llmtuner => llamafactory}/chat/vllm_engine.py (100%)
 rename src/{llmtuner => llamafactory}/cli.py (100%)
 rename src/{llmtuner => llamafactory}/data/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/data/aligner.py (100%)
 rename src/{llmtuner => llamafactory}/data/collator.py (100%)
 rename src/{llmtuner => llamafactory}/data/formatter.py (100%)
 rename src/{llmtuner => llamafactory}/data/loader.py (100%)
 rename src/{llmtuner => llamafactory}/data/parser.py (98%)
 rename src/{llmtuner => llamafactory}/data/preprocess.py (100%)
 rename src/{llmtuner => llamafactory}/data/template.py (100%)
 rename src/{llmtuner => llamafactory}/data/utils.py (98%)
 rename src/{llmtuner => llamafactory}/eval/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/eval/evaluator.py (100%)
 rename src/{llmtuner => llamafactory}/eval/template.py (100%)
 rename src/{llmtuner => llamafactory}/extras/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/extras/callbacks.py (100%)
 rename src/{llmtuner => llamafactory}/extras/constants.py (100%)
 rename src/{llmtuner => llamafactory}/extras/logging.py (100%)
 rename src/{llmtuner => llamafactory}/extras/misc.py (99%)
 rename src/{llmtuner => llamafactory}/extras/packages.py (100%)
 rename src/{llmtuner => llamafactory}/extras/ploting.py (100%)
 rename src/{llmtuner => llamafactory}/hparams/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/hparams/data_args.py (100%)
 rename src/{llmtuner => llamafactory}/hparams/evaluation_args.py (100%)
 rename src/{llmtuner => llamafactory}/hparams/finetuning_args.py (100%)
 rename src/{llmtuner => llamafactory}/hparams/generating_args.py (100%)
 rename src/{llmtuner => llamafactory}/hparams/model_args.py (100%)
 rename src/{llmtuner => llamafactory}/hparams/parser.py (100%)
 rename src/{llmtuner => llamafactory}/model/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/model/adapter.py (100%)
 rename src/{llmtuner => llamafactory}/model/loader.py (100%)
 rename src/{llmtuner => llamafactory}/model/patcher.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/attention.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/checkpointing.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/embedding.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/longlora.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/misc.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/mod.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/moe.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/quantization.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/rope.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/unsloth.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/valuehead.py (100%)
 rename src/{llmtuner => llamafactory}/model/utils/visual.py (100%)
 rename src/{llmtuner => llamafactory}/train/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/train/dpo/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/train/dpo/trainer.py (100%)
 rename src/{llmtuner => llamafactory}/train/dpo/workflow.py (100%)
 rename src/{llmtuner => llamafactory}/train/orpo/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/train/orpo/trainer.py (100%)
 rename src/{llmtuner => llamafactory}/train/orpo/workflow.py (100%)
 rename src/{llmtuner => llamafactory}/train/ppo/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/train/ppo/trainer.py (100%)
 rename src/{llmtuner => llamafactory}/train/ppo/utils.py (100%)
 rename src/{llmtuner => llamafactory}/train/ppo/workflow.py (100%)
 rename src/{llmtuner => llamafactory}/train/pt/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/train/pt/trainer.py (100%)
 rename src/{llmtuner => llamafactory}/train/pt/workflow.py (100%)
 rename src/{llmtuner => llamafactory}/train/rm/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/train/rm/metric.py (100%)
 rename src/{llmtuner => llamafactory}/train/rm/trainer.py (100%)
 rename src/{llmtuner => llamafactory}/train/rm/workflow.py (100%)
 rename src/{llmtuner => llamafactory}/train/sft/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/train/sft/metric.py (100%)
 rename src/{llmtuner => llamafactory}/train/sft/trainer.py (100%)
 rename src/{llmtuner => llamafactory}/train/sft/workflow.py (100%)
 rename src/{llmtuner => llamafactory}/train/tuner.py (100%)
 rename src/{llmtuner => llamafactory}/train/utils.py (100%)
 rename src/{llmtuner => llamafactory}/webui/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/webui/chatter.py (100%)
 rename src/{llmtuner => llamafactory}/webui/common.py (100%)
 rename src/{llmtuner => llamafactory}/webui/components/__init__.py (100%)
 rename src/{llmtuner => llamafactory}/webui/components/chatbot.py (100%)
 rename src/{llmtuner => llamafactory}/webui/components/data.py (100%)
 rename src/{llmtuner => llamafactory}/webui/components/eval.py (100%)
 rename src/{llmtuner => llamafactory}/webui/components/export.py (100%)
 rename src/{llmtuner => llamafactory}/webui/components/infer.py (100%)
 rename src/{llmtuner => llamafactory}/webui/components/top.py (100%)
 rename src/{llmtuner => llamafactory}/webui/components/train.py (100%)
 rename src/{llmtuner => llamafactory}/webui/css.py (100%)
 rename src/{llmtuner => llamafactory}/webui/engine.py (100%)
 rename src/{llmtuner => llamafactory}/webui/interface.py (100%)
 rename src/{llmtuner => llamafactory}/webui/locales.py (100%)
 rename src/{llmtuner => llamafactory}/webui/manager.py (100%)
 rename src/{llmtuner => llamafactory}/webui/runner.py (100%)
 rename src/{llmtuner => llamafactory}/webui/utils.py (100%)

diff --git a/Dockerfile b/Dockerfile
index d757d618..0a35e355 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,7 +6,7 @@ COPY requirements.txt /app/
 RUN pip install -r requirements.txt
 
 COPY . /app/
-RUN pip install -e .[deepspeed,metrics,bitsandbytes,qwen]
+RUN pip install -e .[metrics,bitsandbytes,qwen]
 
 VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ]
 EXPOSE 7860
diff --git a/README.md b/README.md
index dfbaa92c..d392b19b 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,7 @@
 [![GitHub Repo stars](https://img.shields.io/github/stars/hiyouga/LLaMA-Factory?style=social)](https://github.com/hiyouga/LLaMA-Factory/stargazers)
 [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE)
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
-[![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
+[![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/)
 [![Citation](https://img.shields.io/badge/citation-44-green)](#projects-using-llama-factory)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
@@ -176,9 +175,9 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 >
 > Remember to use the **SAME** template in training and inference.
 
-Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list of models we supported.
+Please refer to [constants.py](src/llamafactory/extras/constants.py) for a full list of models we supported.
 
-You also can add a custom chat template to [template.py](src/llmtuner/data/template.py).
+You also can add a custom chat template to [template.py](src/llamafactory/data/template.py).
 
 ## Supported Training Approaches
 
diff --git a/README_zh.md b/README_zh.md
index 4e03f3c6..58398a31 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -3,8 +3,7 @@
 [![GitHub Repo stars](https://img.shields.io/github/stars/hiyouga/LLaMA-Factory?style=social)](https://github.com/hiyouga/LLaMA-Factory/stargazers)
 [![GitHub Code License](https://img.shields.io/github/license/hiyouga/LLaMA-Factory)](LICENSE)
 [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main)
-[![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/)
-[![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/)
+[![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/)
 [![Citation](https://img.shields.io/badge/citation-44-green)](#使用了-llama-factory-的项目)
 [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls)
 [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
@@ -176,9 +175,9 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 >
 > 请务必在训练和推理时使用**完全一致**的模板。
 
-项目所支持模型的完整列表请参阅 [constants.py](src/llmtuner/extras/constants.py)。
+项目所支持模型的完整列表请参阅 [constants.py](src/llamafactory/extras/constants.py)。
 
-您也可以在 [template.py](src/llmtuner/data/template.py) 中添加自己的对话模板。
+您也可以在 [template.py](src/llamafactory/data/template.py) 中添加自己的对话模板。
 
 ## 训练方法
 
diff --git a/pyproject.toml b/pyproject.toml
index 0316276b..62e77e1f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ select = ["C", "E", "F", "I", "W"]
 
 [tool.ruff.lint.isort]
 lines-after-imports = 2
-known-first-party = ["llmtuner"]
+known-first-party = ["llamafactory"]
 known-third-party = [
     "accelerate",
     "datasets",
diff --git a/scripts/cal_flops.py b/scripts/cal_flops.py
index 19414ce5..ac87e0ab 100644
--- a/scripts/cal_flops.py
+++ b/scripts/cal_flops.py
@@ -8,7 +8,7 @@ import torch
 from deepspeed.accelerator import get_accelerator  # type: ignore
 from deepspeed.profiling.flops_profiler import get_model_profile  # type: ignore
 
-from llmtuner.chat import ChatModel
+from llamafactory.chat import ChatModel
 
 
 def calculate_flops(
diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py
index dd864162..bfa32cc9 100644
--- a/scripts/cal_lr.py
+++ b/scripts/cal_lr.py
@@ -12,10 +12,10 @@ from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
 
-from llmtuner.data import get_dataset
-from llmtuner.extras.constants import IGNORE_INDEX
-from llmtuner.hparams import get_train_args
-from llmtuner.model import load_tokenizer
+from llamafactory.data import get_dataset
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_tokenizer
 
 
 BASE_LR = 3e-4  # 1.5e-4 for 30B-70B models
diff --git a/scripts/cal_ppl.py b/scripts/cal_ppl.py
index 9eebc57d..387b756c 100644
--- a/scripts/cal_ppl.py
+++ b/scripts/cal_ppl.py
@@ -12,10 +12,10 @@ from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
 
-from llmtuner.data import get_dataset
-from llmtuner.extras.constants import IGNORE_INDEX
-from llmtuner.hparams import get_train_args
-from llmtuner.model import load_model, load_tokenizer
+from llamafactory.data import get_dataset
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_model, load_tokenizer
 
 
 @dataclass
diff --git a/scripts/length_cdf.py b/scripts/length_cdf.py
index da41a942..7739dcf0 100644
--- a/scripts/length_cdf.py
+++ b/scripts/length_cdf.py
@@ -7,9 +7,9 @@ from collections import defaultdict
 import fire
 from tqdm import tqdm
 
-from llmtuner.data import get_dataset
-from llmtuner.hparams import get_train_args
-from llmtuner.model import load_tokenizer
+from llamafactory.data import get_dataset
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_tokenizer
 
 
 def length_cdf(
diff --git a/setup.py b/setup.py
index 1b83c373..4d948450 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ from setuptools import find_packages, setup
 
 
 def get_version():
-    with open(os.path.join("src", "llmtuner", "cli.py"), "r", encoding="utf-8") as f:
+    with open(os.path.join("src", "llamafactory", "cli.py"), "r", encoding="utf-8") as f:
         file_content = f.read()
         pattern = r"{}\W*=\W*\"([^\"]+)\"".format("VERSION")
         (version,) = re.findall(pattern, file_content)
@@ -38,7 +38,7 @@ extra_require = {
 
 def main():
     setup(
-        name="llmtuner",
+        name="llamafactory",
         version=get_version(),
         author="hiyouga",
         author_email="hiyouga" "@" "buaa.edu.cn",
@@ -53,7 +53,7 @@ def main():
         python_requires=">=3.8.0",
         install_requires=get_requires(),
         extras_require=extra_require,
-        entry_points={"console_scripts": ["llamafactory-cli = llmtuner.cli:main"]},
+        entry_points={"console_scripts": ["llamafactory-cli = llamafactory.cli:main"]},
         classifiers=[
             "Development Status :: 4 - Beta",
             "Intended Audience :: Developers",
diff --git a/src/api.py b/src/api.py
index 277920ac..3655e393 100644
--- a/src/api.py
+++ b/src/api.py
@@ -2,8 +2,8 @@ import os
 
 import uvicorn
 
-from llmtuner.api.app import create_app
-from llmtuner.chat import ChatModel
+from llamafactory.api.app import create_app
+from llamafactory.chat import ChatModel
 
 
 def main():
diff --git a/src/llmtuner/__init__.py b/src/llamafactory/__init__.py
similarity index 100%
rename from src/llmtuner/__init__.py
rename to src/llamafactory/__init__.py
diff --git a/src/llmtuner/api/__init__.py b/src/llamafactory/api/__init__.py
similarity index 100%
rename from src/llmtuner/api/__init__.py
rename to src/llamafactory/api/__init__.py
diff --git a/src/llmtuner/api/app.py b/src/llamafactory/api/app.py
similarity index 100%
rename from src/llmtuner/api/app.py
rename to src/llamafactory/api/app.py
diff --git a/src/llmtuner/api/chat.py b/src/llamafactory/api/chat.py
similarity index 100%
rename from src/llmtuner/api/chat.py
rename to src/llamafactory/api/chat.py
diff --git a/src/llmtuner/api/common.py b/src/llamafactory/api/common.py
similarity index 100%
rename from src/llmtuner/api/common.py
rename to src/llamafactory/api/common.py
diff --git a/src/llmtuner/api/protocol.py b/src/llamafactory/api/protocol.py
similarity index 100%
rename from src/llmtuner/api/protocol.py
rename to src/llamafactory/api/protocol.py
diff --git a/src/llmtuner/chat/__init__.py b/src/llamafactory/chat/__init__.py
similarity index 100%
rename from src/llmtuner/chat/__init__.py
rename to src/llamafactory/chat/__init__.py
diff --git a/src/llmtuner/chat/base_engine.py b/src/llamafactory/chat/base_engine.py
similarity index 100%
rename from src/llmtuner/chat/base_engine.py
rename to src/llamafactory/chat/base_engine.py
diff --git a/src/llmtuner/chat/chat_model.py b/src/llamafactory/chat/chat_model.py
similarity index 100%
rename from src/llmtuner/chat/chat_model.py
rename to src/llamafactory/chat/chat_model.py
diff --git a/src/llmtuner/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py
similarity index 100%
rename from src/llmtuner/chat/hf_engine.py
rename to src/llamafactory/chat/hf_engine.py
diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py
similarity index 100%
rename from src/llmtuner/chat/vllm_engine.py
rename to src/llamafactory/chat/vllm_engine.py
diff --git a/src/llmtuner/cli.py b/src/llamafactory/cli.py
similarity index 100%
rename from src/llmtuner/cli.py
rename to src/llamafactory/cli.py
diff --git a/src/llmtuner/data/__init__.py b/src/llamafactory/data/__init__.py
similarity index 100%
rename from src/llmtuner/data/__init__.py
rename to src/llamafactory/data/__init__.py
diff --git a/src/llmtuner/data/aligner.py b/src/llamafactory/data/aligner.py
similarity index 100%
rename from src/llmtuner/data/aligner.py
rename to src/llamafactory/data/aligner.py
diff --git a/src/llmtuner/data/collator.py b/src/llamafactory/data/collator.py
similarity index 100%
rename from src/llmtuner/data/collator.py
rename to src/llamafactory/data/collator.py
diff --git a/src/llmtuner/data/formatter.py b/src/llamafactory/data/formatter.py
similarity index 100%
rename from src/llmtuner/data/formatter.py
rename to src/llamafactory/data/formatter.py
diff --git a/src/llmtuner/data/loader.py b/src/llamafactory/data/loader.py
similarity index 100%
rename from src/llmtuner/data/loader.py
rename to src/llamafactory/data/loader.py
diff --git a/src/llmtuner/data/parser.py b/src/llamafactory/data/parser.py
similarity index 98%
rename from src/llmtuner/data/parser.py
rename to src/llamafactory/data/parser.py
index 3170fd8a..848fd66c 100644
--- a/src/llmtuner/data/parser.py
+++ b/src/llamafactory/data/parser.py
@@ -32,6 +32,8 @@ class DatasetAttr:
     prompt: Optional[str] = "instruction"
     query: Optional[str] = "input"
     response: Optional[str] = "output"
+    chosen: Optional[str] = "chosen"
+    rejected: Optional[str] = "rejected"
     history: Optional[str] = None
     """ columns for the sharegpt format """
     messages: Optional[str] = "conversations"
diff --git a/src/llmtuner/data/preprocess.py b/src/llamafactory/data/preprocess.py
similarity index 100%
rename from src/llmtuner/data/preprocess.py
rename to src/llamafactory/data/preprocess.py
diff --git a/src/llmtuner/data/template.py b/src/llamafactory/data/template.py
similarity index 100%
rename from src/llmtuner/data/template.py
rename to src/llamafactory/data/template.py
diff --git a/src/llmtuner/data/utils.py b/src/llamafactory/data/utils.py
similarity index 98%
rename from src/llmtuner/data/utils.py
rename to src/llamafactory/data/utils.py
index aaa5bdc0..9b313112 100644
--- a/src/llmtuner/data/utils.py
+++ b/src/llamafactory/data/utils.py
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from datasets import Dataset, IterableDataset
     from transformers import Seq2SeqTrainingArguments
 
-    from llmtuner.hparams import DataArguments
+    from ..hparams import DataArguments
 
 
 logger = get_logger(__name__)
diff --git a/src/llmtuner/eval/__init__.py b/src/llamafactory/eval/__init__.py
similarity index 100%
rename from src/llmtuner/eval/__init__.py
rename to src/llamafactory/eval/__init__.py
diff --git a/src/llmtuner/eval/evaluator.py b/src/llamafactory/eval/evaluator.py
similarity index 100%
rename from src/llmtuner/eval/evaluator.py
rename to src/llamafactory/eval/evaluator.py
diff --git a/src/llmtuner/eval/template.py b/src/llamafactory/eval/template.py
similarity index 100%
rename from src/llmtuner/eval/template.py
rename to src/llamafactory/eval/template.py
diff --git a/src/llmtuner/extras/__init__.py b/src/llamafactory/extras/__init__.py
similarity index 100%
rename from src/llmtuner/extras/__init__.py
rename to src/llamafactory/extras/__init__.py
diff --git a/src/llmtuner/extras/callbacks.py b/src/llamafactory/extras/callbacks.py
similarity index 100%
rename from src/llmtuner/extras/callbacks.py
rename to src/llamafactory/extras/callbacks.py
diff --git a/src/llmtuner/extras/constants.py b/src/llamafactory/extras/constants.py
similarity index 100%
rename from src/llmtuner/extras/constants.py
rename to src/llamafactory/extras/constants.py
diff --git a/src/llmtuner/extras/logging.py b/src/llamafactory/extras/logging.py
similarity index 100%
rename from src/llmtuner/extras/logging.py
rename to src/llamafactory/extras/logging.py
diff --git a/src/llmtuner/extras/misc.py b/src/llamafactory/extras/misc.py
similarity index 99%
rename from src/llmtuner/extras/misc.py
rename to src/llamafactory/extras/misc.py
index 53140efa..8955acd1 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
@@ -30,7 +30,7 @@ except Exception:
 if TYPE_CHECKING:
     from trl import AutoModelForCausalLMWithValueHead
 
-    from llmtuner.hparams import ModelArguments
+    from ..hparams import ModelArguments
 
 
 logger = get_logger(__name__)
diff --git a/src/llmtuner/extras/packages.py b/src/llamafactory/extras/packages.py
similarity index 100%
rename from src/llmtuner/extras/packages.py
rename to src/llamafactory/extras/packages.py
diff --git a/src/llmtuner/extras/ploting.py b/src/llamafactory/extras/ploting.py
similarity index 100%
rename from src/llmtuner/extras/ploting.py
rename to src/llamafactory/extras/ploting.py
diff --git a/src/llmtuner/hparams/__init__.py b/src/llamafactory/hparams/__init__.py
similarity index 100%
rename from src/llmtuner/hparams/__init__.py
rename to src/llamafactory/hparams/__init__.py
diff --git a/src/llmtuner/hparams/data_args.py b/src/llamafactory/hparams/data_args.py
similarity index 100%
rename from src/llmtuner/hparams/data_args.py
rename to src/llamafactory/hparams/data_args.py
diff --git a/src/llmtuner/hparams/evaluation_args.py b/src/llamafactory/hparams/evaluation_args.py
similarity index 100%
rename from src/llmtuner/hparams/evaluation_args.py
rename to src/llamafactory/hparams/evaluation_args.py
diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py
similarity index 100%
rename from src/llmtuner/hparams/finetuning_args.py
rename to src/llamafactory/hparams/finetuning_args.py
diff --git a/src/llmtuner/hparams/generating_args.py b/src/llamafactory/hparams/generating_args.py
similarity index 100%
rename from src/llmtuner/hparams/generating_args.py
rename to src/llamafactory/hparams/generating_args.py
diff --git a/src/llmtuner/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
similarity index 100%
rename from src/llmtuner/hparams/model_args.py
rename to src/llamafactory/hparams/model_args.py
diff --git a/src/llmtuner/hparams/parser.py b/src/llamafactory/hparams/parser.py
similarity index 100%
rename from src/llmtuner/hparams/parser.py
rename to src/llamafactory/hparams/parser.py
diff --git a/src/llmtuner/model/__init__.py b/src/llamafactory/model/__init__.py
similarity index 100%
rename from src/llmtuner/model/__init__.py
rename to src/llamafactory/model/__init__.py
diff --git a/src/llmtuner/model/adapter.py b/src/llamafactory/model/adapter.py
similarity index 100%
rename from src/llmtuner/model/adapter.py
rename to src/llamafactory/model/adapter.py
diff --git a/src/llmtuner/model/loader.py b/src/llamafactory/model/loader.py
similarity index 100%
rename from src/llmtuner/model/loader.py
rename to src/llamafactory/model/loader.py
diff --git a/src/llmtuner/model/patcher.py b/src/llamafactory/model/patcher.py
similarity index 100%
rename from src/llmtuner/model/patcher.py
rename to src/llamafactory/model/patcher.py
diff --git a/src/llmtuner/model/utils/__init__.py b/src/llamafactory/model/utils/__init__.py
similarity index 100%
rename from src/llmtuner/model/utils/__init__.py
rename to src/llamafactory/model/utils/__init__.py
diff --git a/src/llmtuner/model/utils/attention.py b/src/llamafactory/model/utils/attention.py
similarity index 100%
rename from src/llmtuner/model/utils/attention.py
rename to src/llamafactory/model/utils/attention.py
diff --git a/src/llmtuner/model/utils/checkpointing.py b/src/llamafactory/model/utils/checkpointing.py
similarity index 100%
rename from src/llmtuner/model/utils/checkpointing.py
rename to src/llamafactory/model/utils/checkpointing.py
diff --git a/src/llmtuner/model/utils/embedding.py b/src/llamafactory/model/utils/embedding.py
similarity index 100%
rename from src/llmtuner/model/utils/embedding.py
rename to src/llamafactory/model/utils/embedding.py
diff --git a/src/llmtuner/model/utils/longlora.py b/src/llamafactory/model/utils/longlora.py
similarity index 100%
rename from src/llmtuner/model/utils/longlora.py
rename to src/llamafactory/model/utils/longlora.py
diff --git a/src/llmtuner/model/utils/misc.py b/src/llamafactory/model/utils/misc.py
similarity index 100%
rename from src/llmtuner/model/utils/misc.py
rename to src/llamafactory/model/utils/misc.py
diff --git a/src/llmtuner/model/utils/mod.py b/src/llamafactory/model/utils/mod.py
similarity index 100%
rename from src/llmtuner/model/utils/mod.py
rename to src/llamafactory/model/utils/mod.py
diff --git a/src/llmtuner/model/utils/moe.py b/src/llamafactory/model/utils/moe.py
similarity index 100%
rename from src/llmtuner/model/utils/moe.py
rename to src/llamafactory/model/utils/moe.py
diff --git a/src/llmtuner/model/utils/quantization.py b/src/llamafactory/model/utils/quantization.py
similarity index 100%
rename from src/llmtuner/model/utils/quantization.py
rename to src/llamafactory/model/utils/quantization.py
diff --git a/src/llmtuner/model/utils/rope.py b/src/llamafactory/model/utils/rope.py
similarity index 100%
rename from src/llmtuner/model/utils/rope.py
rename to src/llamafactory/model/utils/rope.py
diff --git a/src/llmtuner/model/utils/unsloth.py b/src/llamafactory/model/utils/unsloth.py
similarity index 100%
rename from src/llmtuner/model/utils/unsloth.py
rename to src/llamafactory/model/utils/unsloth.py
diff --git a/src/llmtuner/model/utils/valuehead.py b/src/llamafactory/model/utils/valuehead.py
similarity index 100%
rename from src/llmtuner/model/utils/valuehead.py
rename to src/llamafactory/model/utils/valuehead.py
diff --git a/src/llmtuner/model/utils/visual.py b/src/llamafactory/model/utils/visual.py
similarity index 100%
rename from src/llmtuner/model/utils/visual.py
rename to src/llamafactory/model/utils/visual.py
diff --git a/src/llmtuner/train/__init__.py b/src/llamafactory/train/__init__.py
similarity index 100%
rename from src/llmtuner/train/__init__.py
rename to src/llamafactory/train/__init__.py
diff --git a/src/llmtuner/train/dpo/__init__.py b/src/llamafactory/train/dpo/__init__.py
similarity index 100%
rename from src/llmtuner/train/dpo/__init__.py
rename to src/llamafactory/train/dpo/__init__.py
diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py
similarity index 100%
rename from src/llmtuner/train/dpo/trainer.py
rename to src/llamafactory/train/dpo/trainer.py
diff --git a/src/llmtuner/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py
similarity index 100%
rename from src/llmtuner/train/dpo/workflow.py
rename to src/llamafactory/train/dpo/workflow.py
diff --git a/src/llmtuner/train/orpo/__init__.py b/src/llamafactory/train/orpo/__init__.py
similarity index 100%
rename from src/llmtuner/train/orpo/__init__.py
rename to src/llamafactory/train/orpo/__init__.py
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llamafactory/train/orpo/trainer.py
similarity index 100%
rename from src/llmtuner/train/orpo/trainer.py
rename to src/llamafactory/train/orpo/trainer.py
diff --git a/src/llmtuner/train/orpo/workflow.py b/src/llamafactory/train/orpo/workflow.py
similarity index 100%
rename from src/llmtuner/train/orpo/workflow.py
rename to src/llamafactory/train/orpo/workflow.py
diff --git a/src/llmtuner/train/ppo/__init__.py b/src/llamafactory/train/ppo/__init__.py
similarity index 100%
rename from src/llmtuner/train/ppo/__init__.py
rename to src/llamafactory/train/ppo/__init__.py
diff --git a/src/llmtuner/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py
similarity index 100%
rename from src/llmtuner/train/ppo/trainer.py
rename to src/llamafactory/train/ppo/trainer.py
diff --git a/src/llmtuner/train/ppo/utils.py b/src/llamafactory/train/ppo/utils.py
similarity index 100%
rename from src/llmtuner/train/ppo/utils.py
rename to src/llamafactory/train/ppo/utils.py
diff --git a/src/llmtuner/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py
similarity index 100%
rename from src/llmtuner/train/ppo/workflow.py
rename to src/llamafactory/train/ppo/workflow.py
diff --git a/src/llmtuner/train/pt/__init__.py b/src/llamafactory/train/pt/__init__.py
similarity index 100%
rename from src/llmtuner/train/pt/__init__.py
rename to src/llamafactory/train/pt/__init__.py
diff --git a/src/llmtuner/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py
similarity index 100%
rename from src/llmtuner/train/pt/trainer.py
rename to src/llamafactory/train/pt/trainer.py
diff --git a/src/llmtuner/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py
similarity index 100%
rename from src/llmtuner/train/pt/workflow.py
rename to src/llamafactory/train/pt/workflow.py
diff --git a/src/llmtuner/train/rm/__init__.py b/src/llamafactory/train/rm/__init__.py
similarity index 100%
rename from src/llmtuner/train/rm/__init__.py
rename to src/llamafactory/train/rm/__init__.py
diff --git a/src/llmtuner/train/rm/metric.py b/src/llamafactory/train/rm/metric.py
similarity index 100%
rename from src/llmtuner/train/rm/metric.py
rename to src/llamafactory/train/rm/metric.py
diff --git a/src/llmtuner/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py
similarity index 100%
rename from src/llmtuner/train/rm/trainer.py
rename to src/llamafactory/train/rm/trainer.py
diff --git a/src/llmtuner/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py
similarity index 100%
rename from src/llmtuner/train/rm/workflow.py
rename to src/llamafactory/train/rm/workflow.py
diff --git a/src/llmtuner/train/sft/__init__.py b/src/llamafactory/train/sft/__init__.py
similarity index 100%
rename from src/llmtuner/train/sft/__init__.py
rename to src/llamafactory/train/sft/__init__.py
diff --git a/src/llmtuner/train/sft/metric.py b/src/llamafactory/train/sft/metric.py
similarity index 100%
rename from src/llmtuner/train/sft/metric.py
rename to src/llamafactory/train/sft/metric.py
diff --git a/src/llmtuner/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py
similarity index 100%
rename from src/llmtuner/train/sft/trainer.py
rename to src/llamafactory/train/sft/trainer.py
diff --git a/src/llmtuner/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py
similarity index 100%
rename from src/llmtuner/train/sft/workflow.py
rename to src/llamafactory/train/sft/workflow.py
diff --git a/src/llmtuner/train/tuner.py b/src/llamafactory/train/tuner.py
similarity index 100%
rename from src/llmtuner/train/tuner.py
rename to src/llamafactory/train/tuner.py
diff --git a/src/llmtuner/train/utils.py b/src/llamafactory/train/utils.py
similarity index 100%
rename from src/llmtuner/train/utils.py
rename to src/llamafactory/train/utils.py
diff --git a/src/llmtuner/webui/__init__.py b/src/llamafactory/webui/__init__.py
similarity index 100%
rename from src/llmtuner/webui/__init__.py
rename to src/llamafactory/webui/__init__.py
diff --git a/src/llmtuner/webui/chatter.py b/src/llamafactory/webui/chatter.py
similarity index 100%
rename from src/llmtuner/webui/chatter.py
rename to src/llamafactory/webui/chatter.py
diff --git a/src/llmtuner/webui/common.py b/src/llamafactory/webui/common.py
similarity index 100%
rename from src/llmtuner/webui/common.py
rename to src/llamafactory/webui/common.py
diff --git a/src/llmtuner/webui/components/__init__.py b/src/llamafactory/webui/components/__init__.py
similarity index 100%
rename from src/llmtuner/webui/components/__init__.py
rename to src/llamafactory/webui/components/__init__.py
diff --git a/src/llmtuner/webui/components/chatbot.py b/src/llamafactory/webui/components/chatbot.py
similarity index 100%
rename from src/llmtuner/webui/components/chatbot.py
rename to src/llamafactory/webui/components/chatbot.py
diff --git a/src/llmtuner/webui/components/data.py b/src/llamafactory/webui/components/data.py
similarity index 100%
rename from src/llmtuner/webui/components/data.py
rename to src/llamafactory/webui/components/data.py
diff --git a/src/llmtuner/webui/components/eval.py b/src/llamafactory/webui/components/eval.py
similarity index 100%
rename from src/llmtuner/webui/components/eval.py
rename to src/llamafactory/webui/components/eval.py
diff --git a/src/llmtuner/webui/components/export.py b/src/llamafactory/webui/components/export.py
similarity index 100%
rename from src/llmtuner/webui/components/export.py
rename to src/llamafactory/webui/components/export.py
diff --git a/src/llmtuner/webui/components/infer.py b/src/llamafactory/webui/components/infer.py
similarity index 100%
rename from src/llmtuner/webui/components/infer.py
rename to src/llamafactory/webui/components/infer.py
diff --git a/src/llmtuner/webui/components/top.py b/src/llamafactory/webui/components/top.py
similarity index 100%
rename from src/llmtuner/webui/components/top.py
rename to src/llamafactory/webui/components/top.py
diff --git a/src/llmtuner/webui/components/train.py b/src/llamafactory/webui/components/train.py
similarity index 100%
rename from src/llmtuner/webui/components/train.py
rename to src/llamafactory/webui/components/train.py
diff --git a/src/llmtuner/webui/css.py b/src/llamafactory/webui/css.py
similarity index 100%
rename from src/llmtuner/webui/css.py
rename to src/llamafactory/webui/css.py
diff --git a/src/llmtuner/webui/engine.py b/src/llamafactory/webui/engine.py
similarity index 100%
rename from src/llmtuner/webui/engine.py
rename to src/llamafactory/webui/engine.py
diff --git a/src/llmtuner/webui/interface.py b/src/llamafactory/webui/interface.py
similarity index 100%
rename from src/llmtuner/webui/interface.py
rename to src/llamafactory/webui/interface.py
diff --git a/src/llmtuner/webui/locales.py b/src/llamafactory/webui/locales.py
similarity index 100%
rename from src/llmtuner/webui/locales.py
rename to src/llamafactory/webui/locales.py
diff --git a/src/llmtuner/webui/manager.py b/src/llamafactory/webui/manager.py
similarity index 100%
rename from src/llmtuner/webui/manager.py
rename to src/llamafactory/webui/manager.py
diff --git a/src/llmtuner/webui/runner.py b/src/llamafactory/webui/runner.py
similarity index 100%
rename from src/llmtuner/webui/runner.py
rename to src/llamafactory/webui/runner.py
diff --git a/src/llmtuner/webui/utils.py b/src/llamafactory/webui/utils.py
similarity index 100%
rename from src/llmtuner/webui/utils.py
rename to src/llamafactory/webui/utils.py
diff --git a/src/train.py b/src/train.py
index 6a3212cb..b20aa9d2 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,4 +1,4 @@
-from llmtuner.train.tuner import run_exp
+from llamafactory.train.tuner import run_exp
 
 
 def main():
diff --git a/src/webui.py b/src/webui.py
index 7a43039d..8da69497 100644
--- a/src/webui.py
+++ b/src/webui.py
@@ -1,6 +1,6 @@
 import os
 
-from llmtuner.webui.interface import create_ui
+from llamafactory.webui.interface import create_ui
 
 
 def main():

From efcf5e050d64e0c1e7d3549719ec05b6dfce5108 Mon Sep 17 00:00:00 2001
From: hiyouga <hiyouga@buaa.edu.cn>
Date: Thu, 16 May 2024 19:12:09 +0800
Subject: [PATCH 313/341] fix examples #3769

Former-commit-id: 80c036beb8d9ddac8f844f1818c9488ded04e86e
---
 examples/README.md    | 8 +++++---
 examples/README_zh.md | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 4b4a8248..5aa03dfc 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -171,22 +171,24 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.y
 
 ### Inferring LoRA Fine-Tuned Models
 
+Use `CUDA_VISIBLE_DEVICES=0,1` to infer models on multiple devices.
+
 #### Use CLI
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/merge_lora/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
 ```
 
 #### Use Web UI
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/merge_lora/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
 ```
 
 #### Launch OpenAI-style API
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/merge_lora/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml
 ```
 
 ### Extras
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 3b5b2dee..5d205a21 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -171,22 +171,24 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_gptq.y
 
 ### 推理 LoRA 模型
 
+使用 `CUDA_VISIBLE_DEVICES=0,1` 进行多卡推理。
+
 #### 使用命令行接口
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/merge_lora/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
 ```
 
 #### 使用浏览器界面
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/merge_lora/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
 ```
 
 #### 启动 OpenAI 风格 API
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/merge_lora/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli api examples/inference/llama3_lora_sft.yaml
 ```
 
 ### 杂项

From 6481321470c339a9803567e8c83a458bce3d15ce Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Fri, 17 May 2024 00:08:33 +0800
Subject: [PATCH 314/341] add falcon 11b

Former-commit-id: 897acc725edc204fad393cc9616828431b4fa768
---
 README.md                            | 2 +-
 README_zh.md                         | 2 +-
 src/llamafactory/extras/constants.py | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d392b19b..a41415fd 100644
--- a/README.md
+++ b/README.md
@@ -149,7 +149,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                               | query_key_value   | chatglm3  |
 | [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                         | q_proj,v_proj     | cohere    |
 | [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B/236B                  | q_proj,v_proj     | deepseek  |
-| [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                      | query_key_value   | falcon    |
+| [Falcon](https://huggingface.co/tiiuae)                  | 7B/11B/40B/180B                  | query_key_value   | falcon    |
 | [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                            | q_proj,v_proj     | gemma     |
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                           | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B                   | q_proj,v_proj     | -         |
diff --git a/README_zh.md b/README_zh.md
index 58398a31..4f8ffa28 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -149,7 +149,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [ChatGLM3](https://huggingface.co/THUDM)                 | 6B                               | query_key_value   | chatglm3  |
 | [Command-R](https://huggingface.co/CohereForAI)          | 35B/104B                         | q_proj,v_proj     | cohere    |
 | [DeepSeek (MoE)](https://huggingface.co/deepseek-ai)     | 7B/16B/67B/236B                  | q_proj,v_proj     | deepseek  |
-| [Falcon](https://huggingface.co/tiiuae)                  | 7B/40B/180B                      | query_key_value   | falcon    |
+| [Falcon](https://huggingface.co/tiiuae)                  | 7B/11B/40B/180B                  | query_key_value   | falcon    |
 | [Gemma/CodeGemma](https://huggingface.co/google)         | 2B/7B                            | q_proj,v_proj     | gemma     |
 | [InternLM2](https://huggingface.co/internlm)             | 7B/20B                           | wqkv              | intern2   |
 | [LLaMA](https://github.com/facebookresearch/llama)       | 7B/13B/33B/65B                   | q_proj,v_proj     | -         |
diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index f1ee55a0..40826163 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -375,6 +375,9 @@ register_model_group(
             DownloadSource.DEFAULT: "tiiuae/falcon-7b",
             DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b",
         },
+        "Falcon-11B": {
+            DownloadSource.DEFAULT: "tiiuae/falcon-11B",
+        },
         "Falcon-40B": {
             DownloadSource.DEFAULT: "tiiuae/falcon-40b",
             DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b",

From 45329d9e3c29a4950e3da48e31baba16af23f436 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Fri, 17 May 2024 00:08:56 +0800
Subject: [PATCH 315/341] enable inbrowser in webui

Former-commit-id: 71fdeedb64b2339eb1c740d670b87e0c03dada68
---
 src/llamafactory/webui/interface.py | 4 ++--
 src/webui.py                        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/llamafactory/webui/interface.py b/src/llamafactory/webui/interface.py
index c5a30113..bae3ba76 100644
--- a/src/llamafactory/webui/interface.py
+++ b/src/llamafactory/webui/interface.py
@@ -73,10 +73,10 @@ def create_web_demo() -> gr.Blocks:
 def run_web_ui() -> None:
     gradio_share = os.environ.get("GRADIO_SHARE", "0").lower() in ["true", "1"]
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
-    create_ui().queue().launch(share=gradio_share, server_name=server_name)
+    create_ui().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True)
 
 
 def run_web_demo() -> None:
     gradio_share = os.environ.get("GRADIO_SHARE", "0").lower() in ["true", "1"]
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
-    create_web_demo().queue().launch(share=gradio_share, server_name=server_name)
+    create_web_demo().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True)
diff --git a/src/webui.py b/src/webui.py
index 8da69497..bbefb54e 100644
--- a/src/webui.py
+++ b/src/webui.py
@@ -6,7 +6,7 @@ from llamafactory.webui.interface import create_ui
 def main():
     gradio_share = os.environ.get("GRADIO_SHARE", "0").lower() in ["true", "1"]
     server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
-    create_ui().queue().launch(share=gradio_share, server_name=server_name)
+    create_ui().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True)
 
 
 if __name__ == "__main__":

From a3320f26cfd75ddfe19c5271bfe671914be4177c Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Fri, 17 May 2024 01:02:00 +0800
Subject: [PATCH 316/341] update examples

Former-commit-id: 3b5f138155d96b346bda18e465cf60ec7d99e19c
---
 examples/extras/badam/llama3_lora_sft.yaml         | 12 ++++++------
 examples/extras/fsdp_qlora/llama3_lora_sft.yaml    | 14 +++++++-------
 examples/extras/galore/llama3_full_sft.yaml        | 12 ++++++------
 examples/extras/llama_pro/llama3_freeze_sft.yaml   | 12 ++++++------
 examples/extras/loraplus/llama3_lora_sft.yaml      | 12 ++++++------
 examples/extras/mod/llama3_full_sft.yaml           | 12 ++++++------
 examples/full_multi_gpu/llama3_full_predict.yaml   | 10 +++++-----
 examples/full_multi_gpu/llama3_full_sft.yaml       | 14 +++++++-------
 examples/lora_multi_gpu/llama3_lora_sft.yaml       | 14 +++++++-------
 examples/lora_multi_gpu/llama3_lora_sft_ds.yaml    | 14 +++++++-------
 examples/lora_multi_npu/llama3_lora_sft_ds.yaml    | 14 +++++++-------
 examples/lora_single_gpu/llama3_lora_dpo.yaml      | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_eval.yaml     | 10 +++++-----
 examples/lora_single_gpu/llama3_lora_orpo.yaml     | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_ppo.yaml      | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_predict.yaml  | 10 +++++-----
 examples/lora_single_gpu/llama3_lora_pretrain.yaml | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_reward.yaml   | 12 ++++++------
 examples/lora_single_gpu/llama3_lora_sft.yaml      | 12 ++++++------
 examples/lora_single_gpu/llama3_preprocess.yaml    |  8 ++++----
 examples/lora_single_gpu/llava1_5_lora_sft.yaml    | 12 ++++++------
 examples/merge_lora/llama3_gptq.yaml               |  4 ++--
 examples/merge_lora/llama3_lora_sft.yaml           |  6 +++---
 .../qlora_single_gpu/llama3_lora_sft_aqlm.yaml     | 12 ++++++------
 examples/qlora_single_gpu/llama3_lora_sft_awq.yaml | 12 ++++++------
 .../llama3_lora_sft_bitsandbytes.yaml              | 12 ++++++------
 .../qlora_single_gpu/llama3_lora_sft_gptq.yaml     | 12 ++++++------
 27 files changed, 155 insertions(+), 155 deletions(-)

diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_lora_sft.yaml
index 5e8994bc..c8c00431 100644
--- a/examples/extras/badam/llama3_lora_sft.yaml
+++ b/examples/extras/badam/llama3_lora_sft.yaml
@@ -1,7 +1,7 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
@@ -10,7 +10,7 @@ badam_switch_mode: descending
 badam_switch_interval: 50
 badam_verbose: 2
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -18,14 +18,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -34,7 +34,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 pure_bf16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
index 1fd8f16a..9d3b1124 100644
--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -1,17 +1,17 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 quantization_bit: 4
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/galore/llama3_full_sft.yaml b/examples/extras/galore/llama3_full_sft.yaml
index 3bc074c5..7f5ce354 100644
--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -1,7 +1,7 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
@@ -11,7 +11,7 @@ galore_target: mlp,self_attn
 galore_rank: 128
 galore_scale: 2.0
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 pure_bf16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
index 0ffcb5e8..fc9bc9d3 100644
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -1,7 +1,7 @@
-# model
+### model
 model_name_or_path: models/llama3-8b-instruct-pro
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: freeze
@@ -9,7 +9,7 @@ freeze_trainable_layers: 8
 freeze_trainable_modules: all
 use_llama_pro: true
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -17,14 +17,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b-instruct-pro/freeze/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -33,7 +33,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml
index 0956aa71..c0e582d9 100644
--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 loraplus_lr_ratio: 16.0
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml
index 5dc8c061..cfcd4f8a 100644
--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
 mixture_of_depths: convert
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b-mod/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 pure_bf16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/full_multi_gpu/llama3_full_predict.yaml b/examples/full_multi_gpu/llama3_full_predict.yaml
index 5b9b680b..f037a20c 100644
--- a/examples/full_multi_gpu/llama3_full_predict.yaml
+++ b/examples/full_multi_gpu/llama3_full_predict.yaml
@@ -1,12 +1,12 @@
-# model
+### model
 model_name_or_path: saves/llama3-8b/full/sft
 
-# method
+### method
 stage: sft
 do_predict: true
 finetuning_type: full
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -14,10 +14,10 @@ max_samples: 50
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/full/predict
 overwrite_output_dir: true
 
-# eval
+### eval
 per_device_eval_batch_size: 1
 predict_with_generate: true
diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/full_multi_gpu/llama3_full_sft.yaml
index 2d8031f1..a08af5fe 100644
--- a/examples/full_multi_gpu/llama3_full_sft.yaml
+++ b/examples/full_multi_gpu/llama3_full_sft.yaml
@@ -1,16 +1,16 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: full
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -18,14 +18,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -34,7 +34,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml
index 6cc06f8a..ed39144f 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -1,16 +1,16 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -18,14 +18,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -34,7 +34,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
index 5a7348c1..1ce045c0 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
@@ -1,17 +1,17 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
index 2e9c0558..286ab503 100644
--- a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
@@ -1,17 +1,17 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# ddp
+### ddp
 ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z0_config.json
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -19,14 +19,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
 learning_rate: 0.0001
@@ -35,7 +35,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/lora_single_gpu/llama3_lora_dpo.yaml
index 16c6d0c9..615e919f 100644
--- a/examples/lora_single_gpu/llama3_lora_dpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: dpo
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 dpo_ftx: 1.0
 
-# dataset
+### dataset
 dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/dpo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_eval.yaml b/examples/lora_single_gpu/llama3_lora_eval.yaml
index 5808a47a..6fcfd6ef 100644
--- a/examples/lora_single_gpu/llama3_lora_eval.yaml
+++ b/examples/lora_single_gpu/llama3_lora_eval.yaml
@@ -1,19 +1,19 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 
-# method
+### method
 finetuning_type: lora
 
-# dataset
+### dataset
 task: mmlu
 split: test
 template: fewshot
 lang: en
 n_shot: 5
 
-# output
+### output
 save_dir: saves/llama3-8b/lora/eval
 
-# eval
+### eval
 batch_size: 4
diff --git a/examples/lora_single_gpu/llama3_lora_orpo.yaml b/examples/lora_single_gpu/llama3_lora_orpo.yaml
index bc42bdd4..6fed8735 100644
--- a/examples/lora_single_gpu/llama3_lora_orpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_orpo.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: orpo
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/orpo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_ppo.yaml b/examples/lora_single_gpu/llama3_lora_ppo.yaml
index 8d78d20d..5cd2f18f 100644
--- a/examples/lora_single_gpu/llama3_lora_ppo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_ppo.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 reward_model: saves/llama3-8b/lora/reward
 
-# method
+### method
 stage: ppo
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/ppo
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# generate
+### generate
 max_new_tokens: 512
 top_k: 0
 top_p: 0.9
diff --git a/examples/lora_single_gpu/llama3_lora_predict.yaml b/examples/lora_single_gpu/llama3_lora_predict.yaml
index 5a9de686..ba55219a 100644
--- a/examples/lora_single_gpu/llama3_lora_predict.yaml
+++ b/examples/lora_single_gpu/llama3_lora_predict.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 
-# method
+### method
 stage: sft
 do_predict: true
 finetuning_type: lora
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,10 +15,10 @@ max_samples: 50
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/predict
 overwrite_output_dir: true
 
-# eval
+### eval
 per_device_eval_batch_size: 1
 predict_with_generate: true
diff --git a/examples/lora_single_gpu/llama3_lora_pretrain.yaml b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
index 48425b15..acb18ebf 100644
--- a/examples/lora_single_gpu/llama3_lora_pretrain.yaml
+++ b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
@@ -1,27 +1,27 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: pt
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: c4_demo
 cutoff_len: 1024
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -30,7 +30,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/lora_single_gpu/llama3_lora_reward.yaml
index ecaf8d72..67baefd0 100644
--- a/examples/lora_single_gpu/llama3_lora_reward.yaml
+++ b/examples/lora_single_gpu/llama3_lora_reward.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: rm
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: orca_rlhf
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/reward
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.00001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/lora_single_gpu/llama3_lora_sft.yaml
index 0e5e30b3..e7836fd1 100644
--- a/examples/lora_single_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/lora_single_gpu/llama3_preprocess.yaml
index 4c45c1cd..59090544 100644
--- a/examples/lora_single_gpu/llama3_preprocess.yaml
+++ b/examples/lora_single_gpu/llama3_preprocess.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,6 +16,6 @@ overwrite_cache: true
 preprocessing_num_workers: 16
 tokenized_path: saves/llama3-8b/dataset/sft
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 overwrite_output_dir: true
diff --git a/examples/lora_single_gpu/llava1_5_lora_sft.yaml b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
index 84d2a672..8e4226da 100644
--- a/examples/lora_single_gpu/llava1_5_lora_sft.yaml
+++ b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: llava-hf/llava-1.5-7b-hf
 visual_inputs: true
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: mllm_demo
 template: vicuna
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llava1_5-7b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/merge_lora/llama3_gptq.yaml b/examples/merge_lora/llama3_gptq.yaml
index eac12f90..70c96a6b 100644
--- a/examples/merge_lora/llama3_gptq.yaml
+++ b/examples/merge_lora/llama3_gptq.yaml
@@ -1,8 +1,8 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 template: llama3
 
-# export
+### export
 export_dir: models/llama3_gptq
 export_quantization_bit: 4
 export_quantization_dataset: data/c4_demo.json
diff --git a/examples/merge_lora/llama3_lora_sft.yaml b/examples/merge_lora/llama3_lora_sft.yaml
index de41d48b..1e017f69 100644
--- a/examples/merge_lora/llama3_lora_sft.yaml
+++ b/examples/merge_lora/llama3_lora_sft.yaml
@@ -1,12 +1,12 @@
-# Note: DO NOT use quantized model or quantization_bit when merging lora adapters
+### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
 
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 template: llama3
 finetuning_type: lora
 
-# export
+### export
 export_dir: models/llama3_lora_sft
 export_size: 2
 export_device: cpu
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
index a1d5f95d..c8f2cff6 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
index 8941d6b2..05cb2a3f 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
index 885fcd83..d6da94d3 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
@@ -1,14 +1,14 @@
-# model
+### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 quantization_bit: 4
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -16,14 +16,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -32,7 +32,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
index 87a404a0..f2ba7490 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
@@ -1,13 +1,13 @@
-# model
+### model
 model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
 
-# method
+### method
 stage: sft
 do_train: true
 finetuning_type: lora
 lora_target: q_proj,v_proj
 
-# dataset
+### dataset
 dataset: identity,alpaca_gpt4_en
 template: llama3
 cutoff_len: 1024
@@ -15,14 +15,14 @@ max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
 
-# output
+### output
 output_dir: saves/llama3-8b/lora/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
 
-# train
+### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 learning_rate: 0.0001
@@ -31,7 +31,7 @@ lr_scheduler_type: cosine
 warmup_steps: 0.1
 fp16: true
 
-# eval
+### eval
 val_size: 0.1
 per_device_eval_batch_size: 1
 evaluation_strategy: steps

From 969e605c7e53ade2f3c66a8a53654eabacfd574b Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Fri, 17 May 2024 02:14:56 +0800
Subject: [PATCH 317/341] better dtype handle in loading

Former-commit-id: 663f0577dd61a1a31191db2c6fbb0c7cea533b21
---
 src/llamafactory/model/adapter.py |  5 ++++-
 src/llamafactory/model/loader.py  |  1 +
 src/llamafactory/model/patcher.py | 17 ++++++++++-------
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py
index 4ae95a62..e868afd6 100644
--- a/src/llamafactory/model/adapter.py
+++ b/src/llamafactory/model/adapter.py
@@ -44,7 +44,7 @@ def init_adapter(
         raise ValueError("You can only use lora for quantized models.")
 
     if deepspeed_config() is not None or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam:
-        logger.info("DeepSpeed/FSDP/PureBF16/BAdam detected, remaining trainable params in half precision.")
+        logger.info("DeepSpeed/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.")
         cast_trainable_params_to_fp32 = False
     else:
         logger.info("Upcasting trainable params to float32.")
@@ -122,6 +122,9 @@ def init_adapter(
             else:
                 param.requires_grad_(False)
 
+        if model_args.visual_inputs and hasattr(model, "vision_tower"):  # freeze vision model
+            model.vision_tower.requires_grad_(False)
+
         logger.info("Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids))))
 
     if finetuning_args.finetuning_type == "lora":
diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py
index 08cdf17f..49b347d5 100644
--- a/src/llamafactory/model/loader.py
+++ b/src/llamafactory/model/loader.py
@@ -170,6 +170,7 @@ def load_model(
         )
     else:
         param_stats = "all params: {:d}".format(all_param)
+
     logger.info(param_stats)
 
     if model_args.print_param_status:
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index 603e3c9e..9297ef00 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict
 import torch
 from peft import PeftModel
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
-from transformers.integrations import is_deepspeed_zero3_enabled
+from transformers.integrations import deepspeed_config, is_deepspeed_zero3_enabled
 from transformers.modeling_utils import is_fsdp_enabled
 
 from ..extras.logging import get_logger
@@ -66,13 +66,16 @@ def patch_config(
         for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
             setattr(config, dtype_name, model_args.compute_dtype == dtype)
 
-    if getattr(config, "model_type", None) == "qwen2" and is_trainable and model_args.flash_attn:
-        setattr(config, "use_cache", False)  # qwen2 does not support use_cache when using flashattn
+    if getattr(config, "model_type", None) == "qwen2" and is_trainable and model_args.flash_attn == "fa2":
+        setattr(config, "use_cache", False)  # qwen2 does not support use_cache when using flash attn
 
-    init_kwargs["torch_dtype"] = model_args.compute_dtype
-    if not is_deepspeed_zero3_enabled() and not is_fsdp_enabled():
-        init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage
-        if init_kwargs["low_cpu_mem_usage"]:
+    # deepspeed zero3 is not compatible with low_cpu_mem_usage
+    init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage and (not is_deepspeed_zero3_enabled())
+
+    if deepspeed_config() is None and not is_fsdp_enabled():  # set dtype and device map if not use deepspeed or fsdp
+        init_kwargs["torch_dtype"] = model_args.compute_dtype
+
+        if init_kwargs["low_cpu_mem_usage"]:  # device map requires low_cpu_mem_usage=True
             if "device_map" not in init_kwargs and model_args.device_map:
                 init_kwargs["device_map"] = model_args.device_map
 

From 92b3697e2cb2e80f38b519bc1938fcf64d7b9e6c Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Fri, 17 May 2024 02:21:10 +0800
Subject: [PATCH 318/341] update badam example #3764

Former-commit-id: a3730fd0a96bab869be6d695031182dabaea8137
---
 examples/extras/badam/llama3_lora_sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_lora_sft.yaml
index c8c00431..24322356 100644
--- a/examples/extras/badam/llama3_lora_sft.yaml
+++ b/examples/extras/badam/llama3_lora_sft.yaml
@@ -6,7 +6,7 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
-badam_switch_mode: descending
+badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
 

From 66b5634ebf4ebe0f6891880d0d5189fea886a176 Mon Sep 17 00:00:00 2001
From: "enji.zhou" <enji.zhou@amh-group.com>
Date: Fri, 17 May 2024 13:09:17 +0800
Subject: [PATCH 319/341] add kto

Former-commit-id: ec51986cf70b0bdd79b8141e45916670fb97a08e
---
 src/llamafactory/data/__init__.py           |   3 +-
 src/llamafactory/data/aligner.py            |   4 +-
 src/llamafactory/data/collator.py           |  33 ++++
 src/llamafactory/data/loader.py             |   2 +-
 src/llamafactory/data/parser.py             |   3 +-
 src/llamafactory/data/preprocess.py         | 104 +++++++++-
 src/llamafactory/extras/constants.py        |   1 +
 src/llamafactory/hparams/finetuning_args.py |  18 +-
 src/llamafactory/train/kto/__init__.py      |   4 +
 src/llamafactory/train/kto/trainer.py       | 206 ++++++++++++++++++++
 src/llamafactory/train/kto/workflow.py      |  78 ++++++++
 src/llamafactory/train/tuner.py             |   4 +-
 12 files changed, 452 insertions(+), 8 deletions(-)
 create mode 100644 src/llamafactory/train/kto/__init__.py
 create mode 100644 src/llamafactory/train/kto/trainer.py
 create mode 100644 src/llamafactory/train/kto/workflow.py

diff --git a/src/llamafactory/data/__init__.py b/src/llamafactory/data/__init__.py
index 792e89d9..0b3a8dcf 100644
--- a/src/llamafactory/data/__init__.py
+++ b/src/llamafactory/data/__init__.py
@@ -1,4 +1,4 @@
-from .collator import PairwiseDataCollatorWithPadding
+from .collator import PairwiseDataCollatorWithPadding,KTODataCollatorWithPadding
 from .loader import get_dataset
 from .template import Template, get_template_and_fix_tokenizer, templates
 from .utils import Role, split_dataset
@@ -6,6 +6,7 @@ from .utils import Role, split_dataset
 
 __all__ = [
     "PairwiseDataCollatorWithPadding",
+    "KTODataCollatorWithPadding",
     "get_dataset",
     "Template",
     "get_template_and_fix_tokenizer",
diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py
index 6bd12aad..2cf8a4f3 100644
--- a/src/llamafactory/data/aligner.py
+++ b/src/llamafactory/data/aligner.py
@@ -29,7 +29,7 @@ def _convert_images(images: List[Any], dataset_attr: "DatasetAttr", data_args: "
 def convert_alpaca(
     examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
 ) -> Dict[str, List[Any]]:
-    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
+    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": [], "tag": []}
     convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args)
     for i in range(len(examples[dataset_attr.prompt])):
         prompt = []
@@ -61,6 +61,7 @@ def convert_alpaca(
         outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
         outputs["tools"].append("")
         outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else [])
+        outputs["tag"].append(examples[dataset_attr.tag][i] if dataset_attr.tag else True)
 
     return outputs
 
@@ -137,6 +138,7 @@ def align_dataset(
             "system": {"dtype": "string", "_type": "Value"},
             "tools": {"dtype": "string", "_type": "Value"},
             "images": [{"_type": "Image"}],
+            "tag": {"dtype": "bool", "_type": "Value"},
         }
     )
     kwargs = {}
diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py
index 5e506546..517fa68c 100644
--- a/src/llamafactory/data/collator.py
+++ b/src/llamafactory/data/collator.py
@@ -49,3 +49,36 @@ class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
         batch = super().__call__(concatenated_features)
         batch["labels"] = self._pad_labels(batch["input_ids"], label_positions)
         return batch
+
+@dataclass
+class KTODataCollatorWithPadding(DataCollatorForSeq2Seq):
+    r"""
+    Data collator for KTO data. Collates the target and the `kl_*` sequences as two separate batches.
+    """
+    def __call__(self, features, return_tensors=None):  # NOTE(review): `return_tensors` is never used
+        concatenated_features = []  # target sequences
+        kl_concatenated_features = []  # sequences taken from the `kl_*` fields
+        tags = []  # `tag` of every feature, in input order
+        for feature in features:
+            concatenated_features.append(
+                {
+                    "input_ids": feature["input_ids"],
+                    "attention_mask": feature["attention_mask"],
+                    "labels": feature["labels"],
+                }
+            )
+            kl_concatenated_features.append(
+                {
+                    "input_ids": feature["kl_input_ids"],
+                    "attention_mask": feature["kl_attention_mask"],
+                    "labels": feature["kl_labels"],
+                }
+            )
+            tags.append(feature["tag"])
+        batch = super().__call__(concatenated_features)
+        kl_batch = super().__call__(kl_concatenated_features)  # collated independently of `batch`
+        batch["KL_completion_input_ids"] = kl_batch["input_ids"]  # key names are read back by the KTO trainer
+        batch["KL_completion_attention_mask"] = kl_batch["attention_mask"]
+        batch["kl_labels"] = kl_batch["labels"]
+        batch["tag"] = torch.tensor(tags)
+        return batch
\ No newline at end of file
diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py
index 3cc01b0d..a04bf377 100644
--- a/src/llamafactory/data/loader.py
+++ b/src/llamafactory/data/loader.py
@@ -116,7 +116,7 @@ def get_dataset(
     model_args: "ModelArguments",
     data_args: "DataArguments",
     training_args: "Seq2SeqTrainingArguments",
-    stage: Literal["pt", "sft", "rm", "ppo"],
+    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
     tokenizer: "PreTrainedTokenizer",
     processor: Optional["ProcessorMixin"] = None,
 ) -> Union["Dataset", "IterableDataset"]:
diff --git a/src/llamafactory/data/parser.py b/src/llamafactory/data/parser.py
index 848fd66c..33136551 100644
--- a/src/llamafactory/data/parser.py
+++ b/src/llamafactory/data/parser.py
@@ -28,6 +28,7 @@ class DatasetAttr:
     """ columns """
     system: Optional[str] = None
     images: Optional[str] = None
+    tag: Optional[str] = None  # name of the dataset column holding the boolean KTO tag
     """ columns for the alpaca format """
     prompt: Optional[str] = "instruction"
     query: Optional[str] = "input"
@@ -106,7 +107,7 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
         dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca")
 
         if "columns" in dataset_info[name]:
-            column_names = ["system", "images"]
+            column_names = ["system", "images", "tag"]
             if dataset_attr.formatting == "alpaca":
                 column_names.extend(["prompt", "query", "response", "history"])
             else:
diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py
index 38211b0c..4a348ce2 100644
--- a/src/llamafactory/data/preprocess.py
+++ b/src/llamafactory/data/preprocess.py
@@ -70,7 +70,7 @@ def preprocess_supervised_dataset(
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
-    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
+    model_inputs = {"input_ids": [], "attention_mask": [], "labels": [], "tag": []}
     if processor is not None:
         model_inputs["pixel_values"] = []
         preprocess_visual_inputs = partial(_preprocess_visual_inputs, processor=processor)
@@ -111,11 +111,102 @@ def preprocess_supervised_dataset(
         model_inputs["input_ids"].append(input_ids)
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
+        model_inputs["tag"].append(examples["tag"][i])  # tag of this example only, not the whole batch column
         if processor is not None:
             model_inputs["pixel_values"].append(preprocess_visual_inputs(examples["images"][i]))
 
     return model_inputs
 
+def preprocess_kto_dataset(
+    examples: Dict[str, List[Any]],
+    template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"],
+    data_args: "DataArguments",
+) -> Dict[str, List[List[int]]]:
+    r"""
+    Tokenizes a batch of examples for KTO training.
+
+    Each kept example yields two sequences that are encoded with identical rules:
+    inputs have the format `<bos> X Y <eos>` and labels the format `<ignore> ... <ignore> Y <eos>`;
+    for multi-turn examples only the prompt part of each prompt-response pair is masked.
+
+    - `input_ids` / `attention_mask` / `labels`: the prompt followed by its own response.
+    - `kl_input_ids` / `kl_attention_mask` / `kl_labels`: the prompt followed by the response found
+      at the mirrored position of the batch (a mismatched pair in general).
+
+    Examples whose prompt has an even number of messages or whose response is not exactly one
+    message are skipped. `tag` is copied through from the input example.
+    """
+    model_inputs = {
+        "input_ids": [],
+        "attention_mask": [],
+        "labels": [],
+        "kl_input_ids": [],
+        "kl_attention_mask": [],
+        "kl_labels": [],
+        "tag": [],
+    }
+    # Mismatched prompt/completion pairs for the KL dataset: reverse the order of the completions
+    # within the batch. A local list is used so that the input batch is not modified.
+    kl_responses = examples["response"][::-1]
+    if processor is not None:
+        model_inputs["pixel_values"] = []
+        preprocess_visual_inputs = partial(_preprocess_visual_inputs, processor=processor)
+
+    for i in range(len(examples["prompt"])):
+        if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
+            continue
+
+        if processor is not None:
+            examples["prompt"][i][0]["content"] = "<image>" + examples["prompt"][i][0]["content"]
+
+        input_ids, labels, kl_input_ids, kl_labels = [], [], [], []
+        # encode the matched pair first, then the mismatched (KL) pair, with identical rules
+        for messages, pair_ids, pair_labels in (
+            (examples["prompt"][i] + examples["response"][i], input_ids, labels),
+            (examples["prompt"][i] + kl_responses[i], kl_input_ids, kl_labels),
+        ):
+            for turn_idx, (source_ids, target_ids) in enumerate(
+                template.encode_multiturn(
+                    tokenizer,
+                    messages,
+                    examples["system"][i],
+                    examples["tools"][i],
+                    data_args.cutoff_len,
+                    data_args.reserved_label_len,
+                )
+            ):
+                if data_args.train_on_prompt:
+                    source_mask = source_ids
+                elif turn_idx != 0 and template.efficient_eos:
+                    source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
+                else:
+                    source_mask = [IGNORE_INDEX] * len(source_ids)
+
+                pair_ids.extend(source_ids + target_ids)
+                pair_labels.extend(source_mask + target_ids)
+
+            if template.efficient_eos:
+                pair_ids.append(tokenizer.eos_token_id)
+                pair_labels.append(tokenizer.eos_token_id)
+
+        model_inputs["input_ids"].append(input_ids)
+        model_inputs["attention_mask"].append([1] * len(input_ids))
+        model_inputs["labels"].append(labels)
+        model_inputs["kl_input_ids"].append(kl_input_ids)
+        model_inputs["kl_attention_mask"].append([1] * len(kl_input_ids))
+        model_inputs["kl_labels"].append(kl_labels)
+        model_inputs["tag"].append(examples["tag"][i])
+        if processor is not None:
+            model_inputs["pixel_values"].append(preprocess_visual_inputs(examples["images"][i]))
+
+    desirable = sum(tag is True for tag in model_inputs["tag"])
+    undesirable = sum(tag is False for tag in model_inputs["tag"])
+    logger.info("desirable data in KTO dataset: {},undesirable data in KTO dataset: {}".format(desirable, undesirable))
+    if desirable == 0 or undesirable == 0:
+        logger.warning("Your dataset only has one preference type.")
+    return model_inputs
 
 def preprocess_packed_supervised_dataset(
     examples: Dict[str, List[Any]],
@@ -289,7 +380,7 @@ def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer:
 def get_preprocess_and_print_func(
     data_args: "DataArguments",
     training_args: "Seq2SeqTrainingArguments",
-    stage: Literal["pt", "sft", "rm", "ppo"],
+    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
     template: "Template",
     tokenizer: "PreTrainedTokenizer",
     processor: Optional["ProcessorMixin"],
@@ -328,6 +419,15 @@ def get_preprocess_and_print_func(
             data_args=data_args,
         )
         print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer)
+    elif stage == "kto":
+        preprocess_func = partial(
+            preprocess_kto_dataset,
+            template=template,
+            tokenizer=tokenizer,
+            processor=processor,
+            data_args=data_args,
+        )
+        print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
     else:
         preprocess_func = partial(
             preprocess_unsupervised_dataset,
diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index 40826163..cf675296 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -45,6 +45,7 @@ TRAINING_STAGES = {
     "Reward Modeling": "rm",
     "PPO": "ppo",
     "DPO": "dpo",
+    "KTO": "kto",
     "ORPO": "orpo",
     "Pre-Training": "pt",
 }
diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py
index e728c30a..e6840518 100644
--- a/src/llamafactory/hparams/finetuning_args.py
+++ b/src/llamafactory/hparams/finetuning_args.py
@@ -133,6 +133,22 @@ class RLHFArguments:
         default=0.0,
         metadata={"help": "The supervised fine-tuning loss coefficient in DPO training."},
     )
+    kto_beta: float = field(
+        default=0.1,
+        metadata={"help": "The beta parameter for the KTO loss."},
+    )
+    kto_ftx: float = field(
+        default=0.0,
+        metadata={"help": "The supervised fine-tuning loss coefficient in KTO training."},
+    )
+    kto_desirable_weight: float = field(
+        default=1.0,
+        metadata={"help": "The desirable weight for the KTO loss."},
+    )
+    kto_undesirable_weight: float = field(
+        default=1.0,
+        metadata={"help": "The undesirable weight for the KTO loss."},
+    )
     orpo_beta: float = field(
         default=0.1,
         metadata={"help": "The beta (lambda) parameter in ORPO loss representing the weight of the SFT loss."},
@@ -291,7 +307,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
         default=False,
         metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."},
     )
-    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo"] = field(
+    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo", "kto"] = field(
         default="sft",
         metadata={"help": "Which stage will be performed in training."},
     )
diff --git a/src/llamafactory/train/kto/__init__.py b/src/llamafactory/train/kto/__init__.py
new file mode 100644
index 00000000..34c7905a
--- /dev/null
+++ b/src/llamafactory/train/kto/__init__.py
@@ -0,0 +1,4 @@
+from .workflow import run_kto
+
+
+__all__ = ["run_kto"]
diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py
new file mode 100644
index 00000000..6f9f6754
--- /dev/null
+++ b/src/llamafactory/train/kto/trainer.py
@@ -0,0 +1,206 @@
+from collections import defaultdict
+from contextlib import nullcontext
+from types import MethodType
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+import torch
+from transformers import Trainer
+from trl import KTOTrainer
+from trl.trainer.utils import disable_dropout_in_model
+
+from ...extras.constants import IGNORE_INDEX
+from ..utils import create_custom_optimzer, create_custom_scheduler
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
+    from ...hparams import FinetuningArguments
+
+
+class CustomKTOTrainer(KTOTrainer):
+    r"""KTO trainer configured from `finetuning_args`; it is initialized through `Trainer.__init__`."""
+
+    def __init__(
+        self,
+        model: Union["PreTrainedModel", torch.nn.Module],
+        ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]],
+        finetuning_args: "FinetuningArguments",
+        disable_dropout: bool = True,
+        **kwargs,
+    ):
+        if disable_dropout:
+            disable_dropout_in_model(model)
+            if ref_model is not None:
+                disable_dropout_in_model(ref_model)
+
+        self.finetuning_args = finetuning_args
+        self.reference_free = False
+        self.use_dpo_data_collator = True  # hack to avoid warning
+        self.generate_during_eval = False  # disable at evaluation
+        self.label_pad_token_id = IGNORE_INDEX
+        self.padding_value = 0
+        self.is_encoder_decoder = model.config.is_encoder_decoder
+        self.precompute_ref_log_probs = False
+        self._precomputed_train_ref_log_probs = False
+        self._precomputed_eval_ref_log_probs = False
+        self._peft_has_been_casted_to_bf16 = False
+        self.ref_model = ref_model
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # KTO parameter
+        self.beta = finetuning_args.kto_beta
+        self.ftx_gamma = finetuning_args.kto_ftx
+        self.desirable_weight = finetuning_args.kto_desirable_weight
+        self.undesirable_weight = finetuning_args.kto_undesirable_weight
+
+        # initialize via the base `Trainer` directly; `KTOTrainer.__init__` is not called here
+        Trainer.__init__(self, model=model, **kwargs)
+        if not hasattr(self, "accelerator"):
+            raise AttributeError("Please update `transformers`.")
+
+        if ref_model is not None:
+            if self.is_deepspeed_enabled:
+                if not (
+                    getattr(ref_model, "is_loaded_in_8bit", False) or getattr(ref_model, "is_loaded_in_4bit", False)
+                ):  # quantized models are already set on the correct device
+                    self.ref_model = self._prepare_deepspeed(self.ref_model)
+            else:
+                self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
+
+        if finetuning_args.use_badam:
+            from badam import clip_grad_norm_for_sparse_tensor
+
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+
+    def create_optimizer(self) -> "torch.optim.Optimizer":
+        if self.optimizer is None:  # a custom optimizer is only requested when none is set yet
+            self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
+        return super().create_optimizer()
+
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None
+    ) -> "torch.optim.lr_scheduler.LRScheduler":
+        create_custom_scheduler(self.args, num_training_steps, optimizer)
+        return super().create_scheduler(num_training_steps, optimizer)
+
+    def sft_loss(self, chosen_logits: "torch.FloatTensor", chosen_labels: "torch.LongTensor") -> "torch.Tensor":
+        r"""
+        Computes the auxiliary SFT loss from `get_batch_logps` called with `average_log_prob=True`.
+        Returns:
+            A scalar tensor: the negated `nanmean` of the values returned by `get_batch_logps`.
+        """
+        all_logps = self.get_batch_logps(chosen_logits, chosen_labels, average_log_prob=True)
+        return -all_logps.nanmean()
+
+    def forward(
+        self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
+    ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
+        r"""Returns log-probs and logits of the batch split by `batch["tag"]`, plus the KL log-probs."""
+        with torch.no_grad():  # the KL batch is only evaluated, no gradients are kept
+            KL_logits = model(
+                batch["KL_completion_input_ids"],
+                attention_mask=batch["KL_completion_attention_mask"],
+            ).logits
+
+        completion_logits = model(
+            batch["input_ids"],
+            attention_mask=batch["attention_mask"],
+        ).logits
+
+        completion_logps = self.get_batch_logps(
+            completion_logits,
+            batch["labels"],
+            average_log_prob=False,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        KL_logps = self.get_batch_logps(
+            KL_logits,
+            batch["kl_labels"],
+            average_log_prob=False,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+        )
+
+        if completion_logps.shape[0] != len(batch["tag"]):
+            raise ValueError(
+                "There is a mismatch between the number of examples in this batch and the number of "
+                "examples for which an output sequence was predicted."
+            )
+        chosen_idx = [i for i in range(completion_logps.shape[0]) if batch["tag"][i]]  # rows with a truthy tag
+        rejected_idx = [i for i in range(completion_logps.shape[0]) if not batch["tag"][i]]  # rows with a falsy tag
+
+        chosen_logps = completion_logps[chosen_idx, ...]
+        rejected_logps = completion_logps[rejected_idx, ...]
+
+        chosen_logits = completion_logits[chosen_idx, ...]
+        rejected_logits = completion_logits[rejected_idx, ...]
+
+        return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, KL_logps)
+
+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: Dict[str, Union[List, torch.LongTensor]],
+    ):
+        """Compute the KTO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+        batch = {k: (v.to(self.accelerator.device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
+
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_KL_logps,
+        ) = self.forward(model, batch)
+
+        with torch.no_grad():
+            if self.ref_model is None:
+                ref_model = self.model  # no separate reference model was given
+                ref_context = self.accelerator.unwrap_model(self.model).disable_adapter()
+            else:
+                ref_model = self.ref_model
+                ref_context = nullcontext()
+            with ref_context:
+                (
+                    reference_chosen_logps,
+                    reference_rejected_logps,
+                    _,
+                    _,
+                    reference_KL_logps,
+                ) = self.forward(ref_model, batch)
+
+        losses, chosen_rewards, rejected_rewards, kl = self.kto_loss(
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_KL_logps,
+            reference_chosen_logps,
+            reference_rejected_logps,
+            reference_KL_logps,
+        )
+        losses = losses.nanmean()  # reduce to a scalar, ignoring NaN entries
+        if self.ftx_gamma > 1e-6 and len(batch["labels"][batch['tag']])>0:  # SFT term needs selected rows
+            losses += self.ftx_gamma * self.sft_loss(policy_chosen_logits, batch["labels"][batch['tag']])
+
+        num_chosen = torch.Tensor([len(chosen_rewards)]).to(self.accelerator.device)  # local count
+        num_rejected = torch.Tensor([len(rejected_rewards)]).to(self.accelerator.device)  # local count
+
+        all_num_chosen = self.accelerator.gather(num_chosen).sum().item()  # summed over gathered counts
+        all_num_rejected = self.accelerator.gather(num_rejected).sum().item()
+
+        if all_num_chosen > 0:
+            metrics["rewards/chosen_sum"] = self.accelerator.gather(chosen_rewards.nansum()).nansum().item()
+            metrics["logps/chosen_sum"] = self.accelerator.gather(policy_chosen_logps.nansum()).nansum().item()
+            metrics["count/chosen"] = all_num_chosen
+
+        if all_num_rejected > 0:
+            metrics["rewards/rejected_sum"] = self.accelerator.gather(rejected_rewards.nansum()).nansum().item()
+            metrics["logps/rejected_sum"] = self.accelerator.gather(policy_rejected_logps.nansum()).nansum().item()
+            metrics["count/rejected"] = all_num_rejected
+
+        metrics["kl"] = kl.item()
+
+        return losses, metrics
\ No newline at end of file
diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py
new file mode 100644
index 00000000..a2d0ec24
--- /dev/null
+++ b/src/llamafactory/train/kto/workflow.py
@@ -0,0 +1,78 @@
+from typing import TYPE_CHECKING, List, Optional
+
+from ...data import KTODataCollatorWithPadding, get_dataset, split_dataset
+from ...extras.constants import IGNORE_INDEX
+from ...extras.ploting import plot_loss
+from ...hparams import ModelArguments
+from ...model import load_model, load_tokenizer
+from ..utils import create_modelcard_and_push, create_ref_model
+from .trainer import CustomKTOTrainer
+
+
+if TYPE_CHECKING:
+    from transformers import Seq2SeqTrainingArguments, TrainerCallback
+
+    from ...hparams import DataArguments, FinetuningArguments
+
+
+def run_kto(  # workflow entry point for the "kto" stage (dispatched from run_exp in train/tuner.py)
+    model_args: "ModelArguments",
+    data_args: "DataArguments",
+    training_args: "Seq2SeqTrainingArguments",
+    finetuning_args: "FinetuningArguments",
+    callbacks: Optional[List["TrainerCallback"]] = None,
+):
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    dataset = get_dataset(model_args, data_args, training_args, stage="kto", **tokenizer_module)
+    model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
+
+    data_collator = KTODataCollatorWithPadding(
+        tokenizer=tokenizer,
+        pad_to_multiple_of=8,
+        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
+    )
+
+    # Create reference model: reuse the policy model only when no ref_model is configured and we are not training
+    if finetuning_args.ref_model is None and (not training_args.do_train):  # use the model itself
+        ref_model = model
+    else:
+        ref_model = create_ref_model(model_args, finetuning_args)
+
+    # Update arguments
+    training_args.remove_unused_columns = False  # keep every dataset column; the Trainer would otherwise drop those forward() does not accept
+
+    # Initialize our Trainer
+    trainer = CustomKTOTrainer(
+        model=model,
+        ref_model=ref_model,
+        args=training_args,
+        finetuning_args=finetuning_args,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        callbacks=callbacks,
+        **split_dataset(dataset, data_args, training_args),
+    )
+
+    # Training
+    if training_args.do_train:
+        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        trainer.save_model()
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
+            plot_loss(training_args.output_dir, keys=["loss", "eval_loss"])
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate(metric_key_prefix="eval")
+        if id(model) == id(ref_model):  # no separate reference model was created (ref_model is model), so rewards cannot be computed; drop those metrics
+            remove_keys = [key for key in metrics.keys() if "rewards" in key]
+            for key in remove_keys:
+                metrics.pop(key)
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Create model card
+    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py
index 8f103ca1..89dcb9ac 100644
--- a/src/llamafactory/train/tuner.py
+++ b/src/llamafactory/train/tuner.py
@@ -14,7 +14,7 @@ from .ppo import run_ppo
 from .pt import run_pt
 from .rm import run_rm
 from .sft import run_sft
-
+from .kto import run_kto
 
 if TYPE_CHECKING:
     from transformers import TrainerCallback
@@ -39,6 +39,8 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallb
         run_dpo(model_args, data_args, training_args, finetuning_args, callbacks)
     elif finetuning_args.stage == "orpo":
         run_orpo(model_args, data_args, training_args, finetuning_args, callbacks)
+    elif finetuning_args.stage == "kto":
+        run_kto(model_args, data_args, training_args, finetuning_args, callbacks)
     else:
         raise ValueError("Unknown task.")
 

From a32c3a50fcf1eb5e5d1f0f02fbd363548273f2fa Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Fri, 17 May 2024 13:25:36 +0800
Subject: [PATCH 320/341] add deepseek v2 lite model

Former-commit-id: 5e864e6b721d8b891b1cc2ca2dcac41babb9eaaf
---
 src/llamafactory/extras/constants.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index 40826163..6b967517 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -323,6 +323,9 @@ register_model_group(
             DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-base",
             DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-base",
         },
+        "DeepSeek-MoE-16B-v2-Base": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Lite",
+        },
         "DeepSeek-MoE-236B-Base": {
             DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2",
             DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2",
@@ -331,6 +334,9 @@ register_model_group(
             DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat",
             DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat",
         },
+        "DeepSeek-MoE-16B-v2-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Lite-Chat",
+        },
         "DeepSeek-MoE-236B-Chat": {
             DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat",
             DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat",

From 6373d307ec47415fc76dd0f55cccd25137d401bd Mon Sep 17 00:00:00 2001
From: juejuezi <juejuezi.git@foxmail.com>
Date: Fri, 17 May 2024 16:07:39 +0800
Subject: [PATCH 321/341] feat: pass the `max_lora_rank` parameter to vLLM
 backend

Former-commit-id: a8756d839405ecb5deabe885cf11d1a61564deee
---
 src/llamafactory/chat/vllm_engine.py   | 1 +
 src/llamafactory/hparams/model_args.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py
index 8d602655..ba0cc1b3 100644
--- a/src/llamafactory/chat/vllm_engine.py
+++ b/src/llamafactory/chat/vllm_engine.py
@@ -59,6 +59,7 @@ class VllmEngine(BaseEngine):
             "disable_log_requests": True,
             "enforce_eager": model_args.vllm_enforce_eager,
             "enable_lora": model_args.adapter_name_or_path is not None,
+            "max_lora_rank": model_args.vllm_max_lora_rank,
         }
 
         if model_args.visual_inputs:
diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index ac70bb3c..20ff74aa 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -117,6 +117,7 @@ class ModelArguments:
         default=False,
         metadata={"help": "Whether or not to disable CUDA graph in the vLLM engine."},
     )
+    vllm_max_lora_rank: int = field(default=8, metadata={"help": "The maximum supported rank of all LoRAs."})
     offload_folder: str = field(
         default="offload",
         metadata={"help": "Path to offload model weights."},

From 0fd1a05cec238378350b974bd995a3bc612a1736 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Fri, 17 May 2024 16:16:41 +0800
Subject: [PATCH 322/341] Update model_args.py

Former-commit-id: f40a2fe5334865763e4d513292d359317b7a091b
---
 src/llamafactory/hparams/model_args.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index 20ff74aa..5885bb09 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -117,7 +117,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "Whether or not to disable CUDA graph in the vLLM engine."},
     )
-    vllm_max_lora_rank: int = field(default=8, metadata={"help": "The maximum supported rank of all LoRAs."})
+    vllm_max_lora_rank: int = field(
+        default=8,
+        metadata={"help": "Maximum rank of all LoRAs in the vLLM engine."},
+    )
     offload_folder: str = field(
         default="offload",
         metadata={"help": "Path to offload model weights."},

From 2bff90719be31704b7271df984dda8f40d9e5d7c Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 18 May 2024 03:44:56 +0800
Subject: [PATCH 323/341] improve KTO impl., replace datasets

Former-commit-id: e56a57ddcf061de6e4acc8679f7dbf0b68364986
---
 README.md                                     |  34 ++--
 README_zh.md                                  |  34 ++--
 data/README.md                                |  32 ++--
 data/README_zh.md                             |  30 +--
 data/alpaca_data_en_52k.json.REMOVED.git-id   |   1 -
 data/alpaca_data_zh_51k.json.REMOVED.git-id   |   1 -
 data/alpaca_gpt4_data_en.json.REMOVED.git-id  |   1 -
 data/alpaca_gpt4_data_zh.json.REMOVED.git-id  |   1 -
 ...omparison_gpt4_data_en.json.REMOVED.git-id |   1 -
 ...omparison_gpt4_data_zh.json.REMOVED.git-id |   1 -
 data/example_dataset/example_dataset.py       |  37 ----
 data/glaive_toolcall_10k.json.REMOVED.git-id  |   1 -
 data/hh_rlhf_en/hh_rlhf_en.py                 |   2 +-
 data/orca_rlhf.json.REMOVED.git-id            |   1 -
 data/wiki_demo.txt                            |  30 +++
 data/wiki_demo.txt.REMOVED.git-id             |   1 -
 examples/README.md                            |   6 +
 examples/README_zh.md                         |   6 +
 examples/extras/badam/llama3_lora_sft.yaml    |   2 +-
 .../extras/fsdp_qlora/llama3_lora_sft.yaml    |   2 +-
 examples/extras/galore/llama3_full_sft.yaml   |   2 +-
 .../extras/llama_pro/llama3_freeze_sft.yaml   |   2 +-
 examples/extras/loraplus/llama3_lora_sft.yaml |   2 +-
 examples/extras/mod/llama3_full_sft.yaml      |   2 +-
 .../full_multi_gpu/llama3_full_predict.yaml   |   2 +-
 examples/full_multi_gpu/llama3_full_sft.yaml  |   2 +-
 examples/lora_multi_gpu/llama3_lora_sft.yaml  |   2 +-
 .../lora_multi_gpu/llama3_lora_sft_ds.yaml    |   2 +-
 .../lora_multi_npu/llama3_lora_sft_ds.yaml    |   2 +-
 examples/lora_single_gpu/llama3_lora_dpo.yaml |   4 +-
 examples/lora_single_gpu/llama3_lora_kto.yaml |  39 ++++
 .../lora_single_gpu/llama3_lora_orpo.yaml     |   4 +-
 examples/lora_single_gpu/llama3_lora_ppo.yaml |   2 +-
 .../lora_single_gpu/llama3_lora_predict.yaml  |   2 +-
 .../lora_single_gpu/llama3_lora_reward.yaml   |   2 +-
 examples/lora_single_gpu/llama3_lora_sft.yaml |   2 +-
 .../lora_single_gpu/llama3_preprocess.yaml    |   2 +-
 .../llama3_lora_sft_aqlm.yaml                 |   2 +-
 .../qlora_single_gpu/llama3_lora_sft_awq.yaml |   2 +-
 .../llama3_lora_sft_bitsandbytes.yaml         |   2 +-
 .../llama3_lora_sft_gptq.yaml                 |   2 +-
 src/llamafactory/data/__init__.py             |   4 +-
 src/llamafactory/data/aligner.py              |  93 +++++++--
 src/llamafactory/data/collator.py             |  29 +--
 src/llamafactory/data/loader.py               |   4 +-
 src/llamafactory/data/parser.py               |  21 +-
 src/llamafactory/data/preprocess.py           | 179 +++++++++---------
 src/llamafactory/hparams/finetuning_args.py   |  20 +-
 src/llamafactory/train/dpo/trainer.py         |   5 +-
 src/llamafactory/train/kto/trainer.py         | 105 +++++-----
 src/llamafactory/train/kto/workflow.py        |   2 +-
 src/llamafactory/train/ppo/workflow.py        |   2 +-
 src/llamafactory/train/tuner.py               |   7 +-
 53 files changed, 448 insertions(+), 330 deletions(-)
 delete mode 100644 data/alpaca_data_en_52k.json.REMOVED.git-id
 delete mode 100644 data/alpaca_data_zh_51k.json.REMOVED.git-id
 delete mode 100644 data/alpaca_gpt4_data_en.json.REMOVED.git-id
 delete mode 100644 data/alpaca_gpt4_data_zh.json.REMOVED.git-id
 delete mode 100644 data/comparison_gpt4_data_en.json.REMOVED.git-id
 delete mode 100644 data/comparison_gpt4_data_zh.json.REMOVED.git-id
 delete mode 100644 data/example_dataset/example_dataset.py
 delete mode 100644 data/glaive_toolcall_10k.json.REMOVED.git-id
 delete mode 100644 data/orca_rlhf.json.REMOVED.git-id
 create mode 100644 data/wiki_demo.txt
 delete mode 100644 data/wiki_demo.txt.REMOVED.git-id
 create mode 100644 examples/lora_single_gpu/llama3_lora_kto.yaml

diff --git a/README.md b/README.md
index a41415fd..da81a929 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ Choose your path:
 ## Features
 
 - **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc.
-- **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO and ORPO.
+- **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO and ORPO.
 - **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8.
 - **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and Agent tuning.
 - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA.
@@ -69,14 +69,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[24/05/18] We supported **[KTO](https://arxiv.org/abs/2402.01306)** algorithm for preference learning. See [examples](examples/README.md) for usage.
+
 [24/05/14] We supported training and inference on the Ascend NPU devices. Check [installation](#installation) section for details.
 
 [24/05/13] We supported fine-tuning the **Yi-1.5** series models.
 
-[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See [examples](examples/README.md) for usage.
-
 <details><summary>Full Changelog</summary>
 
+[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See [examples](examples/README.md) for usage.
+
 [24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details.
 
 [24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See [examples](examples/README.md) for usage.
@@ -188,6 +190,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 | Reward Modeling        | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | PPO Training           | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | DPO Training           | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| KTO Training           | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | ORPO Training          | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 
 ## Provided Datasets
@@ -208,12 +211,12 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 
 <details><summary>Supervised fine-tuning datasets</summary>
 
-- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
-- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
-- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Identity (en&zh)](data/identity.json)
-- [Open Assistant (zh)](https://huggingface.co/datasets/OpenAssistant/oasst1)
-- [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
+- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
+- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3)
+- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Glaive Function Calling V2 (en&zh)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
+- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
 - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
 - [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
@@ -222,7 +225,6 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
 - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
 - [UltraChat (en)](https://github.com/thunlp/UltraChat)
-- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
 - [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
 - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
 - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
@@ -235,15 +237,16 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
 - [deepctrl (en&zh)](https://www.modelscope.cn/datasets/deepctrl/deepctrl-sft-data)
-- [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
+- [Advertise Generating (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
 - [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
 - [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
 - [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
 - [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
 - [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
 - [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
-- [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
 - [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)
+- [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction)
+- [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo)
 - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
 - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
@@ -259,13 +262,12 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t
 
 <details><summary>Preference datasets</summary>
 
-- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
-- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-- [Orca DPO (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
-- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
-- [Open Assistant (zh)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
+- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
+- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
 - [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
+- [KTO mixed (en)](https://huggingface.co/datasets/argilla/kto-mix-15k)
 
 </details>
 
diff --git a/README_zh.md b/README_zh.md
index 4f8ffa28..b8f5e6ab 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -45,7 +45,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 ## 项目特色
 
 - **多种模型**：LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。
-- **集成方法**：（增量）预训练、（多模态）指令监督微调、奖励模型训练、PPO 训练、DPO 训练和 ORPO 训练。
+- **集成方法**：（增量）预训练、（多模态）指令监督微调、奖励模型训练、PPO 训练、DPO 训练、KTO 训练和 ORPO 训练。
 - **多种精度**：32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。
 - **先进算法**：GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ 和 Agent 微调。
 - **实用技巧**：FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。
@@ -69,14 +69,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 ## 更新日志
 
+[24/05/18] 我们支持了 **[KTO](https://arxiv.org/abs/2402.01306)** 偏好对齐算法。详细用法请参照 [examples](examples/README_zh.md)。
+
 [24/05/14] 我们支持了昇腾 NPU 设备的训练和推理。详情请查阅[安装](#安装-llama-factory)部分。
 
 [24/05/13] 我们支持了 Yi-1.5 系列模型的微调。
 
-[24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 [examples](examples/README_zh.md)。
-
 <details><summary>展开日志</summary>
 
+[24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 [examples](examples/README_zh.md)。
+
 [24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型，详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。
 
 [24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 [examples](examples/README_zh.md)。
@@ -188,6 +190,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | 奖励模型训练            | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | PPO 训练               | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | DPO 训练               | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| KTO 训练               | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 | ORPO 训练              | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
 
 ## 数据集
@@ -208,12 +211,12 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 <details><summary>指令微调数据集</summary>
 
-- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
-- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
-- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Identity (en&zh)](data/identity.json)
-- [Open Assistant (zh)](https://huggingface.co/datasets/OpenAssistant/oasst1)
-- [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
+- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
+- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3)
+- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Glaive Function Calling V2 (en&zh)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
+- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
 - [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
 - [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
@@ -222,7 +225,6 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
 - [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
 - [UltraChat (en)](https://github.com/thunlp/UltraChat)
-- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
 - [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
 - [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
 - [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
@@ -235,15 +237,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
 - [deepctrl (en&zh)](https://www.modelscope.cn/datasets/deepctrl/deepctrl-sft-data)
-- [Ad Gen (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
+- [Advertise Generating (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
 - [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
 - [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
 - [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
 - [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
 - [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
 - [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
-- [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
 - [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)
+- [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction)
+- [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo)
 - [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
 - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
@@ -259,13 +262,12 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 
 <details><summary>偏好数据集</summary>
 
-- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
-- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-- [Orca DPO (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
-- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
 - [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
-- [Open Assistant (zh)](https://huggingface.co/datasets/OpenAssistant/oasst1)
+- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
+- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
+- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
 - [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
+- [KTO mixed (en)](https://huggingface.co/datasets/argilla/kto-mix-15k)
 
 </details>
 
diff --git a/data/README.md b/data/README.md
index 012de4e7..b1368d4a 100644
--- a/data/README.md
+++ b/data/README.md
@@ -19,7 +19,10 @@ If you are using a custom dataset, please add your **dataset description** to `d
     "messages": "the column name in the dataset containing the messages. (default: conversations)",
     "system": "the column name in the dataset containing the system prompts. (default: None)",
     "tools": "the column name in the dataset containing the tool description. (default: None)",
-    "images": "the column name in the dataset containing the image inputs. (default: None)"
+    "images": "the column name in the dataset containing the image inputs. (default: None)",
+    "chosen": "the column name in the dataset containing the chosen answers. (default: None)",
+    "rejected": "the column name in the dataset containing the rejected answers. (default: None)",
+    "kto_tag": "the column name in the dataset containing the kto tags. (default: None)"
   },
   "tags (optional, used for the sharegpt format)": {
     "role_tag": "the key in the message represents the identity. (default: from)",
@@ -42,13 +45,13 @@ Currently we support dataset in **alpaca** or **sharegpt** format, the dataset i
 ```json
 [
   {
-    "instruction": "user instruction (required)",
-    "input": "user input (optional)",
+    "instruction": "human instruction (required)",
+    "input": "human input (optional)",
     "output": "model response (required)",
     "system": "system prompt (optional)",
     "history": [
-      ["user instruction in the first round (optional)", "model response in the first round (optional)"],
-      ["user instruction in the second round (optional)", "model response in the second round (optional)"]
+      ["human instruction in the first round (optional)", "model response in the first round (optional)"],
+      ["human instruction in the second round (optional)", "model response in the second round (optional)"]
     ]
   }
 ]
@@ -69,7 +72,7 @@ Regarding the above dataset, the description in `dataset_info.json` should be:
 }
 ```
 
-The `query` column will be concatenated with the `prompt` column and used as the user prompt, then the user prompt would be `prompt\nquery`. The `response` column represents the model response.
+The `query` column will be concatenated with the `prompt` column and used as the human prompt, then the human prompt would be `prompt\nquery`. The `response` column represents the model response.
 
 The `system` column will be used as the system prompt. The `history` column is a list consisting string tuples representing prompt-response pairs in the history. Note that the responses in the history **will also be used for training** in supervised fine-tuning.
 
@@ -98,12 +101,10 @@ For the **preference datasets**, the `response` column should be a string list w
 ```json
 [
   {
-    "instruction": "user instruction",
-    "input": "user input",
-    "output": [
-      "chosen answer",
-      "rejected answer"
-    ]
+    "instruction": "human instruction",
+    "input": "human input",
+    "chosen": "chosen answer",
+    "rejected": "rejected answer"
   }
 ]
 ```
@@ -117,7 +118,8 @@ Regarding the above dataset, the description in `dataset_info.json` should be:
   "columns": {
     "prompt": "instruction",
     "query": "input",
-    "response": "output",
+    "chosen": "chosen",
+    "rejected": "rejected"
   }
 }
 ```
@@ -132,7 +134,7 @@ The dataset in **sharegpt** format should follow the below format:
     "conversations": [
       {
         "from": "human",
-        "value": "user instruction"
+        "value": "human instruction"
       },
       {
         "from": "gpt",
@@ -179,7 +181,7 @@ We also supports the dataset in the **openai** format:
       },
       {
         "role": "user",
-        "content": "user instruction"
+        "content": "human instruction"
       },
       {
         "role": "assistant",
diff --git a/data/README_zh.md b/data/README_zh.md
index 6449c5d5..deed94c5 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -19,7 +19,10 @@
     "messages": "数据集代表消息列表的表头名称（默认：conversations）",
     "system": "数据集代表系统提示的表头名称（默认：None）",
     "tools": "数据集代表工具描述的表头名称（默认：None）",
-    "images": "数据集代表图像输入的表头名称（默认：None）"
+    "images": "数据集代表图像输入的表头名称（默认：None）",
+    "chosen": "数据集代表更优回复的表头名称（默认：None）",
+    "rejected": "数据集代表更差回复的表头名称（默认：None）",
+    "kto_tag": "数据集代表 KTO 标签的表头名称（默认：None）"
   },
   "tags（可选，用于 sharegpt 格式）": {
     "role_tag": "消息中代表发送者身份的键名（默认：from）",
@@ -42,8 +45,8 @@
 ```json
 [
   {
-    "instruction": "用户指令（必填）",
-    "input": "用户输入（选填）",
+    "instruction": "人类指令（必填）",
+    "input": "人类输入（选填）",
     "output": "模型回答（必填）",
     "system": "系统提示词（选填）",
     "history": [
@@ -69,7 +72,7 @@
 }
 ```
 
-其中 `query` 列对应的内容会与 `prompt` 列对应的内容拼接后作为用户指令，即用户指令为 `prompt\nquery`。`response` 列对应的内容为模型回答。
+其中 `query` 列对应的内容会与 `prompt` 列对应的内容拼接后作为人类指令，即人类指令为 `prompt\nquery`。`response` 列对应的内容为模型回答。
 
 `system` 列对应的内容将被作为系统提示词。`history` 列是由多个字符串二元组构成的列表，分别代表历史消息中每轮的指令和回答。注意在指令监督学习时，历史消息中的回答**也会被用于训练**。
 
@@ -98,12 +101,10 @@
 ```json
 [
   {
-    "instruction": "用户指令",
-    "input": "用户输入",
-    "output": [
-      "优质回答",
-      "劣质回答"
-    ]
+    "instruction": "人类指令",
+    "input": "人类输入",
+    "chosen": "优质回答",
+    "rejected": "劣质回答"
   }
 ]
 ```
@@ -117,7 +118,8 @@
   "columns": {
     "prompt": "instruction",
     "query": "input",
-    "response": "output",
+    "chosen": "chosen",
+    "rejected": "rejected"
   }
 }
 ```
@@ -132,7 +134,7 @@
     "conversations": [
       {
         "from": "human",
-        "value": "用户指令"
+        "value": "人类指令"
       },
       {
         "from": "gpt",
@@ -165,7 +167,7 @@
 }
 ```
 
-其中 `messages` 列应当是一个列表，且符合 `用户/模型/用户/模型/用户/模型` 的顺序。
+其中 `messages` 列应当是一个列表，且符合 `人类/模型/人类/模型/人类/模型` 的顺序。
 
 我们同样支持 **openai** 格式的数据集：
 
@@ -179,7 +181,7 @@
       },
       {
         "role": "user",
-        "content": "用户指令"
+        "content": "人类指令"
       },
       {
         "role": "assistant",
diff --git a/data/alpaca_data_en_52k.json.REMOVED.git-id b/data/alpaca_data_en_52k.json.REMOVED.git-id
deleted file mode 100644
index 5568c425..00000000
--- a/data/alpaca_data_en_52k.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-3779ddbc040543ab1834ef216c983d6fcc06cc9a
\ No newline at end of file
diff --git a/data/alpaca_data_zh_51k.json.REMOVED.git-id b/data/alpaca_data_zh_51k.json.REMOVED.git-id
deleted file mode 100644
index 0cd1db46..00000000
--- a/data/alpaca_data_zh_51k.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-a97cf9475291591843976554878568e046d8a46d
\ No newline at end of file
diff --git a/data/alpaca_gpt4_data_en.json.REMOVED.git-id b/data/alpaca_gpt4_data_en.json.REMOVED.git-id
deleted file mode 100644
index 15985776..00000000
--- a/data/alpaca_gpt4_data_en.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-25508714b7879a1e5a6764ba7f979a980f549f1a
\ No newline at end of file
diff --git a/data/alpaca_gpt4_data_zh.json.REMOVED.git-id b/data/alpaca_gpt4_data_zh.json.REMOVED.git-id
deleted file mode 100644
index c86d1aea..00000000
--- a/data/alpaca_gpt4_data_zh.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-7cb6a7d11455bddc3d495750a2392683d775b184
\ No newline at end of file
diff --git a/data/comparison_gpt4_data_en.json.REMOVED.git-id b/data/comparison_gpt4_data_en.json.REMOVED.git-id
deleted file mode 100644
index 884ac974..00000000
--- a/data/comparison_gpt4_data_en.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-f5cb08305ff5dc9c17a09809c54c8c8834aadc70
\ No newline at end of file
diff --git a/data/comparison_gpt4_data_zh.json.REMOVED.git-id b/data/comparison_gpt4_data_zh.json.REMOVED.git-id
deleted file mode 100644
index dbc830e7..00000000
--- a/data/comparison_gpt4_data_zh.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-aee47b7b443496e37808d7f34ef10403ff99bcc3
\ No newline at end of file
diff --git a/data/example_dataset/example_dataset.py b/data/example_dataset/example_dataset.py
deleted file mode 100644
index bf0baa54..00000000
--- a/data/example_dataset/example_dataset.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import json
-from typing import Any, Dict, Generator, List, Tuple
-
-import datasets
-
-
-_DESCRIPTION = "An example of dataset."
-_CITATION = ""
-_HOMEPAGE = ""
-_LICENSE = ""
-_URL = "examples.json"
-
-
-class ExampleDataset(datasets.GeneratorBasedBuilder):
-    VERSION = datasets.Version("0.0.0")
-
-    def _info(self) -> datasets.DatasetInfo:
-        features = datasets.Features(
-            {
-                "instruction": datasets.Value("string"),
-                "input": datasets.Value("string"),
-                "output": datasets.Value("string"),
-                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
-            }
-        )
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
-        )
-
-    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
-        file_path = dl_manager.download(_URL)
-        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
-
-    def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
-        example_dataset = json.load(open(filepath, "r", encoding="utf-8"))
-        for key, example in enumerate(example_dataset):
-            yield key, example
diff --git a/data/glaive_toolcall_10k.json.REMOVED.git-id b/data/glaive_toolcall_10k.json.REMOVED.git-id
deleted file mode 100644
index 64693b28..00000000
--- a/data/glaive_toolcall_10k.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-4748dff00d1dc42768a5b6cc772143c313017812
\ No newline at end of file
diff --git a/data/hh_rlhf_en/hh_rlhf_en.py b/data/hh_rlhf_en/hh_rlhf_en.py
index abe4673c..1bc18f4f 100644
--- a/data/hh_rlhf_en/hh_rlhf_en.py
+++ b/data/hh_rlhf_en/hh_rlhf_en.py
@@ -79,5 +79,5 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
                             break
                         prompt = prompt[:human_idx]
 
-                    yield key, {"instruction": query, "output": [r_accept, r_reject], "history": history}
+                    yield key, {"instruction": query, "chosen": r_accept, "rejected": r_reject, "history": history}
                     key += 1
diff --git a/data/orca_rlhf.json.REMOVED.git-id b/data/orca_rlhf.json.REMOVED.git-id
deleted file mode 100644
index 45f1a9ac..00000000
--- a/data/orca_rlhf.json.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-736bcedea2b24a1414765c6d69cbdafaea839f3c
\ No newline at end of file
diff --git a/data/wiki_demo.txt b/data/wiki_demo.txt
new file mode 100644
index 00000000..cbd09e83
--- /dev/null
+++ b/data/wiki_demo.txt
@@ -0,0 +1,30 @@
+Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.Humans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished in most parts of the world and had a significant role in workers' struggles for emancipation. Various anarchist schools of thought formed during this period. Anarchists have taken part in several revolutions, most notably in the Paris Commune, the Russian Civil War and the Spanish Civil War, whose end marked the end of the classical era of anarchism. In the last decades of the 20th and into the 21st century, the anarchist movement has been resurgent once more.Anarchism employs a diversity of tactics in order to meet its ideal ends which can be broadly separated into revolutionary and evolutionary tactics; there is significant overlap between the two, which are merely descriptive. Revolutionary tactics aim to bring down authority and state, having taken a violent turn in the past, while evolutionary tactics aim to prefigure what an anarchist society would be like. Anarchist thought, criticism, and praxis have played a part in diverse areas of human society. 
Criticisms of anarchism include claims that it is internally inconsistent, violent, or utopian.Etymology, terminology, and definition The etymological origin of anarchism is from the Ancient Greek anarkhia, meaning "without a ruler", composed of the prefix an- ("without") and the word arkhos ("leader" or "ruler"). The suffix -ism denotes the ideological current that favours anarchy. Anarchism appears in English from 1642 as anarchisme and anarchy from 1539; early English usages emphasised a sense of disorder. Various factions within the French Revolution labelled their opponents as anarchists, although few such accused shared many views with later anarchists. Many revolutionaries of the 19th century such as William Godwin (1756–1836) and Wilhelm Weitling (1808–1871) would contribute to the anarchist doctrines of the next generation but did not use anarchist or anarchism in describing themselves or their beliefs.The first political philosopher to call himself an anarchist (French: anarchiste) was Pierre-Joseph Proudhon (1809–1865), marking the formal birth of anarchism in the mid-19th century. Since the 1890s and beginning in France, libertarianism has often been used as a synonym for anarchism and its use as a synonym is still common outside the United States. Some usages of libertarianism refer to individualistic free-market philosophy only, and free-market anarchism in particular is termed libertarian anarchism.While the term libertarian has been largely synonymous with anarchism, its meaning has more recently diluted with wider adoption from ideologically disparate groups, including both the New Left and libertarian Marxists, who do not associate themselves with authoritarian socialists or a vanguard party, and extreme cultural liberals, who are primarily concerned with civil liberties. Additionally, some anarchists use libertarian socialist to avoid anarchism's negative connotations and emphasise its connections with socialism. 
Anarchism is broadly used to describe the anti-authoritarian wing of the socialist movement. Anarchism is contrasted to socialist forms which are state-oriented or from above. Scholars of anarchism generally highlight anarchism's socialist credentials and criticise attempts at creating dichotomies between the two. Some scholars describe anarchism as having many influences from liberalism, and being both liberals and socialists but more so, while most scholars reject anarcho-capitalism as a misunderstanding of anarchist principles.While opposition to the state is central to anarchist thought, defining anarchism is not an easy task for scholars, as there is a lot of discussion among scholars and anarchists on the matter, and various currents perceive anarchism slightly differently. Major definitional elements include the will for a non-coercive society, the rejection of the state apparatus, the belief that human nature allows humans to exist in or progress toward such a non-coercive society, and a suggestion on how to act to pursue the ideal of anarchy.HistoryPre-modern era Before the establishment of towns and cities, an established authority did not exist. It was after the creation of institutions of authority that anarchistic ideas espoused as a reaction. The most notable precursors to anarchism in the ancient world were in China and Greece. In China, philosophical anarchism (the discussion on the legitimacy of the state) was delineated by Taoist philosophers Zhuang Zhou and Laozi. Alongside Stoicism, Taoism has been said to have had "significant anticipations" of anarchism. Anarchic attitudes were also articulated by tragedians and philosophers in Greece. Aeschylus and Sophocles used the myth of Antigone to illustrate the conflict between rules set by the state and personal autonomy. Socrates questioned Athenian authorities constantly and insisted on the right of individual freedom of conscience. 
Cynics dismissed human law (nomos) and associated authorities while trying to live according to nature (physis). Stoics were supportive of a society based on unofficial and friendly relations among its citizens without the presence of a state.In medieval Europe, there was no anarchistic activity except some ascetic religious movements. These, and other Muslim movements, later gave birth to religious anarchism. In the Sasanian Empire, Mazdak called for an egalitarian society and the abolition of monarchy, only to be soon executed by Emperor Kavad I.In Basra, religious sects preached against the state. In Europe, various sects developed anti-state and libertarian tendencies. Renewed interest in antiquity during the Renaissance and in private judgment during the Reformation restored elements of anti-authoritarian secularism, particularly in France. Enlightenment challenges to intellectual authority (secular and religious) and the revolutions of the 1790s and 1848 all spurred the ideological development of what became the era of classical anarchism.Modern era During the French Revolution, partisan groups such as the Enragés and the sans-culottes saw a turning point in the fermentation of anti-state and federalist sentiments. The first anarchist currents developed throughout the 18th century as William Godwin espoused philosophical anarchism in England, morally delegitimising the state, Max Stirner's thinking paved the way to individualism and Pierre-Joseph Proudhon's theory of mutualism found fertile soil in France. By the late 1870s, various anarchist schools of thought had become well-defined and a wave of then unprecedented globalisation occurred from 1880 to 1914. 
This era of classical anarchism lasted until the end of the Spanish Civil War and is considered the golden age of anarchism.Drawing from mutualism, Mikhail Bakunin founded collectivist anarchism and entered the International Workingmen's Association, a class worker union later known as the First International that formed in 1864 to unite diverse revolutionary currents. The International became a significant political force, with Karl Marx being a leading figure and a member of its General Council. Bakunin's faction (the Jura Federation) and Proudhon's followers (the mutualists) opposed state socialism, advocating political abstentionism and small property holdings. After bitter disputes, the Bakuninists were expelled from the International by the Marxists at the 1872 Hague Congress. Anarchists were treated similarly in the Second International, being ultimately expelled in 1896. Bakunin famously predicted that if revolutionaries gained power by Marx's terms, they would end up the new tyrants of workers. In response to their expulsion from the First International, anarchists formed the St. Imier International. Under the influence of Peter Kropotkin, a Russian philosopher and scientist, anarcho-communism overlapped with collectivism. Anarcho-communists, who drew inspiration from the 1871 Paris Commune, advocated for free federation and for the distribution of goods according to one's needs.At the turn of the century, anarchism had spread all over the world. It was a notable feature of the international syndicalism movement. In China, small groups of students imported the humanistic pro-science version of anarcho-communism. Tokyo was a hotspot for rebellious youth from countries of the far east, travelling to the Japanese capital to study. In Latin America, Argentina was a stronghold for anarcho-syndicalism, where it became the most prominent left-wing ideology. During this time, a minority of anarchists adopted tactics of revolutionary political violence. 
This strategy became known as propaganda of the deed. The dismemberment of the French socialist movement into many groups and the execution and exile of many Communards to penal colonies following the suppression of the Paris Commune favoured individualist political expression and acts. Even though many anarchists distanced themselves from these terrorist acts, infamy came upon the movement and attempts were made to exclude them from American immigration, including the Immigration Act of 1903, also called the Anarchist Exclusion Act. Illegalism was another strategy which some anarchists adopted during this period.Despite concerns, anarchists enthusiastically participated in the Russian Revolution in opposition to the White movement; however, they met harsh suppression after the Bolshevik government was stabilised. Several anarchists from Petrograd and Moscow fled to Ukraine, notably leading to the Kronstadt rebellion and Nestor Makhno's struggle in the Free Territory. With the anarchists being crushed in Russia, two new antithetical currents emerged, namely platformism and synthesis anarchism. The former sought to create a coherent group that would push for revolution while the latter were against anything that would resemble a political party. Seeing the victories of the Bolsheviks in the October Revolution and the resulting Russian Civil War, many workers and activists turned to communist parties which grew at the expense of anarchism and other socialist movements. In France and the United States, members of major syndicalist movements such as the General Confederation of Labour and the Industrial Workers of the World left their organisations and joined the Communist International.In the Spanish Civil War of 1936, anarchists and syndicalists (CNT and FAI) once again allied themselves with various currents of leftists. A long tradition of Spanish anarchism led to anarchists playing a pivotal role in the war. 
In response to the army rebellion, an anarchist-inspired movement of peasants and workers, supported by armed militias, took control of Barcelona and of large areas of rural Spain, where they collectivised the land. The Soviet Union provided some limited assistance at the beginning of the war, but the result was a bitter fight among communists and anarchists at a series of events named May Days as Joseph Stalin tried to seize control of the Republicans.Post-war era At the end of World War II, the anarchist movement was severely weakened. The 1960s witnessed a revival of anarchism, likely caused by a perceived failure of Marxism–Leninism and tensions built by the Cold War. During this time, anarchism found a presence in other movements critical towards both capitalism and the state such as the anti-nuclear, environmental, and peace movements, the counterculture of the 1960s, and the New Left. It also saw a transition from its previous revolutionary nature to provocative anti-capitalist reformism. Anarchism became associated with punk subculture as exemplified by bands such as Crass and the Sex Pistols. The established feminist tendencies of anarcha-feminism returned with vigour during the second wave of feminism. Black anarchism began to take form at this time and influenced anarchism's move from a Eurocentric demographic. This coincided with its failure to gain traction in Northern Europe and its unprecedented height in Latin America.Around the turn of the 21st century, anarchism grew in popularity and influence within anti-capitalist, anti-war and anti-globalisation movements. Anarchists became known for their involvement in protests against the World Trade Organization (WTO), the Group of Eight and the World Economic Forum. During the protests, ad hoc leaderless anonymous cadres known as black blocs engaged in rioting, property destruction and violent confrontations with the police. 
Other organisational tactics pioneered in this time include affinity groups, security culture and the use of decentralised technologies such as the Internet. A significant event of this period was the confrontations at the 1999 Seattle WTO conference. Anarchist ideas have been influential in the development of the Zapatistas in Mexico and the Democratic Federation of Northern Syria, more commonly known as Rojava, a de facto autonomous region in northern Syria.Thought Anarchist schools of thought have been generally grouped into two main historical traditions, social anarchism and individualist anarchism, owing to their different origins, values and evolution. The individualist current emphasises negative liberty in opposing restraints upon the free individual, while the social current emphasises positive liberty in aiming to achieve the free potential of society through equality and social ownership. In a chronological sense, anarchism can be segmented by the classical currents of the late 19th century and the post-classical currents (anarcha-feminism, green anarchism, and post-anarchism) developed thereafter.Beyond the specific factions of anarchist movements which constitute political anarchism lies philosophical anarchism which holds that the state lacks moral legitimacy, without necessarily accepting the imperative of revolution to eliminate it. A component especially of individualist anarchism, philosophical anarchism may tolerate the existence of a minimal state but claims that citizens have no moral obligation to obey government when it conflicts with individual autonomy. Anarchism pays significant attention to moral arguments since ethics have a central role in anarchist philosophy. Anarchism's emphasis on anti-capitalism, egalitarianism, and for the extension of community and individuality sets it apart from anarcho-capitalism and other types of economic libertarianism.Anarchism is usually placed on the far-left of the political spectrum. 
Much of its economics and legal philosophy reflect anti-authoritarian, anti-statist, libertarian, and radical interpretations of left-wing and socialist politics such as collectivism, communism, individualism, mutualism, and syndicalism, among other libertarian socialist economic theories. As anarchism does not offer a fixed body of doctrine from a single particular worldview, many anarchist types and traditions exist and varieties of anarchy diverge widely. One reaction against sectarianism within the anarchist milieu was anarchism without adjectives, a call for toleration and unity among anarchists first adopted by Fernando Tarrida del Mármol in 1889 in response to the bitter debates of anarchist theory at the time. Belief in political nihilism has been espoused by anarchists. Despite separation, the various anarchist schools of thought are not seen as distinct entities but rather as tendencies that intermingle and are connected through a set of uniform principles such as individual and local autonomy, mutual aid, network organisation, communal democracy, justified authority and decentralisation.Classical Inceptive currents among classical anarchist currents were mutualism and individualism. They were followed by the major currents of social anarchism (collectivist, communist and syndicalist). They differ on organisational and economic aspects of their ideal society.Mutualism is an 18th-century economic theory that was developed into anarchist theory by Pierre-Joseph Proudhon. Its aims include reciprocity, free association, voluntary contract, federation and monetary reform of both credit and currency that would be regulated by a bank of the people. Mutualism has been retrospectively characterised as ideologically situated between individualist and collectivist forms of anarchism. In What Is Property? (1840), Proudhon first characterised his goal as a "third form of society, the synthesis of communism and property." 
Collectivist anarchism is a revolutionary socialist form of anarchism commonly associated with Mikhail Bakunin. Collectivist anarchists advocate collective ownership of the means of production which is theorised to be achieved through violent revolution and that workers be paid according to time worked, rather than goods being distributed according to need as in communism. Collectivist anarchism arose alongside Marxism but rejected the dictatorship of the proletariat despite the stated Marxist goal of a collectivist stateless society.Anarcho-communism is a theory of anarchism that advocates a communist society with common ownership of the means of production, direct democracy and a horizontal network of voluntary associations, workers' councils and worker cooperatives, with production and consumption based on the guiding principle "From each according to his ability, to each according to his need." Anarcho-communism developed from radical socialist currents after the French Revolution but was first formulated as such in the Italian section of the First International. It was later expanded upon in the theoretical work of Peter Kropotkin, whose specific style would go on to become the dominating view of anarchists by the late 19th century. Anarcho-syndicalism is a branch of anarchism that views labour syndicates as a potential force for revolutionary social change, replacing capitalism and the state with a new society democratically self-managed by workers. The basic principles of anarcho-syndicalism are direct action, workers' solidarity and workers' self-management.Individualist anarchism is a set of several traditions of thought within the anarchist movement that emphasise the individual and their will over any kinds of external determinants. Early influences on individualist forms of anarchism include William Godwin, Max Stirner, and Henry David Thoreau. 
Through many countries, individualist anarchism attracted a small yet diverse following of Bohemian artists and intellectuals as well as young anarchist outlaws in what became known as illegalism and individual reclamation.Post-classical and contemporary Anarchist principles undergird contemporary radical social movements of the left. Interest in the anarchist movement developed alongside momentum in the anti-globalisation movement, whose leading activist networks were anarchist in orientation. As the movement shaped 21st century radicalism, wider embrace of anarchist principles signaled a revival of interest. Anarchism has continued to generate many philosophies and movements, at times eclectic, drawing upon various sources and combining disparate concepts to create new philosophical approaches. The anti-capitalist tradition of classical anarchism has remained prominent within contemporary currents.Contemporary news coverage which emphasizes black bloc demonstrations has reinforced anarchism's historical association with chaos and violence. Its publicity has also led more scholars in fields such as anthropology and history to engage with the anarchist movement, although contemporary anarchism favours actions over academic theory. Various anarchist groups, tendencies, and schools of thought exist today, making it difficult to describe the contemporary anarchist movement. While theorists and activists have established "relatively stable constellations of anarchist principles", there is no consensus on which principles are core and commentators describe multiple anarchisms, rather than a singular anarchism, in which common principles are shared between schools of anarchism while each group prioritizes those principles differently. 
Gender equality can be a common principle, although it ranks as a higher priority to anarcha-feminists than anarcho-communists.Anarchists are generally committed against coercive authority in all forms, namely "all centralized and hierarchical forms of government (e.g., monarchy, representative democracy, state socialism, etc.), economic class systems (e.g., capitalism, Bolshevism, feudalism, slavery, etc.), autocratic religions (e.g., fundamentalist Islam, Roman Catholicism, etc.), patriarchy, heterosexism, white supremacy, and imperialism." Anarchist schools disagree on the methods by which these forms should be opposed. The principle of equal liberty is closer to anarchist political ethics in that it transcends both the liberal and socialist traditions. This entails that liberty and equality cannot be implemented within the state, resulting in the questioning of all forms of domination and hierarchy.Tactics Anarchists' tactics take various forms but in general serve two major goals, namely to first oppose the Establishment and secondly to promote anarchist ethics and reflect an anarchist vision of society, illustrating the unity of means and ends. A broad categorisation can be made between aims to destroy oppressive states and institutions by revolutionary means on one hand and aims to change society through evolutionary means on the other. Evolutionary tactics embrace nonviolence, reject violence and take a gradual approach to anarchist aims, although there is significant overlap between the two.Anarchist tactics have shifted during the course of the last century. Anarchists during the early 20th century focused more on strikes and militancy while contemporary anarchists use a broader array of approaches.Classical era tactics During the classical era, anarchists had a militant tendency. Not only did they confront state armed forces, as in Spain and Ukraine, but some of them also employed terrorism as propaganda of the deed. 
Assassination attempts were carried out against heads of state, some of which were successful. Anarchists also took part in revolutions. Many anarchists, especially the Galleanists, believed that these attempts would be the impetus for a revolution against capitalism and the state. Many of these attacks were done by individual assailants and the majority took place in the late 1870s, the early 1880s and the 1890s, with some still occurring in the early 1900s. Their decrease in prevalence was the result of further judicial power and targeting and cataloging by state institutions.Anarchist perspectives towards violence have always been controversial. Anarcho-pacifists advocate for non-violent means to achieve their stateless, nonviolent ends. Other anarchist groups advocate direct action, a tactic which can include acts of sabotage or terrorism. This attitude was quite prominent a century ago when seeing the state as a tyrant and some anarchists believing that they had every right to oppose its oppression by any means possible. Emma Goldman and Errico Malatesta, who were proponents of limited use of violence, stated that violence is merely a reaction to state violence as a necessary evil.Anarchists took an active role in strike actions, although they tended to be antipathetic to formal syndicalism, seeing it as reformist. They saw it as a part of the movement which sought to overthrow the state and capitalism. Anarchists also reinforced their propaganda within the arts, some of whom practiced naturism and nudism. Those anarchists also built communities which were based on friendship and were involved in the news media.Revolutionary tactics In the current era, Italian anarchist Alfredo Bonanno, a proponent of insurrectionary anarchism, has reinstated the debate on violence by rejecting the nonviolence tactic adopted since the late 19th century by Kropotkin and other prominent anarchists afterwards. 
Both Bonanno and the French group The Invisible Committee advocate for small, informal affiliation groups, where each member is responsible for their own actions but works together to bring down oppression utilizing sabotage and other violent means against state, capitalism, and other enemies. Members of The Invisible Committee were arrested in 2008 on various charges, terrorism included.Overall, contemporary anarchists are much less violent and militant than their ideological ancestors. They mostly engage in confronting the police during demonstrations and riots, especially in countries such as Canada, Greece, and Mexico. Militant black bloc protest groups are known for clashing with the police; however, anarchists not only clash with state operators, they also engage in the struggle against fascists and racists, taking anti-fascist action and mobilizing to prevent hate rallies from happening.Evolutionary tactics Anarchists commonly employ direct action. This can take the form of disrupting and protesting against unjust hierarchy, or the form of self-managing their lives through the creation of counter-institutions such as communes and non-hierarchical collectives. Decision-making is often handled in an anti-authoritarian way, with everyone having equal say in each decision, an approach known as horizontalism. Contemporary-era anarchists have been engaging with various grassroots movements that are more or less based on horizontalism, although not explicitly anarchist, respecting personal autonomy and participating in mass activism such as strikes and demonstrations. In contrast with the big-A anarchism of the classical era, the newly coined term small-a anarchism signals their tendency not to base their thoughts and actions on classical-era anarchism or to refer to classical anarchists such as Peter Kropotkin and Pierre-Joseph Proudhon to justify their opinions. 
Those anarchists would rather base their thought and praxis on their own experience which they will later theorize.The decision-making process of small anarchist affinity groups plays a significant tactical role. Anarchists have employed various methods in order to build a rough consensus among members of their group without the need of a leader or a leading group. One way is for an individual from the group to play the role of facilitator to help achieve a consensus without taking part in the discussion themselves or promoting a specific point. Minorities usually accept rough consensus, except when they feel the proposal contradicts anarchist ethics, goals and values. Anarchists usually form small groups (5–20 individuals) to enhance autonomy and friendships among their members. These kinds of groups more often than not interconnect with each other, forming larger networks. Anarchists still support and participate in strikes, especially wildcat strikes as these are leaderless strikes not organised centrally by a syndicate.As in the past, newspapers and journals are used, and anarchists have gone online in the World Wide Web to spread their message. Anarchists have found it easier to create websites because of distributional and other difficulties, hosting electronic libraries and other portals. Anarchists were also involved in developing various software that are available for free. The way these hacktivists work to develop and distribute resembles the anarchist ideals, especially when it comes to preserving users' privacy from state surveillance.Anarchists organize themselves to squat and reclaim public spaces. During important events such as protests and when spaces are being occupied, they are often called Temporary Autonomous Zones (TAZ), spaces where art, poetry, and surrealism are blended to display the anarchist ideal. 
As seen by anarchists, squatting is a way to regain urban space from the capitalist market, serving pragmatical needs and also being an exemplary direct action. Acquiring space enables anarchists to experiment with their ideas and build social bonds. Adding up these tactics while having in mind that not all anarchists share the same attitudes towards them, along with various forms of protesting at highly symbolic events, make up a carnivalesque atmosphere that is part of contemporary anarchist vividity.Key issues As anarchism is a philosophy that embodies many diverse attitudes, tendencies, and schools of thought, disagreement over questions of values, ideology, and tactics is common. Its diversity has led to widely different uses of identical terms among different anarchist traditions which has created a number of definitional concerns in anarchist theory. The compatibility of capitalism, nationalism, and religion with anarchism is widely disputed, and anarchism enjoys complex relationships with ideologies such as communism, collectivism, Marxism, and trade unionism. Anarchists may be motivated by humanism, divine authority, enlightened self-interest, veganism, or any number of alternative ethical doctrines. Phenomena such as civilisation, technology (e.g. within anarcho-primitivism), and the democratic process may be sharply criticised within some anarchist tendencies and simultaneously lauded in others.Gender, sexuality, and free love As gender and sexuality carry along them dynamics of hierarchy, many anarchists address, analyse, and oppose the suppression of one's autonomy imposed by gender roles.Sexuality was not often discussed by classical anarchists but the few that did felt that an anarchist society would lead to sexuality naturally developing. Sexual violence was a concern for anarchists such as Benjamin Tucker, who opposed age of consent laws, believing they would benefit predatory men. 
A historical current that arose and flourished during 1890 and 1920 within anarchism was free love. In contemporary anarchism, this current survives as a tendency to support polyamory and queer anarchism. Free love advocates were against marriage, which they saw as a way of men imposing authority over women, largely because marriage law greatly favoured the power of men. The notion of free love was much broader and included a critique of the established order that limited women's sexual freedom and pleasure. Those free love movements contributed to the establishment of communal houses, where large groups of travelers, anarchists and other activists slept in beds together. Free love had roots both in Europe and the United States; however, some anarchists struggled with the jealousy that arose from free love. Anarchist feminists were advocates of free love, against marriage, and pro-choice (utilising a contemporary term), and had a similar agenda. Anarchist and non-anarchist feminists differed on suffrage but were supportive of one another.During the second half of the 20th century, anarchism intermingled with the second wave of feminism, radicalising some currents of the feminist movement and being influenced as well. By the latest decades of the 20th century, anarchists and feminists were advocating for the rights and autonomy of women, gays, queers and other marginalised groups, with some feminist thinkers suggesting a fusion of the two currents. With the third wave of feminism, sexual identity and compulsory heterosexuality became a subject of study for anarchists, yielding a post-structuralist critique of sexual normality. Some anarchists distanced themselves from this line of thinking, suggesting that it leaned towards an individualism that was dropping the cause of social liberation.Anarchism and education The interest of anarchists in education stretches back to the first emergence of classical anarchism. 
Anarchists consider proper education, one which sets the foundations of the future autonomy of the individual and the society, to be an act of mutual aid. Anarchist writers such as William Godwin (Political Justice) and Max Stirner ("The False Principle of Our Education") attacked both state education and private education as another means by which the ruling class replicate their privileges.In 1901, Catalan anarchist and free thinker Francisco Ferrer established the Escuela Moderna in Barcelona as an opposition to the established education system which was dictated largely by the Catholic Church. Ferrer's approach was secular, rejecting both state and church involvement in the educational process whilst giving pupils large amounts of autonomy in planning their work and attendance. Ferrer aimed to educate the working class and explicitly sought to foster class consciousness among students. The school closed after constant harassment by the state and Ferrer was later arrested. Nonetheless, his ideas formed the inspiration for a series of modern schools around the world. Christian anarchist Leo Tolstoy, who published the essay Education and Culture, also established a similar school with its founding principle being that "for education to be effective it had to be free." In a similar token, A. S. Neill founded what became the Summerhill School in 1921, also declaring being free from coercion.Anarchist education is based largely on the idea that a child's right to develop freely and without manipulation ought to be respected and that rationality would lead children to morally good conclusions; however, there has been little consensus among anarchist figures as to what constitutes manipulation. 
Ferrer believed that moral indoctrination was necessary and explicitly taught pupils that equality, liberty and social justice were not possible under capitalism, along with other critiques of government and nationalism. Late 20th century and contemporary anarchist writers (Paul Goodman, Herbert Read, and Colin Ward) intensified and expanded the anarchist critique of state education, largely focusing on the need for a system that focuses on children's creativity rather than on their ability to attain a career or participate in consumerism as part of a consumer society. Contemporary anarchists such as Ward claim that state education serves to perpetuate socioeconomic inequality. While few anarchist education institutions have survived to the modern day, major tenets of anarchist schools, among them respect for child autonomy and relying on reasoning rather than indoctrination as a teaching method, have spread among mainstream educational institutions. Judith Suissa names three schools as explicitly anarchist schools, namely the Free Skool Santa Cruz in the United States which is part of a wider American-Canadian network of schools, the Self-Managed Learning College in Brighton, England, and the Paideia School in Spain. Anarchism and the state Objection to the state and its institutions is a sine qua non of anarchism. Anarchists consider the state as a tool of domination and believe it to be illegitimate regardless of its political tendencies. Instead of people being able to control the aspects of their life, major decisions are taken by a small elite. Authority ultimately rests solely on power, regardless of whether that power is open or transparent, as it still has the ability to coerce people. Another anarchist argument against states is that the people constituting a government, even the most altruistic among officials, will unavoidably seek to gain more power, leading to corruption. 
Anarchists consider the idea that the state is the collective will of the people to be an unachievable fiction due to the fact that the ruling class is distinct from the rest of society.Specific anarchist attitudes towards the state vary. Robert Paul Wolff believed that the tension between authority and autonomy would mean the state could never be legitimate. Bakunin saw the state as meaning "coercion, domination by means of coercion, camouflaged if possible but unceremonious and overt if need be." A. John Simmons and Leslie Green, who leaned toward philosophical anarchism, believed that the state could be legitimate if it is governed by consensus, although they saw this as highly unlikely. Beliefs on how to abolish the state also differ.Anarchism and the arts The connection between anarchism and art was quite profound during the classical era of anarchism, especially among artistic currents that were developing during that era such as futurists, surrealists and others. In literature, anarchism was mostly associated with the New Apocalyptics and the neo-romanticism movement. In music, anarchism has been associated with music scenes such as punk. Anarchists such as Leo Tolstoy and Herbert Read stated that the border between the artist and the non-artist, what separates art from a daily act, is a construct produced by the alienation caused by capitalism and it prevents humans from living a joyful life.Other anarchists advocated for or used art as a means to achieve anarchist ends. In his book Breaking the Spell: A History of Anarchist Filmmakers, Videotape Guerrillas, and Digital Ninjas, Chris Robé claims that "anarchist-inflected practices have increasingly structured movement-based video activism." Throughout the 20th century, many prominent anarchists (Peter Kropotkin, Emma Goldman, Gustav Landauer and Camillo Berneri) and publications such as Anarchy wrote about matters pertaining to the arts.Three overlapping properties made art useful to anarchists. 
It could depict a critique of existing society and hierarchies, serve as a prefigurative tool to reflect the anarchist ideal society and even turn into a means of direct action such as in protests. As it appeals to both emotion and reason, art could appeal to the whole human and have a powerful effect. The 19th-century neo-impressionist movement had an ecological aesthetic and offered an example of an anarchist perception of the road towards socialism. In Les chataigniers a Osny by anarchist painter Camille Pissarro, the blending of aesthetic and social harmony is prefiguring an ideal anarchistic agrarian community.Analysis The most common critique of anarchism is that humans cannot self-govern and so a state is necessary for human survival. Philosopher Bertrand Russell supported this critique, stating that "[p]eace and war, tariffs, regulations of sanitary conditions and the sale of noxious drugs, the preservation of a just system of distribution: these, among others, are functions which could hardly be performed in a community in which there was no central government." Another common criticism of anarchism is that it fits a world of isolation in which only the small enough entities can be self-governing; a response would be that major anarchist thinkers advocated anarchist federalism.Philosophy lecturer Andrew G. Fiala composed a list of common arguments against anarchism which includes critiques such as that anarchism is innately related to violence and destruction, not only in the pragmatic world, such as at protests, but in the world of ethics as well. Secondly, anarchism is evaluated as unfeasible or utopian since the state cannot be defeated practically. This line of arguments most often calls for political action within the system to reform it. The third argument is that anarchism is self-contradictory. While it advocates for no-one to archiei, if accepted by the many, then anarchism would turn into the ruling political theory. 
In this line of criticism also comes the self-contradiction that anarchism calls for collective action whilst endorsing the autonomy of the individual, hence no collective action can be taken. Lastly, Fiala mentions a critique towards philosophical anarchism of being ineffective (all talk and thoughts) and in the meantime capitalism and the bourgeois class remain strong. Philosophical anarchism has met the criticism of members of academia following the release of pro-anarchist books such as A. John Simmons' Moral Principles and Political Obligations. Law professor William A. Edmundson authored an essay to argue against three major philosophical anarchist principles which he finds fallacious. Edmundson says that while the individual does not owe the state a duty of obedience, this does not imply that anarchism is the inevitable conclusion and the state is still morally legitimate. In The Problem of Political Authority, Michael Huemer defends philosophical anarchism, claiming that "political authority is a moral illusion." One of the earliest criticisms is that anarchism defies and fails to understand the biological inclination to authority. Joseph Raz states that the acceptance of authority implies the belief that following their instructions will afford more success. Raz believes that this argument is true in following both authorities' successful and mistaken instruction. Anarchists reject this criticism because challenging or disobeying authority does not entail the disappearance of its advantages by acknowledging authority such as doctors or lawyers as reliable, nor does it involve a complete surrender of independent judgment. Anarchist perception of human nature, rejection of the state, and commitment to social revolution have been criticised by academics as naive, overly simplistic, and unrealistic, respectively. 
Classical anarchism has been criticised for relying too heavily on the belief that the abolition of the state will lead to human cooperation prospering.Friedrich Engels, considered to be one of the principal founders of Marxism, criticised anarchism's anti-authoritarianism as inherently counter-revolutionary because in his view a revolution is by itself authoritarian. Academic John Molyneux writes in his book Anarchism: A Marxist Criticism that "anarchism cannot win", believing that it lacks the ability to properly implement its ideas. The Marxist criticism of anarchism is that it has a utopian character because all individuals should have anarchist views and values. According to the Marxist view, that a social idea would follow directly from this human ideal and out of the free will of every individual formed its essence. Marxists state that this contradiction was responsible for their inability to act. In the anarchist vision, the conflict between liberty and equality was resolved through coexistence and intertwining.See also  Anarchism by country Governance without government List of anarchist political ideologies List of books about anarchismReferencesCitationsNotesSourcesPrimary sourcesSecondary sourcesTertiary sourcesFurther reading    Criticism of philosophical anarchism.   A defence of philosophical anarchism, stating that "both kinds of 'anarchism' [i.e. philosophical and political anarchism] are philosophical and political claims." (p. 137)  Anarchistic popular fiction novel.     An argument for philosophical anarchism.External links  Anarchy Archives. Anarchy Archives is an online research center on the history and theory of anarchism. Anti-capitalismAnti-fascismEconomic ideologiesLeft-wing politicsLibertarian socialismLibertarianismPolitical culturePolitical movementsPolitical ideologiesSocial theoriesSocialismFar-left politics
+Autism is a neurodevelopmental disorder characterized by difficulties with social interaction and communication, and by restricted and repetitive behavior. Parents often notice signs during the first three years of their child's life. These signs often develop gradually, though some autistic children experience regression in their communication and social skills after reaching developmental milestones at a normal pace. Autism is associated with a combination of genetic and environmental factors. Risk factors during pregnancy include certain infections, such as rubella, toxins including valproic acid, alcohol, cocaine, pesticides, lead, and air pollution, fetal growth restriction, and autoimmune diseases. Controversies surround other proposed environmental causes; for example, the vaccine hypothesis, which has been disproven. Autism affects information processing in the brain and how nerve cells and their synapses connect and organize; how this occurs is not well understood. The Diagnostic and Statistical Manual of Mental Disorders (DSM-5) combines forms of the condition, including Asperger syndrome and pervasive developmental disorder not otherwise specified (PDD-NOS) into the diagnosis of autism spectrum disorder (ASD). Several interventions have been shown to reduce symptoms and improve the ability of autistic people to function and participate independently in the community. Behavioral, psychological, education, and/or skill-building interventions may be used to assist autistic people to learn life skills necessary for living independently, as well as other social, communication, and language skills. Therapy also aims to reduce challenging behaviors and build upon strengths. Some autistic adults are unable to live independently. An autistic culture has developed, with some individuals seeking a cure and others believing autism should be accepted as a difference to be accommodated instead of cured. Globally, autism is estimated to affect 24.8 million people. 
In the 2000s, the number of autistic people worldwide was estimated at 1–2 per 1,000 people. In the developed countries, about 1.5% of children are diagnosed with ASD, up from 0.7% in 2000 in the United States. It is diagnosed four to five times more often in males than females. The number of people diagnosed has increased considerably since the 1990s, which may be partly due to increased recognition of the condition. Characteristics Autism is a highly variable neurodevelopmental disorder whose symptoms first appear during infancy or childhood, and generally follows a steady course without remission. Autistic people may be severely impaired in some respects but average, or even superior, in others. Overt symptoms gradually begin after the age of six months, become established by age two or three years and tend to continue through adulthood, although often in more muted form. It is distinguished by a characteristic triad of symptoms: impairments in social interaction, impairments in communication, and repetitive behavior. Other aspects, such as atypical eating, are also common but are not essential for diagnosis. Individual symptoms of autism occur in the general population and appear not to associate highly, without a sharp line separating pathologically severe from common traits. Social development Social deficits distinguish autism and the related autism spectrum disorders (ASD; see Classification) from other developmental disorders. Autistic people have social impairments and often lack the intuition about others that many people take for granted. Noted autistic Temple Grandin described her inability to understand the social communication of neurotypicals, or people with typical neural development, as leaving her feeling "like an anthropologist on Mars". Unusual social development becomes apparent early in childhood. Autistic infants show less attention to social stimuli, smile and look at others less often, and respond less to their own name. 
Autistic toddlers differ more strikingly from social norms; for example, they have less eye contact and turn-taking, and do not have the ability to use simple movements to express themselves, such as pointing at things. Three- to five-year-old autistic children are less likely to exhibit social understanding, approach others spontaneously, imitate and respond to emotions, communicate nonverbally, and take turns with others. However, they do form attachments to their primary caregivers. Most autistic children display moderately less attachment security than neurotypical children, although this difference disappears in children with higher mental development or less pronounced autistic traits. Older children and adults with ASD perform worse on tests of face and emotion recognition although this may be partly due to a lower ability to define a person's own emotions.Children with high-functioning autism have more intense and frequent loneliness compared to non-autistic peers, despite the common belief that autistic children prefer to be alone. Making and maintaining friendships often proves to be difficult for autistic people. For them, the quality of friendships, not the number of friends, predicts how lonely they feel. Functional friendships, such as those resulting in invitations to parties, may affect the quality of life more deeply.There are many anecdotal reports, but few systematic studies, of aggression and violence in individuals with ASD. The limited data suggest that, in children with intellectual disability, autism is associated with aggression, destruction of property, and meltdowns.CommunicationAbout one third to half of autistic people do not develop enough natural speech to meet their daily communication needs. Differences in communication may be present from the first year of life, and may include delayed onset of babbling, unusual gestures, diminished responsiveness, and vocal patterns that are not synchronized with the caregiver. 
In the second and third years, autistic children have less frequent and less diverse babbling, consonants, words, and word combinations; their gestures are less often integrated with words. Autistic children are less likely to make requests or share experiences, and are more likely to simply repeat others' words (echolalia) or reverse pronouns. Joint attention seems to be necessary for functional speech, and deficits in joint attention seem to distinguish infants with ASD. For example, they may look at a pointing hand instead of the object to which the hand is pointing, and they consistently fail to point at objects in order to comment on or share an experience. Autistic children may have difficulty with imaginative play and with developing symbols into language.In a pair of studies, high-functioning autistic children aged 8–15 performed equally well as, and as adults better than, individually matched controls at basic language tasks involving vocabulary and spelling. Both autistic groups performed worse than controls at complex language tasks such as figurative language, comprehension, and inference. As people are often sized up initially from their basic language skills, these studies suggest that people speaking to autistic individuals are more likely to overestimate what their audience comprehends.Repetitive behaviorAutistic individuals can display many forms of repetitive or restricted behavior, which the Repetitive Behavior Scale-Revised (RBS-R) categorizes as follows. Stereotyped behaviors: Repetitive movements, such as hand flapping, head rolling, or body rocking. Compulsive behaviors: Time-consuming behaviors intended to reduce the anxiety that an individual feels compelled to perform repeatedly or according to rigid rules, such as placing objects in a specific order, checking things, or handwashing. Sameness: Resistance to change; for example, insisting that the furniture not be moved or refusing to be interrupted. 
Ritualistic behavior: Unvarying pattern of daily activities, such as an unchanging menu or a dressing ritual. This is closely associated with sameness and an independent validation has suggested combining the two factors. Restricted interests: Interests or fixations that are abnormal in theme or intensity of focus, such as preoccupation with a single television program, toy, or game. Self-injury: Behaviors such as eye-poking, skin-picking, hand-biting and head-banging.No single repetitive or self-injurious behavior seems to be specific to autism, but autism appears to have an elevated pattern of occurrence and severity of these behaviors.Other symptomsAutistic individuals may have symptoms that are independent of the diagnosis, but that can affect the individual or the family.An estimated 0.5% to 10% of individuals with ASD show unusual abilities, ranging from splinter skills such as the memorization of trivia to the extraordinarily rare talents of prodigious autistic savants. Many individuals with ASD show superior skills in perception and attention, relative to the general population. Sensory abnormalities are found in over 90% of autistic people, and are considered core features by some, although there is no good evidence that sensory symptoms differentiate autism from other developmental disorders. Differences are greater for under-responsivity (for example, walking into things) than for over-responsivity (for example, distress from loud noises) or for sensation seeking (for example, rhythmic movements). An estimated 60–80% of autistic people have motor signs that include poor muscle tone, poor motor planning, and toe walking;  deficits in motor coordination are pervasive across ASD and are greater in autism proper. Unusual eating behavior occurs in about three-quarters of children with ASD, to the extent that it was formerly a diagnostic indicator. 
Selectivity is the most common problem, although eating rituals and food refusal also occur.There is tentative evidence that gender dysphoria occurs more frequently in autistic people (see Autism and LGBT identities). As well as that, a 2021 anonymized online survey of 16-90 year-olds revealed that autistic males are more likely to be bisexual, while autistic females are more likely to be homosexual.Gastrointestinal problems are one of the most commonly co-occurring medical conditions in autistic people. These are linked to greater social impairment, irritability, behavior and sleep problems, language impairments and mood changes.Parents of children with ASD have higher levels of stress. Siblings of children with ASD report greater admiration of and less conflict with the affected sibling than siblings of unaffected children and were similar to siblings of children with Down syndrome in these aspects of the sibling relationship. However, they reported lower levels of closeness and intimacy than siblings of children with Down syndrome; siblings of individuals with ASD have greater risk of negative well-being and poorer sibling relationships as adults.CausesIt has long been presumed that there is a common cause at the genetic, cognitive, and neural levels for autism's characteristic triad of symptoms. However, there is increasing suspicion that autism is instead a complex disorder whose core aspects have distinct causes that often co-occur.Autism has a strong genetic basis, although the genetics of autism are complex and it is unclear whether ASD is explained more by rare mutations with major effects, or by rare multigene interactions of common genetic variants. Complexity arises due to interactions among multiple genes, the environment, and epigenetic factors which do not change DNA sequencing but are heritable and influence gene expression. Many genes have been associated with autism through sequencing the genomes of affected individuals and their parents. 
Studies of twins suggest that heritability is 0.7 for autism and as high as 0.9 for ASD, and siblings of those with autism are about 25 times more likely to be autistic than the general population. However, most of the mutations that increase autism risk have not been identified. Typically, autism cannot be traced to a Mendelian (single-gene) mutation or to a single chromosome abnormality, and none of the genetic syndromes associated with ASDs have been shown to selectively cause ASD. Numerous candidate genes have been located, with only small effects attributable to any particular gene. Most loci individually explain less than 1% of cases of autism. The large number of autistic individuals with unaffected family members may result from spontaneous structural variation—such as deletions, duplications or inversions in genetic material during meiosis. Hence, a substantial fraction of autism cases may be traceable to genetic causes that are highly heritable but not inherited: that is, the mutation that causes the autism is not present in the parental genome. Autism may be underdiagnosed in women and girls due to an assumption that it is primarily a male condition, but genetic phenomena such as imprinting and X linkage have the ability to raise the frequency and severity of conditions in males, and theories have been put forward for a genetic reason why males are diagnosed more often, such as the imprinted brain hypothesis and the extreme male brain theory. Maternal nutrition and inflammation during preconception and pregnancy influence fetal neurodevelopment. Intrauterine growth restriction is associated with ASD, in both term and preterm infants. Maternal inflammatory and autoimmune diseases may damage fetal tissues, aggravating a genetic problem or damaging the nervous system. Exposure to air pollution during pregnancy, especially heavy metals and particulates, may increase the risk of autism. 
Environmental factors that have been claimed without evidence to contribute to or exacerbate autism include certain foods, infectious diseases, solvents, PCBs, phthalates and phenols used in plastic products, pesticides, brominated flame retardants, alcohol, smoking, illicit drugs, vaccines, and prenatal stress. Some, such as the MMR vaccine, have been completely disproven.Parents may first become aware of autistic symptoms in their child around the time of a routine vaccination. This has led to unsupported theories blaming vaccine "overload", a vaccine preservative, or the MMR vaccine for causing autism. The latter theory was supported by a litigation-funded study that has since been shown to have been "an elaborate fraud". Although these theories lack convincing scientific evidence and are biologically implausible, parental concern about a potential vaccine link with autism has led to lower rates of childhood immunizations, outbreaks of previously controlled childhood diseases in some countries, and the preventable deaths of several children.MechanismAutism's symptoms result from maturation-related changes in various systems of the brain. How autism occurs is not well understood. Its mechanism can be divided into two areas: the pathophysiology of brain structures and processes associated with autism, and the neuropsychological linkages between brain structures and behaviors. The behaviors appear to have multiple pathophysiologies.There is evidence that gut–brain axis abnormalities may be involved. A 2015 review proposed that immune dysregulation, gastrointestinal inflammation, malfunction of the autonomic nervous system, gut flora alterations, and food metabolites may cause brain neuroinflammation and dysfunction. A 2016 review concludes that enteric nervous system abnormalities might play a role in neurological disorders such as autism. 
Neural connections and the immune system are a pathway that may allow diseases originated in the intestine to spread to the brain.Several lines of evidence point to synaptic dysfunction as a cause of autism. Some rare mutations may lead to autism by disrupting some synaptic pathways, such as those involved with cell adhesion. Gene replacement studies in mice suggest that autistic symptoms are closely related to later developmental steps that depend on activity in synapses and on activity-dependent changes. All known teratogens (agents that cause birth defects) related to the risk of autism appear to act during the first eight weeks from conception, and though this does not exclude the possibility that autism can be initiated or affected later, there is strong evidence that autism arises very early in development.DiagnosisDiagnosis is based on behavior, not cause or mechanism. Under the DSM-5, autism is characterized by persistent deficits in social communication and interaction across multiple contexts, as well as restricted, repetitive patterns of behavior, interests, or activities. These deficits are present in early childhood, typically before age three, and lead to clinically significant functional impairment. Sample symptoms include lack of social or emotional reciprocity, stereotyped and repetitive use of language or idiosyncratic language, and persistent preoccupation with unusual objects. The disturbance must not be better accounted for by Rett syndrome, intellectual disability or global developmental delay. ICD-10 uses essentially the same definition.Several diagnostic instruments are available. Two are commonly used in autism research: the Autism Diagnostic Interview-Revised (ADI-R) is a semistructured parent interview, and the Autism Diagnostic Observation Schedule (ADOS) uses observation and interaction with the child. 
The Childhood Autism Rating Scale (CARS) is used widely in clinical environments to assess severity of autism based on observation of children. The Diagnostic interview for social and communication disorders (DISCO) may also be used.A pediatrician commonly performs a preliminary investigation by taking developmental history and physically examining the child. If warranted, diagnosis and evaluations are conducted with help from ASD specialists, observing and assessing cognitive, communication, family, and other factors using standardized tools, and taking into account any associated medical conditions. A pediatric neuropsychologist is often asked to assess behavior and cognitive skills, both to aid diagnosis and to help recommend educational interventions. A differential diagnosis for ASD at this stage might also consider intellectual disability, hearing impairment, and a specific language impairment such as Landau–Kleffner syndrome. The presence of autism can make it harder to diagnose coexisting psychiatric disorders such as depression.Clinical genetics evaluations are often done once ASD is diagnosed, particularly when other symptoms already suggest a genetic cause. Although genetic technology allows clinical geneticists to link an estimated 40% of cases to genetic causes, consensus guidelines in the US and UK are limited to high-resolution chromosome and fragile X testing. A genotype-first model of diagnosis has been proposed, which would routinely assess the genome's copy number variations. As new genetic tests are developed several ethical, legal, and social issues will emerge. Commercial availability of tests may precede adequate understanding of how to use test results, given the complexity of autism's genetics. 
Metabolic and neuroimaging tests are sometimes helpful, but are not routine.ASD can sometimes be diagnosed by age 14 months, although diagnosis becomes increasingly stable over the first three years of life: for example, a one-year-old who meets diagnostic criteria for ASD is less likely than a three-year-old to continue to do so a few years later. In the UK the National Autism Plan for Children recommends at most 30 weeks from first concern to completed diagnosis and assessment, though few cases are handled that quickly in practice. Although the symptoms of autism and ASD begin early in childhood, they are sometimes missed; years later, adults may seek diagnoses to help them or their friends and family understand themselves, to help their employers make adjustments, or in some locations to claim disability living allowances or other benefits.Signs of autism may be more challenging for clinicians to detect in females. Autistic females have been shown to engage in masking more frequently than autistic males. Masking may include making oneself perform normative facial expressions and eye contact. A notable percentage of autistic females may be misdiagnosed, diagnosed after a considerable delay, or not diagnosed at all.Conversely, the cost of screening and diagnosis and the challenge of obtaining payment can inhibit or delay diagnosis. It is particularly hard to diagnose autism among the visually impaired, partly because some of its diagnostic criteria depend on vision, and partly because autistic symptoms overlap with those of common blindness syndromes or blindisms.ClassificationAutism is one of the five pervasive developmental disorders (PDD), which are characterized by widespread abnormalities of social interactions and communication, severely restricted interests, and highly repetitive behavior. 
These symptoms do not imply sickness, fragility, or emotional disturbance.Of the five PDD forms, Asperger syndrome is closest to autism in signs and likely causes; Rett syndrome and childhood disintegrative disorder share several signs with autism, but may have unrelated causes; PDD not otherwise specified (PDD-NOS; also called atypical autism) is diagnosed when the criteria are not met for a more specific disorder. Unlike with autism, people with Asperger syndrome have no substantial delay in language development. The terminology of autism can be bewildering, with autism, Asperger syndrome and PDD-NOS often called the autism spectrum disorders (ASD) or sometimes the autistic disorders, whereas autism itself is often called autistic disorder, childhood autism, or infantile autism. In this article, autism refers to the classic autistic disorder; in clinical practice, though, autism, ASD, and PDD are often used interchangeably. ASD, in turn, is a subset of the broader autism phenotype, which describes individuals who may not have ASD but do have autistic-like traits, such as avoiding eye contact.Research into causes has been hampered by the inability to identify biologically meaningful subgroups within the autistic population and by the traditional boundaries between the disciplines of psychiatry, psychology, neurology and pediatrics. Newer technologies such as fMRI and diffusion tensor imaging can help identify biologically relevant phenotypes (observable traits) that can be viewed on brain scans, to help further neurogenetic studies of autism; one example is lowered activity in the fusiform face area of the brain, which is associated with impaired perception of people versus objects. It has been proposed to classify autism using genetics as well as behavior. 
(For more, see Brett Abrahams, geneticist and neuroscientist)Spectrum Autism has long been thought to cover a wide spectrum, ranging from individuals with severe impairments—who may be silent, developmentally disabled, and prone to frequent repetitive behavior such as hand flapping and rocking—to high functioning individuals who may have active but distinctly odd social approaches, narrowly focused interests, and verbose, pedantic communication. Because the behavior spectrum is continuous, boundaries between diagnostic categories are necessarily somewhat arbitrary.ScreeningAbout half of parents of children with ASD notice their child's unusual behaviors by age 18 months, and about four-fifths notice by age 24 months. According to an article, failure to meet any of the following milestones "is an absolute indication to proceed with further evaluations. Delay in referral for such testing may delay early diagnosis and treatment and affect the long-term outcome". No response to name (or eye-to-eye gaze) by 6 months.  No babbling by 12 months. No gesturing (pointing, waving, etc.) by 12 months. No single words by 16 months. No two-word (spontaneous, not just echolalic) phrases by 24 months. Loss of any language or social skills, at any age.The United States Preventive Services Task Force in 2016 found it was unclear if screening was beneficial or harmful among children in whom there is no concern. The Japanese practice is to screen all children for ASD at 18 and 24 months, using autism-specific formal screening tests. In contrast, in the UK, children whose families or doctors recognize possible signs of autism are screened. It is not known which approach is more effective. 
Screening tools include the Modified Checklist for Autism in Toddlers (M-CHAT), the Early Screening of Autistic Traits Questionnaire, and the First Year Inventory; initial data on M-CHAT and its predecessor, the Checklist for Autism in Toddlers (CHAT), on children aged 18–30 months suggests that it is best used in a clinical setting and that it has low sensitivity (many false-negatives) but good specificity (few false-positives). It may be more accurate to precede these tests with a broadband screener that does not distinguish ASD from other developmental disorders. Screening tools designed for one culture's norms for behaviors like eye contact may be inappropriate for a different culture. Although genetic screening for autism is generally still impractical, it can be considered in some cases, such as children with neurological symptoms and dysmorphic features.Some authors suggest that automatic motor assessment could be useful to screen children with ASD, for instance with behavioural motor and emotional reactions during smartphone watching.PreventionWhile infection with rubella during pregnancy causes fewer than 1% of cases of autism, vaccination against rubella can prevent many of those cases.ManagementThe main goals when treating autistic children are to lessen associated deficits and family distress, and to increase quality of life and functional independence. In general, higher IQs are correlated with greater responsiveness to treatment and improved treatment outcomes. No single treatment is best and treatment is typically tailored to the child's needs. Families and the educational system are the main resources for treatment. Services should be carried out by behavior analysts, special education teachers, speech pathologists, and licensed psychologists. Studies of interventions have methodological problems that prevent definitive conclusions about efficacy. However, the development of evidence-based interventions has advanced in recent years. 
Although many psychosocial interventions have some positive evidence, suggesting that some form of treatment is preferable to no treatment, the methodological quality of systematic reviews of these studies has generally been poor, their clinical results are mostly tentative, and there is little evidence for the relative effectiveness of treatment options. Intensive, sustained special education programs and behavior therapy early in life can help children acquire self-care, communication, and job skills, and often improve functioning and decrease symptom severity and maladaptive behaviors; claims that intervention by around age three years is crucial are not substantiated. While medications have not been found to help with core symptoms, they may be used for associated symptoms, such as irritability, inattention, or repetitive behavior patterns.EducationEducational interventions often used include applied behavior analysis (ABA), developmental models, structured teaching, speech and language therapy, social skills therapy, and occupational therapy and cognitive behavioral interventions in adults without intellectual disability to reduce depression, anxiety, and obsessive-compulsive disorder. Among these approaches, interventions either treat autistic features comprehensively, or focalize treatment on a specific area of deficit. The quality of research for early intensive behavioral intervention (EIBI)—a treatment procedure incorporating over thirty hours per week of the structured type of ABA that is carried out with very young children—is currently low, and more vigorous research designs with larger sample sizes are needed. Two theoretical frameworks outlined for early childhood intervention include structured and naturalistic ABA interventions, and developmental social pragmatic models (DSP). 
One interventional strategy utilizes a parent training model, which teaches parents how to implement various ABA and DSP techniques, allowing for parents to disseminate interventions themselves. Various DSP programs have been developed to explicitly deliver intervention systems through at-home parent implementation. Despite the recent development of parent training models, these interventions have demonstrated effectiveness in numerous studies, being evaluated as a probable efficacious mode of treatment.Early, intensive ABA therapy has demonstrated effectiveness in enhancing communication and adaptive functioning in preschool children; it is also well-established for improving the intellectual performance of that age group. Similarly, a teacher-implemented intervention that utilizes a more naturalistic form of ABA combined with a developmental social pragmatic approach has been found to be beneficial in improving social-communication skills in young children, although there is less evidence in its treatment of global symptoms. Neuropsychological reports are often poorly communicated to educators, resulting in a gap between what a report recommends and what education is provided. It is not known whether treatment programs for children lead to significant improvements after the children grow up, and the limited research on the effectiveness of adult residential programs shows mixed results. The appropriateness of including children with varying severity of autism spectrum disorders in the general education population is a subject of current debate among educators and researchers.MedicationMedications may be used to treat ASD symptoms that interfere with integrating a child into home or school when behavioral treatment fails. They may also be used for associated health problems, such as ADHD or anxiety. 
More than half of US children diagnosed with ASD are prescribed psychoactive drugs or anticonvulsants, with the most common drug classes being antidepressants, stimulants, and antipsychotics. The atypical antipsychotic drugs risperidone and aripiprazole are FDA-approved for treating associated aggressive and self-injurious behaviors. However, their side effects must be weighed against their potential benefits, and autistic people may respond atypically. Side effects, for example, may include weight gain, tiredness, drooling, and aggression. SSRI antidepressants, such as fluoxetine and fluvoxamine, have been shown to be effective in reducing repetitive and ritualistic behaviors, while the stimulant medication methylphenidate is beneficial for some children with co-morbid inattentiveness or hyperactivity. There is scant reliable research about the effectiveness or safety of drug treatments for adolescents and adults with ASD. No known medication relieves autism's core symptoms of social and communication impairments. Experiments in mice have reversed or reduced some symptoms related to autism by replacing or modulating gene function, suggesting the possibility of targeting therapies to specific rare mutations known to cause autism.Alternative medicineAlthough many alternative therapies and interventions are available, few are supported by scientific studies. Treatment approaches have little empirical support in quality-of-life contexts, and many programs focus on success measures that lack predictive validity and real-world relevance. Some alternative treatments may place the child at risk. The preference that autistic children have for unconventional foods can lead to reduction in bone cortical thickness with this being greater in those on casein-free diets, as a consequence of the low intake of calcium and vitamin D; however, suboptimal bone development in ASD has also been associated with lack of exercise and gastrointestinal disorders. 
In 2005, botched chelation therapy killed a five-year-old child with autism. Chelation is not recommended for autistic people since the associated risks outweigh any potential benefits. Another alternative medicine practice with no evidence is CEASE therapy, a mixture of homeopathy, supplements, and 'vaccine detoxing'.Although popularly used as an alternative treatment for autistic people, as of 2018 there is no good evidence to recommend a gluten- and casein-free diet as a standard treatment. A 2018 review concluded that it may be a therapeutic option for specific groups of children with autism, such as those with known food intolerances or allergies, or with food intolerance markers. The authors analyzed the prospective trials conducted to date that studied the efficacy of the gluten- and casein-free diet in children with ASD (4 in total). All of them compared gluten- and casein-free diet versus normal diet with a control group (2 double-blind randomized controlled trials, 1 double-blind crossover trial, 1 single-blind trial). In two of the studies, whose duration was 12 and 24 months, a significant improvement in ASD symptoms (efficacy rate 50%) was identified. In the other two studies, whose duration was 3 months, no significant effect was observed. The authors concluded that a longer duration of the diet may be necessary to achieve the improvement of the ASD symptoms. Other problems documented in the trials carried out include transgressions of the diet, small sample size, the heterogeneity of the participants and the possibility of a placebo effect. In the subset of people who have gluten sensitivity there is limited evidence that suggests that a gluten-free diet may improve some autistic behaviors.Results of a systematic review on interventions to address health outcomes among autistic adults found emerging evidence to support mindfulness-based interventions for improving mental health. 
This includes decreasing stress, anxiety, ruminating thoughts, anger, and aggression. There is tentative evidence that music therapy may improve social interactions, verbal communication, and non-verbal communication skills. There has been early research looking at hyperbaric treatments in children with autism. Studies on pet therapy have shown positive effects.PrognosisThere is no known cure for autism. The degree of symptoms can decrease, occasionally to the extent that people lose their diagnosis of ASD; this occurs sometimes after intensive treatment and sometimes not. It is not known how often this outcome happens; reported rates in unselected samples have ranged from 3% to 25%. Most autistic children acquire language by age five or younger, though a few have developed communication skills in later years. Many autistic children lack social support, future employment opportunities or self-determination. Although core difficulties tend to persist, symptoms often become less severe with age.Few high-quality studies address long-term prognosis. Some adults show modest improvement in communication skills, but a few decline; no study has focused on autism after midlife. Acquiring language before age six, having an IQ above 50, and having a marketable skill all predict better outcomes; independent living is unlikely with severe autism.Many autistic people face significant obstacles in transitioning to adulthood. Compared to the general population autistic people are more likely to be unemployed and to have never had a job. About half of people in their 20s with autism are not employed.Autistic people tend to face increased stress levels related to psychosocial factors, such as stigma, which may increase the rates of mental health issues in the autistic population.EpidemiologyAs of 2007, reviews estimate a prevalence of 1–2 per 1,000 for autism and close to 6 per 1,000 for ASD. A 2016 survey in the United States reported a rate of 25 per 1,000 children for ASD. 
Globally, autism affects an estimated 24.8 million people, while Asperger syndrome affects a further 37.2 million. In 2012, the NHS estimated that the overall prevalence of autism among adults aged 18 years and over in the UK was 1.1%. Rates of PDD-NOS have been estimated at 3.7 per 1,000, Asperger syndrome at roughly 0.6 per 1,000, and childhood disintegrative disorder at 0.02 per 1,000. CDC estimates about 1 out of 59 (1.7%) for 2014, an increase from 1 out of every 68 children (1.5%) for 2010.In the UK, from 1998 to 2018, autism diagnoses increased by 787%. This increase is largely attributable to changes in diagnostic practices, referral patterns, availability of services, age at diagnosis, and public awareness (particularly among women), though unidentified environmental risk factors cannot be ruled out. The available evidence does not rule out the possibility that autism's true prevalence has increased; a real increase would suggest directing more attention and funding toward psychosocial factors and changing environmental factors instead of continuing to focus on genetics. It has been established that vaccination is not a risk factor for autism and is not behind any increase in autism prevalence rates, if any change in the rate of autism exists at all.Males are at higher risk for ASD than females. The sex ratio averages 4.3:1 and is greatly modified by cognitive impairment: it may be close to 2:1 with intellectual disability and more than 5.5:1 without. Several theories about the higher prevalence in males have been investigated, but the cause of the difference is unconfirmed; one theory is that females are underdiagnosed.Although the evidence does not implicate any single pregnancy-related risk factor as a cause of autism, the risk of autism is associated with advanced age in either parent, and with diabetes, bleeding, and use of psychiatric drugs in the mother during pregnancy. 
The risk is greater with older fathers than with older mothers; two potential explanations are the known increase in mutation burden in older sperm, and the hypothesis that men marry later if they carry genetic liability and show some signs of autism. Most professionals believe that race, ethnicity, and socioeconomic background do not affect the occurrence of autism.Several other conditions are common in children with autism. They include: Genetic disorders. About 10–15% of autism cases have an identifiable Mendelian (single-gene) condition, chromosome abnormality, or other genetic syndrome, and ASD is associated with several genetic disorders. Intellectual disability. The percentage of autistic individuals who also meet criteria for intellectual disability has been reported as anywhere from 25% to 70%, a wide variation illustrating the difficulty of assessing intelligence of individuals on the autism spectrum. In comparison, for PDD-NOS the association with intellectual disability is much weaker, and by definition, the diagnosis of Asperger's excludes intellectual disability. Anxiety disorders are common among children with ASD; there are no firm data, but studies have reported prevalences ranging from 11% to 84%. Many anxiety disorders have symptoms that are better explained by ASD itself, or are hard to distinguish from ASD's symptoms. Epilepsy, with variations in risk of epilepsy due to age, cognitive level, and type of language disorder. Several metabolic defects, such as phenylketonuria, are associated with autistic symptoms. Minor physical anomalies are significantly increased in the autistic population. Preempted diagnoses. Although the DSM-IV rules out the concurrent diagnosis of many other conditions along with autism, the full criteria for Attention deficit hyperactivity disorder (ADHD), Tourette syndrome, and other of these conditions are often present and these co-occurrent conditions are increasingly accepted. 
Sleep problems affect about two-thirds of individuals with ASD at some point in childhood. These most commonly include symptoms of insomnia such as difficulty in falling asleep, frequent nocturnal awakenings, and early morning awakenings. Sleep problems are associated with difficult behaviors and family stress, and are often a focus of clinical attention over and above the primary ASD diagnosis.HistoryA few examples of autistic symptoms and treatments were described long before autism was named. The Table Talk of Martin Luther, compiled by his notetaker, Mathesius, contains the story of a 12-year-old boy who may have been severely autistic. The earliest well-documented case of autism is that of Hugh Blair of Borgue, as detailed in a 1747 court case in which his brother successfully petitioned to annul Blair's marriage to gain Blair's inheritance. The Wild Boy of Aveyron, a feral child caught in 1798, showed several signs of autism; the medical student Jean Itard treated him with a behavioral program designed to help him form social attachments and to induce speech via imitation.The New Latin word autismus (English translation autism) was coined by the Swiss psychiatrist Eugen Bleuler in 1910 as he was defining symptoms of schizophrenia. He derived it from the Greek word autós (αὐτός, meaning "self"), and used it to mean morbid self-admiration, referring to "autistic withdrawal of the patient to his fantasies, against which any influence from outside becomes an intolerable disturbance". A Soviet child psychiatrist, Grunya Sukhareva, described a similar syndrome that was published in Russian in 1925, and in German in 1926.Clinical development and diagnoses The word autism first took its modern sense in 1938 when Hans Asperger of the Vienna University Hospital adopted Bleuler's terminology autistic psychopaths in a lecture in German about child psychology. 
Asperger was investigating an ASD now known as Asperger syndrome, though for various reasons it was not widely recognized as a separate diagnosis until 1981. Leo Kanner of the Johns Hopkins Hospital first used autism in its modern sense in English when he introduced the label early infantile autism in a 1943 report of 11 children with striking behavioral similarities. Almost all the characteristics described in Kanner's first paper on the subject, notably "autistic aloneness" and "insistence on sameness", are still regarded as typical of the autistic spectrum of disorders. It is not known whether Kanner derived the term independently of Asperger.Kanner's reuse of autism led to decades of confused terminology like infantile schizophrenia, and child psychiatry's focus on maternal deprivation led to misconceptions of autism as an infant's response to "refrigerator mothers". Starting in the late 1960s autism was established as a separate syndrome.Terminology and distinction from schizophrenia As late as the mid-1970s there was little evidence of a genetic role in autism, while in 2007 it was believed to be one of the most heritable psychiatric conditions. Although the rise of parent organizations and the destigmatization of childhood ASD have affected how ASD is viewed, parents continue to feel social stigma in situations where their child's autistic behavior is perceived negatively, and many primary care physicians and medical specialists express some beliefs consistent with outdated autism research.It took until 1980 for the DSM-III to differentiate autism from childhood schizophrenia. In 1987, the DSM-III-R provided a checklist for diagnosing autism. In May 2013, the DSM-5 was released, updating the classification for pervasive developmental disorders. The grouping of disorders, including PDD-NOS, autism, Asperger syndrome, Rett syndrome, and CDD, has been removed and replaced with the general term of Autism Spectrum Disorders. 
The two categories that exist are impaired social communication and/or interaction, and restricted and/or repetitive behaviors.The Internet has helped autistic individuals bypass nonverbal cues and emotional sharing that they find difficult to deal with, and has given them a way to form online communities and work remotely. Societal and cultural aspects of autism have developed: some in the community seek a cure, while others believe that autism is simply another way of being.Society and cultureAn autistic culture has emerged, accompanied by the autistic rights and neurodiversity movements. Events include World Autism Awareness Day, Autism Sunday, Autistic Pride Day, Autreat, and others. Social-science scholars study those with autism in hopes to learn more about "autism as a culture, transcultural comparisons ... and research on social movements." Many autistic individuals have been successful in their fields.Autism rights movement The autism rights movement is a social movement within the context of disability rights that emphasizes the concept of neurodiversity, viewing the autism spectrum as a result of natural variations in the human brain rather than a disorder to be cured. The autism rights movement advocates for including greater acceptance of autistic behaviors; therapies that focus on coping skills rather than on imitating the behaviors of those without autism, and the recognition of the autistic community as a minority group. Autism rights or neurodiversity advocates believe that the autism spectrum is genetic and should be accepted as a natural expression of the human genome. This perspective is distinct from fringe theories that autism is caused by environmental factors such as vaccines. 
A common criticism against autistic activists is that the majority of them are "high-functioning" or have Asperger syndrome and do not represent the views of "low-functioning" autistic people.EmploymentAbout half of autistic people are unemployed, and one third of those with graduate degrees may be unemployed. Among those who find work, most are employed in sheltered settings working for wages below the national minimum. While employers state hiring concerns about productivity and supervision, experienced employers of autistic people give positive reports of above average memory and detail orientation as well as a high regard for rules and procedure in autistic employees. A majority of the economic burden of autism is caused by decreased earnings in the job market. Some studies also find decreased earning among parents who care for autistic children.ReferencesExternal links 1910s neologismsArticles containing video clipsCommunication disordersNeurological disorders in childrenPervasive developmental disordersWikipedia medicine articles ready to translate
+Albedo (; ) is the measure of the diffuse reflection of solar radiation out of the total solar radiation and measured on a scale from 0, corresponding to a black body that absorbs all incident radiation, to 1, corresponding to a body that reflects all incident radiation.Surface albedo is defined as the ratio of radiosity Je to the irradiance Ee (flux per unit area) received by a surface. The proportion reflected is not only determined by properties of the surface itself, but also by the spectral and angular distribution of solar radiation reaching the Earth's surface. These factors vary with atmospheric composition, geographic location, and time (see position of the Sun). While bi-hemispherical reflectance is calculated for a single angle of incidence (i.e., for a given position of the Sun), albedo is the directional integration of reflectance over all solar angles in a given period. The temporal resolution may range from seconds (as obtained from flux measurements) to daily, monthly, or annual averages.Unless given for a specific wavelength (spectral albedo), albedo refers to the entire spectrum of solar radiation. Due to measurement constraints, it is often given for the spectrum in which most solar energy reaches the surface (between 0.3 and 3 μm). This spectrum includes visible light (0.4–0.7 μm), which explains why surfaces with a low albedo appear dark (e.g., trees absorb most radiation), whereas surfaces with a high albedo appear bright (e.g., snow reflects most radiation).Albedo is an important concept in climatology, astronomy, and environmental management (e.g., as part of the Leadership in Energy and Environmental Design (LEED) program for sustainable rating of buildings). 
The average albedo of the Earth from the upper atmosphere, its planetary albedo, is 30–35% because of cloud cover, but widely varies locally across the surface because of different geological and environmental features.The term albedo was introduced into optics by Johann Heinrich Lambert in his 1760 work Photometria.Terrestrial albedoAny albedo in visible light falls within a range of about 0.9 for fresh snow to about 0.04 for charcoal, one of the darkest substances. Deeply shadowed cavities can achieve an effective albedo approaching the zero of a black body. When seen from a distance, the ocean surface has a low albedo, as do most forests, whereas desert areas have some of the highest albedos among landforms. Most land areas are in an albedo range of 0.1 to 0.4. The average albedo of Earth is about 0.3. This is far higher than for the ocean primarily because of the contribution of clouds.Earth's surface albedo is regularly estimated via Earth observation satellite sensors such as NASA's MODIS instruments on board the Terra and Aqua satellites, and the CERES instrument on the Suomi NPP and JPSS. As the amount of reflected radiation is only measured for a single direction by satellite, not all directions, a mathematical model is used to translate a sample set of satellite reflectance measurements into estimates of directional-hemispherical reflectance and bi-hemispherical reflectance. These calculations are based on the bidirectional reflectance distribution function (BRDF), which describes how the reflectance of a given surface depends on the view angle of the observer and the solar angle. BRDF can facilitate translations of observations of reflectance into albedo.Earth's average surface temperature due to its albedo and the greenhouse effect is currently about 15 °C. If Earth were frozen entirely (and hence be more reflective), the average temperature of the planet would drop below −40 °C. 
If only the continental land masses became covered by glaciers, the mean temperature of the planet would drop to about 0 °C. In contrast, if the entire Earth was covered by water – a so-called ocean planet – the average temperature on the planet would rise to almost 27 °C.In 2021, scientists reported that Earth dimmed by ~0.5% over two decades (1998–2017) as measured by earthshine using modern photometric techniques. This may have both been co-caused by climate change as well as a substantial increase in global warming. However, the link to climate change has not been explored to date and it is unclear whether or not this represents an ongoing trend.White-sky, black-sky, and blue-sky albedoFor land surfaces, it has been shown that the albedo at a particular solar zenith angle $\theta_i$ can be approximated by the proportionate sum of two terms: the directional-hemispherical reflectance at that solar zenith angle, $\bar{\alpha}(\theta_i)$, sometimes referred to as black-sky albedo, and the bi-hemispherical reflectance, $\bar{\bar{\alpha}}$, sometimes referred to as white-sky albedo. With $1 - D$ being the proportion of direct radiation from a given solar angle, and $D$ being the proportion of diffuse illumination, the actual albedo $\alpha$ (also called blue-sky albedo) can then be given as: $\alpha = (1 - D)\,\bar{\alpha}(\theta_i) + D\,\bar{\bar{\alpha}}$. This formula is important because it allows the albedo to be calculated for any given illumination conditions from a knowledge of the intrinsic properties of the surface.Examples of terrestrial albedo effectsIlluminationAlbedo is not directly dependent on illumination because changing the amount of incoming light proportionally changes the amount of reflected light, except in circumstances where a change in illumination induces a change in the Earth's surface at that location (e.g. through melting of reflective ice). That said, albedo and illumination both vary by latitude. 
Albedo is highest near the poles and lowest in the subtropics, with a local maximum in the tropics.Insolation effectsThe intensity of albedo temperature effects depends on the amount of albedo and the level of local insolation (solar irradiance); high albedo areas in the Arctic and Antarctic regions are cold due to low insolation, whereas areas such as the Sahara Desert, which also have a relatively high albedo, will be hotter due to high insolation. Tropical and sub-tropical rainforest areas have low albedo, and are much hotter than their temperate forest counterparts, which have lower insolation. Because insolation plays such a big role in the heating and cooling effects of albedo, high insolation areas like the tropics will tend to show a more pronounced fluctuation in local temperature when local albedo changes.Arctic regions notably release more heat back into space than what they absorb, effectively cooling the Earth. This has been a concern since arctic ice and snow have been melting at higher rates due to higher temperatures, creating regions in the arctic that are notably darker (being water or ground, which is darker in color) and reflect less heat back into space. This feedback loop results in a reduced albedo effect.Climate and weatherAlbedo affects climate by determining how much radiation a planet absorbs. The uneven heating of Earth from albedo variations between land, ice, or ocean surfaces can drive weather.Albedo–temperature feedbackWhen an area's albedo changes due to snowfall, a snow–temperature feedback results. A layer of snowfall increases local albedo, reflecting away sunlight, leading to local cooling. In principle, if no outside temperature change affects this area (e.g., a warm air mass), the raised albedo and lower temperature would maintain the current snow and invite further snowfall, deepening the snow–temperature feedback. 
However, because local weather is dynamic due to the change of seasons, eventually warm air masses and a more direct angle of sunlight (higher insolation) cause melting. When the melted area reveals surfaces with lower albedo, such as grass, soil, or ocean, the effect is reversed: the darkening surface lowers albedo, increasing local temperatures, which induces more melting and thus reducing the albedo further, resulting in still more heating.SnowSnow albedo is highly variable, ranging from as high as 0.9 for freshly fallen snow, to about 0.4 for melting snow, and as low as 0.2 for dirty snow. Over Antarctica snow albedo averages a little more than 0.8. If a marginally snow-covered area warms, snow tends to melt, lowering the albedo, and hence leading to more snowmelt because more radiation is being absorbed by the snowpack (the ice–albedo positive feedback).Just as fresh snow has a higher albedo than does dirty snow, the albedo of snow-covered sea ice is far higher than that of sea water. Sea water absorbs more solar radiation than would the same surface covered with reflective snow. When sea ice melts, either due to a rise in sea temperature or in response to increased solar radiation from above, the snow-covered surface is reduced, and more surface of sea water is exposed, so the rate of energy absorption increases. The extra absorbed energy heats the sea water, which in turn increases the rate at which sea ice melts. As with the preceding example of snowmelt, the process of melting of sea ice is thus another example of a positive feedback. Both positive feedback loops have long been recognized as important for global warming.Cryoconite, powdery windblown dust containing soot, sometimes reduces albedo on glaciers and ice sheets.The dynamical nature of albedo in response to positive feedback, together with the effects of small errors in the measurement of albedo, can lead to large errors in energy estimates. 
Because of this, in order to reduce the error of energy estimates, it is important to measure the albedo of snow-covered areas through remote sensing techniques rather than applying a single value for albedo over broad regions.Small-scale effectsAlbedo works on a smaller scale, too. In sunlight, dark clothes absorb more heat and light-coloured clothes reflect it better, thus allowing some control over body temperature by exploiting the albedo effect of the colour of external clothing.Solar photovoltaic effects Albedo can affect the electrical energy output of solar photovoltaic devices. For example, the effects of a spectrally responsive albedo are illustrated by the differences between the spectrally weighted albedo of solar photovoltaic technology based on hydrogenated amorphous silicon (a-Si:H) and crystalline silicon (c-Si)-based compared to traditional spectral-integrated albedo predictions. Research showed impacts of over 10%. More recently, the analysis was extended to the effects of spectral bias due to the specular reflectivity of 22 commonly occurring surface materials (both human-made and natural) and analyzes the albedo effects on the performance of seven photovoltaic materials covering three common photovoltaic system topologies: industrial (solar farms), commercial flat rooftops and residential pitched-roof applications.TreesBecause forests generally have a low albedo, (the majority of the ultraviolet and visible spectrum is absorbed through photosynthesis), some scientists have suggested that greater heat absorption by trees could offset some of the carbon benefits of afforestation (or offset the negative climate impacts of deforestation). In the case of evergreen forests with seasonal snow cover albedo reduction may be great enough for deforestation to cause a net cooling effect. Trees also impact climate in extremely complicated ways through evapotranspiration. 
The water vapor causes cooling on the land surface, causes heating where it condenses, acts as a strong greenhouse gas, and can increase albedo when it condenses into clouds. Scientists generally treat evapotranspiration as a net cooling impact, and the net climate impact of albedo and evapotranspiration changes from deforestation depends greatly on local climate.In seasonally snow-covered zones, winter albedos of treeless areas are 10% to 50% higher than nearby forested areas because snow does not cover the trees as readily. Deciduous trees have an albedo value of about 0.15 to 0.18 whereas coniferous trees have a value of about 0.09 to 0.15. Variation in summer albedo across both forest types is associated with maximum rates of photosynthesis because plants with high growth capacity display a greater fraction of their foliage for direct interception of incoming radiation in the upper canopy. The result is that wavelengths of light not used in photosynthesis are more likely to be reflected back to space rather than being absorbed by other surfaces lower in the canopy.Studies by the Hadley Centre have investigated the relative (generally warming) effect of albedo change and (cooling) effect of carbon sequestration on planting forests. They found that new forests in tropical and midlatitude areas tended to cool; new forests in high latitudes (e.g., Siberia) were neutral or perhaps warming.WaterWater reflects light very differently from typical terrestrial materials. The reflectivity of a water surface is calculated using the Fresnel equations.At the scale of the wavelength of light even wavy water is always smooth so the light is reflected in a locally specular manner (not diffusely). The glint of light off water is a commonplace effect of this. 
At small angles of incident light, waviness results in reduced reflectivity because of the steepness of the reflectivity-vs.-incident-angle curve and a locally increased average incident angle.Although the reflectivity of water is very low at low and medium angles of incident light, it becomes very high at high angles of incident light such as those that occur on the illuminated side of Earth near the terminator (early morning, late afternoon, and near the poles). However, as mentioned above, waviness causes an appreciable reduction. Because light specularly reflected from water does not usually reach the viewer, water is usually considered to have a very low albedo in spite of its high reflectivity at high angles of incident light.Note that white caps on waves look white (and have high albedo) because the water is foamed up, so there are many superimposed bubble surfaces which reflect, adding up their reflectivities. Fresh 'black' ice exhibits Fresnel reflection.Snow on top of this sea ice increases the albedo to 0.9.CloudsCloud albedo has substantial influence over atmospheric temperatures. Different types of clouds exhibit different reflectivity, theoretically ranging in albedo from a minimum of near 0 to a maximum approaching 0.8. "On any given day, about half of Earth is covered by clouds, which reflect more sunlight than land and water. Clouds keep Earth cool by reflecting sunlight, but they can also serve as blankets to trap warmth."Albedo and climate in some areas are affected by artificial clouds, such as those created by the contrails of heavy commercial airliner traffic. A study following the burning of the Kuwaiti oil fields during Iraqi occupation showed that temperatures under the burning oil fires were as much as  colder than temperatures several miles away under clear skies.Aerosol effectsAerosols (very fine particles/droplets in the atmosphere) have both direct and indirect effects on Earth's radiative balance. 
The direct (albedo) effect is generally to cool the planet; the indirect effect (the particles act as cloud condensation nuclei and thereby change cloud properties) is less certain. As per Spracklen et al. the effects are: Aerosol direct effect. Aerosols directly scatter and absorb radiation. The scattering of radiation causes atmospheric cooling, whereas absorption can cause atmospheric warming. Aerosol indirect effect. Aerosols modify the properties of clouds through a subset of the aerosol population called cloud condensation nuclei. Increased nuclei concentrations lead to increased cloud droplet number concentrations, which in turn leads to increased cloud albedo, increased light scattering and radiative cooling (first indirect effect), but also leads to reduced precipitation efficiency and increased lifetime of the cloud (second indirect effect).In extremely polluted cities like Delhi, aerosol pollutants influence local weather and induce an urban cool island effect during the day.Black carbonAnother albedo-related effect on the climate is from black carbon particles. The size of this effect is difficult to quantify: the Intergovernmental Panel on Climate Change estimates that the global mean radiative forcing for black carbon aerosols from fossil fuels is +0.2 W m−2, with a range +0.1 to +0.4 W m−2. Black carbon is a bigger cause of the melting of the polar ice cap in the Arctic than carbon dioxide due to its effect on the albedo.Human activitiesHuman activities (e.g., deforestation, farming, and urbanization) change the albedo of various areas around the globe. 
However, quantification of this effect on the global scale is difficult, further study is required to determine anthropogenic effects.Albedo in Astronomy In astronomy, the term albedo can be defined in several different ways, depending upon the application and the wavelength of electromagnetic radiation involved.Optical or Visual AlbedoThe albedos of planets, satellites and minor planets such as asteroids can be used to infer much about their properties. The study of albedos, their dependence on wavelength, lighting angle ("phase angle"), and variation in time composes a major part of the astronomical field of photometry. For small and far objects that cannot be resolved by telescopes, much of what we know comes from the study of their albedos. For example, the absolute albedo can indicate the surface ice content of outer Solar System objects, the variation of albedo with phase angle gives information about regolith properties, whereas unusually high radar albedo is indicative of high metal content in asteroids.Enceladus, a moon of Saturn, has one of the highest known optical albedos of any body in the Solar System, with an albedo of 0.99. Another notable high-albedo body is Eris, with an albedo of 0.96. Many small objects in the outer Solar System and asteroid belt have low albedos down to about 0.05. A typical comet nucleus has an albedo of 0.04. Such a dark surface is thought to be indicative of a primitive and heavily space weathered surface containing some organic compounds.The overall albedo of the Moon is measured to be around 0.14, but it is strongly directional and non-Lambertian, displaying also a strong opposition effect. 
Although such reflectance properties are different from those of any terrestrial terrains, they are typical of the regolith surfaces of airless Solar System bodies.Two common optical albedos that are used in astronomy are the (V-band) geometric albedo (measuring brightness when illumination comes from directly behind the observer) and the Bond albedo (measuring total proportion of electromagnetic energy reflected). Their values can differ significantly, which is a common source of confusion.In detailed studies, the directional reflectance properties of astronomical bodies are often expressed in terms of the five Hapke parameters which semi-empirically describe the variation of albedo with phase angle, including a characterization of the opposition effect of regolith surfaces. One of these five parameters is yet another type of albedo called the single-scattering albedo. It is used to define scattering of electromagnetic waves on small particles. It depends on properties of the material (refractive index), the size of the particle, and the wavelength of the incoming radiation. An important relationship between an object's astronomical (geometric) albedo, absolute magnitude and diameter is given by: $A = \left(\frac{1329 \times 10^{-H/5}}{D}\right)^{2}$, where $A$ is the astronomical albedo, $D$ is the diameter in kilometers, and $H$ is the absolute magnitude.Radar AlbedoIn planetary radar astronomy, a microwave (or radar) pulse is transmitted toward a planetary target (e.g. Moon, asteroid, etc.) and the echo from the target is measured. In most instances, the transmitted pulse is circularly polarized and the received pulse is measured in the same sense of polarization as the transmitted pulse (SC) and the opposite sense (OC). 
The echo power is measured in terms of radar cross-section, , , or  (total power, SC + OC) and is equal to the cross-sectional area of a metallic sphere (perfect reflector) at the same distance as the target that would return the same echo power.Those components of the received echo that return from first-surface reflections (as from a smooth or mirror-like surface) are dominated by the OC component as there is a reversal in polarization upon reflection. If the surface is rough at the wavelength scale or there is significant penetration into the regolith, there will be a significant SC component in the echo caused by multiple scattering.For most objects in the solar system, the OC echo dominates and the most commonly reported radar albedo parameter is the (normalized) OC radar albedo (often shortened to radar albedo):where the denominator is the effective cross-sectional area of the target object with mean radius, . A smooth metallic sphere would have .Radar Albedos of Solar System ObjectsThe values reported for the Moon, Mercury, Mars, Venus, and Comet P/2005 JQ5 are derived from the total (OC+SC) radar albedo reported in those references.Relationship to Surface Bulk DensityIn the event that most of the echo is from first surface reflections ( or so), the OC radar albedo is a first-order approximation of the Fresnel reflection coefficient (aka reflectivity) and can be used to estimate the bulk density of a planetary surface to a depth of a meter or so (a few wavelengths of the radar wavelength which is typically at the decimeter scale) using the following empirical relationships: .See also Cool roof Daisyworld Emissivity Exitance Global dimming Irradiance Kirchhoff's law of thermal radiation Opposition surge Polar see-saw Radar astronomy Solar radiation managementReferencesExternal links Albedo Project Albedo – Encyclopedia of Earth NASA MODIS BRDF/albedo product site Ocean surface albedo look-up-table Surface albedo derived from Meteosat observations A discussion 
of Lunar albedos reflectivity of metals (chart)Land surface effects on climateClimate change feedbacksClimate forcingClimatologyElectromagnetic radiationRadiometryScattering, absorption and radiative transfer (optics)Radiation1760s neologisms
+A, or a, is the first letter and the first vowel of the modern English alphabet and the ISO basic Latin alphabet. Its name in English is a (pronounced ), plural aes. It is similar in shape to the Ancient Greek letter alpha, from which it derives. The uppercase version consists of the two slanting sides of a triangle, crossed in the middle by a horizontal bar. The lowercase version can be written in two forms: the double-storey a and single-storey ɑ. The latter is commonly used in handwriting and fonts based on it, especially fonts intended to be read by children, and is also found in italic type.In the English grammar, "a", and its variant "an", are indefinite articles.HistoryThe earliest certain ancestor of "A" is aleph (also written 'aleph), the first letter of the Phoenician alphabet, which consisted entirely of consonants (for that reason, it is also called an abjad to distinguish it from a true alphabet). In turn, the ancestor of aleph may have been a pictogram of an ox head in proto-Sinaitic script influenced by Egyptian hieroglyphs, styled as a triangular head with two horns extended.When the ancient Greeks adopted the alphabet, they had no use for a letter to represent the glottal stop—the consonant sound that the letter denoted in Phoenician and other Semitic languages, and that was the first phoneme of the Phoenician pronunciation of the letter—so they used their version of the sign to represent the vowel , and called it by the similar name of alpha. In the earliest Greek inscriptions after the Greek Dark Ages, dating to the 8th century BC, the letter rests upon its side, but in the Greek alphabet of later times it generally resembles the modern capital letter, although many local varieties can be distinguished by the shortening of one leg, or by the angle at which the cross line is set.The Etruscans brought the Greek alphabet to their civilization in the Italian Peninsula and left the letter unchanged. 
The Romans later adopted the Etruscan alphabet to write the Latin language, and the resulting letter was preserved in the Latin alphabet that would come to be used to write many languages, including English.Typographic variantsDuring Roman times, there were many variant forms of the letter "A". First was the monumental or lapidary style, which was used when inscribing on stone or other "permanent" media. There was also a cursive style used for everyday or utilitarian writing, which was done on more perishable surfaces. Due to the "perishable" nature of these surfaces, there are not as many examples of this style as there are of the monumental, but there are still many surviving examples of different types of cursive, such as majuscule cursive, minuscule cursive, and semicursive minuscule. Variants also existed that were intermediate between the monumental and cursive styles. The known variants include the early semi-uncial, the uncial, and the later semi-uncial.At the end of the Roman Empire (5th century AD), several variants of the cursive minuscule developed through Western Europe. Among these were the semicursive minuscule of Italy, the Merovingian script in France, the Visigothic script in Spain, and the Insular or Anglo-Irish semi-uncial or Anglo-Saxon majuscule of Great Britain. By the 9th century, the Caroline script, which was very similar to the present-day form, was the principal form used in book-making, before the advent of the printing press. This form was derived through a combining of prior forms.15th-century Italy saw the formation of the two main variants that are known today. These variants, the Italic and Roman forms, were derived from the Caroline Script version. The Italic form, also called script a, is used in most current handwriting; it consists of a circle and vertical stroke on the right ("ɑ"). This slowly developed from the fifth-century form resembling the Greek letter tau in the hands of medieval Irish and English writers. 
The Roman form is used in most printed material; it consists of a small loop with an arc over it ("a"). Both derive from the majuscule (capital) form. In Greek handwriting, it was common to join the left leg and horizontal stroke into a single loop, as demonstrated by the uncial version shown. Many fonts then made the right leg vertical. In some of these, the serif that began the right leg stroke developed into an arc, resulting in the printed form, while in others it was dropped, resulting in the modern handwritten form. Graphic designers refer to the Italic and Roman forms as "single decker a" and "double decker a" respectively.Italic type is commonly used to mark emphasis or more generally to distinguish one part of a text from the rest (set in Roman type). There are some other cases aside from italic type where script a ("ɑ"), also called Latin alpha, is used in contrast with Latin "a" (such as in the International Phonetic Alphabet).Use in writing systemsEnglishIn modern English orthography, the letter  represents at least seven different vowel sounds:the near-open front unrounded vowel  as in pad;the open back unrounded vowel  as in father, which is closer to its original Latin and Greek sound;the diphthong  as in ace and major (usually when  is followed by one, or occasionally two, consonants and then another vowel letter) – this results from Middle English lengthening followed by the Great Vowel Shift;the modified form of the above sound that occurs before , as in square and Mary;the rounded vowel of water;the shorter rounded vowel (not present in General American) in was and what;a schwa, in many unstressed syllables, as in about, comma, solar.The double  sequence does not occur in native English words, but is found in some words derived from foreign languages such as Aaron and aardvark. However,  occurs in many common digraphs, all with their own sound or sounds, particularly , , , ,  and . 
is the third-most-commonly used letter in English (after  and ) and French, the second most common in Spanish, and the most common in Portuguese. About 8.167% of letters used in English texts tend to be ; the number is around 7.636% in French, 11.525% in Spanish, and 14.634% for Portuguese.Other languagesIn most languages that use the Latin alphabet,  denotes an open unrounded vowel, such as , , or . An exception is Saanich, in which  (and the glyph Á) stands for a close-mid front unrounded vowel .Other systemsIn phonetic and phonemic notation:in the International Phonetic Alphabet,  is used for the open front unrounded vowel,  is used for the open central unrounded vowel, and  is used for the open back unrounded vowel.in X-SAMPA,  is used for the open front unrounded vowel and  is used for the open back unrounded vowel.Other usesIn algebra, the letter a along with various other letters of the alphabet is often used to denote a variable, with various conventional meanings in different areas of mathematics. Moreover, in 1637, René Descartes "invented the convention of representing unknowns in equations by x, y, and z, and knowns by a, b, and c", and this convention is still often followed, especially in elementary algebra.In geometry, capital A, B, C etc. are used to denote segments, lines, rays, etc. A capital A is also typically used as one of the letters to represent an angle in a triangle, the lowercase a representing the side opposite angle A."A" is often used to denote something or someone of a better or more prestigious quality or status: A-, A or A+, the best grade that can be assigned by teachers for students' schoolwork; "A grade" for clean restaurants; A-list celebrities, etc. 
Such associations can have a motivating effect, as exposure to the letter A has been found to improve performance, when compared with other letters."A" is used as a prefix on some words, such as asymmetry, to mean "not" or "without" (from Greek).In English grammar, "a", and its variant "an", is an indefinite article, used to introduce noun phrases.Finally, the letter A is used to denote size, as in a narrow size shoe, or a small cup size in a brassiere.Related charactersDescendants and related characters in the Latin alphabetÆ æ : Latin AE ligatureA with diacritics: Å å Ǻ ǻ Ḁ ḁ ẚ Ă ă Ặ ặ Ắ ắ Ằ ằ Ẳ ẳ Ẵ ẵ Ȃ ȃ Â â Ậ ậ Ấ ấ Ầ ầ Ẫ ẫ Ẩ ẩ Ả ả Ǎ ǎ Ⱥ ⱥ Ȧ ȧ Ǡ ǡ Ạ ạ Ä ä Ǟ ǟ À à Ȁ ȁ Á á Ā ā Ā̀ ā̀ Ã ã Ą ą Ą́ ą́ Ą̃ ą̃ A̲ a̲ ᶏPhonetic alphabet symbols related to A (the International Phonetic Alphabet only uses lowercase, but uppercase forms are used in some other writing systems): Ɑ ɑ : Latin letter alpha / script A, which represents an open back unrounded vowel in the IPAᶐ : Latin small letter alpha with retroflex hookⱯ ɐ : Turned A, which represents a near-open central vowel in the IPAΛ ʌ : Turned V (also called a wedge, a caret, or a hat), which represents an open-mid back unrounded vowel in the IPAⱰ ɒ : Turned alpha / script A, which represents an open back rounded vowel in the IPAᶛ : Modifier letter small turned alphaᴀ : Small capital A, an obsolete or non-standard symbol in the International Phonetic Alphabet used to represent various sounds (mainly open vowels)A a ᵄ : Modifier letters are used in the Uralic Phonetic Alphabet (UPA) (sometimes encoded with Unicode subscripts and superscripts)a : Subscript small a is used in Indo-European studiesꬱ : Small letter a reversed-schwa is used in the Teuthonista phonetic transcription systemꞺ ꞻ : Glottal A, used in the transliteration of UgariticDerived signs, symbols and abbreviationsª : an ordinal indicatorÅ : Ångström sign∀ : a turned capital letter A, used in predicate logic to specify universal quantification ("for all")@ : At 
sign₳ : Argentine australAncestors and siblings in other alphabets𐤀 : Semitic letter Aleph, from which the following symbols originally deriveΑ α : Greek letter Alpha, from which the following letters deriveА а : Cyrillic letter A : Coptic letter Alpha𐌀 : Old Italic A, which is the ancestor of modern Latin A : Runic letter ansuz, which probably derives from old Italic A : Gothic letter aza/asksԱ ա : Armenian letter AybComputing codes 1Other representationsNotesFootnotesReferencesExternal links History of the Alphabet ISO basic Latin lettersVowel letters
+Alabama is a state in the Southeastern region of the United States, bordered by Tennessee to the north; Georgia to the east; Florida and the Gulf of Mexico to the south; and Mississippi to the west. Alabama is the 30th largest by area and the 24th-most populous of the U.S. states. With a total of 1,500 miles (2,400 km) of inland waterways, Alabama has among the most of any state.Alabama is nicknamed the Yellowhammer State, after the state bird. Alabama is also known as the "Heart of Dixie" and the "Cotton State". The state tree is the longleaf pine, and the state flower is the camellia. Alabama's capital is Montgomery, and its largest city by population and area is Huntsville. Its oldest city is Mobile, founded by French colonists in 1702 as the capital of French Louisiana. Greater Birmingham is Alabama's largest metropolitan area and its economic center.Originally home to many native tribes, present-day Alabama was a Spanish territory beginning in the sixteenth century until the French acquired it in the early eighteenth century. The British won the territory in 1763 until losing it in the American Revolutionary War. Spain held Mobile as part of Spanish West Florida until 1813. In December 1819, Alabama was recognized as a state. During the antebellum period, Alabama was a major producer of cotton, and widely used African American slave labor. In 1861, the state seceded from the United States to become part of the Confederate States of America, with Montgomery acting as its first capital, and rejoined the Union in 1868. Following the American Civil War, Alabama would suffer decades of economic hardship, in part due to agriculture and a few cash crops being the main driver of the state's economy. Similar to other former slave states, Alabamian legislators employed Jim Crow laws to disenfranchise and discriminate against African Americans from the late 19th century up until the 1960s. 
In the early 20th century, despite the growth of major industries and urban centers, white rural interests dominated the state legislature through the mid-20th century. During this time, urban interests and African Americans were markedly under-represented. High-profile events such as the Selma to Montgomery march made the state a major focal point of the civil rights movement in the 1950s and 1960s. During and after World War II, Alabama grew as the state's economy diversified with new industries. NASA's Marshall Space Flight Center in Huntsville would help Alabama's economic growth in the mid-to-late 20th century, by developing an aerospace industry. Alabama's economy in the 21st century is based on automotive, finance, tourism, manufacturing, aerospace, mineral extraction, healthcare, education, retail, and technology.The state's geography is diverse, with the north dominated by the mountainous Tennessee Valley and the south by Mobile Bay, a historically significant port. Politically, as part of the Deep South, Alabama is predominantly a conservative state, and culturally is known for its Southern culture. Within Alabama, American football, particularly at the college level at schools such as the University of Alabama, Auburn University, Alabama A&M University, Alabama State University, Troy University, the University of South Alabama, and Jacksonville State University, plays a major part in the state's culture.EtymologyThe European-American naming of the Alabama River and state was derived from the Alabama people, a Muskogean-speaking tribe whose members lived just below the confluence of the Coosa and Tallapoosa rivers on the upper reaches of the river. In the Alabama language, the word for a person of Alabama lineage is Albaamo (or variously Albaama or Albàamo in different dialects; the plural form is Albaamaha). The suggestion that "Alabama" was borrowed from the Choctaw language is unlikely. The word's spelling varies significantly among historical sources. 
The first usage appears in three accounts of the Hernando de Soto expedition of 1540: Garcilaso de la Vega used Alibamo, while the Knight of Elvas and Rodrigo Ranjel wrote Alibamu and Limamu, respectively, in transliterations of the term. As early as 1702, the French called the tribe the Alibamon, with French maps identifying the river as Rivière des Alibamons. Other spellings of the name have included Alibamu, Alabamo, Albama, Alebamon, Alibama, Alibamou, Alabamu, Allibamou, and possibly Alabahmu. The use of state names derived from Native American languages is common in the U.S.; an estimated 27 states have names of Native American origin.Sources disagree on the word's meaning. Some scholars suggest the word comes from the Choctaw alba (meaning 'plants' or 'weeds') and amo (meaning 'to cut', 'to trim', or 'to gather'). The meaning may have been 'clearers of the thicket' or 'herb gatherers', referring to clearing land for cultivation or collecting medicinal plants. The state has numerous place names of Native American origin. However, there are no correspondingly similar words in the Alabama language.An 1842 article in the Jacksonville Republican proposed it meant 'Here We Rest'. This notion was popularized in the 1850s through the writings of Alexander Beaufort Meek. Experts in the Muskogean languages have not found any evidence to support such a translation.HistoryPre-European settlementIndigenous peoples of varying cultures lived in the area for thousands of years before the advent of European colonization. Trade with the northeastern tribes by the Ohio River began during the Burial Mound Period (1000 BCE – 700 CE) and continued until European contact.The agrarian Mississippian culture covered most of the state from 1000 to 1600 CE, with one of its major centers built at what is now the Moundville Archaeological Site in Moundville, Alabama. This is the second-largest complex of the classic Middle Mississippian era, after Cahokia in present-day Illinois, which was the center of the culture. 
Analysis of artifacts from archaeological excavations at Moundville was the basis of scholars' formulating the characteristics of the Southeastern Ceremonial Complex (SECC). Contrary to popular belief, the SECC appears to have no direct links to Mesoamerican culture, but developed independently. The Ceremonial Complex represents a major component of the religion of the Mississippian peoples; it is one of the primary means by which their religion is understood.Among the historical tribes of Native American people living in present-day Alabama at the time of European contact were the Cherokee, an Iroquoian language people; and the Muskogean-speaking Alabama (Alibamu), Chickasaw, Choctaw, Creek, and Koasati. While part of the same large language family, the Muskogee tribes developed distinct cultures and languages.European settlementThe Spanish were the first Europeans to reach Alabama during their exploration of North America in the 16th century. The expedition of Hernando de Soto passed through Mabila and other parts of the state in 1540. More than 160 years later, the French founded the region's first European settlement at Old Mobile in 1702. The city was moved to the current site of Mobile in 1711. This area was claimed by the French from 1702 to 1763 as part of La Louisiane.After the French lost to the British in the Seven Years' War, it became part of British West Florida from 1763 to 1783. After the United States victory in the American Revolutionary War, the territory was divided between the United States and Spain. The latter retained control of this western territory from 1783 until the surrender of the Spanish garrison at Mobile to U.S. forces on April 13, 1813.Thomas Bassett, a loyalist to the British monarchy during the Revolutionary era, was one of the earliest white settlers in the state outside Mobile. He settled in the Tombigbee District during the early 1770s. 
The district's boundaries were roughly limited to the area within a few miles of the Tombigbee River and included portions of what is today southern Clarke County, northernmost Mobile County, and most of Washington County. What are now the counties of Baldwin and Mobile became part of Spanish West Florida in 1783, part of the independent Republic of West Florida in 1810, and were finally added to the Mississippi Territory in 1812. Most of what is now the northern two-thirds of Alabama was known as the Yazoo lands beginning during the British colonial period. It was claimed by the Province of Georgia from 1767 onwards. Following the Revolutionary War, it remained a part of Georgia, although heavily disputed. With the exception of the area around Mobile and the Yazoo lands, what is now the lower one-third of Alabama was made part of the Mississippi Territory when it was organized in 1798. The Yazoo lands were added to the territory in 1804, following the Yazoo land scandal. Spain kept a claim on its former Spanish West Florida territory in what would become the coastal counties until the Adams–Onís Treaty officially ceded it to the United States in 1819. Early 19th century. Before Mississippi's admission to statehood on December 10, 1817, the more sparsely settled eastern half of the territory was separated and named the Alabama Territory. The United States Congress created the Alabama Territory on March 3, 1817. St. Stephens, now abandoned, served as the territorial capital from 1817 to 1819. Alabama was admitted as the 22nd state on December 14, 1819, with Congress selecting Huntsville as the site for the first Constitutional Convention. From July 5 to August 2, 1819, delegates met to prepare the new state constitution. Huntsville served as temporary capital from 1819 to 1820, when the seat of government moved to Cahaba in Dallas County. Cahaba, now a ghost town, was the first permanent state capital from 1820 to 1825. 
The Alabama Fever land rush was underway when the state was admitted to the Union, with settlers and land speculators pouring into the state to take advantage of fertile land suitable for cotton cultivation. Part of the frontier in the 1820s and 1830s, its constitution provided for universal suffrage for white men.Southeastern planters and traders from the Upper South brought slaves with them as the cotton plantations in Alabama expanded. The economy of the central Black Belt (named for its dark, productive soil) was built around large cotton plantations whose owners' wealth grew mainly from slave labor. The area also drew many poor, disenfranchised people who became subsistence farmers. Alabama had an estimated population of under 10,000 people in 1810, but it increased to more than 300,000 people by 1830. Most Native American tribes were completely removed from the state within a few years of the passage of the Indian Removal Act by Congress in 1830.From 1826 to 1846, Tuscaloosa served as Alabama's capital. On January 30, 1846, the Alabama legislature announced it had voted to move the capital city from Tuscaloosa to Montgomery. The first legislative session in the new capital met in December 1847. A new capitol building was erected under the direction of Stephen Decatur Button of Philadelphia. The first structure burned down in 1849, but was rebuilt on the same site in 1851. This second capitol building in Montgomery remains to the present day. It was designed by Barachias Holt of Exeter, Maine.Civil War and ReconstructionBy 1860, the population had increased to 964,201 people, of which nearly half, 435,080, were enslaved African Americans, and 2,690 were free people of color. On January 11, 1861, Alabama declared its secession from the Union. After remaining an independent republic for a few days, it joined the Confederate States of America. The Confederacy's capital was initially at Montgomery. Alabama was heavily involved in the American Civil War. 
Although comparatively few battles were fought in the state, Alabama contributed about 120,000 soldiers to the war effort. A company of cavalry soldiers from Huntsville, Alabama, joined Nathan Bedford Forrest's battalion in Hopkinsville, Kentucky. The company wore new uniforms with yellow trim on the sleeves, collar and coattails. This led to them being greeted with "Yellowhammer", and the name later was applied to all Alabama troops in the Confederate Army. Alabama's slaves were freed by the 13th Amendment in 1865. Alabama was under military rule from the end of the war in May 1865 until its official restoration to the Union in 1868. From 1867 to 1874, with most white citizens barred temporarily from voting and freedmen enfranchised, many African Americans emerged as political leaders in the state. Alabama was represented in Congress during this period by three African-American congressmen: Jeremiah Haralson, Benjamin S. Turner, and James T. Rapier. Following the war, the state remained chiefly agricultural, with an economy tied to cotton. During Reconstruction, state legislators ratified a new state constitution in 1868 which created the state's first public school system and expanded women's rights. Legislators funded numerous public road and railroad projects, although these were plagued with allegations of fraud and misappropriation. Organized insurgent resistance groups tried to suppress the freedmen and Republicans. Besides the short-lived original Ku Klux Klan, these included the Pale Faces, Knights of the White Camellia, Red Shirts, and the White League. Reconstruction in Alabama ended in 1874, when the Democrats regained control of the legislature and governor's office through an election dominated by fraud and violence. They wrote another constitution in 1875, and the legislature passed the Blaine Amendment, prohibiting public money from being used to finance religious-affiliated schools. 
The same year, legislation was approved that called for racially segregated schools. Railroad passenger cars were segregated in 1891.20th centuryThe new 1901 Constitution of Alabama included provisions for voter registration that effectively disenfranchised large portions of the population, including nearly all African Americans and Native Americans, and tens of thousands of poor European Americans, through making voter registration difficult, requiring a poll tax and literacy test. The 1901 constitution required racial segregation of public schools. By 1903 only 2,980 African Americans were registered in Alabama, although at least 74,000 were literate. This compared to more than 181,000 African Americans eligible to vote in 1900. The numbers dropped even more in later decades. The state legislature passed additional racial segregation laws related to public facilities into the 1950s: jails were segregated in 1911; hospitals in 1915; toilets, hotels, and restaurants in 1928; and bus stop waiting rooms in 1945.While the planter class had persuaded poor whites to vote for this legislative effort to suppress black voting, the new restrictions resulted in their disenfranchisement as well, due mostly to the imposition of a cumulative poll tax. By 1941, whites constituted a slight majority of those disenfranchised by these laws: 600,000 whites vs. 520,000 African-Americans. Nearly all Blacks had lost the ability to vote. Despite numerous legal challenges which succeeded in overturning certain provisions, the state legislature would create new ones to maintain disenfranchisement. The exclusion of blacks from the political system persisted until after passage of federal civil rights legislation in 1965 to enforce their constitutional rights as citizens.The rural-dominated Alabama legislature consistently underfunded schools and services for the disenfranchised African Americans, but it did not relieve them of paying taxes. 
Partially as a response to chronic underfunding of education for African Americans in the South, the Rosenwald Fund began funding the construction of what came to be known as Rosenwald Schools. In Alabama these schools were designed and the construction partially financed with Rosenwald funds, which paid one-third of the construction costs. The fund required the local community and state to raise matching funds to pay the rest. Black residents effectively taxed themselves twice, by raising additional monies to supply matching funds for such schools, which were built in many rural areas. They often donated land and labor as well.Beginning in 1913, the first 80 Rosenwald Schools were built in Alabama for African-American children. A total of 387 schools, seven teachers' houses, and several vocational buildings were completed by 1937 in the state. Several of the surviving school buildings in the state are now listed on the National Register of Historic Places.Continued racial discrimination and lynchings, agricultural depression, and the failure of the cotton crops due to boll weevil infestation led tens of thousands of African Americans from rural Alabama and other states to seek opportunities in northern and midwestern cities during the early decades of the 20th century as part of the Great Migration out of the South. Reflecting this emigration, the population growth rate in Alabama (see "historical populations" table below) dropped by nearly half from 1910 to 1920.At the same time, many rural people migrated to the city of Birmingham to work in new industrial jobs. Birmingham experienced such rapid growth it was called the "Magic City". By 1920, Birmingham was the 36th-largest city in the United States. Heavy industry and mining were the basis of its economy. Its residents were under-represented for decades in the state legislature, which refused to redistrict after each decennial census according to population changes, as it was required by the state constitution. 
This did not change until the late 1960s following a lawsuit and court order.Industrial development related to the demands of World War II brought a level of prosperity to the state not seen since before the civil war. Rural workers poured into the largest cities in the state for better jobs and a higher standard of living. One example of this massive influx of workers occurred in Mobile. Between 1940 and 1943, more than 89,000 people moved into the city to work for war-related industries. Cotton and other cash crops faded in importance as the state developed a manufacturing and service base.Despite massive population changes in the state from 1901 to 1961, the rural-dominated legislature refused to reapportion House and Senate seats based on population, as required by the state constitution to follow the results of decennial censuses. They held on to old representation to maintain political and economic power in agricultural areas. One result was that Jefferson County, containing Birmingham's industrial and economic powerhouse, contributed more than one-third of all tax revenue to the state, but did not receive a proportional amount in services. Urban interests were consistently underrepresented in the legislature. A 1960 study noted that because of rural domination, "a minority of about 25% of the total state population is in majority control of the Alabama legislature."In the United States Supreme Court cases of Baker v. Carr (1962) and Reynolds v. Sims (1964), the court ruled that the principle of "one man, one vote" needed to be the basis of both houses of state legislatures, and that their districts had to be based on population rather than geographic counties.In 1972, for the first time since 1901, the legislature completed the congressional redistricting based on the decennial census. This benefited the urban areas that had developed, as well as all in the population who had been underrepresented for more than sixty years. 
Other changes were made to implement representative state house and senate districts.African Americans continued to press in the 1950s and 1960s to end disenfranchisement and segregation in the state through the civil rights movement, including legal challenges. In 1954, the U.S. Supreme Court ruled in Brown v. Board of Education that public schools had to be desegregated, but Alabama was slow to comply. During the 1960s, under Governor George Wallace, Alabama resisted compliance with federal demands for desegregation. The civil rights movement had notable events in Alabama, including the Montgomery bus boycott (1955–1956), Freedom Rides in 1961, and 1965 Selma to Montgomery marches. These contributed to Congressional passage and enactment of the Civil Rights Act of 1964 and Voting Rights Act of 1965 by the U.S. Congress.Legal segregation ended in the states in 1964, but Jim Crow customs often continued until specifically challenged in court. According to The New York Times, by 2017, many of Alabama's African-Americans were living in Alabama's cities such as Birmingham and Montgomery. Also, the Black Belt region across central Alabama "is home to largely poor counties that are predominantly African-American. These counties include Dallas, Lowndes, Marengo and Perry."Alabama has made some changes since the late 20th century and has used new types of voting to increase representation. In the 1980s, an omnibus redistricting case, Dillard v. Crenshaw County, challenged the at-large voting for representative seats of 180 Alabama jurisdictions, including counties and school boards. At-large voting had diluted the votes of any minority in a county, as the majority tended to take all seats. 
Despite African Americans making up a significant minority in the state, they had been unable to elect any representatives in most of the at-large jurisdictions. As part of settlement of this case, five Alabama cities and counties, including Chilton County, adopted a system of cumulative voting for election of representatives in multi-seat jurisdictions. This has resulted in more proportional representation for voters. In another form of proportional representation, 23 jurisdictions use limited voting, as in Conecuh County. In 1982, limited voting was first tested in Conecuh County. Together, use of these systems has increased the number of African Americans and women being elected to local offices, resulting in governments that are more representative of their citizens. Beginning in the 1960s, the state's economy shifted away from its traditional lumber, steel, and textile industries because of increased foreign competition. Steel jobs, for instance, declined from 46,314 in 1950 to 14,185 in 2011. However, the state, particularly Huntsville, benefited from the opening of the George C. Marshall Space Flight Center in 1960, a major facility in the development of the Saturn rocket program and the space shuttle. Technology and manufacturing industries, such as automobile assembly, replaced some of the state's older industries in the late twentieth century, but the state's economy and growth lagged behind other states in the area, such as Georgia and Florida. 21st century. In 2001, Alabama Supreme Court chief justice Roy Moore installed a statue of the Ten Commandments in the capitol in Montgomery. In 2002, the 11th US Circuit Court ordered the statue removed, but Moore refused to follow the court order, which led to protests around the capitol in favor of keeping the monument. The monument was removed in August 2003. A few natural disasters have occurred in the state in the twenty-first century. 
In 2004, Hurricane Ivan, a category 3 storm upon landfall, struck the state and caused over $18 billion of damage. It was among the most destructive storms to strike the state in its modern history. A super outbreak of 62 tornadoes hit the state in April 2011 and killed 238 people, devastating many communities.GeographyAlabama is the thirtieth-largest state in the United States with  of total area: 3.2% of the area is water, making Alabama 23rd in the amount of surface water, also giving it the second-largest inland waterway system in the United States. About three-fifths of the land area is part of the Gulf Coastal Plain, a gentle plain with a general descent towards the Mississippi River and the Gulf of Mexico. The North Alabama region is mostly mountainous, with the Tennessee River cutting a large valley and creating numerous creeks, streams, rivers, mountains, and lakes.Alabama is bordered by the states of Tennessee to the north, Georgia to the east, Florida to the south, and Mississippi to the west. Alabama has coastline at the Gulf of Mexico, in the extreme southern edge of the state. The state ranges in elevation from sea level at Mobile Bay to more than  in the northeast, to Mount Cheaha at .Alabama's land consists of  of forest or 67% of the state's total land area. Suburban Baldwin County, along the Gulf Coast, is the largest county in the state in both land area and water area.Areas in Alabama administered by the National Park Service include Horseshoe Bend National Military Park near Alexander City; Little River Canyon National Preserve near Fort Payne; Russell Cave National Monument in Bridgeport; Tuskegee Airmen National Historic Site in Tuskegee; and Tuskegee Institute National Historic Site near Tuskegee. Additionally, Alabama has four National Forests: Conecuh, Talladega, Tuskegee, and William B. Bankhead. 
Alabama also contains the Natchez Trace Parkway, the Selma To Montgomery National Historic Trail, and the Trail of Tears National Historic Trail.Notable natural wonders include: the "Natural Bridge" rock, the longest natural bridge east of the Rockies, located just south of Haleyville; Cathedral Caverns in Marshall County, named for its cathedral-like appearance, features one of the largest cave entrances and stalagmites in the world; Ecor Rouge in Fairhope, the highest coastline point between Maine and Mexico; DeSoto Caverns in Childersburg, the first officially recorded cave in the United States; Noccalula Falls in Gadsden features a 90-foot waterfall; Dismals Canyon near Phil Campbell, home to two waterfalls, six natural bridges and allegedly served as a hideout for legendary outlaw Jesse James; Stephens Gap Cave in Jackson County boasts a 143-foot pit, two waterfalls and is one of the most photographed wild cave scenes in America; Little River Canyon near Fort Payne, one of the nation's longest mountaintop rivers; Rickwood Caverns near Warrior features an underground pool, blind cave fish and 260-million-year-old limestone formations; and the Walls of Jericho canyon on the Alabama-Tennessee state line.A -wide meteorite impact crater is located in Elmore County, just north of Montgomery. This is the Wetumpka crater, the site of "Alabama's greatest natural disaster". A -wide meteorite hit the area about 80 million years ago. The hills just east of downtown Wetumpka showcase the eroded remains of the impact crater that was blasted into the bedrock, with the area labeled the Wetumpka crater or astrobleme ("star-wound") because of the concentric rings of fractures and zones of shattered rock that can be found beneath the surface. 
In 2002, Christian Koeberl with the Institute of Geochemistry University of Vienna published evidence and established the site as the 157th recognized impact crater on Earth.ClimateThe state is classified as humid subtropical (Cfa) under the Koppen Climate Classification. The average annual temperature is 64°F (18°C). Temperatures tend to be warmer in the southern part of the state with its proximity to the Gulf of Mexico, while the northern parts of the state, especially in the Appalachian Mountains in the northeast, tend to be slightly cooler. Generally, Alabama has very hot summers and mild winters with copious precipitation throughout the year. Alabama receives an average of  of rainfall annually and enjoys a lengthy growing season of up to 300 days in the southern part of the state.Summers in Alabama are among the hottest in the U.S., with high temperatures averaging over  throughout the summer in some parts of the state. Alabama is also prone to tropical storms and hurricanes. Areas of the state far away from the Gulf are not immune to the effects of the storms, which often dump tremendous amounts of rain as they move inland and weaken.South Alabama reports many thunderstorms. The Gulf Coast, around Mobile Bay, averages between 70 and 80 days per year with thunder reported. This activity decreases somewhat further north in the state, but even the far north of the state reports thunder on about 60 days per year. Occasionally, thunderstorms are severe with frequent lightning and large hail; the central and northern parts of the state are most vulnerable to this type of storm. Alabama ranks ninth in the number of deaths from lightning and tenth in the number of deaths from lightning strikes per capita.Alabama, along with Oklahoma and Iowa, has the most confirmed F5 and EF5 tornadoes of any state, according to statistics from the National Climatic Data Center for the period January 1, 1950, to June 2013. 
Several long-tracked F5/EF5 tornadoes have contributed to Alabama reporting more tornado fatalities since 1950 than any other state. The state was affected by the 1974 Super Outbreak and was devastated tremendously by the 2011 Super Outbreak. The 2011 Super Outbreak produced a record amount of tornadoes in the state. The tally reached 62.The peak season for tornadoes varies from the northern to southern parts of the state. Alabama is one of the few places in the world that has a secondary tornado season in November and December besides the typically severe spring. The northern part—along the Tennessee River Valley—is most vulnerable. The area of Alabama and Mississippi most affected by tornadoes is sometimes referred to as Dixie Alley, as distinct from the Tornado Alley of the Southern Plains.Winters are generally mild in Alabama, as they are throughout most of the Southeastern United States, with average January low temperatures around  in Mobile and around  in Birmingham. Although snow is a rare event in much of Alabama, areas of the state north of Montgomery may receive a dusting of snow a few times every winter, with an occasional moderately heavy snowfall every few years. Historic snowfall events include New Year's Eve 1963 snowstorm and the 1993 Storm of the Century. The annual average snowfall for the Birmingham area is  per year. In the southern Gulf coast, snowfall is less frequent, sometimes going several years without any snowfall.Alabama's highest temperature of  was recorded on September 5, 1925, in the unincorporated community of Centerville. The record low of  occurred on January 30, 1966, in New Market.Flora and faunaAlabama is home to a diverse array of flora and fauna in habitats that range from the Tennessee Valley, Appalachian Plateau, and Ridge-and-Valley Appalachians of the north to the Piedmont, Canebrake, and Black Belt of the central region to the Gulf Coastal Plain and beaches along the Gulf of Mexico in the south. 
The state is usually ranked among the top in the nation for its range of overall biodiversity. Alabama is in the subtropical coniferous forest biome and once boasted huge expanses of pine forest, which still form the largest proportion of forests in the state. It currently ranks fifth in the nation for the diversity of its flora. It is home to nearly 4,000 pteridophyte and spermatophyte plant species. Indigenous animal species in the state include 62 mammal species, 93 reptile species, 73 amphibian species, roughly 307 native freshwater fish species, and 420 bird species that spend at least part of their year within the state. Invertebrates include 97 crayfish species and 383 mollusk species. 113 of these mollusk species have never been collected outside the state. Census-designated and metropolitan areas. Cities. Demographics. According to the 2020 United States census the population of Alabama was 5,024,279 on April 1, 2020, which represents an increase of 244,543 or 5.12%, since the 2010 census. This includes a natural increase since the last census of 121,054 (502,457 births minus 381,403 deaths) and an increase due to net migration of 104,991 into the state. Immigration from outside the U.S. resulted in a net increase of 31,180 people, and migration within the country produced a net gain of 73,811 people. The state had 108,000 foreign-born (2.4% of the state population), of which an estimated 22.2% were undocumented (24,000). The center of population of Alabama is located in Chilton County, outside the town of Jemison. Ancestry. Those citing "American" ancestry in Alabama are of overwhelmingly English extraction; however, most English Americans identify simply as having American ancestry because their roots have been in North America for so long, in many cases since the early sixteen hundreds. Demographers estimate that a minimum of 20–23% of people in Alabama are of predominantly English ancestry and state that the figure is probably much higher. 
In the 1980 census 1,139,976 people in Alabama cited that they were of English ancestry, out of a total state population of 2,824,719, making them 41% of the state at the time and the largest ethnic group. In 2011, 46.6% of Alabama's population younger than age 1 were minorities. The largest reported ancestry groups in Alabama are American (13.4%), Irish (10.5%), English (10.2%), German (7.9%), and Scots-Irish (2.5%) based on 2006–2008 Census data. The Scots-Irish were the largest non-English immigrant group from the British Isles before the American Revolution, and many settled in the South, later moving into the Deep South as it was developed. In 1984, under the Davis–Strong Act, the state legislature established the Alabama Indian Affairs Commission. Native American groups within the state had increasingly been demanding recognition as ethnic groups and seeking an end to discrimination. Given the long history of slavery and associated racial segregation, the Native American peoples, who have sometimes been of mixed race, have insisted on having their cultural identification respected. In the past, their self-identification was often overlooked as the state tried to impose a binary breakdown of society into white and black. The state has officially recognized nine American Indian tribes in the state, descended mostly from the Five Civilized Tribes of the American Southeast. These are the following. 
Poarch Band of Creek Indians (who also have federal recognition) MOWA Band of Choctaw Indians Star Clan of Muscogee Creeks Echota Cherokee Tribe of Alabama Cherokee Tribe of Northeast Alabama Cher-O-Creek Intra Tribal Indians Ma-Chis Lower Creek Indian Tribe Piqua Shawnee Tribe Ani-Yun-Wiya NationThe state government has promoted recognition of Native American contributions to the state, including the designation in 2000 for Columbus Day to be jointly celebrated as American Indian Heritage Day.LanguageMost Alabama residents (95.1% of those five and older) spoke only English at home in 2010, a minor decrease from 96.1% in 2000. Alabama English is predominantly Southern, and is related to South Midland speech which was taken across the border from Tennessee. In the major Southern speech region, there is the decreasing loss of the final r, for example the "boyd" pronunciation of "bird". In the northern third of the state, there is a South Midland "arm" and "barb" rhyming with "form" and "orb". Unique words in Alabama English include: redworm (earthworm), peckerwood (woodpecker), snake doctor and snake feeder (dragonfly), tow sack (burlap bag), plum peach (clingstone), French harp (harmonica), and dog irons (andirons).ReligionIn the 2008 American Religious Identification Survey, 86% of Alabama respondents reported their religion as Christian, including 6% Catholic, with 11% as having no religion. The composition of other traditions is 0.5% Mormon, 0.5% Jewish, 0.5% Muslim, 0.5% Buddhist, and 0.5% Hindu.Alabama is located in the middle of the Bible Belt, a region of numerous Protestant Christians. Alabama has been identified as one of the most religious states in the United States, with about 58% of the population attending church regularly. A majority of people in the state identify as Evangelical Protestant. 
, the three largest denominational groups in Alabama are the Southern Baptist Convention, The United Methodist Church, and non-denominational Evangelical Protestant.In Alabama, the Southern Baptist Convention has the highest number of adherents with 1,380,121; this is followed by the United Methodist Church with 327,734 adherents, non-denominational Evangelical Protestant with 220,938 adherents, and the Catholic Church with 150,647 adherents. Many Baptist and Methodist congregations became established in the Great Awakening of the early 19th century, when preachers proselytized across the South. The Assemblies of God had almost 60,000 members, the Churches of Christ had nearly 120,000 members. The Presbyterian churches, strongly associated with Scots-Irish immigrants of the 18th century and their descendants, had a combined membership around 75,000 (PCA—28,009 members in 108 congregations, PC(USA)—26,247 members in 147 congregations, the Cumberland Presbyterian Church—6,000 members in 59 congregations, the Cumberland Presbyterian Church in America—5,000 members and fifty congregations plus the EPC and Associate Reformed Presbyterians with 230 members and nine congregations).In a 2007 survey, nearly 70% of respondents could name all four of the Christian Gospels. Of those who indicated a religious preference, 59% said they possessed a "full understanding" of their faith and needed no further learning. In a 2007 poll, 92% of Alabamians reported having at least some confidence in churches in the state.Although in much smaller numbers, many other religious faiths are represented in the state as well, including Judaism, Islam, Hinduism, Buddhism, Sikhism, the Baháʼí Faith, and Unitarian Universalism.Jews have been present in what is now Alabama since 1763, during the colonial era of Mobile, when Sephardic Jews immigrated from London. The oldest Jewish congregation in the state is Congregation Sha'arai Shomayim in Mobile. 
It was formally recognized by the state legislature on January 25, 1844. Later immigrants in the nineteenth and twentieth centuries tended to be Ashkenazi Jews from eastern Europe. Jewish denominations in the state include two Orthodox, four Conservative, ten Reform, and one Humanistic synagogue. Muslims have been increasing in Alabama, with 31 mosques built by 2011, many by African-American converts. Several Hindu temples and cultural centers in the state have been founded by Indian immigrants and their descendants, the best-known being the Shri Swaminarayan Mandir in Birmingham, the Hindu Temple and Cultural Center of Birmingham in Pelham, the Hindu Cultural Center of North Alabama in Capshaw, and the Hindu Mandir and Cultural Center in Tuscaloosa. There are six Dharma centers and organizations for Theravada Buddhists. Most monastic Buddhist temples are concentrated in southern Mobile County, near Bayou La Batre. This area has attracted an influx of refugees from Cambodia, Laos, and Vietnam during the 1970s and thereafter. The four temples within a ten-mile radius of Bayou La Batre include Chua Chanh Giac, Wat Buddharaksa, and Wat Lao Phoutthavihan. The first community of adherents of the Baháʼí Faith in Alabama was founded in 1896 by Paul K. Dealy, who moved from Chicago to Fairhope. Baháʼí centers in Alabama exist in Birmingham, Huntsville, and Florence. Health. In 2018, life expectancy in Alabama was 75.1 years, below the national average of 78.7 years and the third-lowest life expectancy in the country. Factors that can cause lower life expectancy are maternal mortality, suicide, and gun crimes. A Centers for Disease Control and Prevention study in 2008 showed that obesity in Alabama is a problem, with most counties having more than 29% of adults obese, except for ten which had a rate between 26% and 29%. Residents of the state, along with those in five other states, were least likely in the nation to be physically active during leisure time. 
Alabama, and the southeastern U.S. in general, has one of the highest incidences of adult onset diabetes in the country, exceeding 10% of adults.On May 14, 2019, Alabama passed the Human Life Protection Act, banning abortion at any stage of pregnancy unless there is a "serious health risk", with no exceptions for rape and incest. The law, if enacted, would punish doctors who perform abortions with 10 to 99 years imprisonment and be the most restrictive abortion law in the country. However, on October 29, 2019, U.S. District Judge Myron Thompson blocked the law from taking effect.EconomyThe state has invested in aerospace, education, health care, banking, and various heavy industries, including automobile manufacturing, mineral extraction, steel production and fabrication. By 2006, crop and animal production in Alabama was valued at $1.5billion. In contrast to the primarily agricultural economy of the previous century, this was only about one percent of the state's gross domestic product. 
The number of private farms has declined at a steady rate since the 1960s, as land has been sold to developers, timber companies, and large farming conglomerates.Non-agricultural employment in 2008 was 121,800 in management occupations; 71,750 in business and financial operations; 36,790 in computer-related and mathematical occupation; 44,200 in architecture and engineering; 12,410 in life, physical, and social sciences; 32,260 in community and social services; 12,770 in legal occupations; 116,250 in education, training, and library services; 27,840 in art, design and media occupations; 121,110 in healthcare; 44,750 in fire fighting, law enforcement, and security; 154,040 in food preparation and serving; 76,650 in building and grounds cleaning and maintenance; 53,230 in personal care and services; 244,510 in sales; 338,760 in office and administration support; 20,510 in farming, fishing, and forestry; 120,155 in construction and mining, gas, and oil extraction; 106,280 in installation, maintenance, and repair; 224,110 in production; and 167,160 in transportation and material moving.According to the U.S. Bureau of Economic Analysis, the 2008 total gross state product was $170billion, or $29,411 per capita. Alabama's 2012 GDP increased 1.2% from the previous year. The single largest increase came in the area of information. In 2010, per capita income for the state was $22,984.The state's seasonally adjusted unemployment rate was 5.8% in April 2015. This compared to a nationwide seasonally adjusted rate of 5.4%.Alabama has no minimum wage and in February 2016 passed legislation preventing municipalities from setting one. (A Birmingham city ordinance would have raised theirs to $10.10.) Alabama has the sixth highest poverty rate among states in the U.S. 
In 2017, United Nations Special Rapporteur Philip Alston toured parts of rural Alabama and observed environmental conditions he said were poorer than anywhere he had seen in the developed world.Largest employersThe five employers that employed the most employees in Alabama in April 2011 were:The next twenty largest employers included:AgricultureAlabama's agricultural outputs include poultry and eggs, cattle, fish, plant nursery items, peanuts, cotton, grains such as corn and sorghum, vegetables, milk, soybeans, and peaches. Although known as "The Cotton State", Alabama ranks between eighth and tenth in national cotton production, according to various reports, with Texas, Georgia and Mississippi comprising the top three.Aquaculture Aquaculture is a large part of the economy of Alabama. Alabamians began to practice aquaculture in the early 1960s. U.S. farm-raised catfish is the 8th most popular seafood product in America. By 2008, approximately 4,000 people in Alabama were employed by the catfish industry and Alabama produced 132 million pounds of catfish. In 2020, Alabama produced ⅓ of the United States' farm-raised catfish. The total 2020 sales of catfish raised in Alabama equaled $307 million but by 2020 the total employment of Alabamians fell to 2,442.From the early 2000s to 2020, the Alabamian catfish industry has declined from 250 farms and 4 processors to 66 farms and 2 processors. Reasons for this decline include increased feed prices, catfish alternatives, COVID-19’s impact on restaurant sales, disease, and fish size.IndustryAlabama's industrial outputs include iron and steel products (including cast-iron and steel pipe); paper, lumber, and wood products; mining (mostly coal); plastic products; cars and trucks; and apparel. In addition, Alabama produces aerospace and electronic products, mostly in the Huntsville area, the location of NASA's George C. Marshall Space Flight Center and the U.S. 
Army Materiel Command, headquartered at Redstone Arsenal.A great deal of Alabama's economic growth since the 1990s has been due to the state's expanding automotive manufacturing industry. Located in the state are Honda Manufacturing of Alabama, Hyundai Motor Manufacturing Alabama, Mercedes-Benz U.S. International, and Toyota Motor Manufacturing Alabama, as well as their various suppliers. Since 1993, the automobile industry has generated more than 67,800 new jobs in the state. Alabama currently ranks 4th in the nation for vehicle exports.Automakers accounted for approximately a third of the industrial expansion in the state in 2012. The eight models produced at the state's auto factories totaled combined sales of 74,335 vehicles for 2012. The strongest model sales during this period were the Hyundai Elantra compact car, the Mercedes-Benz GL-Class sport utility vehicle and the Honda Ridgeline sport utility truck.Steel producers Outokumpu, Nucor, SSAB, ThyssenKrupp, and U.S. Steel have facilities in Alabama and employ more than 10,000 people. In May 2007, German steelmaker ThyssenKrupp selected Calvert in Mobile County for a 4.65billion combined stainless and carbon steel processing facility. ThyssenKrupp's stainless steel division, Inoxum, including the stainless portion of the Calvert plant, was sold to Finnish stainless steel company Outokumpu in 2012. The remaining portion of the ThyssenKrupp plant had final bids submitted by ArcelorMittal and Nippon Steel for $1.6billion in March 2013. Companhia Siderúrgica Nacional submitted a combined bid for the mill at Calvert, plus a majority stake in the ThyssenKrupp mill in Brazil, for $3.8billion. In July 2013, the plant was sold to ArcelorMittal and Nippon Steel.The Hunt Refining Company, a subsidiary of Hunt Consolidated, Inc., is based in Tuscaloosa and operates a refinery there. The company also operates terminals in Mobile, Melvin, and Moundville. JVC America, Inc. 
operates an optical disc replication and packaging plant in Tuscaloosa.The Goodyear Tire and Rubber Company operates a large plant in Gadsden which employs about 1,400 people. It has been in operation since 1929.Construction of an Airbus A320 family aircraft assembly plant in Mobile was formally announced by Airbus CEO Fabrice Brégier from the Mobile Convention Center on July 2, 2012. The plans include a $600million factory at the Brookley Aeroplex for the assembly of the A319, A320 and A321 aircraft. Construction began in 2013, with plans for it to become operable by 2015 and produce up to 50 aircraft per year by 2017. The assembly plant is the company's first factory to be built within the United States. It was announced on February 1, 2013, that Airbus had hired Alabama-based Hoar Construction to oversee construction of the facility.Tourism and entertainmentAccording to Business Insider, Alabama ranked 14th in most popular states to visit in 2014. An estimated 26 million tourists visited the state in 2017 and spent $14.3 billion, providing directly or indirectly 186,900 jobs in the state, which includes 362,000 International tourists spending $589 million.The state is home to various attractions, natural features, parks and events that attract visitors from around the globe, notably the annual Hangout Music Festival, held on the public beaches of Gulf Shores; the Alabama Shakespeare Festival, one of the ten largest Shakespeare festivals in the world; the Robert Trent Jones Golf Trail, a collection of championship caliber golf courses distributed across the state; casinos such as Victoryland; amusement parks such as Alabama Splash Adventure; the Riverchase Galleria, one of the largest shopping centers in the southeast; Guntersville Lake, voted the best lake in Alabama by Southern Living Magazine readers; and the Alabama Museum of Natural History, the oldest museum in the state.Mobile is known for having the oldest organized Mardi Gras celebration in the United 
States, beginning in 1703. It was also host to the first formally organized Mardi Gras parade in the United States in 1830, a tradition that continues to this day. Mardi Gras is an official state holiday in Mobile and Baldwin counties.In 2018, Mobile's Mardi Gras parade was the state's top event, producing the most tourists with an attendance of 892,811. The top attraction was the U.S. Space & Rocket Center in Huntsville with an attendance of 849,981, followed by the Birmingham Zoo with 543,090. Of the parks and natural destinations, Alabama's Gulf Coast topped the list with 6,700,000 visitors.Alabama has historically been a popular region for film shoots due to its diverse landscapes and contrast of environments. Movies filmed in Alabama include: Close Encounters of the Third Kind, Get Out, 42, Selma, Big Fish, The Final Destination, Due Date, Need For Speed and many more.HealthcareUAB Hospital, USA Health University Hospital, Huntsville Hospital, and Children's Hospital of Alabama are the only Level I trauma centers in Alabama. UAB is the largest state government employer in Alabama, with a workforce of about 18,000. A 2017 study found that Alabama had the least competitive health insurance market in the country, with Blue Cross and Blue Shield of Alabama having a market share of 84% followed by UnitedHealth Group at 7%.BankingRegions Financial Corporation is the largest bank headquartered in or operating in Alabama. PNC Financial Services and Wells Fargo also have a major presence in Alabama.Wells Fargo has a regional headquarters, an operations center campus, and a $400million data center in Birmingham. Many smaller banks are also headquartered in the Birmingham area, including ServisFirst and New South Federal Savings Bank. 
Birmingham also serves as the headquarters for several large investment management companies, including Harbert Management Corporation.Electronics and communicationsTelecommunications provider AT&T, formerly BellSouth, has a major presence in Alabama with several large offices in Birmingham.Many technology companies are headquartered in Huntsville, such as ADTRAN, a network access company; Intergraph, a computer graphics company; and Avocent, an IT infrastructure company.ConstructionBrasfield & Gorrie, BE&K, Hoar Construction, and B.L. Harbert International, based in Alabama and subsidiaries of URS Corporation, are all routinely included in the Engineering News-Record lists of top design, international construction, and engineering firms.Law and governmentState governmentThe foundational document for Alabama's government is the Alabama Constitution, which was ratified in 1901. With over 850 amendments and almost 87,000 words, it is by some accounts the world's longest constitution and is roughly forty times the length of the United States Constitution.There has been a significant movement to rewrite and modernize Alabama's constitution. Critics argue that Alabama's constitution maintains highly centralized power with the state legislature, leaving practically no power in local hands. Most counties do not have home rule. Any policy changes proposed in different areas of the state must be approved by the entire Alabama legislature and, frequently, by state referendum. One criticism of the current constitution claims that its complexity and length intentionally codify segregation and racism.Alabama's government is divided into three coequal branches. The legislative branch is the Alabama Legislature, a bicameral assembly composed of the Alabama House of Representatives, with 105 members, and the Alabama Senate, with 35 members. The Legislature is responsible for writing, debating, passing, or defeating state legislation. 
The Republican Party currently holds a majority in both houses of the Legislature. The Legislature has the power to override a gubernatorial veto by a simple majority (most state Legislatures require a two-thirds majority to override a veto).Until 1964, the state elected state senators on a geographic basis by county, with one per county. It had not redistricted congressional districts since passage of its constitution in 1901; as a result, urbanized areas were grossly underrepresented. It had not changed legislative districts to reflect the decennial censuses, either. In Reynolds v. Sims (1964), the U.S. Supreme Court implemented the principle of "one man, one vote", ruling that congressional districts had to be reapportioned based on censuses (as the state already included in its constitution but had not implemented.) Further, the court ruled that both houses of bicameral state legislatures had to be apportioned by population, as there was no constitutional basis for states to have geographically based systems.At that time, Alabama and many other states had to change their legislative districting, as many across the country had systems that underrepresented urban areas and districts. This had caused decades of underinvestment in such areas. For instance, Birmingham and Jefferson County taxes had supplied one-third of the state budget, but Jefferson County received only 1/67th of state services in funding. Through the legislative delegations, the Alabama legislature kept control of county governments.The executive branch is responsible for the execution and oversight of laws. It is headed by the governor of Alabama. Other members of the executive branch include the cabinet, the lieutenant governor of Alabama, the Attorney General of Alabama, the Alabama Secretary of State, the Alabama State Treasurer, and the State Auditor of Alabama. The current governor is Republican Kay Ivey.The members of the Legislature take office immediately after the November elections. 
Statewide officials, such as the governor, lieutenant governor, attorney general, and other constitutional officers, take office the following January.The judiciary is responsible for interpreting the Constitution of Alabama and applying the law in state criminal and civil cases. The state's highest court is the Supreme Court of Alabama. Alabama uses partisan elections to select judges. Since the 1980s judicial campaigns have become increasingly politicized. The current chief justice of the Alabama Supreme Court is Republican Tom Parker. All sitting justices on the Alabama Supreme Court are members of the Republican Party. There are two intermediate appellate courts, the Court of Civil Appeals and the Court of Criminal Appeals, and four trial courts: the circuit court (trial court of general jurisdiction), and the district, probate, and municipal courts.Some critics believe the election of judges has contributed to an exceedingly high rate of executions. Alabama has the highest per capita death penalty rate in the country. In some years, it imposes more death sentences than does Texas, a state which has a population five times larger. However, executions per capita are significantly higher in Texas. Some of its cases have been highly controversial; the U.S. Supreme Court has overturned 24 convictions in death penalty cases. It was the only state to allow judges to override jury decisions in whether or not to use a death sentence; in 10 cases judges overturned sentences of life imprisonment without parole that were voted unanimously by juries. This judicial authority was removed in April 2017.TaxesTaxes are collected by the Alabama Department of Revenue. Alabama levies a 2%, 4%, or 5% personal income tax, depending on the amount earned and filing status. 
Taxpayers are allowed to deduct their federal income tax from their Alabama state tax, even if taking the standard deduction; those who itemize can also deduct FICA (the Social Security and Medicare tax).The state's general sales tax rate is 4%. Sales tax rates for cities and counties are also added to purchases. For example, the total sales tax rate in Mobile County, Alabama is 10% and there is an additional restaurant tax of 1%, which means a diner in Mobile County, Alabama would pay an 11% tax on a meal.In 2020, sales and excise taxes in Alabama accounted for 38% of all state and local revenue.Only Alabama, Mississippi, and South Dakota tax groceries at the full state sales tax rate.The corporate income tax rate in Alabama is 6.5%. The overall federal, state, and local tax burden in Alabama ranks the state as the second least tax-burdened state in the country.Property taxes of .40% of assessed value per year are the second-lowest in the U.S., after Hawaii. The current state constitution requires a voter referendum to raise property taxes.County and local governmentsAlabama has 67 counties. Each county has its own elected legislative branch, usually called the county commission. It also has limited executive authority in the county. Because of the constraints of the Alabama Constitution, which centralizes power in the state legislature, only seven counties (Jefferson, Lee, Mobile, Madison, Montgomery, Shelby, and Tuscaloosa) in the state have limited home rule. Instead, most counties in the state must lobby the Local Legislation Committee of the state legislature to get simple local policies approved, ranging from waste disposal to land use zoning.The state legislature has retained power over local governments by refusing to pass a constitutional amendment establishing home rule for counties, as recommended by the 1973 Alabama Constitutional Commission. Legislative delegations retain certain powers over each county. 
United States Supreme Court decisions in Baker v. Carr (1964) required that both houses have districts established on the basis of population, and redistricted after each census, to implement the principle of "one man, one vote". Before that, each county was represented by one state senator, leading to under-representation in the state senate for more urbanized, populous counties. The rural bias of the state legislature, which had also failed to redistrict seats in the state house, affected politics well into the 20th century, failing to recognize the rise of industrial cities and urbanized areas."The lack of home rule for counties in Alabama has resulted in the proliferation of local legislation permitting counties to do things not authorized by the state constitution. Alabama's constitution has been amended more than 700 times, and almost one-third of the amendments are local in nature, applying to only one county or city. A significant part of each legislative session is spent on local legislation, taking away time and attention of legislators from issues of statewide importance."Alabama is an alcoholic beverage control state, meaning the state government holds a monopoly on the sale of alcohol. The Alabama Alcoholic Beverage Control Board controls the sale and distribution of alcoholic beverages in the state. A total of 25 of the 67 counties are "dry counties" which ban the sale of alcohol, and there are many dry municipalities in counties which permit alcohol sales.PoliticsDuring Reconstruction following the American Civil War, Alabama was occupied by federal troops of the Third Military District under General John Pope. 
In 1874, the political coalition of white Democrats known as the Redeemers took control of the state government from the Republicans, in part by suppressing the black vote through violence, fraud, and intimidation.After 1890, a coalition of White Democratic politicians passed laws to segregate and disenfranchise African American residents, a process completed in provisions of the 1901 constitution. Provisions which disenfranchised blacks resulted in excluding many poor Whites. By 1941 more Whites than Blacks had been disenfranchised: 600,000 to 520,000. The total effects were greater on the black community, as almost all its citizens were disfranchised and relegated to separate and unequal treatment under the law.From 1901 through the 1960s, the state did not redraw election districts as population grew and shifted within the state during urbanization and industrialization of certain areas. As counties were the basis of election districts, the result was a rural minority that dominated state politics through nearly three-quarters of the century, until a series of federal court cases required redistricting in 1972 to meet equal representation.Alabama state politics gained nationwide and international attention in the 1950s and 1960s during the civil rights movement, when whites bureaucratically, and at times violently, resisted protests for electoral and social reform. Governor George Wallace, the state's only four-term governor, was a controversial figure who vowed to maintain segregation. Only after passage of the federal Civil Rights Act of 1964 and Voting Rights Act of 1965 did African Americans regain the ability to exercise suffrage, among other civil rights. In many jurisdictions, they continued to be excluded from representation by at-large electoral systems, which allowed the majority of the population to dominate elections. 
Some changes at the county level have occurred following court challenges to establish single-member districts that enable a more diverse representation among county boards.In 2007, the Alabama Legislature passed, and Republican governor Bob Riley signed a resolution expressing "profound regret" over slavery and its lingering impact. In a symbolic ceremony, the bill was signed in the Alabama State Capitol, which housed Congress of the Confederate States of America.In 2010, Republicans won control of both houses of the legislature for the first time in 136 years. There are a total of 3,589,839 registered voters, with 3,518,285 active, and the others inactive in the state.ElectionsIn a 2020 study, Alabama was ranked as the 12th most difficult state for citizens to vote.State electionsWith the disfranchisement of Blacks in 1901, the state became part of the "Solid South", a system in which the Democratic Party operated as effectively the only viable political party in every Southern state. For nearly a hundred years local and state elections in Alabama were decided in the Democratic Party primary, with generally only token Republican challengers running in the General Election. Since the mid- to late 20th century, however, white conservatives started shifting to the Republican Party. In Alabama, majority-white districts are now expected to regularly elect Republican candidates to federal, state and local office.Members of the nine seats on the Supreme Court of Alabama and all ten seats on the state appellate courts are elected to office. Until 1994, no Republicans held any of the court seats. In that general election, the then-incumbent chief justice, Ernest C. Hornsby, refused to leave office after losing the election by approximately 3,000 votes to Republican Perry O. Hooper Sr. Hornsby sued Alabama and defiantly remained in office for nearly a year before finally giving up the seat after losing in court. 
The Democrats lost the last of the nineteen court seats in August 2011 with the resignation of the last Democrat on the bench.In the early 21st century, Republicans hold all seven of the statewide elected executive branch offices. Republicans hold six of the eight elected seats on the Alabama State Board of Education. In 2010, Republicans took large majorities of both chambers of the state legislature, giving them control of that body for the first time in 136 years. The last remaining statewide Democrat, who served on the Alabama Public Service Commission, was defeated in 2012.Only three Republican lieutenant governors have been elected since the end of Reconstruction, when Republicans generally represented Reconstruction government, including the newly emancipated freedmen who had gained the franchise. The three GOP lieutenant governors are Steve Windom (1999–2003), Kay Ivey (2011–2017), and Will Ainsworth (2019–present).Local electionsMany local offices (county commissioners, boards of education, tax assessors, tax collectors, etc.) in the state are still held by Democrats. Many rural counties have voters who are majority Democrats, resulting in local elections being decided in the Democratic primary. Similarly many metropolitan and suburban counties are majority-Republican and elections are effectively decided in the Republican Primary, although there are exceptions.Alabama's 67 county sheriffs are elected in partisan, at-large races, and Democrats still retain the narrow majority of those posts. The current split is 35 Democrats, 31 Republicans, and one Independent (Fayette). However, most of the Democratic sheriffs preside over rural and less populated counties. The majority of Republican sheriffs have been elected in the more urban/suburban and heavily populated counties. The state of Alabama has one female sheriff, in Morgan County, Alabama, and ten African-American sheriffs.Federal electionsThe state's two U.S. senators are Republican Richard C. 
Shelby and Republican Tommy Tuberville. Shelby was originally elected to the Senate as a Democrat in 1986 and re-elected in 1992, but switched parties immediately following the November 1994 general election.In the U.S. House of Representatives, the state is represented by seven members, six of whom are Republicans: (Bradley Byrne, Mike D. Rogers, Robert Aderholt, Morris J. Brooks, Martha Roby, and Gary Palmer) and one Democrat: Terri Sewell who represents the Black Belt as well as most of the predominantly black portions of Birmingham, Tuscaloosa and Montgomery.EducationPrimary and secondary educationPublic primary and secondary education in Alabama is under the purview of the Alabama State Board of Education as well as local oversight by 67 county school boards and 60 city boards of education. Together, 1,496 individual schools provide education for 744,637 elementary and secondary students.Public school funding is appropriated through the Alabama Legislature through the Education Trust Fund. In FY 2006–2007, Alabama appropriated $3,775,163,578 for primary and secondary education. That represented an increase of $444,736,387 over the previous fiscal year. In 2007, more than 82 percent of schools made adequate yearly progress (AYP) toward student proficiency under the National No Child Left Behind law, using measures determined by the state of Alabama.While Alabama's public education system has improved in recent decades, it lags behind in achievement compared to other states. According to U.S. Census data (2000), Alabama's high school graduation rate (75%) is the fourth lowest in the U.S. (after Kentucky, Louisiana and Mississippi). The largest educational gains were among people with some college education but without degrees.Generally prohibited in the West at large, school corporal punishment is not unusual in Alabama, with 27,260 public school students paddled at least one time, according to government data for the 2011–2012 school year. 
The rate of school corporal punishment in Alabama is surpassed by only Mississippi and Arkansas.Colleges and universitiesAlabama's programs of higher education include 14 four-year public universities, two-year community colleges, and 17 private, undergraduate and graduate universities. In the state are four medical schools (as of fall 2015) (University of Alabama School of Medicine, University of South Alabama and Alabama College of Osteopathic Medicine and The Edward Via College of Osteopathic Medicine—Auburn Campus), two veterinary colleges (Auburn University and Tuskegee University), a dental school (University of Alabama School of Dentistry), an optometry college (University of Alabama at Birmingham), two pharmacy schools (Auburn University and Samford University), and five law schools (University of Alabama School of Law, Birmingham School of Law, Cumberland School of Law, Miles Law School, and the Thomas Goode Jones School of Law). Public, post-secondary education in Alabama is overseen by the Alabama Commission on Higher Education and the Alabama Department of Postsecondary Education. Colleges and universities in Alabama offer degree programs from two-year associate degrees to a multitude of doctoral level programs.The largest single campus is the University of Alabama, located in Tuscaloosa, with 37,665 enrolled for fall 2016. Troy University was the largest institution in the state in 2010, with an enrollment of 29,689 students across four Alabama campuses (Troy, Dothan, Montgomery, and Phenix City), as well as sixty learning sites in seventeen other states and eleven other countries. 
The oldest institutions are the public University of North Alabama in Florence and the Catholic Church-affiliated Spring Hill College in Mobile, both founded in 1830.Accreditation of academic programs is through the Southern Association of Colleges and Schools (SACS) as well as other subject-focused national and international accreditation agencies such as the Association for Biblical Higher Education (ABHE), the Council on Occupational Education (COE), and the Accrediting Council for Independent Colleges and Schools (ACICS).According to the 2011 U.S. News & World Report, Alabama had three universities ranked in the top 100 Public Schools in America (University of Alabama at 31, Auburn University at 36, and University of Alabama at Birmingham at 73).According to the 2012 U.S. News & World Report, Alabama had four tier one universities (University of Alabama, Auburn University, University of Alabama at Birmingham and University of Alabama in Huntsville).MediaMajor newspapers include Birmingham News, Mobile Press-Register, and Montgomery Advertiser.Major television network affiliates in Alabama include: ABC WGWW 40.2 ABC, Anniston WBMA 58/WABM 68.2 ABC, Birmingham WDHN 18 ABC, Dothan WAAY 31 ABC, Huntsville WEAR 3 ABC Pensacola, Florida/Mobile WNCF 32 ABC, Montgomery WDBB 17.2 ABC, Tuscaloosa CBS WIAT 42 CBS, Birmingham WTVY 4 CBS, Dothan WHNT 19 CBS, Huntsville WKRG 5 CBS, Mobile WAKA 8 CBS, Selma/Montgomery Fox WBRC 6 FOX, Birmingham WZDX 54 FOX, Huntsville WALA 10 FOX, Mobile WCOV 20 FOX, Montgomery WDFX 34 FOX, Ozark/Dothan NBC WVTM 13 NBC, Birmingham WRGX 23 NBC, Dothan WAFF 48 NBC, Huntsville WPMI 15 NBC, Mobile WSFA 12 NBC, Montgomery PBS/Alabama Public Television WBIQ 10 PBS, Birmingham WIIQ 41 PBS, Demopolis WDIQ 2 PBS, Dozier WFIQ 36 PBS, Florence WHIQ 25 PBS, Huntsville WGIQ 43 PBS, Louisville WEIQ 42 PBS, Mobile WAIQ 26 PBS, Montgomery WCIQ 7 PBS, Mount Cheaha The CW WTTO 21, Homewood/Birmingham WTVY 4.3, Dothan WHDF 15, Florence/Huntsville WFNA 55, Gulf 
Shores/Mobile/Pensacola, FL WDBB 17, Tuscaloosa WBMM 22, Tuskegee/MontgomeryCultureLiteratureSportsProfessional sportsAlabama has several professional and semi-professional sports teams, including three minor league baseball teams.NotesThe Talladega Superspeedway motorsports complex hosts a series of NASCAR events. It has a seating capacity of 143,000 and is the thirteenth largest stadium in the world and sixth largest stadium in America. Also, the Barber Motorsports Park has hosted IndyCar Series and Rolex Sports Car Series races.The ATP Birmingham was a World Championship Tennis tournament held from 1973 to 1980.Alabama has hosted several professional golf tournaments, such as the 1984 and 1990 PGA Championship at Shoal Creek, the Barbasol Championship (PGA Tour), the Mobile LPGA Tournament of Champions, Airbus LPGA Classic, and Yokohama Tire LPGA Classic (LPGA Tour), and The Tradition (Champions Tour).College sportsCollege football is extremely popular in Alabama, particularly the University of Alabama Crimson Tide and Auburn University Tigers, rivals in the Southeastern Conference. Alabama averages over 100,000 fans per game and Auburn averages over 80,000—both numbers among the top twenty in the nation. Bryant–Denny Stadium is the home of the Alabama football team, and has a seating capacity of 101,821, and is the fifth largest stadium in America. Jordan-Hare Stadium is the home field of the Auburn football team and seats up to 87,451.Legion Field is home of the UAB Blazers football program and the Birmingham Bowl. It seats 71,594. Ladd–Peebles Stadium in Mobile is the home of the University of South Alabama football team, and serves as the home of the NCAA Senior Bowl, LendingTree Bowl, and Alabama-Mississippi All Star Classic; the stadium seats 40,646. 
In 2009, Bryant–Denny Stadium and Jordan-Hare Stadium became the homes of the Alabama High School Athletic Association state football championship games, after previously being held at Legion Field in Birmingham.TransportationAviationMajor airports with sustained operations in Alabama include Birmingham-Shuttlesworth International Airport (BHM), Huntsville International Airport (HSV), Dothan Regional Airport (DHN), Mobile Regional Airport (MOB), Montgomery Regional Airport (MGM), Northwest Alabama Regional Airport (MSL) and Northeast Alabama Regional Airport (GAD).RailFor rail transport, Amtrak schedules the Crescent, a daily passenger train, running from New York to New Orleans with station stops at Anniston, Birmingham, and Tuscaloosa.RoadsAlabama has six major interstate routes: Interstate 65 (I-65) travels north–south roughly through the middle of the state; I-20/I-59 travel from the central west Mississippi state line to Birmingham, where I-59 continues to the north-east corner of the state and I-20 continues east towards Atlanta; I-85 originates in Montgomery and travels east-northeast to the Georgia state line, providing a main thoroughfare to Atlanta; and I-10 traverses the southernmost portion of the state, traveling from west to east through Mobile. I-22 enters the state from Mississippi and connects Birmingham with Memphis, Tennessee. In addition, there are currently five auxiliary interstate routes in the state: I-165 in Mobile, I-359 in Tuscaloosa, I-459 around Birmingham, I-565 in Decatur and Huntsville, and I-759 in Gadsden. A sixth route, I-685, will be formed when I-85 is rerouted along a new southern bypass of Montgomery. A proposed northern bypass of Birmingham will be designated as I-422. Since a direct connection from I-22 to I-422 will not be possible, I-222 has been proposed, as well.Several U.S. Highways also pass through the state, such as U.S. 
Route 11 (US-11), US-29, US-31, US-43, US-45, US-72, US-78, US-80, US-82, US-84, US-90, US-98, US-231, US-278, US-280, US-331, US-411, and US-431.There are four toll roads in the state: Montgomery Expressway in Montgomery; Northport/Tuscaloosa Western Bypass in Tuscaloosa and Northport; Emerald Mountain Expressway in Wetumpka; and Beach Express in Orange Beach.PortsThe Port of Mobile, Alabama's only saltwater port, is a large seaport on the Gulf of Mexico with inland waterway access to the Midwest by way of the Tennessee–Tombigbee Waterway. The Port of Mobile was ranked 12th by tons of traffic in the United States during 2009. The newly expanded container terminal at the Port of Mobile was ranked as the 25th busiest for container traffic in the nation during 2011. The state's other ports are on rivers with access to the Gulf of Mexico.Water ports of Alabama, listed from north to south:See also Index of Alabama-related articles Outline of Alabama—organized list of topics about AlabamaNotesReferencesFurther reading Atkins, Leah Rawls, Wayne Flynt, William Warren Rogers, and David Ward. Alabama: The History of a Deep South State (1994). Flynt, Wayne. Alabama in the Twentieth Century (2004). Owen Thomas M. History of Alabama and Dictionary of Alabama Biography (4 vols, 1921). Jackson, Harvey H. Inside Alabama: A Personal History of My State (2004). Mohl, Raymond A. "Latinization in the Heart of Dixie: Hispanics in Late-twentieth-century Alabama" Alabama Review (2002, 55(4): 243–274).  Peirce, Neal R. The Deep South States of America: People, Politics, and Power in the Seven Deep South States (1974). Williams, Benjamin Buford. A Literary History of Alabama: The Nineteenth Century (1979). 
WPA Guide to Alabama (1939).External links   Alabama State Guide, from the Library of Congress Your Not So Ordinary Alabama Tourist Guide All About Alabama, at the Alabama Department of Archives and History Code of Alabama 1975 USGS real-time, geographic, and other scientific resources of Alabama  Alabama QuickFacts from the U.S. Census Bureau Alabama State Fact Sheet  1819 establishments in the United StatesSouthern United StatesStates and territories established in 1819States of the Confederate StatesStates of the Gulf Coast of the United StatesStates of the United StatesU.S. states with multiple time zonesContiguous United States
+In Greek mythology, Achilles or Achilleus (Ἀχιλλεύς) was a hero of the Trojan War, the greatest of all the Greek warriors, and is the central character of Homer's Iliad. He was the son of the Nereid Thetis and Peleus, king of Phthia.Achilles' most notable feat during the Trojan War was the slaying of the Trojan prince Hector outside the gates of Troy. Although the death of Achilles is not presented in the Iliad, other sources concur that he was killed near the end of the Trojan War by Paris, who shot him with an arrow. Later legends (beginning with Statius' unfinished epic Achilleid, written in the 1st century AD) state that Achilles was invulnerable in all of his body except for one heel, because when his mother Thetis dipped him in the river Styx as an infant, she held him by one of his heels. Alluding to these legends, the term "Achilles' heel" has come to mean a point of weakness, especially in someone or something with an otherwise strong constitution. The Achilles tendon is also named after him due to these legends.Etymology Linear B tablets attest to the personal name Achilleus in the forms a-ki-re-u and a-ki-re-we, the latter being the dative of the former. The name grew more popular, even becoming common soon after the seventh century BC and was also turned into the female form Ἀχιλλεία (Achilleía), attested in Attica in the fourth century BC (IG II² 1617) and, in the form Achillia, on a stele in Halicarnassus as the name of a female gladiator fighting an "Amazon".Achilles' name can be analyzed as a combination of ἄχος (áchos) "distress, pain, sorrow, grief" and λαός (laós) "people, soldiers, nation", resulting in a proto-form *Akhí-lāu̯os "he who has the people distressed" or "he whose people have distress". The grief or distress of the people is a theme raised numerous times in the Iliad (and frequently by Achilles himself). 
Achilles' role as the hero of grief or distress forms an ironic juxtaposition with the conventional view of him as the hero of   ("glory", usually in war). Furthermore, laós has been construed by Gregory Nagy, following Leonard Palmer, to mean "a corps of soldiers", a muster. With this derivation, the name obtains a double meaning in the poem: when the hero is functioning rightly, his men bring distress to the enemy, but when wrongly, his men get the grief of war. The poem is in part about the misdirection of anger on the part of leadership.Another etymology relates the name to a Proto-Indo-European compound *h₂eḱ-pṓds "sharp foot" which first gave an Illyrian *āk̂pediós, evolving through time into *ākhpdeós and then *akhiddeús. The shift from -dd- to -ll- is then ascribed to the passing of the name into Greek via a Pre-Greek source. The first root part *h₂eḱ- "sharp, pointed" also gave Greek ἀκή (akḗ "point, silence, healing"), ἀκμή (akmḗ "point, edge, zenith") and ὀξύς (oxús "sharp, pointed, keen, quick, clever"), whereas ἄχος stems from the root *h₂egʰ- "to be upset, afraid". The whole expression would be comparable to the Latin acupedius "swift of foot". Compare also the Latin word family of aciēs "sharp edge or point, battle line, battle, engagement", acus "needle, pin, bodkin", and acuō "to make pointed, sharpen, whet; to exercise; to arouse" (whence acute). Some topical epitheta of Achilles in the Iliad point to this "swift-footedness", namely ποδάρκης δῖος Ἀχιλλεὺς (podárkēs dĩos Achilleús "swift-footed divine Achilles") or, even more frequently, πόδας ὠκὺς Ἀχιλλεύς (pódas ōkús Achilleús "quick-footed Achilles").Some researchers deem the name a loan word, possibly from a Pre-Greek language. Achilles' descent from the Nereid Thetis and a similarity of his name with those of river deities such as Acheron and Achelous have led to speculations about his being an old water divinity (see below Worship). Robert S. P. 
Beekes has suggested a Pre-Greek origin of the name, based among other things on the coexistence of -λλ- and -λ- in epic language, which may account for a palatalized phoneme /ly/ in the original language.Birth and early years Achilles was the son of Thetis, a Nereid, and Peleus, the king of the Myrmidons. Zeus and Poseidon had been rivals for Thetis's hand in marriage until Prometheus, the fore-thinker, warned Zeus of a prophecy (originally uttered by Themis, goddess of divine law) that Thetis would bear a son greater than his father. For this reason, the two gods withdrew their pursuit, and had her wed Peleus.There is a tale which offers an alternative version of these events: In the Argonautica (4.760) Zeus' sister and wife Hera alludes to Thetis' chaste resistance to the advances of Zeus, pointing out that Thetis was so loyal to Hera's marriage bond that she coolly rejected the father of gods. Thetis, although a daughter of the sea-god Nereus, was also brought up by Hera, further explaining her resistance to the advances of Zeus. Zeus was furious and decreed that she would never marry an immortal.According to the Achilleid, written by Statius in the 1st century AD, and to non-surviving previous sources, when Achilles was born Thetis tried to make him immortal by dipping him in the river Styx; however, he was left vulnerable at the part of the body by which she held him: his left heel (see Achilles' heel, Achilles' tendon). It is not clear if this version of events was known earlier. In another version of this story, Thetis anointed the boy in ambrosia and put him on top of a fire in order to burn away the mortal parts of his body. She was interrupted by Peleus and abandoned both father and son in a rage.None of the sources before Statius make any reference to this general invulnerability. To the contrary, in the Iliad, Homer mentions Achilles being wounded: in Book 21 the Paeonian hero Asteropaeus, son of Pelagon, challenged Achilles by the river Scamander. 
He was ambidextrous, and cast a spear from each hand; one grazed Achilles' elbow, "drawing a spurt of blood".In the few fragmentary poems of the Epic Cycle which describe the hero's death (i.e. the Cypria, the Little Iliad by Lesches of Pyrrha, the Aithiopis and Iliou persis by Arctinus of Miletus), there is no trace of any reference to his general invulnerability or his famous weakness at the heel. In the later vase paintings presenting the death of Achilles, the arrow (or in many cases, arrows) hit his torso.Peleus entrusted Achilles to Chiron the Centaur, who lived on Mount Pelion, to be reared. Thetis foretold that her son's fate was either to gain glory and die young, or to live a long but uneventful life in obscurity. Achilles chose the former, and decided to take part in the Trojan War. According to Homer, Achilles grew up in Phthia with his companion Patroclus.According to Photius, the sixth book of the New History by Ptolemy Hephaestion reported that Thetis burned in a secret place the children she had by Peleus. When she had Achilles, Peleus noticed, tore him from the flames with only a burnt foot, and confided him to the centaur Chiron. 
Later Chiron exhumed the body of Damysus, who was the fastest of all the giants, removed the ankle, and incorporated it into Achilles' burnt foot.Other names Among the appellations under which Achilles is generally known are the following: Pyrisous, "saved from the fire", his first name, which seems to favour the tradition in which his mortal parts were burned by his mother Thetis Aeacides, from his grandfather Aeacus Aemonius, from Aemonia, a country which afterwards acquired the name of Thessaly Aspetos, "inimitable" or "vast", his name at Epirus Larissaeus, from Larissa (also called Cremaste), a town of Thessaly, which still bears the same name Ligyron, his original name Nereius, from his mother Thetis, one of the Nereids Pelides, from his father, Peleus Phthius, from his birthplace, Phthia Podarkes, "swift-footed", due to the wings of Arke being attached to his feet.Hidden on Skyros Some post-Homeric sources claim that in order to keep Achilles safe from the war, Thetis (or, in some versions, Peleus) hid the young man at the court of Lycomedes, king of Skyros.There, Achilles was disguised as a girl and lived among Lycomedes' daughters, perhaps under the name "Pyrrha" (the red-haired girl), Cercysera or Aissa ("swift"). With Lycomedes' daughter Deidamia, whom in the account of Statius he raped, Achilles there fathered two sons, Neoptolemus (also called Pyrrhus, after his father's possible alias) and Oneiros. According to this story, Odysseus learned from the prophet Calchas that the Achaeans would be unable to capture Troy without Achilles' aid. Odysseus went to Skyros in the guise of a peddler selling women's clothes and jewellery and placed a shield and spear among his goods. When Achilles instantly took up the spear, Odysseus saw through his disguise and convinced him to join the Greek campaign. In another version of the story, Odysseus arranged for a trumpet alarm to be sounded while he was with Lycomedes' women. 
While the women fled in panic, Achilles prepared to defend the court, thus giving his identity away.In the Trojan War According to the Iliad, Achilles arrived at Troy with 50 ships, each carrying 50 Myrmidons. He appointed five leaders (each leader commanding 500 Myrmidons): Menesthius, Eudorus, Peisander, Phoenix and Alcimedon.Telephus When the Greeks left for the Trojan War, they accidentally stopped in Mysia, ruled by King Telephus. In the resulting battle, Achilles gave Telephus a wound that would not heal; Telephus consulted an oracle, who stated that "he that wounded shall heal". Guided by the oracle, he arrived at Argos, where Achilles healed him in order that he might become their guide for the voyage to Troy.According to other reports in Euripides' lost play about Telephus, he went to Aulis pretending to be a beggar and asked Achilles to heal his wound. Achilles refused, claiming to have no medical knowledge. Alternatively, Telephus held Orestes for ransom, the ransom being Achilles' aid in healing the wound. Odysseus reasoned that the spear had inflicted the wound; therefore, the spear must be able to heal it. 
Pieces of the spear were scraped off onto the wound and Telephus was healed.Troilus According to the Cypria (the part of the Epic Cycle that tells the events of the Trojan War before Achilles' wrath), when the Achaeans desired to return home, they were restrained by Achilles, who afterwards attacked the cattle of Aeneas, sacked neighbouring cities (like Pedasus and Lyrnessus, where the Greeks capture the queen Briseis) and killed Tenes, a son of Apollo, as well as Priam's son Troilus in the sanctuary of Apollo Thymbraios; however, the romance between Troilus and Chryseis described in Geoffrey Chaucer's Troilus and Criseyde and in William Shakespeare's Troilus and Cressida is a medieval invention.In Dares Phrygius' Account of the Destruction of Troy, the Latin summary through which the story of Achilles was transmitted to medieval Europe, as well as in older accounts, Troilus was a young Trojan prince, the youngest of King Priam's and Hecuba's five legitimate sons (or according to other sources, another son of Apollo). Despite his youth, he was one of the main Trojan war leaders, a "horse fighter" or "chariot fighter" according to Homer. Prophecies linked Troilus' fate to that of Troy and so he was ambushed in an attempt to capture him. Yet Achilles, struck by the beauty of both Troilus and his sister Polyxena, and overcome with lust, directed his sexual attentions on the youth – who, refusing to yield, instead found himself decapitated upon an altar-omphalos of Apollo Thymbraios. Later versions of the story suggested Troilus was accidentally killed by Achilles in an over-ardent lovers' embrace. In this version of the myth, Achilles' death therefore came in retribution for this sacrilege. Ancient writers treated Troilus as the epitome of a dead child mourned by his parents. 
Had Troilus lived to adulthood, the First Vatican Mythographer claimed, Troy would have been invincible; however, the motif is older and found already in Plautus' Bacchides.In the Iliad Homer's Iliad is the most famous narrative of Achilles' deeds in the Trojan War. Achilles' wrath (μῆνις Ἀχιλλέως, mênis Achilléōs) is the central theme of the poem. The first two lines of the Iliad read:The Homeric epic only covers a few weeks of the decade-long war, and does not narrate Achilles' death. It begins with Achilles' withdrawal from battle after being dishonoured by Agamemnon, the commander of the Achaean forces. Agamemnon has taken a woman named Chryseis as his slave. Her father Chryses, a priest of Apollo, begs Agamemnon to return her to him. Agamemnon refuses, and Apollo sends a plague amongst the Greeks. The prophet Calchas correctly determines the source of the troubles but will not speak unless Achilles vows to protect him. Achilles does so, and Calchas declares that Chryseis must be returned to her father. Agamemnon consents, but then commands that Achilles' battle prize Briseis, the daughter of Briseus, be brought to him to replace Chryseis. Angry at the dishonour of having his plunder and glory taken away (and, as he says later, because he loves Briseis), with the urging of his mother Thetis, Achilles refuses to fight or lead his troops alongside the other Greek forces. At the same time, burning with rage over Agamemnon's theft, Achilles prays to Thetis to convince Zeus to help the Trojans gain ground in the war, so that he may regain his honour.As the battle turns against the Greeks, thanks to the influence of Zeus, Nestor declares that the Trojans are winning because Agamemnon has angered Achilles, and urges the king to appease the warrior. Agamemnon agrees and sends Odysseus and two other chieftains, Ajax and Phoenix. They promise that, if Achilles returns to battle, Agamemnon will return the captive Briseis and other gifts. 
Achilles rejects all Agamemnon offers him and simply urges the Greeks to sail home as he was planning to do.The Trojans, led by Hector, subsequently push the Greek army back toward the beaches and assault the Greek ships. With the Greek forces on the verge of absolute destruction, Patroclus leads the Myrmidons into battle, wearing Achilles' armour, though Achilles remains at his camp. Patroclus succeeds in pushing the Trojans back from the beaches, but is killed by Hector before he can lead a proper assault on the city of Troy.After receiving the news of the death of Patroclus from Antilochus, the son of Nestor, Achilles grieves over his beloved companion's death. His mother Thetis comes to comfort the distraught Achilles. She persuades Hephaestus to make new armour for him, in place of the armour that Patroclus had been wearing, which was taken by Hector. The new armour includes the Shield of Achilles, described in great detail in the poem.Enraged over the death of Patroclus, Achilles ends his refusal to fight and takes the field, killing many men in his rage but always seeking out Hector. Achilles even engages in battle with the river god Scamander, who has become angry that Achilles is choking his waters with all the men he has killed. The god tries to drown Achilles but is stopped by Hera and Hephaestus. Zeus himself takes note of Achilles' rage and sends the gods to restrain him so that he will not go on to sack Troy itself before the time allotted for its destruction, seeming to show that the unhindered rage of Achilles can defy fate itself. Finally, Achilles finds his prey. Achilles chases Hector around the wall of Troy three times before Athena, in the form of Hector's favorite and dearest brother, Deiphobus, persuades Hector to stop running and fight Achilles face to face. After Hector realizes the trick, he knows the battle is inevitable. Wanting to go down fighting, he charges at Achilles with his only weapon, his sword, but misses. 
Accepting his fate, Hector begs Achilles not to spare his life, but to treat his body with respect after killing him. Achilles tells Hector it is hopeless to expect that of him, declaring that "my rage, my fury would drive me now to hack your flesh away and eat you raw – such agonies you have caused me". Achilles then kills Hector and drags his corpse by its heels behind his chariot. After having a dream where Patroclus begs Achilles to hold his funeral, Achilles hosts a series of funeral games in honour of his companion.At the onset of his duel with Hector, Achilles is referred to as the brightest star in the sky, which comes on in the autumn, Orion's dog (Sirius); a sign of evil. During the cremation of Patroclus, he is compared to Hesperus, the evening/western star (Venus), while the burning of the funeral pyre lasts until Phosphorus, the morning/eastern star (also Venus) has set (descended).With the assistance of the god Hermes (Argeiphontes), Hector's father Priam goes to Achilles' tent to plead with Achilles for the return of Hector's body so that he can be buried. Achilles relents and promises a truce for the duration of the funeral, lasting 9 days with a burial on the 10th (in the tradition of Niobe's offspring). The poem ends with a description of Hector's funeral, with the doom of Troy and Achilles himself still to come.Later epic accounts: fighting Penthesilea and Memnon The Aethiopis (7th century BC) and a work named Posthomerica, composed by Quintus of Smyrna in the fourth century CE, relate further events from the Trojan War. When Penthesilea, queen of the Amazons and daughter of Ares, arrives in Troy, Priam hopes that she will defeat Achilles. After his temporary truce with Priam, Achilles fights and kills the warrior queen, only to grieve over her death later. At first, he was so distracted by her beauty, he did not fight as intensely as usual. 
Once he realized that his distraction was endangering his life, he refocused and killed her.Following the death of Patroclus, Nestor's son Antilochus becomes Achilles' closest companion. When Memnon, son of the Dawn Goddess Eos and king of Ethiopia, slays Antilochus, Achilles once more obtains revenge on the battlefield, killing Memnon. Consequently, Eos will not let the sun rise until Zeus persuades her. The fight between Achilles and Memnon over Antilochus echoes that of Achilles and Hector over Patroclus, except that Memnon (unlike Hector) was also the son of a goddess.Many Homeric scholars argued that this episode inspired many details in the Iliad's description of the death of Patroclus and Achilles' reaction to it. The episode then formed the basis of the cyclic epic Aethiopis, which was composed after the Iliad, possibly in the 7th century BC. The Aethiopis is now lost, except for scattered fragments quoted by later authors.Achilles and Patroclus The exact nature of Achilles' relationship with Patroclus has been a subject of dispute in both the classical period and modern times. In the Iliad, it appears to be the model of a deep and loyal friendship. Homer does not suggest that Achilles and his close friend Patroclus had sexual relations. Although there is no direct evidence in the text of the Iliad that Achilles and Patroclus were lovers, this theory was expressed by some later authors. Commentators from classical antiquity to the present have often interpreted the relationship through the lens of their own cultures. In 5th-century BCE Athens, the intense bond was often viewed in light of the Greek custom of paiderasteia. In Plato's Symposium, the participants in a dialogue about love assume that Achilles and Patroclus were a couple; Phaedrus argues that Achilles was the younger and more beautiful one so he was the beloved and Patroclus was the lover. 
However, ancient Greek had no words to distinguish heterosexual and homosexual, and it was assumed that a man could both desire handsome young men and have sex with women. Many pairs of men throughout history have been compared to Achilles and Patroclus to imply a homosexual relationship.Death The death of Achilles, even if considered solely as it occurred in the oldest sources, is a complex one, with many different versions. In the oldest version, the Iliad, and as predicted by Hector with his dying breath, the hero's death was brought about by Paris with an arrow (to the heel according to Statius). In some versions, the god Apollo guided Paris' arrow. Some retellings also state that Achilles was scaling the gates of Troy and was hit with a poisoned arrow. All of these versions deny Paris any sort of valour, owing to the common conception that Paris was a coward and not the man his brother Hector was, and Achilles remained undefeated on the battlefield.After death, Achilles' bones were mingled with those of Patroclus, and funeral games were held. He was represented in the Aethiopis as living after his death in the island of Leuke at the mouth of the river Danube.Another version of Achilles' death is that he fell deeply in love with one of the Trojan princesses, Polyxena. Achilles asks Priam for Polyxena's hand in marriage. Priam is willing because it would mean the end of the war and an alliance with the world's greatest warrior. But while Priam is overseeing the private marriage of Polyxena and Achilles, Paris, who would have to give up Helen if Achilles married his sister, hides in the bushes and shoots Achilles with a divine arrow, killing him.In the Odyssey, Agamemnon informs Achilles of his pompous burial and the erection of his mound at the Hellespont while they are receiving the dead suitors in Hades. He claims they built a massive burial mound on the beach of Ilion that could be seen by anyone approaching from the ocean. 
Achilles was cremated and his ashes buried in the same urn as those of Patroclus. Paris was later killed by Philoctetes using the enormous bow of Heracles.In Book 11 of Homer's Odyssey, Odysseus sails to the underworld and converses with the shades. One of these is Achilles, who when greeted as "blessed in life, blessed in death", responds that he would rather be a slave to the worst of masters than be king of all the dead. But Achilles then asks Odysseus of his son's exploits in the Trojan war, and when Odysseus tells of Neoptolemus' heroic actions, Achilles is filled with satisfaction. This leaves the reader with an ambiguous understanding of how Achilles felt about the heroic life.According to some accounts, he had married Medea in life, so that after both their deaths they were united in the Elysian Fields of Hades – as Hera promised Thetis in Apollonius' Argonautica (3rd century BC).Fate of Achilles' armour Achilles' armour was the object of a feud between Odysseus and Telamonian Ajax (Ajax the greater). They competed for it by giving speeches on why they were the bravest after Achilles to their Trojan prisoners, who, after considering both men's presentations, decided Odysseus was more deserving of the armour. Furious, Ajax cursed Odysseus, which earned him the ire of Athena, who temporarily made Ajax so mad with grief and anguish that he began killing sheep, thinking them his comrades. After a while, when Athena lifted his madness and Ajax realized that he had actually been killing sheep, he was so ashamed that he committed suicide. Odysseus eventually gave the armour to Neoptolemus, the son of Achilles. 
When Odysseus encounters the shade of Ajax much later in the House of Hades (Odyssey 11.543–566), Ajax is still so angry about the outcome of the competition that he refuses to speak to Odysseus.A relic claimed to be Achilles' bronze-headed spear was preserved for centuries in the temple of Athena on the acropolis of Phaselis, Lycia, a port on the Pamphylian Gulf. The city was visited in 333 BCE by Alexander the Great, who envisioned himself as the new Achilles and carried the Iliad with him, but his court biographers do not mention the spear; however, it was shown in the time of Pausanias in the 2nd century CE.Achilles, Ajax and a game of petteia Numerous paintings on pottery have suggested a tale not mentioned in the literary traditions. At some point in the war, Achilles and Ajax were playing a board game (petteia). They were absorbed in the game and oblivious to the surrounding battle. The Trojans attacked and reached the heroes, who were saved only by an intervention of Athena.Worship and heroic cult The tomb of Achilles, extant throughout antiquity in Troad, was venerated by Thessalians, but also by Persian expeditionary forces, as well as by Alexander the Great and the Roman emperor Caracalla. Achilles' cult was also to be found at other places, e. g. on the island of Astypalaea in the Sporades, in Sparta which had a sanctuary, in Elis and in Achilles' homeland Thessaly, as well as in the Magna Graecia cities of Tarentum, Locri and Croton, accounting for an almost Panhellenic cult to the hero.The cult of Achilles is illustrated in the 500 BCE Polyxena sarcophagus, which depicts the sacrifice of Polyxena near the tumulus of Achilles. Strabo (13.1.32) also suggested that such a cult of Achilles existed in Troad:The spread and intensity of the hero's veneration among the Greeks that had settled on the northern coast of the Pontus Euxinus, today's Black Sea, appears to have been remarkable. 
An archaic cult is attested for the Milesian colony of Olbia as well as for an island in the middle of the Black Sea, today identified with Snake Island (Ukrainian Зміїний, Zmiinyi, near Kiliya, Ukraine). Early dedicatory inscriptions from the Greek colonies on the Black Sea (graffiti and inscribed clay disks, these possibly being votive offerings, from Olbia, the area of Berezan Island and the Tauric Chersonese) attest the existence of a heroic cult of Achilles from the sixth century BC onwards. The cult was still thriving in the third century CE, when dedicatory stelae from Olbia refer to an Achilles Pontárchēs (Ποντάρχης, roughly "lord of the Sea," or "of the Pontus Euxinus"), who was invoked as a protector of the city of Olbia, venerated on par with Olympian gods such as the local Apollo Prostates, Hermes Agoraeus, or Poseidon.Pliny the Elder (23–79 AD) in his Natural History mentions a "port of the Achæi" and an "island of Achilles", famous for the tomb of that "man" (), situated somewhat nearby Olbia and the Dnieper-Bug Estuary; furthermore, at 125 Roman miles from this island, he places a peninsula "which stretches forth in the shape of a sword" obliquely, called Dromos Achilleos (Ἀχιλλέως δρόμος, Achilléōs drómos "the Race-course of Achilles") and considered the place of the hero's exercise or of games instituted by him. This last feature of Pliny's account is considered to be the iconic spit, called today Tendra (or Kosa Tendra and Kosa Djarilgatch), situated between the mouth of the Dnieper and Karkinit Bay, but which is hardly 125 Roman miles (c. 185 km) away from the Dnieper-Bug estuary, as Pliny states. (To the "Race-course" he gives a length of 80 miles, c. 120 km, whereas the spit measures c. 70 km today.)In the following chapter of his book, Pliny refers to the same island as Achillea and introduces two further names for it: Leuce or Macaron (from Greek [νῆσος] μακαρῶν "island of the blest"). 
The "present day" measures, he gives at this point, seem to account for an identification of Achillea or Leuce with today's Snake Island. Pliny's contemporary Pomponius Mela (c. 43 AD) tells that Achilles was buried on an island named Achillea, situated between the Borysthenes and the Ister, adding to the geographical confusion. Ruins of a square temple, measuring 30 meters to a side, possibly that dedicated to Achilles, were discovered by Captain Kritzikly () in 1823 on Snake Island. A second exploration in 1840 showed that the construction of a lighthouse had destroyed all traces of this temple. A fifth century BC black-glazed lekythos inscription, found on the island in 1840, reads: "Glaukos, son of Poseidon, dedicated me to Achilles, lord of Leuke." In another inscription from the fifth or fourth century BC, a statue is dedicated to Achilles, lord of Leuke, by a citizen of Olbia, while in a further dedication, the city of Olbia confirms its continuous maintenance of the island's cult, again suggesting its quality as a place of a supra-regional hero veneration.The heroic cult dedicated to Achilles on Leuce seems to go back to an account from the lost epic Aethiopis according to which, after his untimely death, Thetis had snatched her son from the funeral pyre and removed him to a mythical  (Leúkē Nêsos "White Island"). Already in the fifth century BC, Pindar had mentioned a cult of Achilles on a "bright island" (φαεννά νᾶσος, phaenná nâsos) of the Black Sea, while in another of his works, Pindar would retell the story of the immortalized Achilles living on a geographically indefinite Island of the Blest together with other heroes such as his father Peleus and Cadmus. 
Well known is the connection of these mythological Fortunate Isles (μακαρῶν νῆσοι, makárôn nêsoi) or the Homeric Elysium with the stream Oceanus which according to Greek mythology surrounds the inhabited world, which should have accounted for the identification of the northern strands of the Euxine with it. Guy Hedreen has found further evidence for this connection of Achilles with the northern margin of the inhabited world in a poem by Alcaeus, speaking of "Achilles lord of Scythia" and the opposition of North and South, as evoked by Achilles' fight against the Aethiopian prince Memnon, who in his turn would be removed to his homeland by his mother Eos after his death.The Periplus of the Euxine Sea (c. 130 AD) gives the following details:The Greek geographer Dionysius Periegetes, who likely lived during the first century CE, wrote that the island was called Leuce "because the wild animals which live there are white. It is said that there, in Leuce island, reside the souls of Achilles and other heroes, and that they wander through the uninhabited valleys of this island; this is how Jove rewarded the men who had distinguished themselves through their virtues, because through virtue they had acquired everlasting honour". Similarly, others relate the island's name to its white cliffs, snakes or birds dwelling there. Pausanias has been told that the island is "covered with forests and full of animals, some wild, some tame. In this island there is also Achilles' temple and his statue". Leuce had also a reputation as a place of healing. Pausanias reports that the Delphic Pythia sent a lord of Croton to be cured of a chest wound. Ammianus Marcellinus attributes the healing to waters (aquae) on the island.A number of important commercial port cities of the Greek waters were dedicated to Achilles. 
Herodotus, Pliny the Elder and Strabo reported on the existence of a town Achílleion (Ἀχίλλειον), built by settlers from Mytilene in the sixth century BC, close to the hero's presumed burial mound in the Troad. Later attestations point to an Achílleion in Messenia (according to Stephanus Byzantinus) and an Achílleios (Ἀχίλλειος) in Laconia. Nicolae Densuşianu recognized a connection to Achilles in the names of Aquileia and of the northern arm of the Danube delta, called Chilia (presumably from an older Achileii), though his conclusion, that Leuce had sovereign rights over the Black Sea, evokes modern rather than archaic sea-law.The kings of Epirus claimed to be descended from Achilles through his son, Neoptolemus. Alexander the Great, son of the Epirote princess Olympias, could therefore also claim this descent, and in many ways strove to be like his great ancestor. He is said to have visited the tomb of Achilles at Achilleion while passing Troy. In AD 216 the Roman Emperor Caracalla, while on his way to war against Parthia, emulated Alexander by holding games around Achilles' tumulus.Reception during antiquityIn Greek tragedy The Greek tragedian Aeschylus wrote a trilogy of plays about Achilles, given the title Achilleis by modern scholars. The tragedies relate the deeds of Achilles during the Trojan War, including his defeat of Hector and eventual death when an arrow shot by Paris and guided by Apollo punctures his heel. Extant fragments of the Achilleis and other Aeschylean fragments have been assembled to produce a workable modern play. The first part of the Achilleis trilogy, The Myrmidons, focused on the relationship between Achilles and chorus, who represent the Achaean army and try to convince Achilles to give up his quarrel with Agamemnon; only a few lines survive today. 
In Plato's Symposium, Phaedrus points out that Aeschylus portrayed Achilles as the lover and Patroclus as the beloved; Phaedrus argues that this is incorrect because Achilles, being the younger and more beautiful of the two, was the beloved, who loved his lover so much that he chose to die to avenge him.The tragedian Sophocles also wrote The Lovers of Achilles, a play with Achilles as the main character. Only a few fragments survive.Towards the end of the 5th century BCE, a more negative view of Achilles emerges in Greek drama; Euripides refers to Achilles in a bitter or ironic tone in Hecuba, Electra, and Iphigenia in Aulis.In Greek philosophyZenoThe philosopher Zeno of Elea centred one of his paradoxes on an imaginary footrace between "swift-footed" Achilles and a tortoise, by which he attempted to show that Achilles could not catch up to a tortoise with a head start, and therefore that motion and change were impossible. As a student of the monist Parmenides and a member of the Eleatic school, Zeno believed time and motion to be illusions.PlatoIn Hippias Minor, a dialogue attributed to Plato, an arrogant man named Hippias argues with Socrates. The two get into a discussion about lying. They decide that a person who is intentionally false must be "better" than a person who is unintentionally false, on the basis that someone who lies intentionally must understand the subject about which they are lying. Socrates uses various analogies, discussing athletics and the sciences to prove his point. The two also reference Homer extensively. Socrates and Hippias agree that Odysseus, who concocted a number of lies throughout the Odyssey and other stories in the Trojan War Cycle, was false intentionally. Achilles, like Odysseus, told numerous falsehoods. Hippias believes that Achilles was a generally honest man, while Socrates believes that Achilles lied for his own benefit. The two argue over whether it is better to lie on purpose or by accident. 
Socrates eventually abandons Homeric arguments and makes sports analogies to drive home the point: someone who does wrong on purpose is a better person than someone who does wrong unintentionally.In Roman and medieval literature The Romans, who traditionally traced their lineage to Troy, took a highly negative view of Achilles. Virgil refers to Achilles as a savage and a merciless butcher of men, while Horace portrays Achilles ruthlessly slaying women and children. Other writers, such as Catullus, Propertius, and Ovid, represent a second strand of disparagement, with an emphasis on Achilles' erotic career. This strand continues in Latin accounts of the Trojan War by writers such as Dictys Cretensis and Dares Phrygius and in Benoît de Sainte-Maure's Roman de Troie and Guido delle Colonne's Historia destructionis Troiae, which remained the most widely read and retold versions of the Matter of Troy until the 17th century.Achilles was described by the Byzantine chronicler Leo the Deacon, not as Hellene, but as Scythian, while according to the Byzantine author John Malalas, his army was made up of a tribe previously known as Myrmidons and later as Bulgars.In modern literature and artsLiterature Achilles appears in Dante's Inferno (composed 1308–1320). He is seen in Hell's second circle, that of lust. Achilles is portrayed as a former hero who has become lazy and devoted to the love of Patroclus, in William Shakespeare's Troilus and Cressida (1602). The French dramatist Thomas Corneille wrote a tragedy La Mort d'Achille (1673). Achilles is the subject of the poem Achilleis (1799), a fragment by Johann Wolfgang von Goethe. In 1899, the Polish playwright, painter and poet Stanisław Wyspiański published a national drama, based on Polish history, named Achilles. In 1921, Edward Shanks published The Island of Youth and Other Poems, concerned among others with Achilles. The 1983 novel Kassandra by Christa Wolf also treats the death of Achilles. 
Akhilles is killed by a poisoned Kentaur arrow shot by Kassandra in Marion Zimmer Bradley's novel The Firebrand (1987). Achilles is one of various 'narrators' in Colleen McCullough's novel The Song of Troy (1998). The Death of Achilles (Смерть Ахиллеса, 1998) is an historical detective novel by Russian writer Boris Akunin that alludes to various figures and motifs from the Iliad. The character Achilles in Ender's Shadow (1999), by Orson Scott Card, shares his namesake's cunning mind and ruthless attitude. Achilles is one of the main characters in Dan Simmons's novels Ilium (2003) and Olympos (2005). Achilles is a major supporting character in David Gemmell's Troy series of books (2005–2007). Achilles is the main character in David Malouf's novel Ransom (2009). The ghost of Achilles appears in Rick Riordan's The Last Olympian (2009). He warns Percy Jackson about the Curse of Achilles and its side effects. Achilles is a main character in Terence Hawkins' 2009 novel The Rage of Achilles. Achilles is a major character in Madeline Miller's debut novel, The Song of Achilles (2011), which won the 2012 Orange Prize for Fiction. The novel explores the relationship between Patroclus and Achilles from boyhood to the fateful events of the Iliad. Achilles appears in the light novel series Fate/Apocrypha (2012–2014) as the Rider of Red. Achilles is a main character in Pat Barker's 2018 novel The Silence of the Girls, much of which is narrated by his slave Briseis.Visual arts  Achilles with the Daughters of Lycomedes is a subject treated in paintings by Anthony van Dyck (before 1618; Museo del Prado, Madrid) and Nicolas Poussin (c. 1652; Museum of Fine Arts, Boston) among others. 
Peter Paul Rubens has authored a series of works on the life of Achilles, comprising the titles: Thetis dipping the infant Achilles into the river Styx, Achilles educated by the centaur Chiron, Achilles recognized among the daughters of Lycomedes, The wrath of Achilles, The death of Hector, Thetis receiving the arms of Achilles from Vulcanus, The death of Achilles (Museum Boijmans Van Beuningen, Rotterdam), and Briseis restored to Achilles (Detroit Institute of Arts; all c. 1630–1635) Pieter van Lint, "Achilles Discovered among the Daughters of Lycomedes", 1645, at the Israel Museum, Jerusalem Dying Achilles is a sculpture created by Christophe Veyrier (c. 1683; Victoria and Albert Museum, London). The Rage of Achilles is a fresco by Giovanni Battista Tiepolo (1757, Villa Valmarana Ai Nani, Vicenza). Eugène Delacroix painted a version of The Education of Achilles for the ceiling of the Paris Palais Bourbon (1833–1847), one of the seats of the French Parliament.  created a statue group Achilles and Penthesilea (1895; Vienna). Achilleus (1908) is a lithography by Max Slevogt.Music Achilles has been frequently the subject of operas, ballets and related genres. Operas titled Deidamia were composed by Francesco Cavalli (1644) and George Frideric Handel (1739). Achille et Polyxène (Paris 1687) is an opera begun by Jean-Baptiste Lully and finished by Pascal Collasse. Achille et Déidamie (Paris 1735) is an opera composed by André Campra. Achilles (London 1733) is a ballad opera, written by John Gay, parodied by Thomas Arne as Achilles in petticoats in 1773. Achille in Sciro is a libretto by Metastasio, composed by Domenico Sarro for the inauguration of the Teatro di San Carlo (Naples, 4 November 1737). An even earlier composition is from Antonio Caldara (Vienna 1736). 
Later operas on the same libretto were composed by Leonardo Leo (Turin 1739), Niccolò Jommelli (Vienna 1749 and Rome 1772), Giuseppe Sarti (Copenhagen 1759 and Florence 1779), Johann Adolph Hasse (Naples 1759), Giovanni Paisiello (St. Petersburg 1772), Giuseppe Gazzaniga (Palermo 1781) and many others. It has also been set to music as Il Trionfo della gloria. Achille (Vienna 1801) is an opera by Ferdinando Paër on a libretto by Giovanni de Gamerra. Achille à Scyros (Paris 1804) is a ballet by Pierre Gardel, composed by Luigi Cherubini. Achilles, oder Das zerstörte Troja ("Achilles, or Troy Destroyed", Bonn 1885) is an oratorio by the German composer Max Bruch. Achilles auf Skyros (Stuttgart 1926) is a ballet by the Austrian-British composer and musicologist Egon Wellesz. Achilles' Wrath is a concert piece by Sean O'Loughlin. Achilles Last Stand is a track on the 1976 Led Zeppelin album Presence. Achilles, Agony and Ecstasy in Eight Parts is the first song on the 1992 Manowar album The Triumph of Steel. Achilles Come Down is a song on the 2017 Gang of Youths album Go Farther in Lightness.Film and televisionIn films Achilles has been portrayed in the following films and television series: The 1924 film Helena by Carlo Aldini The 1954 film Ulysses by Piero Lulli The 1956 film Helen of Troy by Stanley Baker The 1961 film The Trojan Horse by Arturo Dominici The 1962 film The Fury of Achilles by Gordon Mitchell The 1997 television miniseries The Odyssey by Richard Trewett The 2003 television miniseries Helen of Troy by Joe Montana The 2004 film Troy by Brad Pitt The 2018 TV series Troy: Fall of a City by David GyasiArchitecture In 1890, Elisabeth of Bavaria, Empress of Austria, had a summer palace built in Corfu. The building is named the Achilleion, after Achilles. Its paintings and statuary depict scenes from the Trojan War, with particular focus on Achilles. 
The Wellington Monument is a statue representing Achilles erected as a memorial to Arthur Wellesley, the first duke of Wellington, and his victories in the Peninsular War and the latter stages of the Napoleonic Wars.Namesakes  The name of Achilles has been used for at least nine Royal Navy warships since 1744 – both as  and with the French spelling . A 60-gun ship of that name served at the Battle of Belleisle in 1761 while a 74-gun ship served at the Battle of Trafalgar. Other battle honours include Walcheren 1809. An armored cruiser of that name served in the Royal Navy during the First World War.  was a  which served with the Royal New Zealand Navy in World War II. It became famous for its part in the Battle of the River Plate, alongside  and . In addition to earning the battle honour 'River Plate', HMNZS Achilles also served at Guadalcanal 1942–1943 and Okinawa in 1945. After returning to the Royal Navy, the ship was sold to the Indian Navy in 1948, but when she was scrapped parts of the ship were saved and preserved in New Zealand. A species of lizard, Anolis achilles, which has widened heel plates, is named for Achilles.GalleryReferencesFurther reading  Ileana Chirassi Colombo (1977), "Heroes Achilleus – Theos Apollon." In Il Mito Greco, edd. Bruno Gentili and Giuseppe Paione. Rome: Edizione dell'Ateneo e Bizzarri. Anthony Edwards (1985a), "Achilles in the Underworld: Iliad, Odyssey, and Æthiopis". Greek, Roman, and Byzantine Studies. 26: pp. 215–227. Anthony Edwards (1985b), "Achilles in the Odyssey: Ideologies of Heroism in the Homeric Epic". Beiträge zur klassischen Philologie. 171.  Graves, Robert, The Greek Myths, Harmondsworth, London, England, Penguin Books, 1960. Graves, Robert, The Greek Myths: The Complete and Definitive Edition. Penguin Books Limited. 2017.    Hélène Monsacré (1984), Les larmes d'Achille. Le héros, la femme et la souffrance dans la poésie d'Homère, Paris: Albin Michel. 
Gregory Nagy (1984), The Name of Achilles: Questions of Etymology and 'Folk Etymology', Illinois Classical Studies. 19. Gregory Nagy (1999), The Best of the Achaeans: Concepts of the Hero in Archaic Greek Poetry. Johns Hopkins University Press (revised edition, online).  Dale S. Sinos (1991), The Entry of Achilles into Greek Epic, PhD thesis, Johns Hopkins University. Ann Arbor, Michigan: University Microfilms International. Jonathan S. Burgess (2009), The Death and Afterlife of Achilles. Baltimore: Johns Hopkins University Press. Abrantes, M.C. (2016), Themes of the Trojan Cycle: Contribution to the study of the greek mythological tradition (Coimbra).External links  Trojan War Resources Gallery of the Ancient Art: Achilles  Poem by Florence Earle CoatesGreek mythological heroesKings of the MyrmidonsAchaean LeadersThessalians in the Trojan WarMetamorphoses charactersMythological rapistsDemigods in classical mythologyLGBT themes in Greek mythology Deeds of ApolloMedea
+Abraham Lincoln (; February 12, 1809 – April 15, 1865) was an American lawyer and statesman who served as the 16th president of the United States from 1861 until his assassination in 1865. Lincoln led the nation through the American Civil War and succeeded in preserving the Union, abolishing slavery, bolstering the federal government, and modernizing the U.S. economy.Lincoln was born into poverty in a log cabin in Kentucky and was raised on the frontier primarily in Indiana. He was self-educated and became a lawyer, Whig Party leader, Illinois state legislator, and U.S. Congressman from Illinois. In 1849, he returned to his law practice but became vexed by the opening of additional lands to slavery as a result of the Kansas–Nebraska Act. He reentered politics in 1854, becoming a leader in the new Republican Party, and he reached a national audience in the 1858 debates against Stephen Douglas. Lincoln ran for President in 1860, sweeping the North in victory. Pro-slavery elements in the South equated his success with the North's rejection of their right to practice slavery, and southern states began seceding from the Union. To secure its independence, the new Confederate States fired on Fort Sumter, a U.S. fort in the South, and Lincoln called up forces to suppress the rebellion and restore the Union.Lincoln, a moderate Republican, had to navigate a contentious array of factions with friends and opponents from both the Democratic and Republican parties. His allies, the War Democrats and the Radical Republicans, demanded harsh treatment of the Southern Confederates. Anti-war Democrats (called "Copperheads") despised Lincoln, and irreconcilable pro-Confederate elements plotted his assassination. He managed the factions by exploiting their mutual enmity, carefully distributing political patronage, and by appealing to the American people. His Gettysburg Address appealed to nationalistic, republican, egalitarian, libertarian, and democratic sentiments. 
Lincoln scrutinized the strategy and tactics in the war effort, including the selection of generals and the naval blockade of the South's trade. He suspended habeas corpus in Maryland, and he averted British intervention by defusing the Trent Affair. He engineered the end to slavery with his Emancipation Proclamation, including his order that the Army and Navy liberate, protect, and recruit former slaves. He also encouraged border states to outlaw slavery, and promoted the Thirteenth Amendment to the United States Constitution, which outlawed slavery across the country.Lincoln managed his own successful re-election campaign. He sought to heal the war-torn nation through reconciliation. On April 14, 1865, just days after the war's end at Appomattox, he was attending a play at Ford's Theatre in Washington, D.C., with his wife Mary when he was fatally shot by Confederate sympathizer John Wilkes Booth. Lincoln is remembered as a martyr and hero of the United States and is often ranked as the greatest president in American history.Family and childhoodEarly lifeAbraham Lincoln was born on February 12, 1809, the second child of Thomas Lincoln and Nancy Hanks Lincoln, in a log cabin on Sinking Spring Farm near Hodgenville, Kentucky. He was a descendant of Samuel Lincoln, an Englishman who migrated from Hingham, Norfolk, to its namesake, Hingham, Massachusetts, in 1638. The family then migrated west, passing through New Jersey, Pennsylvania, and Virginia. Lincoln's paternal grandparents, his namesake Captain Abraham Lincoln and wife Bathsheba (née Herring) moved the family from Virginia to Jefferson County, Kentucky. The captain was killed in an Indian raid in 1786. His children, including eight-year-old Thomas, Abraham's father, witnessed the attack. 
Thomas then worked at odd jobs in Kentucky and Tennessee before the family settled in Hardin County, Kentucky, in the early 1800s. The heritage of Lincoln's mother Nancy remains unclear, but it is widely assumed that she was the daughter of Lucy Hanks. Thomas and Nancy married on June 12, 1806, in Washington County, and moved to Elizabethtown, Kentucky. They had three children: Sarah, Abraham, and Thomas, who died as an infant. Thomas Lincoln bought or leased farms in Kentucky before losing all but  of his land in court disputes over property titles. In 1816, the family moved to Indiana where the land surveys and titles were more reliable. Indiana was a "free" (non-slaveholding) territory, and they settled in an "unbroken forest" in Hurricane Township, Perry County, Indiana. In 1860, Lincoln noted that the family's move to Indiana was "partly on account of slavery", but mainly due to land title difficulties. In Kentucky and Indiana, Thomas worked as a farmer, cabinetmaker, and carpenter. At various times, he owned farms, livestock, and town lots, paid taxes, sat on juries, appraised estates, and served on county patrols. Thomas and Nancy were members of a Separate Baptists church, which forbade alcohol, dancing, and slavery. Overcoming financial challenges, Thomas in 1827 obtained clear title to  in Indiana, an area which became the Little Pigeon Creek Community.Mother's deathOn October 5, 1818, Nancy Lincoln succumbed to milk sickness, leaving 11-year-old Sarah in charge of a household including her father, 9-year-old Abraham, and Nancy's 19-year-old orphan cousin, Dennis Hanks. Ten years later, on January 20, 1828, Sarah died while giving birth to a stillborn son, devastating Lincoln. On December 2, 1819, Thomas married Sarah Bush Johnston, a widow from Elizabethtown, Kentucky, with three children of her own. Abraham became close to his stepmother and called her "Mother". Lincoln disliked the hard labor associated with farm life. 
His family even said he was lazy, for all his "reading, scribbling, writing, ciphering, writing Poetry, etc.". His stepmother acknowledged he did not enjoy "physical labor", but loved to read.Education and move to IllinoisLincoln was largely self-educated. His formal schooling was from itinerant teachers. It included two short stints in Kentucky, where he learned to read but probably not to write, at age seven, and in Indiana, where he went to school sporadically due to farm chores, for a total of less than 12 months in aggregate by the age of 15. He persisted as an avid reader and retained a lifelong interest in learning. Family, neighbors, and schoolmates recalled that his reading included the King James Bible, Aesop's Fables, John Bunyan's The Pilgrim's Progress, Daniel Defoe's Robinson Crusoe, and The Autobiography of Benjamin Franklin.As a teen, Lincoln took responsibility for chores and customarily gave his father all earnings from work outside the home until he was 21. Lincoln was tall, strong, and athletic, and became adept at using an ax. He was an active wrestler during his youth and trained in the rough catch-as-catch-can style (also known as catch wrestling). He became county wrestling champion at the age of 21. He gained a reputation for strength and audacity after winning a wrestling match with the renowned leader of ruffians known as "the Clary's Grove Boys".In March 1830, fearing another milk sickness outbreak, several members of the extended Lincoln family, including Abraham, moved west to Illinois, a free state, and settled in Macon County. Abraham then became increasingly distant from Thomas, in part due to his father's lack of education. In 1831, as Thomas and other family prepared to move to a new homestead in Coles County, Illinois, Abraham struck out on his own. He made his home in New Salem, Illinois, for six years. 
Lincoln and some friends took goods by flatboat to New Orleans, Louisiana, where he was first exposed to slavery.In 1865, Lincoln was asked how he came to acquire his rhetorical skills. He answered that in the practice of law he frequently came across the word "demonstrate" but had insufficient understanding of the term. So, he left Springfield for his father's home to study until he "could give any proposition in the six books of Euclid [here, referencing Euclid's Elements] at sight."Marriage and childrenLincoln's first romantic interest was Ann Rutledge, whom he met when he moved to New Salem. By 1835, they were in a relationship but not formally engaged. She died on August 25, 1835, most likely of typhoid fever. In the early 1830s, he met Mary Owens from Kentucky.Late in 1836, Lincoln agreed to a match with Owens if she returned to New Salem. Owens arrived that November and he courted her for a time; however, they both had second thoughts. On August 16, 1837, he wrote Owens a letter saying he would not blame her if she ended the relationship, and she never replied.In 1839, Lincoln met Mary Todd in Springfield, Illinois, and the following year they became engaged. She was the daughter of Robert Smith Todd, a wealthy lawyer and businessman in Lexington, Kentucky. A wedding set for January 1, 1841, was canceled at Lincoln's request, but they reconciled and married on November 4, 1842, in the Springfield mansion of Mary's sister. While anxiously preparing for the nuptials, he was asked where he was going and replied, "To hell, I suppose." In 1844, the couple bought a house in Springfield near his law office. Mary kept house with the help of a hired servant and a relative.Lincoln was an affectionate husband and father of four sons, though his work regularly kept him away from home. The oldest, Robert Todd Lincoln, was born in 1843 and was the only child to live to maturity. Edward Baker Lincoln (Eddie), born in 1846, died February 1, 1850, probably of tuberculosis. 
Lincoln's third son, "Willie" Lincoln was born on December 21, 1850, and died of a fever at the White House on February 20, 1862. The youngest, Thomas "Tad" Lincoln, was born on April 4, 1853, and survived his father but died of heart failure at age 18 on July 16, 1871. Lincoln "was remarkably fond of children" and the Lincolns were not considered to be strict with their own. In fact, Lincoln's law partner William H. Herndon would grow irritated when Lincoln would bring his children to the law office. Their father, it seemed, was often too absorbed in his work to notice his children's behavior. Herndon recounted, "I have felt many and many a time that I wanted to wring their little necks, and yet out of respect for Lincoln I kept my mouth shut. Lincoln did not note what his children were doing or had done."The deaths of their sons, Eddie and Willie, had profound effects on both parents. Lincoln suffered from "melancholy", a condition now thought to be clinical depression. Later in life, Mary struggled with the stresses of losing her husband and sons, and Robert committed her for a time to an asylum in 1875.Early career and militia serviceIn 1832, Lincoln joined with a partner, Denton Offutt, in the purchase of a general store on credit in New Salem. Although the economy was booming, the business struggled and Lincoln eventually sold his share. That March he entered politics, running for the Illinois General Assembly, advocating navigational improvements on the Sangamon River. He could draw crowds as a raconteur, but he lacked the requisite formal education, powerful friends, and money, and lost the election.Lincoln briefly interrupted his campaign to serve as a captain in the Illinois Militia during the Black Hawk War. In his first campaign speech after returning, he observed a supporter in the crowd under attack, grabbed the assailant by his "neck and the seat of his trousers", and tossed him. 
Lincoln finished eighth out of 13 candidates (the top four were elected), though he received 277 of the 300 votes cast in the New Salem precinct.Lincoln served as New Salem's postmaster and later as county surveyor, but continued his voracious reading, and decided to become a lawyer. Rather than studying in the office of an established attorney, as was the custom, Lincoln borrowed legal texts from attorneys John Todd Stuart and Thomas Drummond, purchased books including Blackstone's Commentaries and Chitty's Pleadings, and read law on his own. He later said of his legal education that "I studied with nobody."Illinois state legislature (1834–1842)Lincoln's second state house campaign in 1834, this time as a Whig, was a success over a powerful Whig opponent. Then followed his four terms in the Illinois House of Representatives for Sangamon County. He championed construction of the Illinois and Michigan Canal, and later was a Canal Commissioner. He voted to expand suffrage beyond white landowners to all white males, but adopted a "free soil" stance opposing both slavery and abolition. In 1837, he declared, "[The] Institution of slavery is founded on both injustice and bad policy, but the promulgation of abolition doctrines tends rather to increase than abate its evils." He echoed Henry Clay's support for the American Colonization Society which advocated a program of abolition in conjunction with settling freed slaves in Liberia.He was admitted to the Illinois bar in 1836, and moved to Springfield and began to practice law under John T. Stuart, Mary Todd's cousin. Lincoln emerged as a formidable trial combatant during cross-examinations and closing arguments. He partnered several years with Stephen T. Logan, and in 1844 began his practice with William Herndon, "a studious young man".U.S. House of Representatives (1847–1849)True to his record, Lincoln professed to friends in 1861 to be "an old line Whig, a disciple of Henry Clay". 
Their party favored economic modernization in banking, tariffs to fund internal improvements including railroads, and urbanization.In 1843, Lincoln sought the Whig nomination for Illinois' 7th district seat in the U.S. House of Representatives; he was defeated by John J. Hardin though he prevailed with the party in limiting Hardin to one term. Lincoln not only pulled off his strategy of gaining the nomination in 1846 but also won the election. He was the only Whig in the Illinois delegation, but as dutiful as any participated in almost all votes and made speeches that toed the party line. He was assigned to the Committee on Post Office and Post Roads and the Committee on Expenditures in the War Department. Lincoln teamed with Joshua R. Giddings on a bill to abolish slavery in the District of Columbia with compensation for the owners, enforcement to capture fugitive slaves, and a popular vote on the matter. He dropped the bill when it eluded Whig support.Political views On foreign and military policy, Lincoln spoke against the Mexican–American War, which he imputed to President James K. Polk's desire for "military glory—that attractive rainbow, that rises in showers of blood". He supported the Wilmot Proviso, a failed proposal to ban slavery in any U.S. territory won from Mexico.Lincoln emphasized his opposition to Polk by drafting and introducing his Spot Resolutions. The war had begun with a Mexican slaughter of American soldiers in territory disputed by Mexico, and Polk insisted that Mexican soldiers had "invaded our territory and shed the blood of our fellow-citizens on our own soil". Lincoln demanded that Polk show Congress the exact spot on which blood had been shed and prove that the spot was on American soil. The resolution was ignored in both Congress and the national papers, and it cost Lincoln political support in his district. One Illinois newspaper derisively nicknamed him "spotty Lincoln". 
Lincoln later regretted some of his statements, especially his attack on presidential war-making powers.Lincoln had pledged in 1846 to serve only one term in the House. Realizing Clay was unlikely to win the presidency, he supported General Zachary Taylor for the Whig nomination in the 1848 presidential election. Taylor won and Lincoln hoped in vain to be appointed Commissioner of the General Land Office. The administration offered to appoint him secretary or governor of the Oregon Territory as consolation. This distant territory was a Democratic stronghold, and acceptance of the post would have disrupted his legal and political career in Illinois, so he declined and resumed his law practice.Prairie lawyerIn his Springfield practice, Lincoln handled "every kind of business that could come before a prairie lawyer". Twice a year he appeared for 10 consecutive weeks in county seats in the Midstate county courts; this continued for 16 years. Lincoln handled transportation cases in the midst of the nation's western expansion, particularly river barge conflicts under the many new railroad bridges. As a riverboat man, Lincoln initially favored those interests, but ultimately represented whoever hired him. He later represented a bridge company against a riverboat company in Hurd v. Rock Island Bridge Company, a landmark case involving a canal boat that sank after hitting a bridge. In 1849, he received a patent for a flotation device for the movement of boats in shallow water. The idea was never commercialized, but it made Lincoln the only president to hold a patent.Lincoln appeared before the Illinois Supreme Court in 175 cases; he was sole counsel in 51 cases, of which 31 were decided in his favor. From 1853 to 1860, one of his largest clients was the Illinois Central Railroad. His legal reputation gave rise to the nickname "Honest Abe".Lincoln argued in an 1858 criminal trial, defending William "Duff" Armstrong, who was on trial for the murder of James Preston Metzker. 
The case is famous for Lincoln's use of a fact established by judicial notice to challenge the credibility of an eyewitness. After an opposing witness testified to seeing the crime in the moonlight, Lincoln produced a Farmers' Almanac showing the moon was at a low angle, drastically reducing visibility. Armstrong was acquitted.Leading up to his presidential campaign, Lincoln elevated his profile in an 1859 murder case, with his defense of Simeon Quinn "Peachy" Harrison who was a third cousin; Harrison was also the grandson of Lincoln's political opponent, Rev. Peter Cartwright. Harrison was charged with the murder of Greek Crafton who, as he lay dying of his wounds, confessed to Cartwright that he had provoked Harrison. Lincoln angrily protested the judge's initial decision to exclude Cartwright's testimony about the confession as inadmissible hearsay. Lincoln argued that the testimony involved a dying declaration and was not subject to the hearsay rule. Instead of holding Lincoln in contempt of court as expected, the judge, a Democrat, reversed his ruling and admitted the testimony into evidence, resulting in Harrison's acquittal.Republican politics (1854–1860)Emergence as Republican leaderThe debate over the status of slavery in the territories failed to alleviate tensions between the slave-holding South and the free North, with the failure of the Compromise of 1850, a legislative package designed to address the issue. In his 1852 eulogy for Clay, Lincoln highlighted the latter's support for gradual emancipation and opposition to "both extremes" on the slavery issue. As the slavery debate in the Nebraska and Kansas territories became particularly acrimonious, Illinois Senator Stephen A. Douglas proposed popular sovereignty as a compromise; the measure would allow the electorate of each territory to decide the status of slavery. 
The legislation alarmed many Northerners, who sought to prevent the resulting spread of slavery, but Douglas's Kansas–Nebraska Act narrowly passed Congress in May 1854. Lincoln did not comment on the act until months later in his "Peoria Speech" in October 1854. Lincoln then declared his opposition to slavery, which he repeated en route to the presidency. He said the Kansas Act had a "declared indifference, but as I must think, a covert real zeal for the spread of slavery. I cannot but hate it. I hate it because of the monstrous injustice of slavery itself. I hate it because it deprives our republican example of its just influence in the world ..." Lincoln's attacks on the Kansas–Nebraska Act marked his return to political life. Nationally, the Whigs were irreparably split by the Kansas–Nebraska Act and other efforts to compromise on the slavery issue. Reflecting on the demise of his party, Lincoln wrote in 1855, "I think I am a Whig, but others say there are no Whigs, and that I am an abolitionist...I do no more than oppose the extension of slavery." The new Republican Party was formed as a northern party dedicated to antislavery, drawing from the antislavery wing of the Whig Party, and combining Free Soil, Liberty, and antislavery Democratic Party members. Lincoln resisted early Republican entreaties, fearing that the new party would become a platform for extreme abolitionists. Lincoln held out hope for rejuvenating the Whigs, though he lamented his party's growing closeness with the nativist Know Nothing movement. In 1854, Lincoln was elected to the Illinois legislature but declined to take his seat. The year's elections showed the strong opposition to the Kansas–Nebraska Act, and in the aftermath, Lincoln sought election to the United States Senate. At that time, senators were elected by the state legislature. After leading in the first six rounds of voting, he was unable to obtain a majority. Lincoln instructed his backers to vote for Lyman Trumbull. 
Trumbull was an antislavery Democrat, and had received few votes in the earlier ballots; his supporters, also antislavery Democrats, had vowed not to support any Whig. Lincoln's decision to withdraw enabled his Whig supporters and Trumbull's antislavery Democrats to combine and defeat the mainstream Democratic candidate, Joel Aldrich Matteson.1856 campaign Violent political confrontations in Kansas continued, and opposition to the Kansas–Nebraska Act remained strong throughout the North. As the 1856 elections approached, Lincoln joined the Republicans and attended the Bloomington Convention, which formally established the Illinois Republican Party. The convention platform endorsed Congress's right to regulate slavery in the territories and backed the admission of Kansas as a free state. Lincoln gave the final speech of the convention supporting the party platform and called for the preservation of the Union. At the June 1856 Republican National Convention, though Lincoln received support to run as vice president, John C. Frémont and William Dayton comprised the ticket, which Lincoln supported throughout Illinois. The Democrats nominated former Secretary of State James Buchanan and the Know-Nothings nominated former Whig President Millard Fillmore. Buchanan prevailed, while Republican William Henry Bissell won election as Governor of Illinois, and Lincoln became a leading Republican in Illinois.Dred Scott v. Sandford Dred Scott was a slave whose master took him from a slave state to a free territory under the Missouri Compromise. After Scott was returned to the slave state he petitioned a federal court for his freedom. His petition was denied in Dred Scott v. Sandford (1857). Supreme Court Chief Justice Roger B. Taney in the decision wrote that blacks were not citizens and derived no rights from the Constitution. While many Democrats hoped that Dred Scott would end the dispute over slavery in the territories, the decision sparked further outrage in the North. 
Lincoln denounced it as the product of a conspiracy of Democrats to support the Slave Power. He argued the decision was at variance with the Declaration of Independence; he said that while the founding fathers did not believe all men equal in every respect, they believed all men were equal "in certain inalienable rights, among which are life, liberty, and the pursuit of happiness".Lincoln–Douglas debates and Cooper Union speechIn 1858, Douglas was up for re-election in the U.S. Senate, and Lincoln hoped to defeat him. Many in the party felt that a former Whig should be nominated in 1858, and Lincoln's 1856 campaigning and support of Trumbull had earned him a favor. Some eastern Republicans supported Douglas for his opposition to the Lecompton Constitution and admission of Kansas as a slave state. Many Illinois Republicans resented this eastern interference. For the first time, Illinois Republicans held a convention to agree upon a Senate candidate, and Lincoln won the nomination with little opposition.Lincoln accepted the nomination with great enthusiasm and zeal. After his nomination he delivered his House Divided Speech, with the biblical reference Mark 3:25, "A house divided against itself cannot stand. I believe this government cannot endure permanently half slave and half free. I do not expect the Union to be dissolved—I do not expect the house to fall—but I do expect it will cease to be divided. It will become all one thing, or all the other." The speech created a stark image of the danger of disunion. The stage was then set for the election of the Illinois legislature which would, in turn, select Lincoln or Douglas. When informed of Lincoln's nomination, Douglas stated, "[Lincoln] is the strong man of the party ... and if I beat him, my victory will be hardly won."The Senate campaign featured seven debates between Lincoln and Douglas. 
These were the most famous political debates in American history; they had an atmosphere akin to a prizefight and drew crowds in the thousands. The principals stood in stark contrast both physically and politically. Lincoln warned that Douglas's "Slave Power" was threatening the values of republicanism, and accused Douglas of distorting the Founding Fathers' premise that all men are created equal. Douglas emphasized his Freeport Doctrine, that local settlers were free to choose whether to allow slavery, and accused Lincoln of having joined the abolitionists. Lincoln's argument assumed a moral tone, as he claimed Douglas represented a conspiracy to promote slavery. Douglas's argument was more legal, claiming that Lincoln was defying the authority of the U.S. Supreme Court in the Dred Scott decision. Though the Republican legislative candidates won more popular votes, the Democrats won more seats, and the legislature re-elected Douglas. Lincoln's articulation of the issues gave him a national political presence. In May 1859, Lincoln purchased the Illinois Staats-Anzeiger, a German-language newspaper that was consistently supportive; most of the state's 130,000 German Americans voted Democratic but the German-language paper mobilized Republican support. In the aftermath of the 1858 election, newspapers frequently mentioned Lincoln as a potential Republican presidential candidate, rivaled by William H. Seward, Salmon P. Chase, Edward Bates, and Simon Cameron. While Lincoln was popular in the Midwest, he lacked support in the Northeast and was unsure whether to seek office. In January 1860, Lincoln told a group of political allies that he would accept the nomination if offered, and in the following months several local papers endorsed his candidacy. Over the coming months, Lincoln was tireless, making nearly fifty speeches along the campaign trail. By the quality and simplicity of his rhetoric, he quickly became the champion of the Republican party. 
However, despite his overwhelming support in the Midwestern United States, he was less appreciated in the east. Horace Greeley, editor of the New York Tribune, at that time wrote up an unflattering account of Lincoln's compromising position on slavery and his reluctance to challenge the court's Dred-Scott ruling, which was promptly used against him by his political rivals.On February 27, 1860, powerful New York Republicans invited Lincoln to give a speech at Cooper Union, in which he argued that the Founding Fathers of the United States had little use for popular sovereignty and had repeatedly sought to restrict slavery. He insisted that morality required opposition to slavery, and rejected any "groping for some middle ground between the right and the wrong". Many in the audience thought he appeared awkward and even ugly. But Lincoln demonstrated intellectual leadership that brought him into contention. Journalist Noah Brooks reported, "No man ever before made such an impression on his first appeal to a New York audience."Historian David Herbert Donald described the speech as a "superb political move for an unannounced candidate, to appear in one rival's (Seward) own state at an event sponsored by the second rival's (Chase) loyalists, while not mentioning either by name during its delivery". In response to an inquiry about his ambitions, Lincoln said, "The taste is in my mouth a little."1860 presidential electionOn May 9–10, 1860, the Illinois Republican State Convention was held in Decatur. Lincoln's followers organized a campaign team led by David Davis, Norman Judd, Leonard Swett, and Jesse DuBois, and Lincoln received his first endorsement. Exploiting his embellished frontier legend (clearing land and splitting fence rails), Lincoln's supporters adopted the label of "The Rail Candidate". 
In 1860, Lincoln described himself: "I am in height, six feet, four inches, nearly; lean in flesh, weighing, on an average, one hundred and eighty pounds; dark complexion, with coarse black hair, and gray eyes." Michael Martinez wrote about the effective imaging of Lincoln by his campaign. At times he was presented as the plain-talking "Rail Splitter" and at other times he was "Honest Abe", unpolished but trustworthy.On May 18, at the Republican National Convention in Chicago, Lincoln won the nomination on the third ballot, beating candidates such as Seward and Chase. A former Democrat, Hannibal Hamlin of Maine, was nominated for vice president to balance the ticket. Lincoln's success depended on his campaign team, his reputation as a moderate on the slavery issue, and his strong support for internal improvements and the tariff.Pennsylvania put him over the top, led by the state's iron interests who were reassured by his tariff support. Lincoln's managers had focused on this delegation while honoring Lincoln's dictate to "Make no contracts that will bind me".As the Slave Power tightened its grip on the national government, most Republicans agreed with Lincoln that the North was the aggrieved party. Throughout the 1850s, Lincoln had doubted the prospects of civil war, and his supporters rejected claims that his election would incite secession. When Douglas was selected as the candidate of the Northern Democrats, delegates from eleven slave states walked out of the Democratic convention; they opposed Douglas's position on popular sovereignty, and selected incumbent Vice President John C. Breckinridge as their candidate. A group of former Whigs and Know Nothings formed the Constitutional Union Party and nominated John Bell of Tennessee. 
Lincoln and Douglas competed for votes in the North, while Bell and Breckinridge primarily found support in the South.Prior to the Republican convention, the Lincoln campaign began cultivating a nationwide youth organization, the Wide Awakes, which it used to generate popular support throughout the country to spearhead voter registration drives, thinking that new voters and young voters tended to embrace new parties. People of the Northern states knew the Southern states would vote against Lincoln and rallied supporters for Lincoln.As Douglas and the other candidates campaigned, Lincoln gave no speeches, relying on the enthusiasm of the Republican Party. The party did the leg work that produced majorities across the North and produced an abundance of campaign posters, leaflets, and newspaper editorials. Republican speakers focused first on the party platform, and second on Lincoln's life story, emphasizing his childhood poverty. The goal was to demonstrate the power of "free labor", which allowed a common farm boy to work his way to the top by his own efforts. The Republican Party's production of campaign literature dwarfed the combined opposition; a Chicago Tribune writer produced a pamphlet that detailed Lincoln's life and sold 100,000–200,000 copies. Though he did not give public appearances, many sought to visit him and write him. In the runup to the election, he took an office in the Illinois state capitol to deal with the influx of attention. He also hired John George Nicolay as his personal secretary, who would remain in that role during the presidency.On November 6, 1860, Lincoln was elected the 16th president. He was the first Republican president and his victory was entirely due to his support in the North and West. No ballots were cast for him in 10 of the 15 Southern slave states, and he won only two of 996 counties in all the Southern states, an omen of the impending Civil War. 
Lincoln received 1,866,452 votes, or 39.8% of the total in a four-way race, carrying the free Northern states, as well as California and Oregon. His victory in the electoral college was decisive: Lincoln had 180 votes to 123 for his opponents.Presidency (1861–1865)Secession and inaugurationThe South was outraged by Lincoln's election, and in response secessionists implemented plans to leave the Union before he took office in March 1861. On December 20, 1860, South Carolina took the lead by adopting an ordinance of secession; by February 1, 1861, Florida, Mississippi, Alabama, Georgia, Louisiana, and Texas followed. Six of these states declared themselves to be a sovereign nation, the Confederate States of America, and adopted a constitution. The upper South and border states (Delaware, Maryland, Virginia, North Carolina, Tennessee, Kentucky, Missouri, and Arkansas) initially rejected the secessionist appeal. President Buchanan and President-elect Lincoln refused to recognize the Confederacy, declaring secession illegal. The Confederacy selected Jefferson Davis as its provisional president on February 9, 1861.Attempts at compromise followed but Lincoln and the Republicans rejected the proposed Crittenden Compromise as contrary to the Party's platform of free-soil in the territories. Lincoln said, "I will suffer death before I consent ... to any concession or compromise which looks like buying the privilege to take possession of this government to which we have a constitutional right."Lincoln tacitly supported the Corwin Amendment to the Constitution, which passed Congress and was awaiting ratification by the states when Lincoln took office. That doomed amendment would have protected slavery in states where it already existed. A few weeks before the war, Lincoln sent a letter to every governor informing them Congress had passed a joint resolution to amend the Constitution.En route to his inauguration, Lincoln addressed crowds and legislatures across the North. 
He gave a particularly emotional farewell address upon leaving Springfield; he would never again return to Springfield alive. The president-elect evaded suspected assassins in Baltimore. On February 23, 1861, he arrived in disguise in Washington, D.C., which was placed under substantial military guard. Lincoln directed his inaugural address to the South, proclaiming once again that he had no inclination to abolish slavery in the Southern states: Lincoln cited his plans for banning the expansion of slavery as the key source of conflict between North and South, stating "One section of our country believes slavery is right and ought to be extended, while the other believes it is wrong and ought not to be extended. This is the only substantial dispute." The president ended his address with an appeal to the people of the South: "We are not enemies, but friends. We must not be enemies ... The mystic chords of memory, stretching from every battlefield, and patriot grave, to every living heart and hearthstone, all over this broad land, will yet swell the chorus of the Union, when again touched, as surely they will be, by the better angels of our nature." The failure of the Peace Conference of 1861 signaled that legislative compromise was impossible. By March 1861, no leaders of the insurrection had proposed rejoining the Union on any terms. Meanwhile, Lincoln and the Republican leadership agreed that the dismantling of the Union could not be tolerated. In his second inaugural address, Lincoln looked back on the situation at the time and said: "Both parties deprecated war, but one of them would make war rather than let the Nation survive, and the other would accept war rather than let it perish, and the war came."Civil WarMajor Robert Anderson, commander of the Union's Fort Sumter in Charleston, South Carolina, sent a request for provisions to Washington, and Lincoln's order to meet that request was seen by the secessionists as an act of war. 
On April 12, 1861, Confederate forces fired on Union troops at Fort Sumter and began the fight. Historian Allan Nevins argued that the newly inaugurated Lincoln made three miscalculations: underestimating the gravity of the crisis, exaggerating the strength of Unionist sentiment in the South, and overlooking Southern Unionist opposition to an invasion.William Tecumseh Sherman talked to Lincoln during inauguration week and was "sadly disappointed" at his failure to realize that "the country was sleeping on a volcano" and that the South was preparing for war. Donald concludes that, "His repeated efforts to avoid collision in the months between inauguration and the firing on Ft. Sumter showed he adhered to his vow not to be the first to shed fraternal blood. But he also vowed not to surrender the forts. The only resolution of these contradictory positions was for the confederates to fire the first shot; they did just that."On April 15, Lincoln called on the states to send a total of 75,000 volunteer troops to recapture forts, protect Washington, and "preserve the Union", which, in his view, remained intact despite the seceding states. This call forced states to choose sides. Virginia seceded and was rewarded with the designation of Richmond as the Confederate capital, despite its exposure to Union lines. North Carolina, Tennessee, and Arkansas followed over the following two months. Secession sentiment was strong in Missouri and Maryland, but did not prevail; Kentucky remained neutral. The Fort Sumter attack rallied Americans north of the Mason-Dixon line to defend the nation.As States sent Union regiments south, on April 19, Baltimore mobs in control of the rail links attacked Union troops who were changing trains. Local leaders' groups later burned critical rail bridges to the capital and the Army responded by arresting local Maryland officials. Lincoln suspended the writ of habeas corpus where needed for the security of troops trying to reach Washington. 
John Merryman, one Maryland official hindering the U.S. troop movements, petitioned Supreme Court Chief Justice Roger B. Taney to issue a writ of habeas corpus. In June Taney, ruling only for the lower circuit court in ex parte Merryman, issued the writ which he felt could only be suspended by Congress. Lincoln persisted with the policy of suspension in select areas.Union military strategyLincoln took executive control of the war and shaped the Union military strategy. He responded to the unprecedented political and military crisis as commander-in-chief by exercising unprecedented authority. He expanded his war powers, imposed a blockade on Confederate ports, disbursed funds before appropriation by Congress, suspended habeas corpus, and arrested and imprisoned thousands of suspected Confederate sympathizers. Lincoln gained the support of Congress and the northern public for these actions. Lincoln also had to reinforce Union sympathies in the border slave states and keep the war from becoming an international conflict.It was clear from the outset that bipartisan support was essential to success, and that any compromise alienated factions on both sides of the aisle, such as the appointment of Republicans and Democrats to command positions. Copperheads criticized Lincoln for refusing to compromise on slavery. The Radical Republicans criticized him for moving too slowly in abolishing slavery. On August 6, 1861, Lincoln signed the Confiscation Act that authorized judicial proceedings to confiscate and free slaves who were used to support the Confederates. The law had little practical effect, but it signaled political support for abolishing slavery.In August 1861, General John C. Frémont, the 1856 Republican presidential nominee, without consulting Washington, issued a martial edict freeing slaves of the rebels. Lincoln canceled the illegal proclamation as politically motivated and lacking military necessity. 
As a result, Union enlistments from Maryland, Kentucky, and Missouri increased by over 40,000.Internationally, Lincoln wanted to forestall foreign military aid to the Confederacy. He relied on his combative Secretary of State William Seward while working closely with Senate Foreign Relations Committee chairman Charles Sumner. In the 1861 Trent Affair which threatened war with Great Britain, the U.S. Navy illegally intercepted a British mail ship, the Trent, on the high seas and seized two Confederate envoys; Britain protested vehemently while the U.S. cheered. Lincoln ended the crisis by releasing the two diplomats. Biographer James G. Randall dissected Lincoln's successful techniques:Lincoln painstakingly monitored the telegraph reports coming into the War Department. He tracked all phases of the effort, consulting with governors, and selecting generals based on their success, their state, and their party. In January 1862, after complaints of inefficiency and profiteering in the War Department, Lincoln replaced War Secretary Simon Cameron with Edwin Stanton. Stanton centralized the War Department's activities, auditing and canceling contracts, saving the federal government $17,000,000. Stanton was a staunch Unionist, pro-business, conservative Democrat who gravitated toward the Radical Republican faction. He worked more often and more closely with Lincoln than any other senior official. "Stanton and Lincoln virtually conducted the war together", say Thomas and Hyman.Lincoln's war strategy embraced two priorities: ensuring that Washington was well-defended and conducting an aggressive war effort for a prompt, decisive victory. Twice a week, Lincoln met with his cabinet in the afternoon. Occasionally Mary prevailed on him to take a carriage ride, concerned that he was working too hard. 
For his edification Lincoln relied upon a book by his chief of staff General Henry Halleck entitled Elements of Military Art and Science; Halleck was a disciple of the European strategist Antoine-Henri Jomini. Lincoln began to appreciate the critical need to control strategic points, such as the Mississippi River. Lincoln saw the importance of Vicksburg and understood the necessity of defeating the enemy's army, rather than simply capturing territory. General McClellan After the Union rout at Bull Run and Winfield Scott's retirement, Lincoln appointed Major General George B. McClellan general-in-chief. McClellan then took months to plan his Virginia Peninsula Campaign. McClellan's slow progress frustrated Lincoln, as did his position that no troops were needed to defend Washington. McClellan, in turn, blamed the failure of the campaign on Lincoln's reservation of troops for the capital. In 1862, Lincoln removed McClellan for the general's continued inaction. He elevated Henry Halleck in July and appointed John Pope as head of the new Army of Virginia. Pope satisfied Lincoln's desire to advance on Richmond from the north, thus protecting Washington from counterattack. But Pope was then soundly defeated at the Second Battle of Bull Run in the summer of 1862, forcing the Army of the Potomac back to defend Washington. Despite his dissatisfaction with McClellan's failure to reinforce Pope, Lincoln restored him to command of all forces around Washington. Two days after McClellan's return to command, General Robert E. Lee's forces crossed the Potomac River into Maryland, leading to the Battle of Antietam. That battle, a Union victory, was among the bloodiest in American history; it facilitated Lincoln's Emancipation Proclamation in January. McClellan then resisted the president's demand that he pursue Lee's withdrawing army, while General Don Carlos Buell likewise refused orders to move the Army of the Ohio against rebel forces in eastern Tennessee. 
Lincoln replaced Buell with William Rosecrans; and after the 1862 midterm elections he replaced McClellan with Ambrose Burnside. The appointments were both politically neutral and adroit on Lincoln's part.Burnside, against presidential advice, launched an offensive across the Rappahannock River and was defeated by Lee at Fredericksburg in December. Desertions during 1863 came in the thousands and only increased after Fredericksburg, so Lincoln replaced Burnside with Joseph Hooker.In the 1862 midterm elections the Republicans suffered severe losses due to rising inflation, high taxes, rumors of corruption, suspension of habeas corpus, military draft law, and fears that freed slaves would come North and undermine the labor market. The Emancipation Proclamation gained votes for Republicans in rural New England and the upper Midwest, but cost votes in the Irish and German strongholds and in the lower Midwest, where many Southerners had lived for generations.In the spring of 1863 Lincoln was sufficiently optimistic about upcoming military campaigns to think the end of the war could be near; the plans included attacks by Hooker on Lee north of Richmond, Rosecrans on Chattanooga, Grant on Vicksburg, and a naval assault on Charleston.Hooker was routed by Lee at the Battle of Chancellorsville in May, then resigned and was replaced by George Meade. Meade followed Lee north into Pennsylvania and beat him in the Gettysburg Campaign, but then failed to follow up despite Lincoln's demands. At the same time, Grant captured Vicksburg and gained control of the Mississippi River, splitting the far western rebel states.Emancipation ProclamationThe Federal government's power to end slavery was limited by the Constitution, which before 1865 delegated the issue to the individual states. Lincoln argued that slavery would be rendered obsolete if its expansion into new territories were prevented. 
He sought to persuade the states to agree to compensation for emancipating their slaves in return for their acceptance of abolition. Lincoln rejected Fremont's two emancipation attempts in August 1861, as well as one by Major General David Hunter in May 1862, on the grounds that it was not within their power, and would upset loyal border states.In June 1862, Congress passed an act banning slavery on all federal territory, which Lincoln signed. In July, the Confiscation Act of 1862 was enacted, providing court procedures to free the slaves of those convicted of aiding the rebellion; Lincoln approved the bill despite his belief that it was unconstitutional. He felt such action could be taken only within the war powers of the commander-in-chief, which he planned to exercise. Lincoln at this time reviewed a draft of the Emancipation Proclamation with his cabinet.Privately, Lincoln concluded that the Confederacy's slave base had to be eliminated. Copperheads argued that emancipation was a stumbling block to peace and reunification; Republican editor Horace Greeley of the New York Tribune agreed. In a letter of August 22, 1862, Lincoln said that while he personally wished all men could be free, regardless of that, his first obligation as president was to preserve the Union:The Emancipation Proclamation, issued on September 22, 1862, and effective January 1, 1863, affirmed the freedom of slaves in 10 states not then under Union control, with exemptions specified for areas under such control. Lincoln's comment on signing the Proclamation was: "I never, in my life, felt more certain that I was doing right, than I do in signing this paper." 
He spent the next 100 days preparing the army and the nation for emancipation, while Democrats rallied their voters by warning of the threat that freed slaves posed to northern whites.With the abolition of slavery in the rebel states now a military objective, Union armies advancing south liberated three million slaves.Enlisting former slaves became official policy. By the spring of 1863, Lincoln was ready to recruit black troops in more than token numbers. In a letter to Tennessee military governor Andrew Johnson encouraging him to lead the way in raising black troops, Lincoln wrote, "The bare sight of 50,000 armed and drilled black soldiers on the banks of the Mississippi would end the rebellion at once". By the end of 1863, at Lincoln's direction, General Lorenzo Thomas had recruited 20 regiments of blacks from the Mississippi Valley.The Proclamation included Lincoln's earlier plans for colonies for newly freed slaves, though that undertaking ultimately failed.Gettysburg Address (1863)Lincoln spoke at the dedication of the Gettysburg battlefield cemetery on November 19, 1863. In 272 words, and three minutes, Lincoln asserted that the nation was born not in 1789, but in 1776, "conceived in Liberty, and dedicated to the proposition that all men are created equal". He defined the war as dedicated to the principles of liberty and equality for all. He declared that the deaths of so many brave soldiers would not be in vain, that slavery would end, and the future of democracy would be assured, that "government of the people, by the people, for the people, shall not perish from the earth".Defying his prediction that "the world will little note, nor long remember what we say here", the Address became the most quoted speech in American history.General GrantGrant's victories at the Battle of Shiloh and in the Vicksburg campaign impressed Lincoln. Responding to criticism of Grant after Shiloh, Lincoln had said, "I can't spare this man. He fights." 
With Grant in command, Lincoln felt the Union Army could advance in multiple theaters, while also including black troops. Meade's failure to capture Lee's army after Gettysburg and the continued passivity of the Army of the Potomac persuaded Lincoln to promote Grant to supreme commander. Grant then assumed command of Meade's army.Lincoln was concerned that Grant might be considering a presidential candidacy in 1864. He arranged for an intermediary to inquire into Grant's political intentions, and once assured that he had none, Lincoln promoted Grant to the newly revived rank of Lieutenant General, a rank which had been unoccupied since George Washington. Authorization for such a promotion "with the advice and consent of the Senate" was provided by a new bill which Lincoln signed the same day he submitted Grant's name to the Senate. His nomination was confirmed by the Senate on March 2, 1864.Grant in 1864 waged the bloody Overland Campaign, which exacted heavy losses on both sides. When Lincoln asked what Grant's plans were, the persistent general replied, "I propose to fight it out on this line if it takes all summer." Grant's army moved steadily south. Lincoln traveled to Grant's headquarters at City Point, Virginia, to confer with Grant and William Tecumseh Sherman. Lincoln reacted to Union losses by mobilizing support throughout the North. Lincoln authorized Grant to target infrastructure—plantations, railroads, and bridges—hoping to weaken the South's morale and fighting ability. He emphasized defeat of the Confederate armies over destruction (which was considerable) for its own sake. Lincoln's engagement became distinctly personal on one occasion in 1864 when Confederate general Jubal Early raided Washington, D.C. Legend has it that while Lincoln watched from an exposed position, Union Captain (and future Supreme Court Justice) Oliver Wendell Holmes Jr. 
shouted at him, "Get down, you damn fool, before you get shot!"As Grant continued to weaken Lee's forces, efforts to discuss peace began. Confederate Vice President Stephens led a group meeting with Lincoln, Seward, and others at Hampton Roads. Lincoln refused to negotiate with the Confederacy as a coequal; his objective to end the fighting was not realized. On April 1, 1865, Grant nearly encircled Petersburg in a siege. The Confederate government evacuated Richmond and Lincoln visited the conquered capital. On April 9, Lee surrendered to Grant at Appomattox, officially ending the war.Re-electionLincoln ran for reelection in 1864, while uniting the main Republican factions, along with War Democrats Edwin M. Stanton and Andrew Johnson. Lincoln used conversation and his patronage powers—greatly expanded from peacetime—to build support and fend off the Radicals' efforts to replace him. At its convention, the Republicans selected Johnson as his running mate. To broaden his coalition to include War Democrats as well as Republicans, Lincoln ran under the label of the new Union Party.Grant's bloody stalemates damaged Lincoln's re-election prospects, and many Republicans feared defeat. Lincoln confidentially pledged in writing that if he should lose the election, he would still defeat the Confederacy before turning over the White House; Lincoln did not show the pledge to his cabinet, but asked them to sign the sealed envelope. The pledge read as follows:The Democratic platform followed the "Peace wing" of the party and called the war a "failure"; but their candidate, McClellan, supported the war and repudiated the platform. Meanwhile, Lincoln emboldened Grant with more troops and Republican party support. Sherman's capture of Atlanta in September and David Farragut's capture of Mobile ended defeatism. The Democratic Party was deeply split, with some leaders and most soldiers openly for Lincoln. The National Union Party was united by Lincoln's support for emancipation. 
State Republican parties stressed the perfidy of the Copperheads. On November 8, Lincoln carried all but three states, including 78 percent of Union soldiers.On March 4, 1865, Lincoln delivered his second inaugural address. In it, he deemed the war casualties to be God's will. Historian Mark Noll places the speech "among the small handful of semi-sacred texts by which Americans conceive their place in the world;" it is inscribed in the Lincoln Memorial. Lincoln said:ReconstructionReconstruction preceded the war's end, as Lincoln and his associates considered the reintegration of the nation, and the fates of Confederate leaders and freed slaves. When a general asked Lincoln how the defeated Confederates were to be treated, Lincoln replied, "Let 'em up easy." Lincoln was determined to find meaning in the war in its aftermath, and did not want to continue to outcast the southern states. His main goal was to keep the union together, so he proceeded by focusing not on whom to blame, but on how to rebuild the nation as one. Lincoln led the moderates in Reconstruction policy and was opposed by the Radicals, under Rep. Thaddeus Stevens, Sen. Charles Sumner and Sen. Benjamin Wade, who otherwise remained Lincoln's allies. Determined to reunite the nation and not alienate the South, Lincoln urged that speedy elections under generous terms be held. His Amnesty Proclamation of December 8, 1863, offered pardons to those who had not held a Confederate civil office and had not mistreated Union prisoners, if they were willing to sign an oath of allegiance.As Southern states fell, they needed leaders while their administrations were restored. In Tennessee and Arkansas, Lincoln respectively appointed Johnson and Frederick Steele as military governors. In Louisiana, Lincoln ordered General Nathaniel P. Banks to promote a plan that would reestablish statehood when 10 percent of the voters agreed, and only if the reconstructed states abolished slavery. 
Democratic opponents accused Lincoln of using the military to ensure his and the Republicans' political aspirations. The Radicals denounced his policy as too lenient, and passed their own plan, the 1864 Wade–Davis Bill, which Lincoln vetoed. The Radicals retaliated by refusing to seat elected representatives from Louisiana, Arkansas, and Tennessee.Lincoln's appointments were designed to harness both moderates and Radicals. To fill Chief Justice Taney's seat on the Supreme Court, he named the Radicals' choice, Salmon P. Chase, who Lincoln believed would uphold his emancipation and paper money policies.After implementing the Emancipation Proclamation, Lincoln increased pressure on Congress to outlaw slavery throughout the nation with a constitutional amendment. He declared that such an amendment would "clinch the whole matter" and by December 1863 an amendment was brought to Congress. This first attempt fell short of the required two-thirds majority in the House of Representatives. Passage became part of Lincoln's reelection platform, and after his successful reelection, the second attempt in the House passed on January 31, 1865. With ratification, it became the Thirteenth Amendment to the United States Constitution on December 6, 1865.Lincoln believed the federal government had limited responsibility to the millions of freedmen. He signed Senator Charles Sumner's Freedmen's Bureau bill that set up a temporary federal agency designed to meet the immediate needs of former slaves. The law opened land for a lease of three years with the ability to purchase title for the freedmen. Lincoln announced a Reconstruction plan that involved short-term military control, pending readmission under the control of southern Unionists.Historians agree that it is impossible to predict exactly how Reconstruction would have proceeded had Lincoln lived. Biographers James G. 
Randall and Richard Current, according to David Lincove, argue that:Eric Foner argues that:Native American policyLincoln's experience with Indians followed the death of his grandfather Abraham by Indian assailants, in the presence of his father and uncles. Lincoln claimed Indians were antagonistic toward his father, Thomas Lincoln, and his young family. Although Lincoln was a veteran of the Black Hawk War, which was fought in Wisconsin and Illinois in 1832, he saw no significant action. During his presidency, Lincoln's policy toward Indians was driven by politics. He used the Indian Bureau as a source of patronage, making appointments to his loyal followers in Minnesota and Wisconsin. He faced difficulties guarding Western settlers, railroads, and telegraphs, from Indian attacks.On August 17, 1862, the Dakota uprising in Minnesota, supported by the Yankton Indians, killed hundreds of white settlers, forced 30,000 from their homes, and deeply alarmed the Lincoln administration. Some believed it was a conspiracy by the Confederacy to launch a war on the Northwestern front. Lincoln sent General John Pope, the former head of the Army of Virginia, to Minnesota as commander of the new Department of the Northwest. Lincoln ordered thousands of Confederate prisoners of war sent by railroad to put down the Dakota Uprising. When the Confederates protested forcing Confederate prisoners to fight Indians, Lincoln revoked the policy. Pope fought against the Indians mercilessly, even advocating their extinction. He ordered Indian farms and food supplies be destroyed, and Indian warriors be killed. Aiding Pope, Minnesota Congressman Col. Henry H. Sibley led militiamen and regular troops to defeat the Dakota at Wood Lake. By October 9, Pope considered the uprising to be ended; hostilities ceased on December 26. 
An unusual military court was set up to prosecute captured natives, with Lincoln effectively acting as the route of appeal.Lincoln personally reviewed each of 303 execution warrants for Santee Dakota convicted of killing innocent farmers; he commuted the sentences of all but 39 (one was later reprieved). Lincoln sought to be lenient, but still send a message. He also faced significant public pressure, including threats of mob justice should any of the Dakota be spared. Former Governor of Minnesota Alexander Ramsey told Lincoln, in 1864, that he would have gotten more presidential election support had he executed all 303 of the Indians. Lincoln responded, "I could not afford to hang men for votes."Other enactmentsIn the selection and use of his cabinet, Lincoln employed the strengths of his opponents in a manner that emboldened his presidency. Lincoln commented on his thought process, "We need the strongest men of the party in the Cabinet. We needed to hold our own people together. I had looked the party over and concluded that these were the very strongest men. Then I had no right to deprive the country of their services."  Goodwin described the group in her biography as a Team of Rivals.Lincoln adhered to the Whig theory of a presidency focused on executing laws while deferring to Congress' responsibility for legislating. Lincoln vetoed only four bills, including the Wade-Davis Bill with its harsh Reconstruction program. The 1862 Homestead Act made millions of acres of Western government-held land available for purchase at low cost. The 1862 Morrill Land-Grant Colleges Act provided government grants for agricultural colleges in each state. The Pacific Railway Acts of 1862 and 1864 granted federal support for the construction of the United States' First Transcontinental Railroad, which was completed in 1869. 
The passage of the Homestead Act and the Pacific Railway Acts was enabled by the absence of Southern congressmen and senators who had opposed the measures in the 1850s. There were two measures passed to raise revenues for the Federal government: tariffs (a policy with long precedent), and a Federal income tax. In 1861, Lincoln signed the second and third Morrill Tariffs, following the first enacted by Buchanan. He also signed the Revenue Act of 1861, creating the first U.S. income tax—a flat tax of 3 percent on incomes above $800 (in 1861 dollars). The Revenue Act of 1862 adopted rates that increased with income. Lincoln presided over the expansion of the federal government's economic influence in other areas. The National Banking Act created the system of national banks. The US issued paper currency for the first time, known as greenbacks—printed in green on the reverse side. In 1862, Congress created the Department of Agriculture. In response to rumors of a renewed draft, the editors of the New York World and the Journal of Commerce published a false draft proclamation that created an opportunity for the editors and others to corner the gold market. Lincoln attacked the media for such behavior, and ordered a military seizure of the two papers which lasted for two days. Lincoln is largely responsible for the Thanksgiving holiday. Thanksgiving had become a regional holiday in New England in the 17th century. It had been sporadically proclaimed by the federal government on irregular dates. The prior proclamation had been during James Madison's presidency 50 years earlier. 
In 1863, Lincoln declared the final Thursday in November of that year to be a day of Thanksgiving.In June 1864, Lincoln approved the Yosemite Grant enacted by Congress, which provided unprecedented federal protection for the area now known as Yosemite National Park.Judicial appointmentsSupreme Court appointmentsLincoln's philosophy on court nominations was that "we cannot ask a man what he will do, and if we should, and he should answer us, we should despise him for it. Therefore we must take a man whose opinions are known." Lincoln made five appointments to the Supreme Court. Noah Haynes Swayne was an anti-slavery lawyer who was committed to the Union. Samuel Freeman Miller supported Lincoln in the 1860 election and was an avowed abolitionist. David Davis was Lincoln's campaign manager in 1860 and had served as a judge in the Illinois court circuit where Lincoln practiced. Democrat Stephen Johnson Field, a previous California Supreme Court justice, provided geographic and political balance. Finally, Lincoln's Treasury Secretary, Salmon P. Chase, became Chief Justice. Lincoln believed Chase was an able jurist, would support Reconstruction legislation, and that his appointment united the Republican Party.Other judicial appointmentsLincoln appointed 27 judges to the United States district courts but no judges to the United States circuit courts during his time in office.States admitted to the UnionWest Virginia was admitted to the Union on June 20, 1863. Nevada, which became the third state in the far-west of the continent, was admitted as a free state on October 31, 1864.AssassinationJohn Wilkes Booth was a well-known actor and a Confederate spy from Maryland; though he never joined the Confederate army, he had contacts with the Confederate secret service. After attending an April 11, 1865 speech in which Lincoln promoted voting rights for blacks, Booth hatched a plot to assassinate the President. 
When Booth learned of the Lincolns' intent to attend a play with General Grant, he planned to assassinate Lincoln and Grant at Ford's Theatre. Lincoln and his wife attended the play Our American Cousin on the evening of April 14, just five days after the Union victory at the Battle of Appomattox Courthouse. At the last minute, Grant decided to go to New Jersey to visit his children instead of attending the play.At 10:15 in the evening, Booth entered the back of Lincoln's theater box, crept up from behind, and fired at the back of Lincoln's head, mortally wounding him. Lincoln's guest Major Henry Rathbone momentarily grappled with Booth, but Booth stabbed him and escaped. After being attended by Doctor Charles Leale and two other doctors, Lincoln was taken across the street to Petersen House. After remaining in a coma for eight hours, Lincoln died at 7:22 in the morning on April 15. Stanton saluted and said, "Now he belongs to the ages." Lincoln's body was placed in a flag-wrapped coffin, which was loaded into a hearse and escorted to the White House by Union soldiers. President Johnson was sworn in the next morning.Two weeks later, Booth, refusing to surrender, was tracked to a farm in Virginia, and was mortally shot by Sergeant Boston Corbett and died on April 26. Secretary of War Stanton had issued orders that Booth be taken alive, so Corbett was initially arrested for court martial. After a brief interview, Stanton declared him a patriot and dismissed the charge.Funeral and burial The late President lay in state, first in the East Room of the White House, and then in the Capitol Rotunda from April 19 through April 21. The caskets containing Lincoln's body and the body of his son Willie traveled for three weeks on the Lincoln Special funeral train. The train followed a circuitous route from Washington D.C. to Springfield, Illinois, stopping at many cities for memorials attended by hundreds of thousands. 
Many others gathered along the tracks as the train passed with bands, bonfires, and hymn singing or in silent grief. Poet Walt Whitman composed "When Lilacs Last in the Dooryard Bloom'd" to eulogize him, one of four poems he wrote about Lincoln. African Americans were especially moved; they had lost 'their Moses'. In a larger sense, the reaction was in response to the deaths of so many men in the war. Historians emphasized the widespread shock and sorrow, but noted that some Lincoln haters celebrated his death. Lincoln's body was buried at Oak Ridge Cemetery in Springfield and now lies within the Lincoln Tomb.Religious and philosophical beliefsAs a young man, Lincoln was a religious skeptic. He was deeply familiar with the Bible, quoting and praising it. He was private about his position on organized religion and respected the beliefs of others. He never made a clear profession of Christian beliefs. Through his entire public career, Lincoln had a proneness for quoting Scripture. His three most famous speeches—the House Divided Speech, the Gettysburg Address, and his second inaugural—each contain direct allusions to Providence and quotes from Scripture.In the 1840s, Lincoln subscribed to the Doctrine of Necessity, a belief that the human mind was controlled by a higher power. With the death of his son Edward in 1850 he more frequently expressed a dependence on God. He never joined a church, although he frequently attended First Presbyterian Church with his wife beginning in 1852.In the 1850s, Lincoln asserted his belief in "providence" in a general way, and rarely used the language or imagery of the evangelicals; he regarded the republicanism of the Founding Fathers with an almost religious reverence. The death of son Willie in February 1862 may have caused him to look toward religion for solace. After Willie's death, he questioned the divine necessity of the war's severity. 
He wrote at this time that God "could have either saved or destroyed the Union without a human contest. Yet the contest began. And having begun, He could give the final victory to either side any day. Yet the contest proceeds." Lincoln did believe in an all-powerful God that shaped events and by 1865 was expressing those beliefs in major speeches. By the end of the war, he increasingly appealed to the Almighty for solace and to explain events, writing on April 4, 1864, to a newspaper editor in Kentucky: "I claim not to have controlled events, but confess plainly that events have controlled me. Now, at the end of three years struggle the nation's condition is not what either party, or any man devised, or expected. God alone can claim it. Whither it is tending seems plain. If God now wills the removal of a great wrong, and wills also that we of the North as well as you of the South, shall pay fairly for our complicity in that wrong, impartial history will find therein new cause to attest and revere the justice and goodness of God." This spirituality can best be seen in his second inaugural address, considered by some scholars as the greatest such address in American history, and by Lincoln himself as his own greatest speech, or one of them at the very least. Lincoln explains therein that the cause, purpose, and result of the war was God's will. Lincoln's frequent use of religious imagery and language toward the end of his life may have reflected his own personal beliefs or might have been a device to reach his audiences, who were mostly evangelical Protestants. On the day Lincoln was assassinated, he reportedly told his wife he desired to visit the Holy Land. Health Lincoln is believed to have had depression, smallpox, and malaria. He took blue mass pills, which contained mercury, to treat constipation. It is unknown to what extent he may have suffered from mercury poisoning. Several claims have been made that Lincoln's health was declining before the assassination. 
These are often based on photographs of Lincoln appearing to show weight loss and muscle wasting. It is also suspected that he might have had a rare genetic disease such as Marfan syndrome or multiple endocrine neoplasia type 2B.LegacyRepublican values Lincoln's redefinition of republican values has been stressed by historians such as John Patrick Diggins, Harry V. Jaffa, Vernon Burton, Eric Foner, and Herman J. Belz. Lincoln called the Declaration of Independence—which emphasized freedom and equality for all—the "sheet anchor" of republicanism beginning in the 1850s. He did this at a time when the Constitution, which "tolerated slavery", was the focus of most political discourse. Diggins notes, "Lincoln presented Americans a theory of history that offers a profound contribution to the theory and destiny of republicanism itself" in the 1860 Cooper Union speech. Instead of focusing on the legality of an argument, he focused on the moral basis of republicanism.His position on war was founded on a legal argument regarding the Constitution as essentially a contract among the states, and all parties must agree to pull out of the contract. Furthermore, it was a national duty to ensure the republic stands in every state. Many soldiers and religious leaders from the north, though, felt the fight for liberty and freedom of slaves was ordained by their moral and religious beliefs.As a Whig activist, Lincoln was a spokesman for business interests, favoring high tariffs, banks, infrastructure improvements, and railroads, in opposition to Jacksonian democrats. William C. Harris found that Lincoln's "reverence for the Founding Fathers, the Constitution, the laws under it, and the preservation of the Republic and its institutions strengthened his conservatism." James G. Randall emphasizes his tolerance and moderation "in his preference for orderly progress, his distrust of dangerous agitation, and his reluctance toward ill digested schemes of reform." 
Randall concludes that "he was conservative in his complete avoidance of that type of so-called 'radicalism' which involved abuse of the South, hatred for the slaveholder, thirst for vengeance, partisan plotting, and ungenerous demands that Southern institutions be transformed overnight by outsiders."Reunification of the statesIn Lincoln's first inaugural address, he explored the nature of democracy. He denounced secession as anarchy, and explained that majority rule had to be balanced by constitutional restraints. He said "A majority held in restraint by constitutional checks and limitations, and always changing easily with deliberate changes of popular opinions and sentiments, is the only true sovereign of a free people."The successful reunification of the states had consequences for how people viewed the country. The term "the United States" has historically been used sometimes in the plural ("these United States") and other times in the singular. The Civil War was a significant force in the eventual dominance of the singular usage by the end of the 19th century.Historical reputation In surveys of U.S. scholars ranking presidents conducted since 1948, the top three presidents are Lincoln, Washington, and Franklin Delano Roosevelt, although the order varies. Between 1999 and 2011, Lincoln, John F. Kennedy, and Ronald Reagan have been the top-ranked presidents in eight surveys, according to Gallup. A 2004 study found that scholars in the fields of history and politics ranked Lincoln number one, while legal scholars placed him second after George Washington.Lincoln's assassination left him a national martyr. He was viewed by abolitionists as a champion of human liberty. Republicans linked Lincoln's name to their party. Many, though not all, in the South considered Lincoln as a man of outstanding ability. Historians have said he was "a classical liberal" in the 19th-century sense. Allen C. 
Guelzo states that Lincoln was a "classical liberal democrat—an enemy of artificial hierarchy, a friend to trade and business as ennobling and enabling, and an American counterpart to Mill, Cobden, and Bright", whose portrait Lincoln hung in his White House office.Schwartz argues that Lincoln's American reputation grew slowly from the late 19th century until the Progressive Era (1900–1920s), when he emerged as one of America's most venerated heroes, even among white Southerners. The high point came in 1922 with the dedication of the Lincoln Memorial on the National Mall in Washington, D.C.Union nationalism, as envisioned by Lincoln, "helped lead America to the nationalism of Theodore Roosevelt, Woodrow Wilson, and Franklin Delano Roosevelt." In the New Deal era, liberals honored Lincoln not so much as the self-made man or the great war president, but as the advocate of the common man who they claimed would have supported the welfare state.Sociologist Barry Schwartz argues that in the 1930s and 1940s the memory of Abraham Lincoln was practically sacred and provided the nation with "a moral symbol inspiring and guiding American life." During the Great Depression, he argues, Lincoln served "as a means for seeing the world's disappointments, for making its sufferings not so much explicable as meaningful". Franklin D. Roosevelt, preparing America for war, used the words of the Civil War president to clarify the threat posed by Germany and Japan. Americans asked, "What would Lincoln do?" However, Schwartz also finds that since World War II Lincoln's symbolic power has lost relevance, and this "fading hero is symptomatic of fading confidence in national greatness." He suggested that postmodernism and multiculturalism have diluted greatness as a concept.In the Cold War years, Lincoln's image shifted to a symbol of freedom who brought hope to those oppressed by Communist regimes. 
By the late 1960s, some African-American intellectuals, led by Lerone Bennett Jr., rejected Lincoln's role as the Great Emancipator. Bennett won wide attention when he called Lincoln a white supremacist in 1968. He noted that Lincoln used ethnic slurs and told jokes that ridiculed blacks. Bennett argued that Lincoln opposed social equality, and proposed sending freed slaves to another country. Defenders, such as authors Dirck and Cashin, retorted that he was not as bad as most politicians of his day; and that he was a "moral visionary" who deftly advanced the abolitionist cause, as fast as politically possible. The emphasis shifted away from Lincoln the emancipator to an argument that blacks had freed themselves from slavery, or at least were responsible for pressuring the government on emancipation.By the 1970s, Lincoln had become a hero to political conservatives, apart from neo-Confederates such as Mel Bradford who denounced his treatment of the white South, for his intense nationalism, support for business, his insistence on stopping the spread of human bondage, his acting in terms of Lockean and Burkean principles on behalf of both liberty and tradition, and his devotion to the principles of the Founding Fathers. Lincoln became a favorite exemplar for liberal intellectuals across the world.Historian Barry Schwartz wrote in 2009 that Lincoln's image suffered "erosion, fading prestige, benign ridicule" in the late 20th century. On the other hand, Donald opined in his 1996 biography that Lincoln was distinctly endowed with the personality trait of negative capability, defined by the poet John Keats and attributed to extraordinary leaders who were "content in the midst of uncertainties and doubts, and not compelled toward fact or reason".In the 21st century, President Barack Obama named Lincoln his favorite president and insisted on using the Lincoln Bible for his inaugural ceremonies. 
Lincoln has often been portrayed by Hollywood, almost always in a flattering light. Memory and memorials Lincoln's portrait appears on two denominations of United States currency, the penny and the $5 bill. His likeness also appears on many postage stamps. While he is usually portrayed bearded, he did not grow a beard until 1860 at the suggestion of 11-year-old Grace Bedell. He was the first of five presidents to do so. He has been memorialized in many town, city, and county names, including the capital of Nebraska. The United States Navy aircraft carrier USS Abraham Lincoln (CVN-72) is named after Lincoln, the second Navy ship to bear his name. Lincoln Memorial is one of the most visited monuments in the nation's capital, and is one of the top five visited National Park Service sites in the country. Ford's Theatre, among the top sites in Washington, D.C., is across the street from Petersen House (where he died). Memorials in Springfield, Illinois include Abraham Lincoln Presidential Library and Museum, Lincoln's home, as well as his tomb. A portrait carving of Lincoln appears with those of three other presidents on Mount Rushmore, which receives about 3 million visitors a year.See also Outline of Abraham Lincoln Grace Bedell Lincoln Tower List of civil rights leaders List of photographs of Abraham Lincoln Lincoln (film): 2012 film by Steven Spielberg. Linconia, a proposed colony in Central America named for LincolnNotesReferencesBibliography                       Ellenberg's essay is adapted from his 2021 book, Shape: The Hidden Geometry of Information, Biology, Strategy, Democracy, and Everything Else, Penguin Press. ISBN 9781984879059External linksOfficial Abraham Lincoln Presidential Library and Museum The Lincoln Presidential Library's ongoing digitization of all documents written by or to Abraham Lincoln during his lifetime Collected Works of Abraham Lincoln – complete collected works as edited by Basler et al. 
(1958) – an online edition available through University of Michigan Library Digital Collections White House biographyOrganizations Abraham Lincoln Association  Abraham Lincoln Bicentennial FoundationMedia coverageOther Abraham Lincoln: A Resource Guide from the Library of Congress "Life Portrait of Abraham Lincoln", from C-SPAN's American presidents: Life Portraits, June 28, 1999 "Writings of Abraham Lincoln" from C-SPAN's American Writers: A Journey Through History Abraham Lincoln: Original Letters and Manuscripts – Shapell Manuscript Foundation Lincoln/Net: Abraham Lincoln Historical Digitization Project – Northern Illinois University Libraries Teaching Abraham Lincoln  – National Endowment for the Humanities    In Popular Song: Our Noble Chief Has Passed Away by Cooper/Thomas Abraham Lincoln Recollections and Newspaper Articles Collection , McLean County Museum of History Digitized items in the Alfred Whital Stern Collection of Lincolniana in the Rare Book and Special Collections Division in the Library of Congress 1809 births1865 deaths1865 murders in the United States19th-century American politicians19th-century presidents of the United StatesAmerican abolitionistsAmerican colonization movementAmerican lawyers admitted to the practice of law by reading lawAmerican military personnel of the Indian WarsAmerican militia officersAmerican nationalistsAmerican people of English descentAmerican political party foundersIllinois postmastersAmerican surveyorsAssassinated presidents of the United StatesBurials at Oak Ridge CemeteryCandidates in the 1860 United States presidential electionCandidates in the 1864 United States presidential electionHall of Fame for Great Americans inducteesIllinois Central Railroad peopleIllinois RepublicansIllinois WhigsIllinois lawyersAbrahamMale murder victimsMembers of the Illinois House of RepresentativesMembers of the United States House of Representatives from IllinoisPeople associated with the assassination of Abraham LincolnPeople 
from Coles County, IllinoisPeople from LaRue County, KentuckyPeople from Macon County, IllinoisPeople from Spencer County, IndianaPeople murdered in Washington, D.C.People of Illinois in the American Civil WarPeople with mood disordersPoliticians from Springfield, IllinoisPresidents of the United StatesRepublican Party (United States) presidential nomineesRepublican Party presidents of the United StatesUnion political leadersWhig Party members of the United States House of Representatives
+Aristotle (Aristotélēs; 384–322 BC) was a Greek philosopher and polymath during the Classical period in Ancient Greece. Taught by Plato, he was the founder of the Lyceum, the Peripatetic school of philosophy, and the Aristotelian tradition. His writings cover many subjects including physics, biology, zoology, metaphysics, logic, ethics, aesthetics, poetry, theatre, music, rhetoric, psychology, linguistics, economics, politics, meteorology, geology and government. Aristotle provided a complex synthesis of the various philosophies existing prior to him. It was above all from his teachings that the West inherited its intellectual lexicon, as well as problems and methods of inquiry. As a result, his philosophy has exerted a unique influence on almost every form of knowledge in the West and it continues to be a subject of contemporary philosophical discussion.Little is known about his life. Aristotle was born in the city of Stagira in Northern Greece. His father, Nicomachus, died when Aristotle was a child, and he was brought up by a guardian. At seventeen or eighteen years of age he joined Plato's Academy in Athens and remained there until the age of thirty-seven (c. 347 BC). Shortly after Plato died, Aristotle left Athens and, at the request of Philip II of Macedon, tutored Alexander the Great beginning in 343 BC. He established a library in the Lyceum which helped him to produce many of his hundreds of books on papyrus scrolls. Though Aristotle wrote many elegant treatises and dialogues for publication, only around a third of his original output has survived, none of it intended for publication.Aristotle's views profoundly shaped medieval scholarship. The influence of his physical science extended from Late Antiquity and the Early Middle Ages into the Renaissance, and was not replaced systematically until the Enlightenment and theories such as classical mechanics were developed. 
Some of Aristotle's zoological observations found in his biology, such as on the hectocotyl (reproductive) arm of the octopus, were disbelieved until the 19th century. He also influenced Judeo-Islamic philosophies (800–1400) during the Middle Ages, as well as Christian theology, especially the Neoplatonism of the Early Church and the scholastic tradition of the Catholic Church. Aristotle was revered among medieval Muslim scholars as "The First Teacher", and among medieval Christians like Thomas Aquinas as simply "The Philosopher", while the poet Dante called him "the master of those who know". His works contain the earliest known formal study of logic, and were studied by medieval scholars such as Peter Abelard and John Buridan.Aristotle's influence on logic continued well into the 19th century. In addition, his ethics, though always influential, gained renewed interest with the modern advent of virtue ethics.Aristotle has been called "the father of logic", "the father of biology", "the father of political science", "the father of zoology", "the father of embryology", "the father of natural law", "the father of scientific method", "the father of rhetoric", "the father of psychology", "the father of realism", "the father of criticism", "the father of individualism", "the father of teleology", and "the father of meteorology".LifeIn general, the details of Aristotle's life are not well-established. The biographies written in ancient times are often speculative and historians only agree on a few salient points.Aristotle, whose name means "the best purpose" in Ancient Greek, was born in 384 BC in Stagira, Chalcidice, about 55 km (34 miles) east of modern-day Thessaloniki. His father, Nicomachus, was the personal physician to King Amyntas of Macedon. While he was young, Aristotle learned about biology and medical information, which was taught by his father. Both of Aristotle's parents died when he was about thirteen, and Proxenus of Atarneus became his guardian. 
Although little information about Aristotle's childhood has survived, he probably spent some time within the Macedonian palace, making his first connections with the Macedonian monarchy.At the age of seventeen or eighteen, Aristotle moved to Athens to continue his education at Plato's Academy. He probably experienced the Eleusinian Mysteries as he wrote when describing the sights one viewed at the Eleusinian Mysteries, "to experience is to learn" [παθείν μαθεĩν]. Aristotle remained in Athens for nearly twenty years before leaving in 348/47 BC. The traditional story about his departure records that he was disappointed with the academy's direction after control passed to Plato's nephew Speusippus, although it is possible that he feared the anti-Macedonian sentiments in Athens at that time and left before Plato died. Aristotle then accompanied Xenocrates to the court of his friend Hermias of Atarneus in Asia Minor. After the death of Hermias, Aristotle travelled with his pupil Theophrastus to the island of Lesbos, where together they researched the botany and zoology of the island and its sheltered lagoon. While in Lesbos, Aristotle married Pythias, either Hermias's adoptive daughter or niece. She bore him a daughter, whom they also named Pythias. In 343 BC, Aristotle was invited by Philip II of Macedon to become the tutor to his son Alexander.Aristotle was appointed as the head of the royal academy of Macedon. During Aristotle's time in the Macedonian court, he gave lessons not only to Alexander but also to two other future kings: Ptolemy and Cassander. Aristotle encouraged Alexander toward eastern conquest, and Aristotle's own attitude towards Persia was unabashedly ethnocentric. In one famous example, he counsels Alexander to be "a leader to the Greeks and a despot to the barbarians, to look after the former as after friends and relatives, and to deal with the latter as with beasts or plants". 
By 335 BC, Aristotle had returned to Athens, establishing his own school there known as the Lyceum. Aristotle conducted courses at the school for the next twelve years. While in Athens, his wife Pythias died and Aristotle became involved with Herpyllis of Stagira, who bore him a son whom he named after his father, Nicomachus. If the Suda – an uncritical compilation from the Middle Ages – is accurate, he may also have had an erômenos, Palaephatus of Abydus.This period in Athens, between 335 and 323 BC, is when Aristotle is believed to have composed many of his works. He wrote many dialogues, of which only fragments have survived. Those works that have survived are in treatise form and were not, for the most part, intended for widespread publication; they are generally thought to be lecture aids for his students. His most important treatises include Physics, Metaphysics, Nicomachean Ethics, Politics, On the Soul and Poetics. Aristotle studied and made significant contributions to "logic, metaphysics, mathematics, physics, biology, botany, ethics, politics, agriculture, medicine, dance, and theatre."Near the end of his life, Alexander and Aristotle became estranged over Alexander's relationship with Persia and Persians. A widespread tradition in antiquity suspected Aristotle of playing a role in Alexander's death, but the only evidence of this is an unlikely claim made some six years after the death. Following Alexander's death, anti-Macedonian sentiment in Athens was rekindled. In 322 BC, Demophilus and Eurymedon the Hierophant reportedly denounced Aristotle for impiety, prompting him to flee to his mother's family estate in Chalcis, on Euboea, at which occasion he was said to have stated: "I will not allow the Athenians to sin twice against philosophy" – a reference to Athens's trial and execution of Socrates. 
He died on Euboea of natural causes later that same year, having named his student Antipater as his chief executor and leaving a will in which he asked to be buried next to his wife.Speculative philosophyLogicWith the Prior Analytics, Aristotle is credited with the earliest study of formal logic, and his conception of it was the dominant form of Western logic until 19th-century advances in mathematical logic. Kant stated in the Critique of Pure Reason that with Aristotle logic reached its completion.OrganonWhat is today called Aristotelian logic with its types of syllogism (methods of logical argument), Aristotle himself would have labelled "analytics". The term "logic" he reserved to mean dialectics. Most of Aristotle's work is probably not in its original form, because it was most likely edited by students and later lecturers. The logical works of Aristotle were compiled into a set of six books called the Organon around 40 BC by Andronicus of Rhodes or others among his followers. The books are: Categories On Interpretation Prior Analytics Posterior Analytics Topics On Sophistical RefutationsThe order of the books (or the teachings from which they are composed) is not certain, but this list was derived from analysis of Aristotle's writings. It goes from the basics, the analysis of simple terms in the Categories, the analysis of propositions and their elementary relations in On Interpretation, to the study of more complex forms, namely, syllogisms (in the Analytics) and dialectics (in the Topics and Sophistical Refutations). The first three treatises form the core of the logical theory stricto sensu: the grammar of the language of logic and the correct rules of reasoning. The Rhetoric is not conventionally included, but it states that it relies on the Topics.MetaphysicsThe word "metaphysics" appears to have been coined by the first century AD editor who assembled various small selections of Aristotle's works to the treatise we know by the name Metaphysics. 
Aristotle called it "first philosophy", and distinguished it from mathematics and natural science (physics) as the contemplative (theoretikē) philosophy which is "theological" and studies the divine. He wrote in his Metaphysics (1026a16):Substance Aristotle examines the concepts of substance (ousia) and essence (to ti ên einai, "the what it was to be") in his Metaphysics (Book VII), and he concludes that a particular substance is a combination of both matter and form, a philosophical theory called hylomorphism. In Book VIII, he distinguishes the matter of the substance as the substratum, or the stuff of which it is composed. For example, the matter of a house is the bricks, stones, timbers, etc., or whatever constitutes the potential house, while the form of the substance is the actual house, namely 'covering for bodies and chattels' or any other differentia that let us define something as a house. The formula that gives the components is the account of the matter, and the formula that gives the differentia is the account of the form.Immanent realism Like his teacher Plato, Aristotle's philosophy aims at the universal. Aristotle's ontology places the universal (katholou) in particulars (kath' hekaston), things in the world, whereas for Plato the universal is a separately existing form which actual things imitate. For Aristotle, "form" is still what phenomena are based on, but is "instantiated" in a particular substance.Plato argued that all things have a universal form, which could be either a property or a relation to other things. When one looks at an apple, for example, one sees an apple, and one can also analyse a form of an apple. In this distinction, there is a particular apple and a universal form of an apple. Moreover, one can place an apple next to a book, so that one can speak of both the book and apple as being next to each other. Plato argued that there are some universal forms that are not a part of particular things. 
For example, it is possible that there is no particular good in existence, but "good" is still a proper universal form. Aristotle disagreed with Plato on this point, arguing that all universals are instantiated at some period of time, and that there are no universals that are unattached to existing things. In addition, Aristotle disagreed with Plato about the location of universals. Where Plato spoke of the world of forms, a place where all universal forms subsist, Aristotle maintained that universals exist within each thing on which each universal is predicated. So, according to Aristotle, the form of apple exists within each apple, rather than in the world of the forms.Potentiality and actuality With regard to the change (kinesis) and its causes now, as he defines in his Physics and On Generation and Corruption 319b–320a, he distinguishes the coming to be from: growth and diminution, which is change in quantity; locomotion, which is change in space; and alteration, which is change in quality.The coming to be is a change where nothing persists of which the resultant is a property. In that particular change he introduces the concept of potentiality (dynamis) and actuality (entelecheia) in association with the matter and the form. Referring to potentiality, this is what a thing is capable of doing or being acted upon if the conditions are right and it is not prevented by something else. For example, the seed of a plant in the soil is potentially (dynamei) a plant, and if it is not prevented by something, it will become a plant. Potentially beings can either 'act' (poiein) or 'be acted upon' (paschein), which can be either innate or learned. For example, the eyes possess the potentiality of sight (innate – being acted upon), while the capability of playing the flute can be possessed by learning (exercise – acting). Actuality is the fulfilment of the end of the potentiality. 
Because the end (telos) is the principle of every change, and for the sake of the end exists potentiality, therefore actuality is the end. Referring then to the previous example, it can be said that an actuality is when a plant does one of the activities that plants do.In summary, the matter used to make a house has potentiality to be a house and both the activity of building and the form of the final house are actualities, which is also a final cause or end. Then Aristotle proceeds and concludes that the actuality is prior to potentiality in formula, in time and in substantiality. With this definition of the particular substance (i.e., matter and form), Aristotle tries to solve the problem of the unity of the beings, for example, "what is it that makes a man one"? Since, according to Plato there are two Ideas: animal and biped, how then is man a unity? However, according to Aristotle, the potential being (matter) and the actual one (form) are one and the same.EpistemologyAristotle's immanent realism means his epistemology is based on the study of things that exist or happen in the world, and rises to knowledge of the universal, whereas for Plato epistemology begins with knowledge of universal Forms (or ideas) and descends to knowledge of particular imitations of these. Aristotle uses induction from examples alongside deduction, whereas Plato relies on deduction from a priori principles.Natural philosophyAristotle's "natural philosophy" spans a wide range of natural phenomena including those now covered by physics, biology and other natural sciences. In Aristotle's terminology, "natural philosophy" is a branch of philosophy examining the phenomena of the natural world, and includes fields that would be regarded today as physics, biology and other natural sciences. Aristotle's work encompassed virtually all facets of intellectual inquiry. Aristotle makes philosophy in the broad sense coextensive with reasoning, which he also would describe as "science". 
However, his use of the term science carries a different meaning than that covered by the term "scientific method". For Aristotle, "all science (dianoia) is either practical, poetical or theoretical" (Metaphysics 1025b25). His practical science includes ethics and politics; his poetical science means the study of fine arts including poetry; his theoretical science covers physics, mathematics and metaphysics.PhysicsFive elementsIn his On Generation and Corruption, Aristotle related each of the four elements proposed earlier by Empedocles, Earth, Water, Air, and Fire, to two of the four sensible qualities, hot, cold, wet, and dry. In the Empedoclean scheme, all matter was made of the four elements, in differing proportions. Aristotle's scheme added the heavenly Aether, the divine substance of the heavenly spheres, stars and planets.MotionAristotle describes two kinds of motion: "violent" or "unnatural motion", such as that of a thrown stone, in the Physics (254b10), and "natural motion", such as of a falling object, in On the Heavens (300a20). In violent motion, as soon as the agent stops causing it, the motion stops also: in other words, the natural state of an object is to be at rest, since Aristotle does not address friction. With this understanding, it can be observed that, as Aristotle stated, heavy objects (on the ground, say) require more force to make them move; and objects pushed with greater force move faster. 
This would imply the equation $F = mv$, incorrect in modern physics.Natural motion depends on the element concerned: the aether naturally moves in a circle around the heavens, while the 4 Empedoclean elements move vertically up (like fire, as is observed) or down (like earth) towards their natural resting places.In the Physics (215a25), Aristotle effectively states a quantitative law, that the speed, v, of a falling body is proportional (say, with constant c) to its weight, W, and inversely proportional to the density, ρ, of the fluid in which it is falling: $v = c\frac{W}{\rho}$. Aristotle implies that in a vacuum the speed of fall would become infinite, and concludes from this apparent absurdity that a vacuum is not possible. Opinions have varied on whether Aristotle intended to state quantitative laws. Henri Carteron held the "extreme view" that Aristotle's concept of force was basically qualitative, but other authors reject this.Archimedes corrected Aristotle's theory that bodies move towards their natural resting places; metal boats can float if they displace enough water; floating depends in Archimedes' scheme on the mass and volume of the object, not, as Aristotle thought, its elementary composition.Aristotle's writings on motion remained influential until the Early Modern period. John Philoponus (in the Middle Ages) and Galileo are said to have shown by experiment that Aristotle's claim that a heavier object falls faster than a lighter object is incorrect. A contrary opinion is given by Carlo Rovelli, who argues that Aristotle's physics of motion is correct within its domain of validity, that of objects in the Earth's gravitational field immersed in a fluid such as air. 
In this system, heavy bodies in steady fall indeed travel faster than light ones (whether friction is ignored, or not), and they do fall more slowly in a denser medium.Newton's "forced" motion corresponds to Aristotle's "violent" motion with its external agent, but Aristotle's assumption that the agent's effect stops immediately it stops acting (e.g., the ball leaves the thrower's hand) has awkward consequences: he has to suppose that surrounding fluid helps to push the ball along to make it continue to rise even though the hand is no longer acting on it, resulting in the Medieval theory of impetus.Four causesAristotle suggested that the reason for anything coming about can be attributed to four different types of simultaneously active factors. His term aitia is traditionally translated as "cause", but it does not always refer to temporal sequence; it might be better translated as "explanation", but the traditional rendering will be employed here. Material cause describes the material out of which something is composed. Thus the material cause of a table is wood. It is not about action. It does not mean that one domino knocks over another domino. The formal cause is its form, i.e., the arrangement of that matter. It tells one what a thing is, that a thing is determined by the definition, form, pattern, essence, whole, synthesis or archetype. It embraces the account of causes in terms of fundamental principles or general laws, as the whole (i.e., macrostructure) is the cause of its parts, a relationship known as the whole-part causation. Plainly put, the formal cause is the idea in the mind of the sculptor that brings the sculpture into being. A simple example of the formal cause is the mental image or idea that allows an artist, architect, or engineer to create a drawing. The efficient cause is "the primary source", or that from which the change under consideration proceeds. 
It identifies 'what makes of what is made and what causes change of what is changed' and so suggests all sorts of agents, non-living or living, acting as the sources of change or movement or rest. Representing the current understanding of causality as the relation of cause and effect, this covers the modern definitions of "cause" as either the agent or agency or particular events or states of affairs. In the case of two dominoes, when the first is knocked over it causes the second also to fall over. In the case of animals, this agency is a combination of how it develops from the egg, and how its body functions. The final cause (telos) is its purpose, the reason why a thing exists or is done, including both purposeful and instrumental actions and activities. The final cause is the purpose or function that something is supposed to serve. This covers modern ideas of motivating causes, such as volition. In the case of living things, it implies adaptation to a particular way of life.OpticsAristotle describes experiments in optics using a camera obscura in Problems, book 15. The apparatus consisted of a dark chamber with a small aperture that let light in. With it, he saw that whatever shape he made the hole, the sun's image always remained circular. He also noted that increasing the distance between the aperture and the image surface magnified the image.Chance and spontaneityAccording to Aristotle, spontaneity and chance are causes of some things, distinguishable from other types of cause such as simple necessity. Chance as an incidental cause lies in the realm of accidental things, "from what is spontaneous". 
There is also a more specific kind of chance, which Aristotle names "luck", that only applies to people's moral choices.AstronomyIn astronomy, Aristotle refuted Democritus's claim that the Milky Way was made up of "those stars which are shaded by the earth from the sun's rays," pointing out correctly that if "the size of the sun is greater than that of the earth and the distance of the stars from the earth many times greater than that of the sun, then... the sun shines on all the stars and the earth screens none of them."Geology/Natural SciencesAristotle was one of the first people to record any geological observations. He stated that geological change was too slow to be observed in one person's lifetime.The geologist Charles Lyell noted that Aristotle described such change, including "lakes that had dried up" and "deserts that had become watered by rivers", giving as examples the growth of the Nile delta since the time of Homer, and "the upheaving of one of the Aeolian islands, previous to a volcanic eruption."Aristotle also made many observations about the hydrologic cycle and meteorology (including his major writings "Meteorologica"). For example, he made some of the earliest observations about desalination: he observed early – and correctly – that when seawater is heated, freshwater evaporates and that the oceans are then replenished by the cycle of rainfall and river runoff ("I have proved by experiment that salt water evaporated forms fresh and the vapor does not when it condenses condense into sea water again.")BiologyEmpirical researchAristotle was the first person to study biology systematically, and biology forms a large part of his writings. He spent two years observing and describing the zoology of Lesbos and the surrounding seas, including in particular the Pyrrha lagoon in the centre of Lesbos. 
His data in History of Animals, Generation of Animals, Movement of Animals, and Parts of Animals are assembled from his own observations, statements given by people with specialized knowledge such as beekeepers and fishermen, and less accurate accounts provided by travellers from overseas. His apparent emphasis on animals rather than plants is a historical accident: his works on botany have been lost, but two books on plants by his pupil Theophrastus have survived.Aristotle reports on the sea-life visible from observation on Lesbos and the catches of fishermen. He describes the catfish, electric ray, and frogfish in detail, as well as cephalopods such as the octopus and paper nautilus. His description of the hectocotyl arm of cephalopods, used in sexual reproduction, was widely disbelieved until the 19th century. He gives accurate descriptions of the four-chambered fore-stomachs of ruminants, and of the ovoviviparous embryological development of the hound shark.He notes that an animal's structure is well matched to function, so, among birds, the heron, which lives in marshes with soft mud and lives by catching fish, has a long neck and long legs, and a sharp spear-like beak, whereas ducks that swim have short legs and webbed feet. Darwin, too, noted these sorts of differences between similar kinds of animal, but unlike Aristotle used the data to come to the theory of evolution. Aristotle's writings can seem to modern readers close to implying evolution, but while Aristotle was aware that new mutations or hybridizations could occur, he saw these as rare accidents. For Aristotle, accidents, like heat waves in winter, must be considered distinct from natural causes. He was thus critical of Empedocles's materialist theory of a "survival of the fittest" origin of living things and their organs, and ridiculed the idea that accidents could lead to orderly results. 
To put his views into modern terms, he nowhere says that different species can have a common ancestor, or that one kind can change into another, or that kinds can become extinct.Scientific styleAristotle did not do experiments in the modern sense. He used the ancient Greek term pepeiramenoi to mean observations, or at most investigative procedures like dissection. In Generation of Animals, he finds a fertilized hen's egg of a suitable stage and opens it to see the embryo's heart beating inside.Instead, he practiced a different style of science: systematically gathering data, discovering patterns common to whole groups of animals, and inferring possible causal explanations from these. This style is common in modern biology when large amounts of data become available in a new field, such as genomics. It does not result in the same certainty as experimental science, but it sets out testable hypotheses and constructs a narrative explanation of what is observed. In this sense, Aristotle's biology is scientific.From the data he collected and documented, Aristotle inferred quite a number of rules relating the life-history features of the live-bearing tetrapods (terrestrial placental mammals) that he studied. Among these correct predictions are the following. Brood size decreases with (adult) body mass, so that an elephant has fewer young (usually just one) per brood than a mouse. Lifespan increases with gestation period, and also with body mass, so that elephants live longer than mice, have a longer period of gestation, and are heavier. As a final example, fecundity decreases with lifespan, so long-lived kinds like elephants have fewer young in total than short-lived kinds like mice.Classification of living thingsAristotle distinguished about 500 species of animals, arranging these in the History of Animals in a graded scale of perfection, a nonreligious version of the scala naturae, with man at the top. 
His system had eleven grades of animal, from highest potential to lowest, expressed in their form at birth: the highest gave live birth to hot and wet creatures, the lowest laid cold, dry mineral-like eggs. Animals came above plants, and these in turn were above minerals. He grouped what the modern zoologist would call vertebrates as the hotter "animals with blood", and below them the colder invertebrates as "animals without blood". Those with blood were divided into the live-bearing (mammals), and the egg-laying (birds, reptiles, fish). Those without blood were insects, crustacea (non-shelled – cephalopods, and shelled) and the hard-shelled molluscs (bivalves and gastropods). He recognised that animals did not exactly fit into a linear scale, and noted various exceptions, such as that sharks had a placenta like the tetrapods. To a modern biologist, the explanation, not available to Aristotle, is convergent evolution. Philosophers of science have generally concluded that Aristotle was not interested in taxonomy, but zoologists who studied this question recently think otherwise. He believed that purposive final causes guided all natural processes; this teleological view justified his observed data as an expression of formal design.PsychologySoulAristotle's psychology, given in his treatise On the Soul (peri psychēs), posits three kinds of soul ("psyches"): the vegetative soul, the sensitive soul, and the rational soul. Humans have a rational soul. The human soul incorporates the powers of the other kinds: Like the vegetative soul it can grow and nourish itself; like the sensitive soul it can experience sensations and move locally. The unique part of the human, rational soul is its ability to receive forms of other things and to compare them using the nous (intellect) and logos (reason).For Aristotle, the soul is the form of a living being. 
Because all beings are composites of form and matter, the form of living beings is that which endows them with what is specific to living beings, e.g. the ability to initiate movement (or in the case of plants, growth and chemical transformations, which Aristotle considers types of movement). In contrast to earlier philosophers, but in accordance with the Egyptians, he placed the rational soul in the heart, rather than the brain. Notable is Aristotle's division of sensation and thought, which generally differed from the concepts of previous philosophers, with the exception of Alcmaeon.MemoryAccording to Aristotle in On the Soul, memory is the ability to hold a perceived experience in the mind and to distinguish between the internal "appearance" and an occurrence in the past. In other words, a memory is a mental picture (phantasm) that can be recovered. Aristotle believed an impression is left on a semi-fluid bodily organ that undergoes several changes in order to make a memory. A memory occurs when stimuli such as sights or sounds are so complex that the nervous system cannot receive all the impressions at once. These changes are the same as those involved in the operations of sensation, Aristotelian "common sense", and thinking.Aristotle uses the term 'memory' for the actual retaining of an experience in the impression that can develop from sensation, and for the intellectual anxiety that comes with the impression because it is formed at a particular time and processing specific contents. Memory is of the past, prediction is of the future, and sensation is of the present. Retrieval of impressions cannot be performed suddenly. A transitional channel is needed and located in past experiences, both for previous experience and present experience.Because Aristotle believes people receive all kinds of sense perceptions and perceive them as impressions, people are continually weaving together new impressions of experiences. 
To search for these impressions, people search the memory itself. Within the memory, if one experience is offered instead of a specific memory, that person will reject this experience until they find what they are looking for. Recollection occurs when one retrieved experience naturally follows another. If the chain of "images" is needed, one memory will stimulate the next. When people recall experiences, they stimulate certain previous experiences until they reach the one that is needed. Recollection is thus the self-directed activity of retrieving the information stored in a memory impression. Only humans can remember impressions of intellectual activity, such as numbers and words. Animals that have perception of time can retrieve memories of their past observations. Remembering involves only perception of the things remembered and of the time passed.Aristotle believed the chain of thought, which ends in recollection of certain impressions, was connected systematically in relationships such as similarity, contrast, and contiguity, described in his laws of association. Aristotle believed that past experiences are hidden within the mind. A force operates to awaken the hidden material to bring up the actual experience. According to Aristotle, association is the power innate in a mental state, which operates upon the unexpressed remains of former experiences, allowing them to rise and be recalled.DreamsAristotle describes sleep in On Sleep and Wakefulness. Sleep takes place as a result of overuse of the senses or of digestion, so it is vital to the body. While a person is asleep, the critical activities, which include thinking, sensing, recalling and remembering, do not function as they do during wakefulness. Since a person cannot sense during sleep they cannot have desire, which is the result of sensation. However, the senses are able to work during sleep, albeit differently, unless they are weary.Dreams do not involve actually sensing a stimulus. 
In dreams, sensation is still involved, but in an altered manner. Aristotle explains that when a person stares at a moving stimulus such as the waves in a body of water, and then looks away, the next thing they look at appears to have a wavelike motion. When a person perceives a stimulus and the stimulus is no longer the focus of their attention, it leaves an impression. When the body is awake and the senses are functioning properly, a person constantly encounters new stimuli to sense and so the impressions of previously perceived stimuli are ignored. However, during sleep the impressions made throughout the day are noticed as there are no new distracting sensory experiences. So, dreams result from these lasting impressions. Since impressions are all that are left and not the exact stimuli, dreams do not resemble the actual waking experience. During sleep, a person is in an altered state of mind. Aristotle compares a sleeping person to a person who is overtaken by strong feelings toward a stimulus. For example, a person who has a strong infatuation with someone may begin to think they see that person everywhere because they are so overtaken by their feelings. Since a person sleeping is in a suggestible state and unable to make judgements, they become easily deceived by what appears in their dreams, like the infatuated person. This leads the person to believe the dream is real, even when the dreams are absurd in nature. In De Anima iii 3, Aristotle ascribes the ability to create, to store, and to recall images in the absence of perception to the faculty of imagination, phantasia.One component of Aristotle's theory of dreams disagrees with previously held beliefs. He claimed that dreams are not foretelling and not sent by a divine being. Aristotle reasoned naturalistically that instances in which dreams do resemble future events are simply coincidences. Aristotle claimed that a dream is first established by the fact that the person is asleep when they experience it. 
If a person had an image appear for a moment after waking up or if they see something in the dark it is not considered a dream because they were awake when it occurred. Secondly, any sensory experience that is perceived while a person is asleep does not qualify as part of a dream. For example, if, while a person is sleeping, a door shuts and in their dream they hear a door is shut, this sensory experience is not part of the dream. Lastly, the images of dreams must be a result of lasting impressions of waking sensory experiences.Practical philosophyAristotle's practical philosophy covers areas such as ethics, politics, economics, and rhetoric.EthicsAristotle considered ethics to be a practical rather than theoretical study, i.e., one aimed at becoming good and doing good rather than knowing for its own sake. He wrote several treatises on ethics, including most notably, the Nicomachean Ethics.Aristotle taught that virtue has to do with the proper function (ergon) of a thing. An eye is only a good eye in so much as it can see, because the proper function of an eye is sight. Aristotle reasoned that humans must have a function specific to humans, and that this function must be an activity of the psuchē (soul) in accordance with reason (logos). Aristotle identified such an optimum activity (the virtuous mean, between the accompanying vices of excess or deficiency) of the soul as the aim of all human deliberate action, eudaimonia, generally translated as "happiness" or sometimes "well-being". To have the potential of ever being happy in this way necessarily requires a good character (ēthikē aretē), often translated as moral or ethical virtue or excellence.Aristotle taught that to achieve a virtuous and potentially happy character requires a first stage of having the fortune to be habituated not deliberately, but by teachers, and experience, leading to a later stage in which one consciously chooses to do the best things. 
When the best people come to live life this way their practical wisdom (phronesis) and their intellect (nous) can develop with each other towards the highest possible human virtue, the wisdom of an accomplished theoretical or speculative thinker, or in other words, a philosopher.PoliticsIn addition to his works on ethics, which address the individual, Aristotle addressed the city in his work titled Politics. Aristotle considered the city to be a natural community. Moreover, he considered the city to be prior in importance to the family which in turn is prior to the individual, "for the whole must of necessity be prior to the part". He famously stated that "man is by nature a political animal" and argued that humanity's defining factor among others in the animal kingdom is its rationality. Aristotle conceived of politics as being like an organism rather than like a machine, and as a collection of parts none of which can exist without the others. Aristotle's conception of the city is organic, and he is considered one of the first to conceive of the city in this manner.The common modern understanding of a political community as a modern state is quite different from Aristotle's understanding. Although he was aware of the existence and potential of larger empires, the natural community according to Aristotle was the city (polis) which functions as a political "community" or "partnership" (koinōnia). The aim of the city is not just to avoid injustice or for economic stability, but rather to allow at least some citizens the possibility to live a good life, and to perform beautiful acts: "The political partnership must be regarded, therefore, as being for the sake of noble actions, not for the sake of living together." 
This is distinguished from modern approaches, beginning with social contract theory, according to which individuals leave the state of nature because of "fear of violent death" or its "inconveniences."In Protrepticus, the character 'Aristotle' states:As Plato's disciple Aristotle was rather skeptical concerning democracy and, following Plato's vague ideas, he developed a coherent theory of integrating various forms of power into a so-called mixed state:To illustrate this approach, Aristotle proposed a first-of-its-kind mathematical model of voting, albeit textually described, where the democratic principle of "one voter–one vote" is combined with the oligarchic "merit-weighted voting"; for relevant quotes and their translation into mathematical formulas see.EconomicsAristotle made substantial contributions to economic thought, especially to thought in the Middle Ages. In Politics, Aristotle addresses the city, property, and trade. His response to criticisms of private property, in Lionel Robbins's view, anticipated later proponents of private property among philosophers and economists, as it related to the overall utility of social arrangements. Aristotle believed that although communal arrangements may seem beneficial to society, and that although private property is often blamed for social strife, such evils in fact come from human nature. In Politics, Aristotle offers one of the earliest accounts of the origin of money. Money came into use because people became dependent on one another, importing what they needed and exporting the surplus. For the sake of convenience, people then agreed to deal in something that is intrinsically useful and easily applicable, such as iron or silver.Aristotle's discussions on retail and interest were a major influence on economic thought in the Middle Ages. He had a low opinion of retail, believing that contrary to using money to procure things one needs in managing the household, retail trade seeks to make a profit. 
It thus uses goods as a means to an end, rather than as an end unto itself. He believed that retail trade was in this way unnatural. Similarly, Aristotle considered making a profit through interest unnatural, as it makes a gain out of the money itself, and not from its use.Aristotle gave a summary of the function of money that was perhaps remarkably precocious for his time. He wrote that because it is impossible to determine the value of every good through a count of the number of other goods it is worth, the necessity arises of a single universal standard of measurement. Money thus allows for the association of different goods and makes them "commensurable". He goes on to state that money is also useful for future exchange, making it a sort of security. That is, "if we do not want a thing now, we shall be able to get it when we do want it".Rhetoric and poeticsAristotle's Rhetoric proposes that a speaker can use three basic kinds of appeals to persuade his audience: ethos (an appeal to the speaker's character), pathos (an appeal to the audience's emotion), and logos (an appeal to logical reasoning). He also categorizes rhetoric into three genres: epideictic (ceremonial speeches dealing with praise or blame), forensic (judicial speeches over guilt or innocence), and deliberative (speeches calling on an audience to make a decision on an issue). Aristotle also outlines two kinds of rhetorical proofs: enthymeme (proof by syllogism) and paradeigma (proof by example).Aristotle writes in his Poetics that epic poetry, tragedy, comedy, dithyrambic poetry, painting, sculpture, music, and dance are all fundamentally acts of mimesis ("imitation"), each varying in imitation by medium, object, and manner. He applies the term mimesis both as a property of a work of art and also as the product of the artist's intention and contends that the audience's realisation of the mimesis is vital to understanding the work itself. 
Aristotle states that mimesis is a natural instinct of humanity that separates humans from animals and that all human artistry "follows the pattern of nature". Because of this, Aristotle believed that each of the mimetic arts possesses what Stephen Halliwell calls "highly structured procedures for the achievement of their purposes." For example, music imitates with the media of rhythm and harmony, whereas dance imitates with rhythm alone, and poetry with language. The forms also differ in their object of imitation. Comedy, for instance, is a dramatic imitation of men worse than average; whereas tragedy imitates men slightly better than average. Lastly, the forms differ in their manner of imitation – through narrative or character, through change or no change, and through drama or no drama.While it is believed that Aristotle's Poetics originally comprised two books – one on comedy and one on tragedy – only the portion that focuses on tragedy has survived. Aristotle taught that tragedy is composed of six elements: plot-structure, character, style, thought, spectacle, and lyric poetry. The characters in a tragedy are merely a means of driving the story; and the plot, not the characters, is the chief focus of tragedy. Tragedy is the imitation of action arousing pity and fear, and is meant to effect the catharsis of those same emotions. Aristotle concludes Poetics with a discussion on which, if either, is superior: epic or tragic mimesis. He suggests that because tragedy possesses all the attributes of an epic, possibly possesses additional attributes such as spectacle and music, is more unified, and achieves the aim of its mimesis in shorter scope, it can be considered superior to epic. 
Aristotle was a keen systematic collector of riddles, folklore, and proverbs; he and his school had a special interest in the riddles of the Delphic Oracle and studied the fables of Aesop.Views on womenAristotle's analysis of procreation describes an active, ensouling masculine element bringing life to an inert, passive female element. On this ground, proponents of feminist metaphysics have accused Aristotle of misogyny and sexism. However, Aristotle gave equal weight to women's happiness as he did to men's, and commented in his Rhetoric that the things that lead to happiness need to be in women as well as men.InfluenceMore than 2300 years after his death, Aristotle remains one of the most influential people who ever lived. He contributed to almost every field of human knowledge then in existence, and he was the founder of many new fields. According to the philosopher Bryan Magee, "it is doubtful whether any human being has ever known as much as he did". Among countless other achievements, Aristotle was the founder of formal logic, pioneered the study of zoology, and left every future scientist and philosopher in his debt through his contributions to the scientific method. Taneli Kukkonen, writing in The Classical Tradition, observes that his achievement in founding two sciences is unmatched, and his reach in influencing "every branch of intellectual enterprise" including Western ethical and political theory, theology, rhetoric and literary analysis is equally long. As a result, Kukkonen argues, any analysis of reality today "will almost certainly carry Aristotelian overtones ... evidence of an exceptionally forceful mind." Jonathan Barnes wrote that "an account of Aristotle's intellectual afterlife would be little less than a history of European thought".On his successor, TheophrastusAristotle's pupil and successor, Theophrastus, wrote the History of Plants, a pioneering work in botany. 
Some of his technical terms remain in use, such as carpel from carpos, fruit, and pericarp, from pericarpion, seed chamber.Theophrastus was much less concerned with formal causes than Aristotle was, instead pragmatically describing how plants functioned.On later Greek philosophersThe immediate influence of Aristotle's work was felt as the Lyceum grew into the Peripatetic school. Aristotle's notable students included Aristoxenus, Dicaearchus, Demetrius of Phalerum, Eudemos of Rhodes, Harpalus, Hephaestion, Mnason of Phocis, Nicomachus, and Theophrastus. Aristotle's influence over Alexander the Great is seen in the latter's bringing with him on his expedition a host of zoologists, botanists, and researchers. He had also learned a great deal about Persian customs and traditions from his teacher. Although his respect for Aristotle was diminished as his travels made it clear that much of Aristotle's geography was clearly wrong, when the old philosopher released his works to the public, Alexander complained "Thou hast not done well to publish thy acroamatic doctrines; for in what shall I surpass other men if those doctrines wherein I have been trained are to be all men's common property?"On Hellenistic scienceAfter Theophrastus, the Lyceum failed to produce any original work. Though interest in Aristotle's ideas survived, they were generally taken unquestioningly. It is not until the age of Alexandria under the Ptolemies that advances in biology can be again found.The first medical teacher at Alexandria, Herophilus of Chalcedon, corrected Aristotle, placing intelligence in the brain, and connected the nervous system to motion and sensation. Herophilus also distinguished between veins and arteries, noting that the latter pulse while the former do not. 
Though a few ancient atomists such as Lucretius challenged the teleological viewpoint of Aristotelian ideas about life, teleology (and after the rise of Christianity, natural theology) would remain central to biological thought essentially until the 18th and 19th centuries. Ernst Mayr states that there was "nothing of any real consequence in biology after Lucretius and Galen until the Renaissance."On Byzantine scholarsGreek Christian scribes played a crucial role in the preservation of Aristotle by copying all the extant Greek language manuscripts of the corpus. The first Greek Christians to comment extensively on Aristotle were Philoponus, Elias, and David in the sixth century, and Stephen of Alexandria in the early seventh century. John Philoponus stands out for having attempted a fundamental critique of Aristotle's views on the eternity of the world, movement, and other elements of Aristotelian thought. Philoponus questioned Aristotle's teaching of physics, noting its flaws and introducing the theory of impetus to explain his observations.After a hiatus of several centuries, formal commentary by Eustratius and Michael of Ephesus reappeared in the late eleventh and early twelfth centuries, apparently sponsored by Anna Comnena.On the medieval Islamic worldAristotle was one of the most revered Western thinkers in early Islamic theology. Most of the still extant works of Aristotle, as well as a number of the original Greek commentaries, were translated into Arabic and studied by Muslim philosophers, scientists and scholars. Averroes, Avicenna and Alpharabius, who wrote on Aristotle in great depth, also influenced Thomas Aquinas and other Western Christian scholastic philosophers. Alkindus greatly admired Aristotle's philosophy, and Averroes spoke of Aristotle as the "exemplar" for all future philosophers. Medieval Muslim scholars regularly described Aristotle as the "First Teacher". 
The title "teacher" was first given to Aristotle by Muslim scholars, and was later used by Western philosophers (as in the famous poem of Dante) who were influenced by the tradition of Islamic philosophy.On medieval EuropeWith the loss of the study of ancient Greek in the early medieval Latin West, Aristotle was practically unknown there from c. AD 600 to c. 1100 except through the Latin translation of the Organon made by Boethius. In the twelfth and thirteenth centuries, interest in Aristotle revived and Latin Christians had translations made, both from Arabic translations, such as those by Gerard of Cremona, and from the original Greek, such as those by James of Venice and William of Moerbeke. After the Scholastic Thomas Aquinas wrote his Summa Theologica, working from Moerbeke's translations and calling Aristotle "The Philosopher", the demand for Aristotle's writings grew, and the Greek manuscripts returned to the West, stimulating a revival of Aristotelianism in Europe that continued into the Renaissance. These thinkers blended Aristotelian philosophy with Christianity, bringing the thought of Ancient Greece into the Middle Ages. Scholars such as Boethius, Peter Abelard, and John Buridan worked on Aristotelian logic.The medieval English poet Chaucer describes his student as being happy by havingA cautionary medieval tale held that Aristotle advised his pupil Alexander to avoid the king's seductive mistress, Phyllis, but was himself captivated by her, and allowed her to ride him. Phyllis had secretly told Alexander what to expect, and he witnessed Phyllis proving that a woman's charms could overcome even the greatest philosopher's male intellect. Artists such as Hans Baldung produced a series of illustrations of the popular theme.The Italian poet Dante says of Aristotle in The Divine Comedy:Besides Dante's fellow poets, the classical figure that most influenced the Comedy is Aristotle. 
Dante built up the philosophy of the Comedy with the works of Aristotle as a foundation, just as the scholastics used Aristotle as the basis for their thinking. Dante knew Aristotle directly from Latin translations of his works and indirectly from quotations in the works of Albertus Magnus. Dante even acknowledges Aristotle's influence explicitly in the poem, specifically when Virgil justifies the Inferno's structure by citing the Nicomachean Ethics.On medieval JudaismMoses Maimonides (considered to be the foremost intellectual figure of medieval Judaism) adopted Aristotelianism from the Islamic scholars and based his Guide for the Perplexed on it and that became the basis of Jewish scholastic philosophy. Maimonides also considered Aristotle to be the greatest philosopher that ever lived, and styled him as the "chief of the philosophers". Also, in his letter to Samuel ibn Tibbon, Maimonides observes that there is no need for Samuel to study the writings of philosophers who preceded Aristotle because the works of the latter are "sufficient by themselves and [superior] to all that were written before them. His intellect, Aristotle's is the extreme limit of human intellect, apart from him upon whom the divine emanation has flowed forth to such an extent that they reach the level of prophecy, there being no level higher".On Early Modern scientistsIn the Early Modern period, scientists such as William Harvey in England and Galileo Galilei in Italy reacted against the theories of Aristotle and other classical era thinkers like Galen, establishing new theories based to some degree on observation and experiment. Harvey demonstrated the circulation of the blood, establishing that the heart functioned as a pump rather than being the seat of the soul and the controller of the body's heat, as Aristotle thought. 
Galileo used more doubtful arguments to displace Aristotle's physics, proposing that bodies all fall at the same speed whatever their weight.On 18th/19th-century thinkersThe 19th-century German philosopher Friedrich Nietzsche has been said to have taken nearly all of his political philosophy from Aristotle. Aristotle rigidly separated action from production, and argued for the deserved subservience of some people ("natural slaves"), and the natural superiority (virtue, arete) of others. It was Martin Heidegger, not Nietzsche, who elaborated a new interpretation of Aristotle, intended to warrant his deconstruction of scholastic and philosophical tradition.The English mathematician George Boole fully accepted Aristotle's logic, but decided "to go under, over, and beyond" it with his system of algebraic logic in his 1854 book The Laws of Thought. This gives logic a mathematical foundation with equations, enables it to solve equations as well as check validity, and allows it to handle a wider class of problems by expanding propositions of any number of terms, not just two.Charles Darwin regarded Aristotle as the most important contributor to the subject of biology. In an 1882 letter he wrote that "Linnaeus and Cuvier have been my two gods, though in very different ways, but they were mere schoolboys to old Aristotle". Also, in later editions of the book "On the Origin of Species", Darwin traced evolutionary ideas as far back as Aristotle; the text he cites is a summary by Aristotle of the ideas of the earlier Greek philosopher Empedocles.James Joyce's favoured philosopher was Aristotle, whom he considered to be "the greatest thinker of all times". Samuel Taylor Coleridge said: Everybody is born either a Platonist or an Aristotelian. Ayn Rand acknowledged Aristotle as her greatest influence and remarked that in the history of philosophy she could only recommend "three A's"—Aristotle, Aquinas, and Ayn Rand. 
She also regarded Aristotle as the greatest of all philosophers.Karl Marx considered Aristotle to be the "greatest thinker of antiquity", and called him a "giant thinker", a "genius", and "the great scholar".Modern rejection and rehabilitationDuring the 20th century, Aristotle's work was widely criticized. The philosopher Bertrand Russell argued that "almost every serious intellectual advance has had to begin with an attack on some Aristotelian doctrine". Russell called Aristotle's ethics "repulsive", and labelled his logic "as definitely antiquated as Ptolemaic astronomy". Russell stated that these errors made it difficult to do historical justice to Aristotle, until one remembered what an advance he made upon all of his predecessors.The Dutch historian of science Eduard Jan Dijksterhuis wrote that Aristotle and his predecessors showed the difficulty of science by "proceed[ing] so readily to frame a theory of such a general character" on limited evidence from their senses. In 1985, the biologist Peter Medawar could still state in "pure seventeenth century" tones that Aristotle had assembled "a strange and generally speaking rather tiresome farrago of hearsay, imperfect observation, wishful thinking and credulity amounting to downright gullibility". Hobbes rejected one of the most famous theses of Aristotle's politics, namely that human beings are naturally suited to life in a polis and do not fully realize their natures until they exercise the role of citizen.By the start of the 21st century, however, Aristotle was taken more seriously: Kukkonen noted that "In the best 20th-century scholarship Aristotle comes alive as a thinker wrestling with the full weight of the Greek philosophical tradition." Alasdair MacIntyre has attempted to reform what he calls the Aristotelian tradition in a way that is anti-elitist and capable of disputing the claims of both liberals and Nietzscheans. 
Kukkonen observed, too, that "that most enduring of romantic images, Aristotle tutoring the future conqueror Alexander" remained current, as in the 2004 film Alexander, while the "firm rules" of Aristotle's theory of drama have ensured a role for the Poetics in Hollywood.Biologists continue to be interested in Aristotle's thinking. Armand Marie Leroi has reconstructed Aristotle's biology, while Niko Tinbergen's four questions, based on Aristotle's four causes, are used to analyse animal behaviour; they examine function, phylogeny, mechanism, and ontogeny.Surviving worksCorpus AristotelicumThe works of Aristotle that have survived from antiquity through medieval manuscript transmission are collected in the Corpus Aristotelicum. These texts, as opposed to Aristotle's lost works, are technical philosophical treatises from within Aristotle's school. Reference to them is made according to the organization of Immanuel Bekker's Royal Prussian Academy edition (Aristotelis Opera edidit Academia Regia Borussica, Berlin, 1831–1870), which in turn is based on ancient classifications of these works.Loss and preservationAristotle wrote his works on papyrus scrolls, the common writing medium of that era. His writings are divisible into two groups: the "exoteric", intended for the public, and the "esoteric", for use within the Lyceum school. Aristotle's "lost" works stray considerably in characterization from the surviving Aristotelian corpus. Whereas the lost works appear to have been originally written with a view to subsequent publication, the surviving works mostly resemble lecture notes not intended for publication. Cicero's description of Aristotle's literary style as "a river of gold" must have applied to the published works, not the surviving notes. A major question in the history of Aristotle's works is how the exoteric writings were all lost, and how the ones now possessed came to be found. 
The consensus is that Andronicus of Rhodes collected the esoteric works of Aristotle's school which existed in the form of smaller, separate works, distinguished them from those of Theophrastus and other Peripatetics, edited them, and finally compiled them into the more cohesive, larger works as they are known today.LegacyDepictionsPaintingsAristotle has been depicted by major artists including Lucas Cranach the Elder, Justus van Gent, Raphael, Paolo Veronese, Jusepe de Ribera, Rembrandt, and Francesco Hayez over the centuries. Among the best-known depictions is Raphael's fresco The School of Athens, in the Vatican's Apostolic Palace, where the figures of Plato and Aristotle are central to the image, at the architectural vanishing point, reflecting their importance. Rembrandt's Aristotle with a Bust of Homer, too, is a celebrated work, showing the knowing philosopher and the blind Homer from an earlier age: as the art critic Jonathan Jones writes, "this painting will remain one of the greatest and most mysterious in the world, ensnaring us in its musty, glowing, pitch-black, terrible knowledge of time."SculpturesEponymsThe Aristotle Mountains in Antarctica are named after Aristotle. He was the first person known to conjecture, in his book Meteorology, the existence of a landmass in the southern high-latitude region and called it Antarctica. Aristoteles is a crater on the Moon bearing the classical form of Aristotle's name.See also Aristotelian SocietyAristotle's Biology Conimbricenses PerfectionismReferencesNotesCitationsSourcesFurther readingThe secondary literature on Aristotle is vast. The following is only a small selection. Ackrill, J. L. (1997). Essays on Plato and Aristotle, Oxford University Press.     These translations are available in several places online; see External links. Bakalis, Nikolaos. (2005). Handbook of Greek Philosophy: From Thales to the Stoics Analysis and Fragments, Trafford Publishing, .  Bolotin, David (1998). 
An Approach to Aristotle's Physics: With Particular Attention to the Role of His Manner of Writing. Albany: SUNY Press. A contribution to our understanding of how to read Aristotle's scientific works. Burnyeat, Myles F. et al. (1979). Notes on Book Zeta of Aristotle's Metaphysics. Oxford: Sub-faculty of Philosophy.   Code, Alan (1995). Potentiality in Aristotle's Science and Metaphysics, Pacific Philosophical Quarterly 76.   De Groot, Jean (2014). Aristotle's Empiricism: Experience and Mechanics in the 4th century BC, Parmenides Publishing, . Frede, Michael (1987). Essays in Ancient Philosophy. Minneapolis: University of Minnesota Press.  Gendlin, Eugene T. (2012). Line by Line Commentary on Aristotle's De Anima , Volume 1: Books I & II; Volume 2: Book III. The Focusing Institute. Gill, Mary Louise (1989). Aristotle on Substance: The Paradox of Unity. Princeton University Press.      Jori, Alberto (2003). Aristotele, Bruno Mondadori (Prize 2003 of the "International Academy of the History of Science"), .  Knight, Kelvin (2007). Aristotelian Philosophy: Ethics and Politics from Aristotle to MacIntyre, Polity Press. Lewis, Frank A. (1991). Substance and Predication in Aristotle. Cambridge University Press. Lord, Carnes (1984). Introduction to The Politics, by Aristotle. Chicago University Press. Loux, Michael J. (1991). Primary Ousia: An Essay on Aristotle's Metaphysics Ζ and Η. Ithaca, NY: Cornell University Press. Maso, Stefano (Ed.), Natali, Carlo (Ed.), Seel, Gerhard (Ed.) (2012) Reading Aristotle: Physics VII. 3: What is Alteration? Proceedings of the International ESAP-HYELE Conference, Parmenides Publishing. .   [Reprinted in J. Barnes, M. Schofield, and R.R.K. Sorabji, eds.(1975). Articles on Aristotle Vol 1. Science. London: Duckworth 14–34.]   Reeve, C. D. C. (2000). Substantial Knowledge: Aristotle's Metaphysics. Hackett.   Scaltsas, T. (1994). Substances and Universals in Aristotle's Metaphysics. Cornell University Press. Strauss, Leo (1964). 
"On Aristotle's Politics", in The City and Man, Rand McNally.External links   At the Internet Encyclopedia of Philosophy:  At the Internet Classics Archive From the Stanford Encyclopedia of Philosophy:   Collections of works  At Massachusetts Institute of Technology      Perseus Project at Tufts University At the University of Adelaide   P. Remacle The 11-volume 1837 Bekker edition of Aristotle's Works in Greek (PDFDJVU) 384 BC births322 BC deaths4th-century BC mathematicians4th-century BC philosophers4th-century BC writersAcademic philosophersActing theoristsAncient Greek biologistsAncient Greek economistsAncient Greek epistemologistsAncient Greek ethicistsAncient Greek logiciansAncient Greek mathematiciansAncient Greek metaphilosophersAncient Greek metaphysiciansAncient Greek philosophersAncient Greek philosophers of languageAncient Greek philosophers of mindAncient Greek physicistsAncient Greek political philosophersAncient Greek philosophers of artAncient literary criticsAncient StagiritesAphoristsAristotelian philosophersAttic Greek writersAncient Greek cosmologistsCritical thinkingCultural criticsFounders of philosophical traditionsGreek male writersGreek geologistsGreek meteorologistsGreek social commentatorsHumor researchersIrony theoristsMetic philosophers in Classical AthensMoral philosophersNatural philosophersOntologistsPeripatetic philosophersPhilosophers and tutors of Alexander the GreatPhilosophers of ancient ChalcidicePhilosophers of culturePhilosophers of educationPhilosophers of ethics and moralityPhilosophers of historyPhilosophers of lawPhilosophers of literaturePhilosophers of logicPhilosophers of lovePhilosophers of psychologyPhilosophers of sciencePhilosophers of timePhilosophers of sexualityPhilosophers of technologyPhilosophical logicPhilosophical theistsPhilosophy academicsPhilosophy writersRhetoric theoristsSocial criticsSocial philosophersStudents of PlatoTrope theoristsVirtue ethicistsVirtue ethicsWestern cultureWestern 
philosophyZoologists
+An American in Paris is a jazz-influenced orchestral piece by American composer George Gershwin first performed in 1928. It was inspired by the time that Gershwin had spent in Paris and evokes the sights and energy of the French capital during the Années folles.Gershwin scored the piece for the standard instruments of the symphony orchestra plus celesta, saxophones, and automobile horns. He brought back four Parisian taxi horns for the New York premiere of the composition, which took place on December 13, 1928, in Carnegie Hall, with Walter Damrosch conducting the New York Philharmonic. It was Damrosch who had commissioned Gershwin to write his Concerto in F following the earlier success of Rhapsody in Blue (1924). He completed the orchestration on November 18, less than four weeks before the work's premiere. He collaborated on the original program notes with critic and composer Deems Taylor.BackgroundAlthough the story is likely apocryphal, Gershwin is said to have been attracted by Maurice Ravel's unusual chords, and Gershwin went on his first trip to Paris in 1926 ready to study with Ravel. After his initial student audition with Ravel turned into a sharing of musical theories, Ravel said he could not teach him, saying, "Why be a second-rate Ravel when you can be a first-rate Gershwin?"Gershwin strongly encouraged Ravel to come to the United States for a tour. To this end, upon his return to New York, Gershwin joined the efforts of Ravel's friend Robert Schmitz, a pianist Ravel had met during the war, to urge Ravel to tour the U.S. Schmitz was the head of Pro Musica, promoting Franco-American musical relations, and was able to offer Ravel a $10,000 fee for the tour, an enticement Gershwin knew would be important to Ravel.Gershwin greeted Ravel in New York in March 1928 during a party held for Ravel's birthday by Éva Gauthier. Ravel's tour reignited Gershwin's desire to return to Paris, which he and his brother Ira did after meeting Ravel. 
Ravel's high praise of Gershwin in an introductory letter to Nadia Boulanger caused Gershwin to seriously consider taking much more time to study abroad in Paris. Yet after he played for her, she told him she could not teach him. Boulanger gave Gershwin basically the same advice she gave all her accomplished master students: "What could I give you that you haven't already got?" This did not set Gershwin back, as his real intent abroad was to complete a new work based on Paris and perhaps a second rhapsody for piano and orchestra to follow his Rhapsody in Blue. Paris at this time hosted many expatriate writers, among them Ezra Pound, W. B. Yeats, Ernest Hemingway, and artist Pablo Picasso.CompositionGershwin based An American in Paris on a melodic fragment called "Very Parisienne", written in 1926 on his first visit to Paris as a gift to his hosts, Robert and Mabel Schirmer. Gershwin called it "a rhapsodic ballet"; it is written freely and in a much more modern idiom than his prior works.Gershwin explained in Musical America, "My purpose here is to portray the impressions of an American visitor in Paris as he strolls about the city, listens to the various street noises, and absorbs the French atmosphere."The piece is structured into five sections, which culminate in a loose ABA format. Gershwin's first A episode introduces the two main "walking" themes in the "Allegretto grazioso" and develops a third theme in the "Subito con brio". The style of this A section is written in the typical French style of composers Claude Debussy and Les Six. This A section featured duple meter, singsong rhythms, and diatonic melodies with the sounds of oboe, English horn, and taxi horns. The B section's "Andante ma con ritmo deciso" introduces the American Blues and spasms of homesickness. The "Allegro" that follows continues to express homesickness in a faster twelve-bar blues. 
In the B section, Gershwin uses common time, syncopated rhythms, and bluesy melodies with the sounds of trumpet, saxophone, and snare drum. "Moderato con grazia" is the last A section that returns to the themes set in A. After recapitulating the "walking" themes, Gershwin overlays the slow blues theme from section B in the final "Grandioso".ResponseGershwin did not particularly like Walter Damrosch's interpretation at the world premiere of An American in Paris. He stated that Damrosch's sluggish, dragging tempo caused him to walk out of the hall during a matinee performance of this work. The audience, according to Edward Cushing, responded with "a demonstration of enthusiasm impressively genuine in contrast to the conventional applause which new music, good and bad, ordinarily arouses."Critics believed that An American in Paris was better crafted than Gershwin's Concerto in F. Some did not think it belonged in a program with classical composers César Franck, Richard Wagner, or Guillaume Lekeu on its premiere. Gershwin responded to the critics:InstrumentationAn American in Paris was originally scored for 3 flutes (3rd doubling on piccolo), 2 oboes, English horn, 2 clarinets in B-flat, bass clarinet in B-flat, 2 bassoons, contrabassoon, 4 horns in F, 3 trumpets in B-flat, 3 trombones, tuba, timpani, snare drum, bass drum, triangle, wood block, ratchet, cymbals, low and high tom-toms, xylophone, glockenspiel, celesta, 4 taxi horns labeled as A, B, C, and D with circles around them, alto saxophone, tenor saxophone, baritone saxophone (all saxophones doubling soprano saxophones), and strings. Although most modern audiences have heard the taxi horns using the notes A, B, C, and D, it had been Gershwin's intention to use the notes A4, B4, D5, and A4. It is likely that in labeling the taxi horns as A, B, C, and D with circles, he was referring to the four horns, and not the notes that they played.A major revision of the work by composer and arranger F. 
Campbell-Watson simplified the instrumentation by reducing the saxophones to only three instruments: alto, tenor and baritone. The soprano saxophone doublings were eliminated to avoid changing instruments, and the contrabassoon was also deleted. This became the standard performing edition until 2000, when Gershwin specialist Jack Gibbons made his own restoration of the original orchestration of An American in Paris, working directly from Gershwin's original manuscript, including the restoration of Gershwin's soprano saxophone parts removed in Campbell-Watson's revision. Gibbons' restored orchestration of An American in Paris was performed at London's Queen Elizabeth Hall on July 9, 2000, by the City of Oxford Orchestra conducted by Levon Parikian.William Daly arranged the score for piano solo; this was published by New World Music in 1929.Preservation statusOn September 22, 2013, it was announced that a musicological critical edition of the full orchestral score would be eventually released. The Gershwin family, working in conjunction with the Library of Congress and the University of Michigan, were working to make scores available to the public that represent Gershwin's true intent. It was unknown whether the critical score would include the four minutes of material Gershwin later deleted from the work (such as the restatement of the blues theme after the faster 12 bar blues section), or if the score would document changes in the orchestration during Gershwin's composition process.The score to An American in Paris was scheduled to be issued first in a series of scores to be released. The entire project was expected to take 30 to 40 years to complete, but An American in Paris was planned to be an early volume in the series.Two urtext editions of the work were published by the German publisher B-Note Music in 2015. The changes made by Campbell-Watson were withdrawn in both editions. In the extended urtext, 120 bars of music were re-integrated. 
Conductor Walter Damrosch had cut them shortly before the first performance.On September 9, 2017, The Cincinnati Symphony Orchestra gave the world premiere of the long-awaited critical edition of the piece prepared by Mark Clague, director of the Gershwin initiative at the University of Michigan. This performance was of the original 1928 orchestration,  an alteration usually attributed to F. Campbell-Watson.RecordingsAn American in Paris has been frequently recorded. The first recording was made for the Victor Talking Machine Company in 1929 with Nathaniel Shilkret conducting the Victor Symphony Orchestra, drawn from members of the Philadelphia Orchestra. Gershwin was on hand to "supervise" the recording; however, Shilkret was reported to be in charge and eventually asked the composer to leave the recording studio. Then, a little later, Shilkret discovered there was no one to play the brief celesta solo during the slow section, so he hastily asked Gershwin if he might play the solo; Gershwin said he could and so he briefly participated in the actual recording. 
This recording is believed to use the taxi horns in the way that Gershwin had intended using the notes A-flat, B-flat, a higher D, and a lower A.The radio broadcast of the September 8, 1937, Hollywood Bowl George Gershwin Memorial Concert, in which An American in Paris, also conducted by Shilkret, was second on the program, was recorded and was released in 1998 in a two-CD set.Arthur Fiedler and the Boston Pops Orchestra recorded the work for RCA Victor, including one of the first stereo recordings of the music.In 1945, Arturo Toscanini conducting the NBC Symphony Orchestra recorded the piece for RCA Victor, one of the few commercial recordings Toscanini made of music by an American composer.The Seattle Symphony also recorded a version in 1990 of Gershwin's original score, before he made numerous edits resulting in the score as we hear it today.Harry James released a version of the blues section on his 1953 album One Night Stand, recorded live at the Aragon Ballroom in Chicago (Columbia GL 522 and CL 522).Use in filmIn 1951, Metro-Goldwyn-Mayer released the musical film, An American in Paris, featuring Gene Kelly and Leslie Caron. Winning the 1951 Best Picture Oscar, and numerous other awards, the film was directed by Vincente Minnelli, featured many tunes of Gershwin, and concluded with an extensive, elaborate dance sequence built around the An American in Paris symphonic poem (arranged for the film by Johnny Green), costing $500,000.ReferencesFurther reading Rimler, Walter. George Gershwin – An Intimate Portrait. Urbana, University of Illinois Press, 2009. chapter 6: Paris, pp. 28–33.External links   Scores, marked by Leonard Bernstein, Andre Kostelanetz, Erich Leinsdorf; New York Philharmonic archives 1944 recording by the New York Philharmonic conducted by Artur Rodziński , New York Philharmonic, Leonard Bernstein, 1959.  
1928 compositionsCompositions by George GershwinGrammy Hall of Fame Award recipientsMusic about ParisMusic commissioned by the New York PhilharmonicSymphonic poems
+The Academy Award for Best Production Design recognizes achievement for art direction in film. The category's original name was Best Art Direction, but was changed to its current name in 2012 for the 85th Academy Awards. This change resulted from the Art Director's branch of the Academy of Motion Picture Arts and Sciences (AMPAS) being renamed the Designer's branch. Since 1947, the award has been shared with the set decorator(s). It is awarded to the best interior design in a film.The films below are listed with their production year (for example, the 2000 Academy Award for Best Art Direction is given to a film from 1999). In the lists below, the winner of the award for each year is shown first, followed by the other nominees in alphabetical order.SuperlativesWinners and nominees1920s1930s1940s1950s1960s1970s1980s1990s2000s2010s2020sSee also BAFTA Award for Best Production Design Critics' Choice Movie Award for Best Production DesignNotesReferencesBest Production DesignAwards for best art direction
+The Academy Awards, popularly known as the Oscars, are awards for artistic and technical merit in the film industry. They are regarded by many as the most prestigious and significant awards in the entertainment industry worldwide. Given annually by the Academy of Motion Picture Arts and Sciences (AMPAS), the awards are an international recognition of excellence in cinematic achievements, as assessed by the Academy's voting membership. The various category winners are awarded a copy of a golden statuette as a trophy, officially called the "Academy Award of Merit", although more commonly referred to by its nickname, the "Oscar". The statuette depicts a knight rendered in the Art Deco style.The award was originally sculpted by George Stanley from a design sketch by Cedric Gibbons. AMPAS first presented it in 1929 at a private dinner hosted by Douglas Fairbanks in The Hollywood Roosevelt Hotel in what would become known as the 1st Academy Awards. The Academy Awards ceremony was first broadcast by radio in 1930 and was televised for the first time in 1953. It is the oldest worldwide entertainment awards ceremony and is now televised live worldwide. It is also the oldest of the four major annual American entertainment awards; its equivalents – the Emmy Awards for television, the Tony Awards for theater, and the Grammy Awards for music – are modeled after the Academy Awards. A total of 3,140 Oscar statuettes have been awarded since its inception in 1929. They are widely cited as the most prestigious and renowned competitive awards in the field of entertainment.The 93rd Academy Awards ceremony, honoring the best films of 2020 and early 2021, was held on April 25, 2021, after it was postponed from its original February 28, 2021, schedule due to the impact of the COVID-19 pandemic on cinema. As with the two previous ceremonies, there was no host. The ceremony was broadcast on ABC. 
It took place at the Dolby Theatre in Los Angeles, California for the 19th consecutive year, along with a satellite location at Union Station, also in Los Angeles.HistoryThe first Academy Awards presentation was held on May 16, 1929, at a private dinner function at The Hollywood Roosevelt Hotel with an audience of about 270 people.The post-awards party was held at the Mayfair Hotel. The cost of guest tickets for that night's ceremony was $5. Fifteen statuettes were awarded, honoring artists, directors and other participants in the film-making industry of the time, for their works during the 1927–28 period. The ceremony ran for 15 minutes.Winners were announced to the media three months earlier. That was changed for the second ceremony in 1930. Since then, for the rest of the first decade, the results were given to newspapers for publication at 11:00 pm on the night of the awards. This method was used until 1940 when the Los Angeles Times announced the winners before the ceremony began; as a result, the Academy has, since 1941, used a sealed envelope to reveal the names of the winners.MilestonesThe first Best Actor award went to Emil Jannings, for his performances in The Last Command and The Way of All Flesh. He had to return to Europe before the ceremony, so the Academy agreed to give him the prize earlier; this made him the first Academy Award winner in history. At that time, winners were recognized for the entirety of their work done in a certain category during the qualifying period; for example, Jannings received the award for two movies in which he starred during that period, and Janet Gaynor later won a single Oscar for performances in three films. With the fourth ceremony, however, the system changed, and professionals were honored for a specific performance in a single film. 
For the first six ceremonies, the eligibility period spanned two calendar years.At the 29th ceremony, held in 1957, the Best Foreign Language Film category, now known as Best International Feature Film, was introduced. Until then, foreign-language films had been honored with the Special Achievement Award.Perhaps the most widely seen streaker in history was 34-year-old Robert Opel, who streaked across the stage of The Dorothy Chandler Pavilion in Los Angeles flashing a peace sign on national US television at the 46th Academy Awards in 1974. Bemused host David Niven quipped, "Isn't it fascinating to think that probably the only laugh that man will ever get in his life is by stripping off and showing his shortcomings?" Later, evidence arose suggesting that Opel's appearance was facilitated as a publicity stunt by the show's producer Jack Haley Jr. Robert Metzler, the show's business manager, believed that the incident had been planned in some way; during the dress rehearsal Niven had asked Metzler's wife to borrow a pen so he could write down the famous line, which was thus not the ad-lib it appeared to be.The 74th Academy Awards, held in 2002, presented the first Academy Award for Best Animated Feature.From 1973 to 2020, all Academy Awards ceremonies have ended with the Academy Award for Best Picture. For 2021, this tradition was broken as the ceremony ended with the Academy Award for Best Actor.Traditionally, the previous year's winner for Best Actor and Best Supporting Actor present the awards for Best Actress and Best Supporting Actress, while the previous year's winner for Best Actress and Best Supporting Actress present the awards for Best Actor and Best Supporting Actor.Parasite became the first foreign-language film to win Best Picture at the February 9, 2020, award ceremony.Tom Hanks announced at the 2020 Oscar Ceremony, the opening of the Academy Museum of Motion Pictures on December 14, 2020.<ref>Barnes, Brooks (February 19, 2020).  
"Motion Picture Academy Museum Will Open in December."   The New York Times. Retrieved March 15, 2020.</ref> The museum development started in 2017 under Kerry Brougher, but is now led by Bill Kramer. The industry curated exhibits will be geared toward the history of motion picture, the art & science of film making, exhibiting trailblazing directors, actors, film-makers, sound editors and more, and will house famous artifacts from acclaimed movies like Dorothy's Ruby Red Slippers.Because of COVID-19, Academy president David Rubin and CEO Dawn Hudson announced that for the 2021 Oscar Ceremony, streaming movies not shown in theaters would be eligible, though at some point the requirement that movies be shown in theaters would return.Oscar statuetteAcademy Award of Merit (Oscar statuette)The best known award is the Academy Award of Merit, more popularly known as the Oscar statuette. Made of gold-plated bronze on a black metal base, it is 13.5 in (34.3 cm) tall, weighs 8.5 lb (3.856 kg), and depicts a knight rendered in Art Deco style holding a sword standing on a reel of film with five spokes. The five spokes represent the original branches of the Academy: Actors, Writers, Directors, Producers, and Technicians.Sculptor George Stanley (who also did the Muse Fountain at the Hollywood Bowl) sculpted Cedric Gibbons' design. The statuettes presented at the initial ceremonies were gold-plated solid bronze. Within a few years, the bronze was abandoned in favor of Britannia metal, a pewter-like alloy which is then plated in copper, nickel silver, and finally, 24-karat gold. Due to a metal shortage during World War II, Oscars were made of painted plaster for three years. Following the war, the Academy invited recipients to redeem the plaster figures for gold-plated metal ones. The only addition to the Oscar since it was created is a minor streamlining of the base. The original Oscar mold was cast in 1928 at the C.W. 
Shumway & Sons Foundry in Batavia, Illinois, which also contributed to casting the molds for the Vince Lombardi Trophy and Emmy Award's statuettes. From 1983 to 2015, approximately 50 Oscars in a tin alloy with gold plating were made each year in Chicago by Illinois manufacturer R.S. Owens & Company. It would take between three and four weeks to manufacture 50 statuettes. In 2016, the Academy returned to bronze as the core metal of the statuettes, handing manufacturing duties to Walden, New York-based Polich Tallix Fine Art Foundry. While based on a digital scan of an original 1929 Oscar, the statuettes retain their modern-era dimensions and black pedestal. Cast in liquid bronze from 3D-printed ceramic molds and polished, they are then electroplated in 24-karat gold by Brooklyn, New York–based Epner Technology. The time required to produce 50 such statuettes is roughly three months. R.S. Owens is expected to continue producing other awards for the Academy and service existing Oscars that need replating.NamingThe Academy officially adopted the name "Oscar" for the trophies in 1939. However, the origin of the nickname is disputed.One biography of Bette Davis, who was a president of the Academy in 1941, claims she named the award after her first husband, band leader Harmon Oscar Nelson. A frequently mentioned originator is Margaret Herrick, the Academy executive secretary, who, when she first saw the award in 1931, said the statuette reminded her of "Uncle Oscar", a nickname for her cousin Oscar Pierce.Columnist Sidney Skolsky, who was present during Herrick's naming in 1931, wrote that "Employees have affectionately dubbed their famous statuette 'Oscar.'" The Academy credits Skolsky with "the first confirmed newspaper reference" to Oscar in his column on March 16, 1934, which was written about that year's 6th Academy Awards. The 1934 awards appeared again in another early media mention of Oscar: a Time magazine story. 
In the ceremonies that year, Walt Disney was the first to thank the Academy for his "Oscar" during his acceptance speech.EngravingTo prevent information identifying the Oscar winners from leaking ahead of the ceremony, Oscar statuettes presented at the ceremony have blank baseplates. Until 2010, winners returned their statuettes to the Academy and had to wait several weeks to have their names inscribed on their respective Oscars. Since 2010, winners have had the option of having engraved nameplates applied to their statuettes at an inscription-processing station at the Governor's Ball, a party held immediately after the Oscar ceremony. The R.S. Owens company has engraved nameplates made before the ceremony, bearing the name of every potential winner. The nameplates for the non-winning nominees are later recycled.Ownership of Oscar statuettesPrior to 1950, Oscar statuettes were (and remain) the property of the recipient. Since then the statuettes have been legally encumbered by the requirement that the statuette be first offered for sale back to the Academy for US$1. If a winner refuses to agree to this stipulation, then the Academy keeps the statuette. Academy Awards predating this agreement have been sold in public auctions and private deals for six-figure sums.In 1989, Michael Todd's grandson tried to sell Todd's Best Picture Oscar for his 1956 production of Around the World in 80 Days to a movie prop collector. The Academy earned enforcement of its statuette contract by gaining a permanent injunction against the sale.In 1992, Harold Russell consigned his 1946 Oscar for Best Supporting Actor for The Best Years of Our Lives to auction to raise money for his wife's medical expenses. Though his decision caused controversy, the first-ever Oscar to be sold passed to a private collector on August 6, 1992, for $60,500. Russell defended his action, saying, "I don't know why anybody would be critical. 
My wife's health is much more important than sentimental reasons. The movie will be here, even if Oscar isn't."In December 2011, Orson Welles' 1941 Oscar for Citizen Kane (Academy Award for Best Original Screenplay) was put up for auction, after his heirs won a 2004 court decision contending that Welles did not sign any agreement to return the statue to the Academy. On December 20, 2011, it sold in an online auction for US$861,542 ($ today).Some buyers have subsequently returned the statuettes to the Academy, which keeps them in its treasury.Other awards presented by the AcademyIn addition to the Academy Award of Merit (Oscar award), there are nine honorary (non-competitive) awards presented by the Academy from time to time (except for the Academy Honorary Award, the Technical Achievement Award, and the Student Academy Awards, which are presented annually): Governors Awards: The Academy Honorary Award (annual) (which may or may not be in the form of an Oscar statuette);  The Irving G. Thalberg Memorial Award (since 1938) (in the form of a bust of Thalberg); The Jean Hersholt Humanitarian Award (since 1957) (in the form of an Oscar statuette);  The Academy Scientific and Technical Awards: Academy Award of Merit (non-competitive) (in the form of an Oscar statuette); Scientific and Engineering Award (in the form of a bronze tablet); Technical Achievement Award (annual) (in the form of a certificate); The John A. Bonner Medal of Commendation (since 1978) (in the form of a medal); The Gordon E. Sawyer Award (since 1982); and The Academy Student Academy Awards (annual).The Academy also awards Nicholl Fellowships in Screenwriting.NominationSince 2004, Academy Award nomination results have been announced to the public in mid-January. Prior to that, the results were announced in early February. 
In 2021, the nominees were announced in March.VotersThe Academy of Motion Picture Arts and Sciences (AMPAS), a professional honorary organization, maintains a voting membership of over 7,000.Academy membership is divided into different branches, with each representing a different discipline in film production. Actors constitute the largest voting bloc, numbering 1,311 members (22 percent) of the Academy's composition. Votes have been certified by the auditing firm PricewaterhouseCoopers (and its predecessor Price Waterhouse) since the 7th Academy Awards in 1935. The firm mails the ballots of eligible nominees to members of the Academy in December to reflect the previous eligible year with a due date sometime in January of the next year, then tabulates the votes in a process that takes thousands of hours.All AMPAS members must be invited to join by the Board of Governors, on behalf of Academy Branch Executive Committees. Membership eligibility may be achieved by a competitive nomination or a member may submit a name based on other significant contributions to the field of motion pictures.New membership proposals are considered annually. The Academy does not publicly disclose its membership, although as recently as 2007 press releases have announced the names of those who have been invited to join. The 2007 release also stated that the Academy had just under 6,000 voting members. While the membership had been growing, stricter policies have kept its size steady since then.In 2012, the results of a study conducted by the Los Angeles Times were published describing the demographic breakdown of approximately 88% of AMPAS' voting membership. Of the 5,100+ active voters confirmed, 94% were Caucasian, 77% were male, and 54% were found to be over the age of 60. 
33% of voting members are former nominees (14%) and winners (19%).In May 2011, the Academy sent a letter advising its 6,000 or so voting members that an online system for Oscar voting would be implemented in 2013.RulesAccording to Rules 2 and 3 of the official Academy Awards Rules, a film must open in the previous calendar year, from midnight at the start of January 1 to midnight at the end of December 31, in Los Angeles County, California, and play for seven consecutive days, to qualify (except for the Best International Feature Film, Best Documentary Feature, and awards in short film categories). Additionally, the film must be shown at least three times on each day of its qualifying run, with at least one of the daily showings starting between 6 pm and 10 pm local time.For example, the 2009 Best Picture winner, The Hurt Locker, was originally first released in 2008, but did not qualify for the 2008 awards, as it did not play its Oscar-qualifying run in Los Angeles until mid-2009, thus qualifying for the 2009 awards. Foreign films must include English subtitles, and each country can submit only one film for consideration in the International Feature Film category per year.Rule 2 states that a film must be feature-length, defined as a minimum of 40 minutes, except for short-subject awards, and it must exist either on a 35 mm or 70 mm film print or in 24 frame/s or 48 frame/s progressive scan digital cinema format with a minimum projector resolution of 2048 by 1080 pixels. Since the 90th Academy Awards, presented in 2018, multi-part and limited series have been ineligible for the Best Documentary Feature award. This followed the win of O.J.: Made in America, an eight-hour presentation that was screened in a limited release before being broadcast in five parts on ABC and ESPN, in that category in 2017. The Academy's announcement of the new rule made no direct mention of that film.The Best International Feature Film award does not require a U.S. release. 
It requires the film to be submitted as its country's official selection.The Best Documentary Feature award requires either week-long releases in both Los Angeles County and New York City  during the previous calendar year, or a qualifying award at a competitive film festival from the Documentary Feature Qualifying Festival list (regardless of any public exhibition or distribution), or submission in the International Feature Film category as its country's official selection. The qualifying theatrical runs must meet the same requirements as those for non-documentary films regarding numbers and times of screenings. Additionally, a film must have been reviewed by a critic from The New York Times, Time Out New York, the Los Angeles Times, or LA Weekly.Producers must submit an Official Screen Credits online form before the deadline; in case it is not submitted by the defined deadline, the film will be ineligible for Academy Awards in any year. The form includes the production credits for all related categories. Then, each form is checked and put in a Reminder List of Eligible Releases.Awards in short film categories (Best Documentary Short Subject, Best Animated Short Film, and Best Live Action Short Film) have noticeably different eligibility rules from most other competitive awards. First, the qualifying period for release does not coincide with a calendar year, instead of covering one year starting on October 1 and ending on September 30 of the calendar year before the ceremony. Second, there are multiple methods of qualification. The main method is a week-long theatrical release in either Los Angeles County or New York City during the eligibility period. Films also can qualify by winning specified awards at one of several competitive film festivals designated by the Academy, also without regard to prior public distribution. 
Finally, a film that is selected as a gold, silver, or bronze medal winner in an appropriate category of the immediately previous Student Academy Awards is also eligible (Documentary category for that award, and Animation, Narrative, Alternative, or International for the other awards). The requirements for the qualifying theatrical run are also different from those for other awards. Only one screening per day is required. For the Documentary award, the screening must start between noon and 10 pm local time; for other awards, no specific start time is required, but the film must appear in regular theater listings with dates and screening times.In late December, ballots, and copies of the Reminder List of Eligible Releases are mailed to around 6,000 active members. For most categories, members from each of the branches vote to determine the nominees only in their respective categories (i.e. only directors vote for directors, writers for writers, actors for actors, etc.). In the special case of Best Picture, all voting members are eligible to select the nominees. In all major categories, a variant of the single transferable vote is used, with each member casting a ballot with up to five nominees (ten for Best Picture) ranked preferentially. In certain categories, including International Feature Film, Documentary and Animated Feature, nominees are selected by special screening committees made up of members from all branches.In most categories, the winner is selected from among the nominees by plurality voting of all members. Since 2009, the Best Picture winner has been chosen by instant runoff voting. Since 2013, re-weighted range voting has been used to select the nominees for the Best Visual Effects.Film companies will spend as much as several million dollars on marketing to awards voters for a movie in the running for Best Picture, in attempts to improve chances of receiving Oscars and other movie awards conferred in Oscar season. 
The Academy enforces rules to limit overt campaigning by its members to try to eliminate excesses and prevent the process from becoming undignified. It has an awards czar on staff who advises members on allowed practices and levies penalties on offenders. For example, a producer of the 2009 Best Picture nominee The Hurt Locker was disqualified as a producer in the category when he contacted associates urging them to vote for his film and not another that was seen as the front-runner (The Hurt Locker eventually won).Academy Screening RoomThe Academy Screening Room or Academy Digital Screening Room is a secure streaming platform which allows voting members of the Academy to view all eligible films (except, initially, those in the International category) in one place. It was introduced in 2019, for the 2020 Oscars, though DVD screeners and Academy in-person screenings were still provided. For films to be included on the platform, the North American distributor must pay $12,500, including a watermarking fee, and a digital copy of the film to be prepared for streaming by the Academy. The platform can be accessed through an app on Apple TV. The watermarking process involved several video security firms, creating a forensic watermark and restricting the ability to take screenshots or screen recordings.In 2021, for the 2022 Oscars, the Academy banned all physical screeners and in-person screenings, restricting official membership viewing to the Academy Screening Room. Films eligible in the Documentary and International categories were made available in different sections of the platform. Distributors can also pay an extra fee to add video featurettes to promote their films on the platform. The in-person screenings were said to be cancelled because of the COVID-19 pandemic. 
Eligible films do not have to be added to the platform, but the Academy advertises them to voting members when they are.Awards ceremoniesTelecastThe major awards are presented at a live televised ceremony, commonly in late February or early March following the relevant calendar year, and six weeks after the announcement of the nominees. It is the culmination of the film awards season, which usually begins during November or December of the previous year. This is an elaborate extravaganza, with the invited guests walking up the red carpet in the creations of the most prominent fashion designers of the day. Black tie dress is the most common outfit for men, although fashion may dictate not wearing a bow-tie, and musical performers sometimes do not adhere to this. (The artists who recorded the nominees for Best Original Song quite often perform those songs live at the awards ceremony, and the fact that they are performing is often used to promote the television broadcast.) The Academy Awards is the world's longest-running awards show televised live from the U.S. to all time zones in North America and worldwide, and gathers billions of viewers elsewhere throughout the world. The Oscars were first televised in 1953 by NBC, which continued to broadcast the event until 1960, when ABC took over, televising the festivities (including the first color broadcast of the event in 1966) through 1970. NBC regained the rights for five years; then ABC resumed broadcast duties in 1976, and its current contract with the Academy runs through 2028. The Academy has also produced condensed versions of the ceremony for broadcast in international markets (especially those outside of the Americas) in more desirable local timeslots. The ceremony was broadcast live internationally for the first time via satellite since 1970, but only two South American countries, Chile and Brazil, purchased the rights to air the broadcast. 
By that time, the television rights to the Academy Awards had been sold in 50 countries. A decade later, the rights were already being sold to 60 countries, and by 1984, the TV rights to the Awards were licensed in 76 countries. Since 2004, the ceremonies have been moved up from late March/early April to late February to help disrupt and shorten the intense lobbying and ad campaigns associated with Oscar season in the film industry. Another reason was the growing TV ratings success of the coinciding NCAA Basketball Tournament, which would cut into the Academy Awards audience. (In 1976 and 1977, ABC's regained Oscars were moved from Tuesday to Monday and went directly opposite NBC's NCAA title game.) The earlier date is also to the advantage of ABC, as it now usually occurs during the highly profitable and important February sweeps period. Some years, the ceremony is moved into the first Sunday of March to avoid a clash with the Winter Olympic Games. Another reason for the move to late February and early March is to avoid the awards ceremony occurring so close to the religious holidays of Passover and Easter, which for decades had been a grievance from members and the general public. Advertising is somewhat restricted, however, as traditionally no movie studios or competitors of official Academy Award sponsors may advertise during the telecast. The production of the Academy Awards telecast currently holds the distinction of winning the most Emmys in history, with 47 wins and 195 nominations overall since that award's own launch in 1949. After many years of being held on Mondays at 9:00 pm Eastern/6:00 pm Pacific, the ceremony was moved, beginning with the 1999 ceremony, to Sundays at 8:30 pm ET/5:30 pm PT. The reasons given for the move were that more viewers would tune in on Sundays, that Los Angeles rush-hour traffic jams could be avoided, and that an earlier start time would allow viewers on the East Coast to go to bed earlier. 
For many years the film industry opposed a Sunday broadcast because it would cut into the weekend box office. In 2010, the Academy contemplated moving the ceremony even further back into January, citing TV viewers' fatigue with the film industry's long awards season. However, such an accelerated schedule would dramatically decrease the voting period for its members, to the point where some voters would only have time to view the contending films streamed on their computers (as opposed to traditionally receiving the films and ballots in the mail). Furthermore, a January ceremony on Sunday would clash with National Football League playoff games. In 2018, the Academy announced that the ceremony would be moved from late February to mid-February beginning with the 92nd Academy Awards in 2020. Originally scheduled for April 8, 1968, the 40th Academy Awards ceremony was postponed for two days, because of the assassination of Dr. Martin Luther King, Jr. On March 30, 1981, the 53rd Academy Awards was postponed for one day, after the shooting of President Ronald Reagan and others in Washington, D.C. In 1993, an In Memoriam segment was introduced, honoring those who had made a significant contribution to cinema and who had died in the preceding 12 months, a selection compiled by a small committee of Academy members. This segment has drawn criticism over the years for the omission of some names. Criticism was also leveled for many years regarding another aspect, with the segment having a "popularity contest" feel as the audience varied their applause to those who had died by the subject's cultural impact; the applause has since been muted during the telecast, and the audience is asked to refrain from clapping during the segment and to observe silent reflection instead. This segment was later followed by a commercial break. In terms of broadcast length, the ceremony generally averages three and a half hours. The first Oscars, in 1929, lasted 15 minutes. 
At the other end of the spectrum, the 2002 ceremony lasted four hours and twenty-three minutes. In 2010, the organizers of the Academy Awards announced winners' acceptance speeches must not run past 45 seconds. This, according to organizer Bill Mechanic, was to ensure the elimination of what he termed "the single most hated thing on the show" – overly long and embarrassing displays of emotion. In 2016, in a further effort to streamline speeches, winners' dedications were displayed on an on-screen ticker. During the 2018 ceremony, host Jimmy Kimmel acknowledged how long the ceremony had become, by announcing that he would give a brand-new jet ski to whoever gave the shortest speech of the night (a reward won by Mark Bridges when accepting his Best Costume Design award for Phantom Thread). The Wall Street Journal analyzed the average minutes spent across the 2014–2018 telecasts as follows: 14 on song performances; 25 on the hosts' speeches; 38 on prerecorded clips; and 78 on the awards themselves, broken into 24 on the introduction and announcement, 24 on winners walking to the stage, and 30 on their acceptance speeches.Although still dominant in ratings, the viewership of the Academy Awards has steadily dropped; the 88th Academy Awards were the lowest-rated in the past eight years (although with increases in male and 18–49 viewership), while the show itself also faced mixed reception. Following the show, Variety reported that ABC was, in negotiating an extension to its contract to broadcast the Oscars, seeking to have more creative control over the broadcast itself. Currently and nominally, AMPAS is responsible for most aspects of the telecast, including the choice of production staff and hosting, although ABC is allowed to have some input on their decisions. 
In August 2016, AMPAS extended its contract with ABC through 2028: the contract neither contains any notable changes nor gives ABC any further creative control over the telecast.TV ratingsHistorically, the telecast's viewership is higher when box-office hits are favored to win the Best Picture award. More than 57.25 million viewers tuned to the telecast for the 70th Academy Awards in 1998, the year of Titanic, which generated a box office haul during its initial 1997–98 run of US$600.8 million in the US, a box office record that would remain unsurpassed for years. The 76th Academy Awards ceremony, in which The Lord of the Rings: The Return of the King (pre-telecast box office earnings of US$368 million) received 11 Awards including Best Picture, drew 43.56 million viewers. The most watched ceremony based on Nielsen ratings to date, however, was the 42nd Academy Awards (Best Picture Midnight Cowboy) which drew a 43.4% household rating on April 7, 1970.By contrast, ceremonies honoring films that have not performed well at the box office tend to show weaker ratings, despite how much critical acclaim those films have received. The 78th Academy Awards which awarded low-budget independent film Crash (with a pre-Oscar gross of US$53.4 million) generated an audience of 38.64 million with a household rating of 22.91%. In 2008, the 80th Academy Awards telecast was watched by 31.76 million viewers on average with an 18.66% household rating, the lowest-rated and least-watched ceremony at the time, in spite of celebrating 80 years of the Academy Awards. The Best Picture winner of that particular ceremony was another independent film (No Country for Old Men).Whereas the 92nd Academy Awards drew an average of 23.6 million viewers, the 93rd Academy Awards drew an even lower viewership of 10.4 million. 
That is the lowest viewership recorded by Nielsen since it started recording audience totals in 1974.ArchiveThe Academy Film Archive holds copies of every Academy Awards ceremony since the 1949 Oscars and material on many prior ceremonies, along with ancillary material related to more recent shows. Copies are held in a variety of film, video, and digital formats.VenuesIn 1929, the first Academy Awards were presented at a banquet dinner at The Hollywood Roosevelt Hotel. From 1930 to 1943, the ceremony alternated between two venues: the Ambassador Hotel on Wilshire Boulevard and the Biltmore Hotel in downtown Los Angeles.Grauman's Chinese Theatre in Hollywood then hosted the awards from 1944 to 1946, followed by the Shrine Auditorium in Los Angeles from 1947 to 1948. The 21st Academy Awards in 1949 were held at the Academy Award Theatre at what had been the Academy's headquarters on Melrose Avenue in Hollywood.From 1950 to 1960, the awards were presented at Hollywood's Pantages Theatre. With the advent of television, the awards from 1953 to 1957 took place simultaneously in Hollywood and New York, first at the NBC International Theatre (1953) and then at the NBC Century Theatre, after which the ceremony took place solely in Los Angeles. The Oscars moved to the Santa Monica Civic Auditorium in Santa Monica, California, in 1961. By 1969, the Academy decided to move the ceremonies back to Downtown Los Angeles, this time to the Dorothy Chandler Pavilion at the Los Angeles County Music Center. In the late 1990s and early 2000s, the ceremony returned to the Shrine.In 2002, Hollywood's Dolby Theatre (previously known as the Kodak Theatre) became the presentation's current venue.Awards of Merit categoriesCurrent categoriesIn the first year of the awards, the Best Directing award was split into two categories (Drama and Comedy). At times, the Best Original Score award has also been split into separate categories (Drama and Comedy/Musical). 
From the 1930s through the 1960s, the Art Direction (now Production Design), Cinematography, and Costume Design awards were likewise split into two categories (black-and-white films and color films). Prior to 2012, the Production Design award was called Art Direction, while the Makeup and Hairstyling award was called Makeup.In August 2018, the Academy announced that several categories would not be televised live, but rather be recorded during commercial breaks and aired later in the ceremony.Following dissent from Academy members, they announced that they would indeed air all 24 categories live. This followed several proposals (among them, the introduction of a Popular Film category) that the Academy had announced but did not implement.Discontinued categoriesProposed categoriesThe Board of Governors meets each year and considers new award categories. To date, the following categories have been proposed: Best Casting: rejected in 1999 Best Popular Film: proposed in 2018 for presentation at the 2019 ceremony; postponed until the 2020 ceremony at the earliest (yet to be implemented) Best Stunt Coordination: rejected every year from 1991 to 2012 Best Title Design: rejected in 1999Special categoriesThe Special Academy Awards are voted on by special committees, rather than by the Academy membership as a whole. They are not always presented on an annual basis.Current special categories Academy Honorary Award: since 1929 Academy Scientific and Technical Award (three different awards): since 1931 Gordon E. Sawyer Award: since 1981 Jean Hersholt Humanitarian Award: since 1957  Irving G. 
Thalberg Memorial Award: since 1938  Academy Special Achievement Award: from 1972 to 1995, and again for 2017Discontinued special categories Academy Juvenile Award: 1934 to 1960CriticismAccusations of commercialismDue to the positive exposure and prestige of the Academy Awards, many studios spend millions of dollars and hire publicists specifically to promote their films during what is typically called the "Oscar season". This has generated accusations of the Academy Awards being influenced more by marketing than by quality. William Friedkin, an Academy Award-winning film director and former producer of the ceremony, expressed this sentiment at a conference in New York in 2009, describing it as "the greatest promotion scheme that any industry ever devised for itself".Tim Dirks, editor of AMC's filmsite.org, has written of the Academy Awards:A recent technique that has been claimed to be used during the Oscar season is the whisper campaign. These campaigns are intended to spread negative perceptions of other movies nominated and are believed to be perpetrated by those that were involved in creating the movie. Examples of whisper campaigns include the allegations against Zero Dark Thirty suggesting that it justifies torture and the claim that Lincoln distorts history.Accusations of biasTypical criticism of the Academy Awards for Best Picture is that among the winners and nominees there is an over-representation of romantic historical epics, biographical dramas, romantic dramedies and family melodramas, most of which are released in the U.S. in the last three months of the calendar year. The Oscars have been infamously known for selecting specific genres of movies to be awarded. The term "Oscar bait" was coined to describe such movies. 
This has led, at times, to more specific criticisms that the Academy is disconnected from the audience, e.g., by favoring "Oscar bait" over audience favorites or favoring historical melodramas over critically acclaimed movies that depict current life issues.Allegations of a lack of diversityThe Academy Awards have long received criticism over their lack of diversity among the nominees. This criticism is based on the statistics from every Academy Awards since 1929, which show that only 6.4% of Academy Award nominees have been non-white and that, since 1991, 11.2% of nominees have been non-white, with the rate of winners being even more polarizing. Due to a variety of reasons, including marketability and historical bans on interracial couples, a number of high-profile Oscars have been given to yellowface portrayals, as well as performances of Asian characters rewritten for white characters. The 88th awards ceremony became the target of a boycott, popularized on social media with the hashtag #OscarsSoWhite, based on activists' perception that its all-white acting nominee list reflected bias. In response, the Academy initiated "historic" changes in membership by the year 2020.Symbolism or sentimentalizationActing prizes in certain years have been criticized for not recognizing superior performances so much as being awarded for personal popularity, to make up for a "snub" for a work that proved in time to be more popular or renowned than the one awarded, or presented as a "career honor" to recognize a distinguished nominee's entire body of work.Recognition of streaming media filmFollowing the 91st Academy Awards in February 2019 in which the Netflix-broadcast film Roma had been nominated for ten awards including the Best Picture category, Steven Spielberg and other members of the Academy discussed changing the requirements through the Board of Governors for films so as to exclude those from Netflix and other media streaming services. 
Spielberg had been concerned that Netflix as a movie production and distribution studio could spend much more than typical Oscar-winning films and have much wider and earlier distribution than other Best Picture-nominated films, while still being able to meet the minimal theatrical-run status to qualify for an Oscar. The United States Department of Justice, having heard of this potential rule change, wrote a letter to the Academy in March 2019, cautioning them that placing additional restrictions on films that originate from streaming media services without proper justification could raise anti-trust concerns against the Academy. Following its April 2019 board meeting, the Academy Board of Governors agreed to retain the current rules that allow for streaming media films to be eligible for Oscars as long as they enjoy limited theatrical runs.Refusals of the awardSome winners critical of the Academy Awards have boycotted the ceremonies and refused to accept their Oscars. The first to do so was screenwriter Dudley Nichols (Best Writing in 1935 for The Informer). Nichols boycotted the 8th Academy Awards ceremony because of conflicts between the Academy and the Writers' Guild. Nichols eventually accepted the 1935 award three years later, at the 1938 ceremony. Nichols was nominated for three further Academy Awards during his career.George C. Scott became the second person to refuse his award (Best Actor in 1970 for Patton) at the 43rd Academy Awards ceremony. Scott described it as a "meat parade", saying, "I don't want any part of it."The third person to refuse the award was Marlon Brando, who refused his award (Best Actor for 1972's The Godfather), citing the film industry's discrimination and mistreatment of Native Americans. 
At the 45th Academy Awards ceremony, Brando asked actress and civil rights activist Sacheen Littlefeather to read a 15-page speech in his place, detailing his criticisms, for which there was booing and cheering by the audience.DisqualificationsSix films have had nominations revoked before the official award ceremony: The Circus (1928) – The film was voluntarily removed by the Academy from competitive categories, to award Charlie Chaplin a special award. Hondo (1953) – Removed from the Best Story ballot after letters from the producer and nominee questioned its inclusion in the category. High Society (1955) – Withdrawn from screenwriting ballot after being mistaken for the 1956 movie of the same title. The Godfather (1972) – Initially nominated for eleven awards, its nomination for Best Original Score was revoked after it was discovered that its main theme was very similar to music that the score's composer had written for an earlier film. None of its other nominations were revoked, and it received three Oscars, including Best Picture. A Place in the World (1992) – Removed from the Best Foreign Language Film ballot after it was discovered that the country which submitted the film exercised insufficient artistic control. 
Alone Yet Not Alone (2014) – The film's title song, "Alone Yet Not Alone", was removed from the Best Original Song ballot after Bruce Broughton was found to have improperly contacted other members of the academy's musical branch; this was the first time that a film was removed from a ballot for ethical reasons.One film was disqualified after winning the award, and had the winner return the Oscar: Young Americans (1969) – Initially won the award for Best Documentary Feature, but was later revoked after it was revealed that it had opened theatrically prior to the eligibility period.One film had its nomination revoked after the award ceremony when it had not won the Oscar:Tuba Atlantic (2011) – Its nomination for Best Live Action Short Film was revoked when it was discovered that the film had aired on television in 2010, before its theatrical release.Gender segregationSome advocates of gender equality and non-binary people have criticized the separation of male and female acting categories in the Academy Awards, Emmy Awards and Tony Awards. Though some commentators worry that gender discrimination would cause men to dominate unsegregated categories, other categories are unsegregated. The Grammy Awards went gender-neutral in 2012, while the Daytime Emmy Awards introduced a single Outstanding Younger Performer in a Drama Series category in 2019 to replace their two gender-specific younger actor and actress categories.Associated eventsThe following events are closely associated with the annual Academy Awards: BAFTA Awards César Awards David di Donatello Awards Nominees luncheon Governors Awards The 25th Independent Spirit Awards (2010), usually held in Santa Monica, California the Saturday before the Oscars, marked the first time it was moved to a Friday and a change of venue to L.A. 
Live The annual "Night Before", traditionally held at the Beverly Hills Hotel, begun in 2002 and generally known as the party of the season, benefits the Motion Picture & Television Fund, which operates a retirement home for SAG actors in the San Fernando Valley Elton John AIDS Foundation Academy Award Party airs the awards live at the nearby Pacific Design Center The Governors Ball is the Academy's official after-party, including dinner (until 2011), and is adjacent to the awards-presentation venue The Vanity Fair after-party, historically at the former Morton's restaurant, has been at the Sunset Tower since 2009 Ariel Award in Mexico Goya Award in SpainPresenter and performer giftsIt has become a tradition to give out gift bags to the presenters and performers at the Oscars. In recent years, these gifts have also been extended to award nominees and winners. The value of each of these gift bags can reach into the tens of thousands of dollars. In 2014, the value was reported to be as high as US$80,000. The value has risen to the point where the U.S. Internal Revenue Service issued a statement regarding the gifts and their taxable status.Oscar gift bags have included vacation packages to Hawaii and Mexico and Japan, a private dinner party for the recipient and friends at a restaurant, videophones, a four-night stay at a hotel, watches, bracelets, spa treatments, bottles of vodka, maple salad dressing, weight-loss gummie candy and up to $25,000 worth of cosmetic treatments and rejuvenation procedures such as lip fillers and chemical peels from New York City facial plastic surgeon Konstantin Vasyukevich. Some of the gifts have even had a "risque" element to them; in 2014, the adult products retailer Adam & Eve had a "Secret Room Gifting Suite". 
Celebrities visiting the gifting suite included Judith Hoag, Carolyn Hennesy, Kate Linder, Chris Mulkey, Jim O'Heir, and John Salley.Television ratings and advertisement pricesFrom 2006 onwards, results are Live+SD; all previous years are live viewing.TrademarkThe term "Oscar" is a registered trademark of the AMPAS; however, in the Italian language, it is used generically to refer to any award or award ceremony, regardless of which field.Court: Oscar may be generic term in Italian | Reuters See also List of film awards List of Academy Award records List of actors with Academy Award nominations List of superlative Academy Award winners and nomineesFootnotesReferencesFurther reading Brokaw, Lauren (2010). "Wanna see an Academy Awards invite? We got it along with all the major annual events surrounding the Oscars". Los Angeles: The Daily Truffle.      Wright, Jon (2007). The Lunacy of Oscar: The Problems with Hollywood's Biggest Night''. Thomas Publishing, Inc.External links  of the Academy of Motion Picture Arts and Sciences  Official Academy Awards Database (searchable) 1929 establishments in CaliforniaPerforming arts trophiesAmerican annual television specialsAmerican film awardsAnnual events in Los Angeles County, CaliforniaAwards established in 1929Cinema of Southern CaliforniaEvents in Los AngelesHollywood history and cultureAmerican live television shows
+Actresses (Catalan: Actrius) is a 1997 Catalan-language Spanish drama film produced and directed by Ventura Pons and based on the award-winning stage play E.R. by Josep Maria Benet i Jornet. The film has no male actors, with all roles played by females. The film was produced in 1996.SynopsisIn order to prepare herself to play a role commemorating the life of legendary actress Empar Ribera, a young actress (Mercè Pons) interviews three established actresses who had been Ribera's pupils: the international diva Glòria Marc (Núria Espert), the television star Assumpta Roca (Rosa Maria Sardà), and dubbing director Maria Caminal (Anna Lizaran).Cast Núria Espert as Glòria Marc Rosa Maria Sardà as Assumpta Roca Anna Lizaran as Maria Caminal Mercè Pons as EstudiantRecognitionScreeningsActrius screened in 2001 at Grauman's Egyptian Theatre in an American Cinematheque retrospective of the works of its director. The film had first screened at the same location in 1998. It was also shown at the 1997 Stockholm International Film Festival.ReceptionIn Movie - Film - Review, Christopher Tookey wrote that though the actresses were "competent in roles that may have some reference to their own careers", the film "is visually unimaginative, never escapes its stage origins, and is almost totally lacking in revelation or surprising incident". He noted that there were "occasional, refreshing moments of intergenerational bitchiness", but that they did not "justify comparisons to All About Eve", and were "insufficiently different to deserve critical parallels with Rashomon". He also wrote that The Guardian called the film a "slow, stuffy chamber-piece", and that The Evening Standard stated the film's "best moments exhibit the bitchy tantrums seething beneath the threesome's composed veneers". 
MRQE wrote "This cinematic adaptation of a theatrical work is true to the original, but does not stray far from a theatrical rendering of the story."Awards and nominations 1997, won 'Best Catalan Film' at Butaca Awards for Ventura Pons 1997, won 'Best Catalan Film Actress' at Butaca Awards, shared by Núria Espert, Rosa Maria Sardà, Anna Lizaran, and Mercè Pons 1998, nominated for 'Best Screenplay' at Goya Awards, shared by Josep Maria Benet i Jornet and Ventura PonsReferencesExternal links   as archived 17 February 2009 (Spanish)1997 films1997 drama filmsSpanish filmsCatalan-language filmsFilms set in BarcelonaFilms directed by Ventura PonsSpanish drama films
+Animalia is an illustrated children's book by Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over four million copies have been sold worldwide.   A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket.SynopsisAnimalia is an alliterative alphabet book and contains twenty-six illustrations, one for each letter of the alphabet. Each illustration features an animal from the animal kingdom (A is for alligator and armadillo, B is for butterfly, etc.) along with a short poem utilizing the letter of the page for many of the words. The illustrations contain many other objects beginning with that letter that the reader can try to identify (however, there are not necessarily "a thousand things, or maybe more", as the author states). As an additional challenge, the author has hidden a picture of himself as a child in every picture.Here are some of the things in each picture that are truly different (the alligator in the A section is wearing an apron featuring the alphabet, which the book is about, and this section also features the author's home country, Australia):Note: This list is incomplete.A1. Astronaut2. Album3. Admiral4. Archdiocese5. Actor6. Actress7. Aborigine8. Athlete9. Acrobat10. Apple11. Acorn12. Apricot13. Avocado14. Adder15. Albatross16. Antelope (this is actually a pronghorn, which is not a true antelope, so it belongs in the P section)17. Anteater18. Aardvark19. Anvil20. Afghan hound21. Affenpinscher22. Airedale terrier23. Aqueduct24. Ant25. Abacus26. Asparagus27. Artichoke28. Accordion29. Anchor30. Anemone 31. Axe32. Angel 33. Algebra34. Atlas35. Apron36. Alien37. Ambulance38. AntennaB36. Bumblebee37. Bobolink38. Bear39. Bonnet40. Barbed wire41. Brambles42. Bulrushes43. Baboon44. Bassoon45. Brontosaurus46. Budgerigar47. Bomb48. Brain49. Brick50. Basket51. Basketball52. Basketball hoop53. Baseball54. Baseball bat55. 
Backgammon56. Ballpoint pen57. Bagpipes58. Bicycle59. Barrel60. Bell61. Boot62. Button63. Blueberries64. Belt65. Bugle66. Bull67. Bucket68. Bellows69. Boomerang70. Bathtub71. Bone72. Brush73. Bottle74. Banana75. Brush76. Binoculars77. Barracuda78. Buddha79. Battery80. Broom81. Bat (animal)82. Boy83. BungalowC82. Crab83. Chair84. Crane85. Caterpillar86. Canoe87. Computer88. Collar89. Camera90. Concertina91. Cap92. Cheetah93. Chain94. Cassette95. Crocodile96. Cone97. Cube98. Cylinder99. Cymbal100. Cucumber101. Celery102. Cabbage103. Cheese104. Corn105. Carrot106. Cards107. Calculator108. Candle109. Cherry110. Cake111. Coconut112. Cup113. Cocoa114. Can115. Calendar116. Chef117. Castle118. Church119. Cemetery120. Cross of Christ121. Caravan122. Circus123. Clown124. Cricket (game)125. Convict126. Cannon127. Cow128. Chimpanzee129. Cobra130. Cage131. Canary132. Check133. Crossword puzzle134. Crutch135. Cord136. Crown137. Crate138. Cork 139. Cog140. Comb141. Clarinet142. Clam143. Chieftain144. Cactus145. Cliff146. Chateau147. Concorde148. Chandelier149. Cottage150. Cigar151. Candy cane152. Cauldron153. CentipedeD154. Dustpan155. Duster156. Dynamite157. Drill158. Drawers159. Draughts160. Doughnut161. Diamond162. Dice163. Dutch doll164. Dentures165. Date (fruit)166. Date (time)167. Doily168. Dish169. Dollar170. Dolphin171. Decagon172. Devil173. Dormouse174. Diagonal175. Decade176. Doctrine177. Dumbbell178. Dragonfly179. Dwarf180. Dachshund181. Doberman pinscher182. Dalmatian183. Dodo184. Diplodocus185. Dimetrodon186. Dove187. Desperado188. Donkey189. Dam190. Drain191. Dinghy192. Drowning193. Drawbridge194. Deer195. Destroyer196. Dromedary197. Double-decker bus198. Daffodil199. Daisy200. Dirigible201. Dominos202. Dagger203. Dart204. Duck205. Dingo206. Dolly207. Deputy208. DogE208. Eclipse209. Éclair210. Elderberries211. Envelope212. Emu213. Eleven214. Edison215. Einstein216. Embryo217. Earwig218. Echidna219. Elf220. Eskimo221. Eagle222. Edelweiss223. Earring224. Emerald225. 
Exclamation point226. EyeglassesF226. Flounder227. Film228. Fly229. Foxglove230. Fern231. Fairy232. Fire233. Firewood234. Frankenstein235. Fork236. Forest237. Falcon238. Fungus239. Flier240. Flute241. Fan242. FoghornG243. Graph244. Glockenspiel245. Gerbil246. Geranium247. Gladiolus248. Gladiator249. Gremlin250. Golf club251. Golf ball252. Gibbon253. Guitar254. Galoshes255. Grail256. Greyhound257. Gong258. Gazelle259. Griffin260. Gargoyle261. Graffiti262. Grasshopper263. Globe264. Galleon265. Gorgon266. Gnome267. Gramophone268. Goat269. Goggles270. Goose271. Giraffe272. Gazebo273. Guard274. Gift275. Garage276. Garbage277. Garbage can278. Gallows279. Guillotine280. Ghost281. Giant282. Goal283. Glider284. Gage285. GarterH285. Hexagon286. Hose287. Hare288. Hyena289. Hawk290. Hammock291. Hook292. Hippo293. Hunter294. Hill295. Hang glider296. Herald297. Helicopter298. Hamburger299. Hydrant300. Hourglass301. Hamster302. Hedgehog 303. Horn304. Heart305. Hen306. Hand grenade307. Humpty-Dumpty308. Holly309. Holy Bible310. Hatch311. Haddock312. Hammer313. Hieroglyphics314. Handkerchief315. Handcuffs316. Hatchet317. Hornet318. HalberdI318. Island319. Icicle320. Ice cream321. Iron322. Iceberg323. Icarus324. Imprisoned325. Ingot326. InkJ324. Judge325. Javelin326. Jester327. Jack-in-the-box328. Jack-in-the-pulpit329. Japan330. Jet331. Jasmine332. Jaguar333. JeansK333. Kite334. Knapsack335. Knitting336. Kiwi337. Kilt338. Kitten339. Knight340. Kipper341. Knife342. Keys343. Keychain344. Kitchen345. Kettle346. Kayak347. Knocker348. Ketch349. Keel350. Keypad351. KerbL350. Ladder351. Lyre352. Lantern353. Lobster354. Llama355. Lettuce356. Leprechaun357. Lockbox358. Ladle359. Lemon360. Lute361. Lollipop362. Lamp363. Lily364. LassoM365. Map366. Mammoth367. Mermaid368. Moose369. Magpie370. Mosque371. Mandolin372. Monkey marionette373. Marble374. Metronome375. Moth376. Million377. Millimeter378. Millipede379. Mushroom380. Match381. Matchbox382. Molecule383. Mug384. Milk385. Medal386. 
Monocle387. Magnet388. Maggot389. Mask390. Microphone391. Microscope392. Moon393. Mole394. Monster395. Monitor396. MoustacheN394. Noah395. Narwhal396. Neptune397. Newspaper398. Nightingale399. Nest400. Net401. Nun402. Nut403. Nutcracker404. North405. Ninety-nine406. Napkin407. Nautilus408. Nurse409. NonagonO410. Orange411. Otter412. Orangutan413. Observatory414. Octagon415. Owl416. Obelisk417. Oak418. Oil drill419. Organ420. Oven421. OrchestraP421. Purse422. Physician423. Poodle424. Parasol425. Pig426. Perambulator427. Periwinkle428. Politician429. Pin430. Philosopher431. Parchment432. Polka dot433. Pigtail434. Pit drum435. Pharaoh436. Pied Piper437. Pyjamas438. Plume439. Police440. Prisoner441. Pygmy442. Punch & Judy443. Pope444. Peace445. Pirate446. Patch447. Peg leg448. Prince449. Princess450. Pendant451. Palace452. Pagoda453. Parachute454. Pegasus455. Pisa (Leaning Tower)456. Parthenon457. Palm tree458. Pyramid459. Paris460. Peninsula461. Penguin462. Pool463. Pathway464. Procession465. Platypus466. Pan467. Pumpkin468. Pheasant469. Partridge470. Puffin471. Pelican472. Porcupine473. Panda474. Parcel475. Pliers476. Plow477. Pitchfork478. Pick479. Pine tree480. Pansy481. Poison ivy482. Periscope483. Porpoise484. Piano485. Popeye486. Phoenix487. Potato488. Plum489. Painter490. Palette491. Paint492. Paintbrush493. Peach494. Pear495. Pomegranate496. Pineapple497. Pussy-willows498. Pavilion499. Pulley500. Pump501. Plaque502. Prism503. Peas504. PearlQ505. Quartz506. Quicksand507. Quarter508. Quoits509. Queen510. Quilt511. Queensland512. QueueR 511. Rust512. Radar513. Raspberry514. Raccoon515. Rhododendron516. Roman numerals517. Ruby518. Ring519. Razor520. Roller skate521. Reindeer522. Roulette523. Rake524. Rifle525. Revolver526. Refrigerator527. Rabbit528. Rolling pin529. Register530. Rose531. Raven532. Ram533. Rat534. Rowboat535. Rooster536. Rattlesnake537. Robin538. Rocking horse539. Rocking chair540. Radius541. Rip542. Racket543. Recorder544. RocketS545. Sapphire546. 
Soup547. Stump548. Scorpion549. Sieve550. Sandcastle551. Sloop552. Schooner553. Shark554. Scarf555. Spider556. Spur557. Sheriff558. Sling559. Scab560. Sickle561. Scythe562. Slippers563. Sandwich564. Sunflower565. Snowshoes566. Skis567. Stretcher568. Spy569. Stitch570. Screwdriver571. Screw572. Shifter (Wrench)573. Shrug574. Spade575. Shovel576. Sledgehammer577. Scissors578. Shears579. Saw580. Scalpel581. Shack582. Scooter583. Satchel584. Sundae585. Straw586. Spaghetti587. Strawberry588. Spoon589. Saturn590. Seesaw591. Spring592. Sneeze593. Shepherd594. Staff595. Scarecrow596. Sloth597. Stork598. Spoonbill599. Safe600. Shrew601. Skipping rope602. Scroll603. Stamp604. Soccer605. Swimmer606. Snorkel607. Syringe608. Siphon609. Stethoscope610. Starfish611. Snail612. Slug613. Sphinx614. Sprocket615. Spinning wheel616. Spool617. Stool618. Space shuttle619. Satellite620. Sombrero621. Serape622. Saxophone623. Synthesizer624. Superman625. Shower626. Suitcase627. Shuttlecock628. Skittle (Bowling pin)629. Stilts630. Stalactite631. Stalagmite632. Steamroller633. Swings634. Slide635. Sword636. Sheathe637. Stiletto638. Scimitar639. Saber640. Spear641. Sleigh642. Snow643. Santa Claus644. Sack645. Sausage646. Stick figure647. Surfboard648. Surfer649. Seal650. Skull651. Spine652. Shamrock653. Spectacles654. Scapula655. Slingshot656. Snipe657. Swallow658. Sardines659. Swan660. Skunk661. Stepladder662. Sofa663. Scarab beetle664. Stereo665. Star of David666. Sparrow667. Squirrel668. Sextant669. Squid670. Seahorse671. Salute672. Sardines673. SemaphoreT672. Top hat673. Tulip674. Tricycle675. Toad676. Thermos677. Turtle678. Tear679. Trombone680. Trumpet681. Tuba682. Tractor683. Trailer684. Tunnel685. Tepee686. Totem pole687. Target688. Tuxedo689. Tunic690. Telescope691. Teapot692. Television693. Trophy694. Tap695. Teddy bear696. Tambourine697. Torch698. Toy tank699. Tomato700. Thermometer701. Tweezers702. Threader703. Typewriter704. Turntable705. Telephone706. TapirU707. UFO708. 
Ursa Major709. Ursa Minor710. United Kingdom711. Uncle Sam712. Ukulele713. Underwear714. UmiakV715. Volkswagen716. Vase717. Van718. VCR719. Violin720. Vacuum cleaner721. Voodoo doll722. Vane723. Valve724. Volcano725. Viaduct726. Vicar727. Viking728. Vampire729. Valley730. VegetablesW730. Weevil731. Wristwatch732. Witch733. Wave734. Wizard735. Wand736. Wheat737. Wall738. Wreck739. Wharf740. Whale741. Walrus742. Whirlpool743. Werewolf744. Wolf745. Wishbone746. Well747. Washerwoman748. Washhouse749. Washing machine750. Wagon751. Whip752. Windmill753. Wombat754. Wallaby755. Weeping willow756. Waterfall757. Weapons758. WaterX757. Xylophone758. Xerophytes759. Xmas tree760. X-ray761. X (sign language)Y762. Yoke763. Yolk764. Yeti765. Yeoman766. Yo-yo767. Yard768. YearZ769. Zulu770. Zodiac771. Zipper772. Zinnia773. Zither774. Zebu775. Zorro776. Zero777. ZebraRelated productsJulia MacRae Books published an Animalia colouring book in 2008.   H. N. Abrams also published a wall calendar colouring book version for children the same year.H. N. Abrams published The Animalia Wall Frieze, a fold-out over 26 feet in length, in which the author created new riddles for each letter.The Great American Puzzle Factory created a 300-piece jigsaw puzzle based on the book's cover.AdaptationsA television series was also created, based on the book, which airs in the United States, Australia, Canada, the United Kingdom, Norway and Venezuela. It also airs on Minimax for the Czech Republic and Slovakia. And recently in Greece on the channel ET1. 
The Australian Children's Television Foundation released a teaching resource DVD-ROM in 2011 to accompany the TV series with teaching aids for classroom use.In 2010, The Base Factory and AppBooks released Animalia as an application for iPad and iPhone/iPod Touch.AwardsAnimalia won the Young Australian's Best Book Award in 1987 for Best Picture Story Book.The Children's Book Council of Australia designated Animalia a 1987 Picture Book of the Year: Honour Book.Kid's Own Australian Literature Awards named Animalia the 1988 Picture Book Winner.ReferencesExternal links Graeme Base's official website A Learning Time activity guide for Animalia created by The Little Big Book ClubAlphabet books1986 children's booksPicture books by Graeme BasePuzzle booksAustralian children's booksPuffin Books books
+International Atomic Time (TAI, from the French name temps atomique international) is a high-precision atomic coordinate time standard based on the notional passage of proper time on Earth's geoid. It is a continuous scale of time, without leap seconds. It is the principal realisation of Terrestrial Time (with a fixed offset of epoch). It is also the basis for Coordinated Universal Time (UTC), which is used for civil timekeeping all over the Earth's surface. UTC deviates from TAI by a number of whole seconds. As of 1 January 2017, when another leap second was put into effect, UTC is exactly 37 seconds behind TAI. The 37 seconds result from the initial difference of 10 seconds at the start of 1972, plus 27 leap seconds in UTC since 1972.TAI may be reported using traditional means of specifying days, carried over from non-uniform time standards based on the rotation of the Earth. Specifically, both Julian days and the Gregorian calendar are used. TAI in this form was synchronised with Universal Time at the beginning of 1958, and the two have drifted apart ever since, due to the changing motion of the Earth.OperationTAI is a weighted average of the time kept by over 400 atomic clocks in over 50 national laboratories worldwide. The majority of the clocks involved are caesium clocks; the International System of Units (SI) definition of the second is based on caesium. The clocks are compared using GPS signals and two-way satellite time and frequency transfer.  Due to the signal averaging TAI is an order of magnitude more stable than its best constituent clock.The participating institutions each broadcast, in real time, a frequency signal with timecodes, which is their estimate of TAI.  Time codes are usually published in the form of UTC, which differs from TAI by a well-known integer number of seconds.  These time scales are denoted in the form UTC(NPL) in the UTC form, where NPL identifies the National Physical Laboratory, UK. The TAI form may be denoted TAI(NPL).  
The latter is not to be confused with TA(NPL), which denotes an independent atomic time scale, not synchronised to TAI or to anything else.The clocks at different institutions are regularly compared against each other.  The International Bureau of Weights and Measures (BIPM, France), combines these measurements to retrospectively calculate the weighted average that forms the most stable time scale possible.  This combined time scale is published monthly in "Circular T", and is the canonical TAI.  This time scale is expressed in the form of tables of differences UTC − UTC(k) (equivalent to TAI − TAI(k)) for each participating institution k.  The same circular also gives tables of TAI − TA(k), for the various unsynchronised atomic time scales.Errors in publication may be corrected by issuing a revision of the faulty Circular T or by errata in a subsequent Circular T.  Aside from this, once published in Circular T, the TAI scale is not revised.  In hindsight, it is possible to discover errors in TAI and to make better estimates of the true proper time scale.  Since the published circulars are definitive, better estimates do not create another version of TAI; it is instead considered to be creating a better realisation of Terrestrial Time (TT).HistoryEarly atomic time scales consisted of quartz clocks with frequencies calibrated by a single atomic clock; the atomic clocks were not operated continuously. Atomic timekeeping services started experimentally in 1955, using the first caesium atomic clock at the National Physical Laboratory, UK (NPL). It was used as a basis for calibrating the quartz clocks at the Royal Greenwich Observatory and to establish a time scale, called Greenwich Atomic (GA). 
The United States Naval Observatory began the A.1 scale on 13 September 1956, using an Atomichron commercial atomic clock, followed by the NBS-A scale at the National Bureau of Standards, Boulder, Colorado on 9 October 1957.The International Time Bureau (BIH) began a time scale, Tm or AM, in July 1955, using both local caesium clocks and comparisons to distant clocks using the phase of VLF radio signals. The BIH scale, A.1, and NBS-A were defined by an epoch at the beginning of 1958. The procedures used by the BIH evolved, and the name for the time scale changed: "A3" in 1964 and "TA(BIH)" in 1969.The SI second was defined in terms of the caesium atom in 1967. From 1971 to 1975 the General Conference on Weights and Measures and the International Committee for Weights and Measures made a series of decisions which designated the BIPM time scale International Atomic Time (TAI).In the 1970s, it became clear that the clocks participating in TAI were ticking at different rates due to gravitational time dilation, and the combined TAI scale, therefore, corresponded to an average of the altitudes of the various clocks. Starting from the Julian Date 2443144.5 (1 January 1977 00:00:00), corrections were applied to the output of all participating clocks, so that TAI would correspond to proper time at the geoid (mean sea level).  Because the clocks were, on average, well above sea level, this meant that TAI slowed by about one part in a trillion. The former uncorrected time scale continues to be published under the name EAL (Échelle Atomique Libre, meaning Free Atomic Scale).The instant that the gravitational correction started to be applied serves as the epoch for Barycentric Coordinate Time (TCB), Geocentric Coordinate Time (TCG), and Terrestrial Time (TT), which represent three fundamental time scales in the solar system.  All three of these time scales were defined to read JD 2443144.5003725 (1 January 1977 00:00:32.184) exactly at that instant. 
TAI was henceforth a realisation of TT, with the equation TT(TAI) = TAI + 32.184 s.The continued existence of TAI was questioned in a 2007 letter from the BIPM to the ITU-R which stated, "In the case of a redefinition of UTC without leap seconds, the CCTF would consider discussing the possibility of suppressing TAI, as it would remain parallel to the continuous UTC."Relation to UTCUTC is a discontinuous time scale. It is occasionally adjusted by leap seconds. Between these adjustments, it is composed of segments that are mapped to atomic time. From its beginning in 1961 through December 1971, the adjustments were made regularly in fractional leap seconds so that UTC approximated UT2.  Afterward, these adjustments were made only in whole seconds to approximate UT1. This was a compromise arrangement in order to enable a publicly broadcast time scale. The less frequent whole-second adjustments meant that the time scale would be more stable and easier to synchronize internationally. The fact that it continues to approximate UT1 means that tasks such as navigation which require a source of Universal Time continue to be well served by the public broadcast of UTC.See also Clock synchronization Network Time Protocol Precision Time Protocol Time and frequency transferNotesReferencesFootnotesBibliographyExternal links Bureau International des Poids et Mesures: TAI Time and Frequency Section - National Physical Laboratory, UK IERS website  NIST Web Clock FAQs History of time scales NIST-F1 Cesium Fountain Atomic Clock  Japan Standard Time Project, NICT, Japan  Standard of time definition: UTC, GPS, LORAN and TAITime scales
+Altruism is the principle and moral practice of concern for happiness of other human beings or other animals, resulting in a quality of life both material and spiritual. It is a traditional virtue in many cultures and a core aspect of various religious and secular worldviews. However, the object(s) of concern vary among cultures and religions. In an extreme case, altruism may become a synonym of selflessness, which is the opposite of selfishness.The word "altruism" was popularized (and possibly coined) by the French philosopher Auguste Comte in French, as altruisme, for an antonym of egoism. He derived it from the Italian altrui, which in turn was derived from Latin alteri, meaning "other people" or "somebody else".Altruism in biological observations in field populations of the day organisms is an individual performing an action which is at a cost to themselves (e.g., pleasure and quality of life, time, probability of survival or reproduction), but benefits, either directly or indirectly, another individual, without the expectation of reciprocity or compensation for that action. Steinberg suggests a definition for altruism in the clinical setting, that is "intentional and voluntary actions that aim to enhance the welfare of another person in the absence of any quid pro quo external rewards". In one sense, the opposite of altruism is spite; a spiteful action harms another with no self-benefit.Altruism can be distinguished from feelings of loyalty or concern for the common good.  The latter are predicated upon social relationships, whilst altruism does not consider relationships. Much debate exists as to whether "true" altruism is possible in human psychology. The theory of psychological egoism suggests that no act of sharing, helping or sacrificing can be described as truly altruistic, as the actor may receive an intrinsic reward in the form of personal gratification. 
The validity of this argument depends on whether intrinsic rewards qualify as "benefits".The term altruism may also refer to an ethical doctrine that claims that individuals are morally obliged to benefit others. Used in this sense, it is usually contrasted with egoism, which claims individuals are morally obligated to serve themselves first. Effective altruism is the use of evidence and reason to determine the most effective ways to benefit others.The notion of altruismThe concept has a long history in philosophical and ethical thought. The term was originally coined in the 19th century by the founding sociologist and philosopher of science, Auguste Comte, and has become a major topic for psychologists (especially evolutionary psychology researchers), evolutionary biologists, and ethologists. Whilst ideas about altruism from one field can affect the other fields, the different methods and focuses of these fields always lead to different perspectives on altruism. In simple terms, altruism is caring about the welfare of other people and acting to help them.Scientific viewpointsAnthropologyMarcel Mauss's essay The Gift contains a passage called "Note on alms". This note describes the evolution of the notion of alms (and by extension of altruism) from the notion of sacrifice. In it, he writes:Alms are the fruits of a moral notion of the gift and of fortune on the one hand, and of a notion of sacrifice, on the other. Generosity is an obligation, because Nemesis avenges the poor and the gods for the superabundance of happiness and wealth of certain people who should rid themselves of it. This is the ancient morality of the gift, which has become a principle of justice. 
The gods and the spirits accept that the share of wealth and happiness that has been offered to them and had been hitherto destroyed in useless sacrifices should serve the poor and children.Evolutionary explanationsIn the science of ethology (the study of animal behaviour), and more generally in the study of social evolution, altruism refers to behaviour by an individual that increases the fitness of another individual while decreasing the fitness of the actor. In evolutionary psychology this may be applied to a wide range of human behaviors such as charity, emergency aid, help to coalition partners, tipping, courtship gifts, production of public goods, and environmentalism.Theories of apparently altruistic behavior were accelerated by the need to produce theories compatible with evolutionary origins. Two related strands of research on altruism have emerged from traditional evolutionary analyses and from evolutionary game theory a mathematical model and analysis of behavioural strategies.Some of the proposed mechanisms are:  Kin selection. That animals and humans are more altruistic towards close kin than to distant kin and non-kin has been confirmed in numerous studies across many different cultures. Even subtle cues indicating kinship may unconsciously increase altruistic behavior. One kinship cue is facial resemblance. One study found that slightly altering photographs so that they more closely resembled the faces of study participants increased the trust the participants expressed regarding depicted persons. Another cue is having the same family name, especially if rare, and this has been found to increase helpful behavior. Another study found more cooperative behavior the greater the number of perceived kin in a group. Using kinship terms in political speeches increased audience agreement with the speaker in one study. This effect was especially strong for firstborns, who are typically close to their families. Vested interests. 
People are likely to suffer if their friends, allies, and similar social ingroups suffer or even disappear. Helping such group members may therefore eventually benefit the altruist. Making ingroup membership more noticeable increases cooperativeness. Extreme self-sacrifice towards the ingroup may be adaptive if a hostile outgroup threatens to kill the entire ingroup. Reciprocal altruism. See also Reciprocity (evolution). Direct reciprocity. Research shows that it can be beneficial to help others if there is a chance that they can and will reciprocate the help. The effective tit for tat strategy is one game theoretic example. Many people seem to be following a similar strategy by cooperating if and only if others cooperate in return.One consequence is that people are more cooperative if it is more likely that individuals will interact again in the future. People tend to be less cooperative if they perceive that the frequency of helpers in the population is lower. They tend to help less if they see non-cooperativeness by others and this effect tends to be stronger than the opposite effect of seeing cooperative behaviors. Simply changing the cooperative framing of a proposal may increase cooperativeness such as calling it a "Community Game" instead of a "Wall Street Game".A tendency towards reciprocity implies that people will feel obligated to respond if someone helps them. This has been used by charities that give small gifts to potential donors hoping thereby to induce reciprocity. Another method is to announce publicly that someone has given a large donation. The tendency to reciprocate can even generalize so people become more helpful toward others in general after being helped. On the other hand, people will avoid or even retaliate against those perceived not to be cooperating. People sometimes mistakenly fail to help when they intended to, or their helping may not be noticed, which may cause unintended conflicts. 
As such, it may be an optimal strategy to be slightly forgiving of and have a slightly generous interpretation of non-cooperation.People are more likely to cooperate on a task if they can communicate with one another first. This may be due to better assessments of cooperativeness or due to exchange of promises. They are more cooperative if they can gradually build trust, instead of being asked to give extensive help immediately. Direct reciprocity and cooperation in a group can be increased by changing the focus and incentives from intra-group competition to larger scale competitions such as between groups or against the general population. Thus, giving grades and promotions based only on an individual's performance relative to a small local group, as is common, may reduce cooperative behaviors in the group. Indirect reciprocity. The avoidance of poor reciprocators and cheaters causes a person's reputation to become very important. A person with a good reputation for reciprocity has a higher chance of receiving help even from persons they have had no direct interactions with previously. Strong reciprocity. A form of reciprocity where some individuals seem to spend more resources on cooperating and punishing than would be most beneficial as predicted by several established theories of altruism. A number of theories have been proposed as explanations as well as criticisms regarding its existence. Pseudo-reciprocity. An organism behaves altruistically and the recipient does not reciprocate but has an increased chance of acting in a way that is selfish but also as a byproduct benefits the altruist. Costly signaling and the handicap principle. Since altruism takes away resources from the altruist it can be an "honest signal" of resource availability and the abilities needed to gather resources. This may signal to others that the altruist is a valuable potential partner. 
It may also be a signal of interactive and cooperative intentions since those not interacting further in the future gain nothing from the costly signaling. It is unclear if costly signaling can indicate a long-term cooperative personality but people have increased trust for those who help. Costly signaling is pointless if everyone has the same traits, resources, and cooperative intentions but becomes a potentially more important signal if the population increasingly varies on these characteristics.Hunters widely sharing the meat has been seen as a costly signal of ability and research has found that good hunters have higher reproductive success and more adulterous relations even if they themselves receive no more of the hunted meat than anyone else. Similarly, holding large feasts and giving large donations has been seen as ways of demonstrating one's resources. Heroic risk-taking has also been interpreted as a costly signal of ability.Both indirect reciprocity and costly signaling depend on the value of reputation and tend to make similar predictions. One is that people will be more helping when they know that their helping behavior will be communicated to people they will interact with later, is publicly announced, is discussed, or is simply being observed by someone else. This has been documented in many studies. The effect is sensitive to subtle cues such as people being more helpful when there were stylized eyespots instead of a logo on a computer screen. Weak reputational cues such as eyespots may become unimportant if there are stronger cues present and may lose their effect with continued exposure unless reinforced with real reputational effects. Public displays such as public weeping for dead celebrities and participation in demonstrations may be influenced by a desire to be seen as altruistic. 
People who know that they are publicly monitored sometimes even wastefully donate money they know is not needed by the recipient, which may be because of reputational concerns.Women have been found to find altruistic men to be attractive partners. When looking for a long-term partner, altruism may be a preferred trait as it may indicate that he is also willing to share resources with her and her children. It has been shown that men perform altruistic acts in the early stages of a romantic relationship or simply when in the presence of an attractive woman. While both sexes state that kindness is the most preferable trait in a partner there is some evidence that men place less value on this than women and that women may not be more altruistic in presence of an attractive man. Men may even avoid altruistic women in short-term relationships which may be because they expect less success.People may compete for social benefit from a burnished reputation, which may cause competitive altruism. On the other hand, in some experiments a proportion of people do not seem to care about reputation and they do not help more even if this is conspicuous. This may possibly be due to reasons such as psychopathy or that they are so attractive that they need not be seen to be altruistic. The reputational benefits of altruism occur in the future as compared to the immediate costs of altruism in the present. While humans and other organisms generally place less value on future costs/benefits as compared to those in the present, some have shorter time horizons than others and these people tend to be less cooperative.Explicit extrinsic rewards and punishments have been found to sometimes actually have the opposite effect on behaviors compared to intrinsic rewards. 
This may be because such extrinsic, top-down incentives may replace (partially or in whole) intrinsic and reputational incentives, motivating the person to focus on obtaining the extrinsic rewards, which overall may make the behaviors less desirable. Another effect is that people would like altruism to be due to a personality characteristic rather than due to overt reputational concerns and simply pointing out that there are reputational benefits of an action may actually reduce them. This may possibly be used as a derogatory tactic against altruists, especially by those who are non-cooperators. A counterargument is that doing good due to reputational concerns is better than doing no good at all. Group selection. It has controversially been argued by some evolutionary scientists such as David Sloan Wilson that natural selection can act at the level of non-kin groups to produce adaptations that benefit a non-kin group even if these adaptations are detrimental at the individual level. Thus, while altruistic persons may under some circumstances be outcompeted by less altruistic persons at the individual level, according to group selection theory the opposite may occur at the group level where groups consisting of the more altruistic persons may outcompete groups consisting of the less altruistic persons. Such altruism may only extend to ingroup members while there may instead be prejudice and antagonism against outgroup members (See also in-group favoritism). Group selection theory has been criticized by many other evolutionary scientists.Such explanations do not imply that humans are always consciously calculating how to increase their inclusive fitness when they are doing altruistic acts. 
Instead, evolution has shaped psychological mechanisms, such as emotions, that promote altruistic behaviors.Every single instance of altruistic behavior need not always increase inclusive fitness; altruistic behaviors would have been selected for if such behaviors on average increased inclusive fitness in the ancestral environment. This need not imply that on average 50% or more of altruistic acts were beneficial for the altruist in the ancestral environment; if the benefits from helping the right person were very high it would be beneficial to err on the side of caution and usually be altruistic even if in most cases there were no benefits.The benefits for the altruist may be increased and the costs reduced by being more altruistic towards certain groups. Research has found that people are more altruistic to kin than to non-kin, to friends than to strangers, to those attractive than to those unattractive, to non-competitors than to competitors, and to members of ingroups than to members of outgroups.The study of altruism was the initial impetus behind George R. Price's development of the Price equation, which is a mathematical equation used to study genetic evolution. An interesting example of altruism is found in the cellular slime moulds, such as Dictyostelium mucoroides. 
These protists live as individual amoebae until starved, at which point they aggregate and form a multicellular fruiting body in which some cells sacrifice themselves to promote the survival of other cells in the fruiting body.Selective investment theory proposes that close social bonds, and associated emotional, cognitive, and neurohormonal mechanisms, evolved in order to facilitate long-term, high-cost altruism between those closely depending on one another for survival and reproductive success.Such cooperative behaviors have sometimes been seen as arguments for left-wing politics such as by the Russian zoologist and anarchist Peter Kropotkin in his 1902 book Mutual Aid: A Factor of Evolution and Moral Philosopher Peter Singer in his book A Darwinian Left.NeurobiologyJorge Moll and Jordan Grafman, neuroscientists at the National Institutes of Health and LABS-D'Or Hospital Network (J.M.) provided the first evidence for the neural bases of altruistic giving in normal healthy volunteers, using functional magnetic resonance imaging. In their research, published in the Proceedings of the National Academy of Sciences USA in October 2006, they showed that both pure monetary rewards and charitable donations activated the mesolimbic reward pathway, a primitive part of the brain that usually responds to food and sex. However, when volunteers generously placed the interests of others before their own by making charitable donations, another brain circuit was selectively activated: the subgenual cortex/septal region. These structures are intimately related to social attachment and bonding in other species. Altruism, the experiment suggested, was not a superior moral faculty that suppresses basic selfish urges but rather was basic to the brain, hard-wired and pleasurable. One brain region, the subgenual anterior cingulate cortex/basal forebrain, contributes to learning altruistic behavior, especially in those with trait empathy. 
The same study has shown a connection between giving to charity and the promotion of social bonding.In fact, in an experiment published in March 2007 at the University of Southern California neuroscientist Antonio R. Damasio and his colleagues showed that subjects with damage to the ventromedial prefrontal cortex lack the ability to empathically feel their way to moral answers, and that when confronted with moral dilemmas, these brain-damaged patients coldly came up with "end-justifies-the-means" answers, leading Damasio to conclude that the point was not that they reached immoral conclusions, but that when they were confronted by a difficult issue – in this case as whether to shoot down a passenger plane hijacked by terrorists before it hits a major city – these patients appear to reach decisions without the anguish that afflicts those with normally functioning brains. According to Adrian Raine, a clinical neuroscientist also at the University of Southern California, one of this study's implications is that society may have to rethink how it judges immoral people: "Psychopaths often feel no empathy or remorse. Without that awareness, people relying exclusively on reasoning seem to find it harder to sort their way through moral thickets. Does that mean they should be held to different standards of accountability?"In another study, in the 1990s, Dr. Bill Harbaugh, a University of Oregon economist, concluded people are motivated to give for reasons of personal prestige and in a similar fMRI scanner test in 2007 with his psychologist colleague Dr. Ulrich Mayr, reached the same conclusions of Jorge Moll and Jordan Grafman about giving to charity, although they were able to divide the study group into two groups: "egoists" and "altruists". 
One of their discoveries was that, though rarely, even some of the considered "egoists" sometimes gave more than expected because that would help others, leading to the conclusion that there are other factors in cause in charity, such as a person's environment and values.PsychologyThe International Encyclopedia of the Social Sciences defines psychological altruism as "a motivational state with the goal of increasing another's welfare". Psychological altruism is contrasted with psychological egoism, which refers to the motivation to increase one's own welfare.There has been some debate on whether or not humans are truly capable of psychological altruism. Some definitions specify a self-sacrificial nature to altruism and a lack of external rewards for altruistic behaviors. However, because altruism ultimately benefits the self in many cases, the selflessness of altruistic acts is brought to question. The social exchange theory postulates that altruism only exists when benefits to the self outweigh costs to the self. Daniel Batson is a psychologist who examined this question and argues against the social exchange theory. He identified four major motives: to ultimately benefit the self (egoism), to ultimately benefit the other person (altruism), to benefit a group (collectivism), or to uphold a moral principle (principlism). Altruism that ultimately serves selfish gains is thus differentiated from selfless altruism, but the general conclusion has been that empathy-induced altruism can be genuinely selfless. The empathy-altruism hypothesis basically states that psychological altruism does exist and is evoked by the empathic desire to help someone who is suffering. Feelings of empathic concern are contrasted with feelings of personal distress, which compel people to reduce their own unpleasant emotions and increase their own positive ones through helping someone in need. 
Empathy is thus not selfless, since altruism works either as the way to avoid those negative, unpleasant feelings and have positive, pleasant feelings triggered by others' need for help, or as the way to incentivize the gain of social reward or through fear to avoid social punishment by helping. People with empathic concern help others in distress even when exposure to the situation could be easily avoided, whereas those lacking in empathic concern avoid helping unless it is difficult or impossible to avoid exposure to another's suffering. Helping behavior is seen in humans at about two years old, when a toddler is capable of understanding subtle emotional cues.In psychological research on altruism, studies often observe altruism as demonstrated through prosocial behaviors such as helping, comforting, sharing, cooperation, philanthropy, and community service. Research has found that people are most likely to help if they recognize that a person is in need and feel personal responsibility for reducing the person's distress. Research also suggests that the number of bystanders witnessing distress or suffering affects the likelihood of helping (the Bystander effect). Greater numbers of bystanders decrease individual feelings of responsibility. However, a witness with a high level of empathic concern is likely to assume personal responsibility entirely regardless of the number of bystanders.Many studies have observed the effects of volunteerism (as a form of altruism) on happiness and health and have consistently found a strong connection between volunteerism and current and future health and well-being. In a study of older adults, those who volunteered were higher on life satisfaction and will to live, and lower in depression, anxiety, and somatization. Volunteerism and helping behavior have not only been shown to improve mental health, but physical health and longevity as well, attributable to the activity and social integration it encourages. 
One study examined the physical health of mothers who volunteered over a 30-year period and found that 52% of those who did not belong to a volunteer organization experienced a major illness while only 36% of those who did volunteer experienced one. A study on adults ages 55+ found that during the four-year study period, people who volunteered for two or more organizations had a 63% lower likelihood of dying. After controlling for prior health status, it was determined that volunteerism accounted for a 44% reduction in mortality. Merely being aware of kindness in oneself and others is also associated with greater well-being. A study that asked participants to count each act of kindness they performed for one week significantly enhanced their subjective happiness. It is important to note that, while research supports the idea that altruistic acts bring about happiness, it has also been found to work in the opposite direction—that happier people are also kinder. The relationship between altruistic behavior and happiness is bidirectional. Studies have found that generosity increases linearly from sad to happy affective states.Studies have also been careful to note that feeling over-taxed by the needs of others has conversely negative effects on health and happiness. For example, one study on volunteerism found that feeling overwhelmed by others' demands had an even stronger negative effect on mental health than helping had a positive one (although positive effects were still significant). Additionally, while generous acts make people feel good about themselves, it is also important for people to appreciate the kindness they receive from others. Studies suggest that gratitude goes hand-in-hand with kindness and is also very important for our well-being. 
A study on the relationship of happiness to various character strengths showed that "a conscious focus on gratitude led to reductions in negative affect and increases in optimistic appraisals, positive affect, offering emotional support, sleep quality, and well-being".Pathological altruismPathological altruism is when altruism is taken to an unhealthy extreme, and either harms the altruistic person, or well-intentioned actions cause more harm than good.The term "pathological altruism" was popularised by the book Pathological Altruism.Examples include depression and burnout seen in healthcare professionals, an unhealthy focus on others to the detriment of one's own needs, hoarding of animals, and ineffective philanthropic and social programs that ultimately worsen the situations they are meant to aid.Sociology"Sociologists have long been concerned with how to build the good society" ("Altruism, Morality, and Social Solidarity". American Sociological Association.). The structure of our societies and how individuals come to exhibit charitable, philanthropic, and other pro-social, altruistic actions for the common good is a largely researched topic within the field. The American Sociological Association (ASA) acknowledges public sociology saying, "The intrinsic scientific, policy, and public relevance of this field of investigation in helping to construct 'good societies' is unquestionable" ("Altruism, Morality, and Social Solidarity" ASA). This type of sociology seeks contributions that aid grassroots and theoretical understandings of what motivates altruism and how it is organized, and promotes an altruistic focus in order to benefit the world and people it studies. How altruism is framed, organized, carried out, and what motivates it at the group level is an area of focus that sociologists seek to investigate in order to contribute back to the groups it studies and "build the good society". 
The motivation of altruism is also the focus of study; some publications link the occurrence of moral outrage to the punishment of perpetrators and compensation of victims. Studies have shown that generosity in laboratory and in online experiments is contagious – people imitate observed generosity of others.Religious viewpointsMost, if not all, of the world's religions promote altruism as a very important moral value. Buddhism, Christianity, Hinduism, Islam, Jainism, Judaism, and Sikhism, etc., place particular emphasis on altruistic morality.BuddhismAltruism figures prominently in Buddhism. Love and compassion are components of all forms of Buddhism, and are focused on all beings equally: love is the wish that all beings be happy, and compassion is the wish that all beings be free from suffering. "Many illnesses can be cured by the one medicine of love and compassion. These qualities are the ultimate source of human happiness, and the need for them lies at the very core of our being" (Dalai Lama).Still, the notion of altruism is modified in such a world-view, since the belief is that such a practice promotes our own happiness: "The more we care for the happiness of others, the greater our own sense of well-being becomes" (Dalai Lama).In the context of larger ethical discussions on moral action and judgment, Buddhism is characterized by the belief that negative (unhappy) consequences of our actions derive not from punishment or correction based on moral judgment, but from the law of karma, which functions like a natural law of cause and effect. A simple illustration of such cause and effect is the case of experiencing the effects of what one causes: if one causes suffering, then as a natural consequence one would experience suffering; if one causes happiness, then as a natural consequence one would experience happiness.JainismThe fundamental principles of Jainism revolve around the concept of altruism, not only for humans but for all sentient beings. 
Jainism preaches the view of Ahimsa – to live and let live, thereby not harming sentient beings, i.e. uncompromising reverence for all life. It also considers all living things to be equal. The first Tirthankara, Rishabhdev, introduced the concept of altruism for all living beings, from extending knowledge and experience to others to donation, giving oneself up for others, non-violence and compassion for all living things.Jainism prescribes a path of non-violence to progress the soul to this ultimate goal. A major characteristic of Jain belief is the emphasis on the consequences of not only physical but also mental behaviors. One's unconquered mind with anger, pride (ego), deceit, greed and uncontrolled sense organs are the powerful enemies of humans. Anger spoils good relations, pride destroys humility, deceit destroys peace and greed destroys everything. Jainism recommends conquering anger by forgiveness, pride by humility, deceit by straightforwardness and greed by contentment.Jains believe that to attain enlightenment and ultimately liberation, one must practice the following ethical principles (major vows) in thought, speech and action. The degree to which these principles are practiced is different for householders and monks. They are: Non-violence (Ahimsa); Truthfulness (Satya); Non-stealing (Asteya); Celibacy (Brahmacharya); Non-possession or non-materialism (Aparigraha);The "great vows" (Mahavrata) are prescribed for monks and "limited vows" (Anuvrata) are prescribed for householders. The house-holders are encouraged to practice the above-mentioned five vows. The monks have to observe them very strictly. With consistent practice, it will be possible to overcome the limitations gradually, accelerating the spiritual progress.The principle of nonviolence seeks to minimize karmas which limit the capabilities of the soul. Jainism views every soul as worthy of respect because it has the potential to become Siddha (God in Jainism). 
Because all living beings possess a soul, great care and awareness is essential in one's actions. Jainism emphasizes the equality of all life, advocating harmlessness towards all, whether the creatures are great or small. This policy extends even to microscopic organisms. Jainism acknowledges that every person has different capabilities and capacities to practice and therefore accepts different levels of compliance for ascetics and householders.ChristianitySt Thomas Aquinas interprets 'You should love your neighbour as yourself' as meaning that love for ourselves is the exemplar of love for others. Considering that "the love with which a man loves himself is the form and root of friendship" and quotes Aristotle that "the origin of friendly relations with others lies in our relations to ourselves", he concluded that though we are not bound to love others more than ourselves, we naturally seek the common good, the good of the whole, more than any private good, the good of a part. However, he thinks we should love God more than ourselves and our neighbours, and more than our bodily life—since the ultimate purpose of loving our neighbour is to share in eternal beatitude: a more desirable thing than bodily well-being. In coining the word Altruism, as stated above, Comte was probably opposing this Thomistic doctrine, which is present in some theological schools within Catholicism.Many biblical authors draw a strong connection between love of others and love of God. 1 John 4 states that for one to love God one must love his fellowman, and that hatred of one's fellowman is the same as hatred of God. Thomas Jay Oord has argued in several books that altruism is but one possible form of love. An altruistic action is not always a loving action. 
Oord defines altruism as acting for the other's good, and he agrees with feminists who note that sometimes love requires acting for one's own good when the other's demands undermine overall well-being.German philosopher Max Scheler distinguishes two ways in which the strong can help the weak. One way is a sincere expression of Christian love, "motivated by a powerful feeling of security, strength, and inner salvation, of the invincible fullness of one's own life and existence". Another way is merely "one of the many modern substitutes for love, ... nothing but the urge to turn away from oneself and to lose oneself in other people's business". At its worst, Scheler says, "love for the small, the poor, the weak, and the oppressed is really disguised hatred, repressed envy, an impulse to detract, etc., directed against the opposite phenomena: wealth, strength, power, largesse."IslamIn Islam, the concept "īthār" (إيثار) (altruism) is the notion of "preferring others to oneself". For Sufis, this means devotion to others through complete forgetfulness of one's own concerns, where concern for others is deemed as a demand made by Allah (i.e. God) on the human body, considered to be property of Allah alone. The importance of īthār lies in sacrifice for the sake of the greater good; Islam considers those practicing īthār as abiding by the highest degree of nobility.This is similar to the notion of chivalry, but unlike that European concept, in īthār attention is focused on everything in existence. A constant concern for Allah results in a careful attitude towards people, animals, and other things in this world.JudaismJudaism defines altruism as the desired goal of creation. The famous Rabbi Abraham Isaac Kook stated that love is the most important attribute in humanity. This is defined as bestowal, or giving, which is the intention of altruism. This can be altruism towards humanity that leads to altruism towards the creator or God. 
Kabbalah defines God as the force of giving in existence. Rabbi Moshe Chaim Luzzatto in particular focused on the 'purpose of creation' and how the will of God was to bring creation into perfection and adhesion with this upper force.Modern Kabbalah developed by Rabbi Yehuda Ashlag, in his writings about the future generation, focuses on how society could achieve an altruistic social framework. Ashlag proposed that such a framework is the purpose of creation, and everything that happens is to raise humanity to the level of altruism, love for one another. Ashlag focused on society and its relation to divinity.SikhismAltruism is essential to the Sikh religion. The central faith in Sikhism is that the greatest deed any one can do is to imbibe and live the godly qualities like love, affection, sacrifice, patience, harmony, truthfulness. The concept of seva, or selfless service to the community for its own sake, is an important concept in Sikhism.The fifth Guru, Arjun Dev, sacrificed his life to uphold "22 carats of pure truth, the greatest gift to humanity", the Guru Granth. The ninth Guru, Tegh Bahadur, sacrificed his head to protect weak and defenseless people against atrocity.In the late seventeenth century, Guru Gobind Singh (the tenth Guru in Sikhism), was at war with the Mughal rulers to protect the people of different faiths when a fellow Sikh, Bhai Kanhaiya, attended the troops of the enemy. He gave water to both friends and foes who were wounded on the battlefield. Some of the enemy began to fight again and some Sikh warriors were annoyed by Bhai Kanhaiya as he was helping their enemy. Sikh soldiers brought Bhai Kanhaiya before Guru Gobind Singh, and complained of his action that they considered counterproductive to their struggle on the battlefield. "What were you doing, and why?" asked the Guru. "I was giving water to the wounded because I saw your face in all of them", replied Bhai Kanhaiya. 
The Guru responded, "Then you should also give them ointment to heal their wounds. You were practicing what you were coached in the house of the Guru."Under the tutelage of the Guru, Bhai Kanhaiya subsequently founded a volunteer corps for altruism, which is still engaged today in doing good to others and in training new recruits for this service.HinduismIn Hinduism Selflessness (Atmatyag), Love (Prema), Kindness (Daya) and Forgiveness (Kshama) are considered as the highest acts of humanity or "Manushyattva". Giving alms to the beggars or poor people is considered as a divine act or "Punya" and Hindus believe it will free their souls from guilt or "Paapa" and will lead them to heaven or "Swarga" in afterlife. Altruism is also the central act of various Hindu mythology and religious poems and songs.The founder of warkari sampradaya the great saint "Dhnyaneshwar Maharaj" (1275-1296) in his "Pasaydan" prays to the supreme lord "Vitthal" for the wellbeing of all living organisms of the universe.Swami Vivekananda, the legendary Hindu monk, has said -"Jive prem kare jeijon, Seijon sebiche Iswar" (Whoever loves any living being, is serving god.). Mass donation of clothes to poor people (Vastraseva), or blood donation camp or mass food donation (Annaseva) for poor people is common in various Hindu religious ceremonies.Swami Sivananda, an Advaita scholar, reiterates the views in his commentary synthesising Vedanta views on the Brahma Sutras, a Vedantic text. In his commentary on Chapter 3 of the Brahma Sutras, Sivananda notes that karma is insentient and short-lived, and ceases to exist as soon as a deed is executed. Hence, karma cannot bestow the fruits of actions at a future date according to one's merit. Furthermore, one cannot argue that karma generates apurva or punya, which gives fruit. Since apurva is non-sentient, it cannot act unless moved by an intelligent being such as a god. 
It cannot independently bestow reward or punishment.However, the very well known and popular text, the Bhagavad Gita supports the doctrine of karma yoga (achieving oneness with God through action) & "Nishkam Karma" or action without expectation / desire for personal gain which can be said to encompass altruism. Altruistic acts are generally celebrated and very well received in Hindu literature and are central to Hindu morality.PhilosophyThere exists a wide range of philosophical views on humans' obligations or motivations to act altruistically. Proponents of ethical altruism maintain that individuals are morally obligated to act altruistically. The opposing view is ethical egoism, which maintains that moral agents should always act in their own self-interest. Both ethical altruism and ethical egoism contrast with utilitarianism, which maintains that each agent should act in order to maximise the efficacy of their function and the benefit to both themselves and their co-inhabitants.A related concept in descriptive ethics is psychological egoism, the thesis that humans always act in their own self-interest and that true altruism is impossible. Rational egoism is the view that rationality consists in acting in one's self-interest (without specifying how this affects one's moral obligations).Effective altruismEffective altruism is a philosophy and social movement that uses evidence and reasoning to determine the most effective ways to benefit others. Effective altruism encourages individuals to consider all causes and actions and to act in the way that brings about the greatest positive impact, based upon their values. It is the broad, evidence-based and cause-neutral approach that distinguishes effective altruism from traditional altruism or charity. 
Effective altruism is part of the larger movement towards evidence-based practices.While a substantial proportion of effective altruists have focused on the nonprofit sector, the philosophy of effective altruism applies more broadly to prioritizing the scientific projects, companies, and policy initiatives which can be estimated to save lives, help people, or otherwise have the biggest benefit. People associated with the movement include philosopher Peter Singer, Facebook co-founder Dustin Moskovitz, Cari Tuna, Ben Delo, Oxford-based researchers William MacAskill and Toby Ord, and professional poker player Liv Boeree.GeneticsThe genes OXTR, CD38, COMT, DRD4, DRD5, IGF2, and GABRB2 have been found to be candidate genes for altruism.Digital AltruismDigital Altruism is the notion that some are willing to freely share information based on the principle of reciprocity and in the belief that in the end, everyone benefits from sharing information via the Internet.This term was coined by Dr. Dana Klisanin, the founder and CEO of Evolutionary Guidance Media R&D Inc. and a recipient of the Early Career Award for Scientific Achievement in Media Psychology from the American Psychological Association's Division of Media Psychology.According to Klisanin, "the notion that 'some are willing to freely reveal what they know' is interesting".Types of Digital AltruismThere are three types of digital altruism: (1) "everyday digital altruism," involving expedience, ease, moral engagement, and conformity; (2) "creative digital altruism," involving creativity, heightened moral engagement, and cooperation; and (3) "co-creative digital altruism" involving creativity, moral engagement, and meta cooperative efforts.See also Altruria, California Charitable organization Comedy of the commons Consideration Egotism Family economics Golden Rule Gene-centered view of evolution Humanity (virtue) Misanthropy Mutual aid Non nobis solum Prisoner's dilemma Random act of kindness Social preferences 
Social psychology Solidarity (sociology) Spite (game theory)NotesReferences      Comte, Auguste, Catechisme positiviste (1852) or Catechism of Positivism, tr. R. Congreve, (London: Kegan Paul, 1891)  Kropotkin, Peter, Mutual Aid: A Factor of Evolution (1902)  Nietzsche, Friedrich, Beyond Good and Evil Pierre-Joseph Proudhon, The Philosophy of Poverty (1847) Lysander Spooner, Natural Law Matt Ridley, The Origins of Virtue Oliner, Samuel P. and Pearl M. Towards a Caring Society: Ideas into Action. West Port, CT: Praeger, 1995.External linksRichard Kraut (2016) Altruism Stanford Encyclopedia of Philosophy Auguste ComteDefence mechanismsMoralityMoral psychologyPhilanthropySocial philosophyInterpersonal relationshipsVirtue
+Alice O'Connor (born Alisa Zinovyevna Rosenbaum; , 1905 – March 6, 1982), better known by her pen name Ayn Rand (), was a Russian-born American writer and philosopher. She is known for her fiction and for developing a philosophical system she named Objectivism. Born and educated in Russia, she moved to the United States in 1926. She wrote a play that opened on Broadway in 1935. After two early novels that were initially unsuccessful, she achieved fame with her 1943 novel, The Fountainhead. In 1957, Rand published her best-known work, the novel Atlas Shrugged. Afterward, until her death in 1982, she turned to non-fiction to promote her philosophy, publishing her own periodicals and releasing several collections of essays.Rand advocated reason as the only means of acquiring knowledge; she rejected faith and religion. She supported rational and ethical egoism and rejected altruism. In politics, she condemned the initiation of force as immoral and opposed collectivism, statism, and anarchism. Instead, she supported laissez-faire capitalism, which she defined as the system based on recognizing individual rights, including private property rights. Although Rand opposed libertarianism, which she viewed as anarchism, she is often associated with the modern libertarian movement in the United States. In art, Rand promoted romantic realism. She was sharply critical of most philosophers and philosophical traditions known to her, except for Aristotle, Thomas Aquinas, and classical liberals.Rand's fiction received mixed reviews from literary critics. Although academic interest in her ideas has grown since her death, academic philosophers have generally ignored or rejected her philosophy because of her polemical approach and lack of methodological rigor. Her writings have politically influenced some libertarians and conservatives. 
The Objectivist movement attempts to spread her ideas, both to the public and in academic settings.LifeEarly lifeRand was born Alisa Zinovyevna Rosenbaum on February 2, 1905, to a Russian-Jewish bourgeois family living in Saint Petersburg. She was the eldest of three daughters of Zinovy Zakharovich Rosenbaum, a pharmacist, and Anna Borisovna (née Kaplan). Rand later said she found school unchallenging and began writing screenplays at age eight and novels at age ten. At the prestigious , her closest friend was Vladimir Nabokov's younger sister, Olga; the pair shared an intense interest in politics.She was twelve at the time of the February Revolution of 1917, during which Rand favored Alexander Kerensky over Tsar Nicholas II. The subsequent October Revolution and the rule of the Bolsheviks under Vladimir Lenin disrupted the life the family had enjoyed previously. Her father's business was confiscated, and the family fled to the Crimean Peninsula, which was initially under the control of the White Army during the Russian Civil War. While in high school there, Rand concluded she was an atheist and valued reason above any other virtue. After graduating in June 1921, she returned with her family to Petrograd (as Saint Petersburg was then named), where they faced desperate conditions, occasionally nearly starving.Following the Russian Revolution, universities were opened to women, allowing her to be in the first group of women to enroll at Petrograd State University. At 16, she began her studies in the department of social pedagogy, majoring in history. At the university, she was introduced to the writings of Aristotle and Plato; Rand came to see their differing views on reality and knowledge as the primary conflict within philosophy. She also studied the philosophical works of Friedrich Nietzsche.Along with many other bourgeois students, she was purged from the university shortly before graduating. 
After complaints from a group of visiting foreign scientists, many of the purged students were allowed to complete their work and graduate, which she did in October 1924. She then studied for a year at the State Technicum for Screen Arts in Leningrad. For an assignment, Rand wrote an essay about the Polish actress Pola Negri, which became her first published work. By this time, she had decided her professional surname for writing would be Rand, possibly because it is graphically similar to a vowelless excerpt of her birth surname in Cyrillic. She adopted the first name Ayn.Arrival in the United StatesIn late 1925, Rand was granted a visa to visit relatives in Chicago. She departed on January 17, 1926. Arriving in New York City on February 19, 1926, Rand was so impressed with the Manhattan skyline that she cried what she later called "tears of splendor". Intent on staying in the United States to become a screenwriter, she lived for a few months with her relatives. One of them owned a movie theater and allowed her to watch dozens of films free of charge. She then left for Hollywood, California. In Hollywood, a chance meeting with famed director Cecil B. DeMille led to work as an extra in his film The King of Kings and a subsequent job as a junior screenwriter. While working on The King of Kings, she met an aspiring young actor, Frank O'Connor; the two married on April 15, 1929. She became a permanent American resident in July 1929 and an American citizen on March 3, 1931. She made several attempts to bring her parents and sisters to the United States, but they were unable to obtain permission to emigrate. During these early years of her career, Rand wrote a number of screenplays, plays, and short stories that were not produced or published during her lifetime; some were published later in The Early Ayn Rand.Early fictionAlthough it was never produced, Rand's first literary success came with the sale of her screenplay Red Pawn to Universal Studios in 1932.  
Her courtroom drama Night of January 16th, first produced by E. E. Clive in Hollywood in 1934, reopened successfully on Broadway in 1935. Each night, a jury was selected from members of the audience; based on its vote, one of two different endings would be performed. Her first published novel, the semi-autobiographical We the Living, was published in 1936. Set in Soviet Russia, it focused on the struggle between the individual and the state. Initial sales were slow, and the American publisher let it go out of print, although European editions continued to sell. She adapted the story as a stage play, but producer George Abbott's Broadway production was a failure and closed in less than a week. After the success of her later novels, Rand was able to release a revised version in 1959 that has since sold over three million copies. In a foreword to the 1959 edition, Rand wrote that We the Living "is as near to an autobiography as I will ever write. ... The plot is invented, the background is not ...". Rand wrote her novella Anthem during a break from writing her next major novel, The Fountainhead. It presents a vision of a dystopian future world in which totalitarian collectivism has triumphed to such an extent that even the word I has been forgotten and replaced with we. Although it was published in England in 1938, Rand could not initially find an American publisher. As with We the Living, Rand's later success allowed her to get a revised version published in 1946, which has sold over 3.5 million copies.The Fountainhead and political activismDuring the 1940s, Rand became politically active. She and her husband worked as full-time volunteers for Republican Wendell Willkie's 1940 presidential campaign. This led to Rand's first public speaking experiences; she enjoyed fielding sometimes hostile questions from New York City audiences who had seen pro-Willkie newsreels. Her work brought her into contact with other intellectuals sympathetic to free-market capitalism. 
She became friends with journalist Henry Hazlitt, who introduced her to the Austrian School economist Ludwig von Mises. Despite her philosophical differences with them, Rand strongly endorsed the writings of both men throughout her career, and both of them expressed admiration for her. Mises once referred to her as "the most courageous man in America", a compliment that particularly pleased her because he said "man" instead of "woman". Rand became friends with libertarian writer Isabel Paterson. Rand questioned her about American history and politics long into the night during their many meetings, and gave Paterson ideas for her only non-fiction book, The God of the Machine.Rand's first major success as a writer came in 1943 with The Fountainhead, a romantic and philosophical novel that she wrote over seven years. The novel centers on an uncompromising young architect named Howard Roark and his struggle against what Rand described as "second-handers"—those who attempt to live through others, placing others above themselves. Twelve publishers rejected it before the Bobbs-Merrill Company finally accepted it at the insistence of editor Archibald Ogden, who threatened to quit if his employer did not publish it. While completing the novel, Rand was prescribed the amphetamine Benzedrine to fight fatigue. The drug helped her to work long hours to meet her deadline for delivering the novel, but afterwards she was so exhausted that her doctor ordered two weeks' rest. Her use of the drug for approximately three decades may have contributed to what some of her later associates described as volatile mood swings.The Fountainhead became a worldwide success, bringing Rand fame and financial security. In 1943, she sold the film rights to Warner Bros. and returned to Hollywood to write the screenplay. Producer Hal B. Wallis hired her afterwards as a screenwriter and script-doctor. Her work for him included the screenplays for the Oscar-nominated Love Letters and You Came Along. 
Rand worked on other projects, including a never-completed nonfiction treatment of her philosophy to be called The Moral Basis of Individualism.Rand extended her involvement with free-market and anti-communist activism while working in Hollywood. She became involved with the anti-Communist Motion Picture Alliance for the Preservation of American Ideals and wrote articles on the group's behalf. She also joined the anti-Communist American Writers Association. A visit by Paterson to meet with Rand's California associates led to a falling out between the two when Paterson made comments to valued political allies which Rand considered rude. In 1947, during the Second Red Scare, Rand testified as a "friendly witness" before the United States House Un-American Activities Committee that the 1944 film Song of Russia grossly misrepresented conditions in the Soviet Union, portraying life there as much better and happier than it was. She also wanted to criticize the lauded 1946 film The Best Years of Our Lives for what she interpreted as its negative presentation of the business world, but was not allowed to do so. When asked after the hearings about her feelings on the investigations' effectiveness, Rand described the process as "futile".After several delays, the film version of The Fountainhead was released in 1949. Although it used Rand's screenplay with minimal alterations, she "disliked the movie from beginning to end" and complained about its editing, the acting and other elements.Atlas Shrugged and ObjectivismFollowing the publication of The Fountainhead, Rand received numerous letters from readers, some of whom the book had influenced profoundly. In 1951, Rand moved from Los Angeles to New York City, where she gathered a group of these admirers around her. 
This group (jokingly designated "The Collective") included a future chair of the Federal Reserve Alan Greenspan, a young psychology student named Nathan Blumenthal (later Nathaniel Branden) and his wife Barbara, and Barbara's cousin Leonard Peikoff. Initially, the group was an informal gathering of friends who met with Rand at her apartment on weekends to discuss philosophy. Later, Rand began allowing them to read the drafts of her new novel, Atlas Shrugged, as she wrote the manuscript. In 1954, her close relationship with Nathaniel Branden turned into a romantic affair, with the knowledge of their spouses.Published in 1957, Atlas Shrugged was considered Rand's magnum opus. She described the novel's theme as "the role of the mind in man's existence—and, as a corollary, the demonstration of a new moral philosophy: the morality of rational self-interest". It advocates the core tenets of Rand's philosophy of Objectivism and expresses her concept of human achievement. The plot involves a dystopian United States in which the most creative industrialists, scientists, and artists respond to a welfare state government by going on strike and retreating to a hidden valley where they build an independent free economy. The novel's hero and leader of the strike, John Galt, describes it as "stopping the motor of the world" by withdrawing the minds of the individuals contributing most to the nation's wealth and achievements. With this fictional strike, Rand intended to illustrate that without the efforts of the rational and productive, the economy would collapse and society would fall apart. The novel includes elements of mystery, romance, and science fiction, and contains an extended exposition of Objectivism in a lengthy monologue delivered by Galt.Despite many negative reviews, Atlas Shrugged became an international bestseller; however, the reaction of intellectuals to the novel discouraged and depressed Rand. 
Atlas Shrugged was her last completed work of fiction marking the end of her career as a novelist and the beginning of her role as a popular philosopher.In 1958, Nathaniel Branden established the Nathaniel Branden Lectures, later incorporated as the Nathaniel Branden Institute (NBI), to promote Rand's philosophy. Collective members gave lectures for the NBI and wrote articles for Objectivist periodicals that Rand edited. She later published some of these articles in book form. Rand was unimpressed by many of the NBI students and held them to strict standards, sometimes reacting coldly or angrily to those who disagreed with her. Critics, including some former NBI students and Branden himself, later described the culture of the NBI as one of intellectual conformity and excessive reverence for Rand. Some described the NBI or the Objectivist movement as a cult or religion. Rand expressed opinions on a wide range of topics, from literature and music to sexuality and facial hair. Some of her followers mimicked her preferences, wearing clothes to match characters from her novels and buying furniture like hers. However, some former NBI students believed the extent of these behaviors was exaggerated, and the problem was concentrated among Rand's closest followers in New York.Later yearsThroughout the 1960s and 1970s, Rand developed and promoted her Objectivist philosophy through her nonfiction works and by giving talks to students at institutions such as Yale, Princeton, Columbia, Harvard, and the Massachusetts Institute of Technology. She began delivering annual lectures at the Ford Hall Forum, responding to questions from the audience. During these appearances, she often took controversial stances on the political and social issues of the day. 
These included: supporting abortion rights, opposing the Vietnam War and the military draft (but condemning many draft dodgers as "bums"), supporting Israel in the Yom Kippur War of 1973 against a coalition of Arab nations as "civilized men fighting savages", saying European colonists had the right to invade and take land inhabited by American Indians, and calling homosexuality "immoral" and "disgusting", while also advocating the repeal of all laws concerning it. She endorsed several Republican candidates for president of the United States, most strongly Barry Goldwater in 1964, whose candidacy she promoted in several articles for The Objectivist Newsletter. In 1964, Nathaniel Branden began an affair with the young actress Patrecia Scott, whom he later married. Nathaniel and Barbara Branden kept the affair hidden from Rand. When she learned of it in 1968, though her romantic relationship with Branden had already ended, Rand ended her relationship with both Brandens, and the NBI was closed. She published an article in The Objectivist repudiating Nathaniel Branden for dishonesty and other "irrational behavior in his private life". In subsequent years, Rand and several more of her closest associates parted company. Rand underwent surgery for lung cancer in 1974 after decades of heavy smoking. In 1976, she retired from writing her newsletter and, after her initial objections, allowed a social worker employed by her attorney to enroll her in Social Security and Medicare. During the late 1970s, her activities within the Objectivist movement declined, especially after the death of her husband on November 9, 1979. One of her final projects was work on a never-completed television adaptation of Atlas Shrugged. On March 6, 1982, Rand died of heart failure at her home in New York City. She was interred in the Kensico Cemetery, Valhalla, New York. At her funeral, a floral arrangement in the shape of a dollar sign was placed near her casket. 
In her will, Rand named Leonard Peikoff as her beneficiary.Literary method and influencesRand described her approach to literature as "romantic realism". She wanted her fiction to present the world "as it could be and should be", rather than as it was. This approach led her to create highly stylized situations and characters. Her fiction typically has protagonists who are heroic individualists, depicted as fit and attractive. Her stories' villains support duty and collectivist moral ideals. Rand often describes them as unattractive and they sometimes have names that suggest negative traits, like Wesley Mouch in Atlas Shrugged.Rand considered plot a critical element of literature, and her stories typically have what biographer Anne Heller described as "tight, elaborate, fast-paced plotting". Romantic triangles are a common plot element in Rand's fiction; in most of her novels and plays, the main female character is romantically involved with at least two different men.InfluencesIn school Rand read works by Fyodor Dostoevsky, Victor Hugo, Edmond Rostand, and Friedrich Schiller, who became her favorites. She considered them to be among the "top rank" of Romantic writers because of their focus on moral themes and their skill at constructing plots. Hugo, in particular, was an important influence on her writing, especially her approach to plotting. In the introduction she wrote for an English-language edition of his novel Ninety-Three, Rand called him "the greatest novelist in world literature".Although Rand disliked most Russian literature, her depictions of her heroes show the influence of the Russian Symbolists and other nineteenth-century Russian writing, most notably the 1863 novel What Is to Be Done? by Nikolay Chernyshevsky. Rand's experience of the Russian Revolution and early Communist Russia influenced the portrayal of her villains. This is most apparent in We the Living, set in Russia. 
The ideas and rhetoric of Ellsworth Toohey in The Fountainhead and the destruction of the economy by the looters in Atlas Shrugged also reflect it. Rand's descriptive style echoes her early career writing scenarios and scripts for movies; her novels have many narrative descriptions that resemble early Hollywood movie scenarios. They often follow common film editing conventions, such as having a broad establishing shot description of a scene followed by close-up details, and her descriptions of women characters often take a "male gaze" perspective.PhilosophyRand called her philosophy "Objectivism", describing its essence as "the concept of man as a heroic being, with his own happiness as the moral purpose of his life, with productive achievement as his noblest activity, and reason as his only absolute". She considered Objectivism a systematic philosophy and laid out positions on metaphysics, epistemology, ethics, political philosophy, and aesthetics. In metaphysics, Rand supported philosophical realism and opposed anything she regarded as mysticism or supernaturalism, including all forms of religion. Rand believed in free will as a form of agent causation and rejected determinism. In epistemology, she considered all knowledge to be based on sense perception, the validity of which Rand considered axiomatic, and reason, which she described as "the faculty that identifies and integrates the material provided by man's senses". Rand rejected all claims of non-perceptual or a priori knowledge, including "'instinct,' 'intuition,' 'revelation,' or any form of 'just knowing'". In her Introduction to Objectivist Epistemology, Rand presented a theory of concept formation and rejected the analytic–synthetic dichotomy. In ethics, Rand argued for rational and ethical egoism (rational self-interest), as the guiding moral principle. She said the individual should "exist for his own sake, neither sacrificing himself to others nor sacrificing others to himself". 
Rand referred to egoism as "the virtue of selfishness" in her book of that title. In it, she presented her solution to the is-ought problem by describing a meta-ethical theory that based morality in the needs of "man's survival qua man". She condemned ethical altruism as incompatible with the requirements of human life and happiness, and held the initiation of force was evil and irrational, writing in Atlas Shrugged that, "Force and mind are opposites."Rand's political philosophy emphasized individual rights—including property rights. She considered laissez-faire capitalism the only moral social system because in her view it was the only system based on protecting those rights. Rand opposed statism, which she understood included theocracy, absolute monarchy, Nazism, fascism, communism, democratic socialism, and dictatorship. She believed a constitutionally limited government should protect natural rights. Although her political views are often classified as conservative or libertarian, Rand preferred the term "radical for capitalism". She worked with conservatives on political projects, but disagreed with them over issues such as religion and ethics. Rand denounced libertarianism, which she associated with anarchism. She rejected anarchism as a naive theory based in subjectivism that could only lead to collectivism in practice.In aesthetics, Rand defined art as a "selective re-creation of reality according to an artist's metaphysical value-judgments". According to her, art allows philosophical concepts to be presented in a concrete form that can be grasped easily, thereby fulfilling a need of human consciousness. As a writer, the art form Rand focused on most closely was literature. 
She considered romanticism to be the approach that most accurately reflected the existence of human free will.Rand said her most important contributions to philosophy were her "theory of concepts, ethics, and discovery in politics that evil—the violation of rights—consists of the initiation of force". She believed epistemology was a foundational branch of philosophy and considered the advocacy of reason to be the single most significant aspect of her philosophy, stating: "I am not primarily an advocate of capitalism, but of egoism; and I am not primarily an advocate of egoism, but of reason. If one recognizes the supremacy of reason and applies it consistently, all the rest follows."CriticismsRand's ethics and politics are the most criticized areas of her philosophy. Numerous authors, including Robert Nozick and William F. O'Neill, in some of the earliest academic critiques of her ideas, said she failed in her attempt to solve the is–ought problem. Critics have called her definitions of egoism and altruism biased and inconsistent with normal usage. Critics from religious traditions oppose her rejection of altruism in addition to atheism. Essays criticizing Rand's egoistic views are included in a number of anthologies for teaching introductory ethics, which often include no essays presenting or defending them.Multiple critics, including Nozick, have said her attempt to justify individual rights based on egoism fails. Others, like Michael Huemer, have gone further, saying that her support of egoism and her support of individual rights are inconsistent positions. Some critics, like Roy Childs, have said that her opposition to the initiation of force should lead to support of anarchism, rather than limited government.Commentators, including Hazel Barnes, Albert Ellis, and Nathaniel Branden, have criticized Rand's focus on the importance of reason. 
Branden said this emphasis led her to denigrate emotions and create unrealistic expectations of how consistently rational human beings should be.Relationship to other philosophersExcept for Aristotle, Thomas Aquinas and classical liberals, Rand was sharply critical of most philosophers and philosophical traditions known to her. Acknowledging Aristotle as her greatest influence, Rand remarked that in the history of philosophy she could only recommend "three A's"—Aristotle, Aquinas, and Ayn Rand. In a 1959 interview with Mike Wallace, when asked where her philosophy came from, she responded: "Out of my own mind, with the sole acknowledgement of a debt to Aristotle, the only philosopher who ever influenced me. I devised the rest of my philosophy myself."In an article for the Claremont Review of Books, political scientist Charles Murray criticized her claim that her only "philosophical debt" was to Aristotle. He asserted her ideas were derivative of previous thinkers such as John Locke and Friedrich Nietzsche. Rand found early inspiration from Nietzsche, and scholars have found indications of this in Rand's private journals. In 1928, she alluded to his idea of the "superman" in notes for an unwritten novel whose protagonist was inspired by the murderer William Edward Hickman. There are other indications of Nietzsche's influence in passages from the first edition of We the Living (which Rand later revised), and in her overall writing style. By the time she wrote The Fountainhead, Rand had turned against Nietzsche's ideas, and the extent of his influence on her even during her early years is disputed.Rand considered her philosophical opposite to be Immanuel Kant, whom she referred to as "the most evil man in mankind's history"; she believed his epistemology undermined reason and his ethics opposed self-interest. 
Philosophers George Walsh and Fred Seddon have argued she misinterpreted Kant and exaggerated their differences.Rand's relationship with contemporary philosophers was mostly antagonistic. She was not an academic and did not participate in academic discourse. She was dismissive toward critics and wrote about ideas she disagreed with in a polemical manner without in-depth analysis. She was in turn viewed very negatively by many academic philosophers, who dismissed her as an unimportant figure who need not be given serious consideration.Reception and legacyCritical receptionThe first reviews Rand received were for Night of January 16th. Reviews of the Broadway production were largely positive, but Rand considered even positive reviews to be embarrassing because of significant changes made to her script by the producer. Although Rand believed that her novel We the Living was not widely reviewed, over 200 publications published approximately 125 different reviews. Overall, they were more positive than those she received for her later work. Her 1938 novella Anthem received little review attention, both for its first publication in England and for subsequent re-issues.Rand's first bestseller, The Fountainhead, received far fewer reviews than We the Living, and reviewers' opinions were mixed. Lorine Pruette's positive review in The New York Times, which called the author "a writer of great power" who wrote "brilliantly, beautifully and bitterly", was one that Rand greatly appreciated. There were other positive reviews, but Rand dismissed most of them for either misunderstanding her message or for being in unimportant publications. Some negative reviews said the novel was too long; others called the characters unsympathetic and Rand's style "offensively pedestrian".Atlas Shrugged was widely reviewed, and many of the reviews were strongly negative. 
Atlas Shrugged received positive reviews from a few publications, but Rand scholar Mimi Reisel Gladstein later wrote that "reviewers seemed to vie with each other in a contest to devise the cleverest put-downs", with reviews including comments that it was "written out of hate" and showed "remorseless hectoring and prolixity". Whittaker Chambers wrote what was later called the novel's most "notorious" review for the conservative magazine National Review. He accused Rand of supporting a godless system (which he related to that of the Soviets), claiming, "From almost any page of Atlas Shrugged, a voice can be heard ... commanding: 'To a gas chamber—go!'". Rand's nonfiction received far fewer reviews than her novels. The tenor of the criticism for her first nonfiction book, For the New Intellectual, was similar to that for Atlas Shrugged. Philosopher Sidney Hook likened her certainty to "the way philosophy is written in the Soviet Union", and author Gore Vidal called her viewpoint "nearly perfect in its immorality". These reviews set the pattern for reaction to her ideas among liberal critics. Her subsequent books got progressively less review attention. On the 100th anniversary of Rand's birth in 2005, writing for The New York Times, Edward Rothstein referred to her written fiction as quaint utopian "retro fantasy" and programmatic neo-Romanticism of the misunderstood artist, while criticizing her characters' "isolated rejection of democratic society".Popular interestWith over 30 million copies sold, Rand's books continue to be read widely. A survey conducted for the Library of Congress and the Book-of-the-Month Club in 1991 asked club members to name the most influential book in their lives. Rand's Atlas Shrugged was the second most popular choice, after the Bible. 
Although Rand's influence has been greatest in the United States, there has been international interest in her work. Rand's contemporary admirers included fellow novelists, like Ira Levin, Kay Nolte Smith and L. Neil Smith; she has influenced later writers like Erika Holzer and Terry Goodkind. Other artists who have cited Rand as an important influence on their lives and thought include comic book artist Steve Ditko and musician Neil Peart of Rush, although he later distanced himself. Rand provided a positive view of business and subsequently many business executives and entrepreneurs have admired and promoted her work. John Allison of BB&T and Ed Snider of Comcast Spectacor have funded the promotion of Rand's ideas. Mark Cuban (owner of the Dallas Mavericks) as well as John P. Mackey (CEO of Whole Foods), among others, have said they consider Rand crucial to their success. Television shows including animated sitcoms, live-action comedies, dramas, and game shows, as well as movies and video games have referred to Rand and her works. Throughout her life she was the subject of many articles in popular magazines, as well as book-length critiques by authors such as the psychologist Albert Ellis and Trinity Foundation president John W. Robbins. Rand, or characters based on her, figure prominently in novels by prominent American authors, including Mary Gaitskill, Matt Ruff, Kay Nolte Smith, and Tobias Wolff. Nick Gillespie, former editor-in-chief of Reason, remarked that, "Rand's is a tortured immortality, one in which she's as likely to be a punch line as a protagonist. Jibes at Rand as cold and inhuman run through the popular culture." Two movies have been made about Rand's life. A 1997 documentary film, Ayn Rand: A Sense of Life, was nominated for the Academy Award for Best Documentary Feature. The Passion of Ayn Rand, a 1999 television adaptation of the book of the same name, won several awards. Rand's image also appears on a 1999 U.S. 
postage stamp illustrated by artist Nick Gaetano.Rand's works, most commonly Anthem or The Fountainhead, are sometimes assigned as secondary school reading. Since 2002, the Ayn Rand Institute has provided free copies of Rand's novels to teachers who promise to include the books in their curriculum. The Institute had distributed 4.5 million copies in the U.S. and Canada by the end of 2020. In 2017, Rand was added to the required reading list for the A Level Politics exam in the United Kingdom.Political influenceAlthough she rejected the labels "conservative" and "libertarian", Rand has had a continuing influence on right-wing politics and libertarianism. Rand is often considered one of the three most important women (along with Rose Wilder Lane and Isabel Paterson) in the early development of modern American libertarianism. David Nolan, one founder of the Libertarian Party, said that "without Ayn Rand, the libertarian movement would not exist". In his history of that movement, journalist Brian Doherty described her as "the most influential libertarian of the twentieth century to the public at large". Historian Jennifer Burns referred to her as "the ultimate gateway drug to life on the right".The political figures who cite Rand as an influence are usually conservatives (often members of the Republican Party), despite Rand taking some atypical positions for a conservative, like being pro-choice and an atheist. She faced intense opposition from William F. Buckley Jr. and other contributors to the conservative National Review magazine, which published numerous criticisms of her writings and ideas. Nevertheless, a 1987 article in The New York Times referred to her as the Reagan administration's "novelist laureate". Republican congressmen and conservative pundits have acknowledged her influence on their lives and have recommended her novels. 
She has influenced some conservative politicians outside the U.S., such as Sajid Javid in the United Kingdom, Siv Jensen in Norway, and Ayelet Shaked in Israel.The financial crisis of 2007–2008 spurred renewed interest in her works, especially Atlas Shrugged, which some saw as foreshadowing the crisis. Opinion articles compared real-world events with the novel's plot. Signs mentioning Rand and her fictional hero John Galt appeared at Tea Party protests. There was increased criticism of her ideas, especially from the political left. Critics blamed the economic crisis on her support of selfishness and free markets, particularly through her influence on Alan Greenspan. In 2015, Adam Weiner said that through Greenspan, "Rand had effectively chucked a ticking time bomb into the boiler room of the US economy". Lisa Duggan said that Rand's novels had "incalculable impact" in encouraging the spread of neoliberal political ideas. In 2021, Cass Sunstein said Rand's ideas could be seen in the tax and regulatory policies of the Trump administration, which he attributed to the "enduring influence" of Rand's fiction.Academic reactionDuring Rand's lifetime, her work received little attention from academic scholars. Since her death, interest in her work has increased gradually. In 2009, historian Jennifer Burns identified "three overlapping waves" of scholarly interest in Rand, including "an explosion of scholarship" since the year 2000. However, as of that same year, few universities included Rand or Objectivism as a philosophical specialty or research area, with many literature and philosophy departments dismissing her as a pop culture phenomenon rather than a subject for serious study. From 2002 to 2012, over 60 colleges and universities accepted grants from the charitable foundation of BB&T Corporation that required teaching Rand's ideas or works; in some cases, the grants were controversial or even rejected because of the requirement to teach about Rand. 
In 2020, media critic Eric Burns said that "Rand is surely the most engaging philosopher of my lifetime", but "nobody in the academe pays any attention to her, neither as an author nor a philosopher". That same year, the editor of a collection of critical essays about Rand said academics who disapproved of her ideas had long held "a stubborn resolve to ignore or ridicule" her work, but he believed more academic critics were engaging with her work in recent years.To her ideasIn 1967, John Hospers discussed Rand's ethical ideas in the second edition of his textbook, An Introduction to Philosophical Analysis. That same year, Hazel Barnes included a chapter critiquing Objectivism in her book An Existentialist Ethics. When the first full-length academic book about Rand's philosophy appeared in 1971, its author declared writing about Rand "a treacherous undertaking" that could lead to "guilt by association" for taking her seriously. A few articles about Rand's ideas appeared in academic journals before her death in 1982, many of them in The Personalist. One of these was "On the Randian Argument" by libertarian philosopher Robert Nozick, who criticized her meta-ethical arguments. Other philosophers, writing in the same publication, argued that Nozick misstated Rand's case. In an article responding to Nozick, Douglas Den Uyl and Douglas B. Rasmussen defended her positions, but described her style as "literary, hyperbolic and emotional".The Philosophic Thought of Ayn Rand, a 1984 collection of essays about Objectivism edited by Den Uyl and Rasmussen, was the first academic book about Rand's ideas published after her death. In one essay, political writer Jack Wheeler wrote that despite "the incessant bombast and continuous venting of Randian rage", Rand's ethics are "a most immense achievement, the study of which is vastly more fruitful than any other in contemporary thought". 
In 1987, Allan Gotthelf, George Walsh, and David Kelley co-founded the Ayn Rand Society, a group affiliated with the American Philosophical Association.In a 1995 entry about Rand in Contemporary Women Philosophers, Jenny A. Heyl described a divergence in how different academic specialties viewed Rand. She said that Rand's philosophy "is regularly omitted from academic philosophy. Yet, throughout literary academia, Ayn Rand is considered a philosopher." Writing in the 1998 edition of the Routledge Encyclopedia of Philosophy, political theorist Chandran Kukathas summarized the mainstream philosophical reception of her work in two parts. He said most commentators view her ethical argument as an unconvincing variant of Aristotle's ethics, and her political theory "is of little interest" because it is marred by an "ill-thought out and unsystematic" effort to reconcile her hostility to the state with her rejection of anarchism. The Journal of Ayn Rand Studies, a multidisciplinary, peer-reviewed academic journal devoted to the study of Rand and her ideas, was established in 1999. R. W. Bradford, Stephen D. Cox, and Chris Matthew Sciabarra were its founding co-editors.In a 2010 essay for the Cato Institute, libertarian philosopher Michael Huemer argued very few people find Rand's ideas convincing, especially her ethics. He attributed the attention she receives to her being a "compelling writer", especially as a novelist, noting that Atlas Shrugged outsells Rand's non-fiction works and the works of other philosophers of classical liberalism. In 2012, the Pennsylvania State University Press agreed to take over publication of The Journal of Ayn Rand Studies, and the University of Pittsburgh Press launched an "Ayn Rand Society Philosophical Studies" series based on the Society's proceedings. The Fall 2012 update to the entry about Rand in the Stanford Encyclopedia of Philosophy said that "only a few professional philosophers have taken her work seriously". 
That same year, political scientist Alan Wolfe dismissed Rand as a "nonperson" among academics, an attitude that writer Ben Murnane later described as "the traditional academic view" of Rand.To her fictionAcademic consideration of Rand as a literary figure during her life was even more limited than the discussion of her philosophy. Mimi Reisel Gladstein could not find any scholarly articles about Rand's novels when she began researching her in 1973, and only three such articles appeared during the rest of the 1970s. Since her death, scholars of English and American literature have continued largely to ignore her work, although attention to her literary work has increased since the 1990s. Several academic book series about important authors cover Rand and her works. These include Twayne's United States Authors (Ayn Rand by James T. Baker), Twayne's Masterwork Studies (The Fountainhead: An American Novel by Den Uyl and Atlas Shrugged: Manifesto of the Mind by Gladstein), and Re-reading the Canon (Feminist Interpretations of Ayn Rand, edited by Gladstein and Sciabarra), as well as in popular study guides like CliffsNotes and SparkNotes. In The Literary Encyclopedia entry for Rand written in 2001, John David Lewis declared that "Rand wrote the most intellectually challenging fiction of her generation." In 2019, Lisa Duggan described Rand's fiction as popular and influential on many readers, despite being easy to criticize for "her cartoonish characters and melodramatic plots, her rigid moralizing, her middle- to lowbrow aesthetic preferences ... and philosophical strivings".Objectivist movementAfter the closure of the Nathaniel Branden Institute, the Objectivist movement continued in other forms. In the 1970s, Leonard Peikoff began delivering courses on Objectivism. In 1979, Objectivist writer Peter Schwartz started a newsletter called The Intellectual Activist, which Rand endorsed. 
She also endorsed The Objectivist Forum, a bimonthly magazine founded by Objectivist philosopher Harry Binswanger, which ran from 1980 to 1987.In 1985, Peikoff worked with businessman Ed Snider to establish the Ayn Rand Institute, a nonprofit organization dedicated to promoting Rand's ideas and works. In 1990, after an ideological disagreement with Peikoff, philosopher David Kelley founded the Institute for Objectivist Studies, now known as The Atlas Society. In 2001, historian John McCaskey organized the Anthem Foundation for Objectivist Scholarship, which provides grants for scholarly work on Objectivism in academia.Selected worksFiction and drama: Night of January 16th (performed 1934, published 1968) We the Living (1936, revised 1959) Anthem (1938, revised 1946) The Unconquered (performed 1940, published 2014) The Fountainhead (1943) Atlas Shrugged (1957) The Early Ayn Rand (1984) Ideal (2015)Non-fiction: For the New Intellectual (1961) The Virtue of Selfishness (1964) Capitalism: The Unknown Ideal (1966, expanded 1967) The Romantic Manifesto (1969, expanded 1975) The New Left (1971, expanded 1975) Introduction to Objectivist Epistemology (1979, expanded 1990) Philosophy: Who Needs It (1982) Letters of Ayn Rand (1995) Journals of Ayn Rand (1997)NotesReferencesWorks cited                                                                            Reprinted from Esquire, July 1961.External links Frequently Asked Questions About Ayn Rand from the Ayn Rand Institute    Rand's papers at The Library of Congress Ayn Rand Lexicon – searchable database     "Writings of Ayn Rand" – from C-SPAN's American Writers: A Journey Through History 1905 births1982 deathsWriters from Saint PetersburgWriters from New York City20th-century American dramatists and playwrights20th-century American novelists20th-century American philosophers20th-century American women writers20th-century atheists20th-century essayists20th-century Russian philosophersActivists from New York 
(state)American abortion-rights activistsAmerican anti-communistsAmerican anti-fascistsJewish American atheistsAmerican atheist writersAmerican essayistsAmerican ethicistsAmerican people of Russian-Jewish descentAmerican political activistsAmerican political philosophersAmerican science fiction writersAmerican women activistsAmerican women dramatists and playwrightsAmerican women essayistsAmerican women novelistsAmerican women philosophersAmerican women screenwritersAmerican secularistsAmerican writers of Russian descentAristotelian philosophersAtheist philosophersCritics of MarxismEpistemologistsExophonic writersFemale critics of feminismAtheists of the Russian EmpireJews of the Russian EmpireJewish American dramatists and playwrightsJewish American novelistsJewish activistsJewish anti-communistsJewish anti-fascistsJewish philosophersJewish women writersMetaphysiciansNovelists from New York (state)ObjectivistsOld Right (United States)People of the New Deal arts projectsPeople with acquired American citizenshipPhilosophers from New York (state)Political philosophersPseudonymous women writersDramatists and playwrights of the Russian EmpireSaint Petersburg State University alumniScreenwriters from New York (state)Soviet emigrants to the United StatesWomen science fiction and fantasy writersBurials at Kensico Cemetery20th-century American screenwritersDeaths from organ failure20th-century pseudonymous writersCritics of ChristianitySocial critics
+Alain Connes (; born 1 April 1947) is a French mathematician, and a theoretical physicist, known for his contributions to the study of operator algebras and noncommutative geometry. He is a professor at the Collège de France, IHÉS, Ohio State University and Vanderbilt University. He was awarded the Fields Medal in 1982.CareerConnes was an Invited Professor at the Conservatoire national des arts et métiers (2000).ResearchAlain Connes studies operator algebras. In his early work on von Neumann algebras in the 1970s, he succeeded in obtaining the almost complete classification of injective factors. He also formulated the Connes embedding problem. Following this, he made contributions in operator K-theory and index theory, which culminated in the Baum–Connes conjecture. He also introduced cyclic cohomology in the early 1980s as a first step in the study of noncommutative differential geometry. He was a member of Bourbaki.Connes has applied his work in areas of mathematics and theoretical physics, including number theory, differential geometry and particle physics.Awards and honoursConnes was awarded the Fields Medal in 1982, the Crafoord Prize in 2001 and the gold medal of the CNRS in 2004.  He was an invited speaker at the ICM in 1974 at Vancouver and in 1986 at Berkeley and a plenary speaker at the ICM in 1978 at Helsinki. He is a member of the French Academy of Sciences and several foreign academies and societies, including the Danish Academy of Sciences, Norwegian Academy of Sciences, Russian Academy of Sciences, and US National Academy of Sciences.Books Alain Connes and Matilde Marcolli, Noncommutative Geometry, Quantum Fields and Motives, Colloquium Publications, American Mathematical Society, 2007,   Alain Connes, Andre Lichnerowicz, and Marcel Paul Schutzenberger, Triangle of Thought, translated by Jennifer Gage, American Mathematical Society, 2001,  Jean-Pierre Changeux, and Alain Connes, Conversations on Mind, Matter, and Mathematics, translated by M. B. 
DeBevoise, Princeton University Press, 1998,  Alain Connes, Noncommutative Geometry, Academic Press, 1994,See also Bost–Connes system Cyclic category Cyclic homology Factor (functional analysis) Higgs boson C*-algebra Noncommutative quantum field theory M-theory Groupoid Spectral tripleCriticism of non-standard analysis Riemann hypothesisReferencesExternal links Alain Connes Official Web Site containing downloadable papers, and his book Non-commutative geometry, .  Alain Connes' Standard Model An interview with Alain Connes and a discussion about it   1947 birthsLiving people20th-century French mathematiciansForeign associates of the National Academy of Sciences21st-century French mathematiciansCollège de France facultyInstitute for Advanced Study visiting scholarsFields MedalistsMathematical analystsDifferential geometersÉcole Normale Supérieure alumniVanderbilt University facultyForeign Members of the Russian Academy of SciencesMembers of the French Academy of SciencesMembers of the Norwegian Academy of Science and LettersMembers of the Royal Danish Academy of Sciences and LettersClay Research Award recipients
+Allan Dwan (born Joseph Aloysius Dwan; April 3, 1885 – December 28, 1981) was a pioneering Canadian-born American motion picture director, producer, and screenwriter.Early lifeBorn Joseph Aloysius Dwan in Toronto, Ontario, Canada, Dwan was the younger son of commercial traveler of woolen clothing Joseph Michael Dwan (1857–1917) and his wife Mary Jane Dwan, née Hunt. The family moved to the United States when he was seven years old on December 4, 1892 by ferry from Windsor to Detroit, according to his naturalization petition of August 1939. His elder brother, Leo Garnet Dwan (1883–1964), became a physician.Allan Dwan studied engineering at the University of Notre Dame and then worked for a lighting company in Chicago. He had a strong interest in the fledgling motion picture industry, and when Essanay Studios offered him the opportunity to become a scriptwriter, he took the job. At that time, some of the East Coast movie makers began to spend winters in California where the climate allowed them to continue productions requiring warm weather. Soon, a number of movie companies worked there year-round, and in 1911, Dwan began working part-time in Hollywood. While still in New York, in 1917 he was the founding president of the East Coast chapter of the Motion Picture Directors Association.CareerDwan operated Flying A Studios in La Mesa, California from August 1911 to July 1912. Flying A was one of the first motion picture studios in California history. On August 12, 2011, a plaque was unveiled on the Wolff building at Third Avenue and La Mesa Boulevard commemorating Dwan and the Flying A Studios origins in La Mesa, California.After making a series of westerns and comedies, Dwan directed fellow Canadian-American Mary Pickford in several very successful movies as well as her husband, Douglas Fairbanks, notably in the acclaimed 1922 Robin Hood. Dwan directed Gloria Swanson in eight feature films, and one short film made in the short-lived sound-on-film process Phonofilm. 
This short, also featuring Thomas Meighan and Henri de la Falaise, was produced as a joke, for the April 26, 1925 "Lambs' Gambol" for The Lambs, with the film showing Swanson crashing the all-male club.Following the introduction of the talkies, Dwan directed child-star Shirley Temple in Heidi (1937) and Rebecca of Sunnybrook Farm (1938).Dwan helped launch the career of two other successful Hollywood directors, Victor Fleming, who went on to direct The Wizard of Oz and Gone With the Wind, and Marshall Neilan, who became an actor, director, writer and producer. Over a long career spanning almost 50 years, Dwan directed 125 motion pictures, some of which were highly acclaimed, such as the 1949 box office hit, Sands of Iwo Jima. He directed his last movie in 1961.He died in Los Angeles at the age of 96, and is interred in the San Fernando Mission Cemetery, Mission Hills, California.Dwan has a star on the Hollywood Walk of Fame at 6263 Hollywood Boulevard.Daniel Eagan of Film Journal International described Dwan as one of the early pioneers of cinema, stating that his style "is so basic as to seem invisible, but he treats his characters with uncommon sympathy and compassion."Partial filmography as directorThe Gold Lust (1911)The Picket Guard (1913)The Restless Spirit (1913)Back to Life (1913)Bloodhounds of the North (1913)The Lie (1914)The Honor of the Mounted (1914) The Unwelcome Mrs. 
Hatch (1914)Remember Mary Magdalen (1914)Discord and Harmony (1914)The Embezzler (1914)The Lamb, the Woman, the Wolf (1914)The End of the Feud (1914)The Test (1914) (*writer)The Tragedy of Whispering Creek (1914)The Unlawful Trade (1914)The Forbidden Room (1914)The Hopes of Blind Alley (1914)Richelieu (1914) Wildflower (1914)A Small Town Girl (1915)David Harum (1915)A Girl of Yesterday (1915)The Pretty Sister of Jose (1915) Jordan Is a Hard Road (1915)Betty of Graystone (1916)The Habit of Happiness (1916)The Good Bad Man (1916)An Innocent Magdalene (1916)The Half-Breed (1916)Manhattan Madness (1916)Accusing Evidence (1916)Panthea (1917)A Modern Musketeer (1917)Bound in Morocco (1918)Headin' South (1918)Mr. Fix-It (1918)He Comes Up Smiling (1918)Cheating Cheaters (1919)The Dark Star (1919)Getting Mary Married (1919)Soldiers of Fortune (1919)In The Heart of a Fool (1920) also producerThe Forbidden Thing (1920) also producerA Splendid Hazard (1920)A Perfect Crime (1921) The Sin of Martha Queed (1921) A Broken Doll (1921)Robin Hood (1922)Zaza (1923)Big Brother (1923)Manhandled (1924)Argentine Love (1924)The Coast of Folly (1925)Night Life of New York (1925)Stage Struck (1925)Gloria Swanson Dialogue (1925) short film made in Phonofilm for The Lambs annual "Gambol" held at Metropolitan Opera HousePadlocked (1926)Sea Horses (1926)Summer Bachelors (1926)Tin Gods (1926)French Dressing (1927)The Joy Girl (1927)East Side, West Side (1927)The Big Noise (1928)Frozen Justice (1929)The Iron Mask (1929)Tide of Empire (1929)The Far Call (1929)What a Widow! 
(1930)Man to Man (1930)Chances (1931)Wicked (1931)While Paris Sleeps (1932)Counsel's Opinion (1933)Black Sheep (1935)Navy Wife (1935)High Tension (1936)15 Maiden Lane (1936)One Mile from Heaven (1937)Heidi (1937)Rebecca of Sunnybrook Farm (1938)Suez (1938) Josette (1938)The Three Musketeers (1939)The Gorilla (1939)Frontier Marshal (1939)Sailor's Lady (1940)Young People (1940)Trail of the Vigilantes (1940)Look Who's Laughing (1941) also producerRise and Shine (1941)Friendly Enemies (1942)Around the World (1943) also producerUp in Mabel's Room (1944)Abroad with Two Yanks (1944)Getting Gertie's Garter (1945) also screenwriterBrewster's Millions (1945)Rendezvous with Annie (1946)Driftwood (1947)Calendar Girl (1947)Northwest Outpost (1947) also associate producerThe Inside Story (1948)Angel in Exile (1948) (with Philip Ford)Sands of Iwo Jima (1949)Surrender (1950)Belle Le Grand (1951)Wild Blue Yonder (1951)I Dream of Jeanie (1952)Montana Belle (1952)Woman They Almost Lynched (1953) Sweethearts on Parade (1953)Silver Lode (1954)Passion (1954)Cattle Queen of Montana (1954)Tennessee's Partner (1955)Pearl of the South Pacific (1955)Escape to Burma (1955)Slightly Scarlet (1956)Hold Back the Night (1956)The Restless Breed (1957)The River's Edge (1957)Enchanted Island (1958)Most Dangerous Man Alive (1961)See alsoCanadian pioneers in early HollywoodReferencesFurther readingBrownlow, Kevin, The Parade's Gone By... 
(1968)  Bogdanovich, Peter, Allan Dwan: The Last Pioneer (1971)   Foster, Charles, Stardust and Shadows: Canadians in Early Hollywood (2000) Lombardi, Frederic, Allan Dwan and the Rise and Decline of the Hollywood Studios (2013)Print  E-bookExternal linksAllan Dwan profile, virtual-history.com; accessed June 16, 20141885 births1981 deaths20th-century American male writers20th-century American screenwritersAmerican film directorsAmerican film producersAmerican male screenwritersBurials at San Fernando Mission CemeteryCanadian emigrants to the United StatesFilm directors from TorontoWestern (genre) film directorsWriters from Toronto
+Algeria, officially the People's Democratic Republic of Algeria, is a country in the Maghreb region of North Africa. The country is the largest country by total area in Africa and in the Arab world, and is bordered to the northeast by Tunisia; to the east by Libya; to the southeast by Niger; to the southwest by Mali, Mauritania, and Western Sahara; to the west by Morocco; and to the north by the Mediterranean Sea. It has a semi-arid geography, with most of the population living in the fertile north and the Sahara dominating the geography of the south. Algeria covers an area of , making it the world's tenth largest nation by area, and the largest nation in Africa. With a population of 44 million, Algeria is the ninth-most populous country in Africa, and the 32nd-most populous country in the world. The capital and largest city is Algiers, located in the far north on the Mediterranean coast.Pre-1962 Algeria has seen many empires and dynasties, including ancient Numidians, Phoenicians, Carthaginians, Romans, Vandals, Byzantines, Umayyads, Abbasids, Rustamids, Idrisids, Aghlabids, Fatimids, Zirids, Hammadids, Almoravids, Almohads, Zayyanids, Spaniards, Ottomans and finally, the French colonial empire. The vast majority of Algeria's population is Arab-Berber, practicing Islam, and using the official languages of Arabic and Berber. However, French serves as an administrative and educational language in some contexts. The main spoken language is Algerian Arabic.Algeria is a semi-presidential republic, with local constituencies consisting of 58 provinces and 1,541 communes. Algeria is a regional power in North Africa, and a middle power in global affairs. It has the highest Human Development Index of all non-island African countries and one of the largest economies on the continent, based largely on energy exports. Algeria has the world's sixteenth-largest oil reserves and the ninth-largest reserves of natural gas. 
Sonatrach, the national oil company, is the largest company in Africa, supplying large amounts of natural gas to Europe. Algeria's military is one of the largest in Africa, and has the largest defence budget on the continent. It is a member of the African Union, the Arab League, the OIC, OPEC, the United Nations, and the Arab Maghreb Union, of which it is a founding member.Name Other forms of the name are: , ; ; ; ; . It is officially the People's Democratic Republic of Algeria (; , , ; , abbreviated as RADP).EtymologyThe country's name derives from the city of Algiers which in turn derives from the Arabic  (, "The Islands"), a truncated form of the older  (, "Islands of the Mazghanna Tribe"), employed by medieval geographers such as al-Idrisi.HistoryPrehistory and ancient historyAround ~1.8-million-year-old stone artifacts from Ain Hanech (Algeria) were considered to represent the oldest archaeological materials in North Africa. Stone artifacts and cut-marked bones that were excavated from two nearby deposits at Ain Boucherit are estimated to be ~1.9 million years old, and even older stone artifacts to be as old as ~2.4 million years. Hence, the Ain Boucherit evidence shows that ancestral hominins inhabited the Mediterranean fringe in northern Africa much earlier than previously thought. The evidence strongly argues for early dispersal of stone tool manufacture and use from East Africa or a possible multiple-origin scenario of stone technology in both East and North Africa.Neanderthal tool makers produced hand axes in the Levalloisian and Mousterian styles (43,000 BC) similar to those in the Levant. Algeria was the site of the highest state of development of Middle Paleolithic Flake tool techniques. Tools of this era, starting about 30,000 BC, are called Aterian (after the archaeological site of Bir el Ater, south of Tebessa).The earliest blade industries in North Africa are called Iberomaurusian (located mainly in the Oran region). 
This industry appears to have spread throughout the coastal regions of the Maghreb between 15,000 and 10,000 BC. Neolithic civilization (animal domestication and agriculture) developed in the Saharan and Mediterranean Maghreb perhaps as early as 11,000 BC or as late as between 6000 and 2000 BC. This life, richly depicted in the Tassili n'Ajjer paintings, predominated in Algeria until the classical period. The mixture of peoples of North Africa coalesced eventually into a distinct native population that came to be called Berbers, who are the indigenous peoples of northern Africa.From their principal center of power at Carthage, the Carthaginians expanded and established small settlements along the North African coast; by 600 BC, a Phoenician presence existed at Tipasa, east of Cherchell, Hippo Regius (modern Annaba) and Rusicade (modern Skikda). These settlements served as market towns as well as anchorages.As Carthaginian power grew, its impact on the indigenous population increased dramatically. Berber civilisation was already at a stage in which agriculture, manufacturing, trade, and political organisation supported several states. Trade links between Carthage and the Berbers in the interior grew, but territorial expansion also resulted in the enslavement or military recruitment of some Berbers and in the extraction of tribute from others.By the early 4th century BC, Berbers formed the single largest element of the Carthaginian army. In the Revolt of the Mercenaries, Berber soldiers rebelled from 241 to 238 BC after being unpaid following the defeat of Carthage in the First Punic War. They succeeded in obtaining control of much of Carthage's North African territory, and they minted coins bearing the name Libyan, used in Greek to describe natives of North Africa. The Carthaginian state declined because of successive defeats by the Romans in the Punic Wars.In 146 BC the city of Carthage was destroyed. 
As Carthaginian power waned, the influence of Berber leaders in the hinterland grew. By the 2nd century BC, several large but loosely administered Berber kingdoms had emerged. Two of them were established in Numidia, behind the coastal areas controlled by Carthage. West of Numidia lay Mauretania, which extended across the Moulouya River in modern-day Morocco to the Atlantic Ocean. The high point of Berber civilisation, unequalled until the coming of the Almohads and Almoravids more than a millennium later, was reached during the reign of Masinissa in the 2nd century BC.After Masinissa's death in 148 BC, the Berber kingdoms were divided and reunited several times. Masinissa's line survived until 24 AD, when the remaining Berber territory was annexed to the Roman Empire.For several centuries Algeria was ruled by the Romans, who founded many colonies in the region. Like the rest of North Africa, Algeria was one of the breadbaskets of the empire, exporting cereals and other agricultural products. Saint Augustine was the bishop of Hippo Regius (modern-day Annaba, Algeria), located in the Roman province of Africa. The Germanic Vandals of Geiseric moved into North Africa in 429, and by 435 controlled coastal Numidia. They did not make any significant settlement on the land, as they were harassed by local tribes. In fact, by the time the Byzantines arrived Leptis Magna was abandoned and the Msellata region was occupied by the indigenous Laguatan who had been busy facilitating an Amazigh political, military and cultural revival. Furthermore, during the rule of the Romans, Byzantines, Vandals, Carthaginians, and Ottomans the Berber people were the only or one of the few in North Africa who remained independent. 
The Berber people were so resistant that even during the Muslim conquest of North Africa they still had control and possession over their mountains.The collapse of the Western Roman Empire led to the establishment of a native Kingdom based in Altava (modern day Algeria) known as the Mauro-Roman Kingdom. It was succeeded by another Kingdom based in Altava, the Kingdom of Altava. During the reign of Kusaila its territory extended from the region of modern-day Fez in the west to the western Aurès and later Kairaouan and the interior of Ifriqiya in the east.Middle AgesAfter negligible resistance from the locals, Muslim Arabs of the Umayyad Caliphate conquered Algeria in the early 8th century.  Large numbers of the indigenous Berber people converted to Islam. Christians, Berber and Latin speakers remained in the great majority in Tunisia until the end of the 9th century and Muslims only became a vast majority some time in the 10th. After the fall of the Umayyad Caliphate, numerous local dynasties emerged, including the Rustamids, Aghlabids, Fatimids, Zirids, Hammadids, Almoravids, Almohads and the Abdalwadid. The Christians left in three waves: after the initial conquest, in the 10th century and the 11th. The last were evacuated to Sicily by the Normans and the few remaining died out in the 14th century.During the Middle Ages, North Africa was home to many great scholars, saints and sovereigns including Judah Ibn Quraysh, the first grammarian to mention Semitic and Berber languages, the great Sufi masters Sidi Boumediene (Abu Madyan) and Sidi El Houari, and the Emirs Abd Al Mu'min and Yāghmūrasen. It was during this time that the Fatimids or children of Fatima, daughter of Muhammad, came to the Maghreb. 
These "Fatimids" went on to found a long lasting dynasty stretching across the Maghreb, Hejaz and the Levant, boasting a secular inner government, as well as a powerful army and navy, made up primarily of Arabs and Levantines extending from Algeria to their capital state of Cairo. The Fatimid caliphate began to collapse when its governors the Zirids seceded. In order to punish them the Fatimids sent the Arab Banu Hilal and Banu Sulaym against them. The resultant war is recounted in the epic Tāghribāt. In Al-Tāghrībāt the Amazigh Zirid Hero Khālīfā Al-Zānatī asks daily, for duels, to defeat the Hilalan hero Ābu Zayd al-Hilalī and many other Arab knights in a string of victories. The Zirids, however, were ultimately defeated ushering in an adoption of Arab customs and culture. The indigenous Amazigh tribes, however, remained largely independent, and depending on tribe, location and time controlled varying parts of the Maghreb, at times unifying it (as under the Fatimids). The Fatimid Islamic state, also known as Fatimid Caliphate made an Islamic empire that included North Africa, Sicily, Palestine, Jordan, Lebanon, Syria, Egypt, the Red Sea coast of Africa, Tihamah, Hejaz and Yemen. Caliphates from Northern Africa traded with the other empires of their time, as well as forming part of a confederated support and trade network with other Islamic states during the Islamic Era.The Amazighs historically consisted of several tribes. The two main branches were the Botr and Barnès tribes, who were divided into tribes, and again into sub-tribes. Each region of the Maghreb contained several tribes (for example, Sanhadja, Houara, Zenata, Masmouda, Kutama, Awarba, and Berghwata). All these tribes made independent territorial decisions.Several Amazigh dynasties emerged during the Middle Ages in the Maghreb and other nearby lands. 
Ibn Khaldun provides a table summarising the Amazigh dynasties of the Maghreb region, the Zirid, Ifranid, Maghrawa, Almoravid, Hammadid, Almohad, Merinid, Abdalwadid, Wattasid, Meknassa and Hafsid dynasties. Both the Hammadid and Zirid empires as well as the Fatimids established their rule in all of the Maghreb countries. The Zirids ruled land in what is now Algeria, Tunisia, Morocco, Libya, Spain, Malta and Italy. The Hammadids captured and held important regions such as Ouargla, Constantine, Sfax, Susa, Algiers, Tripoli and Fez establishing their rule in every country in the Maghreb region. The Fatimid Caliphate, which was created and established by the Kutama Berbers, conquered all of North Africa as well as Sicily and parts of the Middle East.A few examples of medieval Berber dynasties which originated in Modern Algeria  Ifranid Dynasty Maghrawa Dynasty Zirid dynasty Hammadid dynasty Fatimid Caliphate  Kingdom of TlemcenFollowing the Berber revolt numerous independent states emerged across the Maghreb. In Algeria the Rustamid Kingdom was established. The Rustamid realm stretched from Tafilalt in Morocco to the Nafusa mountains in Libya including south, central and western Tunisia therefore including territory in all of the modern day Maghreb countries; in the south the Rustamid realm expanded to the modern borders of Mali and included territory in Mauritania.Once extending their control over all of the Maghreb, part of Spain and briefly over Sicily, originating from modern Algeria, the Zirids only controlled modern Ifriqiya by the 11th century. The Zirids recognized nominal suzerainty of the Fatimid caliphs of Cairo. El Mu'izz, the Zirid ruler, decided to end this recognition and declared his independence. 
The Zirids also fought against other Zenata Kingdoms, for example the Maghrawa, a Berber dynasty originating from Algeria and which at one point was a dominant power in the Maghreb ruling over much of Morocco and western Algeria including Fez, Sijilmasa, Aghmat, Oujda, most of the Sous and Draa and reaching as far as M’sila and the Zab in Algeria. As the Fatimid state was at the time too weak to attempt a direct invasion, they found another means of revenge. Between the Nile and the Red Sea were living Bedouin nomad tribes expelled from Arabia for their disruption and turbulence. The Banu Hilal and the Banu Sulaym, for example, regularly disrupted farmers in the Nile Valley, since the nomads would often loot their farms. The then Fatimid vizier decided to destroy what he couldn't control, and brokered a deal with the chiefs of these Bedouin tribes. The Fatimids even gave them money to leave. Whole tribes set off with women, children, elders, animals and camping equipment. Some stopped on the way, especially in Cyrenaica, where they are still one of the essential elements of the settlement, but most arrived in Ifriqiya by the Gabes region, arriving in 1051. The Zirid ruler tried to stop this rising tide, but with each encounter, the last under the walls of Kairouan, his troops were defeated and the Arabs remained masters of the battlefield. The Arabs usually didn't take control over the cities, instead looting them and destroying them. The invasion kept going, and in 1057 the Arabs spread on the high plains of Constantine where they encircled the Qalaa of Banu Hammad (capital of the Hammadid Emirate), as they had done in Kairouan a few decades ago. From there they gradually gained the upper Algiers and Oran plains. Some of these territories were forcibly taken back by the Almohads in the second half of the 12th century. 
The influx of Bedouin tribes was a major factor in the linguistic and cultural Arabization of the Maghreb and in the spread of nomadism in areas where agriculture had previously been dominant. Ibn Khaldun noted that the lands ravaged by Banu Hilal tribes had become completely arid desert. The Almohads originating from modern day Morocco, although founded by a man originating from Algeria known as Abd al-Mu'min would soon take control over the Maghreb. During the time of the Almohad Dynasty Abd al-Mu'min's tribe, the Koumïa, were the main supporters of the throne and the most important body of the empire. Defeating the weakening Almoravid Empire and taking control over Morocco in 1147, they pushed into Algeria in 1152, taking control over Tlemcen, Oran, and Algiers, wresting control from the Hilalian Arabs, and by the same year they defeated the Hammadids who controlled Eastern Algeria. Following their decisive defeat in the Battle of Las Navas de Tolosa in 1212 the Almohads began collapsing, and in 1235 the governor of modern-day Western Algeria, Yaghmurasen Ibn Zyan declared his independence and established the Kingdom of Tlemcen and the Zayyanid dynasty. Warring with the Almohad forces attempting to restore control over Algeria for 13 years, they defeated the Almohads in 1248 after killing their Caliph in a successful ambush near Oujda. The Zayyanids retained their control over Algeria for 3 centuries. Much of the eastern territories of Algeria were under the authority of the Hafsid dynasty, although the Emirate of Bejaia encompassing the Algerian territories of the Hafsids would occasionally be independent from central Tunisian control. At their peak the Zayyanid kingdom included all of Morocco as its vassal to the west and in the east reached as far as Tunis which they captured during the reign of Abu Tashfin. After several conflicts with local Barbary pirates sponsored by the Zayyanid sultans, Spain decided to invade Algeria and defeat the native Kingdom of Tlemcen. 
In 1505, they invaded and captured Mers el Kébir, and in 1509 after a bloody siege, they conquered Oran. Following their decisive victories over the Algerians in the western-coastal areas of Algeria, the Spanish decided to get bolder, and invaded more Algerian cities. In 1510, they led a series of sieges and attacks, taking over Bejaia in a large siege, and leading a semi-successful siege against Algiers. They also besieged Tlemcen. In 1511, they took control over Cherchell and Jijel, and attacked Mostaganem where although they weren't able to conquer the city, they were able to force a tribute on them.Ottoman era In 1516, the Ottoman privateer brothers Aruj and Hayreddin Barbarossa, who operated successfully under the Hafsids, moved their base of operations to Algiers. They succeeded in conquering Jijel and Algiers from the Spaniards with help from the locals who saw them as liberators from the Christians, but the brothers eventually assassinated the local noble Salim al-Tumi and took control over the city and the surrounding regions. When Aruj was killed in 1518 during his invasion of Tlemcen, Hayreddin succeeded him as military commander of Algiers. The Ottoman sultan gave him the title of beylerbey and a contingent of some 2,000 janissaries. With the aid of this force and native Algerians, Hayreddin conquered the whole area between Constantine and Oran (although the city of Oran remained in Spanish hands until 1792).The next beylerbey was Hayreddin's son Hasan, who assumed the position in 1544. He was a Kouloughli or of mixed origins, as his mother was an Algerian Mooresse. Until 1587 Beylerbeylik of Algiers was governed by Beylerbeys who served terms with no fixed limits. Subsequently, with the institution of a regular administration, governors with the title of pasha ruled for three-year terms. The pasha was assisted by an autonomous janissary unit, known in Algeria as the Ojaq who were led by an agha. 
Discontent among the ojaq rose in the mid-1600s because they were not paid regularly, and they repeatedly revolted against the pasha. As a result, the agha charged the pasha with corruption and incompetence and seized power in 1659.Plague had repeatedly struck the cities of North Africa. Algiers lost from 30,000 to 50,000 inhabitants to the plague in 1620–21, and suffered high fatalities in 1654–57, 1665, 1691 and 1740–42.The Barbary pirates preyed on Christian and other non-Islamic shipping in the western Mediterranean Sea. The pirates often took the passengers and crew on the ships and sold them or used them as slaves. They also did a brisk business in ransoming some of the captives. According to Robert Davis, from the 16th to 19th century, pirates captured 1 million to 1.25 million Europeans as slaves. They often made raids, called Razzias, on European coastal towns to capture Christian slaves to sell at slave markets in North Africa and other parts of the Ottoman Empire. In 1544, for example, Hayreddin Barbarossa captured the island of Ischia, taking 4,000 prisoners, and enslaved some 9,000 inhabitants of Lipari, almost the entire population. In 1551, the Ottoman governor of Algiers, Turgut Reis, enslaved the entire population of the Maltese island of Gozo. Barbary pirates often attacked the Balearic Islands. The threat was so severe that residents abandoned the island of Formentera. The introduction of broad-sail ships from the beginning of the 17th century allowed them to branch out into the Atlantic.In July 1627 two pirate ships from Algiers under the command of Dutch pirate Jan Janszoon sailed as far as Iceland, raiding and capturing slaves. Two weeks earlier another pirate ship from Salé in Morocco had also raided in Iceland. Some of the slaves brought to Algiers were later ransomed back to Iceland, but some chose to stay in Algeria. 
In 1629, pirate ships from Algeria raided the Faroe Islands.In 1671, the taifa of raises, or the company of corsair captains rebelled, killed the agha, and placed one of its own in power. The new leader received the title of Dey. After 1689, the right to select the dey passed to the divan, a council of some sixty nobles. It was at first dominated by the ojaq; but by the 18th century, it had become the dey's instrument. In 1710, the dey persuaded the sultan to recognise him and his successors as regent, replacing the pasha in that role. Although Algiers remained nominally part of the Ottoman Empire, in reality they acted independently from the rest of the Empire, and often had wars with other Ottoman subjects and territories such as the Beylik of Tunis.The dey was in effect a constitutional autocrat. The dey was elected for a life term, but in the 159 years (1671–1830) that the system was in place, fourteen of the twenty-nine deys were assassinated. Despite usurpation, military coups and occasional mob rule, the day-to-day operation of the Deylikal government was remarkably orderly. Although the regency patronised the tribal chieftains, it never had the unanimous allegiance of the countryside, where heavy taxation frequently provoked unrest. Autonomous tribal states were tolerated, and the regency's authority was seldom applied in the Kabylia, although in 1730 the Regency was able to take control over the Kingdom of Kuku in western Kabylia. Many cities in the northern parts of the Algerian desert paid taxes to Algiers or one of its Beys, although they otherwise retained complete autonomy from central control, while the deeper parts of the Sahara were completely independent from Algiers.Barbary raids in the Mediterranean continued to attack Spanish merchant shipping, and as a result, the Spanish Navy bombarded Algiers in 1783 and 1784. 
For the attack in 1784, the Spanish fleet was to be joined by ships from such traditional enemies of Algiers as Naples, Portugal and the Knights of Malta. Over 20,000 cannonballs were fired, much of the city and its fortifications were destroyed and most of the Algerian fleet was sunk.In 1792, Algiers took back Oran and Mers el Kébir, the two last Spanish strongholds in Algeria. In the same year, they conquered the Moroccan Rif and Oujda, which they then abandoned in 1795.In the 19th century, Algerian pirates forged affiliations with Caribbean powers, paying a "licence tax" in exchange for safe harbour of their vessels.Attacks by Algerian pirates on American merchantmen resulted in the First and Second Barbary Wars, which ended the attacks on U.S. ships. A year later, a combined Anglo-Dutch fleet, under the command of Lord Exmouth bombarded Algiers to stop similar attacks on European fishermen. These efforts proved successful, although Algerian piracy would continue until the French conquest in 1830.French colonization (1830–1962) Under the pretext of a slight to their consul, the French invaded and captured Algiers in 1830. Historian Ben Kiernan wrote on the French conquest of Algeria: "By 1875, the French conquest was complete. The war had killed approximately 825,000 indigenous Algerians since 1830." French losses from 1831 to 1851 were 92,329 dead in the hospital and only 3,336 killed in action. The population of Algeria, which stood at about 2.9 million in 1872, reached nearly 11 million in 1960. French policy was predicated on "civilising" the country. The slave trade and piracy in Algeria ceased following the French conquest. The conquest of Algeria by the French took some time and resulted in considerable bloodshed. A combination of violence and disease epidemics caused the indigenous Algerian population to decline by nearly one-third from 1830 to 1872. 
On 17 September 1860, Napoleon III declared "Our first duty is to take care of the happiness of the three million Arabs, whom the fate of arms has brought under our domination."During this time, only Kabylia resisted, the Kabylians were not colonized until after the Mokrani Revolt in 1871. From 1848 until independence, France administered the whole Mediterranean region of Algeria as an integral part and département of the nation. One of France's longest-held overseas territories, Algeria became a destination for hundreds of thousands of European immigrants, who became known as colons and later, as Pied-Noirs. Between 1825 and 1847, 50,000 French people emigrated to Algeria. These settlers benefited from the French government's confiscation of communal land from tribal peoples, and the application of modern agricultural techniques that increased the amount of arable land. Many Europeans settled in Oran and Algiers, and by the early 20th century they formed a majority of the population in both cities.During the late 19th and early 20th century, the European share was almost a fifth of the population. The French government aimed at making Algeria an assimilated part of France, and this included substantial educational investments especially after 1900. The indigenous cultural and religious resistance heavily opposed this tendency, but in contrast to the other colonised countries' path in central Asia and Caucasus, Algeria kept its individual skills and a relatively human-capital intensive agriculture.During the Second World War, Algeria came under Vichy control before being liberated by the Allies in Operation Torch, which saw the first large-scale deployment of American troops in the North African campaign.Gradually, dissatisfaction among the Muslim population, which lacked political and economic status under the colonial system, gave rise to demands for greater political autonomy and eventually independence from France. 
In May 1945, the uprising against the occupying French forces was suppressed through what is now known as the Sétif and Guelma massacre. Tensions between the two population groups came to a head in 1954, when the first violent events of what was later called the Algerian War began after the publication of the Declaration of 1 November 1954. Historians have estimated that between 30,000 and 150,000 Harkis and their dependants were killed by the Front de Libération Nationale (FLN) or by lynch mobs in Algeria. The FLN used hit and run attacks in Algeria and France as part of its war, and the French conducted severe reprisals.The war led to the death of hundreds of thousands of Algerians and hundreds of thousands of injuries. Historians, like Alistair Horne and Raymond Aron, state that the actual number of Algerian Muslim war dead was far greater than the original FLN and official French estimates but was less than the 1 million deaths claimed by the Algerian government after independence. Horne estimated Algerian casualties during the span of eight years to be around 700,000. The war uprooted more than 2 million Algerians.The war against French rule concluded in 1962, when Algeria gained complete independence following the March 1962 Evian agreements and the July 1962 self-determination referendum.The first three decades of independence (1962–1991)The number of European Pied-Noirs who fled Algeria totaled more than 900,000 between 1962 and 1964. The exodus to mainland France accelerated after the Oran massacre of 1962, in which hundreds of militants entered European sections of the city, and began attacking civilians.Algeria's first president was the Front de Libération Nationale (FLN) leader Ahmed Ben Bella. Morocco's claim to portions of western Algeria led to the Sand War in 1963. Ben Bella was overthrown in 1965 by Houari Boumédiène, his former ally and defence minister. 
Under Ben Bella, the government had become increasingly socialist and authoritarian; Boumédienne continued this trend. But, he relied much more on the army for his support, and reduced the sole legal party to a symbolic role. He collectivised agriculture and launched a massive industrialisation drive. Oil extraction facilities were nationalised. This was especially beneficial to the leadership after the international 1973 oil crisis.In the 1960s and 1970s under President Houari Boumediene, Algeria pursued a program of industrialisation within a state-controlled socialist economy. Boumediene's successor, Chadli Bendjedid, introduced some liberal economic reforms. He promoted a policy of Arabisation in Algerian society and public life. Teachers of Arabic, brought in from other Muslim countries, spread conventional Islamic thought in schools and sowed the seeds of a return to Orthodox Islam.The Algerian economy became increasingly dependent on oil, leading to hardship when the price collapsed during the 1980s oil glut. Economic recession caused by the crash in world oil prices resulted in Algerian social unrest during the 1980s; by the end of the decade, Bendjedid introduced a multi-party system. Political parties developed, such as the Islamic Salvation Front (FIS), a broad coalition of Muslim groups.Civil War (1991–2002) and aftermathIn December 1991 the Islamic Salvation Front dominated the first of two rounds of legislative elections. Fearing the election of an Islamist government, the authorities intervened on 11 January 1992, cancelling the elections. Bendjedid resigned and a High Council of State was installed to act as the Presidency. It banned the FIS, triggering a civil insurgency between the Front's armed wing, the Armed Islamic Group, and the national armed forces, in which more than 100,000 people are thought to have died. The Islamist militants conducted a violent campaign of civilian massacres. 
At several points in the conflict, the situation in Algeria became a point of international concern, most notably during the crisis surrounding Air France Flight 8969, a hijacking perpetrated by the Armed Islamic Group. The Armed Islamic Group declared a ceasefire in October 1997. Algeria held elections in 1999, which were considered biased by international observers and most opposition groups and were won by President Abdelaziz Bouteflika. He worked to restore political stability to the country and announced a "Civil Concord" initiative, approved in a referendum, under which many political prisoners were pardoned, and several thousand members of armed groups were granted exemption from prosecution under a limited amnesty, in force until 13 January 2000. The AIS disbanded and levels of insurgent violence fell rapidly. The Groupe Salafiste pour la Prédication et le Combat (GSPC), a splinter group of the Armed Islamic Group, continued a terrorist campaign against the Government. Bouteflika was re-elected in the April 2004 presidential election after campaigning on a programme of national reconciliation. The programme comprised economic, institutional, political and social reform to modernise the country, raise living standards, and tackle the causes of alienation. It also included a second amnesty initiative, the Charter for Peace and National Reconciliation, which was approved in a referendum in September 2005. It offered amnesty to most guerrillas and Government security forces. In November 2008, the Algerian Constitution was amended following a vote in Parliament, removing the two-term limit on Presidential incumbents. This change enabled Bouteflika to stand for re-election in the 2009 presidential elections, and he was re-elected in April 2009. 
During his election campaign and following his re-election, Bouteflika promised to extend the programme of national reconciliation and a $150-billion spending programme to create three million new jobs, the construction of one million new housing units, and to continue public sector and infrastructure modernisation programmes.A continuing series of protests throughout the country started on 28 December 2010, inspired by similar protests across the Middle East and North Africa. On 24 February 2011, the government lifted Algeria's 19-year-old state of emergency. The government enacted legislation dealing with political parties, the electoral code, and the representation of women in elected bodies. In April 2011, Bouteflika promised further constitutional and political reform. However, elections are routinely criticised by opposition groups as unfair and international human rights groups say that media censorship and harassment of political opponents continue.On 2 April 2019, Bouteflika resigned from the presidency after mass protests against his candidacy for a fifth term in office.In December 2019, Abdelmadjid Tebboune became Algeria's president, after winning the first round of the presidential election with a record abstention rate – the highest of all presidential elections since Algeria's democracy in 1989. Tebboune is close to the military and he is also accused of being loyal to the deposed president.Geography Since the 2011 breakup of Sudan, and the creation of South Sudan, Algeria has been the largest country in Africa, and the Mediterranean Basin. Its southern part includes a significant portion of the Sahara. To the north, the Tell Atlas form with the Saharan Atlas, further south, two parallel sets of reliefs in approaching eastbound, and between which are inserted vast plains and highlands. Both Atlas tend to merge in eastern Algeria. 
The vast mountain ranges of Aures and Nememcha occupy the entire northeastern Algeria and are delineated by the Tunisian border. The highest point is Mount Tahat ().Algeria lies mostly between latitudes 19° and 37°N (a small area is north of 37°N and south of 19°N), and longitudes 9°W and 12°E. Most of the coastal area is hilly, sometimes even mountainous, and there are a few natural harbours. The area from the coast to the Tell Atlas is fertile. South of the Tell Atlas is a steppe landscape ending with the Saharan Atlas; farther south, there is the Sahara desert.The Hoggar Mountains (), also known as the Hoggar, are a highland region in central Sahara, southern Algeria. They are located about  south of the capital, Algiers, and just east of Tamanghasset. Algiers, Oran, Constantine, and Annaba are Algeria's main cities.Climate and hydrology In this region, midday desert temperatures can be hot year round. After sunset, however, the clear, dry air permits rapid loss of heat, and the nights are cool to chilly. Enormous daily ranges in temperature are recorded.Rainfall is fairly plentiful along the coastal part of the Tell Atlas, ranging from  annually, the amount of precipitation increasing from west to east. Precipitation is heaviest in the northern part of eastern Algeria, where it reaches as much as  in some years.Farther inland, the rainfall is less plentiful. Algeria also has ergs, or sand dunes, between mountains. Among these, in the summer time when winds are heavy and gusty, temperatures can go up to .Fauna and flora The varied vegetation of Algeria includes coastal, mountainous and grassy desert-like regions which all support a wide range of wildlife. Many of the creatures comprising the Algerian wildlife live in close proximity to civilisation. The most commonly seen animals include the wild boars, jackals, and gazelles, although it is not uncommon to spot fennecs (foxes), and jerboas. 
Algeria also has a small African leopard and Saharan cheetah population, but these are seldom seen. A species of deer, the Barbary stag, inhabits the dense humid forests in the north-eastern areas. The fennec fox is the national animal of Algeria.A variety of bird species makes the country an attraction for bird watchers. The forests are inhabited by boars and jackals. Barbary macaques are the sole native monkey. Snakes, monitor lizards, and numerous other reptiles can be found living among an array of rodents throughout the semi arid regions of Algeria. Many animals are now extinct, including the Barbary lions, Atlas bears and crocodiles.In the north, some of the native flora includes Macchia scrub, olive trees, oaks, cedars and other conifers. The mountain regions contain large forests of evergreens (Aleppo pine, juniper, and evergreen oak) and some deciduous trees. Fig, eucalyptus, agave, and various palm trees grow in the warmer areas. The grape vine is indigenous to the coast. In the Sahara region, some oases have palm trees. Acacias with wild olives are the predominant flora in the remainder of the Sahara. Algeria had a 2018 Forest Landscape Integrity Index mean score of 5.22/10, ranking it 106th globally out of 172 countries.Camels are used extensively; the desert also abounds with venomous and nonvenomous snakes, scorpions, and numerous insects.Government and politics Elected politicians have relatively little sway over Algeria. Instead, a group of unelected civilian and military "décideurs" ("deciders"), known as "le pouvoir" ("the power"), actually rule the country, even deciding who should be president. The most powerful man might have been Mohamed Mediène, the head of military intelligence, before he was brought down during the 2019 protests. In recent years, many of these generals have died, retired, or been imprisoned. 
After the death of General Larbi Belkheir, previous president Bouteflika put loyalists in key posts, notably at Sonatrach, and secured constitutional amendments that made him re-electable indefinitely, until he was brought down in 2019 during protests.The head of state is the President of Algeria, who is elected for a five-year term. The president was formerly limited to two five-year terms, but a constitutional amendment passed by the Parliament on 11 November 2008 removed this limitation. The most recent presidential election was planned to be in April 2019, but widespread protests erupted on 22 February against the president's decision to participate in the election, which resulted in President Bouteflika announcing his resignation on 3 April. Abdelmadjid Tebboune, an independent candidate, was elected as president after the election eventually took place on 12 December 2019. Protestors refused to recognise Tebboune as president, citing demands for comprehensive reform of the political system. Algeria has universal suffrage at 18 years of age. The President is the head of the army, the Council of Ministers and the High Security Council. He appoints the Prime Minister who is also the head of government.The Algerian parliament is bicameral; the lower house, the People's National Assembly, has 462 members who are directly elected for five-year terms, while the upper house, the Council of the Nation, has 144 members serving six-year terms, of which 96 members are chosen by local assemblies and 48 are appointed by the president. According to the constitution, no political association may be formed if it is "based on differences in religion, language, race, gender, profession, or region". In addition, political campaigns must be exempt from the aforementioned subjects.Parliamentary elections were last held in May 2017. 
In the elections, the FLN lost 44 of its seats, but remained the largest party with 164 seats, the military-backed National Rally for Democracy won 100, and the Muslim Brotherhood-linked Movement of the Society for Peace won 33.Foreign relationsAlgeria is included in the European Union's European Neighbourhood Policy (ENP) which aims at bringing the EU and its neighbours closer.Giving incentives and rewarding best performers, as well as offering funds in a faster and more flexible manner, are the two main principles underlying the European Neighbourhood Instrument (ENI) that came into force in 2014. It has a budget of €15.4 billion and provides the bulk of funding through a number of programmes.In 2009, the French government agreed to compensate victims of nuclear tests in Algeria. Defence Minister Herve Morin stated that "It's time for our country to be at peace with itself, at peace thanks to a system of compensation and reparations," when presenting the draft law on the payouts. Algerian officials and activists believe that this is a good first step and hope that this move would encourage broader reparation.Tensions between Algeria and Morocco in relation to the Western Sahara have been an obstacle to tightening the Arab Maghreb Union, nominally established in 1989, but which has carried little practical weight. On 24 August 2021, Algeria announced the break of diplomatic relations with Morocco.MilitaryThe military of Algeria consists of the People's National Army (ANP), the Algerian National Navy (MRA), and the Algerian Air Force (QJJ), plus the Territorial Air Defence Forces. It is the direct successor of the National Liberation Army (Armée de Libération Nationale or ALN), the armed wing of the nationalist National Liberation Front which fought French colonial occupation during the Algerian War of Independence (1954–62).Total military personnel include 147,000 active, 150,000 reserve, and 187,000 paramilitary staff (2008 estimate). 
Service in the military is compulsory for men aged 19–30, for a total of 12 months. The military expenditure was 4.3% of the gross domestic product (GDP) in 2012. Algeria has the second largest military in North Africa with the largest defence budget in Africa ($10 billion). Most of Algeria's weapons are imported from Russia, with whom they are a close ally.In 2007, the Algerian Air Force signed a deal with Russia to purchase 49 MiG-29SMT and 6 MiG-29UBT at an estimated cost of $1.9 billion. Russia is also building two 636-type diesel submarines for Algeria.Human rightsAlgeria has been categorised by Freedom House as "not free" since it began publishing such ratings in 1972, with the exception of 1989, 1990, and 1991, when the country was labelled "partly free." In December 2016, the Euro-Mediterranean Human Rights Monitor issued a report regarding violation of media freedom in Algeria. It clarified that the Algerian government imposed restriction on freedom of the press; expression; and right to peaceful demonstration, protest and assembly as well as intensified censorship of the media and websites. Due to the fact that the journalists and activists criticise the ruling government, some media organisations' licenses are cancelled.Independent and autonomous trade unions face routine harassment from the government, with many leaders imprisoned and protests suppressed. In 2016, a number of unions, many of which were involved in the 2010–2012 Algerian Protests, have been deregistered by the government.Homosexuality is illegal in Algeria. Public homosexual behavior is punishable by up to two years in prison. Despite this, about 26% of Algerians think that homosexuality should be accepted, according to the survey conducted by the BBC News Arabic-Arab Barometer in 2019. 
Algeria showed the largest LGBT acceptance compared to other Arab countries where the survey was conducted. Human Rights Watch has accused the Algerian authorities of using the COVID-19 pandemic as an excuse to prevent pro-democracy movements and protests in the country, leading to the arrest of youths as part of social distancing. Administrative divisions Algeria is divided into 58 provinces (wilayas), 553 districts (daïras) and 1,541 municipalities (baladiyahs). Each province, district, and municipality is named after its seat, which is usually the largest city. The administrative divisions have changed several times since independence. When introducing new provinces, the numbers of old provinces are kept, hence the non-alphabetical order. With their official numbers, currently (since 1983) they are Economy Algeria's currency is the dinar (DZD). The economy remains dominated by the state, a legacy of the country's socialist post-independence development model. In recent years, the Algerian government has halted the privatization of state-owned industries and imposed restrictions on imports and foreign involvement in its economy. These restrictions have only recently started to be lifted, although questions about Algeria's slowly diversifying economy remain. Algeria has struggled to develop industries outside hydrocarbons in part because of high costs and an inert state bureaucracy. The government's efforts to diversify the economy by attracting foreign and domestic investment outside the energy sector have done little to reduce high youth unemployment rates or to address housing shortages. 
The country is facing a number of short-term and medium-term problems, including the need to diversify the economy, strengthen political, economic and financial reforms, improve the business climate and reduce inequalities amongst regions.A wave of economic protests in February and March 2011 prompted the Algerian government to offer more than $23 billion in public grants and retroactive salary and benefit increases. Public spending has increased by 27% annually during the past 5 years. The 2010–14 public-investment programme will cost US$286 billion, 40% of which will go to human development.Thanks to strong hydrocarbon revenues, Algeria has a cushion of $173 billion in foreign currency reserves and a large hydrocarbon stabilisation fund. In addition, Algeria's external debt is extremely low at about 2% of GDP. The economy remains very dependent on hydrocarbon wealth, and, despite high foreign exchange reserves (US$178 billion, equivalent to three years of imports), current expenditure growth makes Algeria's budget more vulnerable to the risk of prolonged lower hydrocarbon revenues.Algeria has not joined the WTO, despite several years of negotiations but is a member of the Greater Arab Free Trade Area and the African Continental Free Trade Area, and has an association agreement with the European UnionOil and natural resourcesAlgeria, whose economy is reliant on petroleum, has been an OPEC member since 1969. Its crude oil production stands at around 1.1 million barrels/day, but it is also a major gas producer and exporter, with important links to Europe. Hydrocarbons have long been the backbone of the economy, accounting for roughly 60% of budget revenues, 30% of GDP, and 87.7% of export earnings. Algeria has the 10th-largest reserves of natural gas in the world and is the sixth-largest gas exporter. The U.S. Energy Information Administration reported that in 2005, Algeria had  of proven natural-gas reserves. 
It also ranks 16th in oil reserves.Non-hydrocarbon growth for 2011 was projected at 5%. To cope with social demands, the authorities raised expenditure, especially on basic food support, employment creation, support for SMEs, and higher salaries. High hydrocarbon prices have improved the current account and the already large international reserves position.Income from oil and gas rose in 2011 as a result of continuing high oil prices, though the trend in production volume is downwards. Production from the oil and gas sector in terms of volume, continues to decline, dropping from 43.2 million tonnes to 32 million tonnes between 2007 and 2011. Nevertheless, the sector accounted for 98% of the total volume of exports in 2011, against 48% in 1962, and 70% of budgetary receipts, or US$71.4 billion.The Algerian national oil company is Sonatrach, which plays a key role in all aspects of the oil and natural gas sectors in Algeria. All foreign operators must work in partnership with Sonatrach, which usually has majority ownership in production-sharing agreements.Access to biocapacity in Algeria is lower than world average. In 2016, Algeria had 0.53 global hectares of biocapacity per person within its territory, much less than the world average of 1.6 global hectares per person. In 2016, Algeria used 2.4 global hectares of biocapacity per person – their ecological footprint of consumption. This means they use just under 4.5 times as much biocapacity as Algeria contains. As a result, Algeria is running a biocapacity deficit.Research and alternative energy sourcesAlgeria has invested an estimated 100 billion dinars towards developing research facilities and paying researchers. This development program is meant to advance alternative energy production, especially solar and wind power. Algeria is estimated to have the largest solar energy potential in the Mediterranean, so the government has funded the creation of a solar science park in Hassi R'Mel. 
Currently, Algeria has 20,000 research professors at various universities and over 780 research labs, with state-set goals to expand to 1,000. Besides solar energy, areas of research in Algeria include space and satellite telecommunications, nuclear power and medical research.Labour marketThe overall rate of unemployment was 10% in 2011, but remained higher among young people, with a rate of 21.5% for those aged between 15 and 24. The government strengthened in 2011 the job programs introduced in 1988, in particular in the framework of the program to aid those seeking work (Dispositif d'Aide à l'Insertion Professionnelle).Despite a decline in total unemployment, youth and women unemployment is high. Unemployment particularly affects the young, with a jobless rate of 21.5% among the 15–24 age group.TourismThe development of the tourism sector in Algeria had previously been hampered by a lack of facilities, but since 2004 a broad tourism development strategy has been implemented resulting in many hotels of a high modern standard being built.There are several UNESCO World Heritage Sites in Algeria including Al Qal'a of Beni Hammad, the first capital of the Hammadid empire; Tipasa, a Phoenician and later Roman town; and Djémila and Timgad, both Roman ruins; M'Zab Valley, a limestone valley containing a large urbanized oasis; and the Casbah of Algiers, an important citadel. The only natural World Heritage Site is the Tassili n'Ajjer, a mountain range.TransportThe Algerian road network is the densest in Africa; its length is estimated at  of highways, with more than 3,756 structures and a paving rate of 85%. This network will be complemented by the East-West Highway, a major infrastructure project currently under construction. It is a 3-way,  highway, linking Annaba in the extreme east to the Tlemcen in the far west. Algeria is also crossed by the Trans-Sahara Highway, which is now completely paved. 
This road is supported by the Algerian government to increase trade between the six countries crossed: Algeria, Mali, Niger, Nigeria, Chad, and Tunisia.DemographicsAlgeria has a population of an estimated 44 million, of which the vast majority are Arab-Berber ethnically. At the outset of the 20th century, its population was approximately four million. About 90% of Algerians live in the northern, coastal area; the inhabitants of the Sahara desert are mainly concentrated in oases, although some 1.5 million remain nomadic or partly nomadic. 28.1% of Algerians are under the age of 15.Between 90,000 and 165,000 Sahrawis from Western Sahara live in the Sahrawi refugee camps, in the western Algerian Sahara desert. There are also more than 4,000 Palestinian refugees, who are well integrated and have not asked for assistance from the United Nations High Commissioner for Refugees (UNHCR). In 2009, 35,000 Chinese migrant workers lived in Algeria.The largest concentration of Algerian migrants outside Algeria is in France, which has reportedly over 1.7 million Algerians of up to the second generation.Ethnic groupsIndigenous Berbers as well as Phoenicians, Romans, Vandals, Byzantine Greeks, Arabs, Turks, various Sub-Saharan Africans, and French have contributed to the history of Algeria. Descendants of Andalusian refugees are also present in the population of Algiers and other cities. Moreover, Spanish was spoken by these Aragonese and Castillian Morisco descendants deep into the 18th century, and even Catalan was spoken at the same time by Catalan Morisco descendants in the small town of Grish El-Oued.Despite the dominance of the Berber ethnicity in Algeria, the majority of Algerians identify with an Arabic-based identity, especially after the Arab nationalism rising in the 20th century.    Berbers and Berber-speaking Algerians are divided into many groups with varying languages. 
The largest of these are the Kabyles, who live in the Kabylie region east of Algiers, the Chaoui of Northeast Algeria, the Tuaregs in the southern desert and the Shenwa people of North Algeria.During the colonial period, there was a large (10% in 1960) European population who became known as Pied-Noirs. They were primarily of French, Spanish and Italian origin. Almost all of this population left during the war of independence or immediately after its end.LanguagesModern Standard Arabic and Berber are the official languages. Algerian Arabic (Darja) is the language used by the majority of the population. Colloquial Algerian Arabic is heavily infused with borrowings from French and Berber.Berber has been recognised as a "national language" by the constitutional amendment of 8 May 2002. Kabyle, the predominant Berber language, is taught and is partially co-official (with a few restrictions) in parts of Kabylie. In February 2016, the Algerian constitution passed a resolution that made Berber an official language alongside Arabic.Although French has no official status in Algeria, it has one of the largest Francophone populations in the world, and French is widely used in government, media (newspapers, radio, local television), and both the education system (from primary school onwards) and academia due to Algeria's colonial history. It can be regarded as a lingua franca of Algeria. In 2008, 11.2 million Algerians could read and write in French. An Abassa Institute study in April 2000 found that 60% of households could speak and understand French, or 18 million people out of a total of 30 million at the time. Following a period during which the Algerian government tried to phase out French, in recent decades the government has changed course and reinforced the study of French, and some television programs are broadcast in the language.Algeria emerged as a bilingual state after 1962. 
Colloquial Algerian Arabic is spoken by about 72% of the population and Berber by 27–30%.ReligionIslam is the predominant religion in Algeria, with its adherents, mostly Sunnis, accounting for 99% of the population according to a 2021 CIA World Factbook estimate, and 97.9% according to Pew Research in 2020. There are about 290,000 Ibadis in the M'zab Valley in the region of Ghardaia. Estimates of the Christian population range from 20,000 to 200,000. Algerian citizens who are Christians predominantly belong to Protestant groups, which have seen increased pressure from the government in recent years, including many forced closures.There has been an increase in the number of people identifying as non-religious. The June 2019 Arab Barometer-BBC News report found that the percentage of Algerians identifying as non-religious has grown from around 8% in 2013 to around 15% in 2018. The December 2019 Arab Barometer found that the growth in the percentage of Algerians identifying as non-religious is largely driven by young Algerians, with roughly 25% describing themselves as non-religious.Algeria has given the Muslim world a number of prominent thinkers, including Emir Abdelkader, Abdelhamid Ben Badis, Mouloud Kacem Naît Belkacem, Malek Bennabi and Mohamed Arkoun.HealthIn 2018, Algeria had the highest numbers of physicians in the Maghreb region (1.72 per 1,000 people), nurses (2.23 per 1,000 people), and dentists (0.31 per 1,000 people). Access to "improved water sources" was around 97.4% of the population in urban areas and 98.7% of the population in the rural areas. Some 99% of Algerians living in urban areas, and around 93.4% of those living in rural areas, had access to "improved sanitation". According to the World Bank, Algeria is making progress toward its goal of "reducing by half the number of people without sustainable access to improved drinking water and basic sanitation by 2015". 
Given Algeria's young population, policy favours preventive health care and clinics over hospitals. In keeping with this policy, the government maintains an immunisation program. However, poor sanitation and unclean water still cause tuberculosis, hepatitis, measles, typhoid fever, cholera and dysentery. The poor generally receive health care free of charge.Health records have been maintained in Algeria since 1882 and began adding Muslims living in the south to their vital record database in 1905 during French rule.EducationSince the 1970s, in a centralised system that was designed to significantly reduce the rate of illiteracy, the Algerian government introduced a decree by which school attendance became compulsory for all children aged between 6 and 15 years who have the ability to track their learning through the 20 facilities built since independence, now the literacy rate is around 92.6%. Since 1972, Arabic is used as the language of instruction during the first nine years of schooling. From the third year, French is taught and it is also the language of instruction for science classes. The students can also learn English, Italian, Spanish and German. In 2008, new programs at the elementary appeared, therefore the compulsory schooling does not start at the age of six anymore, but at the age of five. Apart from the 122 private schools, the Universities of the State are free of charge. After nine years of primary school, students can go to the high school or to an educational institution. The school offers two programs: general or technical. At the end of the third year of secondary school, students pass the exam of the baccalaureate, which allows once it is successful to pursue graduate studies in universities and institutes.Education is officially compulsory for children between the ages of six and 15. In 2008, the illiteracy rate for people over 10 was 22.3%, 15.6% for men and 29.0% for women. 
The province with the lowest rate of illiteracy was Algiers Province at 11.6%, while the province with the highest rate was Djelfa Province at 35.5%.Algeria has 26 universities and 67 institutions of higher education, which must accommodate a million Algerians and 80,000 foreign students in 2008. The University of Algiers, founded in 1879, is the oldest; it offers education in various disciplines (law, medicine, science and letters). Twenty-five of these universities and almost all of the institutions of higher education were founded after the independence of the country.Although some fields, such as law and economics, are taught in Arabic, most others, such as science and medicine, continue to be taught in French and English. Among the most important universities, there are the University of Sciences and Technology Houari Boumediene, the University of Mentouri Constantine, and University of Oran Es-Senia. The University of Abou Bekr Belkaïd in Tlemcen and University of Batna Hadj Lakhdar rank 26th and 45th in Africa. Algeria was ranked 121st in the Global Innovation Index in 2020, down from 113th in 2019.CitiesBelow is a list of the most populous Algerian cities:CultureModern Algerian literature, split between Arabic, Tamazight and French, has been strongly influenced by the country's recent history. Famous novelists of the 20th century include Mohammed Dib, Albert Camus, Kateb Yacine and Ahlam Mosteghanemi while Assia Djebar is widely translated. Among the important novelists of the 1980s were Rachid Mimouni, later vice-president of Amnesty International, and Tahar Djaout, murdered by an Islamist group in 1993 for his secularist views.Malek Bennabi and Frantz Fanon are noted for their thoughts on decolonization; Augustine of Hippo was born in Tagaste (modern-day Souk Ahras); and Ibn Khaldun, though born in Tunis, wrote the Muqaddima while staying in Algeria. 
The works of the Sanusi family in pre-colonial times, and of Emir Abdelkader and Sheikh Ben Badis in colonial times, are widely noted. The Latin author Apuleius was born in Madaurus (Mdaourouch), in what later became Algeria.Contemporary Algerian cinema is various in terms of genre, exploring a wider range of themes and issues. There has been a transition from cinema which focused on the war of independence to films more concerned with the everyday lives of Algerians.MediaArtAlgerian painters, like Mohamed Racim or Baya, attempted to revive the prestigious Algerian past prior to French colonisation, at the same time that they have contributed to the preservation of the authentic values of Algeria. In this line, Mohamed Temam, Abdelkhader Houamel have also returned through this art, scenes from the history of the country, the habits and customs of the past and the country life. Other new artistic currents including the one of M'hamed Issiakhem, Mohammed Khadda and Bachir Yelles, appeared on the scene of Algerian painting, abandoning figurative classical painting to find new pictorial ways, in order to adapt Algerian paintings to the new realities of the country through its struggle and its aspirations. Mohammed Khadda and M'hamed Issiakhem have been notable in recent years.Literature The historic roots of Algerian literature go back to the Numidian and Roman African era, when Apuleius wrote The Golden Ass, the only Latin novel to survive in its entirety. This period had also known Augustine of Hippo, Nonius Marcellus and Martianus Capella, among many others. The Middle Ages have known many Arabic writers who revolutionised the Arab world literature, with authors like Ahmad al-Buni, Ibn Manzur and Ibn Khaldoun, who wrote the Muqaddimah while staying in Algeria, and many others.Albert Camus was an Algerian-born French Pied-Noir author. 
In 1957, he was awarded the Nobel Prize in literature.Today Algeria contains, in its literary landscape, big names having not only marked the Algerian literature, but also the universal literary heritage in Arabic and French.As a first step, Algerian literature was marked by works whose main concern was the assertion of the Algerian national entity, there is the publication of novels as the Algerian trilogy of Mohammed Dib, or even Nedjma of Kateb Yacine novel which is often regarded as a monumental and major work. Other known writers will contribute to the emergence of Algerian literature whom include Mouloud Feraoun, Malek Bennabi, Malek Haddad, Moufdi Zakaria, Abdelhamid Ben Badis, Mohamed Laïd Al-Khalifa, Mouloud Mammeri, Frantz Fanon, and Assia Djebar.In the aftermath of the independence, several new authors emerged on the Algerian literary scene, they will attempt through their works to expose a number of social problems, among them there are Rachid Boudjedra, Rachid Mimouni, Leila Sebbar, Tahar Djaout and Tahir Wattar.Currently, a part of Algerian writers tends to be defined in a literature of shocking expression, due to the terrorism that occurred during the 1990s, the other party is defined in a different style of literature who staged an individualistic conception of the human adventure. Among the most noted recent works, there is the writer, the swallows of Kabul and the attack of Yasmina Khadra, the oath of barbarians of Boualem Sansal, memory of the flesh of Ahlam Mosteghanemi and the last novel by Assia Djebar nowhere in my father's House.MusicChaâbi music is a typically Algerian musical genre characterized by specific rhythms and of Qacidate (popular poems) in Arabic dialect. The undisputed master of this music is El Hadj M'Hamed El Anka. 
The Constantinois Malouf style is preserved by musicians, among whom Mohamed Tahar Fergani is a performer.Folk music styles include Bedouin music, characterized by the poetic songs based on long kacida (poems); Kabyle music, based on a rich repertoire that is poetry and old tales passed through generations; Shawiya music, a folklore from diverse areas of the Aurès Mountains. Rahaba music style is unique to the Aures. Souad Massi is a rising Algerian folk singer. Other Algerian singers of the diaspora include Manel Filali in Germany and Kenza Farah in France. Tergui music is generally sung in Tuareg languages; Tinariwen had worldwide success. Finally, the staïfi music was born in Sétif and remains a unique style of its kind.Modern music is available in several facets; Raï music is a style typical of western Algeria. Rap, a relatively recent style in Algeria, is experiencing significant growth.CinemaThe Algerian state's interest in film-industry activities can be seen in the annual budget of DZD 200 million (EUR 1.3 million) allocated to production, specific measures and an ambitious programme plan implemented by the Ministry of Culture in order to promote national production, renovate the cinema stock and remedy the weak links in distribution and exploitation.The financial support provided by the state, through the Fund for the Development of the Arts, Techniques and the Film Industry (FDATIC) and the Algerian Agency for Cultural Influence (AARC), plays a key role in the promotion of national production. Between 2007 and 2013, FDATIC subsidised 98 films (feature films, documentaries and short films). In mid-2013, AARC had already supported a total of 78 films, including 42 feature films, 6 short films and 30 documentaries.According to the European Audiovisual Observatory's LUMIERE database, 41 Algerian films were distributed in Europe between 1996 and 2013; 21 films in this repertoire were Algerian-French co-productions. 
Days of Glory (2006) and Outside the Law (2010) recorded the highest number of admissions in the European Union, 3,172,612 and 474,722, respectively.Algeria won the Palme d'Or for Chronicle of the Years of Fire (1975), two Oscars for Z (1969), and other awards for the Italian-Algerian movie The Battle of Algiers.CuisineAlgerian cuisine is rich and diverse. The country was considered as the "granary of Rome". It offers a component of dishes and varied dishes, depending on the region and according to the seasons. The cuisine uses cereals as the main products, since they are always produced with abundance in the country. There is not a dish where cereals are not present.Algerian cuisine varies from one region to another, according to seasonal vegetables. It can be prepared using meat, fish and vegetables. Among the dishes known, couscous, chorba, rechta, chakhchoukha, berkoukes, shakshouka, mthewem, chtitha, mderbel, dolma, brik or bourek, garantita, lham'hlou, etc. Merguez sausage is widely used in Algeria, but it differs, depending on the region and on the added spices.Cakes are marketed and can be found in cities either in Algeria, in Europe or North America. However, traditional cakes are also made at home, following the habits and customs of each family. Among these cakes, there are Tamina, Baklawa, Chrik, Garn logzelles, Griouech, Kalb el-louz, Makroud, Mbardja, Mchewek, Samsa, Tcharak, Baghrir, Khfaf, Zlabia, Aarayech, Ghroubiya and Mghergchette. Algerian pastry also contains Tunisian or French cakes. Marketed and home-made bread products include varieties such as Kessra or Khmira or Harchaya, chopsticks and so-called washers Khoubz dar or Matloue. Other traditional meals sold often as street food include mhadjeb or mahjouba, karantika, doubara, chakhchoukha, hassouna, and t'chicha.SportsVarious games have existed in Algeria since antiquity. In the Aures, people played several games such as El Kherba or El khergueba (chess variant). 
Playing cards, checkers and chess games are part of Algerian culture. Racing (fantasia) and rifle shooting are part of cultural recreation of the Algerians.The first Algerian and African gold medalist is Boughera El Ouafi in 1928 Olympics of Amsterdam in the Marathon. The second Algerian Medalist was Alain Mimoun in 1956 Summer Olympics in Melbourne. Several men and women were champions in athletics in the 1990s including Noureddine Morceli, Hassiba Boulmerka, Nouria Merah-Benida, and Taoufik Makhloufi, all specialized in middle-distance running.Football is the most popular sport in Algeria. Several names are engraved in the history of the sport, including Lakhdar Belloumi, Rachid Mekhloufi, Hassen Lalmas, Rabah Madjer, Riyad Mahrez, Salah Assad and Djamel Zidane. The Algeria national football team qualified for the 1982 FIFA World Cup, 1986 FIFA World Cup, 2010 FIFA World Cup and 2014 FIFA World Cup. In addition, several football clubs have won continental and international trophies as the club ES Sétif or JS Kabylia. The Algerian Football Federation is an association of Algeria football clubs organizing national competitions and international matches of the selection of Algeria national football team.See also Index of Algeria-related articles Outline of AlgeriaExplanatory notesCitationsGeneral bibliography  Ageron, Charles-Robert (1991). Modern Algeria – A History from 1830 to the Present. Translated from French and edited by Michael Brett. London: Hurst. . Aghrout, Ahmed; Bougherira, Redha M. (2004). Algeria in Transition – Reforms and Development Prospects. Routledge. . Bennoune, Mahfoud (1988). The Making of Contemporary Algeria – Colonial Upheavals and Post-Independence Development, 1830–1987. Cambridge: Cambridge University Press. . Fanon, Frantz (1966; 2005 paperback). The Wretched of the Earth. Grove Press. ASIN B0007FW4AW, . Horne, Alistair (1977). A Savage War of Peace: Algeria 1954–1962. Viking Adult. ,  (2006 reprint) Laouisset, Djamel (2009). 
A Retrospective Study of the Algerian Iron and Steel Industry. New York City: Nova Publishers. . Roberts, Hugh (2003). The Battlefield – Algeria, 1988–2002. Studies in a Broken Polity. London: Verso Books. . Ruedy, John (1992). Modern Algeria – The Origins and Development of a Nation. Bloomington: Indiana University Press. . Stora, Benjamin (2001). Algeria, 1830–2000 – A Short History. Ithaca, New York: Cornell University Press. . Sidaoui, Riadh (2009). "Islamic Politics and the Military – Algeria 1962–2008". Religion and Politics – Islam and Muslim Civilisation. Farnham: Ashgate Publishing. .External links People's Democratic Republic of Algeria Official government website  Portal of the First Ministry Portal of the First Ministry  Algeria. The World Factbook. Central Intelligence Agency.   Algeria profile from the BBC News  ency education ency education   Key Development Forecasts for Algeria from International Futures EU Neighbourhood Info Centre: Algeria North African countriesMaghrebi countriesSaharan countriesArab republicsRepublicsArabic-speaking countries and territoriesBerber-speaking countries and territoriesFrench-speaking countries and territoriesG15 nationsMember states of OPECMember states of the African UnionMember states of the Arab LeagueMember states of the Organisation of Islamic CooperationMember states of the Union for the MediterraneanCurrent member states of the United NationsStates and territories established in 19621962 establishments in Algeria1962 establishments in AfricaCountries in Africa
+This is a list of characters in Ayn Rand's 1957 novel Atlas Shrugged.Major charactersThe following are major characters from the novel.ProtagonistsDagny TaggartDagny Taggart is the protagonist of the novel. She is vice-president in Charge of Operations for Taggart Transcontinental, under her brother, James Taggart. Given James' incompetence, Dagny is responsible for all the workings of the railroad.Francisco d'AnconiaFrancisco d'Anconia is one of the central characters in Atlas Shrugged, an owner by inheritance of the world's largest copper mining operation. He is a childhood friend, and the first love, of Dagny Taggart. A child prodigy of exceptional talents, Francisco was dubbed the "climax" of the d'Anconia line, an already prestigious family of skilled industrialists. He was a classmate of John Galt and Ragnar Danneskjöld and student of both Hugh Akston and Robert Stadler. He began working while still in school, proving that he could have made a fortune without the aid of his family's wealth and power. Later, Francisco bankrupts the d'Anconia business to put it out of others' reach. His full name is given as "Francisco Domingo Carlos Andres Sebastián d'Anconia".John GaltJohn Galt is the primary male hero of Atlas Shrugged. He initially appears as an unnamed menial worker for Taggart Transcontinental, who often dines with Eddie Willers in the employees' cafeteria, and leads Eddie to reveal important information about Dagny Taggart and Taggart Transcontinental. Only Eddie's side of their conversations is given in the novel. Later in the novel, the reader discovers this worker's true identity.Before working for Taggart Transcontinental, Galt worked as an engineer for the Twentieth Century Motor Company, where he secretly invented a generator of usable electric energy from ambient static electricity, but abandoned his prototype, and his employment, when dissatisfied by an easily corrupted novel system of payment. 
This prototype was found by Dagny Taggart and Hank Rearden. Galt himself remains concealed throughout much of the novel, working a job and living by himself, where he unites the most skillful inventors and business leaders under his leadership. Much of the book's third division is given to his broadcast speech, which presents the author's philosophy of Objectivism.Henry "Hank" ReardenHenry (known as "Hank") Rearden is one of the central characters in Atlas Shrugged. He owns the most important steel company in the United States, and invents Rearden Metal, an alloy stronger, lighter, cheaper and tougher than steel. He lives in Philadelphia with his wife Lillian, his brother Philip, and his elderly mother. Rearden represents a type of self-made man and eventually divorces Lillian, abandons his steel mills following a bloody assault by government-planted workers, and joins John Galt's strike.Eddie WillersEdwin "Eddie" Willers is the Special Assistant to the Vice-President in Charge of Operations at Taggart Transcontinental. His father and grandfather worked for the Taggarts, and he does likewise. He is completely loyal to Dagny and to Taggart Transcontinental. Willers does not possess the creative ability of Galt's associates, but matches them in moral courage and is capable of appreciating and making use of their creations. After Dagny shifts her attention and loyalty to saving the captive Galt, Willers maintains the railroad until its collapse.Ragnar DanneskjöldOne of Galt's first followers, and world-famous as a pirate, who seizes relief ships sent from the United States to the People's States of Europe. He works to ensure that once those espousing Galt's philosophy are restored to their rightful place in society, they have enough capital to rebuild the world. 
Kept in the background for much of the book, Danneskjöld makes a personal appearance to encourage Rearden to persevere in his increasingly difficult situation, and gives him a bar of gold as compensation for the income taxes he has paid over the last several years. Danneskjöld is married to the actress Kay Ludlow; their relationship is kept hidden from the outside world, which only knows of Ludlow as a retired film star. Considered a misfit by Galt's other adherents, he views his actions as a means to speed the world along in understanding Galt's perspective.According to Barbara Branden, who was closely associated with Rand at the time the book was written, there were sections written describing Danneskjöld's adventures at sea, cut from the final published text. In a 1974 comment at a lecture, Ayn Rand admitted that Danneskjöld's name was a tribute to Victor Hugo's novel, , wherein the hero becomes the first of the Counts of Danneskjöld. In the published book, Danneskjöld is always seen through the eyes of others (Dagny Taggart or Hank Rearden), except for a brief paragraph in the very last chapter.AntagonistsJames TaggartThe President of Taggart Transcontinental and the book's most important antagonist. Taggart is an expert influence peddler but incapable of making operational decisions on his own. He relies on his sister, Dagny Taggart, to actually run the railroad, but nonetheless opposes her in almost every endeavor because of his various anti-capitalist moral and political beliefs. In a sense, he is the antithesis of Dagny. This contradiction leads to the recurring absurdity of his life: the desire to overcome those on whom his life depends, and the horror that he will succeed at this. 
In the final chapters of the novel, he suffers a complete mental breakdown upon realizing that he can no longer deceive himself in this respect.Lillian ReardenThe unsupportive wife of Hank Rearden, who dislikes his habits and (secretly at first) seeks to ruin Rearden to prove her own value. Lillian achieves this, when she passes information to James Taggart about her husband's affair with his sister. This information is used to blackmail Rearden to sign a Gift Certificate which delivers all the property rights of Rearden Metal to others. Lillian thereafter uses James Taggart for sexual satisfaction, until Hank abandons her.Dr. Floyd FerrisFerris is a biologist who works as "co-ordinator" at the State Science Institute. He uses his position there to deride reason and productive achievement, and publishes a book entitled Why Do You Think You Think? He clashes on several occasions with Hank Rearden, and twice attempts to blackmail Rearden into giving up Rearden Metal. He is also one of the group of looters who tries to get Rearden to agree to the Steel Unification Plan. Ferris hosts the demonstration of the Project X weapon, and is the creator of the Ferris Persuader, a torture machine. When John Galt is captured by the looters, Ferris uses the device on Galt, but it breaks down before extracting the information Ferris wants from Galt. Ferris represents the group which uses brute force on the heroes to achieve the ends of the looters.Dr. Robert StadlerA former professor at Patrick Henry University, and along with colleague Hugh Akston, mentor to Francisco d'Anconia, John Galt and Ragnar Danneskjöld. He has since become a sell-out, one who had great promise but squandered it for social approval, to the detriment of the free. He works at the State Science Institute where all his inventions are perverted for use by the military, including a sound-based weapon known as Project X (Xylophone). 
He is killed when Cuffy Meigs (see below) drunkenly overloads the circuits of Project X, causing it to destroy itself and every structure and living thing in a 100-mile radius. The character was, in part, modeled on J. Robert Oppenheimer, whom Rand had interviewed for an earlier project, and his part in the creation of nuclear weapons. To his former student Galt, Stadler represents the epitome of human evil, as the "man who knew better" but chose not to act for the good.Wesley MouchThe incompetent and treacherous lobbyist whom Hank Rearden reluctantly employs in Washington, who rises to prominence and authority throughout the novel through trading favours and disloyalty. In return for betraying Hank by helping broker the Equalization of Opportunity Bill (which, by restricting the number of businesses each person may own to one, forces Hank to divest most of his companies), he is given a senior position at the Bureau of Economic Planning and National Resources. Later in the novel he becomes its Top Co-ordinator, a position that eventually becomes Economic Dictator of the country. Mouch's mantra, whenever a problem arises from his prior policy, is to say, "I can't help it. I need wider powers."Secondary charactersThe following secondary characters also appear in the novel.Hugh Akston is identified as "One of the last great advocates of reason." He was a renowned philosopher and the head of the Department of Philosophy at Patrick Henry University, where he taught Francisco d'Anconia, John Galt, and Ragnar Danneskjöld. He was, along with Robert Stadler, a father figure to these three. Akston's name is so hallowed that a young lady, on hearing that Francisco had studied under him, is shocked. She thought he must have been one of those great names from an earlier century. He now works as a cook in a roadside diner, and proves extremely skillful at the job. 
When Dagny tracks him down, and before she discovers his true identity, he rejects her enthusiastic offer to manage the dining car services for Taggart Transcontinental. He is based on Aristotle.Jeff Allen is a tramp who stows away on a Taggart train during one of Dagny's cross-country trips. Instead of throwing him out, she allows him to ride as her guest. It is from Allen that she learns the full story behind the collapse of the Twentieth Century Motor Company (Rand's extensive metaphor for the inherent flaws of communism), as well as a hint of John Galt's true background.Calvin Atwood is owner of Atwood Light and Power Company and joins Galt's strike.Mayor Bascom is the mayor of Rome, Wisconsin, who reveals part of the history of the Twentieth Century Motor Company.Dr. Blodgett is the scientist who pulls the lever to demonstrate Project X.Orren Boyle is the head of Associated Steel, antithesis of Hank Rearden and a friend of James Taggart. He is an investor in the San Sebastián Mines. He disappears from the story after having a nervous breakdown following the failed 'unification' of the steel industry.Laura Bradford is an actress and Kip Chalmers' mistress. She is one of the passengers on his train, and dies in the Taggart Tunnel disaster.Bill Brent is the chief dispatcher for the Colorado Division of Taggart Transcontinental, who tries to prevent the Taggart Tunnel disaster.Cherryl Brooks is a dime store shopgirl who marries James Taggart after a chance encounter in her store the night the John Galt Line was falsely deemed his greatest success. She marries him thinking he is the heroic person behind Taggart Transcontinental. Cherryl is at first harsh towards Dagny, having believed Jim Taggart's descriptions of his sister, until she questions employees of the railroad. 
Upon learning that her scorn had been misdirected, Cherryl puts off apologizing to Dagny out of shame, but eventually admits to Dagny that when she married Jim, she thought he had the heroic qualities that she had looked up to - she thought she was marrying someone like Dagny. Shortly after making this admission, she commits suicide by jumping over a stone parapet and into the river, unable to live with her evil husband and seeing no way to escape him.Millie Bush was "a mean, ugly little eight-year-old" girl voted to receive gold braces to straighten her teeth by the Marxist "family" committee who determined how pay was allocated at The Twentieth Century Motor Company. Her teeth are later knocked out by a man denied an allowance by the committee to purchase the things he valued.Emma Chalmers, Kip Chalmers' mother, gains some influence after his death. Known as "Kip's Ma," she starts a soybean-growing project in Louisiana and commandeers thousands of railroad freight cars to move the harvest. As a result, the year's wheat crop from Minnesota never reaches the rest of the country, but instead rots in storage; also, the soybean crop is lost, having been reaped too early.Kip Chalmers is a Washington man who has decided to run for election as Legislator from California. On the way to a campaign rally, the Taggart Transcontinental train that is carrying him encounters a split rail, resulting in the destruction of its diesel engine. His demands lead to a coal-burning steam engine being attached to his train in its stead and used to pull it through an eight-mile tunnel. The result is the suffocation of all passengers and the destruction of the Taggart Tunnel.Dan Conway is the middle-aged president of the Phoenix-Durango railroad. Running a railroad is just about the only thing he knows. When the Anti-dog-eat-dog Rule is used to drive his business out of Colorado, he loses the will to fight, and resigns himself to a quiet life of books and fishing. 
He is not one of those who joined John Galt's strike, his resignation being a personal choice of his own. Ken Danagger owns Danagger Coal in Pennsylvania. He helps Hank Rearden illegally make Rearden Metal, then later decides to quit and join Galt's strike moments before Dagny arrives to try to persuade him otherwise.Quentin Daniels is an enterprising engineer hired by Dagny Taggart to reconstruct John Galt's motor. Partway through this process, Quentin withdraws his effort for the same reasons John Galt himself had. Dagny's pursuit of Quentin leads her to Galt's Gulch. Galt recognizes in him a younger version of himself, having emulated both Galt's achievements in physics and Galt's social reasoning. Sebastian d'Anconia was the 16th (or 17th) Century founder of the d'Anconia dynasty. Escaped from Spain because of expressing his opinions too freely and coming in conflict with the Inquisition, leaving behind a palace and his beloved. Started a small mine in South America, which became the beginning of a mining empire and a new fortune (and a new palace). Eventually sent for his beloved who had waited for him many years. He is the role model which Francisco d'Anconia looks to, as Dagny Taggart looks to Nathaniel Taggart. Francisco remarks that their respective ancestors would have liked each other.Balph Eubank is called "the literary leader of the age", despite the fact that no book he has written has sold more than 3,000 copies. He complains that it is disgraceful that artists are treated as peddlers, and that there should be a law limiting the sales of books to 10,000 copies. He is a misogynist who thinks it disgusting that Dagny Taggart is a railroad vice-president.The Fishwife is one of the strikers, who earns her living by providing the fish for Hammond's grocery market; she is described as having "dark, disheveled hair and large eyes", and is a writer. Galt says she "wouldn't be published outside. 
She believes that when one deals with words, one deals with the mind." According to Barbara Branden in her book The Passion of Ayn Rand, "The Fishwife is Ayn's Hitchcock-like appearance in Atlas Shrugged." Leonard Peikoff says the same.Lawrence Hammond runs Hammond Cars in Colorado, one of the few companies in existence that still produces top-quality vehicles. He eventually quits and joins the strike.Richard Halley is Dagny Taggart's favorite composer, who mysteriously disappeared after the evening of his greatest triumph. Halley spent years as a struggling and unappreciated composer. At age 24, his opera Phaethon was performed for the first time, to an audience who booed and heckled it. After 19 years, Phaethon was performed again, but this time it was received to the greatest ovation the opera house had ever heard. The following day, Halley retired, sold the rights to his music, and disappeared. It is later revealed that he has joined the strike and settled in Galt's Gulch.Mrs. William Hastings is the widow of the chief engineer at the Twentieth Century Motor Company. Her husband quit shortly after Galt did and joined the strike some years later. Her lead allows Dagny to find Hugh Akston.Dr. Thomas Hendricks is a famous brain surgeon who developed a new method of preventing strokes. He joined Galt's strike when the American medical system was put under government control.Tinky Holloway is one of the "looters" and is frequently referred to and quoted by other characters in the story, but he has only one major appearance: during the Washington meeting with Hank Rearden.Lee Hunsacker is in charge of a company called Amalgamated Service when it takes over the Twentieth Century Motor Company. He files a lawsuit that eventually leads to Midas Mulligan and Judge Narragansett joining the strike. 
A failed businessman, he laments constantly that no-one ever gave him a chance.Gwen Ives is Hank Rearden's secretary, described as being in her late twenties and remaining calm and professional despite the chaos that threatens his business. When Rearden abandons his mills and joins Galt's strike, she and many other employees do the same.Gilbert Keith-Worthing is a British novelist of erstwhile fame, now neglected but still considered a "walking classic," and a proponent of the idea that freedom is an illusion. Kip Chalmers brings him along on the train to California, "for no reason that either of them could discover"; he dies in the Taggart Tunnel disaster.Owen Kellogg is Assistant to the Manager of the Taggart Terminal in New York. He catches Dagny Taggart's eye as one of the few competent men on staff. After seeing the sorry state of the Ohio Division, she decides to make him its new Superintendent. However, as soon as she returns to New York, Kellogg informs her that he is quitting his job. Owen Kellogg eventually reaches, and settles in, Galt's Gulch.Fred Kinnan is a labor leader and member of the looter cabal. Unlike the others, however, Kinnan is straightforward and honest about his purpose. Kinnan is the only one to openly state the true motivations of himself and his fellow conspirators. At the end of Galt's three-hour speech, he expresses admiration for the man, as he says what he means. Despite this, Kinnan admits that he is one of the people Galt is out to destroy.Paul Larkin is an unsuccessful, middle-aged businessman, a friend of the Rearden family. He meets with the other Looters to work out a plan to bring Rearden down. James Taggart knows he is friends with Hank Rearden and challenges his loyalty, and Larkin assures Taggart that he will go along with them.Eugene Lawson heads the Community Bank of Madison, then gets a job with the government when his bank goes bankrupt. 
One of the looters' cabal, he is a collectivist who abhors production and money-making.Mort Liddy is a hack composer who writes trite scores for movies and modern symphonies to which no one listens. He believes melody is a primitive vulgarity. He is one of Lillian Rearden's friends and a member of the cultural elite.Clifton Locey is a friend of Jim Taggart who takes the position of vice-president of operation when Dagny Taggart quits.Pat Logan is the engineer on the first run of the John Galt Line. He later strikes.Kay Ludlow is a beautiful actress who quit Hollywood because of the roles she was given and secretly married the pirate Ragnar Danneskjöld.Dick McNamara is a contractor who finished the San Sebastian Line. Dagny Taggart plans to hire him to lay the new Rearden Metal track for the Rio Norte Line, but before she does so, he mysteriously disappears. She later discovers that he has joined the strike and settled in Galt's Gulch.Cuffy Meigs is the Director of Unification for the railroad business. He carries a pistol and a lucky rabbit's foot, and he dresses in a military uniform, and has been described as "impervious to thought". Meigs seizes control of Project X and accidentally destroys it, demolishing the country's last railroad bridge across the Mississippi River and killing himself, his men, and Dr. Stadler.Dave Mitchum is a state-hired superintendent of the Colorado Division of Taggart Transcontinental. He is partially responsible for the Taggart Tunnel disaster.Chick Morrison holds the position of "Morale Conditioner" in the government. He quits when society begins to collapse and flees to a stronghold in Tennessee. His fellow looters consider it unlikely that he will survive.Horace Bussby Mowen is the president of the Amalgamated Switch and Signal Company, Inc. of Connecticut. 
He is a businessman who sees nothing wrong with the moral code that is destroying society and would never dream of saying he is in business for any reason other than the good of society. Dagny Taggart hires Mowen to produce switches made of Rearden Metal. He is reluctant to build anything with this unproven technology, and has to be cajoled into accepting the contract. When pressured by public opinion, he discontinues production of the switches, forcing Dagny to find an alternative source.Midas Mulligan is a wealthy banker who mysteriously disappeared in protest after he was given a court order to lend money to an incompetent applicant. When the order came down, he liquidated his entire business, paid off his depositors, and joined Galt's strike. He is the legal owner of the land where Galt's Gulch is located. Mulligan's birth name was Michael, but he had it legally changed after a news article called him "Midas" in a derogatory fashion, which Mulligan took as a compliment.Judge Narragansett is an American jurist who ruled in favor of Midas Mulligan during the case brought against him by the incompetent loan applicant. When Narragansett's ruling was reversed on appeal, he retired and joined the strike. At the end of the novel, he is seen editing the United States Constitution, crossing out the contradicting amendments of it and adding an amendment to prohibit Congress from passing laws that restrain freedom of trade.Ben Nealy is a railroad contractor whom Dagny Taggart hires to replace the track on the Rio Norte Line with Rearden Metal. Nealy is incompetent, but Dagny can find no one better in all the country. Nealy believes that anything can get done with enough muscle power. He sees no role for intelligence in human achievement. He relies on Dagny and Ellis Wyatt to run things, and resents them for doing it, because it appears to him like they are just bossing people around.Ted Nielsen is the head of Nielsen Motors. 
He eventually goes on strike, along with most of the other industrialist "producer" types, by closing his motor factory. Dagny later finds him when she visits Galt's Gulch for the first time.Betty Pope is a wealthy socialite who is having a meaningless sexual affair with James Taggart. She is deliberately crude in a way that casts ridicule on her high social position.Dr. Potter holds some undefined position with the State Science Institute. He is sent to try to obtain the rights to Rearden Metal.Dr. Simon Pritchett is the prestigious head of the Department of Philosophy at Patrick Henry University and is considered the leading philosopher of the age. He believes that man is nothing but a collection of chemicals, reason is a superstition, it is futile to seek meaning in life, and the duty of a philosopher is to show that nothing can be understood.Rearden's mother, whose name is not mentioned, lives with Rearden at his home in Philadelphia. She is involved in charity work, and berates Rearden whenever she can. She dotes on her weak son Philip Rearden.Philip Rearden is the younger brother of Hank Rearden. He lives in his brother's home in Philadelphia and is completely dependent on him. He is resentful of his brother's charity.Dwight Sanders owns Sanders Aircraft, a producer of high-quality airplanes, and joins the strike.Bertram Scudder is an editorial writer for the magazine The Future. He typically bashes business and businessmen, but he never says anything specific in his articles, relying on innuendo, sneers, and denunciation. He wrote a hatchet job on Hank Rearden called The Octopus. He is also vocal in support of the Equalization of Opportunity Bill. Scudder claims that the most important thing in life is "brother love" but seems to have nothing but hatred for those around him. 
He loses his job after Dagny Taggart reveals her affair with Hank Rearden over air on his radio show.Claude Slagenhop is president of political organization Friends of Global Progress and one of Lillian Rearden's friends. He believes that ideas are just air, that this is no time for talk, but for action. Global Progress is a sponsor of the Equalization of Opportunity Bill.Gerald and Ivy Starnes are the two surviving children of Jed Starnes, the founder of the Twentieth Century Motor Company. Together with their since-deceased brother Eric, they instituted a communistic payment-and-benefits program that drove the company into bankruptcy. Gerald, a dying alcoholic, and Ivy, a pseudo-Buddhist ascetic, continue to insist that the plan was perfect and that the failure of their father's company was entirely due to the workers. Eric was a weak, attention-seeking man with a pathological desire to be loved. He committed suicide after the woman he loved married another man. Gerald claims that he always acted for the good of the employees, but he was vain and incompetent and often threw lavish parties using company funds. Ivy, on the other hand, is described as a sadist who relishes seeing others in poverty, but who has no desire for wealth of her own.Andrew Stockton runs the Stockton Foundry in Stockton, Colorado. When he joins the strike, he opens a foundry in Galt's Gulch.Nathaniel "Nat" Taggart was the founder of Taggart Transcontinental. He built his railroad without any government handouts, and ran the business for no other reason than to turn a profit. He began as a penniless adventurer and ended up as one of the wealthiest men in the country. He never earned money by force or fraud (except for bribing government officials and throwing an opponent down a flight of stairs), and never apologized for becoming wealthy and successful. He was one of the most hated men of his time. 
Dagny is often inspired by looking at a statue of Nat Taggart at the railroad headquarters, and draws a dollar sign on its base as a signal to Francisco when she is ready to join Galt's strike.  It is suspected that he is modeled after James Jerome Hill, builder of the Great Northern Railroad. Mr. Thompson is the "Head of the State" for the United States. He is not particularly intelligent and has a very undistinguished look. He knows politics, however, and is a master of public relations and back-room deals. Rand's notes indicate that she modeled him on President Harry S. Truman, and that she deliberately decided not to call him "President of the United States" as this title has "honorable connotations" which the character does not deserve.Lester Tuck is the campaign manager for Kip Chalmers and one of his guests on the train trip to California. He dies in the Taggart Tunnel disaster.Clem Weatherby is a government representative on the board of directors of Taggart Transcontinental. Dagny considers him the least bad of the government representatives, since he does have some real knowledge on the running of trains. She notices, however, that he is the least appreciated by his own bosses.The Wet Nurse (Tony) is a young bureaucrat sent by the government to watch over Rearden's mills. Though he starts out as a cynical follower of the looters' code, his experience at the mills transforms him, and he comes to respect and admire the producers. He is shot attempting to inform Hank Rearden about a government plot, but does succeed in warning Rearden just before he dies.Ellis Wyatt is the head of Wyatt Oil. He has almost single-handedly revived the economy of Colorado by discovering a new process for extracting more oil from what were thought to be exhausted oil wells. When first introduced, he is aggressive towards Dagny, whom he does not yet know and whom he blames for what are, in fact, her brother's policies which directly threaten his business. 
When the government passes laws and decrees which make it impossible for him to continue, he sets all his oil wells on fire, leaving a single note: "I am leaving it as I found it. Take over. It's yours." One particular burning well that resists all efforts to extinguish it becomes known as "Wyatt's Torch". Later Dagny meets him in Galt's Gulch.FootnotesNotesCitationsGeneral referencesExternal linksWebsite with comprehensive list of individuals mentioned in Atlas Shrugged Fictional socialitesLists of literary charactersLiterary characters introduced in 1957
+Anthropology is the scientific study of humanity, concerned with human behavior, human biology, cultures, societies, and linguistics, in both the present and past, including past human species. Social anthropology studies patterns of behaviour, while cultural anthropology studies cultural meaning, including norms and values. A portmanteau sociocultural anthropology is commonly used today. Linguistic anthropology studies how language influences social life. Biological or physical anthropology studies the biological development of humans.Archaeological anthropology, often termed as 'anthropology of the past', studies human activity through investigation of physical evidence. It is considered a branch of anthropology in North America and Asia, while in Europe archaeology is viewed as a discipline in its own right or grouped under other related disciplines, such as history.EtymologyThe abstract noun anthropology is first attested in reference to history. Its present use first appeared in Renaissance Germany in the works of Magnus Hundt and Otto Casmann. Their New Latin anthropologia derived from the combining forms of the Greek words ánthrōpos (ἄνθρωπος, "human") and lógos (λόγος, "study"). (Its adjectival form appeared in the works of Aristotle.) It began to be used in English, possibly via French Anthropologie, by the early 18th century.HistoryThrough the 19th centuryIn 1647, the Bartholins, founders of the University of Copenhagen, defined l'anthropologie as follows:Sporadic use of the term for some of the subject matter occurred subsequently, such as the use by Étienne Serres in 1839 to describe the natural history, or paleontology, of man, based on comparative anatomy, and the creation of a chair in anthropology and ethnography in 1850 at the French National Museum of Natural History by Jean Louis Armand de Quatrefages de Bréau. Various short-lived organizations of anthropologists had already been formed. The Société Ethnologique de Paris, the first to use the term ethnology, was formed in 1839. 
Its members were primarily anti-slavery activists. When slavery was abolished in France in 1848, the Société was abandoned.Meanwhile, the Ethnological Society of New York, currently the American Ethnological Society, was founded on its model in 1842, as well as the Ethnological Society of London in 1843, a break-away group of the Aborigines' Protection Society. These anthropologists of the times were liberal, anti-slavery, and pro-human-rights activists. They maintained international connections.Anthropology and many other current fields are the intellectual results of the comparative methods developed in the earlier 19th century. Theorists in such diverse fields as anatomy, linguistics, and ethnology, making feature-by-feature comparisons of their subject matters, were beginning to suspect that similarities between animals, languages, and folkways were the result of processes or laws unknown to them then. For them, the publication of Charles Darwin's On the Origin of Species was the epiphany of everything they had begun to suspect. Darwin himself arrived at his conclusions through comparison of species he had seen in agronomy and in the wild.Darwin and Wallace unveiled evolution in the late 1850s. There was an immediate rush to bring it into the social sciences. Paul Broca in Paris was in the process of breaking away from the Société de biologie to form the first of the explicitly anthropological societies, the Société d'Anthropologie de Paris, meeting for the first time in Paris in 1859. When he read Darwin, he became an immediate convert to Transformisme, as the French called evolutionism. His definition now became "the study of the human group, considered as a whole, in its details, and in relation to the rest of nature".Broca, being what today would be called a neurosurgeon, had taken an interest in the pathology of speech. He wanted to localize the difference between man and the other animals, which appeared to reside in speech. 
He discovered the speech center of the human brain, today called Broca's area after him. His interest was mainly in Biological anthropology, but a German philosopher specializing in psychology, Theodor Waitz, took up the theme of general and social anthropology in his six-volume work, entitled Die Anthropologie der Naturvölker, 1859–1864. The title was soon translated as "The Anthropology of Primitive Peoples". The last two volumes were published posthumously.Waitz defined anthropology as "the science of the nature of man". Following Broca's lead, Waitz points out that anthropology is a new field, which would gather material from other fields, but would differ from them in the use of comparative anatomy, physiology, and psychology to differentiate man from "the animals nearest to him". He stresses that the data of comparison must be empirical, gathered by experimentation. The history of civilization, as well as ethnology, are to be brought into the comparison. It is to be presumed fundamentally that the species, man, is a unity, and that "the same laws of thought are applicable to all men".Waitz was influential among British ethnologists. In 1863, the explorer Richard Francis Burton and the speech therapist James Hunt broke away from the Ethnological Society of London to form the Anthropological Society of London, which henceforward would follow the path of the new anthropology rather than just ethnology. It was the 2nd society dedicated to general anthropology in existence. Representatives from the French Société were present, though not Broca. In his keynote address, printed in the first volume of its new publication, The Anthropological Review, Hunt stressed the work of Waitz, adopting his definitions as a standard. Among the first associates were the young Edward Burnett Tylor, inventor of cultural anthropology, and his brother Alfred Tylor, a geologist. 
Previously Edward had referred to himself as an ethnologist; subsequently, an anthropologist.Similar organizations in other countries followed: The Anthropological Society of Madrid (1865), the American Anthropological Association in 1902, the Anthropological Society of Vienna (1870), the Italian Society of Anthropology and Ethnology (1871), and many others subsequently. The majority of these were evolutionists. One notable exception was the Berlin Society for Anthropology, Ethnology, and Prehistory (1869) founded by Rudolph Virchow, known for his vituperative attacks on the evolutionists. Not religious himself, he insisted that Darwin's conclusions lacked empirical foundation.During the last three decades of the 19th century, a proliferation of anthropological societies and associations occurred, most independent, most publishing their own journals, and all international in membership and association. The major theorists belonged to these organizations. They supported the gradual osmosis of anthropology curricula into the major institutions of higher learning. By 1898, 48 educational institutions in 13 countries had some curriculum in anthropology. None of the 75 faculty members were under a department named anthropology.20th and 21st centuriesThis meager statistic expanded in the 20th century to comprise anthropology departments in the majority of the world's higher educational institutions, many thousands in number. Anthropology has diversified from a few major subdivisions to dozens more. Practical anthropology, the use of anthropological knowledge and technique to solve specific problems, has arrived; for example, the presence of buried victims might stimulate the use of a forensic archaeologist to recreate the final scene. The organization has reached a global level. 
For example, the World Council of Anthropological Associations (WCAA), "a network of national, regional and international associations that aims to promote worldwide communication and cooperation in anthropology", currently contains members from about three dozen nations.Since the work of Franz Boas and Bronisław Malinowski in the late 19th and early 20th centuries, social anthropology in Great Britain and cultural anthropology in the US have been distinguished from other social sciences by their emphasis on cross-cultural comparisons, long-term in-depth examination of context, and the importance they place on participant-observation or experiential immersion in the area of research. Cultural anthropology, in particular, has emphasized cultural relativism, holism, and the use of findings to frame cultural critiques. This has been particularly prominent in the United States, from Boas' arguments against 19th-century racial ideology, through Margaret Mead's advocacy for gender equality and sexual liberation, to current criticisms of post-colonial oppression and promotion of multiculturalism. Ethnography is one of its primary research designs as well as the text that is generated from anthropological fieldwork.In Great Britain and the Commonwealth countries, the British tradition of social anthropology tends to dominate. In the United States, anthropology has traditionally been divided into the four field approach developed by Franz Boas in the early 20th century: biological or physical anthropology; social, cultural, or sociocultural anthropology; and archaeological anthropology; plus linguistic anthropology. These fields frequently overlap but tend to use different methodologies and techniques.European countries with overseas colonies tended to practice more ethnology (a term coined and defined by Adam F. Kollár in 1783). 
It is sometimes referred to as sociocultural anthropology in the parts of the world that were influenced by the European tradition.FieldsAnthropology is a global discipline involving humanities, social sciences and natural sciences. Anthropology builds upon knowledge from natural sciences, including the discoveries about the origin and evolution of Homo sapiens, human physical traits, human behavior, the variations among different groups of humans, how the evolutionary past of Homo sapiens has influenced its social organization and culture, and from social sciences, including the organization of human social and cultural relations, institutions, social conflicts, etc. Early anthropology originated in Classical Greece and Persia and studied and tried to understand observable cultural diversity, such as by Al-Biruni of the Islamic Golden Age. As such, anthropology has been central in the development of several new (late 20th century) interdisciplinary fields such as cognitive science, global studies, and various ethnic studies.According to Clifford Geertz,Sociocultural anthropology has been heavily influenced by structuralist and postmodern theories, as well as a shift toward the analysis of modern societies. During the 1970s and 1990s, there was an epistemological shift away from the positivist traditions that had largely informed the discipline. During this shift, enduring questions about the nature and production of knowledge came to occupy a central place in cultural and social anthropology. In contrast, archaeology and biological anthropology remained largely positivist. Due to this difference in epistemology, the four sub-fields of anthropology have lacked cohesion over the last several decades.SocioculturalSociocultural anthropology draws together the principal axes of cultural anthropology and social anthropology. 
Cultural anthropology is the comparative study of the manifold ways in which people make sense of the world around them, while social anthropology is the study of the relationships among individuals and groups. Cultural anthropology is more related to philosophy, literature and the arts (how one's culture affects the experience for self and group, contributing to a more complete understanding of the people's knowledge, customs, and institutions), while social anthropology is more related to sociology and history. In that, it helps develop an understanding of social structures, typically of others and other populations (such as minorities, subgroups, dissidents, etc.). There is no hard-and-fast distinction between them, and these categories overlap to a considerable degree.Inquiry in sociocultural anthropology is guided in part by cultural relativism, the attempt to understand other societies in terms of their own cultural symbols and values. Accepting other cultures in their own terms moderates reductionism in cross-cultural comparison. This project is often accommodated in the field of ethnography. Ethnography can refer to both a methodology and the product of ethnographic research, i.e. an ethnographic monograph. As a methodology, ethnography is based upon long-term fieldwork within a community or other research site. Participant observation is one of the foundational methods of social and cultural anthropology. Ethnology involves the systematic comparison of different cultures. The process of participant-observation can be especially helpful to understanding a culture from an emic (conceptual, vs. etic, or technical) point of view.The study of kinship and social organization is a central focus of sociocultural anthropology, as kinship is a human universal. 
Sociocultural anthropology also covers economic and political organization, law and conflict resolution, patterns of consumption and exchange, material culture, technology, infrastructure, gender relations, ethnicity, childrearing and socialization, religion, myth, symbols, values, etiquette, worldview, sports, music, nutrition, recreation, games, food, festivals, and language (which is also the object of study in linguistic anthropology).Comparison across cultures is a key element of method in sociocultural anthropology, including the industrialized (and de-industrialized) West. The Standard Cross-Cultural Sample (SCCS) includes 186 such cultures.BiologicalBiological anthropology and physical anthropology are synonymous terms to describe anthropological research focused on the study of humans and non-human primates in their biological, evolutionary, and demographic dimensions. It examines the biological and social factors that have affected the evolution of humans and other primates, and that generate, maintain or change contemporary genetic and physiological variation.ArchaeologicalArchaeology is the study of the human past through its material remains. Artifacts, faunal remains, and human altered landscapes are evidence of the cultural and material lives of past societies. Archaeologists examine material remains in order to deduce patterns of past human behavior and cultural practices. Ethnoarchaeology is a type of archaeology that studies the practices and material remains of living human groups in order to gain a better understanding of the evidence left behind by past human groups, who are presumed to have lived in similar ways.LinguisticLinguistic anthropology (not to be confused with anthropological linguistics) seeks to understand the processes of human communications, verbal and non-verbal, variation in language across time and space, the social uses of language, and the relationship between language and culture. 
It is the branch of anthropology that brings linguistic methods to bear on anthropological problems, linking the analysis of linguistic forms and processes to the interpretation of sociocultural processes. Linguistic anthropologists often draw on related fields including sociolinguistics, pragmatics, cognitive linguistics, semiotics, discourse analysis, and narrative analysis.Ethnography Ethnography is a method of analysing social or cultural interaction. It often involves participant observation though an ethnographer may also draw from texts written by participants in social interactions. Ethnography views first-hand experience and social context as important.Tim Ingold distinguishes ethnography from anthropology arguing that anthropology tries to construct general theories of human experience, applicable in general and novel settings, while ethnography concerns itself with fidelity. He argues that the anthropologist must make their writing consistent with their understanding of literature and other theory, but notes that ethnography may be of use to the anthropologists and the fields inform one another.Key topics by field: socioculturalArt, media, music, dance and filmArt One of the central problems in the anthropology of art concerns the universality of 'art' as a cultural phenomenon. Several anthropologists have noted that the Western categories of 'painting', 'sculpture', or 'literature', conceived as independent artistic activities, do not exist, or exist in a significantly different form, in most non-Western contexts. To surmount this difficulty, anthropologists of art have focused on formal features in objects which, without exclusively being 'artistic', have certain evident 'aesthetic' qualities. 
Boas' Primitive Art, Claude Lévi-Strauss' The Way of the Masks (1982) or Geertz's 'Art as Cultural System' (1983) are some examples in this trend to transform the anthropology of 'art' into an anthropology of culturally specific 'aesthetics'.Media Media anthropology (also known as the anthropology of media or mass media) emphasizes ethnographic studies as a means of understanding producers, audiences, and other cultural and social aspects of mass media. The types of ethnographic contexts explored range from contexts of media production (e.g., ethnographies of newsrooms in newspapers, journalists in the field, film production) to contexts of media reception, following audiences in their everyday responses to media. Other types include cyber anthropology, a relatively new area of internet research, as well as ethnographies of other areas of research which happen to involve media, such as development work, social movements, or health education. This is in addition to many classic ethnographic contexts, where media such as radio, the press, new media, and television have started to make their presences felt since the early 1990s.Music Ethnomusicology is an academic field encompassing various approaches to the study of music (broadly defined), that emphasize its cultural, social, material, cognitive, biological, and other dimensions or contexts instead of or in addition to its isolated sound component or any particular repertoire.Ethnomusicology can be used in a wide variety of fields, such as teaching, politics, cultural anthropology etc.  While the origins of ethnomusicology date back to the 18th and 19th centuries, it was formally introduced as “ethnomusicology” by Dutch scholar Jaap Kunst around 1950. 
Later, the influence of study in this area spawned the creation of the periodical Ethnomusicology and the Society of Ethnomusicology.Visual Visual anthropology is concerned, in part, with the study and production of ethnographic photography, film and, since the mid-1990s, new media. While the term is sometimes used interchangeably with ethnographic film, visual anthropology also encompasses the anthropological study of visual representation, including areas such as performance, museums, art, and the production and reception of mass media. Visual representations from all cultures, such as sandpaintings, tattoos, sculptures and reliefs, cave paintings, scrimshaw, jewelry, hieroglyphics, paintings, and photographs are included in the focus of visual anthropology.Economic, political economic, applied and developmentEconomic Economic anthropology attempts to explain human economic behavior in its widest historic, geographic and cultural scope. It has a complex relationship with the discipline of economics, of which it is highly critical. Its origins as a sub-field of anthropology begin with the Polish-British founder of anthropology, Bronisław Malinowski, and his French compatriot, Marcel Mauss, on the nature of gift-giving exchange (or reciprocity) as an alternative to market exchange. Economic Anthropology remains, for the most part, focused upon exchange. The school of thought derived from Marx and known as Political Economy focuses on production, in contrast. Economic anthropologists have abandoned the primitivist niche they were relegated to by economists, and have now turned to examine corporations, banks, and the global financial system from an anthropological perspective.Political economyPolitical economy in anthropology is the application of the theories and methods of historical materialism to the traditional concerns of anthropology, including, but not limited to, non-capitalist societies. 
Political economy introduced questions of history and colonialism to ahistorical anthropological theories of social structure and culture. Three main areas of interest rapidly developed. The first of these areas was concerned with the "pre-capitalist" societies that were subject to evolutionary "tribal" stereotypes. Sahlins's work on hunter-gatherers as the "original affluent society" did much to dissipate that image. The second area was concerned with the vast majority of the world's population at the time, the peasantry, many of whom were involved in complex revolutionary wars such as in Vietnam. The third area was on colonialism, imperialism, and the creation of the capitalist world-system. More recently, these political economists have more directly addressed issues of industrial (and post-industrial) capitalism around the world.Applied Applied anthropology refers to the application of the method and theory of anthropology to the analysis and solution of practical problems. It is a "complex of related, research-based, instrumental methods which produce change or stability in specific cultural systems through the provision of data, initiation of direct action, and/or the formulation of policy". More simply, applied anthropology is the practical side of anthropological research; it includes researcher involvement and activism within the participating community. It is closely related to development anthropology (distinct from the more critical anthropology of development).DevelopmentAnthropology of development tends to view development from a critical perspective. The kind of issues addressed and implications for the approach simply involve pondering why, if a key development goal is to alleviate poverty, is poverty increasing? Why is there such a gap between plans and outcomes? Why are those working in development so willing to disregard history and the lessons it might offer? Why is development so externally driven rather than having an internal basis? 
In short, why does so much planned development fail?Kinship, feminism, gender and sexualityKinship Kinship can refer both to the study of the patterns of social relationships in one or more human cultures, or it can refer to the patterns of social relationships themselves. Over its history, anthropology has developed a number of related concepts and terms, such as "descent", "descent groups", "lineages", "affines", "cognates", and even "fictive kinship". Broadly, kinship patterns may be considered to include people related both by descent (one's social relations during development), and also relatives by marriage. Within kinship you have two different families. People have their biological families and it is the people they share DNA with. This is called consanguineal relations or "blood ties". People can also have a chosen family (see Finding Connection Through "Chosen Family") in which they choose who they want to be a part of their family. In some cases people are closer to their chosen family than to their biological families.Feminist Feminist anthropology is a four field approach to anthropology (archeological, biological, cultural, linguistic) that seeks to reduce male bias in research findings, anthropological hiring practices, and the scholarly production of knowledge. Anthropology engages often with feminists from non-Western traditions, whose perspectives and experiences can differ from those of white feminists of Europe, America, and elsewhere. From the perspective of the Western world, historically such 'peripheral' perspectives have been ignored, observed only from an outsider perspective, and regarded as less-valid or less-important than knowledge from the Western world. 
Exploring and addressing that double bias against women from marginalized racial or ethnic groups is of particular interest in intersectional feminist anthropology.Feminist anthropologists have stated that their publications have contributed to anthropology, along the way correcting against the systemic biases beginning with the "patriarchal origins of anthropology (and academia)" and note that from 1891 to 1930 more than 85% of doctorates in anthropology went to males, more than 81% to people under 35, and only 7.2% to anyone over 40 years old, thus reflecting an age gap in the pursuit of anthropology by first-wave feminists until later in life. This correction of systemic bias may include mainstream feminist theory, history, linguistics, archaeology, and anthropology. Feminist anthropologists are often concerned with the construction of gender across societies. Gender constructs are of particular interest when studying sexism.According to St. Clair Drake, Vera Mae Green was, until "[w]ell into the 1960s", the only African-American female anthropologist who was also a Caribbeanist. She studied ethnic and family relations in the Caribbean as well as the United States, and thereby tried to improve the way black life, experiences, and culture were studied. However, Zora Neale Hurston, although often primarily considered to be a literary author, was trained in anthropology by Franz Boas, and published Tell my Horse about her "anthropological observations" of voodoo in the Caribbean (1938).Feminist anthropology is inclusive of the anthropology of birth as a specialization, which is the anthropological study of pregnancy and childbirth within cultures and societies.Medical, nutritional, psychological, cognitive and transpersonalMedical Medical anthropology is an interdisciplinary field which studies "human health and disease, health care systems, and biocultural adaptation". It is believed that William Caudell was the first to discover the field of medical anthropology. 
Currently, research in medical anthropology is one of the main growth areas in the field of anthropology as a whole. It focuses on the following six basic fields:Other subjects that have become central to medical anthropology worldwide are violence and social suffering (Farmer, 1999, 2003; Beneduce, 2010) as well as other issues that involve physical and psychological harm and suffering that are not a result of illness. On the other hand, there are fields that intersect with medical anthropology in terms of research methodology and theoretical production, such as cultural psychiatry and transcultural psychiatry or ethnopsychiatry.Nutritional Nutritional anthropology is a synthetic concept that deals with the interplay between economic systems, nutritional status and food security, and how changes in the former affect the latter. If economic and environmental changes in a community affect access to food, food security, and dietary health, then this interplay between culture and biology is in turn connected to broader historical and economic trends associated with globalization. Nutritional status affects overall health status, work performance potential, and the overall potential for economic development (either in terms of human development or traditional western models) for any given group of people.Psychological Psychological anthropology is an interdisciplinary subfield of anthropology that studies the interaction of cultural and mental processes. This subfield tends to focus on ways in which humans' development and enculturation within a particular cultural group –  with its own history, language, practices, and conceptual categories –  shape processes of human cognition, emotion, perception, motivation, and mental health. 
It also examines how the understanding of cognition, emotion, motivation, and similar psychological processes inform or constrain our models of cultural and social processes.Cognitive Cognitive anthropology seeks to explain patterns of shared knowledge, cultural innovation, and transmission over time and space using the methods and theories of the cognitive sciences (especially experimental psychology and evolutionary biology) often through close collaboration with historians, ethnographers, archaeologists, linguists, musicologists and other specialists engaged in the description and interpretation of cultural forms. Cognitive anthropology is concerned with what people from different groups know and how that implicit knowledge changes the way people perceive and relate to the world around them.Transpersonal Transpersonal anthropology studies the relationship between altered states of consciousness and culture. As with transpersonal psychology, the field is much concerned with altered states of consciousness (ASC) and transpersonal experience. However, the field differs from mainstream transpersonal psychology in taking more cognizance of cross-cultural issues –  for instance, the roles of myth, ritual, diet, and texts in evoking and interpreting extraordinary experiences.Political and legalPolitical Political anthropology concerns the structure of political systems, looked at from the basis of the structure of societies. Political anthropology developed as a discipline concerned primarily with politics in stateless societies, a new development started from the 1960s, and is still unfolding: anthropologists started increasingly to study more "complex" social settings in which the presence of states, bureaucracies and markets entered both ethnographic accounts and analysis of local phenomena. The turn towards complex societies meant that political themes were taken up at two main levels. 
Firstly, anthropologists continued to study political organization and political phenomena that lay outside the state-regulated sphere (as in patron-client relations or tribal political organization). Secondly, anthropologists slowly started to develop a disciplinary concern with states and their institutions (and on the relationship between formal and informal political institutions). An anthropology of the state developed, and it is a most thriving field today. Geertz' comparative work on "Negara", the Balinese state, is an early, famous example.LegalLegal anthropology or anthropology of law specializes in "the cross-cultural study of social ordering". Earlier legal anthropological research often focused more narrowly on conflict management, crime, sanctions, or formal regulation. More recent applications include issues such as human rights, legal pluralism, and political uprisings.PublicPublic anthropology was created by Robert Borofsky, a professor at Hawaii Pacific University, to "demonstrate the ability of anthropology and anthropologists to effectively address problems beyond the discipline – illuminating larger social issues of our times as well as encouraging broad, public conversations about them with the explicit goal of fostering social change".Nature, science, and technologyCyborgCyborg anthropology originated as a sub-focus group within the American Anthropological Association's annual meeting in 1993. The sub-group was very closely related to STS and the Society for the Social Studies of Science. Donna Haraway's 1985 Cyborg Manifesto could be considered the founding document of cyborg anthropology by first exploring the philosophical and sociological ramifications of the term. 
Cyborg anthropology studies humankind and its relations with the technological systems it has built, specifically modern technological systems that have reflexively shaped notions of what it means to be human beings.Digital Digital anthropology is the study of the relationship between humans and digital-era technology, and extends to various areas where anthropology and technology intersect. It is sometimes grouped with sociocultural anthropology, and sometimes considered part of material culture. The field is new, and thus has a variety of names with a variety of emphases. These include techno-anthropology, digital ethnography, cyberanthropology, and virtual anthropology.Ecological Ecological anthropology is defined as the "study of cultural adaptations to environments". The sub-field is also defined as, "the study of relationships between a population of humans and their biophysical environment". The focus of its research concerns "how cultural beliefs and practices helped human populations adapt to their environments, and how their environments change across space and time". The contemporary perspective of environmental anthropology, and arguably at least the backdrop, if not the focus of most of the ethnographies and cultural fieldworks of today, is political ecology. Many characterize this new perspective as more informed with culture, politics and power, globalization, localized issues, century anthropology and more. The focus and data interpretation are often used for arguments for/against or creation of policy, and to prevent corporate exploitation and damage of land. Often, the observer has become an active part of the struggle either directly (organizing, participation) or indirectly (articles, documentaries, books, ethnographies). Such is the case with environmental justice advocate Melissa Checker and her relationship with the people of Hyde Park.Environment Social sciences, like anthropology, can provide interdisciplinary approaches to the environment. 
Professor Kay Milton, Director of the Anthropology research network in the School of History and Anthropology, describes anthropology as distinctive, with its most distinguishing feature being its interest in non-industrial indigenous and traditional societies. Anthropological theory is distinct because of the consistent presence of the concept of culture; not an exclusive topic but a central position in the study and a deep concern with the human condition. Milton describes three trends that are causing a fundamental shift in what characterizes anthropology: dissatisfaction with the cultural relativist perspective, reaction against cartesian dualisms which obstruct progress in theory (nature culture divide), and finally an increased attention to globalization (transcending the barriers of time/space).Environmental discourse appears to be characterized by a high degree of globalization. (The troubling problem is borrowing non indigenous practices and creating standards, concepts, philosophies and practices in western countries.) Anthropology and environmental discourse now have become a distinct position in anthropology as a discipline. Knowledge about diversities in human culture can be important in addressing environmental problems - anthropology is now a study of human ecology. Human activity is the most important agent in creating environmental change, a study commonly found in human ecology which can claim a central place in how environmental problems are examined and addressed. Other ways anthropology contributes to environmental discourse are by being theorists and analysts,  or by refinement of definitions to become more neutral/universal, etc. In exploring environmentalism - the term typically refers to a concern that the environment should be protected, particularly from the harmful effects of human activities. Environmentalism itself can be expressed in many ways. 
Anthropologists can open the doors of environmentalism by looking beyond industrial society, understanding the opposition between industrial and non industrial relationships, knowing what ecosystem people and biosphere people are and are affected by, dependent and independent variables, “primitive” ecological wisdom, diverse environments, resource management, diverse cultural traditions, and knowing that environmentalism is a part of culture.Historical Ethnohistory is the study of ethnographic cultures and indigenous customs by examining historical records. It is also the study of the history of various ethnic groups that may or may not exist today. Ethnohistory uses both historical and ethnographic data as its foundation. Its historical methods and materials go beyond the standard use of documents and manuscripts. Practitioners recognize the utility of such source material as maps, music, paintings, photography, folklore, oral tradition, site exploration, archaeological materials, museum collections, enduring customs, language, and place names.Religion The anthropology of religion involves the study of religious institutions in relation to other social institutions, and the comparison of religious beliefs and practices across cultures. Modern anthropology assumes that there is complete continuity between magical thinking and religion, and that every religion is a cultural product, created by the human community that worships it.Urban Urban anthropology is concerned with issues of urbanization, poverty, and neoliberalism. Ulf Hannerz quotes a 1960s remark that traditional anthropologists were "a notoriously agoraphobic lot, anti-urban by definition". Various social processes in the Western World as well as in the "Third World" (the latter being the habitual focus of attention of anthropologists) brought the attention of "specialists in 'other cultures'" closer to their homes. 
There are two main approaches to urban anthropology: examining the types of cities or examining the social issues within the cities. These two methods are overlapping and dependent on each other. By defining different types of cities, one would use social factors as well as economic and political factors to categorize the cities. By directly looking at the different social issues, one would also be studying how they affect the dynamic of the city.Key topics by field: archaeological and biologicalAnthrozoology Anthrozoology (also known as "human–animal studies") is the study of interaction between living things. It is an interdisciplinary field that overlaps with a number of other disciplines, including anthropology, ethology, medicine, psychology, veterinary medicine and zoology. A major focus of anthrozoologic research is the quantifying of the positive effects of human-animal relationships on either party and the study of their interactions. It includes scholars from a diverse range of fields, including anthropology, sociology, biology, and philosophy.Biocultural Biocultural anthropology is the scientific exploration of the relationships between human biology and culture. Physical anthropologists throughout the first half of the 20th century viewed this relationship from a racial perspective; that is, from the assumption that typological human biological differences lead to cultural differences. After World War II the emphasis began to shift toward an effort to explore the role culture plays in shaping human biology.Evolutionary Evolutionary anthropology is the interdisciplinary study of the evolution of human physiology and human behaviour and the relation between hominins and non-hominin primates. Evolutionary anthropology is based in natural science and social science, combining the human development with socioeconomic factors. Evolutionary anthropology is concerned with both biological and cultural evolution of humans, past and present. 
It is based on a scientific approach, and brings together fields such as archaeology, behavioral ecology, psychology, primatology, and genetics. It is a dynamic and interdisciplinary field, drawing on many lines of evidence to understand the human experience, past and present.Forensic Forensic anthropology is the application of the science of physical anthropology and human osteology in a legal setting, most often in criminal cases where the victim's remains are in the advanced stages of decomposition. A forensic anthropologist can assist in the identification of deceased individuals whose remains are decomposed, burned, mutilated or otherwise unrecognizable. The adjective "forensic" refers to the application of this subfield of science to a court of law.Palaeoanthropology Paleoanthropology combines the disciplines of paleontology and physical anthropology. It is the study of ancient humans, as found in fossil hominid evidence such as petrified bones and footprints. Genetics and morphology of specimens are crucially important to this field. Markers on specimens, such as enamel fractures and dental decay on teeth, can also give insight into the behaviour and diet of past populations.Organizations Contemporary anthropology is an established science with academic departments at most universities and colleges. The single largest organization of anthropologists is the American Anthropological Association (AAA), which was founded in 1903. Its members are anthropologists from around the globe.In 1989, a group of European and American scholars in the field of anthropology established the European Association of Social Anthropologists (EASA) which serves as a major professional organization for anthropologists working in Europe. 
The EASA seeks to advance the status of anthropology in Europe and to increase visibility of marginalized anthropological traditions and thereby contribute to the project of a global anthropology or world anthropology.Hundreds of other organizations exist in the various sub-fields of anthropology, sometimes divided up by nation or region, and many anthropologists work with collaborators in other disciplines, such as geology, physics, zoology, paleontology, anatomy, music theory, art history, sociology and so on, belonging to professional societies in those disciplines as well.List of major organizations American Anthropological Association American Ethnological Society Asociación de Antropólogos Iberoamericanos en Red, AIBR Moving Anthropology Student Network Anthropological Society of London Center for World Indigenous Studies Ethnological Society of London Max Planck Institute for Evolutionary Anthropology Network of Concerned Anthropologists N.N. Miklukho-Maklai Institute of Ethnology and Anthropology Royal Anthropological Institute of Great Britain and Ireland Society for anthropological sciences Society for Applied Anthropology USC Center for Visual AnthropologyEthicsAs the field has matured it has debated and arrived at ethical principles aimed at protecting both the subjects of anthropological research as well as the researchers themselves, and professional societies have generated codes of ethics.Anthropologists, like other researchers (especially historians and scientists engaged in field research), have over time assisted state policies and projects, especially colonialism.Some commentators have contended: That the discipline grew out of colonialism, perhaps was in league with it, and derives some of its key notions from it, consciously or not. (See, for example, Gough, Pels and Salemink, but cf. Lewis 2004). 
That ethnographic work is often ahistorical, writing about people as if they were "out of time" in an "ethnographic present" (Johannes Fabian, Time and Its Other).In his article "The Misrepresentation of Anthropology and Its Consequence," Herbert S. Lewis critiqued older anthropological works that presented other cultures as if they were strange and unusual. While the findings of those researchers should not be discarded, the field should learn from its mistakes.Cultural relativism As part of their quest for scientific objectivity, present-day anthropologists typically urge cultural relativism, which has an influence on all the sub-fields of anthropology. This is the notion that cultures should not be judged by another's values or viewpoints, but be examined dispassionately on their own terms. There should be no notions, in good anthropology, of one culture being better or worse than another culture.Ethical commitments in anthropology include noticing and documenting genocide, infanticide, racism, sexism, mutilation (including circumcision and subincision), and torture. Topics like racism, slavery, and human sacrifice attract anthropological attention and theories ranging from nutritional deficiencies, to genes, to acculturation, to colonialism, have been proposed to explain their origins and continued recurrences.To illustrate the depth of an anthropological approach, one can take just one of these topics, such as "racism" and find thousands of anthropological references, stretching across all the major and minor sub-fields.Military involvementAnthropologists' involvement with the U.S. government, in particular, has caused bitter controversy within the discipline. 
Franz Boas publicly objected to US participation in World War I, and after the war he published a brief expose and condemnation of the participation of several American archaeologists in espionage in Mexico under their cover as scientists.But by the 1940s, many of Boas' anthropologist contemporaries were active in the allied war effort against the Axis Powers (Nazi Germany, Fascist Italy, and Imperial Japan). Many served in the armed forces, while others worked in intelligence (for example, Office of Strategic Services and the Office of War Information). At the same time, David H. Price's work on American anthropology during the Cold War provides detailed accounts of the pursuit and dismissal of several anthropologists from their jobs for communist sympathies.Attempts to accuse anthropologists of complicity with the CIA and government intelligence activities during the Vietnam War years have turned up surprisingly little. Many anthropologists (students and teachers) were active in the antiwar movement. Numerous resolutions condemning the war in all its aspects were passed overwhelmingly at the annual meetings of the American Anthropological Association (AAA).Professional anthropological bodies often object to the use of anthropology for the benefit of the state. Their codes of ethics or statements may proscribe anthropologists from giving secret briefings. The Association of Social Anthropologists of the UK and Commonwealth (ASA) has called certain scholarship ethically dangerous. The "Principles of Professional Responsibility" issued by the American Anthropological Association and amended through November 1986 stated that "in relation with their own government and with host governments ... no secret research, no secret reports or debriefings of any kind should be agreed to or given." 
The current "Principles of Professional Responsibility" does not make explicit mention of ethics surrounding state interactions.Anthropologists, along with other social scientists, are working with the US military as part of the US Army's strategy in Afghanistan. The Christian Science Monitor reports that "Counterinsurgency efforts focus on better grasping and meeting local needs" in Afghanistan, under the Human Terrain System (HTS) program; in addition, HTS teams are working with the US military in Iraq. In 2009, the American Anthropological Association's Commission on the Engagement of Anthropology with the US Security and Intelligence Communities released its final report concluding, in part, that, "When ethnographic investigation is determined by military missions, not subject to external review, where data collection occurs in the context of war, integrated into the goals of counterinsurgency, and in a potentially coercive environment – all characteristic factors of the HTS concept and its application – it can no longer be considered a legitimate professional exercise of anthropology. In summary, while we stress that constructive engagement between anthropology and the military is possible, CEAUSSIC suggests that the AAA emphasize the incompatibility of HTS with disciplinary ethics and practice for job seekers and that it further recognize the problem of allowing HTS to define the meaning of "anthropology" within DoD."Post–World War II developmentsBefore WWII British 'social anthropology' and American 'cultural anthropology' were still distinct traditions. After the war, enough British and American anthropologists borrowed ideas and methodological approaches from one another that some began to speak of them collectively as 'sociocultural' anthropology.Basic trendsThere are several characteristics that tend to unite anthropological work. 
One of the central characteristics is that anthropology tends to provide a comparatively more holistic account of phenomena and tends to be highly empirical. The quest for holism leads most anthropologists to study a particular place, problem or phenomenon in detail, using a variety of methods, over a more extensive period than normal in many parts of academia.In the 1990s and 2000s, calls for clarification of what constitutes a culture, of how an observer knows where his or her own culture ends and another begins, and other crucial topics in writing anthropology were heard. These dynamic relationships, between what can be observed on the ground, as opposed to what can be observed by compiling many local observations remain fundamental in any kind of anthropology, whether cultural, biological, linguistic or archaeological.Biological anthropologists are interested in both human variation and in the possibility of human universals (behaviors, ideas or concepts shared by virtually all human cultures). They use many different methods of study, but modern population genetics, participant observation and other techniques often take anthropologists "into the field," which means traveling to a community in its own setting, to do something called "fieldwork."  On the biological or physical side, human measurements, genetic samples, nutritional data may be gathered and published as articles or monographs.Along with dividing up their project by theoretical emphasis, anthropologists typically divide the world up into relevant time periods and geographic regions. Human time on Earth is divided up into relevant cultural traditions based on material, such as the Paleolithic and the Neolithic, of particular use in archaeology. Further cultural subdivisions according to tool types, such as Olduwan or Mousterian or Levalloisian help archaeologists and other anthropologists in understanding major trends in the human past. 
Anthropologists and geographers share approaches to culture regions as well, since mapping cultures is central to both sciences. By making comparisons across cultural traditions (time-based) and cultural regions (space-based), anthropologists have developed various kinds of comparative method, a central part of their science.Commonalities between fieldsBecause anthropology developed from so many different enterprises (see History of anthropology), including but not limited to fossil-hunting, exploring, documentary film-making, paleontology, primatology, antiquity dealings and curatorship, philology, etymology, genetics, regional analysis, ethnology, history, philosophy, and religious studies, it is difficult to characterize the entire field in a brief article, although attempts to write histories of the entire field have been made.Some authors argue that anthropology originated and developed as the study of "other cultures", both in terms of time (past societies) and space (non-European/non-Western societies). For example, the classic of urban anthropology, Ulf Hannerz in the introduction to his seminal Exploring the City: Inquiries Toward an Urban Anthropology mentions that the "Third World" had habitually received most of attention; anthropologists who traditionally specialized in "other cultures" looked for them far away and started to look "across the tracks" only in late 1960s.Now there exist many works focusing on peoples and topics very close to the author's "home". 
It is also argued that other fields of study, like History and Sociology, on the contrary focus disproportionately on the West.In France, the study of Western societies has been traditionally left to sociologists, but this is increasingly changing, starting in the 1970s from scholars like Isac Chiva and journals like Terrain ("fieldwork"), and developing with the center founded by Marc Augé (Le Centre d'anthropologie des mondes contemporains, the Anthropological Research Center of Contemporary Societies).Since the 1980s it has become common for social and cultural anthropologists to set ethnographic research in the North Atlantic region, frequently examining the connections between locations rather than limiting research to a single locale. There has also been a related shift toward broadening the focus beyond the daily life of ordinary people; increasingly, research is set in settings such as scientific laboratories, social movements, governmental and nongovernmental organizations and businesses.See also Anthropological science fiction Christian anthropology, a sub-field of theology Circumscription theory Culture Dual inheritance theory Engaged theory Ethnobiology Human behavioral ecology Human ethology Human Relations Area Files Intangible cultural heritage Origins of society Philosophical anthropology, a sub-field of philosophy Prehistoric medicine Qualitative researchLists Outline of anthropology List of indigenous peoples List of anthropologistsNotesReferencesFurther readingDictionaries and encyclopediasFieldnotes and memoirsHistories            .Textbooks and key theoretical worksExternal links               (AIO)
+Agricultural science (or agriscience for short) is a broad multidisciplinary field of biology that encompasses the parts of exact, natural, economic and social sciences that are used in the practice and understanding of agriculture. Professionals in agricultural science are called agricultural scientists or agriculturists.HistoryIn the 18th century, Johann Friedrich Mayer conducted experiments on the use of gypsum (hydrated calcium sulphate) as a fertilizer.In 1843, John Lawes and Joseph Henry Gilbert began a set of long-term field experiments at Rothamsted Research Station in England, some of which are still running as of 2018.In the United States, a scientific revolution in agriculture began with the Hatch Act of 1887, which used the term "agricultural science". The Hatch Act was driven by farmers' interest in knowing the constituents of early artificial fertilizer. The Smith-Hughes Act of 1917 shifted agricultural education back to its vocational roots, but the scientific foundation had been built. After 1906, public expenditures on agricultural research in the US exceeded private expenditures for the next 44 years.Prominent agricultural scientists Robert Bakewell Norman Borlaug Luther Burbank George Washington Carver Carl Henry Clerk George C. Clerk René Dumont Sir Albert Howard Kailas Nath Kaul Thomas Lecky Justus von Liebig Jay Lush Gregor Mendel Louis Pasteur M. S. 
Swaminathan Jethro Tull Artturi Ilmari Virtanen Sewall Wright Wilbur Olin AtwaterFields or related disciplines Agricultural biotechnology Agricultural chemistry Agricultural diversification Agricultural education Agricultural economics Agricultural engineering Agricultural geography Agricultural philosophy Agricultural marketing Agricultural soil science Agroecology Agrophysics Animal science Animal breeding Animal husbandry Animal nutrition Farm management Agronomy Botany Theoretical production ecology Horticulture Plant breeding Plant fertilization Aquaculture Biological engineering Genetic engineering Nematology Microbiology Plant pathologyRange management Environmental science Entomology Food science Human nutrition Irrigation and water management Soil science Agrology Waste management Weed scienceScopeAgriculture, agricultural science, and agronomy are often confused. However, they cover different concepts:Agriculture is the set of activities that transform the environment for the production of animals and plants for human use. Agriculture concerns techniques, including the application of agronomic research.Agronomy is research and development related to studying and improving plant-based crops.Soil forming factors and soil degradationAgricultural sciences include research and development on: Improving agricultural productivity in terms of quantity and quality (e.g., selection of drought-resistant crops and animals, development of new pesticides, yield-sensing technologies, simulation models of crop growth, in-vitro cell culture techniques) Minimizing the effects of pests (weeds, insects, pathogens, mollusks, nematodes) on crop or animal production systems. 
Transformation of primary products into end-consumer products (e.g., production, preservation, and packaging of dairy products) Prevention and correction of adverse environmental effects (e.g., soil degradation, waste management, bioremediation) Theoretical production ecology, relating to crop production modeling Traditional agricultural systems, sometimes termed subsistence agriculture, which feed most of the poorest people in the world.  These systems are of interest as they sometimes retain a level of integration with natural ecological systems greater than that of industrial agriculture, which may be more sustainable than some modern agricultural systems. Food production and demand on a global basis, with special attention paid to the major producers, such as China, India, Brazil, the US and the EU. Various sciences relating to agricultural resources and the environment (e.g. soil science, agroclimatology); biology of agricultural crops and animals (e.g. crop science, animal science and their included sciences, e.g. 
ruminant nutrition, farm animal welfare); such fields as agricultural economics and rural sociology; various disciplines encompassed in agricultural engineering.See also Agricultural Research Council Agricultural sciences basic topics Agriculture ministry Agroecology American Society of Agronomy Genomics of domestication History of agricultural science Institute of Food and Agricultural Sciences International Assessment of Agricultural Science and Technology for Development International Food Policy Research Institute, IFPRI List of agriculture topics National FFA Organization Research Institute of Crop Production (RICP) (in the Czech Republic) University of Agricultural SciencesReferencesFurther readingAgricultural Research, Livelihoods, and Poverty: Studies of Economic and Social Impacts in Six Countries Edited by Michelle Adato and Ruth Meinzen-Dick (2007), Johns Hopkins University Press Food Policy ReportClaude Bourguignon, Regenerating the Soil: From Agronomy to Agrology, Other India Press, 2005Pimentel David, Pimentel Marcia, Computer les kilocalories, Cérès, n. 59, sept-oct. 1977Russell E. Walter, Soil conditions and plant growth, Longman group, London, New York 1973 Saltini Antonio, Storia delle scienze agrarie, 4 vols, Bologna 1984–89, , , , Vavilov Nicolai I. (Starr Chester K. editor), The Origin, Variation, Immunity and Breeding of Cultivated Plants. Selected Writings, in Chronica botanica, 13: 1–6, Waltham, Mass., 1949–50Vavilov Nicolai I., World Resources of Cereals, Leguminous Seed Crops and Flax, Academy of Sciences of Urss, National Science Foundation, Washington, Israel Program for Scientific Translations, Jerusalem 1960Winogradsky Serge, Microbiologie du sol. Problèmes et methodes. 
Cinquante ans de recherches, Masson & c.ie, Paris 1949External linksConsultative Group on International Agricultural Research (CGIAR)Agricultural Research ServiceIndian Council of Agricultural ResearchInternational Institute of Tropical AgricultureInternational Livestock Research InstituteThe National Agricultural Library (NAL) - The most comprehensive agricultural library in the world.Crop Science Society of AmericaAmerican Society of AgronomySoil Science Society of AmericaAgricultural Science Researchers, Jobs and DiscussionsInformation System for Agriculture and Food ResearchSouth Dakota Agricultural LaboratoriesNMSU Department of Entomology Plant Pathology and Weed ScienceUP AgricultureBihar Agriculture
+Alchemy (from Arabic: al-kīmiyā; from Ancient Greek: khumeía) is an ancient branch of natural philosophy, a philosophical and protoscientific tradition that was historically practiced in China, India, the Muslim world, and Europe. In its Western form, alchemy is first attested in a number of pseudepigraphical texts written in Greco-Roman Egypt during the first few centuries CE.Alchemists attempted to purify, mature, and perfect certain materials. Common aims were chrysopoeia, the transmutation of "base metals" (e.g., lead) into "noble metals" (particularly gold); the creation of an elixir of immortality; and the creation of panaceas able to cure any disease. The perfection of the human body and soul was thought to result from the alchemical magnum opus ("Great Work"). The concept of creating the philosophers' stone was variously connected with all of these projects.Islamic and European alchemists developed a basic set of laboratory techniques, theories, and terms, some of which are still in use today. They did not abandon the Ancient Greek philosophical idea that everything is composed of four elements, and they tended to guard their work in secrecy, often making use of cyphers and cryptic symbolism. In Europe, the 12th-century translations of medieval Islamic works on science and the rediscovery of Aristotelian philosophy gave birth to a flourishing tradition of Latin alchemy. This late medieval tradition of alchemy would go on to play a significant role in the development of early modern science (particularly chemistry and medicine).Modern discussions of alchemy are generally split into an examination of its exoteric practical applications and its esoteric spiritual aspects, despite criticisms by scholars such as Eric J. Holmyard and Marie-Louise von Franz that they should be understood as complementary. 
The former is pursued by historians of the physical sciences, who examine the subject in terms of early chemistry, medicine, and charlatanism, and the philosophical and religious contexts in which these events occurred. The latter interests historians of esotericism, psychologists, and some philosophers and spiritualists. The subject has also made an ongoing impact on literature and the arts.Etymology The word alchemy comes from Old French alquemie, alkimie, used in Medieval Latin as alchymia. This name was itself brought from the Arabic word al-kīmiyā (الكيمياء) composed of two parts: the Late Greek term khēmeía (χημεία), also spelled khumeia (χυμεία) and khēmía (χημία) - see below, and the Arabic definite article al- (ال), meaning 'the'. Together this association can be interpreted as 'the process of transmutation by which to fuse or reunite with the divine or original form'. Several etymologies have been proposed for the Greek term. The first was proposed by Zosimos of Panopolis (3rd–4th centuries), who derived it from the name of a book, the Khemeu. Hermann Diels argued in 1914 that it rather derived from χύμα, used to describe metallic objects formed by casting.Others trace its roots to the Egyptian name kēme (hieroglyphic 𓆎𓅓𓏏𓊖 khmi ), meaning 'black earth', which refers to the fertile and auriferous soil of the Nile valley, as opposed to red desert sand. According to the Egyptologist Wallis Budge, the Arabic word al-kīmiyaʾ actually means "the Egyptian [science]", borrowing from the Coptic word for "Egypt", kēme (or its equivalent in the Mediaeval Bohairic dialect of Coptic, khēme). This Coptic word derives from Demotic kmỉ, itself from ancient Egyptian kmt. 
The ancient Egyptian word referred to both the country and the colour "black" (Egypt was the "Black Land", by contrast with the "Red Land", the surrounding desert); so this etymology could also explain the nickname "Egyptian black arts".History Alchemy encompasses several philosophical traditions spanning some four millennia and three continents. These traditions' general penchant for cryptic and symbolic language makes it hard to trace their mutual influences and "genetic" relationships. One can distinguish at least three major strands, which appear to be mostly independent, at least in their earlier stages: Chinese alchemy, centered in China; Indian alchemy, centered on the Indian subcontinent; and Western alchemy, which occurred around the Mediterranean and whose center has shifted over the millennia from Greco-Roman Egypt to the Islamic world, and finally medieval Europe. Chinese alchemy was closely connected to Taoism and Indian alchemy with the Dharmic faiths. In contrast, Western alchemy developed its philosophical system mostly independent of, but influenced by, various Western religions. It is still an open question whether these three strands share a common origin, or to what extent they influenced each other.Hellenistic Egypt The start of Western alchemy may generally be traced to ancient and Hellenistic Egypt, where the city of Alexandria was a center of alchemical knowledge, and retained its pre-eminence through most of the Greek and Roman periods. Following the work of André-Jean Festugière, modern scholars see alchemical practice in the Roman Empire as originating from the Egyptian goldsmith's art, Greek philosophy and different religious traditions. Tracing the origins of the alchemical art in Egypt is complicated by the pseudepigraphic nature of texts from the Greek alchemical corpus. The treatises of Zosimos of Panopolis, the earliest historically attested author (fl. c. 300 CE), can help in situating the other authors. 
Zosimus based his work on that of older alchemical authors, such as Mary the Jewess, Pseudo-Democritus, and Agathodaimon, but very little is known about any of these authors. The most complete of their works, The Four Books of Pseudo-Democritus, were probably written in the first century AD.Recent scholarship tends to emphasize the testimony of Zosimus, who traced the alchemical arts back to Egyptian metallurgical and ceremonial practices. It has also been argued that early alchemical writers borrowed the vocabulary of Greek philosophical schools but did not implement any of its doctrines in a systematic way. Zosimos of Panopolis wrote in the Final Abstinence (also known as the "Final Count"). Zosimos explains that the ancient practice of "tinctures" (the technical Greek name for the alchemical arts) had been taken over by certain "demons" who taught the art only to those who offered them sacrifices. Since Zosimos also called the demons "guardians of places" (οἱ κατὰ τόπον ἔφοροι) and those who offered them sacrifices "priests" (ἱερέα), it is fairly clear that he was referring to the gods of Egypt and their priests. While critical of the kind of alchemy he associated with the Egyptian priests and their followers, Zosimos nonetheless saw the tradition's recent past as rooted in the rites of the Egyptian temples.Mythology – Zosimos of Panopolis asserted that alchemy dated back to Pharaonic Egypt where it was the domain of the priestly class, though there is little to no evidence for his assertion. Alchemical writers used Classical figures from Greek, Roman, and Egyptian mythology to illuminate their works and allegorize alchemical transmutation. These included the pantheon of gods related to the Classical planets, Isis, Osiris, Jason, and many others.The central figure in the mythology of alchemy is Hermes Trismegistus (or Thrice-Great Hermes). His name is derived from the god Thoth and his Greek counterpart Hermes. 
Hermes and his caduceus or serpent-staff, were among alchemy's principal symbols. According to Clement of Alexandria, he wrote what were called the "forty-two books of Hermes", covering all fields of knowledge. The Hermetica of Thrice-Great Hermes is generally understood to form the basis for Western alchemical philosophy and practice, called the hermetic philosophy by its early practitioners. These writings were collected in the first centuries of the common era.Technology – The dawn of Western alchemy is sometimes associated with that of metallurgy, extending back to 3500 BC. Many writings were lost when the Roman emperor Diocletian ordered the burning of alchemical books after suppressing a revolt in Alexandria (AD 292). Few original Egyptian documents on alchemy have survived, most notable among them the Stockholm papyrus and the Leyden papyrus X. Dating from AD 250–300, they contained recipes for dyeing and making artificial gemstones, cleaning and fabricating pearls, and manufacturing of imitation gold and silver. These writings lack the mystical, philosophical elements of alchemy, but do contain the works of Bolus of Mendes (or Pseudo-Democritus), which aligned these recipes with theoretical knowledge of astrology and the classical elements. Between the time of Bolus and Zosimos, the change took place that transformed this metallurgy into a Hermetic art.Philosophy – Alexandria acted as a melting pot for philosophies of Pythagoreanism, Platonism, Stoicism and Gnosticism which formed the origin of alchemy's character. An important example of alchemy's roots in Greek philosophy, originated by Empedocles and developed by Aristotle, was that all things in the universe were formed from only four elements: earth, air, water, and fire. According to Aristotle, each element had a sphere to which it belonged and to which it would return if left undisturbed. 
The four elements of the Greeks were mostly qualitative aspects of matter, not quantitative, as our modern elements are; "...True alchemy never regarded earth, air, water, and fire as corporeal or chemical substances in the present-day sense of the word. The four elements are simply the primary, and most general, qualities by means of which the amorphous and purely quantitative substance of all bodies first reveals itself in differentiated form." Later alchemists extensively developed the mystical aspects of this concept.Alchemy coexisted alongside emerging Christianity. Lactantius believed Hermes Trismegistus had prophesied its birth. St Augustine later affirmed this in the 4th and 5th centuries, but also condemned Trismegistus for idolatry. Examples of Pagan, Christian, and Jewish alchemists can be found during this period.Most of the Greco-Roman alchemists preceding Zosimos are known only by pseudonyms, such as Moses, Isis, Cleopatra, Democritus, and Ostanes. Other authors, such as Komarios and Chymes, are known only through fragments of text. After AD 400, Greek alchemical writers occupied themselves solely in commenting on the works of these predecessors. By the middle of the 7th century alchemy was almost an entirely mystical discipline. It was at that time that Khalid Ibn Yazid sparked its migration from Alexandria to the Islamic world, facilitating the translation and preservation of Greek alchemical texts in the 8th and 9th centuries.Byzantium Greek alchemy is preserved in medieval Greek (Byzantine) manuscripts, and yet historians have only relatively recently begun to pay attention to the study and development of Greek alchemy in the Byzantine period.India The Vedas, texts of the 2nd millennium BC, describe a connection between eternal life and gold. 
A considerable knowledge of metallurgy has been exhibited in a third-century CE text called Arthashastra which provides ingredients of explosives (Agniyoga) and salts extracted from fertile soils and plant remains (Yavakshara) such as saltpetre/nitre, perfume making (different qualities of perfumes are mentioned), granulated (refined) sugar. Buddhist texts from the 2nd to 5th centuries mention the transmutation of base metals to gold. According to some scholars Greek alchemy may have influenced Indian alchemy but there is no hard evidence to back this claim.The 11th-century Persian chemist and physician Abū Rayhān Bīrūnī, who visited Gujarat as part of the court of Mahmud of Ghazni, reported that they had a science of their own similar to alchemy.The goals of alchemy in India included the creation of a divine body (Sanskrit divya-deham) and immortality while still embodied (Sanskrit jīvan-mukti).  Sanskrit alchemical texts include much material on the manipulation of mercury and sulphur, which are homologized with the semen of the god Śiva and the menstrual blood of the goddess Devī.Some early alchemical writings seem to have their origins in the Kaula tantric schools associated with the teachings of Matsyendranath.  Other early writings are found in the Jaina medical treatise Kalyāṇakārakam of Ugrāditya, written in South India in the early 9th century.Two famous early Indian alchemical authors were Nāgārjuna Siddha and Nityanātha Siddha. Nāgārjuna Siddha was a Buddhist monk. His book, Rasendramangalam, is an example of Indian alchemy and medicine. Nityanātha Siddha wrote Rasaratnākara, also a highly influential work. In Sanskrit, rasa translates to "mercury", and Nāgārjuna Siddha was said to have developed a method of converting mercury into gold.Significant scholarship on Indian alchemy appears in The Alchemical Body by David Gordon White. 
A modern bibliography on Indian alchemical studies has been written by White.The contents of 39 Sanskrit alchemical treatises have been analysed in detail in G. Jan Meulenbeld's History of Indian Medical Literature. The discussion of these works in HIML gives a summary of the contents of each work, their special features, and where possible the evidence concerning their dating. Chapter 13 of HIML, Various works on rasaśāstra and ratnaśāstra (or Various works on alchemy and gems) gives brief details of a further 655 (six hundred and fifty-five) treatises.  In some cases Meulenbeld gives notes on the contents and authorship of these works; in other cases references are made only to the unpublished manuscripts of these titles.A great deal remains to be discovered about Indian alchemical literature.  The content of the Sanskrit alchemical corpus has not yet (2014) been adequately integrated into the wider general history of alchemy.Islamic world After the Fall of the Roman Empire, the focus of alchemical development moved to the Islamic World. Much more is known about Islamic alchemy because it was better documented: indeed, most of the earlier writings that have come down through the years were preserved as Arabic translations. The word alchemy itself was derived from the Arabic word al-kīmiyā (الكيمياء). The early Islamic world was a melting pot for alchemy. Platonic and Aristotelian thought, which had already been somewhat appropriated into hermetical science, continued to be assimilated during the late 7th and early 8th centuries through Syriac translations and scholarship.In the late ninth and early tenth centuries, the Arabic works attributed to Jābir ibn Hayyān (Latinized as "Geber" or "Geberus") introduced a new approach to alchemy. Paul Kraus, who wrote the standard reference work on Jabir, put it as follows:Islamic philosophers also made great contributions to alchemical hermeticism. The most influential author in this regard was arguably Jabir. 
Jabir's ultimate goal was Takwin, the artificial creation of life in the alchemical laboratory, up to, and including, human life. He analyzed each Aristotelian element in terms of four basic qualities of hotness, coldness, dryness, and moistness. According to Jabir, in each metal two of these qualities were interior and two were exterior. For example, lead was externally cold and dry, while gold was hot and moist. Thus, Jabir theorized, by rearranging the qualities of one metal, a different metal would result. By this reasoning, the search for the philosopher's stone was introduced to Western alchemy. Jabir developed an elaborate numerology whereby the root letters of a substance's name in Arabic, when treated with various transformations, held correspondences to the element's physical properties.The elemental system used in medieval alchemy also originated with Jabir. His original system consisted of seven elements, which included the five classical elements (aether, air, earth, fire, and water) in addition to two chemical elements representing the metals: sulphur, "the stone which burns", which characterized the principle of combustibility, and mercury, which contained the idealized principle of metallic properties. Shortly thereafter, this evolved into eight elements, with the Arabic concept of the three metallic principles: sulphur giving flammability or combustion, mercury giving volatility and stability, and salt giving solidity. The atomic theory of corpuscularianism, where all physical bodies possess an inner and outer layer of minute particles or corpuscles, also has its origins in the work of Jabir.From the 9th to 14th centuries, alchemical theories faced criticism from a variety of practical Muslim chemists, including Alkindus, Abū al-Rayhān al-Bīrūnī, Avicenna and Ibn Khaldun. 
In particular, they wrote refutations against the idea of the transmutation of metals.East Asia Whereas European alchemy eventually centered on the transmutation of base metals into noble metals, Chinese alchemy had a more obvious connection to medicine. The philosopher's stone of European alchemists can be compared to the Grand Elixir of Immortality sought by Chinese alchemists. In the hermetic view, these two goals were not unconnected, and the philosopher's stone was often equated with the universal panacea; therefore, the two traditions may have had more in common than initially appears.Black powder may have been an important invention of Chinese alchemists. As previously stated above, Chinese alchemy was more related to medicine. It is said that the Chinese invented gunpowder while trying to find a potion for eternal life. Described in 9th-century texts and used in fireworks in China by the 10th century, it was used in cannons by 1290. From China, the use of gunpowder spread to Japan, the Mongols, the Muslim world, and Europe. Gunpowder was used by the Mongols against the Hungarians in 1241, and in Europe by the 14th century.Chinese alchemy was closely connected to Taoist forms of traditional Chinese medicine, such as Acupuncture and Moxibustion. In the early Song dynasty, followers of this Taoist idea (chiefly the elite and upper class) would ingest mercuric sulfide, which, though tolerable in low levels, led many to suicide. Thinking that this consequential death would lead to freedom and access to the Taoist heavens, the ensuing deaths encouraged people to eschew this method of alchemy in favor of external sources (the aforementioned Tai Chi Chuan, mastering of the qi, etc.)  Chinese alchemy was introduced to the West by Obed Simon Johnson.Medieval Europe The introduction of alchemy to Latin Europe may be dated to 11 February 1144, with the completion of Robert of Chester's translation of the Arabic Book of the Composition of Alchemy. 
Although European craftsmen and technicians pre-existed, Robert notes in his preface that alchemy (though here still referring to the elixir rather than to the art itself) was unknown in Latin Europe at the time of his writing. The translation of Arabic texts concerning numerous disciplines including alchemy flourished in 12th-century Toledo, Spain, through contributors like Gerard of Cremona and Adelard of Bath. Translations of the time included the Turba Philosophorum, and the works of Avicenna and Muhammad ibn Zakariya al-Razi. These brought with them many new words to the European vocabulary for which there was no previous Latin equivalent. Alcohol, carboy, elixir, and athanor are examples.Meanwhile, theologian contemporaries of the translators made strides towards the reconciliation of faith and experimental rationalism, thereby priming Europe for the influx of alchemical thought. The 11th-century St Anselm put forth the opinion that faith and rationalism were compatible and encouraged rationalism in a Christian context. In the early 12th century, Peter Abelard followed Anselm's work, laying down the foundation for acceptance of Aristotelian thought before the first works of Aristotle had reached the West. In the early 13th century, Robert Grosseteste used Abelard's methods of analysis and added the use of observation, experimentation, and conclusions when conducting scientific investigations. Grosseteste also did much work to reconcile Platonic and Aristotelian thinking.Through much of the 12th and 13th centuries, alchemical knowledge in Europe remained centered on translations, and new Latin contributions were not made. The efforts of the translators were succeeded by that of the encyclopaedists. In the 13th century, Albertus Magnus and Roger Bacon were the most notable of these, their work summarizing and explaining the newly imported alchemical knowledge in Aristotelian terms. 
Albertus Magnus, a Dominican friar, is known to have written works such as the Book of Minerals where he observed and commented on the operations and theories of alchemical authorities like Hermes and Democritus and unnamed alchemists of his time. Albertus critically compared these to the writings of Aristotle and Avicenna, where they concerned the transmutation of metals. From the time shortly after his death through to the 15th century, more than 28 alchemical tracts were misattributed to him, a common practice giving rise to his reputation as an accomplished alchemist. Likewise, alchemical texts have been attributed to Albert's student Thomas Aquinas. Roger Bacon, a Franciscan friar who wrote on a wide variety of topics including optics, comparative linguistics, and medicine, composed his Great Work (Opus Majus) for Pope Clement IV as part of a project towards rebuilding the medieval university curriculum to include the new learning of his time. While alchemy was not more important to him than other sciences and he did not produce allegorical works on the topic, he did consider it and astrology to be important parts of both natural philosophy and theology and his contributions advanced alchemy's connections to soteriology and Christian theology. Bacon's writings integrated morality, salvation, alchemy, and the prolongation of life. His correspondence with Clement highlighted this, noting the importance of alchemy to the papacy. Like the Greeks before him, Bacon acknowledged the division of alchemy into practical and theoretical spheres. He noted that the theoretical lay outside the scope of Aristotle, the natural philosophers, and all Latin writers of his time. The practical confirmed the theoretical, and Bacon advocated its uses in natural science and medicine. In later European legend, he became an archmage. 
In particular, along with Albertus Magnus, he was credited with the forging of a brazen head capable of answering its owner's questions.Soon after Bacon, the influential work of Pseudo-Geber (sometimes identified as Paul of Taranto) appeared. His Summa Perfectionis remained a staple summary of alchemical practice and theory through the medieval and renaissance periods. It was notable for its inclusion of practical chemical operations alongside sulphur-mercury theory, and the unusual clarity with which they were described. By the end of the 13th century, alchemy had developed into a fairly structured system of belief. Adepts believed in the macrocosm-microcosm theories of Hermes, that is to say, they believed that processes that affect minerals and other substances could have an effect on the human body (for example, if one could learn the secret of purifying gold, one could use the technique to purify the human soul). They believed in the four elements and the four qualities as described above, and they had a strong tradition of cloaking their written ideas in a labyrinth of coded jargon set with traps to mislead the uninitiated. Finally, the alchemists practiced their art: they actively experimented with chemicals and made observations and theories about how the universe operated. Their entire philosophy revolved around their belief that man's soul was divided within himself after the fall of Adam. By purifying the two parts of man's soul, man could be reunited with God.In the 14th century, alchemy became more accessible to Europeans outside the confines of Latin speaking churchmen and scholars. Alchemical discourse shifted from scholarly philosophical debate to an exposed social commentary on the alchemists themselves. Dante, Piers Plowman, and Chaucer all painted unflattering pictures of alchemists as thieves and liars. Pope John XXII's 1317 edict, Spondent quas non-exhibent forbade the false promises of transmutation made by pseudo-alchemists. 
In 1403, Henry IV of England banned the practice of multiplying metals (although it was possible to buy a licence to attempt to make gold alchemically, and a number were granted by Henry VI and Edward IV). These critiques and regulations centered more around pseudo-alchemical charlatanism than the actual study of alchemy, which continued with an increasingly Christian tone. The 14th century saw the Christian imagery of death and resurrection employed in the alchemical texts of Petrus Bonus, John of Rupescissa, and in works written in the name of Raymond Lull and Arnold of Villanova. Nicolas Flamel is a well-known alchemist, but a good example of pseudepigraphy, the practice of giving your works the name of someone else, usually more famous. Although the historical Flamel existed, the writings and legends assigned to him only appeared in 1612. Flamel was not a religious scholar as were many of his predecessors, and his entire interest in the subject revolved around the pursuit of the philosopher's stone. His work spends a great deal of time describing the processes and reactions, but never actually gives the formula for carrying out the transmutations. Most of 'his' work was aimed at gathering alchemical knowledge that had existed before him, especially as regarded the philosopher's stone. Through the 14th and 15th centuries, alchemists were much like Flamel: they concentrated on looking for the philosophers' stone. Bernard Trevisan and George Ripley made similar contributions. Their cryptic allusions and symbolism led to wide variations in interpretation of the art. Renaissance and early modern Europe During the Renaissance, Hermetic and Platonic foundations were restored to European alchemy. The dawn of medical, pharmaceutical, occult, and entrepreneurial branches of alchemy followed. In the late 15th century, Marsilio Ficino translated the Corpus Hermeticum and the works of Plato into Latin. 
These were previously unavailable to Europeans who for the first time had a full picture of the alchemical theory that Bacon had declared absent. Renaissance Humanism and Renaissance Neoplatonism guided alchemists away from physics to refocus on mankind as the alchemical vessel.Esoteric systems developed that blended alchemy into a broader occult Hermeticism, fusing it with magic, astrology, and Christian cabala. A key figure in this development was German Heinrich Cornelius Agrippa (1486–1535), who received his Hermetic education in Italy in the schools of the humanists. In his De Occulta Philosophia, he attempted to merge Kabbalah, Hermeticism, and alchemy. He was instrumental in spreading this new blend of Hermeticism outside the borders of Italy.Philippus Aureolus Paracelsus, (Theophrastus Bombastus von Hohenheim, 1493–1541) cast alchemy into a new form, rejecting some of Agrippa's occultism and moving away from chrysopoeia. Paracelsus pioneered the use of chemicals and minerals in medicine and wrote, "Many have said of Alchemy, that it is for the making of gold and silver. For me such is not the aim, but to consider only what virtue and power may lie in medicines."His hermetical views were that sickness and health in the body relied on the harmony of man the microcosm and Nature the macrocosm. He took an approach different from those before him, using this analogy not in the manner of soul-purification but in the manner that humans must have certain balances of minerals in their bodies, and that certain illnesses of the body had chemical remedies that could cure them. Iatrochemistry refers to the pharmaceutical applications of alchemy championed by Paracelsus.John Dee (13 July 1527 – December, 1608) followed Agrippa's occult tradition. 
Although better known for angel summoning, divination, and his role as astrologer, cryptographer, and consultant to Queen Elizabeth I, Dee's alchemical Monas Hieroglyphica, written in 1564 was his most popular and influential work. His writing portrayed alchemy as a sort of terrestrial astronomy in line with the Hermetic axiom As above so below. During the 17th century, a short-lived "supernatural" interpretation of alchemy became popular, including support by fellows of the Royal Society: Robert Boyle and Elias Ashmole. Proponents of the supernatural interpretation of alchemy believed that the philosopher's stone might be used to summon and communicate with angels.Entrepreneurial opportunities were common for the alchemists of Renaissance Europe. Alchemists were contracted by the elite for practical purposes related to mining, medical services, and the production of chemicals, medicines, metals, and gemstones. Rudolf II, Holy Roman Emperor, in the late 16th century, famously received and sponsored various alchemists at his court in Prague, including Dee and his associate Edward Kelley. King James IV of Scotland, Julius, Duke of Brunswick-Lüneburg, Henry V, Duke of Brunswick-Lüneburg, Augustus, Elector of Saxony, Julius Echter von Mespelbrunn, and Maurice, Landgrave of Hesse-Kassel all contracted alchemists. John's son Arthur Dee worked as a court physician to Michael I of Russia and Charles I of England but also compiled the alchemical book Fasciculus Chemicus.Although most of these appointments were legitimate, the trend of pseudo-alchemical fraud continued through the Renaissance. Betrüger would use sleight of hand, or claims of secret knowledge to make money or secure patronage. Legitimate mystical and medical alchemists such as Michael Maier and Heinrich Khunrath wrote about fraudulent transmutations, distinguishing themselves from the con artists. 
False alchemists were sometimes prosecuted for fraud. The terms "chemia" and "alchemia" were used as synonyms in the early modern period, and the differences between alchemy, chemistry and small-scale assaying and metallurgy were not as neat as in the present day. There were important overlaps between practitioners, and trying to classify them into alchemists, chemists and craftsmen is anachronistic. For example, Tycho Brahe (1546–1601), an alchemist better known for his astronomical and astrological investigations, had a laboratory built at his Uraniborg observatory/research institute. Michael Sendivogius (Michał Sędziwój, 1566–1636), a Polish alchemist, philosopher, medical doctor and pioneer of chemistry wrote mystical works but is also credited with distilling oxygen in a lab sometime around 1600. Sendivogius taught his technique to Cornelius Drebbel who, in 1621, applied this in a submarine. Isaac Newton devoted considerably more of his writing to the study of alchemy (see Isaac Newton's occult studies) than he did to either optics or physics. Other early modern alchemists who were eminent in their other studies include Robert Boyle, and Jan Baptist van Helmont. Their Hermeticism complemented rather than precluded their practical achievements in medicine and science. Later modern period The decline of European alchemy was brought about by the rise of modern science with its emphasis on rigorous quantitative experimentation and its disdain for "ancient wisdom". Although the seeds of these events were planted as early as the 17th century, alchemy still flourished for some two hundred years, and in fact may have reached its peak in the 18th century. As late as 1781 James Price claimed to have produced a powder that could transmute mercury into silver or gold. 
Early modern European alchemy continued to exhibit a diversity of theories, practices, and purposes: "Scholastic and anti-Aristotelian, Paracelsian and anti-Paracelsian, Hermetic, Neoplatonic, mechanistic, vitalistic, and more—plus virtually every combination and compromise thereof."Robert Boyle (1627–1691) pioneered the scientific method in chemical investigations. He assumed nothing in his experiments and compiled every piece of relevant data. Boyle would note the place in which the experiment was carried out, the wind characteristics, the position of the Sun and Moon, and the barometer reading, all just in case they proved to be relevant. This approach eventually led to the founding of modern chemistry in the 18th and 19th centuries, based on revolutionary discoveries of Lavoisier and John Dalton.Beginning around 1720, a rigid distinction began to be drawn for the first time between "alchemy" and "chemistry". By the 1740s, "alchemy" was now restricted to the realm of gold making, leading to the popular belief that alchemists were charlatans, and the tradition itself nothing more than a fraud. In order to protect the developing science of modern chemistry from the negative censure to which alchemy was being subjected, academic writers during the 18th-century scientific Enlightenment attempted, for the sake of survival, to divorce and separate the "new" chemistry from the "old" practices of alchemy. This move was mostly successful, and the consequences of this continued into the 19th, 20th and 21st centuries.During the occult revival of the early 19th century, alchemy received new attention as an occult science. The esoteric or occultist school, which arose during the 19th century, held (and continues to hold) the view that the substances and operations mentioned in alchemical literature are to be interpreted in a spiritual sense, and it downplays the role of the alchemy as a practical tradition or protoscience. 
This interpretation further forwarded the view that alchemy is an art primarily concerned with spiritual enlightenment or illumination, as opposed to the physical manipulation of apparatus and chemicals, and claims that the obscure language of the alchemical texts was an allegorical guise for spiritual, moral or mystical processes. In the 19th-century revival of alchemy, the two most seminal figures were Mary Anne Atwood and Ethan Allen Hitchcock, who independently published similar works regarding spiritual alchemy. Both forwarded a completely esoteric view of alchemy, as Atwood claimed: "No modern art or chemistry, notwithstanding all its surreptitious claims, has any thing in common with Alchemy." Atwood's work influenced subsequent authors of the occult revival including Eliphas Levi, Arthur Edward Waite, and Rudolf Steiner. Hitchcock, in his Remarks Upon Alchymists (1855) attempted to make a case for his spiritual interpretation with his claim that the alchemists wrote about a spiritual discipline under a materialistic guise in order to avoid accusations of blasphemy from the church and state. In 1845, Baron Carl Reichenbach published his studies on Odic force, a concept with some similarities to alchemy, but his research did not enter the mainstream of scientific discussion. In 1946, Louis Cattiaux published the Message Retrouvé, a work that was at once philosophical, mystical and highly influenced by alchemy. In his lineage, many researchers, including Emmanuel and Charles d'Hooghvorst, are updating alchemical studies in France and Belgium. Women Several women appear in the earliest history of alchemy. Michael Maier names Mary the Jewess, Cleopatra the Alchemist, Medera, and Taphnutia as the four women who knew how to make the philosopher's stone. Zosimos' sister Theosebia (later known as Euthica the Arab) and Isis the Prophetess also played a role in early alchemical texts. The first alchemist whose name we know was Mary the Jewess (c. 200 A.D.). 
Early sources claim that Mary (or Maria) devised a number of improvements to alchemical equipment and tools as well as novel techniques in chemistry. Her best known advances were in heating and distillation processes. The laboratory water-bath, known eponymously (especially in France) as the bain-marie, is said to have been invented or at least improved by her. Essentially a double-boiler, it was (and is) used in chemistry for processes that require gentle heating. The tribikos (a modified distillation apparatus) and the kerotakis (a more intricate apparatus used especially for sublimations) are two other advancements in the process of distillation that are credited to her. Although we have no writing from Mary herself, she is known from the early-fourth-century writings of Zosimos of Panopolis.Due to the proliferation of pseudepigrapha and anonymous works, it is difficult to know which of the alchemists were actually women. After the Greco-Roman period, women's names appear less frequently in the alchemical literature. Women vacate the history of alchemy during the medieval and renaissance periods, aside from the fictitious account of Perenelle Flamel. Mary Anne Atwood's A Suggestive Inquiry into the Hermetic Mystery (1850) marks their return during the nineteenth-century occult revival.Modern historical research The history of alchemy has become a significant and recognized subject of academic study. As the language of the alchemists is analyzed, historians are becoming more aware of the intellectual connections between that discipline and other facets of Western cultural history, such as the evolution of science and philosophy, the sociology and psychology of the intellectual communities, kabbalism, spiritualism, Rosicrucianism, and other mystic movements. 
Institutions involved in this research include The Chymistry of Isaac Newton project at Indiana University, the University of Exeter Centre for the Study of Esotericism (EXESESO), the European Society for the Study of Western Esotericism (ESSWE), and the University of Amsterdam's Sub-department for the History of Hermetic Philosophy and Related Currents. A large collection of books on alchemy is kept in the Bibliotheca Philosophica Hermetica in Amsterdam. A recipe found in a mid-19th-century kabbalah based book features step by step instructions on turning copper into gold. The author attributed this recipe to an ancient manuscript he located. Journals which publish regularly on the topic of Alchemy include 'Ambix', published by the Society for the History of Alchemy and Chemistry, and 'Isis', published by The History of Science Society. Core concepts Western alchemical theory corresponds to the worldview of late antiquity in which it was born. Concepts were imported from Neoplatonism and earlier Greek cosmology. As such, the classical elements appear in alchemical writings, as do the seven classical planets and the corresponding seven metals of antiquity. Similarly, the gods of the Roman pantheon who are associated with these luminaries are discussed in alchemical literature. The concepts of prima materia and anima mundi are central to the theory of the philosopher's stone. Magnum opus The Great Work of Alchemy is often described as a series of four stages represented by colors: nigredo, a blackening or melanosis; albedo, a whitening or leucosis; citrinitas, a yellowing or xanthosis; rubedo, a reddening, purpling, or iosis. Modernity Due to the complexity and obscurity of alchemical literature, and the 18th-century disappearance of remaining alchemical practitioners into the area of chemistry, the general understanding of alchemy has been strongly influenced by several distinct and radically different interpretations. 
Those focusing on the exoteric, such as historians of science Lawrence M. Principe and William R. Newman, have interpreted the 'decknamen' (or code words) of alchemy as physical substances. These scholars have reconstructed physicochemical experiments that they say are described in medieval and early modern texts. At the opposite end of the spectrum, focusing on the esoteric, scholars, such as George Calian and Anna Marie Roos, who question the reading of Principe and Newman, interpret these same decknamen as spiritual, religious, or psychological concepts.New interpretations of alchemy are still perpetuated, sometimes merging in concepts from New Age or radical environmentalism movements. Groups like the Rosicrucians and Freemasons have a continued interest in alchemy and its symbolism. Since the Victorian revival of alchemy, "occultists reinterpreted alchemy as a spiritual practice, involving the self-transformation of the practitioner and only incidentally or not at all the transformation of laboratory substances", which has contributed to a merger of magic and alchemy in popular thought.Esoteric interpretations of historical textsIn the eyes of a variety of modern esoteric and Neo-Hermeticist practitioners, alchemy is fundamentally spiritual. In this interpretation, transmutation of lead into gold is presented as an analogy for personal transmutation, purification, and perfection.According to this view, early alchemists such as Zosimos of Panopolis (c. AD 300) highlighted the spiritual nature of the alchemical quest, symbolic of a religious regeneration of the human soul. This approach is held to have continued in the Middle Ages, as metaphysical aspects, substances, physical states, and material processes are supposed to have been used as metaphors for spiritual entities, spiritual states, and, ultimately, transformation. In this sense, the literal meanings of 'Alchemical Formulas' were like a veil, hiding their true spiritual philosophy. 
In the Neo-Hermeticist interpretation, both the transmutation of common metals into gold and the universal panacea are held to symbolize evolution from an imperfect, diseased, corruptible, and ephemeral state toward a perfect, healthy, incorruptible, and everlasting state, so the philosopher's stone then represented a mystic key that would make this evolution possible. Applied to the alchemist himself, the twin goal symbolized his evolution from ignorance to enlightenment, and the stone represented a hidden spiritual truth or power that would lead to that goal. In texts that are held to have been written according to this view, the cryptic alchemical symbols, diagrams, and textual imagery of late alchemical works are supposed to contain multiple layers of meanings, allegories, and references to other equally cryptic works; which must be laboriously decoded to discover their true meaning.In his 1766 Alchemical Catechism, Théodore Henri de Tschudi denotes that the usage of the metals was merely symbolic:Psychology Alchemical symbolism has been important in depth and analytical psychology and was revived and popularized from near extinction by the Swiss psychologist Carl Gustav Jung. Initially confounded and at odds with alchemy and its images, after being given a copy of the translation of The Secret of the Golden Flower, a Chinese alchemical text, by his friend Richard Wilhelm, Jung discovered a direct correlation or parallels between the symbolic images in the alchemical drawings and the inner, symbolic images coming up in dreams, visions or imaginations during the psychic processes of transformation occurring in his patients. A process, which he called "process of individuation". He regarded the alchemical images as symbols expressing aspects of this "process of individuation" of which the creation of the gold or lapis within were symbols for its origin and goal. 
Together with his alchemical mystica soror, Jungian Swiss analyst Marie-Louise von Franz, Jung began collecting all the old alchemical texts available, compiled a lexicon of key phrases with cross-references and pored over them. The volumes of work he wrote brought new light into understanding the art of transubstantiation and renewed alchemy's popularity as a symbolic process of coming into wholeness as a human being where opposites brought into contact and inner and outer, spirit and matter are reunited in the hieros gamos  or divine marriage. His writings are influential in psychology and for people who have an interest in understanding the importance of dreams, symbols and the unconscious archetypal forces (archetypes) that influence all of life.Both von Franz and Jung have contributed greatly to the subject and work of alchemy and its continued presence in psychology as well as contemporary culture. Jung wrote volumes on alchemy and his magnum opus is Volume 14 of his Collected Works, Mysterium Coniunctionis.Literature Alchemy has had a long-standing relationship with art, seen both in alchemical texts and in mainstream entertainment. Literary alchemy appears throughout the history of English literature from Shakespeare to J. K. Rowling, and also the popular Japanese manga Fullmetal Alchemist. Here, characters or plot structure follow an alchemical magnum opus. In the 14th century, Chaucer began a trend of alchemical satire that can still be seen in recent fantasy works like those of the late Sir Terry Pratchett.Visual artists had a similar relationship with alchemy. While some of them used alchemy as a source of satire, others worked with the alchemists themselves or integrated alchemical thought or symbols in their work. Music was also present in the works of alchemists and continues to influence popular performers. 
In the last hundred years, alchemists have been portrayed in a magical and spagyric role in fantasy fiction, film, television, novels, comics and video games.Science One goal of alchemy, the transmutation of base substances into gold, is now known to be impossible by chemical means but possible by physical means. Although not financially worthwhile, Gold was synthesized in particle accelerators as early as 1941.See also Alchemical symbolBiological transmutation in Corentin Louis KervranCupellationHistoricismHistory of chemistryList of alchemistsNuclear transmutationOutline of alchemyPorta AlchemicaRenaissance magicSpagyricSuperseded theories in scienceSynthesis of precious metalsWestern esotericismNotesReferencesCitationsBibliographyFurther readingGeneral  Lawrence Principe, The Secrets of Alchemy, Chicago, 2013.Jennifer M. Rampling. 2020. The Experimental Fire: Inventing English Alchemy, 1300-1700. University of Chicago Press.Greco-Egyptian alchemyTexts  Marcellin Berthelot and Charles-Émile Ruelle (eds.), Collection des anciens alchimistes grecs (CAAG), 3 vols., 1887–1888, Vol 1: https://gallica.bnf.fr/ark:/12148/bpt6k96492923, Vol 2: https://gallica.bnf.fr/ark:/12148/bpt6k9680734p, Vol. 3: https://gallica.bnf.fr/ark:/12148/bpt6k9634942s. André-Jean Festugière, La Révélation d'Hermès Trismégiste, Paris, Les Belles Lettres, 2014  (, OCLC 897235256). Robert Halleux and Henri-Dominique Saffrey (eds.), Les alchimistes grecs, t. 1 : Papyrus de Leyde – Papyrus de Stockholm – Recettes, Paris, Les Belles Lettres, 1981. Otto Lagercrantz (ed), Papyrus Graecus Holmiensis, Uppsala, A.B. Akademiska Bokhandeln, 1913, https://archive.org/details/papyrusgraecusho00lage/page/n8. Michèle Mertens and Henri-Dominique Saffrey (ed.), Les alchimistes grecs, t. 4.1 : Zosime de Panopolis. Mémoires authentiques, Paris, Les Belles Lettres, 1995. Andrée Collinet and Henri-Dominique Saffrey (ed.), Les alchimistes grecs, t. 
10 : L'Anonyme de Zuretti ou l'Art sacré and divin de la chrysopée par un anonyme, Paris, Les Belles Lettres, 2000. Andrée Collinet (ed), Les alchimistes grecs, t. 11 : Recettes alchimiques (Par. Gr. 2419; Holkhamicus 109) – Cosmas le Hiéromoine – Chrysopée, Paris, Les Belles Lettres, 2000. Matteo Martelli (ed), The Four Books of Pseudo-Democritus, Maney Publishing, 2014.Studies  Dylan M. Burns, « μίξεώς τινι τέχνῃ κρείττονι : Alchemical Metaphor in the Paraphrase of Shem (NHC VII,1) », Aries 15 (2015), p. 79–106. Alberto Camplani, « Procedimenti magico-alchemici e discorso filosofico ermetico » in Giuliana Lanata (ed.), Il Tardoantico alle soglie del Duemila, ETS, 2000, p. 73–98. Alberto Camplani and Marco Zambon, « Il sacrificio come problema in alcune correnti filosofice di età imperiale », Annali di storia dell'esegesi 19 (2002), p. 59–99. Régine Charron and Louis Painchaud, « 'God is a Dyer,' The Background and Significance of a Puzzling Motif in the Coptic Gospel According to Philip (CG II, 3), Le Muséon 114 (2001), p. 41-50. Régine Charron, « The Apocryphon of John (NHC II,1) and the Greco-Egyptian Alchemical Literature », Vigiliae Christinae 59 (2005), p. 438-456. Philippe Derchain, "L'Atelier des Orfèvres à Dendara et les origines de l'alchimie," Chronique d'Égypte, vol. 65, no 130, 1990, p. 219–242. Korshi Dosoo, « A History of the Theban Magical Library », Bulletin of the American Society of Papyrologists 53 (2016), p. 251–274. Olivier Dufault, Early Greek Alchemy, Patronage and Innovation in Late Antiquity, California Classical Studies, 2019, https://escholarship.org/uc/item/2ks0g83x. Sergio Knipe, « Sacrifice and self-transformation in the alchemical writings of Zosimus of Panopolis », in Christopher Kelly, Richard Flower, Michael Stuart Williams (eds.), Unclassical Traditions. Volume II: Perspectives from East and West in Late Antiquity, Cambridge University Press, 2011, p. 59–69. 
André-Jean Festugière, La Révélation d'Hermès Trismégiste, Paris, Les Belles Lettres, 2014 , . Kyle A. Fraser, « Zosimos of Panopolis and the Book of Enoch: Alchemy as Forbidden Knowledge », Aries 4.2 (2004), p. 125–147. Kyle A. Fraser, « Baptized in Gnosis: The Spiritual Alchemy of Zosimos of Panopolis », Dionysius 25 (2007), p. 33–54. Kyle A. Fraser, « Distilling Nature’s Secrets: The Sacred Art of Alchemy », in John Scarborough and Paul Keyser (eds.), Oxford Handbook of Science and Medicine in the Classical World, Oxford University Press, 2018, p. 721–742. 2018. https://www.oxfordhandbooks.com/view/10.1093/oxfordhb/9780199734146.001.0001/oxfordhb-9780199734146-e-76. Shannon Grimes, Becoming Gold: Zosimos of Panopolis and the Alchemical Arts in Roman Egypt, Auckland, Rubedo Press, 2018,  Paul T. Keyser, « Greco-Roman Alchemy and Coins of Imitation Silver », American Journal of Numismatics 7–8 (1995–1996), p. 209–234. Paul Keyser, « The Longue Durée of Alchemy », in John Scarborough and Paul Keyser (eds.), Oxford Handbook of Science and Medicine in the Classical World, Oxford University Press, 2018, p. 409–430. Jean Letrouit, "Chronologie des alchimistes grecs," in Didier Kahn and Sylvain Matton, Alchimie: art, histoire et mythes, SEHA-Archè, 1995, p. 11–93. Lindsay, Jack. The Origins of Alchemy in Greco-Roman Egypt. Barnes & Noble, 1970. Paul Magdalino and Maria Mavroudi (eds.), The Occult Sciences in Byzantium, La Pomme d'or, 2006. Matteo Martelli, « The Alchemical Art of Dyeing: The Fourfold Division of Alchemy and the Enochian Tradition » in Sven Dupré (ed.), Laboratories of Art, Springer, 2014, . Matteo Martelli, « Alchemy, Medicine and Religion: Zosimus of Panopolis and the Egyptian Priests », Religion in the Roman Empire 3.2 (2017), p. 202–220. Gerasimos Merianos, « Alchemy », In A. Kaldellis & N. Siniossoglou (eds.), The Cambridge Intellectual History of Byzantium (pp. 234–251). Cambridge: Cambridge University Press, 2017, . 
Efthymios Nikolaïdis (ed.), Greek Alchemy from Late Antiquity to Early Modernity, Brepols, 2019. Daniel Stolzenberg, « Unpropitious Tinctures: Alchemy, Astrology & Gnosis According to Zosimos of Panopolis », Archives internationales d'histoire des sciences 49 (1999), p. 3–31. Cristina Viano, « Byzantine Alchemy, or the Era of Systematization », in John Scarborough and Paul Keyser (eds.), Oxford Handbook of Science and Medicine in the Classical World, Oxford University Press, 2018, p. 943–964. C. Vlachou et al., « Experimental investigation of silvering in late Roman coinage », Material Research Society Symposium Proceedings 712 (2002), p. II9.2.1-II9.2.9.Early modern  Principe, Lawrence and William Newman. Alchemy Tried in the Fire: Starkey, Boyle, and the Fate of Helmontian Chymistry. University of Chicago Press, 2002.External links  SHAC: Society for the History of Alchemy and Chemistry ESSWE: European Society for the Study of Western Esotericism Association for the Study of Esotericism The Alchemy Website. – Adam McLean's online collections and academic discussion.  Dictionary of the History of Ideas: Alchemy Book of Secrets: Alchemy and the European Imagination, 1500–2000 – A digital exhibition from the Beinecke Rare Book and Manuscript Library at Yale University Othmer MS 2 Alchemical Miscellany at OPenn Alchemy featured topic page on Science History Institute Digital Collections featuring selected manuscripts, rare books, paintings, and ephemera relating to alchemical topics and experimentation. EsotericismHermeticismHistory of philosophyHistory of science
+Alien primarily refers to: Alien (law), a person in a country who is not a national of that country Enemy alien, the above in times of war Extraterrestrial life, life which does not originate from Earth Specifically, intelligent extraterrestrial beings; see List of alleged extraterrestrial beings Introduced species, a species not native to its environmentAlien(s), or The Alien(s) may also refer to:Science and technology  AliEn (ALICE Environment), a grid framework Alien (file converter), a Linux program Alien Technology, a manufacturer of RFID technologyArts and entertainment  Alien (franchise), a media franchise Alien (creature in Alien franchise)Films Alien (film), a 1979 film by Ridley Scott Aliens (film), second film in the franchise from 1986 by James Cameron Alien 3, third film in the franchise from 1992 by David Fincher Alien Resurrection, fourth film in the franchise from 1997 by Jean-Pierre Jeunet Alien vs. Predator (film), fifth film in the franchise from 2004 by Paul W. S. Anderson Aliens vs. 
Predator: Requiem, sixth film in the franchise from 2007 by the Brothers Strause Prometheus (2012 film), seventh film in the franchise from 2012 by Ridley Scott Alien: Covenant, eighth film in the franchise from 2017 by Ridley Scott Alien 2: On Earth, a 1980 unofficial sequel of the 1979 Alien filmAlien Visitor (also titled Epsilon) (1995 film) AustralianItalian science fiction film by Rolf de Heer The Alien (2016 film), a 2016 Mexican film The Alien (unproduced film), an incomplete 1960s IndianAmerican filmLiterature  Alien novels, an extension of the Alien franchise Aliens (Tappan Wright novel), a 1902 novel by Mary Tappan Wright The Alien (Animorphs), the eighth book in the Animorphs series The Aliens (play), a 2010 play by Annie BakerMusicPerformers  Alien (band), a 1980s Swedish rock group The Aliens (Australian band), a 1970s new wave group The Aliens (Scottish band), a 2005–2008 rock groupAlbums  Alien (soundtrack), 1979 Alien (Beam album), 2022 Alien (Northlane album), 2019 Alien (Strapping Young Lad album), 2005 Alien, a 1989 EP by Tankard Aliens (soundtrack), 1987Songs  "Alien" (Britney Spears song), 2013 "Alien" (Jonas Blue and Sabrina Carpenter song), 2018 "Alien", a song by Third Day from the album Conspiracy No. 5, 1997 "Alien", a song by Pennywise from the album Straight Ahead, 1999 "Alien", a song by Bush from the album Sixteen Stone, 1994 "Alien", a song by Erasure from the album Loveboat, 2000 "Alien", a song by Japan from the album Quiet Life, 1979 "Alien", a song by Lamb from the album Fear of Fours, 1999 "Alien", a song by Nerina Pallot from the album Dear Frustrated Superstar, 2001 "Alien", a song by P-Model from the album Landsale, 1980 "Alien", a song by Thriving Ivory from the album Thriving Ivory, 2003 "Alien", a song by Tokio Hotel from the album Humanoid, 2009. Fans of the band call themselves "Aliens". 
"Alien", a song by Atlanta Rhythm from the album Quinella, 1981 "Alien", a 2020 song by Lee Suhyun "Aliens" (song), a 2017 song by Coldplay "Aliens", a 1984 song by Warlord "The Alien", a song by Dream Theater from the album A View from the Top of the World, 2021Video games  Alien (1984 video game), based on the film Alien (Atari 2600), a 1982 maze game based on the 1979 film Alien: Isolation, a 2014 video game based on the Alien science fiction horror film series Aliens (1982 video game), a text-only clone of Space Invaders written for the CP/M operating system on the Kaypro computer Aliens (1990 video game), a game by Konami, based on the sequel of the filmOther media  Alien (Armenian TV series), a 2017 melodrama series Alien (sculpture), a 2012 work by David Breuer-Weil, in Mottisfont, Hampshire, England Aliens (Dark Horse Comics line) The Aliens (TV series), 2016 British sci-fi television series "Aliens" (Roseanne), a 1992 television episodeOther uses  Alien (shipping company), a Russian company Alien Sun (born 1974), Singaporean actress Alien, a perfume by Thierry MuglerSee also  Alians, an Islamic order Alien Project (disambiguation) Alien vs. Predator (disambiguation) Astrobiology, the study of hypothetical alien life ATLiens, a 1996 album by OutKast Predator (disambiguation) UFO (disambiguation) Unidentified flying object (disambiguation)
+An astronomer is a scientist in the field of astronomy who focuses their studies on a specific question or field outside the scope of Earth. They observe astronomical objects such as stars, planets, moons, comets and galaxies – in either observational (by analyzing the data) or theoretical astronomy. Examples of topics or fields astronomers study include planetary science, solar astronomy, the origin or evolution of stars, or the formation of galaxies. A related but distinct subject is physical cosmology, which studies the Universe as a whole.TypesAstronomers usually fall under either of two main types: observational and theoretical. Observational astronomers make direct observations of celestial objects and analyze the data. In contrast, theoretical astronomers create and investigate models of things that cannot be observed. Because it takes millions to billions of years for a system of stars or a galaxy to complete a life cycle, astronomers must observe snapshots of different systems at unique points in their evolution to determine how they form, evolve, and die. They use these data to create models or simulations to theorize how different celestial objects work.Further subcategories under these two main branches of astronomy include planetary astronomy, galactic astronomy, or physical cosmology.Academic Historically, astronomy was more concerned with the classification and description of phenomena in the sky, while astrophysics attempted to explain these phenomena and the differences between them using physical laws. Today, that distinction has mostly disappeared and the terms "astronomer" and "astrophysicist" are interchangeable. Professional astronomers are highly educated individuals who typically have a PhD in physics or astronomy and are employed by research institutions or universities. 
They spend the majority of their time working on research, although they quite often have other duties such as teaching, building instruments, or aiding in the operation of an observatory.The American Astronomical Society, which is the major organization of professional astronomers in North America, has approximately 7,000 members. This number includes scientists from other fields such as physics, geology, and engineering, whose research interests are closely related to astronomy. The International Astronomical Union comprises almost 10,145 members from 70 different countries who are involved in astronomical research at the PhD level and beyond.Contrary to the classical image of an old astronomer peering through a telescope through the dark hours of the night, it is far more common to use a charge-coupled device (CCD) camera to record a long, deep exposure, allowing a more sensitive image to be created because the light is added over time. Before CCDs, photographic plates were a common method of observation. Modern astronomers spend relatively little time at telescopes, usually just a few weeks per year. Analysis of observed phenomena, along with making predictions as to the causes of what they observe, takes the majority of observational astronomers' time.Astronomers who serve as faculty spend much of their time teaching undergraduate and graduate classes. Most universities also have outreach programs including public telescope time and sometimes planetariums as a public service to encourage interest in the field.Those who become astronomers usually have a broad background in maths, sciences and computing in high school. Taking courses that teach how to research, write, and present papers is also invaluable. In college/university, most astronomers get a PhD in astronomy or physics.Amateur astronomers While there is a relatively low number of professional astronomers, the field is popular among amateurs. 
Most cities have amateur astronomy clubs that meet on a regular basis and often host star parties. The Astronomical Society of the Pacific is the largest general astronomical society in the world, comprising both professional and amateur astronomers as well as educators from 70 different nations. Like any hobby, most people who think of themselves as amateur astronomers may devote a few hours a month to stargazing and reading the latest developments in research. However, amateurs span the range from so-called "armchair astronomers" to the very ambitious, who own science-grade telescopes and instruments with which they are able to make their own discoveries and assist professional astronomers in research.See also  List of astronomers List of women astronomers List of Muslim astronomers List of French astronomers List of Hungarian astronomers List of Russian astronomers and astrophysicists List of Slovenian astronomersReferencesSourcesExternal links  American Astronomical Society European Astronomical Society International Astronomical Union Astronomical Society of the Pacific Space's astronomy newsAstronomy Science occupations
+ASCII, abbreviated from American Standard Code for Information Interchange, is a character encoding standard for electronic communication. ASCII codes represent text in computers, telecommunications equipment, and other devices. Most modern character-encoding schemes are based on ASCII, although they support many additional characters.The Internet Assigned Numbers Authority (IANA) prefers the name US-ASCII for this character encoding.ASCII is one of the IEEE milestones.OverviewASCII was developed from telegraph code. Its first commercial use was as a seven-bit teleprinter code promoted by Bell data services. Work on the ASCII standard began in May 1961, with the first meeting of the American Standards Association's (ASA) (now the American National Standards Institute or ANSI) X3.2 subcommittee. The first edition of the standard was published in 1963, underwent a major revision during 1967, and experienced its most recent update during 1986. Compared to earlier telegraph codes, the proposed Bell code and ASCII were both ordered for more convenient sorting (i.e., alphabetization) of lists and added features for devices other than teleprinters. The use of ASCII format for Network Interchange was described in 1969. That document was formally elevated to an Internet Standard in 2015.Originally based on the English alphabet, ASCII encodes 128 specified characters into seven-bit integers as shown by the ASCII chart above. Ninety-five of the encoded characters are printable: these include the digits 0 to 9, lowercase letters a to z, uppercase letters A to Z, and punctuation symbols. 
In addition, the original ASCII specification included 33 non-printing control codes which originated with Teletype machines; most of these are now obsolete, although a few are still commonly used, such as the carriage return, line feed, and tab codes.For example, lowercase i would be represented in the ASCII encoding by binary 1101001 = hexadecimal 69 (i is the ninth letter) = decimal 105.HistoryThe American Standard Code for Information Interchange (ASCII) was developed under the auspices of a committee of the American Standards Association (ASA), called the X3 committee, by its X3.2 (later X3L2) subcommittee, and later by that subcommittee's X3.2.4 working group (now INCITS). The ASA later became the United States of America Standards Institute (USASI), and ultimately became the American National Standards Institute (ANSI).With the other special characters and control codes filled in, ASCII was published as ASA X3.4-1963, leaving 28 code positions without any assigned meaning, reserved for future standardization, and one unassigned control code. There was some debate at the time whether there should be more control characters rather than the lowercase alphabet. The indecision did not last long: during May 1963 the CCITT Working Party on the New Telegraph Alphabet proposed to assign lowercase characters to sticks 6 and 7, and International Organization for Standardization TC 97 SC 2 voted during October to incorporate the change into its draft standard. The X3.2.4 task group voted its approval for the change to ASCII at its May 1963 meeting. 
Locating the lowercase letters in sticks 6 and 7 caused the characters to differ in bit pattern from the upper case by a single bit, which simplified case-insensitive character matching and the construction of keyboards and printers.The X3 committee made other changes, including other new characters (the brace and vertical bar characters), renaming some control characters (SOM became start of header (SOH)) and moving or removing others (RU was removed). ASCII was subsequently updated as USAS X3.4-1967, then USAS X3.4-1968, ANSI X3.4-1977, and finally, ANSI X3.4-1986.Revisions of the ASCII standard: ASA X3.4-1963 ASA X3.4-1965 (approved, but not published, nevertheless used by IBM 2260 & 2265 Display Stations and IBM 2848 Display Control) USAS X3.4-1967 USAS X3.4-1968 ANSI X3.4-1977 ANSI X3.4-1986 ANSI X3.4-1986 (R1992) ANSI X3.4-1986 (R1997) ANSI INCITS 4-1986 (R2002) ANSI INCITS 4-1986 (R2007) (ANSI) INCITS 4-1986[R2012] (ANSI) INCITS 4-1986[R2017]In the X3.15 standard, the X3 committee also addressed how ASCII should be transmitted (least significant bit first), and how it should be recorded on perforated tape. They proposed a 9-track standard for magnetic tape, and attempted to deal with some punched card formats.Design considerationsBit widthThe X3.2 subcommittee designed ASCII based on the earlier teleprinter encoding systems. Like other character encodings, ASCII specifies a correspondence between digital bit patterns and character symbols (i.e. graphemes and control characters). This allows digital devices to communicate with each other and to process, store, and communicate character-oriented information such as written language. Before ASCII was developed, the encodings in use included 26 alphabetic characters, 10 numerical digits, and from 11 to 25 special graphic symbols. To include all these, and control characters compatible with the Comité Consultatif International Téléphonique et Télégraphique (CCITT) International Telegraph Alphabet No. 
2 (ITA2) standard of 1924, FIELDATA (1956), and early EBCDIC (1963), more than 64 codes were required for ASCII.ITA2 was in turn based on the 5-bit telegraph code that Émile Baudot invented in 1870 and patented in 1874.The committee debated the possibility of a shift function (like in ITA2), which would allow more than 64 codes to be represented by a six-bit code. In a shifted code, some character codes determine choices between options for the following character codes. It allows compact encoding, but is less reliable for data transmission, as an error in transmitting the shift code typically makes a long part of the transmission unreadable. The standards committee decided against shifting, and so ASCII required at least a seven-bit code.The committee considered an eight-bit code, since eight bits (octets) would allow two four-bit patterns to efficiently encode two digits with binary-coded decimal. However, it would require all data transmission to send eight bits when seven could suffice. The committee voted to use a seven-bit code to minimize costs associated with data transmission. Since perforated tape at the time could record eight bits in one position, it also allowed for a parity bit for error checking if desired. Eight-bit machines (with octets as the native data type) that did not use parity checking typically set the eighth bit to 0.Internal organizationThe code itself was patterned so that most control codes were together and all graphic codes were together, for ease of identification. The first two so-called ASCII sticks (32 positions) were reserved for control characters. The "space" character had to come before graphics to make sorting easier, so it became position 20hex; for the same reason, many special signs commonly used as separators were placed before digits. 
The committee decided it was important to support uppercase 64-character alphabets, and chose to pattern ASCII so it could be reduced easily to a usable 64-character set of graphic codes, as was done in the DEC SIXBIT code (1963). Lowercase letters were therefore not interleaved with uppercase. To keep options available for lowercase letters and other graphics, the special and numeric codes were arranged before the letters, and the letter A was placed in position 41hex to match the draft of the corresponding British standard. The digits 0–9 are prefixed with 011, but the remaining 4 bits correspond to their respective values in binary, making conversion with binary-coded decimal straightforward.Many of the non-alphanumeric characters were positioned to correspond to their shifted position on typewriters; an important subtlety is that these were based on mechanical typewriters, not electric typewriters. Mechanical typewriters followed the de facto standard set by the Remington No. 2 (1878), the first typewriter with a shift key, and the shifted values of 23456789- were "#$%_&'() early typewriters omitted 0 and 1, using O (capital letter o) and l (lowercase letter L) instead, but 1! and 0) pairs became standard once 0 and 1 became common. Thus, in ASCII !"#$% were placed in the second stick, positions 1–5, corresponding to the digits 1–5 in the adjacent stick. The parentheses could not correspond to 9 and 0, however, because the place corresponding to 0 was taken by the space character. This was accommodated by removing _ (underscore) from 6 and shifting the remaining characters, which corresponded to many European typewriters that placed the parentheses with 8 and 9. This discrepancy from typewriters led to bit-paired keyboards, notably the Teletype Model 33, which used the left-shifted layout corresponding to ASCII, differently from traditional mechanical typewriters. 
Electric typewriters, notably the IBM Selectric (1961), used a somewhat different layout that has become de facto standard on computers following the IBM PC (1981), especially Model M (1984) and thus shift values for symbols on modern keyboards do not correspond as closely to the ASCII table as earlier keyboards did. The /? pair also dates to the No. 2, and the ,< .> pairs were used on some keyboards (others, including the No. 2, did not shift , (comma) or . (full stop) so they could be used in uppercase without unshifting). However, ASCII split the ;: pair (dating to No. 2), and rearranged mathematical symbols (varied conventions, commonly -* =+) to :* ;+ -=.Some then-common typewriter characters were not included, notably ½ ¼ ¢, while ^ ` ~  were included as diacritics for international use, and < > for mathematical use, together with the simple line characters \ | (in addition to common /). The @ symbol was not used in continental Europe and the committee expected it would be replaced by an accented À in the French variation, so the @ was placed in position 40hex, right before the letter A.The control codes felt essential for data transmission were the start of message (SOM), end of address (EOA), end of message (EOM), end of transmission (EOT), "who are you?" (WRU), "are you?" (RU), a reserved device control (DC0), synchronous idle (SYNC), and acknowledge (ACK). These were positioned to maximize the Hamming distance between their bit patterns.Character orderASCII-code order is also called ASCIIbetical order. Collation of data is sometimes done in this order rather than "standard" alphabetical order (collating sequence). 
The main deviations in ASCII order are: All uppercase letters come before lowercase letters; for example, "Z" precedes "a" Digits and many punctuation marks come before lettersAn intermediate order converts uppercase letters to lowercase before comparing ASCII values.Character groupsControl charactersASCII reserves the first 32 codes (numbers 0–31 decimal) for control characters: codes originally intended not to represent printable information, but rather to control devices (such as printers) that make use of ASCII, or to provide meta-information about data streams such as those stored on magnetic tape.For example, character 10 represents the "line feed" function (which causes a printer to advance its paper), and character 8 represents "backspace". RFC 2822 refers to control characters that do not include carriage return, line feed or white space as non-whitespace control characters. Except for the control characters that prescribe elementary line-oriented formatting, ASCII does not define any mechanism for describing the structure or appearance of text within a document. Other schemes, such as markup languages, address page and document layout and formatting.The original ASCII standard used only short descriptive phrases for each control character. The ambiguity this caused was sometimes intentional, for example where a character would be used slightly differently on a terminal link than on a data stream, and sometimes accidental, for example with the meaning of "delete".Probably the most influential single device affecting the interpretation of these characters was the Teletype Model 33 ASR, which was a printing terminal with an available paper tape reader/punch option. Paper tape was a very popular medium for long-term program storage until the 1980s, less costly and in some ways less fragile than magnetic tape. 
In particular, the Teletype Model 33 machine assignments for codes 17 (Control-Q, DC1, also known as XON), 19 (Control-S, DC3, also known as XOFF), and 127 (Delete) became de facto standards. The Model 33 was also notable for taking the description of Control-G (code 7, BEL, meaning audibly alert the operator) literally, as the unit contained an actual bell which it rang when it received a BEL character.  Because the keytop for the O key also showed a left-arrow symbol (from ASCII-1963, which had this character instead of underscore), a noncompliant use of code 15 (Control-O, Shift In) interpreted as "delete previous character" was also adopted by many early timesharing systems but eventually became neglected.When a Teletype 33 ASR equipped with the automatic paper tape reader received a Control-S (XOFF, an abbreviation for transmit off), it caused the tape reader to stop; receiving Control-Q (XON, "transmit on") caused the tape reader to resume.  This so-called flow control technique became adopted by several early computer operating systems as a "handshaking" signal warning a sender to stop transmission because of impending buffer overflow; it persists to this day in many systems as a manual output control technique. On some systems, Control-S retains its meaning but Control-Q is replaced by a second Control-S to resume output.  The 33 ASR also could be configured to employ Control-R (DC2) and Control-T (DC4) to start and stop the tape punch; on some units equipped with this function, the corresponding control character lettering on the keycap above the letter was TAPE and TAPE (struck through), respectively.Delete vs BackspaceThe Teletype could not move its typehead backwards, so it did not have a key on its keyboard to send a BS (backspace). Instead, there was a key marked RUB OUT that sent code 127 (DEL). 
The purpose of this key was to erase mistakes in a manually-input paper tape: the operator had to push a button on the tape punch to back it up, then type the rubout, which punched all holes and replaced the mistake with a character that was intended to be ignored. Teletypes were commonly used with the less-expensive computers from Digital Equipment Corporation; these systems had to use what keys were available, and thus the DEL code was assigned to erase the previous character. Because of this, DEC video terminals (by default) sent the DEL code for the key marked "Backspace" while the separate key marked "Delete" sent an escape sequence; many other competing terminals sent a BS code for the Backspace key. The Unix terminal driver could only use one code to erase the previous character; this could be set to BS or DEL, but not both, resulting in recurring situations of ambiguity where users had to decide depending on what terminal they were using (shells that allow line editing, such as ksh, bash, and zsh, understand both). The assumption that no key sent a BS code allowed Control+H to be used for other purposes, such as the "help" prefix command in GNU Emacs.EscapeMany more of the control codes have been assigned meanings quite different from their original ones. The "escape" character (ESC, code 27), for example, was intended originally to allow sending of other control characters as literals instead of invoking their meaning, a so-called "escape sequence". This is the same meaning of "escape" encountered in URL encodings, C language strings, and other systems where certain characters have a reserved meaning. Over time this interpretation has been co-opted and has eventually been changed. 
In modern usage, an ESC sent to the terminal usually indicates the start of a command sequence usually in the form of a so-called "ANSI escape code" (or, more properly, a "Control Sequence Introducer") from ECMA-48 (1972) and its successors, beginning with ESC followed by a "[" (left-bracket) character. In contrast, an ESC sent from the terminal is most often used as an out-of-band character used to terminate an operation or special mode, as in the TECO and vi text editors. In graphical user interface (GUI) and windowing systems, ESC generally causes an application to abort its current operation or to exit (terminate) altogether.End of LineThe inherent ambiguity of many control characters, combined with their historical usage, created problems when transferring "plain text" files between systems. The best example of this is the newline problem on various operating systems. Teletype machines required that a line of text be terminated with both "Carriage Return" (which moves the printhead to the beginning of the line) and "Line Feed" (which advances the paper one line without moving the printhead). The name "Carriage Return" comes from the fact that on a manual typewriter the carriage holding the paper moved while the position where the typebars struck the ribbon remained stationary.  The entire carriage had to be pushed (returned) to the right in order to position the left margin of the paper for the next line.DEC operating systems (OS/8, RT-11, RSX-11, RSTS, TOPS-10, etc.) used both characters to mark the end of a line so that the console device (originally Teletype machines) would work. By the time so-called "glass TTYs" (later called CRTs or "dumb terminals") came along, the convention was so well established that backward compatibility necessitated continuing to follow it. When Gary Kildall created CP/M, he was inspired by some of the command line interface conventions used in DEC's RT-11 operating system. 
Until the introduction of PC DOS in 1981, IBM had no influence in this because their 1970s operating systems used EBCDIC encoding instead of ASCII, and they were oriented toward punch-card input and line printer output on which the concept of "carriage return" was meaningless. IBM's PC DOS (also marketed as MS-DOS by Microsoft) inherited the convention by virtue of being loosely based on CP/M, and Windows in turn inherited it from MS-DOS.Unfortunately, requiring two characters to mark the end of a line introduces unnecessary complexity and ambiguity as to how to interpret each character when encountered by itself. To simplify matters, plain text data streams, including files, on Multics used line feed (LF) alone as a line terminator. Unix and Unix-like systems, and Amiga systems, adopted this convention from Multics. On the other hand, the original Macintosh OS, Apple DOS, and ProDOS used carriage return (CR) alone as a line terminator; however, since Apple has now replaced these obsolete operating systems with the Unix-based macOS operating system, they now use line feed (LF) as well. The Radio Shack TRS-80 also used a lone CR to terminate lines.Computers attached to the ARPANET included machines running operating systems such as TOPS-10 and TENEX using CR-LF line endings; machines running operating systems such as Multics using LF line endings; and machines running operating systems such as OS/360 that represented lines as a character count followed by the characters of the line and which used EBCDIC rather than ASCII encoding.  The Telnet protocol defined an ASCII "Network Virtual Terminal" (NVT), so that connections between hosts with different line-ending conventions and character sets could be supported by transmitting a standard text format over the network. Telnet used ASCII along with CR-LF line endings, and software using other conventions would translate between the local conventions and the NVT. 
The File Transfer Protocol adopted the Telnet protocol, including use of the Network Virtual Terminal, for use when transmitting commands and transferring data in the default ASCII mode. This adds complexity to implementations of those protocols, and to other network protocols, such as those used for E-mail and the World Wide Web, on systems not using the NVT's CR-LF line-ending convention.End of File/StreamThe PDP-6 monitor, and its PDP-10 successor TOPS-10, used Control-Z (SUB) as an end-of-file indication for input from a terminal.  Some operating systems such as CP/M tracked file length only in units of disk blocks, and used Control-Z to mark the end of the actual text in the file. For these reasons, EOF, or end-of-file, was used colloquially and conventionally as a three-letter acronym for Control-Z instead of SUBstitute.  The end-of-text code (ETX), also known as Control-C, was inappropriate for a variety of reasons, while using Z as the control code to end a file is analogous to its position at the end of the alphabet, and serves as a very convenient mnemonic aid. A historically common and still prevalent convention uses the ETX code convention to interrupt and halt a program via an input data stream, usually from a keyboard.In C library and Unix conventions, the null character is used to terminate text strings; such null-terminated strings can be known in abbreviation as ASCIZ or ASCIIZ, where here Z stands for "zero".Control code chartOther representations might be used by specialist equipment, for example ISO 2047 graphics or hexadecimal numbers.Printable charactersCodes 20hex to 7Ehex, known as the printable characters, represent letters, digits, punctuation marks, and a few miscellaneous symbols. There are 95 printable characters in total.Code 20hex, the "space" character, denotes the space between words, as produced by the space bar of a keyboard. 
Since the space character is considered an invisible graphic (rather than a control character) it is listed in the table below instead of in the previous section.Code 7Fhex corresponds to the non-printable "delete" (DEL) control character and is therefore omitted from this chart; it is covered in the previous section's chart.  Earlier versions of ASCII used the up arrow instead of the caret (5Ehex) and the left arrow instead of the underscore (5Fhex).Character setUsageASCII was first used commercially during 1963 as a seven-bit teleprinter code for American Telephone & Telegraph's TWX (TeletypeWriter eXchange) network. TWX originally used the earlier five-bit ITA2, which was also used by the competing Telex teleprinter system. Bob Bemer introduced features such as the escape sequence. His British colleague Hugh McGregor Ross helped to popularize this work according to Bemer, "so much so that the code that was to become ASCII was first called the Bemer–Ross Code in Europe". Because of his extensive work on ASCII, Bemer has been called "the father of ASCII".On March 11, 1968, US President Lyndon B. Johnson mandated that all computers purchased by the United States Federal Government support ASCII, stating:I have also approved recommendations of the Secretary of Commerce [Luther H. 
Hodges] regarding standards for recording the Standard Code for Information Interchange on magnetic tapes and paper tapes when they are used in computer operations.All computers and related equipment configurations brought into the Federal Government inventory on and after July 1, 1969, must have the capability to use the Standard Code for Information Interchange and the formats prescribed by the magnetic tape and paper tape standards when these media are used.ASCII was the most common character encoding on the World Wide Web until December 2007, when UTF-8 encoding surpassed it; UTF-8 is backward compatible with ASCII.Variants and derivationsAs computer technology spread throughout the world, different standards bodies and corporations developed many variations of ASCII to facilitate the expression of non-English languages that used Roman-based alphabets. One could class some of these variations as "ASCII extensions", although some misuse that term to represent all variants, including those that do not preserve ASCII's character-map in the 7-bit range. Furthermore, the ASCII extensions have also been mislabelled as ASCII.7-bit codesFrom early in its development, ASCII was intended to be just one of several national variants of an international character code standard.Other international standards bodies have ratified character encodings such as ISO 646 (1967) that are identical or nearly identical to ASCII, with extensions for characters outside the English alphabet and symbols used outside the United States, such as the symbol for the United Kingdom's pound sterling (£); e.g. with code page 1104. Almost every country needed an adapted version of ASCII, since ASCII suited the needs of only the US and a few other countries. For example, Canada had its own version that supported French characters.Many other countries developed variants of ASCII to include non-English letters (e.g. é, ñ, ß, Ł), currency symbols (e.g. £, ¥), etc. 
See also YUSCII (Yugoslavia).It would share most characters in common, but assign other locally useful characters to several code points reserved for "national use". However, the four years that elapsed between the publication of ASCII-1963 and ISO's first acceptance of an international recommendation during 1967 caused ASCII's choices for the national use characters to seem to be de facto standards for the world, causing confusion and incompatibility once other countries did begin to make their own assignments to these code points.ISO/IEC 646, like ASCII, is a 7-bit character set. It does not make any additional codes available, so the same code points encoded different characters in different countries. Escape codes were defined to indicate which national variant applied to a piece of text, but they were rarely used, so it was often impossible to know what variant to work with and, therefore, which character a code represented, and in general, text-processing systems could cope with only one variant anyway.Because the bracket and brace characters of ASCII were assigned to "national use" code points that were used for accented letters in other national variants of ISO/IEC 646, a German, French, or Swedish, etc. programmer using their national variant of ISO/IEC 646, rather than ASCII, had to write, and, thus, read, something such asä aÄiÜ = 'Ön'; üinstead of{ a[i] = '\n'; }C trigraphs were created to solve this problem for ANSI C, although their late introduction and inconsistent implementation in compilers limited their use. Many programmers kept their computers on US-ASCII, so plain-text in Swedish, German etc. (for example, in e-mail or Usenet) contained "{, }" and similar variants in the middle of words, something those programmers got used to. 
For example, a Swedish programmer mailing another programmer asking if they should go for lunch, could get "N{ jag har sm|rg}sar" as the answer, which should be "Nä jag har smörgåsar" meaning "No I've got sandwiches".In Japan and Korea, a variation of ASCII is still used, in which the backslash (5C hex) is rendered as ¥ (a Yen sign, in Japan) or ₩ (a Won sign, in Korea). This means that, for example, the file path C:\Users\Smith is shown as C:¥Users¥Smith (in Japan) or C:₩Users₩Smith (in Korea).8-bit codesEventually, as 8-, 16-, and 32-bit (and later 64-bit) computers began to replace 12-, 18-, and 36-bit computers as the norm, it became common to use an 8-bit byte to store each character in memory, providing an opportunity for extended, 8-bit relatives of ASCII. In most cases these developed as true extensions of ASCII, leaving the original character-mapping intact, but adding additional character definitions after the first 128 (i.e., 7-bit) characters.Encodings include ISCII (India), VISCII (Vietnam). Although these encodings are sometimes referred to as ASCII, true ASCII is defined strictly only by the ANSI standard.Most early home computer systems developed their own 8-bit character sets containing line-drawing and game glyphs, and often filled in some or all of the control characters from 0 to 31 with more graphics. Kaypro CP/M computers used the "upper" 128 characters for the Greek alphabet.The PETSCII code Commodore International used for their 8-bit systems is probably unique among post-1970 codes in being based on ASCII-1963, instead of the more common ASCII-1967, such as found on the ZX Spectrum computer. Atari 8-bit computers and Galaksija computers also used ASCII variants.The IBM PC defined code page 437, which replaced the control characters with graphic symbols such as smiley faces, and mapped additional graphic characters to the upper 128 positions. 
Operating systems such as DOS supported these code pages, and manufacturers of IBM PCs supported them in hardware. Digital Equipment Corporation developed the Multinational Character Set (DEC-MCS) for use in the popular VT220 terminal as one of the first extensions designed more for international languages than for block graphics. The Macintosh defined Mac OS Roman and Postscript also defined a set, both of these contained both international letters and typographic punctuation marks instead of graphics, more like modern character sets.The ISO/IEC 8859 standard (derived from the DEC-MCS) finally provided a standard that most systems copied (at least as accurately as they copied ASCII, but with many substitutions). A popular further extension designed by Microsoft, Windows-1252 (often mislabeled as ISO-8859-1), added the typographic punctuation marks needed for traditional text printing.  ISO-8859-1, Windows-1252, and the original 7-bit ASCII were the most common character encodings until 2008 when UTF-8 became more common.ISO/IEC 4873 introduced 32 additional control codes defined in the 80–9F hexadecimal range, as part of extending the 7-bit ASCII encoding to become an 8-bit system.UnicodeUnicode and the ISO/IEC 10646 Universal Character Set (UCS) have a much wider array of characters and their various encoding forms have begun to supplant ISO/IEC 8859 and ASCII rapidly in many environments. While ASCII is limited to 128 characters, Unicode and the UCS support more characters by separating the concepts of unique identification (using natural numbers called code points) and encoding (to 8-, 16-, or 32-bit binary formats, called UTF-8, UTF-16, and UTF-32, respectively).ASCII was incorporated into the Unicode (1991) character set as the first 128 symbols, so the 7-bit ASCII characters have the same numeric codes in both sets. 
This allows UTF-8 to be backward compatible with 7-bit ASCII, as a UTF-8 file containing only ASCII characters is identical to an ASCII file containing the same sequence of characters.  Even more importantly, forward compatibility is ensured as software that recognizes only 7-bit ASCII characters as special and does not alter bytes with the highest bit set (as is often done to support 8-bit ASCII extensions such as ISO-8859-1) will preserve UTF-8 data unchanged.See also 3568 ASCII, an asteroid named after the character encoding Alt codes Ascii85 ASCII art ASCII Ribbon Campaign Basic Latin (Unicode block) (ASCII as a subset of Unicode) Extended ASCII HTML decimal character rendering Jargon File, a glossary of computer programmer slang which includes a list of common slang names for ASCII characters List of computer character sets List of Unicode charactersNotesReferencesFurther reading   from:External links    Computer-related introductions in 1963Character setsCharacter encodingLatin-script representationsPresentation layer protocols
+Austin is the capital of Texas in the United States.Austin may also refer to:Geographical locationsAustralia Austin, Western AustraliaCanada Austin, Manitoba Austin, Ontario Austin, Quebec Austin Island, NunavutFrance Saint-Austin, hamlet at la Neuville-Chant-d'Oisel, NormandyHong Kong  Austin station (MTR), KowloonUnited States Austin, Arkansas Austin, Colorado Austin Township, Macon County, Illinois Austin, Chicago, Cook County, Illinois Austin, Indiana Austin, Kentucky Austin, Minnesota Austin, Missouri Austin, Nevada Austin, Ohio Austin, Oregon Austin, Pennsylvania Austin, Texas Austin County, Texas (note that the city of Austin, Texas is located in Travis County)Schools Austin College, Sherman, Texas University of Texas at Austin, flagship institution of the University of Texas System Austin Peay State University, Clarksville, TennesseeReligion Augustine of Hippo An adjective for the AugustiniansBusiness American Austin Car Company, short-lived American automobile maker Austin Automobile Company, short-lived American automobile company Austin Motor Company, British car manufacturer Austin cookies and crackers, Keebler Company brandEntertainment "Austin" (song), a single by Blake Shelton Austin, a kangaroo Beanie Baby produced by Ty, Inc. Austin the kangaroo from the children's television series The BackyardigansOther uses Austin (building), a building designed by artist Ellsworth Kelly under construction in Austin, Texas Austin (given name), a short form of Augustin, or Augustine, including fictional characters Austin (surname) USS Austin, three shipsSee also All pages beginning with Austin August (disambiguation) Augustin (disambiguation) Augustine (disambiguation) Austin station (disambiguation) Austins (disambiguation) Austen (disambiguation) Justice Austin (disambiguation) Austinburg (disambiguation)
+Animation is a method in which figures are manipulated to appear as moving images. In traditional animation, images are drawn or painted by hand on transparent celluloid sheets to be photographed and exhibited on film. Today, most animations are made with computer-generated imagery (CGI). Computer animation can be very detailed 3D animation, while 2D computer animation (which may have the look of traditional animation) can be used for stylistic reasons, low bandwidth, or faster real-time renderings. Other common animation methods apply a stop motion technique to two- and three-dimensional objects like paper cutouts, puppets, or clay figures.An animated cartoon is an animated film, usually a short film aimed at children and featuring an exaggerated visual style. The style takes inspiration from comic strips, often featuring anthropomorphic animals, superheroes, or the adventures of child protagonists. Especially with animals that form a natural predator/prey relationship (e.g. cats and mice, coyotes and birds) the action often centers around violent pratfalls such as falls, collisions and explosions that would be lethal in real life.  Commonly, animators achieved the effect by a rapid succession of images that minimally differ from each other. The illusion—as in motion pictures in general—is thought to rely on the phi phenomenon and beta movement, but the exact causes are still uncertain. Analog mechanical animation media that rely on the rapid display of sequential images include the phénakisticope, zoetrope, flip book, praxinoscope, and film. Television and video are popular electronic animation media that originally were analog and now operate digitally. 
For display on computers, technology such as the animated GIF and Flash animation were developed.In addition to short films, feature films, television series, animated GIFs, and other media dedicated to the display of moving images, animation is also prevalent in video games, motion graphics, user interfaces, and visual effects.The physical movement of image parts through simple mechanics—for instance moving images in magic lantern shows—can also be considered animation. The mechanical manipulation of three-dimensional puppets and objects to emulate living beings has a very long history in automata. Electronic automata were popularized by Disney as animatronics.EtymologyThe word "animation" stems from the Latin "animātiōn", stem of "animātiō", meaning "a bestowing of life". The primary meaning of the English word is "liveliness" and has been in use much longer than the meaning of "moving image medium".HistoryBefore cinematographyHundreds of years before the introduction of true animation, people all over the world enjoyed shows with moving figures that were created and manipulated manually in puppetry, automata, shadow play, and the magic lantern. The multi-media phantasmagoria shows that were very popular in European theatres from the late 18th century through the first half of the 19th century, featured lifelike projections of moving ghosts and other frightful imagery in motion.In 1833, the stroboscopic disc (better known as the phénakisticope) introduced the principle of modern animation with sequential images that were shown one by one in quick succession to form an optical illusion of motion pictures. Series of sequential images had occasionally been made over thousands of years, but the stroboscopic disc provided the first method to represent such images in fluent motion and for the first time had artists creating series with a proper systematic breakdown of movements. 
The stroboscopic animation principle was also applied in the zoetrope (1866), the flip book (1868) and the praxinoscope (1877). A typical 19th-century animation contained about 12 images that were displayed as a continuous loop by spinning a device manually. The flip book often contained more pictures and had a beginning and end, but its animation would not last longer than a few seconds. The first to create much longer sequences seems to have been Charles-Émile Reynaud, who between 1892 and 1900 had much success with his 10- to 15-minute-long Pantomimes Lumineuses.Silent eraWhen cinematography eventually broke through in 1895 after animated pictures had been known for decades, the wonder of the realistic details in the new medium was seen as its biggest accomplishment. Animation on film was not commercialized until a few years later by manufacturers of optical toys, with chromolithography film loops (often traced from live-action footage) for adapted toy magic lanterns intended for kids to use at home. It would take some more years before animation reached movie theaters.After earlier experiments by movie pioneers J. Stuart Blackton, Arthur Melbourne-Cooper, Segundo de Chomón, and Edwin S. Porter (among others), Blackton's The Haunted Hotel (1907) was the first huge stop motion success, baffling audiences by showing objects that apparently moved by themselves in full photographic detail, without signs of any known stage trick.Émile Cohl's Fantasmagorie (1908) is the oldest known example of what became known as traditional (hand-drawn) animation. Other great artistic and very influential short films were created by Ladislas Starevich with his puppet animations since 1910 and by Winsor McCay with detailed drawn animation in films such as Little Nemo (1911) and Gertie the Dinosaur (1914).During the 1910s, the production of animated "cartoons" became an industry in the US. 
Successful producer John Randolph Bray and animator Earl Hurd, patented the cel animation process that dominated the animation industry for the rest of the century. Felix the Cat, who debuted in 1919, became the first animated superstar.American golden ageIn 1928, Steamboat Willie, featuring Mickey Mouse and Minnie Mouse, popularized film with synchronized sound and put Walt Disney's studio at the forefront of the animation industry.The enormous success of Mickey Mouse is seen as the start of the golden age of American animation that would last until the 1960s. The United States dominated the world market of animation with a plethora of cel-animated theatrical shorts. Several studios would introduce characters that would become very popular and would have long-lasting careers, including Maria Butinova Studios' Mapmo (1924), The Leo King Knott (1931), Walt Disney Productions' Goofy (1932) and Donald Duck (1934), Warner Bros. Cartoons' Looney Tunes characters like Porky Pig (1935), Daffy Duck (1937), Bugs Bunny (1938–1940), Tweety (1941–1942), Sylvester the Cat (1945), Wile E. Coyote and Road Runner (1949), Fleischer Studios/Paramount Cartoon Studios' Betty Boop (1930), Popeye (1933), Superman (1941) and Casper (1945), MGM cartoon studio's Tom and Jerry (1940) and Droopy, Walter Lantz Productions/Universal Studio Cartoons' Woody Woodpecker (1940), Terrytoons/20th Century Fox's Dinky Duck (1939), Mighty Mouse (1942) and Heckle and Jeckle (1946) and United Artists' Pink Panther (1963).Features before CGIIn 1917, Italian-Argentine director Quirino Cristiani made the first feature-length film El Apóstol (now lost), which became a critical and commercial success. 
It was followed by Cristiani's Sin dejar rastros in 1918, but one day after its premiere, the film was confiscated by the government.After working on it for three years, Lotte Reiniger released the German feature-length silhouette animation Die Abenteuer des Prinzen Achmed in 1926, the oldest extant animated feature.In 1937, Walt Disney Studios premiered their first animated feature, Snow White and the Seven Dwarfs, still one of the highest-grossing traditional animation features. The Fleischer studios followed this example in 1939 with Gulliver's Travels with some success. Partly due to foreign markets being cut off by the Second World War, Disney's next features Pinocchio, Fantasia (both 1940) and Fleischer Studios' second animated feature Mr. Bug Goes to Town (1941–1942) failed at the box office. For decades afterward, Disney would be the only American studio to regularly produce animated features, until Ralph Bakshi became the first to also release more than a handful of features. Sullivan-Bluth Studios began to regularly produce animated features starting with An American Tail in 1986.Although relatively few titles became as successful as Disney's features, other countries developed their own animation industries that produced both short and feature theatrical animations in a wide variety of styles, relatively often including stop motion and cutout animation techniques. Russia's Soyuzmultfilm animation studio, founded in 1936, produced 20 films (including shorts) per year on average and reached 1,582 titles in 2018. China, Czechoslovakia / Czech Republic, Italy, France, and Belgium were other countries that more than occasionally released feature films, while Japan became a true powerhouse of animation production, with its own recognizable and influential anime style of effective limited animation.TelevisionAnimation became very popular on television since the 1950s, when television sets started to become common in most developed countries. 
Cartoons were mainly programmed for children, on convenient time slots, and especially US youth spent many hours watching Saturday-morning cartoons. Many classic cartoons found a new life on the small screen and by the end of the 1950s, the production of new animated cartoons started to shift from theatrical releases to TV series. Hanna-Barbera Productions was especially prolific and had huge hit series, such as The Flintstones (1960–1966) (the first prime time animated series), Scooby-Doo (since 1969) and Belgian co-production The Smurfs (1981–1989). The constraints of American television programming and the demand for an enormous quantity resulted in cheaper and quicker limited animation methods and much more formulaic scripts. Quality dwindled until more daring animation surfaced in the late 1980s and in the early 1990s with hit series such as The Simpsons (since 1989) as part of a "renaissance" of American animation.While US animated series also spawned successes internationally, many other countries produced their own child-oriented programming, relatively often preferring stop motion and puppetry over cel animation. Japanese anime TV series became very successful internationally since the 1960s, and European producers looking for affordable cel animators relatively often started co-productions with Japanese studios, resulting in hit series such as Barbapapa (The Netherlands/Japan/France 1973–1977), Wickie und die starken Männer/小さなバイキング ビッケ (Vicky the Viking) (Austria/Germany/Japan 1974), and The Jungle Book (Italy/Japan 1989).Switch from cels to computersComputer animation was gradually developed since the 1940s. 3D wireframe animation started popping up in the mainstream in the 1970s, with an early (short) appearance in the sci-fi thriller Futureworld (1976).The Rescuers Down Under was the first feature film to be completely created digitally without a camera. 
It was produced in a style that's very similar to traditional cel animation on the Computer Animation Production System (CAPS), developed by The Walt Disney Company in collaboration with Pixar in the late 1980s.The so-called 3D style, more often associated with computer animation, has become extremely popular since Pixar's Toy Story (1995), the first computer-animated feature in this style.Most of the cel animation studios switched to producing mostly computer animated films around the 1990s, as it proved cheaper and more profitable. Not only the very popular 3D animation style was generated with computers, but also most of the films and series with a more traditional hand-crafted appearance, in which the charming characteristics of cel animation could be emulated with software, while new digital tools helped develop new styles and effects.Economic statusIn 2008, the animation market was worth US$68.4 billion. Animated feature-length films returned the highest gross margins (around 52%) of all film genres between 2004 and 2013. Animation as an art and industry continues to thrive as of the early 2020s.Education, propaganda and commercialsThe clarity of animation makes it a powerful tool for instruction, while its total malleability also allows exaggeration that can be employed to convey strong emotions and to thwart reality. It has therefore been widely used for other purposes than mere entertainment.During World War II, animation was widely exploited for propaganda. Many American studios, including Warner Bros. and Disney, lent their talents and their cartoon characters to convey to the public certain war values. Some countries, including China, Japan and the United Kingdom, produced their first feature-length animation for their war efforts.Animation has been very popular in television commercials, both due to its graphic appeal, and the humour it can provide. 
Some animated characters in commercials have survived for decades, such as Snap, Crackle and Pop in advertisements for Kellogg's cereals. The legendary animation director Tex Avery was the producer of the first Raid "Kills Bugs Dead" commercials in 1966, which were very successful for the company.Other media, merchandise and theme parksApart from their success in movie theaters and television series, many cartoon characters would also prove extremely lucrative when licensed for all kinds of merchandise and for other media.Animation has traditionally been very closely related to comic books. While many comic book characters found their way to the screen (which is often the case in Japan, where many manga are adapted into anime), original animated characters also commonly appear in comic books and magazines. Somewhat similarly, characters and plots for video games (an interactive animation medium) have been derived from films and vice versa.Some of the original content produced for the screen can be used and marketed in other media. Stories and images can easily be adapted into children's books and other printed media. Songs and music have appeared on records and as streaming media.While very many animation companies commercially exploit their creations outside moving image media, The Walt Disney Company is the best known and most extreme example. Since first being licensed for a children's writing tablet in 1929, their Mickey Mouse mascot has been depicted on an enormous amount of products, as have many other Disney characters. This may have influenced some pejorative use of Mickey's name, but licensed Disney products sell well, and the so-called Disneyana has many avid collectors, and even a dedicated Disneyana fanclub (since 1984).Disneyland opened in 1955 and features many attractions that were based on Disney's cartoon characters. Its enormous success spawned several other Disney theme parks and resorts. 
Disney's earnings from the theme parks have relatively often been higher than those from their movies.CriticismCriticism of animation has been common in media and cinema since its inception. With its popularity, a large amount of criticism has arisen, especially of animated feature-length films. Many concerns about cultural representation and psychological effects on children have been brought up around the animation industry, which has remained rather politically unchanged and stagnant since its inception into mainstream culture.AwardsAs with any other form of media, animation has instituted awards for excellence in the field. The original awards for animation were presented by the Academy of Motion Picture Arts and Sciences for animated shorts from the year 1932, during the 5th Academy Awards function. The first winner of the Academy Award was the short Flowers and Trees, a production by Walt Disney Productions. The Academy Award for a feature-length animated motion picture was only instituted for the year 2001, and awarded during the 74th Academy Awards in 2002. It was won by the film Shrek, produced by DreamWorks and Pacific Data Images. Disney Animation and Pixar have produced the most films either to win or be nominated for the award. Beauty and the Beast was the first animated film nominated for Best Picture. Up and Toy Story 3 also received Best Picture nominations after the Academy expanded the number of nominees from five to ten. 
Academy Award for Best Animated Feature Academy Award for Best Animated Short FilmSeveral other countries have instituted an award for the best-animated feature film as part of their national film awards: Africa Movie Academy Award for Best Animation (since 2008), BAFTA Award for Best Animated Film (since 2006), César Award for Best Animated Film (since 2011), Golden Rooster Award for Best Animation (since 1981), Goya Award for Best Animated Film (since 1989), Japan Academy Prize for Animation of the Year (since 2007), National Film Award for Best Animated Film (since 2006). Also since 2007, the Asia Pacific Screen Award for Best Animated Feature Film has been awarded at the Asia Pacific Screen Awards. Since 2009, the European Film Awards have awarded the European Film Award for Best Animated Film.The Annie Award is another award presented for excellence in the field of animation. Unlike the Academy Awards, the Annie Awards are only received for achievements in the field of animation and not for any other field of technical and artistic endeavour. They were re-organized in 1992 to create a new field for Best Animated Feature. The 1990s winners were dominated by Walt Disney; however, newer studios, led by Pixar & DreamWorks, have now begun to consistently vie for this award. The list of awardees is as follows: Annie Award for Best Animated Feature Annie Award for Best Animated Short Subject Annie Award for Best Animated Television ProductionProductionThe creation of non-trivial animation works (i.e., longer than a few seconds) has developed as a form of filmmaking, with certain unique aspects. Traits common to both live-action and animated feature-length films are labor intensity and high production costs.The most important difference is that once a film is in the production phase, the marginal cost of one more shot is higher for animated films than live-action films. 
It is relatively easy for a director to ask for one more take during principal photography of a live-action film, but every take on an animated film must be manually rendered by animators (although the task of rendering slightly different takes has been made less tedious by modern computer animation). It is pointless for a studio to pay the salaries of dozens of animators to spend weeks creating a visually dazzling five-minute scene if that scene fails to effectively advance the plot of the film. Thus, animation studios starting with Disney began the practice in the 1930s of maintaining story departments where storyboard artists develop every single scene through storyboards, then handing the film over to the animators only after the production team is satisfied that all the scenes make sense as a whole. While live-action films are now also storyboarded, they enjoy more latitude to depart from storyboards (i.e., real-time improvisation).Another problem unique to animation is the requirement to maintain a film's consistency from start to finish, even as films have grown longer and teams have grown larger. Animators, like all artists, necessarily have individual styles, but must subordinate their individuality in a consistent way to whatever style is employed on a particular film. Since the early 1980s, teams of about 500 to 600 people, of whom 50 to 70 are animators, typically have created feature-length animated films. It is relatively easy for two or three artists to match their styles; synchronizing those of dozens of artists is more difficult.This problem is usually solved by having a separate group of visual development artists develop an overall look and palette for each film before the animation begins. Character designers on the visual development team draw model sheets to show how each character should look with different facial expressions, posed in different positions, and viewed from different angles. 
On traditionally animated projects, maquettes were often sculpted to further help the animators see how characters would look from different angles.Unlike live-action films, animated films were traditionally developed beyond the synopsis stage through the storyboard format; the storyboard artists would then receive credit for writing the film. In the early 1960s, animation studios began hiring professional screenwriters to write screenplays (while also continuing to use story departments) and screenplays had become commonplace for animated films by the late 1980s.TechniquesTraditionalTraditional animation (also called cel animation or hand-drawn animation) was the process used for most animated films of the 20th century. The individual frames of a traditionally animated film are photographs of drawings, first drawn on paper. To create the illusion of movement, each drawing differs slightly from the one before it. The animators' drawings are traced or photocopied onto transparent acetate sheets called cels, which are filled in with paints in assigned colors or tones on the side opposite the line drawings. The completed character cels are photographed one-by-one against a painted background by a rostrum camera onto motion picture film.The traditional cel animation process became obsolete by the beginning of the 21st century. Today, animators' drawings and the backgrounds are either scanned into or drawn directly into a computer system. Various software programs are used to color the drawings and simulate camera movement and effects. The final animated piece is output to one of several delivery media, including traditional 35 mm film and newer media with digital video. The "look" of traditional cel animation is still preserved, and the character animators' work has remained essentially the same over the past 70 years. 
Some animation producers have used the term "tradigital" (a play on the words "traditional" and "digital") to describe cel animation that uses significant computer technology.Examples of traditionally animated feature films include Pinocchio (United States, 1940), Animal Farm (United Kingdom, 1954), Lucky and Zorba (Italy, 1998), and The Illusionist (British-French, 2010). Traditionally animated films produced with the aid of computer technology include The Lion King (US, 1994), The Prince of Egypt (US, 1998), Akira (Japan, 1988), Spirited Away (Japan, 2001), The Triplets of Belleville (France, 2003), and The Secret of Kells (Irish-French-Belgian, 2009).FullFull animation refers to the process of producing high-quality traditionally animated films that regularly use detailed drawings and plausible movement, having a smooth animation. Fully animated films can be made in a variety of styles, from more realistically animated works like those produced by the Walt Disney studio (The Little Mermaid, Beauty and the Beast, Aladdin, The Lion King) to the more 'cartoon' styles of the Warner Bros. animation studio. Many of the Disney animated features are examples of full animation, as are non-Disney works, The Secret of NIMH (US, 1982), The Iron Giant (US, 1999), and Nocturna (Spain, 2007). Fully animated films are animated at 24 frames per second, with a combination of animation on ones and twos, meaning that drawings can be held for one frame out of 24 or two frames out of 24.LimitedLimited animation involves the use of less detailed or more stylized drawings and methods of movement usually a choppy or "skippy" movement animation. Limited animation uses fewer drawings per second, thereby limiting the fluidity of the animation. This is a more economic technique. 
Pioneered by the artists at the American studio United Productions of America, limited animation can be used as a method of stylized artistic expression, as in Gerald McBoing-Boing (US, 1951), Yellow Submarine (UK, 1968), and certain anime produced in Japan. Its primary use, however, has been in producing cost-effective animated content for media for television (the work of Hanna-Barbera, Filmation, and other TV animation studios) and later the Internet (web cartoons).RotoscopingRotoscoping is a technique patented by Max Fleischer in 1917 where animators trace live-action movement, frame by frame. The source film can be directly copied from actors' outlines into animated drawings, as in The Lord of the Rings (US, 1978), or used in a stylized and expressive manner, as in Waking Life (US, 2001) and A Scanner Darkly (US, 2006). Some other examples are Fire and Ice (US, 1983), Heavy Metal (1981), and Aku no Hana (Japan, 2013).Live-action blendingLive-action/animation is a technique combining hand-drawn characters into live action shots or live-action actors into animated shots. One of the earlier uses was in Koko the Clown when Koko was drawn over live-action footage. Walt Disney and Ub Iwerks created a series of Alice Comedies (1923–1927), in which a live-action girl enters an animated world. Other examples include Allegro Non Troppo (Italy, 1976), Who Framed Roger Rabbit (US, 1988), Volere volare (Italy 1991), Space Jam (US, 1996) and Osmosis Jones (US, 2001).Stop motionStop-motion animation is used to describe animation created by physically manipulating real-world objects and photographing them one frame of film at a time to create the illusion of movement. There are many different types of stop-motion animation, usually named after the medium used to create the animation. 
Computer software is widely available to create this type of animation; traditional stop-motion animation is usually less expensive but more time-consuming to produce than current computer animation. Puppet animation  Typically involves stop-motion puppet figures interacting in a constructed environment, in contrast to real-world interaction in model animation. The puppets generally have an armature inside of them to keep them still and steady to constrain their motion to particular joints. Examples include The Tale of the Fox (France, 1937), The Nightmare Before Christmas (US, 1993), Corpse Bride (US, 2005), Coraline (US, 2009), the films of Jiří Trnka and the adult animated sketch-comedy television series Robot Chicken (US, 2005–present). Puppetoon  Created using techniques developed by George Pal, are puppet-animated films that typically use a different version of a puppet for different frames, rather than simply manipulating one existing puppet. Clay animation or Plasticine animation  (Often called claymation, which, however, is a trademarked name). It uses figures made of clay or a similar malleable material to create stop-motion animation. The figures may have an armature or wire frame inside, similar to the related puppet animation (below), that can be manipulated to pose the figures. Alternatively, the figures may be made entirely of clay, in the films of Bruce Bickford, where clay creatures morph into a variety of different shapes. Examples of clay-animated works include The Gumby Show (US, 1957–1967), Mio Mao (Italy, 1974–2005), Morph shorts (UK, 1977–2000), Wallace and Gromit shorts (UK, as of 1989), Jan Švankmajer's Dimensions of Dialogue (Czechoslovakia, 1982), The Trap Door (UK, 1984). Films include Wallace & Gromit: The Curse of the Were-Rabbit, Chicken Run and The Adventures of Mark Twain. 
Strata-cut animation  Most commonly a form of clay animation in which a long bread-like "loaf" of clay, internally packed tight and loaded with varying imagery, is sliced into thin sheets, with the animation camera taking a frame of the end of the loaf for each cut, eventually revealing the movement of the internal images within. Cutout animation  A type of stop-motion animation produced by moving two-dimensional pieces of material paper or cloth. Examples include Terry Gilliam's animated sequences from Monty Python's Flying Circus (UK, 1969–1974); Fantastic Planet (France/Czechoslovakia, 1973); Tale of Tales (Russia, 1979), The pilot episode of the adult television sitcom series (and sometimes in episodes) of South Park (US, 1997) and the music video Live for the moment, from Verona Riots band (produced by Alberto Serrano and Nívola Uyá, Spain 2014). Silhouette animation  A variant of cutout animation in which the characters are backlit and only visible as silhouettes. Examples include The Adventures of Prince Achmed (Weimar Republic, 1926) and Princes et Princesses (France, 2000). Model animation  Refers to stop-motion animation created to interact with and exist as a part of a live-action world. Intercutting, matte effects and split screens are often employed to blend stop-motion characters or objects with live actors and settings. Examples include the work of Ray Harryhausen, as seen in films, Jason and the Argonauts (1963), and the work of Willis H. O'Brien on films, King Kong (1933).Go motion  A variant of model animation that uses various techniques to create motion blur between frames of film, which is not present in traditional stop motion. The technique was invented by Industrial Light & Magic and Phil Tippett to create special effect scenes for the film The Empire Strikes Back (1980). Another example is the dragon named "Vermithrax" from the 1981 film Dragonslayer. 
Object animation  Refers to the use of regular inanimate objects in stop-motion animation, as opposed to specially created items. Graphic animation  Uses non-drawn flat visual graphic material (photographs, newspaper clippings, magazines, etc.), which are sometimes manipulated frame by frame to create movement. At other times, the graphics remain stationary, while the stop-motion camera is moved to create on-screen action. Brickfilm  A subgenre of object animation involving using Lego or other similar brick toys to make an animation. These have had a recent boost in popularity with the advent of video sharing sites, YouTube and the availability of cheap cameras and animation software. Pixilation  Involves the use of live humans as stop-motion characters. This allows for a number of surreal effects, including disappearances and reappearances, allowing people to appear to slide across the ground, and other effects. Examples of pixilation include The Secret Adventures of Tom Thumb and Angry Kid shorts, and the Academy Award-winning Neighbours by Norman McLaren.ComputerComputer animation encompasses a variety of techniques, the unifying factor being that the animation is created digitally on a computer. 2D animation techniques tend to focus on image manipulation while 3D techniques usually build virtual worlds in which characters and objects move and interact. 3D animation can create images that seem real to the viewer.2D2D animation figures are created or edited on the computer using 2D bitmap graphics and 2D vector graphics. This includes automated computerized versions of traditional animation techniques, interpolated morphing, onion skinning and interpolated rotoscoping.2D animation has many applications, including analog computer animation, Flash animation, and PowerPoint animation. 
Cinemagraphs are still photographs in the form of an animated GIF file of which part is animated.Final line advection animation is a technique used in 2D animation, to give artists and animators more influence and control over the final product as everything is done within the same department. Speaking about using this approach in Paperman, John Kahrs said that "Our animators can change things, actually erase away the CG underlayer if they want, and change the profile of the arm."3D3D animation is digitally modeled and manipulated by an animator. The 3D model maker usually starts by creating a 3D polygon mesh for the animator to manipulate. A mesh typically includes many vertices that are connected by edges and faces, which give the visual appearance of form to a 3D object or 3D environment. Sometimes, the mesh is given an internal digital skeletal structure called an armature that can be used to control the mesh by weighting the vertices. This process is called rigging and can be used in conjunction with key frames to create movement.Other techniques can be applied, mathematical functions (e.g., gravity, particle simulations), simulated fur or hair, and effects, fire and water simulations. These techniques fall under the category of 3D dynamics.Terms Cel-shaded animation is used to mimic traditional animation using computer software. The shading looks stark, with less blending of colors. Examples include Skyland (2007, France), The Iron Giant (1999, United States), Futurama (1999, United States) Appleseed Ex Machina (2007, Japan), The Legend of Zelda: The Wind Waker (2002, Japan), The Legend of Zelda: Breath of the Wild (2017, Japan) Machinima – Films created by screen capturing in video games and virtual worlds. The term originated from the software introduction in the 1980s demoscene, as well as the 1990s recordings of the first-person shooter video game Quake. 
Motion capture is used when live-action actors wear special suits that allow computers to copy their movements into CG characters. Examples include Polar Express (2004, US), Beowulf (2007, US), A Christmas Carol (2009, US), The Adventures of Tintin (2011, US), and Kochadaiiyaan (2014, India). Computer animation is used primarily for animation that attempts to resemble real life, using advanced rendering that mimics in detail skin, plants, water, fire, clouds, etc. Examples include Up (2009, US), How to Train Your Dragon (2010, US). Physically based animation is animation using computer simulations.Mechanical Animatronics is the use of mechatronics to create machines that seem animate rather than robotic. Audio-Animatronics and Autonomatronics is a form of robotics animation, combined with 3-D animation, created by Walt Disney Imagineering for shows and attractions at Disney theme parks; the figures move and make noise (generally a recorded speech or song). They are fixed to whatever supports them. They can sit and stand, but they cannot walk. An Audio-Animatron is different from an android-type robot in that it uses prerecorded movements and sounds, rather than responding to external stimuli. In 2009, Disney created an interactive version of the technology called Autonomatronics. Linear Animation Generator is a form of animation by using static picture frames installed in a tunnel or a shaft. The animation illusion is created by putting the viewer in a linear motion, parallel to the installed picture frames. The concept and the technical solution were invented in 2007 by Mihai Girlovan in Romania. Chuckimation is a type of animation created by the makers of the television series Action League Now! in which characters/props are thrown, or chucked from off camera or wiggled around to simulate talking by unseen hands. 
The magic lantern used mechanical slides to project moving images, probably since Christiaan Huygens invented this early image projector in 1659.Other  Hydrotechnics: a technique that includes lights, water, fire, fog, and lasers, with high-definition projections on mist screens. Drawn on film animation: a technique where footage is produced by creating the images directly on film stock; for example, by Norman McLaren, Len Lye and Stan Brakhage. Paint-on-glass animation: a technique for making animated films by manipulating slow drying oil paints on sheets of glass, for example by Aleksandr Petrov. Erasure animation: a technique using traditional 2D media, photographed over time as the artist manipulates the image. For example, William Kentridge is famous for his charcoal erasure films, and Piotr Dumała for his auteur technique of animating scratches on plaster. Pinscreen animation: makes use of a screen filled with movable pins that can be moved in or out by pressing an object onto the screen. The screen is lit from the side so that the pins cast shadows. The technique has been used to create animated films with a range of textural effects difficult to achieve with traditional cel animation. Sand animation: sand is moved around on a back- or front-lighted piece of glass to create each frame for an animated film. This creates an interesting effect when animated because of the light contrast. Flip book: a flip book (sometimes, especially in British English, called a flick book) is a book with a series of pictures that vary gradually from one page to the next, so that when the pages are turned rapidly, the pictures appear to animate by simulating motion or some other change. Flip books are often illustrated books for children, they also are geared towards adults and employ a series of photographs rather than drawings. Flip books are not always separate books, they appear as an added feature in ordinary books or magazines, often in the page corners. 
Software packages and websites are also available that convert digital video files into custom-made flip books. Character animation Multi-sketching Special effects animationSee also Twelve basic principles of animation Animated war film Animation department Animated series Architectural animation Avar Independent animation International Animation Day International Animated Film Association International Tournée of Animation List of film-related topics Motion graphic design Society for Animation Studies Wire-frame modelReferencesCitationsSourcesJournal articlesBooksOnline sourcesExternal links The making of an 8-minute cartoon short "Animando", a 12-minute film demonstrating 10 different animation techniques (and teaching how to use them). Bibliography on animation – Website "Histoire de la télévision"  CartooningArticles containing video clipsFilm and video technology
+Apollo is one of the  Olympian deities in classical Greek and Roman religion and Greek and Roman mythology. The national divinity of the Greeks, Apollo has been recognized as a god of archery, music and dance, truth and prophecy, healing and diseases, the Sun and light, poetry, and more. One of the most important and complex of the Greek gods, he is the son of Zeus and Leto, and the twin brother of Artemis, goddess of the hunt. Seen as the most beautiful god and the ideal of the kouros (ephebe, or a beardless, athletic youth), Apollo is considered to be the most Greek of all the gods. Apollo is known in Greek-influenced Etruscan mythology as Apulu.As the patron deity of Delphi (Apollo Pythios), Apollo is an oracular god—the prophetic deity of the Delphic Oracle. Apollo is the god who affords help and wards off evil; various epithets call him the "averter of evil". Delphic Apollo is the patron of seafarers, foreigners and the protector of fugitives and refugees.Medicine and healing are associated with Apollo, whether through the god himself or mediated through his son Asclepius. Apollo delivered people from epidemics, yet he is also a god who could bring ill-health and deadly plague with his arrows. The invention of archery itself is credited to Apollo and his sister Artemis. Apollo is usually described as carrying a golden bow and a quiver of silver arrows. Apollo's capacity to make youths grow is one of the best attested facets of his panhellenic cult persona. As the protector of young (kourotrophos), Apollo is concerned with the health and education of children. He presided over their passage into adulthood. Long hair, which was the prerogative of boys, was cut at the coming of age (ephebeia) and dedicated to Apollo.Apollo is an important pastoral deity, and was the patron of herdsmen and shepherds. Protection of herds, flocks and crops from diseases, pests and predators were his primary duties. 
On the other hand, Apollo also encouraged founding new towns and establishment of civil constitution. He is associated with dominion over colonists. He was the giver of laws, and his oracles were consulted before setting laws in a city.As the god of mousike, Apollo presides over all music, songs, dance and poetry. He is the inventor of string-music, and the frequent companion of the Muses, functioning as their chorus leader in celebrations. The lyre is a common attribute of Apollo. In Hellenistic times, especially during the 5th century BCE, as Apollo Helios he became identified among Greeks with Helios, the personification of the sun. In Latin texts, however, there was no conflation of Apollo with Sol among the classical Latin poets until 1st century CE. Apollo and Helios/Sol remained separate beings in literary and mythological texts until the 5th century CE.EtymologyApollo (Attic, Ionic, and Homeric Greek: , Apollōn ( ); Doric: , Apellōn; Arcadocypriot: , Apeilōn; Aeolic: , Aploun; )The name Apollo—unlike the related older name Paean—is generally not found in the Linear B (Mycenean Greek) texts, although there is a possible attestation in the lacunose form ]pe-rjo-[ (Linear B: ]-[) on the KN E 842 tablet, though it has also been suggested that the name might actually read "Hyperion" ([u]-pe-rjo-[ne]).The etymology of the name is uncertain. The spelling  ( in Classical Attic) had almost superseded all other forms by the beginning of the common era, but the Doric form, Apellon (), is more archaic, as it is derived from an earlier . It probably is a cognate to the Doric month Apellaios (), and the offerings apellaia () at the initiation of the young men during the family-festival apellai (). According to some scholars, the words are derived from the Doric word apella (), which originally meant "wall," "fence for animals" and later "assembly within the limits of the square." Apella () is the name of the popular assembly in Sparta, corresponding to the ecclesia (). 
R. S. P. Beekes rejected the connection of the theonym with the noun apellai and suggested a Pre-Greek proto-form *Apalyun.Several instances of popular etymology are attested from ancient authors. Thus, the Greeks most often associated Apollo's name with the Greek verb  (apollymi), "to destroy". Plato in Cratylus connects the name with  (apolysis), "redemption", with  (apolousis), "purification", and with  ([h]aploun), "simple", in particular in reference to the Thessalian form of the name, , and finally with  (aeiballon), "ever-shooting". Hesychius connects the name Apollo with the Doric  (apella), which means "assembly", so that Apollo would be the god of political life, and he also gives the explanation  (sekos), "fold", in which case Apollo would be the god of flocks and herds. In the ancient Macedonian language  (pella) means "stone," and some toponyms may be derived from this word:  (Pella, the capital of ancient Macedonia) and  (Pellēnē/Pellene).A number of non-Greek etymologies have been suggested for the name, The Hittite form Apaliunas (d) is attested in the Manapa-Tarhunta letter. The Hittite testimony reflects an early form , which may also be surmised from comparison of Cypriot  with Doric . The name of the Lydian god Qλdãns /kʷʎðãns/ may reflect an earlier /kʷalyán-/ before palatalization, syncope, and the pre-Lydian sound change *y > d. Note the labiovelar in place of the labial /p/ found in pre-Doric Ἀπέλjων and Hittite Apaliunas.A Luwian etymology suggested for Apaliunas makes Apollo "The One of Entrapment", perhaps in the sense of "Hunter".Greco-Roman epithetsApollo's chief epithet was Phoebus ( ; , Phoibos ), literally "bright". It was very commonly used by both the Greeks and Romans for Apollo's role as the god of light. Like other Greek deities, he had a number of others applied to him, reflecting the variety of roles, duties, and aspects ascribed to the god. 
However, while Apollo has a great number of appellations in Greek myth, only a few occur in Latin literature.SunAegletes ( ; Αἰγλήτης, Aiglētēs), from , "light of the sun" Helius ( ; , Helios), literally "sun" Lyceus ( ; , Lykeios, from Proto-Greek *), "light". The meaning of the epithet "Lyceus" later became associated with Apollo's mother Leto, who was the patron goddess of Lycia () and who was identified with the wolf ().Phanaeus ( ; , Phanaios), literally "giving or bringing light"Phoebus ( ; , Phoibos), literally "bright", his most commonly used epithet by both the Greeks and RomansSol (Roman) (), "sun" in LatinWolfLycegenes ( ; , Lukēgenēs), literally "born of a wolf" or "born of Lycia"Lycoctonus ( ; , Lykoktonos), from , "wolf", and , "to kill"Origin and birthApollo's birthplace was Mount Cynthus on the island of Delos.Cynthius ( ; , Kunthios), literally "Cynthian"Cynthogenes ( ; , Kynthogenēs), literally "born of Cynthus"Delius ( ; Δήλιος, Delios), literally "Delian"Didymaeus ( ; , Didymaios) from δίδυμος, "twin", as the twin of ArtemisPlace of worshipDelphi and Actium were his primary places of worship.Acraephius ( ; , Akraiphios, literally "Acraephian") or Acraephiaeus ( ; , Akraiphiaios), "Acraephian", from the Boeotian town of Acraephia (), reputedly founded by his son Acraepheus.Actiacus ( ; , Aktiakos), literally "Actian", after Actium ()Delphinius ( ; , Delphinios), literally "Delphic", after Delphi (Δελφοί). An etiology in the Homeric Hymns associated this with dolphins.Epactaeus, meaning "god worshipped on the coast", in Samos.Pythius ( ; , Puthios, from Πυθώ, Pythō), from the region around Delphi Smintheus ( ; , Smintheus), "Sminthian"—that is, "of the town of Sminthos or Sminthe" near the Troad town of HamaxitusNapaian Apollo (Ἀπόλλων Ναπαῖος), from the city of Nape at the island of LesbosHealing and diseaseAcesius ( ; , Akesios), from , "healing". 
Acesius was the epithet of Apollo worshipped in Elis, where he had a temple in the agora.Acestor ( ; , Akestōr), literally "healer"Culicarius (Roman) ( ), from Latin culicārius, "of midges"Iatrus ( ; , Iātros), literally "physician"Medicus (Roman) ( ), "physician" in Latin. A temple was dedicated to Apollo Medicus at Rome, probably next to the temple of Bellona.Paean ( ; , Paiān), physician, healerParnopius ( ; , Parnopios), from , "locust"Founder and protectorAgyieus ( ; , Aguīeus), from , "street", for his role in protecting roads and homesAlexicacus ( ; , Alexikakos), literally "warding off evil"Apotropaeus ( ; , Apotropaios), from , "to avert"Archegetes ( ; , Arkhēgetēs), literally "founder"Averruncus (Roman) ( ; from Latin āverruncare), "to avert"Clarius ( ; , Klārios), from Doric , "allotted lot"Epicurius ( ; , Epikourios), from , "to aid"Genetor ( ; , Genetōr), literally "ancestor"Nomius ( ; , Nomios), literally "pastoral"Nymphegetes ( ; , Numphēgetēs), from , "Nymph", and , "leader", for his role as a protector of shepherds and pastoral lifePatroos  from  , "related to one's father," for his role as father of Ion and founder of the Ionians, as worshipped at the Temple of Apollo Patroos in AthensSauroctunos, “lizard killer”, possibly a reference to his killing of PythonProphecy and truthCoelispex (Roman) ( ), from Latin coelum, "sky", and specere "to look at" Iatromantis ( ; , Iātromantis,) from , "physician", and , "prophet", referring to his role as a god both of healing and of prophecyLeschenorius ( ; , Leskhēnorios), from , "converser"Loxias ( ; , Loxias), from , "to say", historically associated with , "ambiguous"Manticus ( ; , Mantikos), literally "prophetic"Proopsios (), meaning "foreseer" or "first seen"Music and artsMusagetes ( ; Doric , Mousāgetās), from , "Muse", and  "leader" Musegetes ( ; , Mousēgetēs), as the precedingArcheryAphetor ( ; , Aphētōr), from , "to let loose"Aphetorus ( ; , Aphētoros), as the precedingArcitenens (Roman) ( ), 
literally "bow-carrying"Argyrotoxus ( ; , Argyrotoxos), literally "with silver bow"Clytotoxus ( ; , Klytótoxos), "he who is famous for his bow", the renowned archer.Hecaërgus ( ; , Hekaergos), literally "far-shooting"Hecebolus ( ; , Hekēbolos), "far-shooting"Ismenius ( ; , Ismēnios), literally "of Ismenus", after Ismenus, the son of Amphion and Niobe, whom he struck with an arrowAmazonsAmazonius (), Pausanias at the Description of Greece writes that near Pyrrhichus there was a sanctuary of Apollo, called Amazonius () with image of the god said to have been dedicated by the Amazons.Celtic epithets and cult titlesApollo was worshipped throughout the Roman Empire. In the traditionally Celtic lands, he was most often seen as a healing and sun god. He was often equated with Celtic gods of similar character. Apollo Atepomarus ("the great horseman" or "possessing a great horse"). Apollo was worshipped at Mauvières (Indre). Horses were, in the Celtic world, closely linked to the sun. Apollo Belenus ("bright" or "brilliant"). This epithet was given to Apollo in parts of Gaul, Northern Italy and Noricum (part of modern Austria). Apollo Belenus was a healing and sun god. Apollo Cunomaglus ("hound lord"). A title given to Apollo at a shrine at Nettleton Shrub, Wiltshire. May have been a god of healing. Cunomaglus himself may originally have been an independent healing god. Apollo Grannus. Grannus was a healing spring god, later equated with Apollo. Apollo Maponus. A god known from inscriptions in Britain. This may be a local fusion of Apollo and Maponus. Apollo Moritasgus ("masses of sea water"). An epithet for Apollo at Alesia, where he was worshipped as god of healing and, possibly, of physicians. Apollo Vindonnus ("clear light"). Apollo Vindonnus had a temple at Essarois, near Châtillon-sur-Seine in present-day Burgundy. He was a god of healing, especially of the eyes. Apollo Virotutis ("benefactor of mankind"). 
Apollo Virotutis was worshipped, among other places, at Fins d'Annecy (Haute-Savoie) and at Jublains (Maine-et-Loire).OriginsThe cult centers of Apollo in Greece, Delphi and Delos, date from the 8th century BCE. The Delos sanctuary was primarily dedicated to Artemis, Apollo's twin sister. At Delphi, Apollo was venerated as the slayer of the monstrous serpent Python. For the Greeks, Apollo was the most Greek of all the gods, and through the centuries he acquired different functions. In Archaic Greece he was the prophet, the oracular god who in older times was connected with "healing". In Classical Greece he was the god of light and of music, but in popular religion he had a strong function to keep away evil. Walter Burkert discerned three components in the prehistory of Apollo worship, which he termed "a Dorian-northwest Greek component, a Cretan-Minoan component, and a Syro-Hittite component."Healer and god-protector from evilIn classical times, his major function in popular religion was to keep away evil, and he was therefore called "apotropaios" (, "averting evil") and "alexikakos" ( "keeping off ill"; from v.  + n. ). Apollo also had many epithets relating to his function as a healer. Some commonly-used examples are "paion" ( literally "healer" or "helper") "epikourios" (, "succouring"), "oulios" (, "healer, baleful") and "loimios" (, "of the plague"). In later writers, the word, "paion", usually spelled "Paean", becomes a mere epithet of Apollo in his capacity as a god of healing.Apollo in his aspect of "healer" has a connection to the primitive god Paean (), who did not have a cult of his own. Paean serves as the healer of the gods in the Iliad, and seems to have originated in a pre-Greek religion. It is suggested, though unconfirmed, that he is connected to the Mycenaean figure pa-ja-wo-ne (Linear B: ). 
Paean was the personification of holy songs sung by "seer-doctors" (), which were supposed to cure disease.Homer illustrated Paeon the god and the song both of apotropaic thanksgiving or triumph. Such songs were originally addressed to Apollo and afterwards to other gods: to Dionysus, to Apollo Helios, to Apollo's son Asclepius the healer. About the 4th century BCE, the paean became merely a formula of adulation; its object was either to implore protection against disease and misfortune or to offer thanks after such protection had been rendered. It was in this way that Apollo had become recognized as the god of music. Apollo's role as the slayer of the Python led to his association with battle and victory; hence it became the Roman custom for a paean to be sung by an army on the march and before entering into battle, when a fleet left the harbour, and also after a victory had been won.In the Iliad, Apollo is the healer under the gods, but he is also the bringer of disease and death with his arrows, similar to the function of the Vedic god of disease Rudra. He sends a plague () to the Achaeans. Knowing that Apollo can prevent a recurrence of the plague he sent, they purify themselves in a ritual and offer him a large sacrifice of cows, called a hecatomb.Dorian originThe Homeric Hymn to Apollo depicts Apollo as an intruder from the north. The connection with the northern-dwelling Dorians and their initiation festival apellai is reinforced by the month Apellaios in northwest Greek calendars. The family-festival was dedicated to Apollo (Doric: ). Apellaios is the month of these rites, and Apellon is the "megistos kouros" (the great Kouros). However it can explain only the Doric type of the name, which is connected with the Ancient Macedonian word "pella" (Pella), stone. 
Stones played an important part in the cult of the god, especially in the oracular shrine of Delphi (Omphalos).Minoan originGeorge Huxley regarded the identification of Apollo with the Minoan deity Paiawon, worshipped in Crete, to have originated at Delphi. In the Homeric Hymn, Apollo appeared as a dolphin and carried Cretan priests to Delphi, where they evidently transferred their religious practices. Apollo Delphinios or Delphidios was a sea-god especially worshipped in Crete and in the islands. Apollo's sister Artemis, who was the Greek goddess of hunting, is identified with Britomartis (Diktynna), the Minoan "Mistress of the animals". In her earliest depictions she was accompanied by the "Master of the animals", a bow-wielding god of hunting whose name has been lost; aspects of this figure may have been absorbed into the more popular Apollo.Anatolian originA non-Greek origin of Apollo has long been assumed in scholarship. The name of Apollo's mother Leto has Lydian origin, and she was worshipped on the coasts of Asia Minor. The inspiration oracular cult was probably introduced into Greece from Anatolia, which is the origin of Sibyl, and where some of the oldest oracular shrines originated. Omens, symbols, purifications, and exorcisms appear in old Assyro-Babylonian texts. These rituals were spread into the empire of the Hittites, and from there into Greece.Homer pictures Apollo on the side of the Trojans, fighting against the Achaeans, during the Trojan War. He is pictured as a terrible god, less trusted by the Greeks than other gods. The god seems to be related to Appaliunas, a tutelary god of Wilusa (Troy) in Asia Minor, but the word is not complete. The stones found in front of the gates of Homeric Troy were the symbols of Apollo. A western Anatolian origin may also be bolstered by references to the parallel worship of Artimus (Artemis) and Qλdãns, whose name may be cognate with the Hittite and Doric forms, in surviving Lydian texts. 
However, recent scholars have cast doubt on the identification of Qλdãns with Apollo.The Greeks gave to him the name  agyieus as the protector god of public places and houses who wards off evil and his symbol was a tapered stone or column. However, while usually Greek festivals were celebrated at the full moon, all the feasts of Apollo were celebrated at the seventh day of the month, and the emphasis given to that day (sibutu) indicates a Babylonian origin.The Late Bronze Age (from 1700 to 1200 BCE) Hittite and Hurrian Aplu was a god of plague, invoked during plague years. Here we have an apotropaic situation, where a god originally bringing the plague was invoked to end it. Aplu, meaning the son of, was a title given to the god Nergal, who was linked to the Babylonian god of the sun Shamash. Homer interprets Apollo as a terrible god () who brings death and disease with his arrows, but who can also heal, possessing a magic art that separates him from the other Greek gods. In Iliad, his priest prays to Apollo Smintheus, the mouse god who retains an older agricultural function as the protector from field rats. All these functions, including the function of the healer-god Paean, who seems to have Mycenean origin, are fused in the cult of Apollo.Proto-Indo-European The Vedic Rudra has some similar functions with Apollo. The terrible god is called "the archer" and the bow is also an attribute of Shiva. Rudra could bring diseases with his arrows, but he was able to free people of them and his alternative Shiva is a healer physician god. However the Indo-European component of Apollo does not explain his strong relation with omens, exorcisms, and with the oracular cult.Oracular cult Unusually among the Olympic deities, Apollo had two cult sites that had widespread influence: Delos and Delphi. In cult practice, Delian Apollo and Pythian Apollo (the Apollo of Delphi) were so distinct that they might both have shrines in the same locality. 
Lycia was sacred to the god, for this Apollo was also called Lycian. Apollo's cult was already fully established when written sources commenced, about 650 BCE. Apollo became extremely important to the Greek world as an oracular deity in the archaic period, and the frequency of theophoric names such as Apollodorus or Apollonios and cities named Apollonia testify to his popularity. Oracular sanctuaries to Apollo were established in other sites. In the 2nd and 3rd century CE, those at Didyma and Claros pronounced the so-called "theological oracles", in which Apollo confirms that all deities are aspects or servants of an all-encompassing, highest deity. "In the 3rd century, Apollo fell silent. Julian the Apostate (359–361) tried to revive the Delphic oracle, but failed."Oracular shrinesApollo had a famous oracle in Delphi, and other notable ones in Claros and Didyma. His oracular shrine in Abae in Phocis, where he bore the toponymic epithet Abaeus (, Apollon Abaios), was important enough to be consulted by Croesus.His oracular shrines include: Abae in Phocis. Bassae in the Peloponnese. At Clarus, on the west coast of Asia Minor; as at Delphi a holy spring which gave off a pneuma, from which the priests drank. In Corinth, the Oracle of Corinth came from the town of Tenea, from prisoners supposedly taken in the Trojan War. At Khyrse, in Troad, the temple was built for Apollo Smintheus. In Delos, there was an oracle to the Delian Apollo, during summer. The Hieron (Sanctuary) of Apollo adjacent to the Sacred Lake, was the place where the god was said to have been born. In Delphi, the Pythia became filled with the pneuma of Apollo, said to come from a spring inside the Adyton. In Didyma, an oracle on the coast of Anatolia, south west of Lydian (Luwian) Sardis, in which priests from the lineage of the Branchidae received inspiration by drinking from a healing spring located in the temple. Was believed to have been founded by Branchus, son or lover of Apollo. 
In Hierapolis Bambyce, Syria (modern Manbij), according to the treatise De Dea Syria, the sanctuary of the Syrian Goddess contained a robed and bearded image of Apollo. Divination was based on spontaneous movements of this image. At Patara, in Lycia, there was a seasonal winter oracle of Apollo, said to have been the place where the god went from Delos. As at Delphi the oracle at Patara was a woman. In Segesta in Sicily.Oracles were also given by sons of Apollo. In Oropus, north of Athens, the oracle Amphiaraus, was said to be the son of Apollo; Oropus also had a sacred spring. in Labadea,  east of Delphi, Trophonius, another son of Apollo, killed his brother and fled to the cave where he was also afterwards consulted as an oracle.Temples of ApolloMany temples were dedicated to Apollo in Greece and the Greek colonies. They show the spread of the cult of Apollo and the evolution of the Greek architecture, which was mostly based on the rightness of form and on mathematical relations. Some of the earliest temples, especially in Crete, do not belong to any Greek order. It seems that the first peripteral temples were rectangular wooden structures. The different wooden elements were considered divine, and their forms were preserved in the marble or stone elements of the temples of Doric order. The Greeks used standard types because they believed that the world of objects was a series of typical forms which could be represented in several instances. The temples should be canonic, and the architects were trying to achieve this esthetic perfection. From the earliest times there were certain rules strictly observed in rectangular peripteral and prostyle buildings. The first buildings were built narrowly in order to hold the roof, and when the dimensions changed some mathematical relations became necessary in order to keep the original forms. 
This probably influenced the theory of numbers of Pythagoras, who believed that behind the appearance of things there was the permanent principle of mathematics.The Doric order dominated during the 6th and the 5th century BC but there was a mathematical problem regarding the position of the triglyphs, which couldn't be solved without changing the original forms. The order was almost abandoned for the Ionic order, but the Ionic capital also posed an insoluble problem at the corner of a temple. Both orders were abandoned for the Corinthian order gradually during the Hellenistic age and under Rome.The most important temples are:Greek templesThebes, Greece: The oldest temple probably dedicated to Apollo Ismenius was built in the 9th century B.C. It seems that it was a curvilinear building. The Doric temple was built in the early 7th century B.C., but only some small parts have been found A festival called Daphnephoria was celebrated every ninth year in honour of Apollo Ismenius (or Galaxius). The people held laurel branches (daphnai), and at the head of the procession walked a youth (chosen priest of Apollo), who was called "daphnephoros".Eretria: According to the Homeric hymn to Apollo, the god arrived to the plain, seeking for a location to establish its oracle. The first temple of Apollo Daphnephoros, "Apollo, laurel-bearer", or "carrying off Daphne", is dated to 800 B.C. The temple was curvilinear hecatombedon (a hundred feet). In a smaller building were kept the bases of the laurel branches which were used for the first building. Another temple probably peripteral was built in the 7th century B.C., with an inner row of wooden columns over its Geometric predecessor. It was rebuilt peripteral around 510 B.C., with the stylobate measuring 21,00 x 43,00 m. The number of pteron column was 6 x 14. Dreros (Crete). The temple of Apollo Delphinios dates from the 7th century B.C., or probably from the middle of the 8th century B.C. 
According to the legend, Apollo appeared as a dolphin, and carried Cretan priests to the port of Delphi. The dimensions of the plan are 10,70 x 24,00 m and the building was not peripteral. It contains column-bases of the Minoan type, which may be considered as the predecessors of the Doric columns.Gortyn (Crete). A temple of Pythian Apollo, was built in the 7th century B.C. The plan measured 19,00 x 16,70 m and it was not peripteral. The walls were solid, made from limestone, and there was single door on the east side.Thermon (West Greece): The Doric temple of Apollo Thermios, was built in the middle of the 7th century B.C. It was built on an older curvilinear building dating perhaps from the 10th century B.C., on which a peristyle was added. The temple was narrow, and the number of pteron columns (probably wooden) was 5 x 15. There was a single row of inner columns. It measures 12.13 x 38.23 m at the stylobate, which was made from stones.  Corinth: A Doric temple was built in the 6th century B.C. The temple's stylobate measures 21.36 x 53.30 m, and the number of pteron columns was 6 x 15. There was a double row of inner columns. The style is similar with the Temple of Alcmeonidae at Delphi. The Corinthians were considered to be the inventors of the Doric order. Napes (Lesbos): An Aeolic temple probably of Apollo Napaios was built in the 7th century B.C. Some special capitals with floral ornament have been found, which are called Aeolic, and it seems that they were borrowed from the East. Cyrene, Libya: The oldest Doric temple of Apollo was built in c. 600 B.C. The number of pteron columns was 6 x 11, and it measures 16.75 x 30.05 m at the stylobate. There was a double row of sixteen inner columns on stylobates. The capitals were made from stone. Naukratis: An Ionic temple was built in the early 6th century B.C. 
Only some fragments have been found and the earlier, made from limestone, are identified among the oldest of the Ionic order. Syracuse, Sicily: A Doric temple was built at the beginning of the 6th century B.C. The temple's stylobate measures 21.47 x 55.36 m and the number of pteron columns was 6 x 17. It was the first temple in the Greek west built completely out of stone. A second row of columns was added, obtaining the effect of an inner porch. Selinus (Sicily): The Doric Temple C dates from 550 B.C., and it was probably dedicated to Apollo. The temple's stylobate measures 10.48 x 41.63 m and the number of pteron columns was 6 x 17. There was a portico with a second row of columns, which is also attested for the temple at Syracuse. Delphi: The first temple dedicated to Apollo, was built in the 7th century B.C. According to the legend, it was wooden, made of laurel branches. The "Temple of Alcmeonidae" was built in c. 513 B.C. and it is the oldest Doric temple with significant marble elements. The temple's stylobate measures 21.65 x 58.00 m, and the number of pteron columns was 6 x 15. A festival similar to Apollo's festival at Thebes, Greece was celebrated every nine years. A boy was sent to the temple, who walked on the sacred road and returned carrying a laurel branch (daphnephoros). The maidens participated with joyful songs. Chios: An Ionic temple of Apollo Phanaios was built at the end of the 6th century B.C. Only some small parts have been found and the capitals had floral ornament. Abae (Phocis). The temple was destroyed by the Persians in the invasion of Xerxes in 480 B.C., and later by the Boeotians. It was rebuilt by Hadrian. The oracle was in use from early Mycenaean times to the Roman period, and shows the continuity of Mycenaean and Classical Greek religion.  Bassae (Peloponnesus): A temple dedicated to Apollo Epikourios ("Apollo the helper"), was built in 430 B.C. 
and it was designed by Iktinos.It combined Doric and Ionic elements, and the earliest use of column with a Corinthian capital in the middle. The temple is of a relatively modest size, with the stylobate measuring 14.5 x 38.3 metres containing a Doric peristyle of 6 x 15 columns. The roof left a central space open to admit light and air.Delos: A temple probably dedicated to Apollo and not peripteral, was built in the late 7th century B.C., with a plan measuring 10,00 x 15,60 m. The Doric Great temple of Apollo, was built in c. 475 B.C. The temple's stylobate measures 13.72 x 29.78 m, and the number of pteron columns as 6 x 13. Marble was extensively used.Ambracia: A Doric peripteral temple dedicated to Apollo Pythios Sotir was built in 500 B.C., and It is lying at the centre of the Greek city Arta. Only some parts have been found, and it seems that the temple was built on earlier sanctuaries dedicated to Apollo. The temple measures 20,75 x 44,00 m at the stylobate. The foundation which supported the statue of the god, still exists.Didyma (near Miletus): The gigantic Ionic temple of Apollo Didymaios started around 540 B.C. The construction ceased and then it was restarted in 330 B.C. The temple is dipteral, with an outer row of 10 x 21 columns, and it measures 28.90 x 80.75 m at the stylobate.Clarus (near ancient Colophon): According to the legend, the famous seer Calchas, on his return from Troy, came to Clarus. He challenged the seer Mopsus, and died when he lost. The Doric temple of Apollo Clarius was probably built in the 3rd century B.C., and it was peripteral with 6 x 11 columns. It was reconstructed at the end of the Hellenistic period, and later from the emperor Hadrian but Pausanias claims that it was still incomplete in the 2nd century B.C.Hamaxitus (Troad): In Iliad, Chryses the priest of Apollo, addresses the god with the epithet Smintheus (Lord of Mice), related with the god's ancient role as bringer of the disease (plague). 
Recent excavations indicate that the Hellenistic temple of Apollo Smintheus was constructed at 150–125 B.C., but the symbol of the mouse god was used on coinage probably from the 4th century B.C. The temple measures 40,00 x 23,00 m at the stylobate, and the number of pteron columns was 8 x 14. Pythion (), this was the name of a shrine of Apollo at Athens near the Ilisos river. It was created by Peisistratos, and tripods were placed there by those who had won in the cyclic chorus at the Thargelia. Setae (Lydia): The temple of Apollo Aksyros was located in the city. Apollonia Pontica: There were two temples of Apollo Healer in the city. One from the Late Archaic period and the other from the Early Classical period. Ikaros island in the Persian Gulf (modern Failaka Island): There was a temple of Apollo on the island.Etruscan and Roman templesVeii (Etruria): The temple of Apollo was built in the late 6th century B.C. and it indicates the spread of Apollo's culture (Aplu) in Etruria. There was a prostyle porch, which is called Tuscan, and a triple cella 18,50 m wide. Falerii Veteres (Etruria): A temple of Apollo was built probably in the 4th-3rd century B.C. Parts of a terracotta capital, and a terracotta base have been found. It seems that the Etruscan columns were derived from the archaic Doric. A cult of Apollo Soranus is attested by one inscription found near Falerii. Pompeii (Italy): The cult of Apollo was widespread in the region of Campania since the 6th century B.C. The temple was built in 120 B.C., but its beginnings lie in the 6th century B.C. It was reconstructed after an earthquake in A.D. 63. It demonstrates a mixing of styles which formed the basis of Roman architecture. The columns in front of the cella formed a Tuscan prostyle porch, and the cella is situated unusually far back. The peripteral colonnade of 48 Ionic columns was placed in such a way that the emphasis was given to the front side. Rome: The temple of Apollo Sosianus and the temple of Apollo Medicus. 
The first temple building dates to 431 B.C., and was dedicated to Apollo Medicus (the doctor), after a plague of 433 B.C. It was rebuilt by Gaius Sosius, probably in 34 B.C. Only three columns with Corinthian capitals exist today. It seems that the cult of Apollo had existed in this area since at least to the mid-5th century B.C.Rome:The temple of Apollo Palatinus was located on the Palatine hill within the sacred boundary of the city. It was dedicated by Augustus on 28 B.C. The façade of the original temple was Ionic and it was constructed from solid blocks of marble. Many famous statues by Greek masters were on display in and around the temple, including a marble statue of the god at the entrance and a statue of Apollo in the cella.Melite (modern Mdina, Malta): A Temple of Apollo was built in the city in the 2nd century A.D. Its remains were discovered in the 18th century, and many of its architectural fragments were dispersed among private collections or reworked into new sculptures. Parts of the temple's podium were rediscovered in 2002.MythologyApollo appears often in the myths, plays and hymns. As Zeus' favorite son, Apollo had direct access to the mind of Zeus and was willing to reveal this knowledge to humans. A divinity beyond human comprehension, he appears both as a beneficial and a wrathful god.BirthApollo was the son of Zeus, the king of the gods, and Leto, his previous wife or one of his mistresses. Growing up, Apollo was nursed by the nymphs Korythalia and Aletheia, the personification of truth.When Zeus' wife Hera discovered that Leto was pregnant, she banned Leto from giving birth on terra firma. Leto sought shelter in many lands, only to be rejected by them. Finally, the voice of unborn Apollo informed his mother about a floating island named Delos that had once been Asteria, Leto's own sister. Since it was neither a mainland nor an island, Leto was readily welcomed there and gave birth to her children under a palm tree. 
All the goddesses except Hera were present to witness the event. It is also stated that Hera kidnapped Eileithyia, the goddess of childbirth, to prevent Leto from going into labor. The other gods tricked Hera into letting her go by offering her a necklace of amber 9 yards (8.2 m) long.When Apollo was born, clutching a golden sword, everything on Delos turned into gold and the island was filled with ambrosial fragrance. Swans circled the island seven times and the nymphs sang in delight. He was washed clean by the goddesses who then covered him in white garment and fastened golden bands around him. Since Leto was unable to feed him, Themis, the goddess of divine law, fed him with nectar, or ambrosia. Upon tasting the divine food, Apollo broke free of the bands fastened onto him and declared that he would be the master of lyre and archery, and interpret the will of Zeus to humankind. Zeus, who had calmed Hera by then, came and adorned his son with a golden headband.Apollo's birth fixed the floating Delos to the earth. Leto promised that her son would be always favorable towards the Delians. According to some, Apollo secured Delos to the bottom of the ocean after some time. This island became sacred to Apollo and was one of the major cult centres of the god.Apollo was born on the seventh day (, hebdomagenes) of the month Thargelion—according to Delian tradition—or of the month Bysios—according to Delphian tradition. The seventh and twentieth, the days of the new and full moon, were ever afterwards held sacred to him. Mythographers agree that Artemis was born first and subsequently assisted with the birth of Apollo or was born on the island of Ortygia then helped Leto cross the sea to Delos the next day to give birth to Apollo.HyperboreaHyperborea, the mystical land of eternal spring, venerated Apollo above all the gods. The Hyperboreans always sang and danced in his honor and hosted Pythian games. 
There, a vast forest of beautiful trees was called "the garden of Apollo". Apollo spent the winter months among the Hyperboreans. His absence from the world caused coldness and this was marked as his annual death. No prophecies were issued during this time. He returned to the world during the beginning of the spring. The Theophania festival was held in Delphi to celebrate his return. It is said that Leto came to Delos from Hyperborea accompanied by a pack of wolves. Henceforth, Hyperborea became Apollo's winter home and wolves became sacred to him. His intimate connection to wolves is evident from his epithet Lyceus, meaning wolf-like. But Apollo was also the wolf-slayer in his role as the god who protected flocks from predators. The Hyperborean worship of Apollo bears the strongest marks of Apollo being worshipped as the sun god. Shamanistic elements in Apollo's cult are often linked to his Hyperborean origin, and he is likewise speculated to have originated as a solar shaman. Shamans like Abaris and Aristeas were also the followers of Apollo, who hailed from Hyperborea. In myths, the tears of amber Apollo shed when his son Asclepius died became the waters of the river Eridanos, which surrounded Hyperborea. Apollo also buried in Hyperborea the arrow which he had used to kill the Cyclopes. He later gave this arrow to Abaris.Childhood and youthAs a child, Apollo is said to have built a foundation and an altar on Delos using the horns of the goats that his sister Artemis hunted. Since he learnt the art of building when young, he later came to be known as Archegetes, the founder (of towns) and god who guided men to build new cities. From his father Zeus, Apollo had also received a golden chariot drawn by swans. In his early years when Apollo spent his time herding cows, he was reared by Thriae, the bee nymphs, who trained him and enhanced his prophetic skills. Apollo is also said to have invented the lyre, and along with Artemis, the art of archery. 
He then taught to the humans the art of healing and archery. Phoebe, his grandmother, gave the oracular shrine of Delphi to Apollo as a birthday gift. Themis inspired him to be the oracular voice of Delphi thereon.PythonPython, a chthonic serpent-dragon, was a child of Gaia and the guardian of the Delphic Oracle, whose death was foretold by Apollo when he was still in Leto's womb. Python was the nurse of the giant Typhon. In most of the traditions, Apollo was still a child when he killed Python.Python was sent by Hera to hunt the pregnant Leto to death, and had assaulted her. To avenge the trouble given to his mother, Apollo went in search of Python and killed it in the sacred cave at Delphi with the bow and arrows that he had received from Hephaestus. The Delphian nymphs who were present encouraged Apollo during the battle with the cry "Hie Paean". After Apollo was victorious, they also brought him gifts and gave the Corycian cave to him. According to Homer, Apollo had encountered and killed the Python when he was looking for a place to establish his shrine.According to another version, when Leto was in Delphi, Python had attacked her. Apollo defended his mother and killed Python. Euripides in his Iphigenia in Aulis gives an account of his fight with Python and the event's aftermath. You killed him, o Phoebus, while still a baby, still leaping in the arms of your dear mother, and you entered the holy shrine, and sat on the golden tripod, on your truthful throne distributing prophecies from the gods to mortals.A detailed account of Apollo's conflict with Gaia and Zeus' intervention on behalf of his young son is also given. 
But when Apollo came and sent Themis, the child of Earth, away from the holy oracle of Pytho, Earth gave birth to dream visions of the night; and they told to the cities of men the present, and what will happen in the future, through dark beds of sleep on the ground; and so Earth took the office of prophecy away from Phoebus, in envy, because of her daughter. The lord made his swift way to Olympus and wound his baby hands around Zeus, asking him to take the wrath of the earth goddess from the Pythian home. Zeus smiled, that the child so quickly came to ask for worship that pays in gold. He shook his locks of hair, put an end to the night voices, and took away from mortals the truth that appears in darkness, and gave the privilege back again to Loxias. Apollo also demanded that all other methods of divination be made inferior to his, a wish that Zeus granted him readily. Because of this, Athena, who had been practicing divination by throwing pebbles, cast her pebbles away in displeasure. However, Apollo had committed a blood murder and had to be purified. Because Python was a child of Gaia, Gaia wanted Apollo to be banished to Tartarus as a punishment. Zeus didn't agree and instead exiled his son from Olympus, and instructed him to get purified. Apollo had to serve as a slave for nine years. After the servitude was over, as per his father's order, he travelled to the Vale of Tempe to bathe in the waters of the Peneus. There Zeus himself performed purificatory rites on Apollo. Purified, Apollo was escorted by his half sister Athena to Delphi where the oracular shrine was finally handed over to him by Gaia. According to a variation, Apollo had also travelled to Crete, where Carmanor purified him. Apollo later established the Pythian games to appease Gaia. Henceforth, Apollo became the god who cleansed himself from the sin of murder, made men aware of their guilt, and purified them. Soon after, Zeus instructed Apollo to go to Delphi and establish his law. 
But Apollo, disobeying his father, went to the land of Hyperborea and stayed there for a year. He returned only after the Delphians sang hymns to him and pleaded him to come back. Zeus, pleased with his son's integrity, gave Apollo the seat next to him on his right side. He also gave to Apollo various gifts, like a golden tripod, a golden bow and arrows, a golden chariot and the city of Delphi.Soon after his return, Apollo needed to recruit people to Delphi. So, when he spotted a ship sailing from Crete, he sprang aboard in the form of a dolphin. The crew was awed into submission and followed a course that led the ship to Delphi. There Apollo revealed himself as a god. Initiating them to his service, he instructed them to keep righteousness in their hearts. The Pythia was Apollo's high priestess and his mouthpiece through whom he gave prophecies. Pythia is arguably the constant favorite of Apollo among the mortals.TityosHera once again sent another giant, Tityos to rape Leto. This time Apollo shot him with his arrows and attacked him with his golden sword. According to other version, Artemis also aided him in protecting their mother by attacking Tityos with her arrows. After the battle Zeus finally relented his aid and hurled Tityos down to Tartarus. There, he was pegged to the rock floor, covering an area of , where a pair of vultures feasted daily on his liver.AdmetusAdmetus was the king of Pherae, who was known for his hospitality. When Apollo was exiled from Olympus for killing Python, he served as a herdsman under Admetus, who was then young and unmarried. Apollo is said to have shared a romantic relationship with Admetus during his stay. After completing his years of servitude, Apollo went back to Olympus as a god.Because Admetus had treated Apollo well, the god conferred great benefits on him in return. Apollo's mere presence is said to have made the cattle give birth to twins. 
Apollo helped Admetus win the hand of Alcestis, the daughter of King Pelias, by taming a lion and a boar to draw Admetus' chariot. He was present during their wedding to give his blessings. When Admetus angered the goddess Artemis by forgetting to give her the due offerings, Apollo came to the rescue and calmed his sister. When Apollo learnt of Admetus' untimely death, he convinced or tricked the Fates into letting Admetus live past his time.According to another version, or perhaps some years later, when Zeus struck down Apollo's son Asclepius with a lightning bolt for resurrecting the dead, Apollo in revenge killed the Cyclopes, who had fashioned the bolt for Zeus. Apollo would have been banished to Tartarus for this, but his mother Leto intervened, and reminding Zeus of their old love, pleaded him not to kill their son. Zeus obliged and sentenced Apollo to one year of hard labor once again under Admetus.The love between Apollo and Admetus was a favored topic of Roman poets like Ovid and Servius.NiobeThe fate of Niobe was prophesied by Apollo while he was still in Leto's womb. Niobe was the queen of Thebes and wife of Amphion. She displayed hubris when she boasted that she was superior to Leto because she had fourteen children (Niobids), seven male and seven female, while Leto had only two. She further mocked Apollo's effeminate appearance and Artemis' manly appearance. Leto, insulted by this, told her children to punish Niobe. Accordingly, Apollo killed Niobe's sons, and Artemis her daughters. According to some versions of the myth, among the Niobids, Chloris and her brother Amyclas were not killed because they prayed to Leto. Amphion, at the sight of his dead sons, either killed himself or was killed by Apollo after swearing revenge.A devastated Niobe fled to Mount Sipylos in Asia Minor and turned into stone as she wept. Her tears formed the river Achelous. 
Zeus had turned all the people of Thebes to stone and so no one buried the Niobids until the ninth day after their death, when the gods themselves entombed them. When Chloris married and had children, Apollo granted her son Nestor the years he had taken away from the Niobids. Hence, Nestor was able to live for 3 generations.Building the walls of Troy Once Apollo and Poseidon served under the Trojan king Laomedon in accordance with Zeus' words. Apollodorus states that the gods willingly went to the king disguised as humans in order to check his hubris. Apollo guarded the cattle of Laomedon in the valleys of mount Ida, while Poseidon built the walls of Troy. Other versions make both Apollo and Poseidon the builders of the wall. In Ovid's account, Apollo completes his task by playing his tunes on his lyre. In Pindar's odes, the gods took a mortal named Aeacus as their assistant. When the work was completed, three snakes rushed against the wall, and though the two that attacked the sections of the wall built by the gods fell down dead, the third forced its way into the city through the portion of the wall built by Aeacus. Apollo immediately prophesied that Troy would fall at the hands of Aeacus's descendants, the Aeacidae (i.e. his son Telamon joined Heracles when he besieged the city during Laomedon's rule. Later, his great grandson Neoptolemus was present in the wooden horse that led to the downfall of Troy). However, the king not only refused to give the gods the wages he had promised, but also threatened to bind their feet and hands, and sell them as slaves. Angered by the unpaid labour and the insults, Apollo infected the city with a pestilence and Poseidon sent the sea monster Cetus. To deliver the city from it, Laomedon had to sacrifice his daughter Hesione (who would later be saved by Heracles). During his stay in Troy, Apollo had a lover named Ourea, who was a nymph and daughter of Poseidon. 
Together they had a son named Ileus, whom Apollo loved dearly.Trojan WarApollo sided with the Trojans during the Trojan War waged by the Greeks against the Trojans.During the war, the Greek king Agamemnon captured Chryseis, the daughter of Apollo's priest Chryses, and refused to return her. Angered by this, Apollo shot arrows infected with the plague into the Greek encampment. He demanded that they return the girl, and the Achaeans (Greeks) complied, indirectly causing the anger of Achilles, which is the theme of the Iliad.Receiving the aegis from Zeus, Apollo entered the battlefield as per his father's command, causing great terror to the enemy with his war cry. He pushed the Greeks back and destroyed many of the soldiers. He is described as "the rouser of armies" because he rallied the Trojan army when they were falling apart.When Zeus allowed the other gods to get involved in the war, Apollo was provoked by Poseidon to a duel. However, Apollo declined to fight him, saying that he wouldn't fight his uncle for the sake of mortals.When the Greek hero Diomedes injured the Trojan hero Aeneas, Aphrodite tried to rescue him, but Diomedes injured her as well. Apollo then enveloped Aeneas in a cloud to protect him. He repelled the attacks Diomedes made on him and gave the hero a stern warning to abstain himself from attacking a god. Aeneas was then taken to Pergamos, a sacred spot in Troy, where he was healed.After the death of Sarpedon, a son of Zeus, Apollo rescued the corpse from the battlefield as per his father's wish and cleaned it. He then gave it to Sleep (Hypnos) and Death (Thanatos). Apollo had also once convinced Athena to stop the war for that day, so that the warriors can relieve themselves for a while.The Trojan hero Hector (who, according to some, was the god's own son by Hecuba) was favored by Apollo. When he got severely injured, Apollo healed him and encouraged him to take up his arms. 
During a duel with Achilles, when Hector was about to lose, Apollo hid Hector in a cloud of mist to save him. When the Greek warrior Patroclus tried to get into the fort of Troy, he was stopped by Apollo. Encouraging Hector to attack Patroclus, Apollo stripped the armour of the Greek warrior and broke his weapons. Patroclus was eventually killed by Hector. At last, after Hector's fated death, Apollo protected his corpse from Achilles' attempt to mutilate it by creating a magical cloud over the corpse. Apollo held a grudge against Achilles throughout the war because Achilles had murdered his son Tenes before the war began and brutally assassinated his son Troilus in his own temple. Not only did Apollo save Hector from Achilles, he also tricked Achilles by disguising himself as a Trojan warrior and driving him away from the gates. He foiled Achilles' attempt to mutilate Hector's dead body. Finally, Apollo caused Achilles' death by guiding an arrow shot by Paris into Achilles' heel. In some versions, Apollo himself killed Achilles by taking the disguise of Paris. Apollo helped many Trojan warriors, including Agenor, Polydamas and Glaucus, on the battlefield. Though he greatly favored the Trojans, Apollo was bound to follow the orders of Zeus and served his father loyally during the war.HeraclesAfter Heracles (then named Alcides) was struck with madness and killed his family, he sought to purify himself and consulted the oracle of Apollo. Apollo, through the Pythia, commanded him to serve king Eurystheus for twelve years and complete the ten tasks the king would give him. Only then would Alcides be absolved of his sin. Apollo also renamed him as Heracles. To complete his third task, Heracles had to capture the Ceryneian Hind, a hind sacred to Artemis, and bring it back alive. After Heracles had chased the hind for one year, the animal eventually got tired, and when it tried crossing the river Ladon, Heracles captured it. 
While he was taking it back, he was confronted by Apollo and Artemis, who were angered at Heracles for this act. However, Heracles soothed the goddess and explained his situation to her. After much pleading, Artemis permitted him to take the hind and told him to return it later. After he was freed from his servitude to Eurystheus, Heracles came into conflict with Iphitus, a prince of Oechalia, and murdered him. Soon after, he contracted a terrible disease. He consulted the oracle of Apollo once again, in the hope of ridding himself of the disease. The Pythia, however, refused to give any prophecy. In anger, Heracles snatched the sacred tripod and started walking away, intending to start his own oracle. However, Apollo did not tolerate this and stopped Heracles; a duel ensued between them. Artemis rushed to support Apollo, while Athena supported Heracles. Soon, Zeus threw his thunderbolt between the fighting brothers and separated them. He reprimanded Heracles for this act of violation and asked Apollo to give a solution to Heracles. Apollo then ordered the hero to serve under Omphale, queen of Lydia, for one year in order to purify himself.PeriphasPeriphas was an Attican king and a priest of Apollo. He was noble, just and rich. He did all his duties justly. Because of this, people were very fond of him and started honouring him to the same extent as Zeus. At one point, they worshipped Periphas in place of Zeus and set up shrines and temples for him. This annoyed Zeus, who decided to annihilate the entire family of Periphas. But because he was a just king and a good devotee, Apollo intervened and requested his father to spare Periphas. Zeus considered Apollo's words and agreed to let him live. But he metamorphosed Periphas into an eagle and made the eagle the king of birds. 
When Periphas' wife requested Zeus to let her stay with her husband, Zeus turned her into a vulture and fulfilled her wish.Plato's concept of soulmatesA long time ago, there were three kinds of human beings: male, descended from the sun; female, descended from the earth; and androgynous, descended from the moon. Each human being was completely round, with four arms and four legs, two identical faces on opposite sides of a head with four ears, and all else to match. They were powerful and unruly. Otis and Ephialtes even dared to scale Mount Olympus. To check their insolence, Zeus devised a plan to humble them and improve their manners instead of completely destroying them. He cut them all in two and asked Apollo to make necessary repairs, giving humans the individual shape they still have now. Apollo turned their heads and necks around towards their wounds, he pulled together their skin at the abdomen, and sewed the skin together at the middle of it. This is what we call the navel today. He smoothed the wrinkles and shaped the chest. But he made sure to leave a few wrinkles on the abdomen and around the navel so that they might be reminded of their punishment. "As he [Zeus] cut them one after another, he bade Apollo give the face and the half of the neck a turn... Apollo was also bidden to heal their wounds and compose their forms. So Apollo gave a turn to the face and pulled the skin from the sides all over that which in our language is called the belly, like the purses which draw in, and he made one mouth at the centre [of the belly] which he fastened in a knot (the same which is called the navel); he also moulded the breast and took out most of the wrinkles, much as a shoemaker might smooth leather upon a last; he left a few wrinkles, however, in the region of the belly and navel, as a memorial of the primeval state."Nurturer of the youngApollo Kourotrophos is the god who nurtures and protects children and the young, especially boys. 
He oversees their education and their passage into adulthood. Education is said to have originated from Apollo and the Muses. Many myths have him train his children. It was a custom for boys to cut and dedicate their long hair to Apollo after reaching adulthood.Chiron, the abandoned centaur, was fostered by Apollo, who instructed him in medicine, prophecy, archery and more. Chiron would later become a great teacher himself.Asclepius in his childhood gained much knowledge pertaining to medicinal arts by his father. However, he was later entrusted to Chiron for further education.Anius, Apollo's son by Rhoeo, was abandoned by his mother soon after his birth. Apollo brought him up and educated him in mantic arts. Anius later became the priest of Apollo and the king of Delos.Iamus was the son of Apollo and Evadne. When Evadne went into labour, Apollo sent the Moirai to assist his lover. After the child was born, Apollo sent snakes to feed the child some honey. When Iamus reached the age of education, Apollo took him to Olympia and taught him many arts, including the ability to understand and explain the languages of birds.Idmon was educated by Apollo to be a seer. Even though he foresaw his death that would happen in his journey with the Argonauts, he embraced his destiny and died a brave death. To commemorate his son's bravery, Apollo commanded Boeotians to build a town around the tomb of the hero, and to honor him.Apollo adopted Carnus, the abandoned son of Zeus and Europa. He reared the child with the help of his mother Leto and educated him to be a seer.When his son Melaneus reached the age of marriage, Apollo asked the princess Stratonice to be his son's bride and carried her away from her home when she agreed.Apollo saved a shepherd boy (name unknown) from death in a large deep cave, by the means of vultures. 
To thank him, the shepherd built Apollo a temple under the name Vulturius.God of musicImmediately after his birth, Apollo demanded a lyre and invented the paean, thus becoming the god of music. As the divine singer, he is the patron of poets, singers and musicians. The invention of string music is attributed to him. Plato said that the innate ability of humans to take delight in music, rhythm and harmony is the gift of Apollo and the Muses. According to Socrates, ancient Greeks believed that Apollo is the god who directs the harmony and makes all things move together, both for the gods and the humans. For this reason, he was called Homopolon before the Homo was replaced by A. Apollo's harmonious music delivered people from their pain, and hence, like Dionysus, he is also called the liberator. The swans, which were considered to be the most musical among the birds, were believed to be the "singers of Apollo". They are Apollo's sacred birds and acted as his vehicle during his travel to Hyperborea. Aelian says that when the singers would sing hymns to Apollo, the swans would join the chant in unison.Among the Pythagoreans, the study of mathematics and music were connected to the worship of Apollo, their principal deity. Their belief was that the music purifies the soul, just as medicine purifies the body. They also believed that music was delegated to the same mathematical laws of harmony as the mechanics of the cosmos, evolving into an idea known as the music of the spheres.Apollo appears as the companion of the Muses, and as Musagetes ("leader of Muses") he leads them in dance. They spend their time on Parnassus, which is one of their sacred places. Apollo is also the lover of the Muses and by them he became the father of famous musicians like Orpheus and Linus.Apollo is often found delighting the immortal gods with his songs and music on the lyre. 
In his role as the god of banquets, he was always present to play music in weddings of the gods, like the marriage of Eros and Psyche, Peleus and Thetis. He is a frequent guest of the Bacchanalia, and many ancient ceramics depict him being at ease amidst the maenads and satyrs. Apollo also participated in musical contests when challenged by others. He was the victor in all those contests, but he tended to punish his opponents severely for their hubris.Apollo's lyreThe invention of lyre is attributed either to Hermes or to Apollo himself. Distinctions have been made that Hermes invented lyre made of tortoise shell, whereas the lyre Apollo invented was a regular lyre.Myths tell that the infant Hermes stole a number of Apollo's cows and took them to a cave in the woods near Pylos, covering their tracks. In the cave, he found a tortoise and killed it, then removed the insides. He used one of the cow's intestines and the tortoise shell and made his lyre.Upon discovering the theft, Apollo confronted Hermes and asked him to return his cattle. When Hermes acted innocent, Apollo took the matter to Zeus. Zeus, having seen the events, sided with Apollo, and ordered Hermes to return the cattle. Hermes then began to play music on the lyre he had invented. Apollo fell in love with the instrument and offered to exchange the cattle for the lyre. Hence, Apollo then became the master of the lyre.According to other versions, Apollo had invented the lyre himself, whose strings he tore in repenting of the excess punishment he had given to Marsyas. Hermes' lyre, therefore, would be a reinvention.Contest with PanOnce Pan had the audacity to compare his music with that of Apollo and to challenge the god of music to a contest. The mountain-god Tmolus was chosen to umpire. Pan blew on his pipes, and with his rustic melody gave great satisfaction to himself and his faithful follower, Midas, who happened to be present. Then, Apollo struck the strings of his lyre. 
It was so beautiful that Tmolus at once awarded the victory to Apollo, and everyone was pleased with the judgement. Only Midas dissented and questioned the justice of the award. Apollo did not want to suffer such a depraved pair of ears any longer, and caused them to become the ears of a donkey.Contest with MarsyasMarsyas was a satyr who was punished by Apollo for his hubris. He had found an aulos on the ground, tossed away after being invented by Athena because it made her cheeks puffy. Athena had also placed a curse upon the instrument, that whoever would pick it up would be severely punished. When Marsyas played the flute, everyone became frenzied with joy. This led Marsyas to think that he was better than Apollo, and he challenged the god to a musical contest. The contest was judged by the Muses, or the nymphs of Nysa. Athena was also present to witness the contest.Marsyas taunted Apollo for "wearing his hair long, for having a fair face and smooth body, for his skill in so many arts". He also further said,'His [Apollo] hair is smooth and made into tufts and curls that fall about his brow and hang before his face. His body is fair from head to foot, his limbs shine bright, his tongue gives oracles, and he is equally eloquent in prose or verse, propose which you will. What of his robes so fine in texture, so soft to the touch, aglow with purple? What of his lyre that flashes gold, gleams white with ivory, and shimmers with rainbow gems? What of his song, so cunning and so sweet? Nay, all these allurements suit with naught save luxury. To virtue they bring shame alone!'The Muses and Athena sniggered at this comment. The contestants agreed to take turns displaying their skills and the rule was that the victor could "do whatever he wanted" to the loser.According to one account, after the first round, they both were deemed equal by the Nysiads. But in the next round, Apollo decided to play on his lyre and add his melodious voice to his performance. 
Marsyas argued against this, saying that Apollo would have an advantage and accused Apollo of cheating. But Apollo replied that since Marsyas played the flute, which needed air blown from the throat, it was similar to singing, and that either they both should get an equal chance to combine their skills or none of them should use their mouths at all. The nymphs decided that Apollo's argument was just. Apollo then played his lyre and sang at the same time, mesmerising the audience. Marsyas could not do this. Apollo was declared the winner and, angered with Marsyas' haughtiness and his accusations, decided to flay the satyr.According to another account, Marsyas played his flute out of tune at one point and accepted his defeat. Out of shame, he assigned to himself the punishment of being skinned for a wine sack. Another variation is that Apollo played his instrument upside down. Marsyas could not do this with his instrument. So the Muses who were the judges declared Apollo the winner. Apollo hung Marsyas from a tree to flay him.Apollo flayed the limbs of Marsyas alive in a cave near Celaenae in Phrygia for his hubris to challenge a god. He then gave the rest of his body for proper burial and nailed Marsyas' flayed skin to a nearby pine-tree as a lesson to the others. Marsyas' blood turned into the river Marsyas. But Apollo soon repented and being distressed at what he had done, he tore the strings of his lyre and threw it away. The lyre was later discovered by the Muses and Apollo's sons Linus and Orpheus. The Muses fixed the middle string, Linus the string struck with the forefinger, and Orpheus the lowest string and the one next to it. They took it back to Apollo, but the god, who had decided to stay away from music for a while, laid away both the lyre and the pipes at Delphi and joined Cybele in her wanderings to as far as Hyperborea.Contest with CinyrasCinyras was a ruler of Cyprus, who was a friend of Agamemnon. 
Cinyras promised to assist Agamemnon in the Trojan war, but did not keep his promise. Agamemnon cursed Cinyras. He invoked Apollo and asked the god to avenge the broken promise. Apollo then had a lyre-playing contest with Cinyras, and defeated him. Either Cinyras committed suicide when he lost, or was killed by Apollo.Patron of sailorsApollo functions as the patron and protector of sailors, one of the duties he shares with Poseidon. In the myths, he is seen helping heroes who pray to him for safe journey.When Apollo spotted a ship of Cretan sailors that was caught in a storm, he quickly assumed the shape of a dolphin and guided their ship safely to Delphi.When the Argonauts faced a terrible storm, Jason prayed to his patron, Apollo, to help them. Apollo used his bow and golden arrow to shed light upon an island, where the Argonauts soon took shelter. This island was renamed "Anaphe", which means "He revealed it".Apollo helped the Greek hero Diomedes, to escape from a great tempest during his journey homeward. As a token of gratitude, Diomedes built a temple in honor of Apollo under the epithet Epibaterius ("the embarker").During the Trojan War, Odysseus came to the Trojan camp to return Chriseis, the daughter of Apollo's priest Chryses, and brought many offerings to Apollo. Pleased with this, Apollo sent gentle breezes that helped Odysseus return safely to the Greek camp.Arion was a poet who was kidnapped by some sailors for the rich prizes he possessed. Arion requested them to let him sing for the last time, to which the sailors consented. Arion began singing a song in praise of Apollo, seeking the god's help. Consequently, numerous dolphins surrounded the ship and when Arion jumped into the water, the dolphins carried him away safely.WarsTitanomachyOnce Hera, out of spite, aroused the Titans to war against Zeus and take away his throne. 
Accordingly, when the Titans tried to climb Mount Olympus, Zeus with the help of Apollo, Artemis and Athena, defeated them and cast them into Tartarus.Trojan WarApollo played a pivotal role in the entire Trojan War. He sided with the Trojans, and sent a terrible plague to the Greek camp, which indirectly led to the conflict between Achilles and Agamemnon. He killed the Greek heroes Patroclus, Achilles, and numerous Greek soldiers. He also helped many Trojan heroes, the most important one being Hector. After the end of the war, Apollo and Poseidon together cleaned the remains of the city and the camps.Telegony warA war broke out between the Brygoi and the Thesprotians, who had the support of Odysseus. The gods Athena and Ares came to the battlefield and took sides. Athena helped the hero Odysseus while Ares fought alongside the Brygoi. When Odysseus lost, Athena and Ares came into a direct duel. To stop the battling gods and the terror created by their battle, Apollo intervened and stopped the duel between them.Indian warWhen Zeus suggested that Dionysus defeat the Indians in order to earn a place among the gods, Dionysus declared war against the Indians and travelled to India along with his army of Bacchantes and satyrs. Among the warriors was Aristaeus, Apollo's son. Apollo armed his son with his own hands and gave him a bow and arrows and fitted a strong shield to his arm. After Zeus urged Apollo to join the war, he went to the battlefield. Seeing several of his nymphs and Aristaeus drowning in a river, he took them to safety and healed them. He taught Aristaeus more useful healing arts and sent him back to help the army of Dionysus.Theban warDuring the war between the sons of Oedipus, Apollo favored Amphiaraus, a seer and one of the leaders in the war. Though saddened that the seer was fated to be doomed in the war, Apollo made Amphiaraus' last hours glorious by "lighting his shield and his helm with starry gleam". 
When Hypseus tried to kill the hero with a spear, Apollo directed the spear towards the charioteer of Amphiaraus instead. Then Apollo himself replaced the charioteer and took the reins in his hands. He deflected many spears and arrows away from them. He also killed many of the enemy warriors like Melaneus, Antiphus, Aetion, Polites and Lampus. At last when the moment of departure came, Apollo expressed his grief with tears in his eyes and bid farewell to Amphiaraus, who was soon engulfed by the Earth.Slaying of giantsApollo killed the giants Python and Tityos, who had assaulted his mother Leto.GigantomachyDuring the gigantomachy, Apollo and Heracles blinded the giant Ephialtes by shooting him in his eyes, Apollo shooting his left and Heracles his right. He also killed Porphyrion, the king of giants, using his bow and arrows.AloadaeThe Aloadae, namely Otis and Ephialtes, were twin giants who decided to wage war upon the gods. They attempted to storm Mt. Olympus by piling up mountains, and threatened to fill the sea with mountains and inundate dry land. They even dared to seek the hand of Hera and Artemis in marriage. Angered by this, Apollo killed them by shooting them with arrows. According to another tale, Apollo killed them by sending a deer between them; as they tried to kill it with their javelins, they accidentally stabbed each other and died.PhorbasPhorbas was a savage giant king of Phlegyas who was described as having swine-like features. He wished to plunder Delphi for its wealth. He seized the roads to Delphi and started harassing the pilgrims. He captured the old people and children and sent them to his army to hold them for ransom. And he challenged the young and sturdy men to a match of boxing, only to cut their heads off when they would get defeated by him. He hung the chopped-off heads on an oak tree. Finally, Apollo came to put an end to this cruelty. 
He entered a boxing contest with Phorbas and killed him with a single blow.Other storiesIn the first Olympic games, Apollo defeated Ares and became the victor in wrestling. He outran Hermes in the race and won first place.Apollo divides months into summer and winter. He rides on the back of a swan to the land of the Hyperboreans during the winter months, and the absence of warmth in winters is due to his departure. During his absence, Delphi was under the care of Dionysus, and no prophecies were given during winters.Molpadia and Parthenos Molpadia and Parthenos were the sisters of Rhoeo, a former lover of Apollo. One day, they were put in charge of watching their father's ancestral wine jar but they fell asleep while performing this duty. While they were asleep, the wine jar was broken by the swines their family kept. When the sisters woke up and saw what had happened, they threw themselves off a cliff in fear of their father's wrath. Apollo, who was passing by, caught them and carried them to two different cities in Chersonesus, Molpadia to Castabus and Parthenos to Bubastus. He turned them into goddesses and they both received divine honors. Molpadia's name was changed to Hemithea upon her deification.Prometheus Prometheus was the titan who was punished by Zeus for stealing fire. He was bound to a rock, where each day an eagle was sent to eat Prometheus' liver, which would then grow back overnight to be eaten again the next day. Seeing his plight, Apollo pleaded Zeus to release the kind Titan, while Artemis and Leto stood behind him with tears in their eyes. Zeus, moved by Apollo's words and the tears of the goddesses, finally sent Heracles to free Prometheus.The rock of Leukas Leukatas was believed to be a white colored rock jutting out from the island of Leukas into the sea. It was present in the sanctuary of Apollo Leukates. 
A leap from this rock was believed to have put an end to the longings of love. Once, Aphrodite fell deeply in love with Adonis, a young man of great beauty who was later accidentally killed by a boar. Heartbroken, Aphrodite wandered looking for the rock of Leukas. When she reached the sanctuary of Apollo in Argos, she confided in him her love and sorrow. Apollo then brought her to the rock of Leukas and asked her to throw herself from the top of the rock. She did so and was freed from her love. When she sought the reason behind this, Apollo told her that Zeus, before taking another lover, would sit on this rock to free himself from his love for Hera. Another tale relates that a man named Nireus, who fell in love with the cult statue of Athena, came to the rock and jumped in order to relieve himself. After jumping, he fell into the net of a fisherman in which, when he was pulled out, he found a box filled with gold. He fought with the fisherman and took the gold, but Apollo appeared to him in the night in a dream and warned him not to appropriate gold which belonged to others. It was an ancestral custom among the Leukadians to fling a criminal from this rock every year at the sacrifice performed in honor of Apollo for the sake of averting evil. However, a number of men would be stationed all around below the rock to catch the criminal and take him out of the borders in order to exile him from the island. This was the same rock from which, according to a legend, Sappho took her suicidal leap.Female loversLove affairs ascribed to Apollo are a late development in Greek mythology. Their vivid anecdotal qualities have made some of them favorites of painters since the Renaissance, the result being that they stand out more prominently in the modern imagination. Daphne was a nymph who scorned Apollo's advances and ran away from him. When Apollo chased her in order to persuade her, she changed herself into a laurel tree. 
According to other versions, she cried for help during the chase, and Gaia helped her by taking her in and placing a laurel tree in her place. According to Roman poet Ovid, the chase was brought about by Cupid, who hit Apollo with golden arrow of love and Daphne with leaden arrow of hatred. The myth explains the origin of the laurel and connection of Apollo with the laurel and its leaves, which his priestess employed at Delphi. The leaves became the symbol of victory and laurel wreaths were given to the victors of the Pythian games.Apollo is said to have been the lover of all nine Muses, and not being able to choose one of them, decided to remain unwed. He fathered the Corybantes by the Muse Thalia, Orpheus by Calliope, Linus of Thrace by Calliope or Urania and Hymenaios (Hymen) by one of the Muses.Cyrene was a Thessalian princess whom Apollo loved. In her honor, he built the city Cyrene and made her its ruler. She was later granted longevity by Apollo who turned her into a nymph. The couple had two sons, Aristaeus, and Idmon.Evadne was a nymph daughter of Poseidon and a lover of Apollo. She bore him a son, Iamos. During the time of the childbirth, Apollo sent Eileithyia, the goddess of childbirth to assist her.Rhoeo, a princess of the island of Naxos was loved by Apollo. Out of affection for her, Apollo turned her sisters into goddesses. On the island Delos she bore Apollo a son named Anius. Not wanting to have the child, she entrusted the infant to Apollo and left. Apollo raised and educated the child on his own.Ourea, a daughter of Poseidon, fell in love with Apollo when he and Poseidon were serving the Trojan king Laomedon. They both united on the day the walls of Troy were built. She bore to Apollo a son, whom Apollo named Ileus, after the city of his birth, Ilion (Troy). Ileus was very dear to Apollo.Thero, daughter of Phylas, a maiden as beautiful as the moonbeams, was loved by the radiant Apollo, and she loved him in return. 
By their union, she became mother of Chaeron, who was famed as "the tamer of horses". He later built the city Chaeronea. Hyrie or Thyrie was the mother of Cycnus. Apollo turned both the mother and son into swans when they jumped into a lake and tried to kill themselves. Hecuba was the wife of King Priam of Troy, and Apollo had a son with her named Troilus. An oracle prophesied that Troy would not be defeated if Troilus reached the age of twenty alive. He was ambushed and killed by Achilles, and Apollo avenged his death by killing Achilles. After the sack of Troy, Hecuba was taken to Lycia by Apollo. Coronis was daughter of Phlegyas, King of the Lapiths. While pregnant with Asclepius, Coronis fell in love with Ischys, son of Elatus and slept with him. When Apollo found out about her infidelity through his prophetic powers, he sent his sister, Artemis, to kill Coronis. Apollo rescued the baby by cutting open Coronis' belly and gave it to the centaur Chiron to raise. Dryope, the daughter of Dryops, was impregnated by Apollo in the form of a snake. She gave birth to a son named Amphissus. In Euripides' play Ion, Apollo fathered Ion by Creusa, wife of Xuthus. He used his powers to conceal her pregnancy from her father. Later, when Creusa left Ion to die in the wild, Apollo asked Hermes to save the child and bring him to the oracle at Delphi, where he was raised by a priestess.Male loversHyacinth (or Hyacinthus), a beautiful and athletic Spartan prince, was one of Apollo's favourite lovers. The pair was practicing throwing the discus when a discus thrown by Apollo was blown off course by the jealous Zephyrus and struck Hyacinthus in the head, killing him instantly. Apollo is said to have been filled with grief. Out of Hyacinthus' blood, Apollo created a flower named after him as a memorial to his death, and his tears stained the flower petals with the interjection αἰαῖ, meaning alas. He was later resurrected and taken to heaven. 
The festival Hyacinthia was a national celebration of Sparta, which commemorated the death and rebirth of Hyacinthus. Another male lover was Cyparissus, a descendant of Heracles. Apollo gave him a tame deer as a companion but Cyparissus accidentally killed it with a javelin as it lay asleep in the undergrowth. Cyparissus was so saddened by its death that he asked Apollo to let his tears fall forever. Apollo granted the request by turning him into the cypress named after him, which was said to be a sad tree because the sap forms droplets like tears on the trunk. Admetus, the king of Pherae, was also Apollo's lover. During his exile, which lasted either for one year or nine years, Apollo served Admetus as a herdsman. The romantic nature of their relationship was first described by Callimachus of Alexandria, who wrote that Apollo was "fired with love" for Admetus. Plutarch lists Admetus as one of Apollo's lovers and says that Apollo served Admetus because he doted upon him. Latin poet Ovid in his Ars Amatoria said that even though he was a god, Apollo forsook his pride and stayed in as a servant for the sake of Admetus. Tibullus describes Apollo's love for the king as servitium amoris (slavery of love) and asserts that Apollo became his servant not by force but by choice. He would also make cheese and serve it to Admetus. His domestic actions caused embarrassment to his family. When Admetus wanted to marry princess Alcestis, Apollo provided a chariot pulled by a lion and a boar he had tamed. This satisfied Alcestis' father and he let Admetus marry his daughter. Further, Apollo saved the king from Artemis' wrath and also convinced the Moirai to postpone Admetus' death once. Branchus, a shepherd, one day came across Apollo in the woods. Captivated by the god's beauty, he kissed Apollo. Apollo requited his affections and wanting to reward him, bestowed prophetic skills on him. 
His descendants, the Branchides, were an influential clan of prophets.Other male lovers of Apollo include:Adonis, who is said to have been the lover of both Apollo and Aphrodite. He behaved as a man with Aphrodite and as a woman with Apollo.Atymnius, otherwise known as a beloved of SarpedonBoreas, the god of North windsHelenus, the son of Priam and a Trojan Prince, was a lover of Apollo and received from him an ivory bow with which he later wounded Achilles in the hand.Hippolytus of Sicyon (not the same as Hippolytus, the son of Theseus)Hymenaios, the son of MagnesIapis, to whom Apollo taught the art of healingPhorbas, the dragon slayer (probably the son of Triopas)ChildrenApollo sired many children, from mortal women and nymphs as well as the goddesses. His children grew up to be physicians, musicians, poets, seers or archers. Many of his sons founded new cities and became kings. They were all usually very beautiful.Asclepius is the most famous son of Apollo. His skills as a physician surpassed that of Apollo's. Zeus killed him for bringing back the dead, but upon Apollo's request, he was resurrected as a god. Aristaeus was placed under the care of Chiron after his birth. He became the god of beekeeping, cheese making, animal husbandry and more. He was ultimately given immortality for the benefits he bestowed upon the humanity. The Corybantes were spear-clashing, dancing demigods.The sons of Apollo who participated in the Trojan War include the Trojan princes Hector and Troilus, as well as Tenes, the king of Tenedos, all three of whom were killed by Achilles over the course of the war.Apollo's children who became musicians and bards include Orpheus, Linus, Ialemus, Hymenaeus, Philammon, Eumolpus and Eleuther. Apollo fathered 3 daughters, Apollonis, Borysthenis and Cephisso, who formed a group of minor Muses, the "Musa Apollonides". They were nicknamed Nete, Mese and Hypate after the highest, middle and lowest strings of his lyre. 
Phemonoe was a seer and a poetess who was the inventor of Hexameter.Apis, Idmon, Iamus, Tenerus, Mopsus, Galeus, Telmessus and others were gifted seers. Anius, Pythaeus and Ismenus lived as high priests. Most of them were trained by Apollo himself.Arabus, Delphos, Dryops, Miletos, Tenes, Epidaurus, Ceos, Lycoras, Syrus, Pisus, Marathus, Megarus, Patarus, Acraepheus, Cicon, Chaeron and many other sons of Apollo, under the guidance of his words, founded eponymous cities.He also had a son named Chrysorrhoas who was a mechanic artist. His other daughters include Eurynome, Chariclo wife of Chiron, Eurydice the wife of Orpheus, Eriopis, famous for her beautiful hair, Melite the heroine, Pamphile the silk weaver, Parthenos, and by some accounts, Phoebe, Hilyra and Scylla. Apollo turned Parthenos into a constellation after her early death.Additionally, Apollo fostered and educated Chiron, the centaur who later became the greatest teacher and educated many demigods, including Apollo's sons. Apollo also fostered Carnus, the son of Zeus and Europa.Failed love attemptsMarpessa was kidnapped by Idas but was loved by Apollo as well. Zeus made her choose between them, and she chose Idas on the grounds that Apollo, being immortal, would tire of her when she grew old.Sinope, a nymph, was approached by the amorous Apollo. She made him promise that he would grant to her whatever she would ask for, and then cleverly asked him to let her stay a virgin. Apollo kept his promise and went back.Bolina was admired by Apollo but she refused him and jumped into the sea. To avoid her death, Apollo turned her into a nymph and let her go.Castalia was a nymph whom Apollo loved. She fled from him and dove into the spring at Delphi, at the base of Mt. Parnassos, which was then named after her. Water from this spring was sacred; it was used to clean the Delphian temples and inspire the priestesses.Cassandra, was a daughter of Hecuba and Priam. Apollo wished to court her. 
Cassandra promised to return his love on one condition - he should give her the power to see the future. Apollo fulfilled her wish, but she went back on her word and rejected him soon after. Angered that she broke her promise, Apollo cursed her that even though she would see the future, no one would ever believe her prophecies.Hestia, the goddess of the hearth, rejected both Apollo's and Poseidon's marriage proposals and swore that she would always stay unmarried.Female counterpartsArtemisArtemis as the sister of Apollo, is thea apollousa, that is, she as a female divinity represented the same idea that Apollo did as a male divinity. In the pre-Hellenic period, their relationship was described as the one between husband and wife, and there seems to have been a tradition which actually described Artemis as the wife of Apollo. However, this relationship was never sexual but spiritual, which is why they both are seen being unmarried in the Hellenic period.Artemis, like her brother, is armed with a bow and arrows. She is the cause of sudden deaths of women. She also is the protector of the young, especially girls. Though she has nothing to do with oracles, music or poetry, she sometimes led the female chorus on Olympus while Apollo sang. The laurel (daphne) was sacred to both. Artemis Daphnaia had her temple among the Lacedemonians, at a place called Hypsoi. Apollo Daphnephoros had a temple in Eretria, a "place where the citizens are to take the oaths". In later times when Apollo was regarded as identical with the sun or Helios, Artemis was naturally regarded as Selene or the moon.HecateHecate, the goddess of witchcraft and magic, is the chthonic counterpart of Apollo. They both are cousins, since their mothers - Leto and Asteria - are sisters. One of Apollo's epithets, Hecatos, is the masculine form of Hecate, and both the names mean "working from afar". 
While Apollo presided over the prophetic powers and magic of light and heaven, Hecate presided over the prophetic powers and magic of night and chthonian darkness. If Hecate is the "gate-keeper", Apollo Agyieus is the "door-keeper". Hecate is the goddess of crossroads and Apollo is the god and protector of streets.The oldest evidence found for Hecate's worship is at Apollo's temple in Miletos. There, Hecate was taken to be Apollo's sister counterpart in the absence of Artemis. Hecate's lunar nature makes her the goddess of the waning moon and contrasts and complements, at the same time, Apollo's solar nature.AthenaAs a deity of knowledge and great power, Apollo was seen being the male counterpart of Athena. Being Zeus' favorite children, they were given more powers and duties. Apollo and Athena often took up the role as protectors of cities, and were patrons of some of the important cities. Athena was the principal goddess of Athens, Apollo was the principal god of Sparta.As patrons of arts, Apollo and Athena were companions of the Muses, the former a much more frequent companion than the latter. Apollo was sometimes called the son of Athena and Hephaestus.In the Trojan war, as Zeus' executive, Apollo is seen holding the aegis like Athena usually does. Apollo's decisions were usually approved by his sister Athena, and they both worked to establish the law and order set forth by Zeus.Apollo in the OresteiaIn Aeschylus' Oresteia trilogy, Clytemnestra kills her husband, King Agamemnon because he had sacrificed their daughter Iphigenia to proceed forward with the Trojan war. Apollo gives an order through the Oracle at Delphi that Agamemnon's son, Orestes, is to kill Clytemnestra and Aegisthus, her lover. 
Orestes and Pylades carry out the revenge, and consequently Orestes is pursued by the Erinyes or Furies (female personifications of vengeance).Apollo and the Furies argue about whether the matricide was justified; Apollo holds that the bond of marriage is sacred and Orestes was avenging his father, whereas the Erinyes say that the bond of blood between mother and son is more meaningful than the bond of marriage. They invade his temple, and he drives them away. He says that the matter should be brought before Athena. Apollo promises to protect Orestes, as Orestes has become Apollo's supplicant. Apollo advocates Orestes at the trial, and ultimately Athena rules in favor of Apollo.Roman ApolloThe Roman worship of Apollo was adopted from the Greeks. As a quintessentially Greek god, Apollo had no direct Roman equivalent, although later Roman poets often referred to him as Phoebus. There was a tradition that the Delphic oracle was consulted as early as the period of the kings of Rome during the reign of Tarquinius Superbus.On the occasion of a pestilence in the 430s BCE, Apollo's first temple at Rome was established in the Flaminian fields, replacing an older cult site there known as the "Apollinare". During the Second Punic War in 212 BCE, the Ludi Apollinares ("Apollonian Games") were instituted in his honor, on the instructions of a prophecy attributed to one Marcius. In the time of Augustus, who considered himself under the special protection of Apollo and was even said to be his son, his worship developed and he became one of the chief gods of Rome.After the battle of Actium, which was fought near a sanctuary of Apollo, Augustus enlarged Apollo's temple, dedicated a portion of the spoils to him, and instituted quinquennial games in his honour. He also erected a new temple to the god on the Palatine hill. 
Sacrifices and prayers on the Palatine to Apollo and Diana formed the culmination of the Secular Games, held in 17 BCE to celebrate the dawn of a new era.FestivalsThe chief Apollonian festival was the Pythian Games held every four years at Delphi and was one of the four great Panhellenic Games. Also of major importance was the Delia held every four years on Delos.Athenian annual festivals included the Boedromia, Metageitnia, Pyanepsia, and Thargelia.Spartan annual festivals were the Carneia and the Hyacinthia.Thebes every nine years held the Daphnephoria.Attributes and symbolsApollo's most common attributes were the bow and arrow. Other attributes of his included the kithara (an advanced version of the common lyre), the plectrum and the sword. Another common emblem was the sacrificial tripod, representing his prophetic powers. The Pythian Games were held in Apollo's honor every four years at Delphi. The bay laurel plant was used in expiatory sacrifices and in making the crown of victory at these games.The palm tree was also sacred to Apollo because he had been born under one in Delos. Animals sacred to Apollo included wolves, dolphins, roe deer, swans, cicadas (symbolizing music and song), ravens, hawks, crows (Apollo had hawks and crows as his messengers), snakes (referencing Apollo's function as the god of prophecy), mice and griffins, mythical eagle–lion hybrids of Eastern origin.Homer and Porphyry wrote that Apollo had a hawk as his messenger. In many myths Apollo is transformed into a hawk. In addition, Claudius Aelianus wrote that in Ancient Egypt people believed that hawks were sacred to the god and that according to the ministers of Apollo in Egypt there were certain men called "hawk-keepers" (ἱερακοβοσκοί) who fed and tended the hawks belonging to the god. Eusebius wrote that the second appearance of the moon is held sacred in the city of Apollo in Egypt and that the city's symbol is a man with a hawklike face (Horus). 
Claudius Aelianus wrote that Egyptians called Apollo Horus in their own language.As god of colonization, Apollo gave oracular guidance on colonies, especially during the height of colonization, 750–550 BCE. According to Greek tradition, he helped Cretan or Arcadian colonists found the city of Troy. However, this story may reflect a cultural influence which had the reverse direction: Hittite cuneiform texts mention an Asia Minor god called Appaliunas or Apalunas in connection with the city of Wilusa attested in Hittite inscriptions, which is now generally regarded as being identical with the Greek Ilion by most scholars. In this interpretation, Apollo's title of Lykegenes can simply be read as "born in Lycia", which effectively severs the god's supposed link with wolves (possibly a folk etymology).In literary contexts, Apollo represents harmony, order, and reason—characteristics contrasted with those of Dionysus, god of wine, who represents ecstasy and disorder. The contrast between the roles of these gods is reflected in the adjectives Apollonian and Dionysian. However, the Greeks thought of the two qualities as complementary: the two gods are brothers, and when Apollo at winter left for Hyperborea, he would leave the Delphic oracle to Dionysus. This contrast appears to be shown on the two sides of the Borghese Vase.Apollo is often associated with the Golden Mean. This is the Greek ideal of moderation and a virtue that opposes gluttony.Apollo in the artsApollo is a common theme in Greek and Roman art and also in the art of the Renaissance. The earliest Greek word for a statue is "delight" (ἄγαλμα, agalma), and the sculptors tried to create forms which would inspire such guiding vision. Greek art puts into Apollo the highest degree of power and beauty that can be imagined. 
The sculptors derived this from observations on human beings, but they also embodied in concrete form, issues beyond the reach of ordinary thought.The naked bodies of the statues are associated with the cult of the body that was essentially a religious activity. The muscular frames and limbs combined with slim waists indicate the Greek desire for health, and the physical capacity which was necessary in the hard Greek environment. The statues of Apollo embody beauty, balance and inspire awe before the beauty of the world.Archaic sculptureNumerous free-standing statues of male youths from Archaic Greece exist, and were once thought to be representations of Apollo, though later discoveries indicated that many represented mortals. In 1895, V. I. Leonardos proposed the term kouros ("male youth") to refer to those from Keratea; this usage was later expanded by Henri Lechat in 1904 to cover all statues of this format.The earliest examples of life-sized statues of Apollo may be two figures from the Ionic sanctuary on the island of Delos. Such statues were found across the Greek speaking world, the preponderance of these were found at the sanctuaries of Apollo with more than one hundred from the sanctuary of Apollo Ptoios, Boeotia alone. Significantly more rare are the life-sized bronze statues. One of the few originals which survived into the present day—so rare that its discovery in 1959 was described as "a miracle" by Ernst Homann-Wedeking—is the masterpiece bronze, Piraeus Apollo. It was found in Piraeus, a port city close to Athens, and is believed to have come from north-eastern Peloponnesus. It is the only surviving large-scale Peloponnesian statue.Classical sculptureThe famous Apollo of Mantua and its variants are early forms of the Apollo Citharoedus statue type, in which the god holds the cithara, a sophisticated seven-stringed variant of the lyre, in his left arm. 
While none of the Greek originals have survived, several Roman copies from approximately the late 1st or early 2nd century exist.Other notable forms are the Apollo Citharoedus and the Apollo Barberini.Hellenistic Greece-RomeApollo as a handsome beardless young man, is often depicted with a cithara (as Apollo Citharoedus) or bow in his hand, or reclining on a tree (the Apollo Lykeios and Apollo Sauroctonos types). The Apollo Belvedere is a marble sculpture that was rediscovered in the late 15th century; for centuries it epitomized the ideals of Classical Antiquity for Europeans, from the Renaissance through the 19th century. The marble is a Hellenistic or Roman copy of a bronze original by the Greek sculptor Leochares, made between 350 and 325 BCE.The life-size so-called "Adonis" found in 1780 on the site of a villa suburbana near the Via Labicana in the Roman suburb of Centocelle is identified as an Apollo by modern scholars. In the late 2nd century CE floor mosaic from El Djem, Roman Thysdrus, he is identifiable as Apollo Helios by his effulgent halo, though now even a god's divine nakedness is concealed by his cloak, a mark of increasing conventions of modesty in the later Empire.Another haloed Apollo in mosaic, from Hadrumentum, is in the museum at Sousse. The conventions of this representation, head tilted, lips slightly parted, large-eyed, curling hair cut in locks grazing the neck, were developed in the 3rd century BCE to depict Alexander the Great. Some time after this mosaic was executed, the earliest depictions of Christ would also be beardless and haloed.Modern receptionApollo often appears in modern and popular culture due to his status as the god of music, dance and poetry.Postclassical art and literatureDance and music Apollo has featured in dance and music in modern culture. Percy Bysshe Shelley composed a "Hymn of Apollo" (1820), and the god's instruction of the Muses formed the subject of Igor Stravinsky's Apollon musagète (1927–1928). 
In 1978, the Canadian band Rush released an album with songs "Apollo: Bringer of Wisdom"/"Dionysus: Bringer of Love".Books Apollo has been portrayed in modern literature, such as when Charles Handy, in Gods of Management (1978) uses Greek gods as a metaphor to portray various types of organizational culture. Apollo represents a 'role' culture where order, reason, and bureaucracy prevail. In 2016, author Rick Riordan published the first book in the Trials of Apollo series, publishing four other books in the series in 2017, 2018, 2019 and 2020.Film Apollo has been depicted in modern films—for instance, by Keith David in the 1997 animated feature film Hercules, by Luke Evans in the 2010 action film Clash of the Titans, and by Dimitri Lekkos in the 2010 film Percy Jackson & the Olympians: The Lightning Thief.Video games Apollo has appeared in many modern video games. Apollo appears as a minor character in Santa Monica Studio's 2010 action-adventure game God of War III with his bow being used by Peirithous. He also appears in the 2014 Hi-Rez Studios Multiplayer Online Battle Arena game Smite as a playable character.Psychology and philosophy In philosophical discussion of the arts, a distinction is sometimes made between the Apollonian and Dionysian impulses where the former is concerned with imposing intellectual order and the latter with chaotic creativity. Friedrich Nietzsche argued that a fusion of the two was most desirable. 
Psychologist Carl Jung's Apollo archetype represents what he saw as the disposition in people to over-intellectualise and maintain emotional distance.Spaceflight In spaceflight, the 1960s and 1970s NASA program for orbiting and landing astronauts on the Moon was named after Apollo, by NASA manager Abe Silverstein: "Apollo riding his chariot across the Sun was appropriate to the grand scale of the proposed program."GenealogySee alsoFamily tree of the Greek godsDryadEpirusPhoebus (disambiguation)Sibylline oraclesTegyraTemple of Apollo (disambiguation)NotesReferencesSourcesPrimary sources Aelian, On Animals, Volume II: Books 6-11. Translated by A. F. Scholfield. Loeb Classical Library 447. Cambridge, MA: Harvard University Press, 1958. Aeschylus, The Eumenides in Aeschylus, with an English translation by Herbert Weir Smyth, Ph. D. in two volumes, Vol 2, Cambridge, Massachusetts, Harvard University Press, 1926, Online version at the Perseus Digital Library. Antoninus Liberalis, The Metamorphoses of Antoninus Liberalis translated by Francis Celoria (Routledge 1992). Online version at the Topos Text Project. Apollodorus, Apollodorus, The Library, with an English Translation by Sir James George Frazer, F.B.A., F.R.S. in 2 Volumes. Cambridge, MA, Harvard University Press; London, William Heinemann Ltd. 1921. Online version at the Perseus Digital Library. Apollonius of Rhodes, Apollonius Rhodius: the Argonautica, translated by Robert Cooper Seaton, W. Heinemann, 1912. Internet Archive. Callimachus, Callimachus and Lycophron with an English Translation by A. W. Mair; Aratus, with an English Translation by G. R. Mair, London: W. Heinemann, New York: G. P. Putnam 1921. Online version at Harvard University Press. Internet Archive. Cicero, Marcus Tullius, De Natura Deorum in Cicero in Twenty-eight Volumes, XIX De Natura Deorum; Academica, with an english translation by H. Rackham, Cambridge, Massachusetts: Harvard University Press; London: William Heinemann, Ltd, 1967.  
Internet Archive. Diodorus Siculus, Library of History, Volume III: Books 4.59-8, translated by C. H. Oldfather, Loeb Classical Library No. 340. Cambridge, Massachusetts, Harvard University Press, 1939. . Online version at Harvard University Press. Online version by Bill Thayer. Herodotus, Herodotus, with an English translation by A. D. Godley. Cambridge. Harvard University Press. 1920. Online version available at The Perseus Digital Library. Hesiod, Theogony, in The Homeric Hymns and Homerica with an English Translation by Hugh G. Evelyn-White, Cambridge, MA., Harvard University Press; London, William Heinemann Ltd. 1914. Online version at the Perseus Digital Library. Homeric Hymn 3 to Apollo in The Homeric Hymns and Homerica with an English Translation by Hugh G. Evelyn-White, Cambridge, MA., Harvard University Press; London, William Heinemann Ltd. 1914. Online version at the Perseus Digital Library. Homeric Hymn 4 to Hermes, in The Homeric Hymns and Homerica with an English Translation by Hugh G. Evelyn-White, Cambridge, Massachusetts, Harvard University Press; London, William Heinemann Ltd. 1914. Online version at the Perseus Digital Library. Homer, The Iliad with an English Translation by A.T. Murray, PhD in two volumes. Cambridge, MA., Harvard University Press; London, William Heinemann, Ltd. 1924. Online version at the Perseus Digital Library. Homer; The Odyssey with an English Translation by A.T. Murray, PH.D. in two volumes. Cambridge, MA., Harvard University Press; London, William Heinemann, Ltd. 1919. Online version at the Perseus Digital Library. Hyginus, Gaius Julius, De Astronomica, in The Myths of Hyginus, edited and translated by Mary A. Grant, Lawrence: University of Kansas Press, 1960. Online version at ToposText. Hyginus, Gaius Julius, Fabulae, in The Myths of Hyginus, edited and translated by Mary A. Grant, Lawrence: University of Kansas Press, 1960. Online version at ToposText. 
Livy, The History of Rome, Books I and II With An English Translation. Cambridge. Cambridge, Mass., Harvard University Press; London, William Heinemann, Ltd. 1919. Nonnus, Dionysiaca; translated by Rouse, W H D, I Books I-XV. Loeb Classical Library No. 344, Cambridge, Massachusetts, Harvard University Press; London, William Heinemann Ltd. 1940. Internet Archive Nonnus, Dionysiaca; translated by Rouse, W H D, II Books XVI-XXXV. Loeb Classical Library No. 345, Cambridge, Massachusetts, Harvard University Press; London, William Heinemann Ltd. 1940. Internet Archive Statius, Thebaid. Translated by Mozley, J H. Loeb Classical Library Volumes. Cambridge, Massachusetts, Harvard University Press; London, William Heinemann Ltd. 1928. Strabo, The Geography of Strabo. Edition by H.L. Jones. Cambridge, Mass.: Harvard University Press; London: William Heinemann, Ltd. 1924. Online version at the Perseus Digital Library. Sophocles, Oedipus Rex Palaephatus, On Unbelievable Tales 46. Hyacinthus (330 BCE) Ovid, Metamorphoses, Brookes More, Boston, Cornhill Publishing Co. 1922. Online version at the Perseus Digital Library. 10. 162–219 (1–8 CE) Pausanias, Pausanias Description of Greece with an English Translation by W.H.S. Jones, Litt.D., and H.A. Ormerod, M.A., in 4 Volumes. Cambridge, MA, Harvard University Press; London, William Heinemann Ltd. 1918. Online version at the Perseus Digital Library. Philostratus the Elder, Imagines, in Philostratus the Elder, Imagines. Philostratus the Younger, Imagines. Callistratus, Descriptions. Translated by Arthur Fairbanks. Loeb Classical Library No. 256. Cambridge, Massachusetts: Harvard University Press, 1931.  . Online version at Harvard University Press.  Internet Archive 1926 edition. i.24 Hyacinthus (170–245 CE) Philostratus the Younger, Imagines, in Philostratus the Elder, Imagines. Philostratus the Younger, Imagines. Callistratus, Descriptions. Translated by Arthur Fairbanks. Loeb Classical Library No. 256. 
Cambridge, Massachusetts: Harvard University Press, 1931.  . Online version at Harvard University Press. Internet Archive 1926 edition. 14. Hyacinthus (170–245 CE) Pindar, Odes, Diane Arnson Svarlien. 1990. Online version at the Perseus Digital Library. Plutarch. Lives, Volume I: Theseus and Romulus. Lycurgus and Numa. Solon and Publicola. Translated by Bernadotte Perrin. Loeb Classical Library No. 46. Cambridge, Massachusetts: Harvard University Press, 1914. . Online version at Harvard University Press. Numa at the Perseus Digital Library. Pseudo-Plutarch, De fluviis, in Plutarch's morals, Volume V, edited and translated by William Watson Goodwin, Boston: Little, Brown & Co., 1874. Online version at the Perseus Digital Library. Lucian, Dialogues of the Dead. Dialogues of the Sea-Gods. Dialogues of the Gods. Dialogues of the Courtesans, translated by M. D. MacLeod, Loeb Classical Library No. 431, Cambridge, Massachusetts, Harvard University Press, 1961. . Online version at Harvard University Press. Internet Archive. First Vatican Mythographer, 197. Thamyris et Musae Tzetzes, John, Chiliades, editor Gottlieb Kiessling, F.C.G. Vogel, 1826. Google Books. (English translation: Book I by Ana Untila; Books II–IV, by Gary Berkowitz; Books V–VI by Konstantino Ramiotis; Books VII–VIII by Vasiliki Dogani; Books IX–X by Jonathan Alexander; Books XII–XIII by Nikolaos Giallousis. Internet Archive). Valerius Flaccus, Argonautica, translated by J. H. Mozley, Loeb Classical Library No. 286. Cambridge, Massachusetts, Harvard University Press; London, William Heinemann Ltd. 1928. . Online version at Harvard University Press. Online translated text available at theoi.com. Vergil, Aeneid. Theodore C. Williams. trans. Boston. Houghton Mifflin Co. 1910. Online version at the Perseus Digital Library.Secondary sources Athanassakis, Apostolos N., and Benjamin M. Wolkow, The Orphic Hymns, Johns Hopkins University Press; First Printing edition (May 29, 2013). . Google Books. M. 
Bieber, 1964. Alexander the Great in Greek and Roman Art. Chicago. Hugh Bowden, 2005. Classical Athens and the Delphic Oracle: Divination and Democracy. Cambridge University Press. Walter Burkert, 1985. Greek Religion (Harvard University Press) III.2.5 passim  Fontenrose, Joseph Eddy, Python: A Study of Delphic Myth and Its Origins, University of California Press, 1959. . Gantz, Timothy, Early Greek Myth: A Guide to Literary and Artistic Sources, Johns Hopkins University Press, 1996, Two volumes:  (Vol. 1),  (Vol. 2).  Miranda J. Green, 1997. Dictionary of Celtic Myth and Legend, Thames and Hudson. Grimal, Pierre, The Dictionary of Classical Mythology, Wiley-Blackwell, 1996. . Hard, Robin, The Routledge Handbook of Greek Mythology: Based on H.J. Rose's "Handbook of Greek Mythology", Psychology Press, 2004, . Google Books. Karl Kerenyi, 1953. Apollon: Studien über Antiken Religion und Humanität revised edition.  Kerényi, Karl 1951, The Gods of the Greeks,  Thames and Hudson, London. Mertens, Dieter; Schutzenberger, Margareta. Città e monumenti dei Greci d'Occidente: dalla colonizzazione alla crisi di fine V secolo a.C.. Roma L'Erma di Bretschneider, 2006. . Martin Nilsson, 1955. Die Geschichte der Griechische Religion, vol. I. C.H. Beck. Parada, Carlos, Genealogical Guide to Greek Mythology, Jonsered, Paul Åströms Förlag, 1993. . Pauly–Wissowa, Realencyclopädie der klassischen Altertumswissenschaft: II, "Apollon". The best repertory of cult sites (Burkert). Peck, Harry Thurston, Harpers Dictionary of Classical Antiquities, New York. Harper and Brothers. 1898. Online version at the Perseus Digital Library. Pfeiff, K.A., 1943. Apollon: Wandlung seines Bildes in der griechischen Kunst. Traces the changing iconography of Apollo. D.S.Robertson (1945) A handbook of Greek and Roman Architecture Cambridge University Press Smith, William; Dictionary of Greek and Roman Biography and Mythology, London (1873). 
"Apollo"  Smith, William, A Dictionary of Greek and Roman Antiquities. William Smith, LLD. William Wayte. G. E. Marindin. Albemarle Street, London. John Murray. 1890. Online version at the Perseus Digital Library. Spivey Nigel (1997) Greek art Phaedon Press Ltd.External links Apollo at the Greek Mythology Link, by Carlos Parada The Warburg Institute Iconographic Database: ca 1650 images of ApolloBeauty godsHealth godsKnowledge godsLight deitiesMaintenance deitiesMusic and singing godsOracular godsSolar godsGreek godsRoman godsDragonslayersMythological Greek archersMythological rapistsHomosexuality and bisexuality deitiesDivine twinsDeities in the IliadMetamorphoses charactersCharacters in Greek mythology LGBT themes in Greek mythologyChildren of ZeusCharacters in the OdysseyCharacters in the Argonautica
+Andre Kirk Agassi ( ; born April 29, 1970) is an American former world No. 1 tennis player. He is an eight-time major champion and a 1996 Olympic gold medalist, as well as a runner-up in seven other Grand Slam tournaments.Agassi was the first man to win four Australian Open singles titles in the Open Era (though later surpassed by Novak Djokovic, who won his fifth title in 2015 and has since won the tournament nine times). Agassi is the second of five men to achieve the career Grand Slam in the Open Era and the fifth of eight overall to make the achievement. He is also the first of two men to achieve the career Golden Slam (career Grand Slam and Olympic gold medal), and the only man to win a career Super Slam (career Grand Slam, plus the Olympic gold medal and the year-end championships).Agassi was the first man to win all four singles majors on three different surfaces (hard, clay and grass), and remains the most recent American man to win the French Open (in 1999) and the Australian Open (in 2003). He also won 17 ATP Masters Series titles and was part of the winning Davis Cup teams in 1990, 1992 and 1995. Agassi reached the world No. 1 ranking for the first time in 1995 but was troubled by personal issues during the mid-to-late 1990s and sank to No. 141 in 1997, prompting many to believe that his career was over. Agassi returned to No. 1 in 1999 and enjoyed the most successful run of his career over the next four years. During his 20-plus year tour career, Agassi was known by the nickname "The Punisher".After suffering from sciatica caused by two bulging discs in his back, a spondylolisthesis (vertebral displacement) and a bone spur that interfered with the nerve, Agassi retired from professional tennis on September 3, 2006, after losing in the third round of the US Open. He is the founder of the Andre Agassi Charitable Foundation, which has raised over $60 million for at-risk children in Southern Nevada. 
In 2001, the Foundation opened the Andre Agassi College Preparatory Academy in Las Vegas, a K–12 public charter school for at-risk children. He has been married to fellow tennis player Steffi Graf since 2001.1970–1985: Early lifeAndre Agassi was born in Las Vegas, Nevada, to Emmanuel "Mike" Agassi, a former Olympic boxer from Iran and American Elizabeth "Betty" Agassi (née Dudley). His father is of Armenian and Assyrian heritage. Andre Agassi's mother, Betty, is a breast cancer survivor. He has three older siblings – Rita (last wife of former number one Pancho Gonzales), Philip and Tami. Andre was given the middle name Kirk after Kirk Kerkorian, an Armenian American billionaire. Emmanuel Agassi, then a waiter at Tropicana Las Vegas, had met Kerkorian in 1963.At the age of 12, Agassi and his good friend and doubles partner, Roddy Parks, won the 1982 National Indoor Boys 14s Doubles Championship in Chicago. Agassi describes memorable experiences and juvenile pranks with Roddy in his book Open.When he was 13, Agassi was sent to Nick Bollettieri's Tennis Academy in Florida. He was meant to stay for only three months, because that was all his father could afford. After thirty minutes of watching Agassi play, Bollettieri, deeply impressed by his talent, called Mike and said: "Take your check back. He's here for free." Agassi then dropped out of school in the ninth grade to pursue a full-time tennis career.1986–2006: Professional career1986–1993: Breakthrough and the first major titleAgassi turned professional at the age of 16 and competed in his first tournament at La Quinta, California. He won his first match against John Austin, but then lost his second match to Mats Wilander. By the end of 1986, Agassi was ranked No. 91. He won his first top-level singles title in 1987 at the Sul American Open in Itaparica and ended the year ranked No. 25. He won six additional tournaments in 1988 (Memphis, U.S. 
Men's Clay Court Championships, Forest Hills WCT, Stuttgart Outdoor, Volvo International and Livingston Open), and, by December of that year, he had surpassed US$1 million in career prize money after playing in just 43 tournaments—the fastest anyone in history had reached that level. During 1988, he also set the open-era record for most consecutive victories by a male teenager (a record that stood for 17 years until Rafael Nadal broke it in 2005). His year-end ranking was No. 3, behind second-ranked Ivan Lendl and top-ranked Mats Wilander. Both the Association of Tennis Professionals and Tennis magazine named Agassi the Most Improved Player of the Year for 1988.In addition to not playing the Australian Open (which later became his best Grand Slam event) for the first eight years of his career, Agassi chose not to play at Wimbledon from 1988 through 1990 and publicly stated that he did not wish to play there because of the event's traditionalism, particularly its "predominantly white" dress code to which players at the event are required to conform.Strong performances on the tour meant that Agassi was quickly tipped as a future Grand Slam champion. While still a teenager, he reached the semi-finals of both the French Open and the US Open in 1988 and made the US Open semi-finals in 1989. He began the 1990s with a series of near-misses. He reached his first Grand Slam final in 1990 at the French Open, where he was favored before losing in four sets to Andrés Gómez, which he later attributed in his book to worrying about his wig falling off during the match. He reached his second Grand Slam final of the year at the US Open, defeating defending champion Boris Becker in the semi-finals. His opponent in the final was Pete Sampras; a year earlier, Agassi had crushed Sampras, after which time he told his coach that he felt bad for Sampras because he was never going to make it as a pro. Agassi lost the US Open final to Sampras in three sets. 
The rivalry between these two American players became the biggest one in tennis over the rest of the decade. Agassi ended 1990 on a high note as he helped the United States win its first Davis Cup in 8 years and won his only Tennis Masters Cup, beating reigning Wimbledon champion Stefan Edberg in the final.In 1991, Agassi reached his second consecutive French Open final, where he faced fellow Bollettieri Academy alumnus Jim Courier. Courier emerged the victor in a five-set final. Agassi decided to play at Wimbledon in 1991, leading to weeks of speculation in the media about the clothes he would wear. He eventually emerged for the first round in a completely white outfit. He reached the quarterfinals on that occasion, losing in five sets to David Wheaton.Agassi's Grand Slam tournament breakthrough came at Wimbledon, not at the French Open or the US Open, where he had previously enjoyed success. In 1992, he defeated Goran Ivanišević in a five-set final. Along the way, Agassi overcame two former Wimbledon champions: Boris Becker and John McEnroe. No other baseliner would triumph at Wimbledon until Lleyton Hewitt ten years later. Agassi was named the BBC Overseas Sports Personality of the Year in 1992. Agassi once again played on the United States' Davis Cup winning team in 1992. It was their second Davis cup title in three years. Agassi famously played the game wearing Oakley brand sunglasses, and a photo of him from the day appeared on the cover of Tennis magazine. In his memoir, he wrote that he was covering up bloodshot eyes from a hangover and claimed that the founder of Oakley, Jim Jannard, had sent him a Dodge Viper to thank him for the inadvertent publicity.In 1993, Agassi won the only doubles title of his career, at the Cincinnati Masters, partnered with Petr Korda. He missed much of the early part of that year due to injuries. Although he made the quarterfinals in his Wimbledon title defense, he lost to eventual champion and No. 1 Pete Sampras in five sets. 
Agassi lost in the first round at the US Open to Thomas Enqvist and required wrist surgery late in the year.1994–1997: Rise to the top, Olympic Gold and the fallWith new coach Brad Gilbert on board, Agassi began to employ more of a tactical, consistent approach, which fueled his resurgence. He started slowly in 1994, losing in the first week at the French Open and Wimbledon. Nevertheless, he emerged during the hard-court season, winning the Canadian Open. His comeback culminated at the 1994 US Open with a five-set fourth-round victory against Michael Chang. He then became the first man to capture the US Open as an unseeded player, beating Michael Stich in the final. Along the way, he beat 5 seeded players.In 1995, Agassi shaved his balding head, breaking with his old "image is everything" style. He competed in the 1995 Australian Open (his first appearance at the event) and won, beating Sampras in a four-set final. Agassi and Sampras met in five tournament finals in 1995, all on hardcourt, with Agassi winning three. Agassi won three Masters Series events in 1995 (Cincinnati, Key Biscayne, and the Canadian Open) and seven titles total. He compiled a career-best 26-match winning streak during the summer hard-court circuit, with the last victory being in an intense late-night four-set semi-final of the US Open against Boris Becker. The streak ended the next day when Agassi lost the final to Sampras.Agassi reached the world No. 1 ranking for the first time in April 1995. He held that ranking until November, for a total of 30 weeks. Agassi skipped most of the fall indoor season which allowed Sampras to surpass him and finish ranked No. 1 at the year-end ranking. In terms of win/loss record, 1995 was Agassi's best year. He won 73 and lost 9 matches, and was also once again a key player on the United States' Davis Cup winning team—the third and final Davis Cup title of his career.1996 was a less successful year for Agassi, as he failed to reach any Grand Slam final. 
He suffered two early-round losses to Chris Woodruff and Doug Flach at the French Open and Wimbledon, respectively, and lost to Chang in straight sets in the Australian and US Open semi-finals. At the time, Agassi blamed the Australian Open loss on the windy conditions, but later said in his biography that he had lost the match on purpose, as he did not want to play Boris Becker, whom he would have faced in that final. The high point for Agassi was winning the men's singles gold medal at the Olympic Games in Atlanta, beating Sergi Bruguera of Spain in the final. Agassi also successfully defended his singles titles in Cincinnati and Key Biscayne.1997 was the low point of Agassi's career. His wrist injury resurfaced, and he played only 24 matches during the year. He later confessed that he started using crystal methamphetamine at that time, allegedly on the urging of a friend. He failed an ATP drug test, but wrote a letter claiming the same friend had spiked a drink. The ATP dropped the failed drug test as a warning. In his autobiography, Agassi admitted that the letter was a lie. He quit the drug soon after. At this time Agassi was also in a failing marriage with actress, model, and socialite Brooke Shields and had lost interest in the game. He won no top-level titles, and his ranking sank to No. 141 on November 10, 1997, prompting many to believe that his run as one of the sport's premier competitors was over and he would never again win any significant championships.1998–2003: Return to glory and Career Super SlamIn 1998, Agassi began a rigorous conditioning program and worked his way back up the rankings by playing in Challenger Series tournaments, a circuit for pro players ranked outside the world's top 50. After returning to top physical and mental shape, Agassi recorded the most successful period of his tennis career and also played classic matches in that period against Pete Sampras and Patrick Rafter.In 1998, Agassi won five titles and leapt from No. 
110 to No. 6, the highest jump into the top 10 made by any player during a calendar year. At Wimbledon, he had an early loss in the second round to Tommy Haas. He won five titles in ten finals and was runner-up at the Masters Series tournament in Key Biscayne, losing to Marcelo Ríos, who became No. 1 as a result. At the year end he was awarded the ATP Most Improved Player of the Year for the second time in his career (the first being 10 years earlier in 1988).Agassi entered the history books in 1999 when he came back from two sets to love down to beat Andrei Medvedev in a five-set French Open final, becoming, at the time, only the fifth male player (joining Rod Laver, Fred Perry, Roy Emerson and Don Budge—these have since been joined by Roger Federer, Rafael Nadal, and Novak Djokovic) to win all four Grand Slam singles titles during his career. Only Laver, Agassi, Federer, Nadal and Djokovic have achieved this feat during the Open Era. This win also made him the first (of only four, the next being Federer, Nadal and Djokovic respectively) male player in history to have won all four Grand Slam titles on three different surfaces (clay, grass and hard courts).  Agassi also became the only male player to win the Career Super Slam, consisting of all four Grand Slam tournaments plus an Olympic gold medal in singles and a Year-end championship.Agassi followed his 1999 French Open victory by reaching the Wimbledon final, where he lost to Sampras in straight sets. He rebounded from his Wimbledon defeat by winning the US Open, beating Todd Martin in five sets (rallying from a two sets to one deficit) in the final. Overall during the year Agassi won 5 titles including two majors and the ATP Masters Series in Paris, where he beat Marat Safin. Agassi ended 1999 as the No. 1, ending Sampras's record of six consecutive year-ending top rankings (1993–98). This was the only time Agassi ended the year at No. 1.  
Agassi was runner-up to Sampras at the year-end Tennis Masters Cup losing 1–6, 5–7, 4–6 despite beating Sampras in the round-robin 6–2, 6–2.He began the next year 2000 by capturing his second Australian Open title, beating Sampras in a five-set semi-final and Yevgeny Kafelnikov in a four-set final. He was the first male player to have reached four consecutive Grand Slam finals since Rod Laver achieved the Grand Slam in 1969. At the time, Agassi was also only the fourth player since Laver to be the reigning champion of three of four Grand Slam events, missing only the Wimbledon title. 2000 also saw Agassi reach the semi-finals at Wimbledon, where he lost in five sets to Rafter in a match considered by many to be one of the best ever at Wimbledon. At the inaugural Tennis Masters Cup in Lisbon, Agassi reached the final after defeating Marat Safin in the semi-finals to end the Russian's hopes to become the youngest No. 1 in the history of tennis. Agassi then lost to Gustavo Kuerten in the final, allowing Kuerten to be crowned year-end No. 1.Agassi opened 2001 by successfully defending his Australian Open title with a straight-sets final win over Arnaud Clément. En route, he beat a cramping Rafter in five sets in front of a sell-out crowd in what turned out to be the Aussie's last Australian Open. At Wimbledon, they met again in the semi-finals, where Agassi lost another close match to Rafter, 8–6 in the fifth set. In the quarterfinals at the US Open, Agassi lost a 3-hour, 33-minute epic match with Sampras, 7–6, 6–7, 6–7, 6–7, with no breaks of serve during the 52-game match. Despite the setback, Agassi finished 2001 ranked No. 3, becoming the only male tennis player to finish a year ranked in the top 3 in three different decades.2002 opened with disappointment for Agassi, as injury forced him to skip the Australian Open, where he was a two-time defending champion. 
Agassi recovered from the injury and later that year defended his Key Biscayne title beating then rising Roger Federer in a four-set final. The last duel between Agassi and Sampras came in the final of the US Open, which Sampras won in four sets and left Sampras with a 20–14 edge in their 34 career meetings. The match was the last of Sampras's career. Agassi's US Open finish, along with his Masters Series victories in Key Biscayne, Rome and Madrid, helped him finish 2002 as the oldest year-end No. 2 at 32 years and 8 months.In 2003, Agassi won the eighth (and final) Grand Slam title of his career at the Australian Open, where he beat Rainer Schüttler in straight sets in the final.On April 28, 2003, he recaptured the No. 1 ranking to become the oldest top-ranked male player since the ATP rankings began at 33 years and 13 days. The record was later surpassed by Roger Federer in 2018. He had held the No. 1 ranking for two weeks, when Lleyton Hewitt took it back on May 12, 2003. Agassi then recaptured the No. 1 ranking once again on June 16, 2003, which he held for 12 weeks until September 7, 2003. There he managed to reach the US Open semi-finals, where he lost to Juan Carlos Ferrero, surrendering his No. 1 ranking to him. During his career, Agassi held the ranking for a total of 101 weeks. Agassi's ranking slipped when injuries forced him to withdraw from a number of events. At the year-end Tennis Masters Cup, Agassi lost in the final to Federer, his third time to finish as runner-up in the event after losses in 1999 and 2000, and finished the year ranked No. 4. At age 33, he had been one of the oldest players to rank in the top 5 since Connors, at age 35, was No. 4 in 1987.2004–2006: Final yearsIn 2004, Agassi began the year with a five-set loss in the semi-finals of the Australian Open to Marat Safin; the loss ended Agassi's 26-match winning streak at the event. 
He won the Masters series event in Cincinnati to bring his career total to 59 top-level singles titles and a record 17 ATP Masters Series titles, having already won seven of the nine ATP Masters tournaments—all except the tournaments in Monte Carlo and Hamburg. At 34, he became the second-oldest singles champion in Cincinnati tournament history (the tournament began in 1899), tied with Roger Federer and surpassed only by Ken Rosewall, who won the title in 1970 at age 35. He finished the year ranked No. 8, one of the oldest players to finish in the top 10 since the 36-year-old Connors was No. 7 in 1988. At the time, Agassi also became the sixth male player during the open era to reach 800 career wins with his first-round victory over Alex Bogomolov in the Countrywide Classic in Los Angeles.Agassi's 2005 began with a quarterfinal loss to Federer at the Australian Open. Agassi had several other deep runs at tournaments, but had to withdraw from several events due to injury. He lost to Jarkko Nieminen in the first round of the French Open. He won his fourth title in Los Angeles and reached the final of the Rogers Cup, before falling to No. 2 Rafael Nadal.Agassi's 2005 was defined by an improbable run to the US Open final. After beating Răzvan Sabău and Ivo Karlović in straight sets and Tomáš Berdych in four sets, Agassi won three consecutive five-set matches to advance to the final. The most notable of these matches was his quarterfinal victory over James Blake, where he rallied from two sets down to win in the fifth-set tie-breaker. His other five-set victories were over Xavier Malisse in the fourth round and Robby Ginepri in the semi-finals. In the final, Agassi faced Federer, who was seeking his second consecutive US Open title and his sixth Grand Slam title in two years. Federer defeated Agassi in four sets. Agassi finished 2005 ranked No. 
7, his 16th time in the year-end top-10 rankings, which tied Connors for the most times ranked in the top 10 at year's end.Agassi had a poor start to 2006, as he was still recovering from an ankle injury and also suffering from back and leg pain and lack of match play. Agassi withdrew from the Australian Open because of the ankle injury, and his back injury and other pains forced him to withdraw from several other events, eventually skipping the entire clay-court season including the French Open. This caused his ranking to drop out of the top 10 for the last time. Agassi returned for the grass-court season, playing a tune-up, and then Wimbledon. He was defeated in the third round by world No. 2 (and eventual runner-up) Rafael Nadal. Against conventions, Agassi, the losing player, was interviewed on court after the match. At Wimbledon, Agassi announced his plans to retire following the US Open. Agassi played only two events during the summer hard-court season with his best result being a quarterfinal loss at the Countrywide Classic in Los Angeles to Fernando González of Chile, which resulted in him being unseeded at the US Open.Agassi had a short, but dramatic, run in his final US Open. Because of extreme back pain, Agassi was forced to receive anti-inflammatory injections after every match. After a tough four-set win against Andrei Pavel, Agassi faced eighth-seeded Marcos Baghdatis in the second round who had earlier advanced to the 2006 Australian Open final and Wimbledon semi-finals. Agassi won in five tough sets as the younger Baghdatis succumbed to muscle cramping in the final set. In his last match, Agassi fell to 112th-ranked big-serving Benjamin Becker of Germany in four sets. Agassi received a four-minute standing ovation from the crowd after the match and delivered a retirement speech.RivalriesAgassi vs. 
SamprasThe rivalry has been called the greatest of the generation of players competing in the 1990s, as Sampras and Agassi were the most successful players of that decade. They also had very contrasting playing styles, with Sampras being considered the greatest server and Agassi the greatest serve returner at the time. Agassi and Sampras met 34 times on the tour level with Agassi trailing 14–20.The 1990 US Open was their first meeting in a Grand Slam tournament final. Agassi was favored as he was ranked No. 4 at the time, compared to the No. 12 ranking of Sampras and because Agassi had defeated Sampras in their only previously completed match. Agassi, however, lost the final to Sampras in straight sets. Their next meeting in a Grand Slam was at the 1992 French Open, where they met in the quarterfinals. Although Sampras was ranked higher, Agassi came out winning in straight sets. They met again on a Grand Slam level at the quarterfinals of Wimbledon in 1993, where Agassi was the defending champion and Sampras was the newly minted world No. 1. Agassi dug himself out from a two-sets-to-love hole, levelling the match at two sets apiece; however, Sampras prevailed in five sets, and went on to win his first Wimbledon championship.With both Sampras and Agassi participating, the US won the Davis Cup in 1995. The year should be considered the peak of the rivalry as together they won three out of four major titles, meeting each other twice in the finals, and were occupying the top two spots in the rankings for the whole year. They met five times during the year, all in the title matches, including the Australian Open, the Newsweek Champions Cup (now Indian Wells), the Lipton International Players Championships (now Miami Open), the Canadian Open, and the US Open. Agassi won three of the finals, including the Australian Open; however, Sampras took the US Open title, ending Agassi's 26-match winning streak. 
After Agassi had taken most of the fall season off, Sampras took over the No. 1 ranking for the end of the season.In the following three years, while Sampras continued winning Grand Slam titles every season, Agassi slumped in the rankings and struggled in major competitions. The next time Sampras and Agassi met in a Grand Slam final was at Wimbledon in 1999, where Sampras won in straight sets. For both, it was considered a career rejuvenation, as Sampras had suffered a string of disappointments in the previous year while Agassi was regaining his status as a top-ranked player after winning the French Open. Sampras forfeited the No. 1 ranking to Agassi when injury forced him to withdraw from that year's US Open, which Agassi went on to win. They faced each other twice in the season-ending ATP Tour World Championships, with Sampras losing the round-robin match, but winning the final.In the 2000s, they met three more times on the Grand Slam level offering three memorable contests. In 2000, the top-ranked Agassi defeated No. 3 Sampras in the semi-finals of the Australian Open in five sets, which was an important win for Agassi who had lost 4 of the previous five matches against Sampras. In arguably their most memorable match ever, Sampras defeated Agassi in the 2001 US Open quarterfinals in four sets. There were no breaks of serve during the entire match. Reruns of the match are frequently featured on television, especially during US Open rain delays, and the match is considered one of the best in history because of the level of play presented by both players.Their last meeting was the final of the 2002 US Open, which was their third meeting in a US Open final, but the first since 1995. The match was also notable because they had defeated several up-and-coming players en route to the final. Sampras had defeated No. 3 Tommy Haas in the fourth round and future No. 1 Andy Roddick in the quarterfinals, while Agassi had defeated No. 
1 and defending champion Lleyton Hewitt in the semi-finals. Sampras defeated Agassi in four sets. This was the final ATP tour singles match of Sampras's career.Agassi vs. ChangMichael Chang was the opponent Agassi faced most frequently of all the players other than Sampras. They met 22 times on the tour level with Agassi leading 15–7. Chang, unlike most of Agassi's big rivals, had a playing style similar to his. Both players preferred to stay at the baseline with Chang being more defensive-minded. The outcome was that most of their meetings were built on long and entertaining rallies. The rivalry began late in the 1980s with both players being considered the prodigies of the next great generation of American tennis players and both being of foreign descent.Agassi won the first four matches including a straight-set victory in the round of 16 of the 1988 US Open and defeating Chang, the defending champion, in the 1990 French Open in a four-set quarterfinal. Arguably their best match took place in the round of 16 of the 1994 US Open. While both players presented high-quality shot-making, the momentum changed from set to set with Agassi eventually prevailing in a five-set victory. It turned out to be the toughest contest on his way to his first US Open title. Their next two Grand Slam meetings came in 1996, with Chang recording easy straight-set victories in the semi-finals of both the Australian Open and the US Open. Years after, Agassi shockingly admitted in his book that he had lost the first of the matches on purpose as he did not want to face Boris Becker, who was awaiting the winner in the final. Agassi won the last four of their matches, with the last being in 2003 at the Miami Open with Chang being clearly past his prime.Agassi vs. BeckerBoris Becker and Agassi played 14 times with Agassi leading 10–4. Becker won their first three matches in 1988 and 1989 before Agassi reversed the rivalry in 1990, and won 10 of their last 11 matches. 
They first played at Indian Wells in 1988, with Becker prevailing. Their most notable match was the 1989 Davis Cup semi-final match, which Becker won in five sets after losing the first two in tiebreaks. Agassi, considered a baseliner with a playing style not suiting grass, shocked Becker, a three-time champion, in a five-set quarterfinal at Wimbledon in 1992 on his way to his first Grand Slam title. The intensity of the rivalry peaked in 1995. Becker won that year's Wimbledon semi-final after being down a set and two breaks, to eventually win in four sets. In a highly anticipated rematch in the US Open semi-final, this time it was Agassi who came out victorious in four tight sets. Their final match was played at Hong Kong in 1999, which Agassi won in three sets.Agassi vs. RafterAgassi and Pat Rafter played fifteen times with Agassi leading 10–5. The rivalry has been considered special and delivered memorable encounters, because of the players' contrasting styles of play, with Rafter using traditional serve-&-volley methods against Agassi's variety of return of serves and passing shots as his main weapons. Agassi led 8–2 on hard courts, but Rafter surprisingly won their sole match on clay at the 1999 Rome Masters. They played four matches at Wimbledon with both winning two matches each. Agassi won the first two in 1993 and 1999, while Rafter took their 2000 and 2001 encounters, both of the gruelling 5-setters often being presented on the lists of best matches ever played. Agassi also won both their meetings at the Australian Open, in 1995 and 2001, on his way to the title on both occasions. Rafter, however, took their only US Open encounter in 1997 and went on to win the title.Agassi vs. FedererAgassi and Roger Federer played 11 times, and Federer led their head-to-head series 8–3. 
With the retirement of Sampras, the rivalry against the 11-years-younger Federer, who was another great server like Sampras, became Agassi's main rivalry for the final years of his career. Agassi won their first three matches, but then went on to lose eight consecutive ones. They first met in just the third tournament of Federer's career at the 1998 Swiss Indoors in Federer's hometown, with Agassi prevailing over the 17-year-old. Agassi also defeated Federer at the 2001 US Open and the finals of the Miami Open in 2002. Federer began to turn the tide at the Masters Cup in 2003, when he defeated Agassi in both the round-robin and the final. They played a memorable quarterfinal match at the 2004 US Open that spanned over two windy days, with Federer eventually prevailing in five sets. At the 2005 Dubai Championships, Federer and Agassi attracted worldwide headlines with a publicity stunt that saw the two tennis legends play on a helipad almost 220 meters above sea level at the hotel Burj al-Arab. Their final duel took place in the final of the 2005 US Open. In the historic clash of generations, Federer was victorious in four sets in front of a pro-Agassi crowd. The match was the last appearance by Agassi in any tournament final.Agassi vs. LendlAgassi and Ivan Lendl played eight times, and Lendl led their head-to-head series 6–2.Agassi vs. EdbergAgassi and Stefan Edberg played nine times, and Agassi led their head-to-head series 6–3.EarningsAgassi earned more than $30 million in prize-money during his career, sixth only to Djokovic, Federer, Nadal, Sampras and Murray to date (May 2018). He also earned more than $25 million a year through endorsements during his career, which was ranked fourth in all sports at the time.Post-retirementSince retiring after the 2006 US Open, Agassi has participated in a series of charity tournaments and continues his work with his own charity. 
On September 5, 2007, he was a surprise guest commentator for the Andy Roddick/Roger Federer US Open quarterfinal. He played an exhibition match at Wimbledon, teaming with his wife, Steffi Graf, to play with Tim Henman and Kim Clijsters. He played World Team Tennis for the Philadelphia Freedoms in the summer of 2009. At the 2009 French Open, Agassi was on hand to present Roger Federer, who completed his Career Grand Slam by winning the tournament and joined Agassi as one of six men to complete the Career Grand Slam, with the trophy.Also in 2009, Agassi played at the Outback Champions Series event for the first time. He played the Cancer Treatment Centers of America Tennis Championships at Surprise, Arizona, where he reached the final before bowing to eventual champion Todd Martin. Agassi returned to the tour, renamed the PowerShares Series, in 2011 and participated in a total of seven events while winning two. Agassi beat Courier in the final of the Staples Champions Cup in Boston and later defeated Sampras at the CTCA Championships in his hometown, Las Vegas.In 2012, Agassi took part in five tournaments, winning three of those. In November, he first won the BILT Champions Showdown in San Jose, beating John McEnroe in the final. The following day, he defended his title at the CTCA Championships, defeating Courier in the decisive match. In the series season finale, he beat Michael Chang for the Acura Champions Cup. The series and Agassi came back to action in 2014. Agassi won both tournaments he participated in. At the Camden Wealth Advisors Cup's final in Houston, Agassi beat James Blake in a rematch of their 2005 US Open quarterfinal. He defeated Blake again in Portland to win the title of the Cancer Treatment Centers of America Championships. In 2015, Agassi took part in just one event of the PowerShares Series, losing to Mark Philippoussis in the final of the Champions Shootout. 
The following year he took part in two events, at first losing to Blake in Chicago, and the next day defeating Mardy Fish, but losing to Roddick in Charleston.In 2009, in Macau, Agassi and Sampras met for the first time on court since the 2002 US Open final. Sampras won the exhibition in three sets. The rivalry between the former champions headlined sports media again in March 2010 after the two participated in the "Hit for Haiti" charity event organized to raise money for the victims of the earthquake. Partnered with Roger Federer and Rafael Nadal, the old rivals began making jokes at each other's expense, which ended up with Sampras intentionally striking a serve at Agassi's body. After the event, Agassi admitted that he had crossed the line with his jokes and publicly apologized to Sampras. Agassi and Sampras met again one year later for an exhibition match at Madison Square Garden in New York in front of 19,000 spectators as Sampras defeated Agassi in two sets. On March 3, 2014, Agassi and Sampras squared off for an exhibition in London for the annual World Tennis Day. This time, it was Agassi who came out on top in two straight sets.He returned to the tour in May 2017 in the position of coach to Novak Djokovic for the French Open. Agassi announced the end of the partnership on March 31, 2018, stating that there were too many disagreements in the relationship.Playing styleEarly in his career, Agassi would look to end points quickly by playing first-strike tennis, typically by inducing a weak return with a deep, hard shot, and then playing a winner at an extreme angle. On the rare occasion that he charged the net, Agassi liked to take the ball in the air and hit a swinging volley for a winner. His favored groundstroke was his flat, accurate two-handed backhand, hit well cross-court but especially down the line. 
His forehand was nearly as strong, especially his inside-out to the ad court.Agassi's strength was in dictating play from the baseline, and he was able to consistently take the ball on the rise. While he was growing up, his father and Nick Bollettieri trained him in this way. When in control of a point, Agassi would often pass up an opportunity to attempt a winner and hit a conservative shot to minimize his errors, and to make his opponent run more. This change to more methodical, less aggressive baseline play was largely initiated by his longtime coach, Brad Gilbert, in their first year together in 1994. Gilbert encouraged Agassi to wear out opponents with his deep, flat groundstrokes and to use his fitness to win attrition wars, and noted Agassi's two-handed backhand down the line as his very best shot. A signature play later in his career was a change-up drop shot to the deuce court after deep penetrating groundstrokes. This would often be followed by a passing shot or lob if the opponent was fast enough to retrieve it.Agassi was raised on hardcourts, but found much of his early major-tournament success on the red clay of Roland Garros, reaching two consecutive finals there early in his career. Despite grass being his worst surface, his first major win was on the slick grass of Wimbledon in 1992, a tournament that he professed to hate at the time. His strongest surface over the course of his career was indeed hardcourt, where he won six of his eight majors.Business venturesAgassi established a limited liability company named Andre Agassi Ventures (formerly named Agassi Enterprises). Agassi, along with five athlete partners (including Wayne Gretzky, Joe Montana, Shaquille O'Neal, Ken Griffey, Jr., and Monica Seles) opened a chain of sports-themed restaurants named Official All Star Café in April 1996. 
The restaurant closed down in 2001.In 1999, he paid $1 million for a 10 percent stake in Nevada First Bank and made a $10 million profit when it was sold to Western Alliance Bancorp in 2006.In 2002, he joined the Tennis Channel to promote the channel to consumers and cable and satellite industry, and made an equity investment in the network. After meeting chef Michael Mina at one of his restaurants in San Francisco, Agassi partnered with him in 2002 to start Mina Group Inc. and opened 18 concept restaurants in San Francisco, San Jose, Dana Point, Atlantic City and Las Vegas. Agassi was an equity investor of a group that acquired Golden Nugget Las Vegas and Golden Nugget Laughlin from MGM Mirage for $215 million in 2004. One year later, the group sold the hotel-casino to Landry's, Inc. for $163 million in cash and $182 million in assumed debt. In 2007, he sat on the board of Meadows Bank, an independent bank in Nevada. He has invested in start-up companies backed by Allen & Company.Agassi and Graf formed a company called Agassi Graf Holdings. They invested in PURE, a nightclub at Caesars Palace, which opened in 2004, and sold it to Angel Management Group in 2010. In August 2006, Agassi and Graf developed a joint venture with high-end furniture maker Kreiss Enterprises. They launched a furniture line called Agassi Graf Collection. In September, Agassi and Graf, through their company Agassi Graf Development LLC, along with Bayview Financial LP, finalized an agreement to develop a condominium hotel, Fairmont Tamarack, at Tamarack Resort in Donnelly, Idaho. Owing to difficult market conditions and delays, they withdrew from the project in 2009. The group still owns three small chunks of land. 
In September, they collaborated with Steve Case's Exclusive Resorts to co-develop luxury resorts and design Agassi-Graf Tennis and Fitness Centers.They also invested in online ticket reseller viagogo in 2009 and both serve as board members and advisors of the company.In October 2012, Village Roadshow and investors including Agassi and Graf announced plans to build a new water park called Wet'n'Wild Las Vegas in Las Vegas. Village Roadshow has a 51% stake in the park while Agassi, Graf, and other private investors hold the remaining 49%. The park opened in May 2013.IMG managed Agassi from the time he turned pro in 1986 through January 2000 before switching to SFX Sports Group. His business manager, lawyer and agent was childhood friend Perry Rogers, but they have been estranged since 2008. In 2009, he and Graf signed with CAA.Equipment and endorsementsAgassi used Prince Graphite rackets early in his career. He signed a $7 million endorsement contract with Belgian tennis racquet makers Donnay. He later switched to Head Ti Radical racket and Head's LiquidMetal Radical racket, having signed a multimillion-dollar endorsement deal with Head in 1993. He renewed his contract in 1999, and in November 2003 he signed a lifetime agreement with Head. He also endorses Penn tennis balls. On July 25, 2005, Agassi left Nike after 17 years and signed an endorsement deal with Adidas. A major reason for Agassi leaving Nike was because Nike refused to donate to Agassi's charities, and Adidas was more than happy to do so. On May 13, 2013, Agassi rejoined Nike.Agassi was sponsored by DuPont, Ebel, Mountain Dew in 1993, Mazda in 1997, Kia Motors in 2002, American Express and Deutsche Bank in 2003. In 1990, he appeared in a television commercial for Canon Inc., promoting the Canon EOS Rebel camera. Between 1999 and 2000, he signed a multimillion-dollar, multiyear endorsement deal with Schick and became the worldwide spokesman for the company. 
Agassi signed a multiyear contract with Twinlab and promoted the company's nutritional supplements. In mid-2003, he was named the spokesman of Aramis Life, a fragrance by Aramis, and signed a five-year deal with the company. In March 2004, he signed a ten-year agreement worth $1.5 million a year with 24 Hour Fitness, which will open five Andre Agassi fitness centers by year-end. Prior to the 2012 Australian Open, Agassi and Australian winemaker Jacobs Creek announced a three-year partnership and created the Open Film Series to "[share] personal stories about the life defining moments that shaped his character on and off the court." In 2007, watchmaker Longines named Agassi as their brand ambassador.Agassi and his mother appeared in a Got Milk? advertisement in 2002.Agassi has appeared in many advertisements and television commercials with Graf. They both endorsed Deutsche Telekom in 2002, Genworth Financial and Canon Inc. in 2004, LVMH in 2007, and Nintendo Wii and Wii Fit U and Longines in 2013.Personal lifeRelationships and familyIn the early 1990s, after dating Wendi Stewart, Agassi dated American singer and entertainer Barbra Streisand. He wrote about the relationship in his 2009 autobiography, "We agree that we're good for each other, and so what if she's twenty-eight years older? We're sympatico, and the public outcry only adds spice to our connection. It makes our friendship feel forbidden, taboo – another piece of my overall rebellion. Dating Barbra Streisand is like wearing Hot Lava."He was married to Brooke Shields from 1997 to 1999.He married Steffi Graf on October 22, 2001, at their Las Vegas home; the only witnesses were their mothers. They have two children: son Jaden Gil (born 2001) and daughter Jaz Elle (born 2003). Agassi has said that he and Graf are not pushing their children toward becoming tennis players. The Graf-Agassi family resides in Summerlin, a community in the Las Vegas Valley. 
Graf's mother and brother, Michael, with his four children, also live there.Long-time trainer Gil Reyes has been called one of Agassi's closest friends; some have described him as being a "father figure" to Agassi. In 2012, Agassi and Reyes introduced their own line of fitness equipment, BILT By Agassi and Reyes. In December 2008, Agassi's childhood friend and former business manager, Perry Rogers, sued Graf for $50,000 in management fees he claimed that she owed him.AutobiographyAgassi's autobiography, Open: An Autobiography, (written with assistance from J. R. Moehringer), was published in November 2009. In it, Agassi talks about his childhood and his unconventional Armenian father, who came to the United States from Iran where he was a professional boxer. Overly demanding and emotionally abusive to the whole family, his father groomed young Agassi for tennis greatness by building a tennis court in their backyard and sending Agassi to tennis boarding school under the supervision of Nick Bollettieri, who later coached and managed part of Agassi's professional career.There is also mention in the book of using and testing positive for methamphetamine in 1997. In response to this revelation, Roger Federer declared himself shocked and disappointed, while Marat Safin argued that Agassi should return his prize money and be stripped of his titles. In an interview with CBS, Agassi justified himself and asked for understanding, saying that "It was a period in my life where I needed help."Agassi said that he had always hated tennis during his career because of the constant pressure it exerted on him. He also said he wore a hairpiece earlier in his career and thought Pete Sampras was "robotic".The book reached No. 1 on the New York Times Best Seller list and received favorable reviews. It won the Autobiography category of the 2010 British Sports Book Awards. 
In 2018, the book was listed on Esquire as one of "The 30 Best Sports Books Ever Written", and was also recommended by self-help author Tim Ferriss who described it as "very candid, very amusing, and very instructional".In mediaIn 2017, Agassi appeared in the documentary film Love Means Zero, which highlighted the troubled relationship between his coach Nick Bollettieri and him.PoliticsAgassi has donated more than $100,000 to Democratic candidates, and $2,000 to Republicans. On September 1, 2010, when he appeared on daily WNYC public radio program The Brian Lehrer Show, he stated that he is registered as Independent.PhilanthropyAgassi founded the Andre Agassi Charitable Association in 1994, which assists Las Vegas' young people. He was awarded the ATP Arthur Ashe Humanitarian award in 1995 for his efforts to help disadvantaged youth. He has been cited as the most charitable and socially involved player in professional tennis. It has also been claimed that he may be the most charitable athlete of his generation.Agassi's charities help in assisting children reach their athletic potential. His Boys & Girls Club sees 2,000 children throughout the year and boasts a world-class junior tennis team. It also has a basketball program (the Agassi Stars) and a rigorous system that encourages a mix of academics and athletics.In 2001, Agassi opened the Andre Agassi College Preparatory Academy in Las Vegas, a tuition-free charter school for at-risk children in the area. He personally donated $35 million to the school. In 2009, the graduating class had a 100 percent graduation rate and expected a 100 percent college acceptance rate. Among other child-related programs that Agassi supports through his Andre Agassi Charitable Foundation is Clark County's only residential facility for abused and neglected children, Child Haven. In 1997, Agassi donated funding to Child Haven for a six-room classroom building now named the Agassi Center for Education. 
His foundation also provided $720,000 to assist in the building of the Andre Agassi Cottage for Medically Fragile Children. This 20-bed facility opened in December 2001, and accommodates developmentally delayed or handicapped children and children quarantined for infectious diseases.In 2007, along with several other athletes, Agassi founded the charity Athletes for Hope, which helps professional athletes get involved in charitable causes and aims to inspire all people to volunteer and support their communities. He created the Canyon-Agassi Charter School Facilities Fund, now known as the Turner-Agassi Charter School Facilities Fund. The Fund is an investment initiative for social change, focusing on the "nationwide effort to move charters from stopgap buildings into permanent campuses."In September 2013, the Andre Agassi Foundation for Education formed a partnership with V20 Foods to launch Box Budd!es, a line of kids' healthy snacks. All proceeds go to the Foundation.In February 2014, Agassi remodeled the vacant University of Phoenix building in Las Vegas as a new school, called the Doral Academy West through the Canyon-Agassi Charter School Facilities Fund. Doral Academy opened in August 2014. The Fund purchased a 4.6-acre plot in Henderson, Nevada to house the Somerset Academy of Las Vegas, which will relocate from its campus inside a church.Career statisticsSingles performance timelineGrand Slam finals (8 titles, 7 runners-up)By winning the 1999 French Open, Agassi completed a men's singles Career Grand Slam. He is the 5th of 8 male players in history (after Budge, Perry, Laver and Emerson, and before Federer, Nadal and Djokovic) to achieve this.Open Era records  These records were attained in the Open Era of tennis and in ATP World Tour Masters 1000 series since 1990. 
Records in bold indicate peer-less achievements.LegacyConsidered by numerous sources to be one of the greatest tennis players of all time, Agassi has also been called one of the greatest service returners ever to play the game, and was described by the BBC upon his retirement as "perhaps the biggest worldwide star in the sport's history". As a result, he is credited for helping to revive the popularity of tennis during the 1990s.Professional awards ITF World Champion: 1999. ATP Player of the Year: 1999. ATP Most Improved Player: 1988, 1998Recognition In 1992, Agassi was named the BBC Overseas Sports Personality of the Year. In 2010, Sports Illustrated named Agassi the 7th greatest male player of all time. On July 9, 2011, Agassi was inducted into the International Tennis Hall of Fame at a ceremony in Newport, Rhode Island.Video Wimbledon 2000 Semi-final – Agassi vs. Rafter (2003) Starring: Andre Agassi, Patrick Rafter; Standing Room Only, DVD Release Date: August 16, 2005, Run Time: 213 minutes, . Charlie Rose with Andre Agassi (May 7, 2001) Charlie Rose, Inc., DVD Release Date: August 15, 2006, Run Time: 57 minutes. Wimbledon: The Record Breakers (2005) Starring: Andre Agassi, Boris Becker; Standing Room Only, DVD Release Date: August 16, 2005, Run Time: 52 minutes, .Video games Andre Agassi Tennis for the SNES, Sega Genesis, Sega Game Gear, Master System, and Mobile phone Agassi Tennis Generation for PS2 and GBA Agassi Tennis Generation 2002 for Windows Smash Court Pro Tournament for PS2 Top Spin 4 (On cover of game) for Xbox 360, PlayStation 3 and WiiSee also Agassi–Sampras rivalry All-time tennis records – men's singles List of Grand Slam Men's Singles champions Tennis male players statistics Tennis records of the Open Era – men's singlesExplanatory notesReferencesFurther readingExternal links     Andre Agassi Ventures Farewell to Tennis Speech at the U.S. 
Open Agassi's Tennis Hall of Fame Induction for Steffi Graf  1970 birthsLiving people20th-century American businesspeople21st-century American businesspeopleAmerican autobiographersAmerican investorsAmerican male tennis playersAmerican people of Iranian descentAmerican people of Iranian-Assyrian descentAmerican sportspeople of Armenian descentAmerican real estate businesspeopleAmerican sportspeople in doping casesArmenian-American tennis playersAssyrian sportspeopleAustralian Open (tennis) championsDoping cases in tennisEthnic Armenian sportspeopleFrench Open championsGrand Slam (tennis) champions in men's singlesInternational Tennis Hall of Fame inducteesIranian Assyrian peopleIranian people of Armenian descentMedalists at the 1996 Summer OlympicsNevada DemocratsNovak Djokovic coachesOlympic gold medalists for the United States in tennisPhilanthropists from NevadaSportspeople from Las VegasSportspeople of Iranian descentSteffi GrafTennis people from NevadaTennis players at the 1996 Summer OlympicsUS Open (tennis) championsWimbledon championsWorld No. 1 tennis playersWriters from Las Vegas
\ No newline at end of file
diff --git a/data/wiki_demo.txt.REMOVED.git-id b/data/wiki_demo.txt.REMOVED.git-id
deleted file mode 100644
index 53f52397..00000000
--- a/data/wiki_demo.txt.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-c9cf509b7fdac5490cfd6dae72c2d7b8a60af6cb
\ No newline at end of file
diff --git a/examples/README.md b/examples/README.md
index 5aa03dfc..94066b5d 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -53,6 +53,12 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml
 ```
 
+#### KTO Training
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_kto.yaml
+```
+
 #### ORPO Training
 
 ```bash
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 5d205a21..77e9c416 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -53,6 +53,12 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml
 ```
 
+#### KTO 训练
+
+```bash
+CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_kto.yaml
+```
+
 #### ORPO 训练
 
 ```bash
diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_lora_sft.yaml
index 24322356..4a482749 100644
--- a/examples/extras/badam/llama3_lora_sft.yaml
+++ b/examples/extras/badam/llama3_lora_sft.yaml
@@ -11,7 +11,7 @@ badam_switch_interval: 50
 badam_verbose: 2
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
index 9d3b1124..e9c04fa9 100644
--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -12,7 +12,7 @@ lora_target: q_proj,v_proj
 ddp_timeout: 180000000
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/extras/galore/llama3_full_sft.yaml b/examples/extras/galore/llama3_full_sft.yaml
index 7f5ce354..87381fcc 100644
--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -12,7 +12,7 @@ galore_rank: 128
 galore_scale: 2.0
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
index fc9bc9d3..8ace8db8 100644
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -10,7 +10,7 @@ freeze_trainable_modules: all
 use_llama_pro: true
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml
index c0e582d9..26c2b1d2 100644
--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -9,7 +9,7 @@ lora_target: q_proj,v_proj
 loraplus_lr_ratio: 16.0
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml
index cfcd4f8a..6b724ed0 100644
--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -8,7 +8,7 @@ finetuning_type: full
 mixture_of_depths: convert
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/full_multi_gpu/llama3_full_predict.yaml b/examples/full_multi_gpu/llama3_full_predict.yaml
index f037a20c..ebe303c9 100644
--- a/examples/full_multi_gpu/llama3_full_predict.yaml
+++ b/examples/full_multi_gpu/llama3_full_predict.yaml
@@ -7,7 +7,7 @@ do_predict: true
 finetuning_type: full
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 50
diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/full_multi_gpu/llama3_full_sft.yaml
index a08af5fe..a96f1b8e 100644
--- a/examples/full_multi_gpu/llama3_full_sft.yaml
+++ b/examples/full_multi_gpu/llama3_full_sft.yaml
@@ -11,7 +11,7 @@ ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml
index ed39144f..6389f21b 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -11,7 +11,7 @@ lora_target: q_proj,v_proj
 ddp_timeout: 180000000
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
index 1ce045c0..6011896a 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
@@ -12,7 +12,7 @@ ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z3_config.json
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
index 286ab503..65ab6347 100644
--- a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
@@ -12,7 +12,7 @@ ddp_timeout: 180000000
 deepspeed: examples/deepspeed/ds_z0_config.json
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/lora_single_gpu/llama3_lora_dpo.yaml
index 615e919f..36d64923 100644
--- a/examples/lora_single_gpu/llama3_lora_dpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml
@@ -9,7 +9,7 @@ lora_target: q_proj,v_proj
 dpo_ftx: 1.0
 
 ### dataset
-dataset: orca_rlhf
+dataset: dpo_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
@@ -26,7 +26,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.00001
+learning_rate: 0.000005
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_steps: 0.1
diff --git a/examples/lora_single_gpu/llama3_lora_kto.yaml b/examples/lora_single_gpu/llama3_lora_kto.yaml
new file mode 100644
index 00000000..285289f9
--- /dev/null
+++ b/examples/lora_single_gpu/llama3_lora_kto.yaml
@@ -0,0 +1,39 @@
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+### method
+stage: kto
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+kto_ftx: 0.1
+
+### dataset
+dataset: kto_en_demo
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/kto
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.000005
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/examples/lora_single_gpu/llama3_lora_orpo.yaml b/examples/lora_single_gpu/llama3_lora_orpo.yaml
index 6fed8735..880ccb1c 100644
--- a/examples/lora_single_gpu/llama3_lora_orpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_orpo.yaml
@@ -8,7 +8,7 @@ finetuning_type: lora
 lora_target: q_proj,v_proj
 
 ### dataset
-dataset: orca_rlhf
+dataset: dpo_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
@@ -25,7 +25,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.00001
+learning_rate: 0.000005
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_steps: 0.1
diff --git a/examples/lora_single_gpu/llama3_lora_ppo.yaml b/examples/lora_single_gpu/llama3_lora_ppo.yaml
index 5cd2f18f..88ce24f3 100644
--- a/examples/lora_single_gpu/llama3_lora_ppo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_ppo.yaml
@@ -9,7 +9,7 @@ finetuning_type: lora
 lora_target: q_proj,v_proj
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/lora_single_gpu/llama3_lora_predict.yaml b/examples/lora_single_gpu/llama3_lora_predict.yaml
index ba55219a..a127d248 100644
--- a/examples/lora_single_gpu/llama3_lora_predict.yaml
+++ b/examples/lora_single_gpu/llama3_lora_predict.yaml
@@ -8,7 +8,7 @@ do_predict: true
 finetuning_type: lora
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 50
diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/lora_single_gpu/llama3_lora_reward.yaml
index 67baefd0..6bf2ca02 100644
--- a/examples/lora_single_gpu/llama3_lora_reward.yaml
+++ b/examples/lora_single_gpu/llama3_lora_reward.yaml
@@ -8,7 +8,7 @@ finetuning_type: lora
 lora_target: q_proj,v_proj
 
 ### dataset
-dataset: orca_rlhf
+dataset: dpo_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/lora_single_gpu/llama3_lora_sft.yaml
index e7836fd1..5492bc34 100644
--- a/examples/lora_single_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -8,7 +8,7 @@ finetuning_type: lora
 lora_target: q_proj,v_proj
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/lora_single_gpu/llama3_preprocess.yaml b/examples/lora_single_gpu/llama3_preprocess.yaml
index 59090544..86dad37b 100644
--- a/examples/lora_single_gpu/llama3_preprocess.yaml
+++ b/examples/lora_single_gpu/llama3_preprocess.yaml
@@ -8,7 +8,7 @@ finetuning_type: lora
 lora_target: q_proj,v_proj
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
index c8f2cff6..d2658051 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
@@ -8,7 +8,7 @@ finetuning_type: lora
 lora_target: q_proj,v_proj
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
index 05cb2a3f..ba6d8ea5 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
@@ -8,7 +8,7 @@ finetuning_type: lora
 lora_target: q_proj,v_proj
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
index d6da94d3..a3db35ff 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
@@ -9,7 +9,7 @@ finetuning_type: lora
 lora_target: q_proj,v_proj
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
index f2ba7490..cc9a454e 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
@@ -8,7 +8,7 @@ finetuning_type: lora
 lora_target: q_proj,v_proj
 
 ### dataset
-dataset: identity,alpaca_gpt4_en
+dataset: identity,alpaca_en_demo
 template: llama3
 cutoff_len: 1024
 max_samples: 1000
diff --git a/src/llamafactory/data/__init__.py b/src/llamafactory/data/__init__.py
index 0b3a8dcf..44887d24 100644
--- a/src/llamafactory/data/__init__.py
+++ b/src/llamafactory/data/__init__.py
@@ -1,12 +1,12 @@
-from .collator import PairwiseDataCollatorWithPadding,KTODataCollatorWithPadding
+from .collator import KTODataCollatorWithPadding, PairwiseDataCollatorWithPadding
 from .loader import get_dataset
 from .template import Template, get_template_and_fix_tokenizer, templates
 from .utils import Role, split_dataset
 
 
 __all__ = [
-    "PairwiseDataCollatorWithPadding",
     "KTODataCollatorWithPadding",
+    "PairwiseDataCollatorWithPadding",
     "get_dataset",
     "Template",
     "get_template_and_fix_tokenizer",
diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py
index 2cf8a4f3..2e2fb2c8 100644
--- a/src/llamafactory/data/aligner.py
+++ b/src/llamafactory/data/aligner.py
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Union
 
 from datasets import Features
 
+from ..extras.logging import get_logger
 from .utils import Role
 
 
@@ -14,7 +15,13 @@ if TYPE_CHECKING:
     from .parser import DatasetAttr
 
 
+logger = get_logger(__name__)
+
+
 def _convert_images(images: List[Any], dataset_attr: "DatasetAttr", data_args: "DataArguments") -> List[Any]:
+    r"""
+    Optionally concatenates image path to dataset dir when loading from local disk.
+    """
     outputs = []
     if dataset_attr.load_from in ["script", "file"]:
         for image in images:
@@ -29,7 +36,10 @@ def _convert_images(images: List[Any], dataset_attr: "DatasetAttr", data_args: "
 def convert_alpaca(
     examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
 ) -> Dict[str, List[Any]]:
-    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": [], "tag": []}
+    r"""
+    Converts alpaca format dataset to the standard format.
+    """
+    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
     convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args)
     for i in range(len(examples[dataset_attr.prompt])):
         prompt = []
@@ -45,23 +55,33 @@ def convert_alpaca(
         if dataset_attr.query and examples[dataset_attr.query][i]:
             content.append(examples[dataset_attr.query][i])
 
-        prompt.append({"role": Role.USER.value, "content": "\n".join(content)})
+        prompt.append({"role": Role.USER.value, "content": "\n".join(content)})  # "prompt\nquery"
 
-        if dataset_attr.response and isinstance(examples[dataset_attr.response][i], list):
-            response = [
-                {"role": Role.ASSISTANT.value, "content": content} for content in examples[dataset_attr.response][i]
-            ]
-        elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str):
+        if dataset_attr.kto_tag and isinstance(examples[dataset_attr.kto_tag], bool):  # kto example
             response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}]
-        else:
+            if examples[dataset_attr.kto_tag]:
+                response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
+            else:
+                response = [{"role": Role.ASSISTANT.value, "content": ""}] + response
+        elif (
+            dataset_attr.ranking
+            and isinstance(examples[dataset_attr.chosen][i], str)
+            and isinstance(examples[dataset_attr.rejected][i], str)
+        ):  # pairwise example
+            response = [
+                {"role": Role.ASSISTANT.value, "content": examples[dataset_attr.chosen][i]},
+                {"role": Role.ASSISTANT.value, "content": examples[dataset_attr.rejected][i]},
+            ]
+        elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str):  # normal example
+            response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}]
+        else:  # unsupervised
             response = []
 
         outputs["prompt"].append(prompt)
         outputs["response"].append(response)
         outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
-        outputs["tools"].append("")
+        outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
         outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else [])
-        outputs["tag"].append(examples[dataset_attr.tag][i] if dataset_attr.tag else True)
 
     return outputs
 
@@ -69,6 +89,9 @@ def convert_alpaca(
 def convert_sharegpt(
     examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
 ) -> Dict[str, List[Any]]:
+    r"""
+    Converts sharegpt format dataset to the standard format.
+    """
     outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
     convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args)
     tag_mapping = {
@@ -88,21 +111,62 @@ def convert_sharegpt(
         else:
             system = examples[dataset_attr.system][i] if dataset_attr.system else ""
 
-        messages = messages[: len(messages) // 2 * 2]  # should be multiples of 2
         if len(messages) == 0:
             continue
 
         aligned_messages = []
+        broken_data = False
         for turn_idx, message in enumerate(messages):
             if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]:
-                raise ValueError("Invalid role tag in {}.".format(messages))
+                logger.warning("Invalid role tag in {}.".format(messages))
+                broken_data = True
 
             aligned_messages.append(
                 {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
             )
 
-        outputs["prompt"].append(aligned_messages[:-1])
-        outputs["response"].append(aligned_messages[-1:])
+        if (not dataset_attr.ranking and len(aligned_messages) % 2 != 0) or (
+            dataset_attr.ranking and len(aligned_messages) % 2 == 0
+        ):
+            logger.warning("Invalid message count in {}.".format(messages))
+            broken_data = True
+
+        if dataset_attr.kto_tag and isinstance(examples[dataset_attr.kto_tag][i], bool):  # kto example
+            prompt = aligned_messages[:-1]
+            response = aligned_messages[-1:]
+            if examples[dataset_attr.kto_tag][i]:
+                response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
+            else:
+                response = [{"role": Role.ASSISTANT.value, "content": ""}] + response
+        elif (
+            dataset_attr.ranking
+            and isinstance(examples[dataset_attr.chosen][i], dict)
+            and isinstance(examples[dataset_attr.rejected][i], dict)
+        ):  # pairwise example
+            chosen = examples[dataset_attr.chosen][i]
+            rejected = examples[dataset_attr.rejected][i]
+            if (
+                chosen[dataset_attr.role_tag] not in accept_tags[-1]
+                or rejected[dataset_attr.role_tag] not in accept_tags[-1]
+            ):
+                logger.warning("Invalid role tag in {}.".format(messages))
+                broken_data = True
+
+            prompt = aligned_messages
+            response = [
+                {"role": tag_mapping[chosen[dataset_attr.role_tag]], "content": chosen[dataset_attr.content_tag]},
+                {"role": tag_mapping[rejected[dataset_attr.role_tag]], "content": rejected[dataset_attr.content_tag]},
+            ]
+        else:  # normal example
+            prompt = aligned_messages[:-1]
+            response = aligned_messages[-1:]
+
+        if broken_data:
+            logger.warning("Skipping this abnormal example.")
+            continue
+
+        outputs["prompt"].append(prompt)
+        outputs["response"].append(response)
         outputs["system"].append(system)
         outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
         outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else [])
@@ -138,7 +202,6 @@ def align_dataset(
             "system": {"dtype": "string", "_type": "Value"},
             "tools": {"dtype": "string", "_type": "Value"},
             "images": [{"_type": "Image"}],
-            "tag": {"dtype": "bool", "_type": "Value"},
         }
     )
     kwargs = {}
diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py
index 517fa68c..474d6a30 100644
--- a/src/llamafactory/data/collator.py
+++ b/src/llamafactory/data/collator.py
@@ -50,35 +50,38 @@ class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
         batch["labels"] = self._pad_labels(batch["input_ids"], label_positions)
         return batch
 
+
 @dataclass
 class KTODataCollatorWithPadding(DataCollatorForSeq2Seq):
     r"""
     Data collator for KTO data.
     """
-    def __call__(self, features, return_tensors=None):
-        concatenated_features = []
-        kl_concatenated_features = []
-        tags = []
+
+    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
+        target_features = []
+        kl_features = []
+        kto_tags = []
         for feature in features:
-            concatenated_features.append(
+            target_features.append(
                 {
                     "input_ids": feature["input_ids"],
                     "attention_mask": feature["attention_mask"],
                     "labels": feature["labels"],
                 }
             )
-            kl_concatenated_features.append(
+            kl_features.append(
                 {
                     "input_ids": feature["kl_input_ids"],
                     "attention_mask": feature["kl_attention_mask"],
                     "labels": feature["kl_labels"],
                 }
             )
-            tags.append(feature["tag"])
-        batch = super().__call__(concatenated_features)
-        kl_batch = super().__call__(kl_concatenated_features)
-        batch["KL_completion_input_ids"] = kl_batch["input_ids"]
-        batch["KL_completion_attention_mask"] = kl_batch["attention_mask"]
+            kto_tags.append(feature["kto_tags"])
+
+        batch = super().__call__(target_features)
+        kl_batch = super().__call__(kl_features)
+        batch["kl_input_ids"] = kl_batch["input_ids"]
+        batch["kl_attention_mask"] = kl_batch["attention_mask"]
         batch["kl_labels"] = kl_batch["labels"]
-        batch["tag"] = torch.tensor(tags)
-        return batch
\ No newline at end of file
+        batch["kto_tags"] = torch.tensor(kto_tags)
+        return batch
diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py
index a04bf377..bed694a2 100644
--- a/src/llamafactory/data/loader.py
+++ b/src/llamafactory/data/loader.py
@@ -57,7 +57,7 @@ def load_single_dataset(
             data_files.append(local_path)
             data_path = FILEEXT2TYPE.get(local_path.split(".")[-1], None)
         else:
-            raise ValueError("File not found.")
+            raise ValueError("File {} not found.".format(local_path))
 
         if data_path is None:
             raise ValueError("File extension must be txt, csv, json or jsonl.")
@@ -116,7 +116,7 @@ def get_dataset(
     model_args: "ModelArguments",
     data_args: "DataArguments",
     training_args: "Seq2SeqTrainingArguments",
-    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+    stage: Literal["pt", "sft", "rm", "kto"],
     tokenizer: "PreTrainedTokenizer",
     processor: Optional["ProcessorMixin"] = None,
 ) -> Union["Dataset", "IterableDataset"]:
diff --git a/src/llamafactory/data/parser.py b/src/llamafactory/data/parser.py
index 33136551..679f8ad6 100644
--- a/src/llamafactory/data/parser.py
+++ b/src/llamafactory/data/parser.py
@@ -25,21 +25,22 @@ class DatasetAttr:
     folder: Optional[str] = None
     ranking: bool = False
     formatting: Literal["alpaca", "sharegpt"] = "alpaca"
-    """ columns """
+    """ common columns """
     system: Optional[str] = None
+    tools: Optional[str] = None
     images: Optional[str] = None
-    tag: Optional[bool] = None
-    """ columns for the alpaca format """
+    """ rlhf columns """
+    chosen: Optional[str] = None
+    rejected: Optional[str] = None
+    kto_tag: Optional[str] = None
+    """ alpaca columns """
     prompt: Optional[str] = "instruction"
     query: Optional[str] = "input"
     response: Optional[str] = "output"
-    chosen: Optional[str] = "chosen"
-    rejected: Optional[str] = "rejected"
     history: Optional[str] = None
-    """ columns for the sharegpt format """
+    """ sharegpt columns """
     messages: Optional[str] = "conversations"
-    tools: Optional[str] = None
-    """ tags for the sharegpt format """
+    """ sharegpt tags """
     role_tag: Optional[str] = "from"
     content_tag: Optional[str] = "value"
     user_tag: Optional[str] = "human"
@@ -107,11 +108,11 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
         dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca")
 
         if "columns" in dataset_info[name]:
-            column_names = ["system", "images", "tag"]
+            column_names = ["system", "tools", "images", "chosen", "rejected", "kto_tag"]
             if dataset_attr.formatting == "alpaca":
                 column_names.extend(["prompt", "query", "response", "history"])
             else:
-                column_names.extend(["messages", "tools"])
+                column_names.extend(["messages"])
 
             for column_name in column_names:
                 dataset_attr.set_attr(column_name, dataset_info[name]["columns"])
diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py
index 4a348ce2..a6fb0ddc 100644
--- a/src/llamafactory/data/preprocess.py
+++ b/src/llamafactory/data/preprocess.py
@@ -70,7 +70,7 @@ def preprocess_supervised_dataset(
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
-    model_inputs = {"input_ids": [], "attention_mask": [], "labels": [], "tag": []}
+    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
     if processor is not None:
         model_inputs["pixel_values"] = []
         preprocess_visual_inputs = partial(_preprocess_visual_inputs, processor=processor)
@@ -111,102 +111,11 @@ def preprocess_supervised_dataset(
         model_inputs["input_ids"].append(input_ids)
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
-        model_inputs["tag"].append(examples["tag"])
         if processor is not None:
             model_inputs["pixel_values"].append(preprocess_visual_inputs(examples["images"][i]))
 
     return model_inputs
 
-def preprocess_kto_dataset(
-    examples: Dict[str, List[Any]],
-    template: "Template",
-    tokenizer: "PreTrainedTokenizer",
-    processor: Optional["ProcessorMixin"],
-    data_args: "DataArguments",
-) -> Dict[str, List[List[int]]]:
-    # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
-    # for multiturn examples, we only mask the prompt part in each prompt-response pair.
-    model_inputs = {"input_ids": [], "attention_mask": [], "labels": [],"kl_input_ids": [], "kl_attention_mask": [], "kl_labels": [], "tag": []}
-    """Creates mismatched pairs of prompts and completions for the KL dataset by reversing the order of completions."""
-    examples['kl_response'] = examples['response'][::-1]
-    if processor is not None:
-        model_inputs["pixel_values"] = []
-        preprocess_visual_inputs = partial(_preprocess_visual_inputs, processor=processor)
-
-    for i in range(len(examples["prompt"])):
-        if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
-            continue
-
-        if processor is not None:
-            examples["prompt"][i][0]["content"] = "<image>" + examples["prompt"][i][0]["content"]
-
-        messages = examples["prompt"][i] + examples["response"][i]
-        kl_messages = examples["prompt"][i] + examples["kl_response"][i]
-        input_ids, labels = [], []
-        kl_input_ids, kl_labels = [], []
-        for turn_idx, (source_ids, target_ids) in enumerate(
-            template.encode_multiturn(
-                tokenizer,
-                messages,
-                examples["system"][i],
-                examples["tools"][i],
-                data_args.cutoff_len,
-                data_args.reserved_label_len,
-            )
-        ):
-            if data_args.train_on_prompt:
-                source_mask = source_ids
-            elif turn_idx != 0 and template.efficient_eos:
-                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
-            else:
-                source_mask = [IGNORE_INDEX] * len(source_ids)
-
-            input_ids += source_ids + target_ids
-            labels += source_mask + target_ids
-
-        if template.efficient_eos:
-            input_ids += [tokenizer.eos_token_id]
-            labels += [tokenizer.eos_token_id]
-
-        for turn_idx, (source_ids, target_ids) in enumerate(
-            template.encode_multiturn(
-                tokenizer,
-                kl_messages,
-                examples["system"][i],
-                examples["tools"][i],
-                data_args.cutoff_len,
-                data_args.reserved_label_len,
-            )
-        ):
-            if data_args.train_on_prompt:
-                source_mask = source_ids
-            elif turn_idx != 0 and template.efficient_eos:
-                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
-            else:
-                source_mask = [IGNORE_INDEX] * len(source_ids)
-
-            kl_input_ids += source_ids + target_ids
-            kl_labels += source_mask + target_ids
-
-        if template.efficient_eos:
-            kl_input_ids += [tokenizer.eos_token_id]
-            kl_labels += [tokenizer.eos_token_id]
-
-        model_inputs["input_ids"].append(input_ids)
-        model_inputs["attention_mask"].append([1] * len(input_ids))
-        model_inputs["labels"].append(labels)
-        model_inputs["kl_input_ids"].append(kl_input_ids)
-        model_inputs["kl_attention_mask"].append([1] * len(kl_input_ids))
-        model_inputs["kl_labels"].append(kl_labels)
-        model_inputs["tag"].append(examples["tag"][i])
-        if processor is not None:
-            model_inputs["pixel_values"].append(preprocess_visual_inputs(examples["images"][i]))
-    desirable = sum([1 for tag in model_inputs["tag"] if tag is True])
-    undesirable = sum([1 for tag in model_inputs["tag"] if tag is False])
-    logger.info("desirable data in KTO dataset: {},undesirable data in KTO dataset: {}".format(desirable, undesirable))
-    if desirable == 0 or undesirable == 0:
-        logger.warning("Your dataset only has one preference type.")
-    return model_inputs
 
 def preprocess_packed_supervised_dataset(
     examples: Dict[str, List[Any]],
@@ -352,6 +261,90 @@ def preprocess_pairwise_dataset(
     return model_inputs
 
 
+def preprocess_kto_dataset(
+    examples: Dict[str, List[Any]],
+    template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"],
+    data_args: "DataArguments",
+) -> Dict[str, List[List[int]]]:
+    # create unrelated input-output pairs for estimating the KL term by flipping the matched pairs
+    kl_response = examples["response"][::-1]
+    model_inputs = {
+        "input_ids": [],
+        "attention_mask": [],
+        "labels": [],
+        "kl_input_ids": [],
+        "kl_attention_mask": [],
+        "kl_labels": [],
+        "kto_tags": [],
+    }
+    if processor is not None:
+        model_inputs["pixel_values"] = []
+        preprocess_visual_inputs = partial(_preprocess_visual_inputs, processor=processor)
+
+    for i in range(len(examples["prompt"])):
+        if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2:
+            continue
+
+        if processor is not None:
+            examples["prompt"][i][0]["content"] = "<image>" + examples["prompt"][i][0]["content"]
+
+        if examples["response"][i][0]["content"]:  # desired example
+            kto_tag = True
+            messages = examples["prompt"][i] + [examples["response"][i][0]]
+        else:  # undesired example
+            kto_tag = False
+            messages = examples["prompt"][i] + [examples["response"][i][1]]
+
+        if kl_response[i][0]["content"]:
+            kl_messages = examples["prompt"][i] + [kl_response[i][0]]
+        else:
+            kl_messages = examples["prompt"][i] + [kl_response[i][1]]
+
+        prompt_ids, response_ids = template.encode_oneturn(
+            tokenizer,
+            messages,
+            examples["system"][i],
+            examples["tools"][i],
+            data_args.cutoff_len,
+            data_args.reserved_label_len,
+        )
+        _, kl_response_ids = template.encode_oneturn(
+            tokenizer,
+            kl_messages,
+            examples["system"][i],
+            examples["tools"][i],
+            data_args.cutoff_len,
+            data_args.reserved_label_len,
+        )
+
+        if template.efficient_eos:
+            response_ids += [tokenizer.eos_token_id]
+            kl_response_ids += [tokenizer.eos_token_id]
+
+        input_ids = prompt_ids + response_ids
+        labels = [IGNORE_INDEX] * len(prompt_ids) + response_ids
+        kl_input_ids = prompt_ids + kl_response_ids
+        kl_labels = [IGNORE_INDEX] * len(prompt_ids) + kl_response_ids
+        model_inputs["input_ids"].append(input_ids)
+        model_inputs["attention_mask"].append([1] * len(input_ids))
+        model_inputs["labels"].append(labels)
+        model_inputs["kl_input_ids"].append(kl_input_ids)
+        model_inputs["kl_attention_mask"].append([1] * len(kl_input_ids))
+        model_inputs["kl_labels"].append(kl_labels)
+        model_inputs["kto_tags"].append(kto_tag)
+        if processor is not None:
+            model_inputs["pixel_values"].append(preprocess_visual_inputs(examples["images"][i]))
+
+    desirable_num = sum([1 for tag in model_inputs["kto_tags"] if tag])
+    undesirable_num = len(model_inputs["kto_tags"]) - desirable_num
+    if desirable_num == 0 or undesirable_num == 0:
+        logger.warning("Your dataset only has one preference type.")
+
+    return model_inputs
+
+
 def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
     print("input_ids:\n{}".format(example["input_ids"]))
     print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
@@ -380,7 +373,7 @@ def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer:
 def get_preprocess_and_print_func(
     data_args: "DataArguments",
     training_args: "Seq2SeqTrainingArguments",
-    stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+    stage: Literal["pt", "sft", "rm", "kto"],
     template: "Template",
     tokenizer: "PreTrainedTokenizer",
     processor: Optional["ProcessorMixin"],
diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py
index e6840518..b84e238a 100644
--- a/src/llamafactory/hparams/finetuning_args.py
+++ b/src/llamafactory/hparams/finetuning_args.py
@@ -137,21 +137,21 @@ class RLHFArguments:
         default=0.1,
         metadata={"help": "The beta parameter for the KTO loss."},
     )
+    kto_chosen_weight: float = field(
+        default=1.0,
+        metadata={"help": "The weight factor of the desirable losses in KTO training."},
+    )
+    kto_rejected_weight: float = field(
+        default=1.0,
+        metadata={"help": "The weight factor of the undesirable losses in KTO training."},
+    )
     kto_ftx: float = field(
         default=0.0,
         metadata={"help": "The supervised fine-tuning loss coefficient in KTO training."},
     )
-    kto_desirable_weight: float = field(
-        default=1.0,
-        metadata={"help": "The desirable weight for the KTO loss."},
-    )
-    kto_undesirable_weight: float = field(
-        default=1.0,
-        metadata={"help": "The undesirable weight for the KTO loss."},
-    )
     orpo_beta: float = field(
         default=0.1,
-        metadata={"help": "The beta (lambda) parameter in ORPO loss representing the weight of the SFT loss."},
+        metadata={"help": "The beta (lambda) parameter in the ORPO loss representing the weight of the SFT loss."},
     )
     ppo_buffer_size: int = field(
         default=1,
@@ -307,7 +307,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
         default=False,
         metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."},
     )
-    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo", "kto"] = field(
+    stage: Literal["pt", "sft", "rm", "ppo", "dpo", "kto", "orpo"] = field(
         default="sft",
         metadata={"help": "Which stage will be performed in training."},
     )
diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py
index 3c0b0276..519e95f1 100644
--- a/src/llamafactory/train/dpo/trainer.py
+++ b/src/llamafactory/train/dpo/trainer.py
@@ -47,11 +47,13 @@ class CustomDPOTrainer(DPOTrainer):
         self._peft_has_been_casted_to_bf16 = False
 
         self.ref_model = ref_model
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+
+        # dpo hyperparams
         self.beta = finetuning_args.dpo_beta
         self.label_smoothing = finetuning_args.dpo_label_smoothing
         self.loss_type = finetuning_args.dpo_loss
         self.ftx_gamma = finetuning_args.dpo_ftx
-        self._stored_metrics = defaultdict(lambda: defaultdict(list))
 
         Trainer.__init__(self, model=model, **kwargs)
         if not hasattr(self, "accelerator"):
@@ -143,6 +145,7 @@ class CustomDPOTrainer(DPOTrainer):
             policy_chosen_logits,
             policy_rejected_logits,
         ) = self.concatenated_forward(model, batch)
+
         with torch.no_grad():
             if self.ref_model is None:
                 ref_model = self.model
diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py
index 6f9f6754..5578c50c 100644
--- a/src/llamafactory/train/kto/trainer.py
+++ b/src/llamafactory/train/kto/trainer.py
@@ -1,7 +1,7 @@
 from collections import defaultdict
 from contextlib import nullcontext
 from types import MethodType
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
 
 import torch
 from transformers import Trainer
@@ -13,7 +13,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel
+    from transformers import PreTrainedModel, ProcessorMixin
 
     from ...hparams import FinetuningArguments
 
@@ -24,6 +24,7 @@ class CustomKTOTrainer(KTOTrainer):
         model: Union["PreTrainedModel", torch.nn.Module],
         ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]],
         finetuning_args: "FinetuningArguments",
+        processor: Optional["ProcessorMixin"],
         disable_dropout: bool = True,
         **kwargs,
     ):
@@ -33,6 +34,7 @@ class CustomKTOTrainer(KTOTrainer):
                 disable_dropout_in_model(ref_model)
 
         self.finetuning_args = finetuning_args
+        self.processor = processor
         self.reference_free = False
         self.use_dpo_data_collator = True  # hack to avoid warning
         self.generate_during_eval = False  # disable at evaluation
@@ -43,15 +45,15 @@ class CustomKTOTrainer(KTOTrainer):
         self._precomputed_train_ref_log_probs = False
         self._precomputed_eval_ref_log_probs = False
         self._peft_has_been_casted_to_bf16 = False
+
         self.ref_model = ref_model
         self._stored_metrics = defaultdict(lambda: defaultdict(list))
 
-        # KTO parameter
+        # kto hyperparams
         self.beta = finetuning_args.kto_beta
+        self.desirable_weight = finetuning_args.kto_chosen_weight
+        self.undesirable_weight = finetuning_args.kto_rejected_weight
         self.ftx_gamma = finetuning_args.kto_ftx
-        self.desirable_weight = finetuning_args.kto_desirable_weight
-        self.undesirable_weight = finetuning_args.kto_undesirable_weight
-
 
         Trainer.__init__(self, model=model, **kwargs)
         if not hasattr(self, "accelerator"):
@@ -82,78 +84,85 @@ class CustomKTOTrainer(KTOTrainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
 
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
+
     def sft_loss(self, chosen_logits: "torch.FloatTensor", chosen_labels: "torch.LongTensor") -> "torch.Tensor":
         r"""
         Computes supervised cross-entropy loss of given labels under the given logits.
+
         Returns:
             A tensor of shape (batch_size,) containing the cross-entropy loss of each samples.
         """
         all_logps = self.get_batch_logps(chosen_logits, chosen_labels, average_log_prob=True)
-        return -all_logps.nanmean()
-
+        return -all_logps
 
     def forward(
         self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"]
     ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]:
         with torch.no_grad():
-            KL_logits = model(
-                batch["KL_completion_input_ids"],
-                attention_mask=batch["KL_completion_attention_mask"],
-            ).logits
+            kl_logits = model(
+                input_ids=batch["kl_input_ids"],
+                attention_mask=batch["kl_attention_mask"],
+                return_dict=True,
+                use_cache=False,
+            ).logits.to(torch.float32)
 
-        completion_logits = model(
-            batch["input_ids"],
+        target_logits = model(
+            input_ids=batch["input_ids"],
             attention_mask=batch["attention_mask"],
-        ).logits
+            return_dict=True,
+            use_cache=False,
+        ).logits.to(torch.float32)
 
-        completion_logps = self.get_batch_logps(
-            completion_logits,
-            batch["labels"],
+        target_logps = self.get_batch_logps(
+            logits=target_logits,
+            labels=batch["labels"],
             average_log_prob=False,
             is_encoder_decoder=self.is_encoder_decoder,
             label_pad_token_id=self.label_pad_token_id,
         )
 
-        KL_logps = self.get_batch_logps(
-            KL_logits,
-            batch["kl_labels"],
+        kl_logps = self.get_batch_logps(
+            logits=kl_logits,
+            labels=batch["kl_labels"],
             average_log_prob=False,
             is_encoder_decoder=self.is_encoder_decoder,
             label_pad_token_id=self.label_pad_token_id,
         )
 
-        if completion_logps.shape[0] != len(batch["tag"]):
-            raise ValueError(
-                "There is a mismatch between the number of examples in this batch and the number of "
-                "examples for which an output sequence was predicted."
-            )
-        chosen_idx = [i for i in range(completion_logps.shape[0]) if batch["tag"][i]]
-        rejected_idx = [i for i in range(completion_logps.shape[0]) if not batch["tag"][i]]
+        if len(target_logps) != len(batch["kto_tags"]):
+            raise ValueError("Mismatched shape of inputs and labels.")
 
-        chosen_logps = completion_logps[chosen_idx, ...]
-        rejected_logps = completion_logps[rejected_idx, ...]
+        chosen_idx = [i for i in range(len(target_logps)) if batch["kto_tags"][i]]
+        rejected_idx = [i for i in range(len(target_logps)) if not batch["kto_tags"][i]]
 
-        chosen_logits = completion_logits[chosen_idx, ...]
-        rejected_logits = completion_logits[rejected_idx, ...]
+        chosen_logps = target_logps[chosen_idx, ...]
+        rejected_logps = target_logps[rejected_idx, ...]
 
-        return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, KL_logps)
+        chosen_logits = target_logits[chosen_idx, ...]
+        rejected_logits = target_logits[rejected_idx, ...]
 
+        return chosen_logps, rejected_logps, chosen_logits, rejected_logits, kl_logps
 
     def get_batch_loss_metrics(
         self,
-        model,
-        batch: Dict[str, Union[List, torch.LongTensor]],
-    ):
-        """Compute the KTO loss and other metrics for the given batch of inputs for train or test."""
+        model: "PreTrainedModel",
+        batch: Dict[str, "torch.Tensor"],
+    ) -> Tuple["torch.Tensor", Dict[str, "torch.Tensor"]]:
+        r"""
+        Computes the KTO loss and other metrics for the given batch of inputs for train or test.
+        """
         metrics = {}
-        batch = {k: (v.to(self.accelerator.device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
-
         (
             policy_chosen_logps,
             policy_rejected_logps,
             policy_chosen_logits,
-            policy_rejected_logits,
-            policy_KL_logps,
+            _,
+            policy_kl_logps,
         ) = self.forward(model, batch)
 
         with torch.no_grad():
@@ -163,27 +172,29 @@ class CustomKTOTrainer(KTOTrainer):
             else:
                 ref_model = self.ref_model
                 ref_context = nullcontext()
+
             with ref_context:
                 (
                     reference_chosen_logps,
                     reference_rejected_logps,
                     _,
                     _,
-                    reference_KL_logps,
+                    reference_kl_logps,
                 ) = self.forward(ref_model, batch)
 
         losses, chosen_rewards, rejected_rewards, kl = self.kto_loss(
             policy_chosen_logps,
             policy_rejected_logps,
-            policy_KL_logps,
+            policy_kl_logps,
             reference_chosen_logps,
             reference_rejected_logps,
-            reference_KL_logps,
+            reference_kl_logps,
         )
         losses = losses.nanmean()
-        if self.ftx_gamma > 1e-6 and len(batch["labels"][batch['tag']])>0:
-            losses += self.ftx_gamma * self.sft_loss(policy_chosen_logits, batch["labels"][batch['tag']])
 
+        if self.ftx_gamma > 1e-6 and len(policy_chosen_logps) > 0:  # remember to rescale
+            sft_loss = self.sft_loss(policy_chosen_logits, batch["labels"][batch["kto_tags"]])
+            losses += self.ftx_gamma * sft_loss.nanmean() / len(policy_chosen_logits) * len(batch["labels"])
 
         num_chosen = torch.Tensor([len(chosen_rewards)]).to(self.accelerator.device)
         num_rejected = torch.Tensor([len(rejected_rewards)]).to(self.accelerator.device)
@@ -203,4 +214,4 @@ class CustomKTOTrainer(KTOTrainer):
 
         metrics["kl"] = kl.item()
 
-        return losses, metrics
\ No newline at end of file
+        return losses, metrics
diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py
index a2d0ec24..615fdb62 100644
--- a/src/llamafactory/train/kto/workflow.py
+++ b/src/llamafactory/train/kto/workflow.py
@@ -48,9 +48,9 @@ def run_kto(
         ref_model=ref_model,
         args=training_args,
         finetuning_args=finetuning_args,
-        tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
+        **tokenizer_module,
         **split_dataset(dataset, data_args, training_args),
     )
 
diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py
index 4383bcdc..c4e05e57 100644
--- a/src/llamafactory/train/ppo/workflow.py
+++ b/src/llamafactory/train/ppo/workflow.py
@@ -29,7 +29,7 @@ def run_ppo(
 ):
     tokenizer_module = load_tokenizer(model_args)
     tokenizer = tokenizer_module["tokenizer"]
-    dataset = get_dataset(model_args, data_args, training_args, stage="ppo", **tokenizer_module)
+    dataset = get_dataset(model_args, data_args, training_args, stage="pt", **tokenizer_module)
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True)
 
     tokenizer.padding_side = "left"  # use left-padding in generation while using right-padding in training
diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py
index 89dcb9ac..fadbb14a 100644
--- a/src/llamafactory/train/tuner.py
+++ b/src/llamafactory/train/tuner.py
@@ -9,12 +9,13 @@ from ..extras.logging import get_logger
 from ..hparams import get_infer_args, get_train_args
 from ..model import load_model, load_tokenizer
 from .dpo import run_dpo
+from .kto import run_kto
 from .orpo import run_orpo
 from .ppo import run_ppo
 from .pt import run_pt
 from .rm import run_rm
 from .sft import run_sft
-from .kto import run_kto
+
 
 if TYPE_CHECKING:
     from transformers import TrainerCallback
@@ -37,10 +38,10 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallb
         run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
     elif finetuning_args.stage == "dpo":
         run_dpo(model_args, data_args, training_args, finetuning_args, callbacks)
-    elif finetuning_args.stage == "orpo":
-        run_orpo(model_args, data_args, training_args, finetuning_args, callbacks)
     elif finetuning_args.stage == "kto":
         run_kto(model_args, data_args, training_args, finetuning_args, callbacks)
+    elif finetuning_args.stage == "orpo":
+        run_orpo(model_args, data_args, training_args, finetuning_args, callbacks)
     else:
         raise ValueError("Unknown task.")
 

From 9c1c59e481b155e4d438db10a7c558c6292fc01c Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 18 May 2024 16:13:14 +0800
Subject: [PATCH 324/341] fix #3803

Former-commit-id: 1ef12c95059d14a1717c82ce04e529e7ad6435ed
---
 src/llamafactory/data/aligner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py
index 2e2fb2c8..6a74a843 100644
--- a/src/llamafactory/data/aligner.py
+++ b/src/llamafactory/data/aligner.py
@@ -57,9 +57,9 @@ def convert_alpaca(
 
         prompt.append({"role": Role.USER.value, "content": "\n".join(content)})  # "prompt\nquery"
 
-        if dataset_attr.kto_tag and isinstance(examples[dataset_attr.kto_tag], bool):  # kto example
+        if dataset_attr.kto_tag and isinstance(examples[dataset_attr.kto_tag][i], bool):  # kto example
             response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}]
-            if examples[dataset_attr.kto_tag]:
+            if examples[dataset_attr.kto_tag][i]:
                 response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
             else:
                 response = [{"role": Role.ASSISTANT.value, "content": ""}] + response

From 6b9003f781312939a744ef094406cf5b39ad2a43 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 18 May 2024 21:15:20 +0800
Subject: [PATCH 325/341] update data readme

Former-commit-id: beb864a9367943d3274cb6057423d1eb9aaf85c4
---
 data/README.md    | 191 +++++++++++++++++++++++++++++++++++++---------
 data/README_zh.md | 187 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 315 insertions(+), 63 deletions(-)

diff --git a/data/README.md b/data/README.md
index b1368d4a..a467fe67 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,16 +1,17 @@
-If you are using a custom dataset, please add your **dataset description** to `dataset_info.json` according to the following format. We also provide several examples in the next section.
+The `dataset_info.json` contains all available datasets. If you are using a custom dataset, please make sure to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
+
+Currently we support datasets in **alpaca** and **sharegpt** format.
 
 ```json
 "dataset_name": {
   "hf_hub_url": "the name of the dataset repository on the Hugging Face hub. (if specified, ignore script_url and file_name)",
-  "ms_hub_url": "the name of the dataset repository on the ModelScope hub. (if specified, ignore script_url and file_name)",
+  "ms_hub_url": "the name of the dataset repository on the Model Scope hub. (if specified, ignore script_url and file_name)",
   "script_url": "the name of the directory containing a dataset loading script. (if specified, ignore file_name)",
-  "file_name": "the name of the dataset file in this directory. (required if above are not specified)",
-  "file_sha1": "the SHA-1 hash value of the dataset file. (optional, does not affect training)",
+  "file_name": "the name of the dataset folder or dataset file in this directory. (required if above are not specified)",
+  "formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
+  "ranking": "whether the dataset is a preference dataset or not. (default: False)",
   "subset": "the name of the subset. (optional, default: None)",
   "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
-  "ranking": "whether the dataset is a preference dataset or not. (default: false)",
-  "formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
   "columns (optional)": {
     "prompt": "the column name in the dataset containing the prompts. (default: instruction)",
     "query": "the column name in the dataset containing the queries. (default: input)",
@@ -36,11 +37,15 @@ If you are using a custom dataset, please add your **dataset description** to `d
 }
 ```
 
-After that, you can load the custom dataset by specifying `--dataset dataset_name`.
+## Alpaca Format
 
-----
+### Supervised Fine-Tuning Dataset
 
-Currently we support dataset in **alpaca** or **sharegpt** format, the dataset in alpaca format should follow the below format:
+In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the human prompt, then the human prompt would be `instruction\ninput`. The `output` column represents the model response.
+
+The `system` column will be used as the system prompt if specified.
+
+The `history` column is a list consisting string tuples representing prompt-response pairs in the history messages. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning.
 
 ```json
 [
@@ -57,7 +62,7 @@ Currently we support dataset in **alpaca** or **sharegpt** format, the dataset i
 ]
 ```
 
-Regarding the above dataset, the description in `dataset_info.json` should be:
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
 
 ```json
 "dataset_name": {
@@ -72,11 +77,9 @@ Regarding the above dataset, the description in `dataset_info.json` should be:
 }
 ```
 
-The `query` column will be concatenated with the `prompt` column and used as the human prompt, then the human prompt would be `prompt\nquery`. The `response` column represents the model response.
+### Pre-training Dataset
 
-The `system` column will be used as the system prompt. The `history` column is a list consisting string tuples representing prompt-response pairs in the history. Note that the responses in the history **will also be used for training** in supervised fine-tuning.
-
-For the **pre-training datasets**, only the `prompt` column will be used for training, for example:
+In pre-training, only the `prompt` column will be used for model learning.
 
 ```json
 [
@@ -85,7 +88,7 @@ For the **pre-training datasets**, only the `prompt` column will be used for tra
 ]
 ```
 
-Regarding the above dataset, the description in `dataset_info.json` should be:
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
 
 ```json
 "dataset_name": {
@@ -96,20 +99,24 @@ Regarding the above dataset, the description in `dataset_info.json` should be:
 }
 ```
 
-For the **preference datasets**, the `response` column should be a string list whose length is 2, with the preferred answers appearing first, for example:
+### Preference Dataset
+
+Preference datasets are used for reward modeling, DPO training and ORPO training.
+
+It requires a better response in `chosen` column and a worse response in `rejected` column.
 
 ```json
 [
   {
-    "instruction": "human instruction",
-    "input": "human input",
-    "chosen": "chosen answer",
-    "rejected": "rejected answer"
+    "instruction": "human instruction (required)",
+    "input": "human input (optional)",
+    "chosen": "chosen answer (required)",
+    "rejected": "rejected answer (required)"
   }
 ]
 ```
 
-Regarding the above dataset, the description in `dataset_info.json` should be:
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
 
 ```json
 "dataset_name": {
@@ -124,14 +131,86 @@ Regarding the above dataset, the description in `dataset_info.json` should be:
 }
 ```
 
-----
+### KTO Dataset
 
-The dataset in **sharegpt** format should follow the below format:
+KTO datasets require a extra `kto_tag` column containing the boolean human feedback.
+
+```json
+[
+  {
+    "instruction": "human instruction (required)",
+    "input": "human input (optional)",
+    "output": "model response (required)",
+    "kto_tag": "human feedback [true/false] (required)"
+  }
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "columns": {
+    "prompt": "instruction",
+    "query": "input",
+    "response": "output",
+    "kto_tag": "kto_tag"
+  }
+}
+```
+
+### Multimodal Dataset
+
+Multimodal datasets require a `images` column containing the paths to the input image. Currently we only support one image.
+
+```json
+[
+  {
+    "instruction": "human instruction (required)",
+    "input": "human input (optional)",
+    "output": "model response (required)",
+    "images": [
+      "image path (required)"
+    ]
+  }
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "columns": {
+    "prompt": "instruction",
+    "query": "input",
+    "response": "output",
+    "images": "images"
+  }
+}
+```
+
+## Sharegpt Format
+
+### Supervised Fine-Tuning Dataset
+
+Compared to the alpaca format, the sharegpt format allows the datasets have more **roles**, such as human, gpt, observation and function. They are presented in a list of objects in the `conversations` column.
+
+Note that the human and observation should appear in odd positions, while gpt and function should appear in even positions.
 
 ```json
 [
   {
     "conversations": [
+      {
+        "from": "human",
+        "value": "human instruction"
+      },
+      {
+        "from": "gpt",
+        "value": "model response"
+      },
       {
         "from": "human",
         "value": "human instruction"
@@ -147,7 +226,7 @@ The dataset in **sharegpt** format should follow the below format:
 ]
 ```
 
-Regarding the above dataset, the description in `dataset_info.json` should be:
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
 
 ```json
 "dataset_name": {
@@ -157,19 +236,61 @@ Regarding the above dataset, the description in `dataset_info.json` should be:
     "messages": "conversations",
     "system": "system",
     "tools": "tools"
-  },
-  "tags": {
-    "role_tag": "from",
-    "content_tag": "value",
-    "user_tag": "human",
-    "assistant_tag": "gpt"
   }
 }
 ```
 
-where the `messages` column should be a list following the `u/a/u/a/u/a` order.
+### Preference Dataset
 
-We also supports the dataset in the **openai** format:
+Preference datasets in sharegpt format also require a better message in `chosen` column and a worse message in `rejected` column.
+
+```json
+[
+  {
+    "conversations": [
+      {
+        "from": "human",
+        "value": "human instruction"
+      },
+      {
+        "from": "gpt",
+        "value": "model response"
+      },
+      {
+        "from": "human",
+        "value": "human instruction"
+      }
+    ],
+    "chosen": {
+      "from": "gpt",
+      "value": "chosen answer (required)"
+    },
+    "rejected": {
+      "from": "gpt",
+      "value": "rejected answer (required)"
+    }
+  }
+]
+```
+
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
+
+```json
+"dataset_name": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
+  "ranking": true,
+  "columns": {
+    "messages": "conversations",
+    "chosen": "chosen",
+    "rejected": "rejected"
+  }
+}
+```
+
+### OpenAI Format
+
+The openai format is simply a special case of the sharegpt format, where the first message may be a system prompt.
 
 ```json
 [
@@ -192,7 +313,7 @@ We also supports the dataset in the **openai** format:
 ]
 ```
 
-Regarding the above dataset, the description in `dataset_info.json` should be:
+Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
 
 ```json
 "dataset_name": {
@@ -211,4 +332,6 @@ Regarding the above dataset, the description in `dataset_info.json` should be:
 }
 ```
 
-Pre-training datasets and preference datasets are **incompatible** with the sharegpt format yet.
+The KTO datasets and multimodal datasets in sharegpt format are similar to the alpaca format.
+
+Pre-training datasets are **incompatible** with the sharegpt format.
diff --git a/data/README_zh.md b/data/README_zh.md
index deed94c5..61d60312 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -1,4 +1,6 @@
-如果您使用自定义数据集，请务必按照以下格式在 `dataset_info.json` 文件中添加**数据集描述**。我们在下面也提供了一些例子。
+`dataset_info.json` 包含了所有可用的数据集。如果您希望使用自定义数据集，请务必在 `dataset_info.json` 文件中添加*数据集描述*，并通过修改 `dataset: 数据集名称` 配置来使用数据集。
+
+目前我们支持 **alpaca** 格式和 **sharegpt** 格式的数据集。
 
 ```json
 "数据集名称": {
@@ -6,11 +8,10 @@
   "ms_hub_url": "ModelScope 的数据集仓库地址（若指定，则忽略 script_url 和 file_name）",
   "script_url": "包含数据加载脚本的本地文件夹名称（若指定，则忽略 file_name）",
   "file_name": "该目录下数据集文件的名称（若上述参数未指定，则此项必需）",
-  "file_sha1": "数据集文件的 SHA-1 哈希值（可选，留空不影响训练）",
+  "formatting": "数据集格式（可选，默认：alpaca，可以为 alpaca 或 sharegpt）",
+  "ranking": "是否为偏好数据集（可选，默认：False）",
   "subset": "数据集子集的名称（可选，默认：None）",
   "folder": "Hugging Face 仓库的文件夹名称（可选，默认：None）",
-  "ranking": "是否为偏好数据集（可选，默认：False）",
-  "formatting": "数据集格式（可选，默认：alpaca，可以为 alpaca 或 sharegpt）",
   "columns（可选）": {
     "prompt": "数据集代表提示词的表头名称（默认：instruction）",
     "query": "数据集代表请求的表头名称（默认：input）",
@@ -20,8 +21,8 @@
     "system": "数据集代表系统提示的表头名称（默认：None）",
     "tools": "数据集代表工具描述的表头名称（默认：None）",
     "images": "数据集代表图像输入的表头名称（默认：None）",
-    "chosen": "数据集代表更优回复的表头名称（默认：None）",
-    "rejected": "数据集代表更差回复的表头名称（默认：None）",
+    "chosen": "数据集代表更优回答的表头名称（默认：None）",
+    "rejected": "数据集代表更差回答的表头名称（默认：None）",
     "kto_tag": "数据集代表 KTO 标签的表头名称（默认：None）"
   },
   "tags（可选，用于 sharegpt 格式）": {
@@ -31,16 +32,20 @@
     "assistant_tag": "消息中代表助手的 role_tag（默认：gpt）",
     "observation_tag": "消息中代表工具返回结果的 role_tag（默认：observation）",
     "function_tag": "消息中代表工具调用的 role_tag（默认：function_call）",
-    "system_tag": "消息中代表系统提示的 role_tag（默认：system，会覆盖 system 列）"
+    "system_tag": "消息中代表系统提示的 role_tag（默认：system，会覆盖 system column）"
   }
 }
 ```
 
-然后，可通过使用 `--dataset 数据集名称` 参数加载自定义数据集。
+## Alpaca 格式
 
-----
+### 指令监督微调数据集
 
-该项目目前支持两种格式的数据集：**alpaca** 和 **sharegpt**，其中 alpaca 格式的数据集按照以下方式组织：
+在指令监督微调时，`instruction` 列对应的内容会与 `input` 列对应的内容拼接后作为人类指令，即人类指令为 `instruction\ninput`。而 `output` 列对应的内容为模型回答。
+
+如果指定，`system` 列对应的内容将被作为系统提示词。
+
+`history` 列是由多个字符串二元组构成的列表，分别代表历史消息中每轮对话的指令和回答。注意在指令监督微调时，历史消息中的回答内容**也会被用于模型学习**。
 
 ```json
 [
@@ -57,7 +62,7 @@
 ]
 ```
 
-对于上述格式的数据，`dataset_info.json` 中的描述应为：
+对于上述格式的数据，`dataset_info.json` 中的*数据集描述*应为：
 
 ```json
 "数据集名称": {
@@ -72,11 +77,9 @@
 }
 ```
 
-其中 `query` 列对应的内容会与 `prompt` 列对应的内容拼接后作为人类指令，即人类指令为 `prompt\nquery`。`response` 列对应的内容为模型回答。
+### 预训练数据集
 
-`system` 列对应的内容将被作为系统提示词。`history` 列是由多个字符串二元组构成的列表，分别代表历史消息中每轮的指令和回答。注意在指令监督学习时，历史消息中的回答**也会被用于训练**。
-
-对于**预训练数据集**，仅 `prompt` 列中的内容会用于模型训练，例如：
+对于**预训练数据集**，仅 `prompt` 列中的内容会用于模型学习，例如：
 
 ```json
 [
@@ -85,7 +88,7 @@
 ]
 ```
 
-对于上述格式的数据，`dataset_info.json` 中的描述应为：
+对于上述格式的数据，`dataset_info.json` 中的*数据集描述*应为：
 
 ```json
 "数据集名称": {
@@ -96,20 +99,24 @@
 }
 ```
 
-对于**偏好数据集**，`response` 列应当是一个长度为 2 的字符串列表，排在前面的代表更优的回答，例如：
+### 偏好数据集
+
+偏好数据集用于奖励模型训练、DPO 训练和 ORPO 训练。
+
+它需要在 `chosen` 列中提供更优的回答，并在 `rejected` 列中提供更差的回答。
 
 ```json
 [
   {
-    "instruction": "人类指令",
-    "input": "人类输入",
-    "chosen": "优质回答",
-    "rejected": "劣质回答"
+    "instruction": "人类指令（必填）",
+    "input": "人类输入（选填）",
+    "chosen": "优质回答（必填）",
+    "rejected": "劣质回答（必填）"
   }
 ]
 ```
 
-对于上述格式的数据，`dataset_info.json` 中的描述应为：
+对于上述格式的数据，`dataset_info.json` 中的*数据集描述*应为：
 
 ```json
 "数据集名称": {
@@ -124,14 +131,86 @@
 }
 ```
 
-----
+### KTO 数据集
 
-而 **sharegpt** 格式的数据集按照以下方式组织：
+KTO 数据集需要额外添加一个 `kto_tag` 列，包含 bool 类型的人类反馈。
+
+```json
+[
+  {
+    "instruction": "人类指令（必填）",
+    "input": "人类输入（选填）",
+    "output": "模型回答（必填）",
+    "kto_tag": "人类反馈 [true/false]（必填）"
+  }
+]
+```
+
+对于上述格式的数据，`dataset_info.json` 中的*数据集描述*应为：
+
+```json
+"数据集名称": {
+  "file_name": "data.json",
+  "columns": {
+    "prompt": "instruction",
+    "query": "input",
+    "response": "output",
+    "kto_tag": "kto_tag"
+  }
+}
+```
+
+### 多模态数据集
+
+多模态数据集需要额外添加一个 `images` 列，包含输入图像的路径。目前我们仅支持单张图像输入。
+
+```json
+[
+  {
+    "instruction": "人类指令（必填）",
+    "input": "人类输入（选填）",
+    "output": "模型回答（必填）",
+    "images": [
+      "图像路径（必填）"
+    ]
+  }
+]
+```
+
+对于上述格式的数据，`dataset_info.json` 中的*数据集描述*应为：
+
+```json
+"数据集名称": {
+  "file_name": "data.json",
+  "columns": {
+    "prompt": "instruction",
+    "query": "input",
+    "response": "output",
+    "images": "images"
+  }
+}
+```
+
+## Sharegpt 格式
+
+### 指令监督微调数据集
+
+相比 alpaca 格式的数据集，sharegpt 格式支持更多的**角色种类**，例如 human、gpt、observation、function 等等。它们构成一个对象列表呈现在 `conversations` 列中。
+
+其中 human 和 observation 必须出现在奇数位置，gpt 和 function 必须出现在偶数位置。
 
 ```json
 [
   {
     "conversations": [
+      {
+        "from": "human",
+        "value": "人类指令"
+      },
+      {
+        "from": "gpt",
+        "value": "模型回答"
+      },
       {
         "from": "human",
         "value": "人类指令"
@@ -147,7 +226,7 @@
 ]
 ```
 
-对于上述格式的数据，`dataset_info.json` 中的描述应为：
+对于上述格式的数据，`dataset_info.json` 中的*数据集描述*应为：
 
 ```json
 "数据集名称": {
@@ -167,9 +246,57 @@
 }
 ```
 
-其中 `messages` 列应当是一个列表，且符合 `人类/模型/人类/模型/人类/模型` 的顺序。
+### 偏好数据集
 
-我们同样支持 **openai** 格式的数据集：
+Sharegpt 格式的偏好数据集同样需要在 `chosen` 列中提供更优的消息，并在 `rejected` 列中提供更差的消息。
+
+```json
+[
+  {
+    "conversations": [
+      {
+        "from": "human",
+        "value": "人类指令"
+      },
+      {
+        "from": "gpt",
+        "value": "模型回答"
+      },
+      {
+        "from": "human",
+        "value": "人类指令"
+      }
+    ],
+    "chosen": {
+      "from": "gpt",
+      "value": "优质回答"
+    },
+    "rejected": {
+      "from": "gpt",
+      "value": "劣质回答"
+    }
+  }
+]
+```
+
+对于上述格式的数据，`dataset_info.json` 中的*数据集描述*应为：
+
+```json
+"数据集名称": {
+  "file_name": "data.json",
+  "formatting": "sharegpt",
+  "ranking": true,
+  "columns": {
+    "messages": "conversations",
+    "chosen": "chosen",
+    "rejected": "rejected"
+  }
+}
+```
+
+### OpenAI 格式
+
+OpenAI 格式仅仅是 sharegpt 格式的一种特殊情况，其中第一条消息可能是系统提示词。
 
 ```json
 [
@@ -192,7 +319,7 @@
 ]
 ```
 
-对于上述格式的数据，`dataset_info.json` 中的描述应为：
+对于上述格式的数据，`dataset_info.json` 中的*数据集描述*应为：
 
 ```json
 "数据集名称": {
@@ -211,4 +338,6 @@
 }
 ```
 
-预训练数据集和偏好数据集**尚不支持** sharegpt 格式。
+Sharegpt 格式中的 KTO 数据集和多模态数据集与 alpaca 格式的类似。
+
+预训练数据集**不支持** sharegpt 格式。

From 57dde7c3bcf264d382d491b8d114607a7688419b Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 18 May 2024 21:37:38 +0800
Subject: [PATCH 326/341] update data readme

Former-commit-id: 22c7335b496e4a673383d5a1e4e60bf2cb4e35b3
---
 data/README.md    | 30 +++++++++++++++++++++---------
 data/README_zh.md | 34 ++++++++++++++++++++--------------
 2 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/data/README.md b/data/README.md
index a467fe67..3e96fbeb 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,4 +1,4 @@
-The `dataset_info.json` contains all available datasets. If you are using a custom dataset, please make sure to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
+The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
 
 Currently we support datasets in **alpaca** and **sharegpt** format.
 
@@ -41,11 +41,13 @@ Currently we support datasets in **alpaca** and **sharegpt** format.
 
 ### Supervised Fine-Tuning Dataset
 
+* [Example dataset](alpaca_en_demo.json)
+
 In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the human prompt, then the human prompt would be `instruction\ninput`. The `output` column represents the model response.
 
 The `system` column will be used as the system prompt if specified.
 
-The `history` column is a list consisting string tuples representing prompt-response pairs in the history messages. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning.
+The `history` column is a list consisting of string tuples representing prompt-response pairs in the history messages. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning.
 
 ```json
 [
@@ -79,7 +81,9 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
 
 ### Pre-training Dataset
 
-In pre-training, only the `prompt` column will be used for model learning.
+- [Example dataset](c4_demo.json)
+
+In pre-training, only the `text` column will be used for model learning.
 
 ```json
 [
@@ -133,6 +137,8 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
 
 ### KTO Dataset
 
+- [Example dataset](kto_en_demo.json)
+
 KTO datasets require a extra `kto_tag` column containing the boolean human feedback.
 
 ```json
@@ -162,7 +168,9 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
 
 ### Multimodal Dataset
 
-Multimodal datasets require a `images` column containing the paths to the input image. Currently we only support one image.
+- [Example dataset](mllm_demo.json)
+
+Multimodal datasets require an `images` column containing the paths to the input images. Currently we only support one image.
 
 ```json
 [
@@ -195,7 +203,9 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
 
 ### Supervised Fine-Tuning Dataset
 
-Compared to the alpaca format, the sharegpt format allows the datasets have more **roles**, such as human, gpt, observation and function. They are presented in a list of objects in the `conversations` column.
+- [Example dataset](glaive_toolcall_en_demo.json)
+
+Compared to the alpaca format, the sharegpt format allows the datasets to have **more roles**, such as human, gpt, observation and function. They are presented in a list of objects in the `conversations` column.
 
 Note that the human and observation should appear in odd positions, while gpt and function should appear in even positions.
 
@@ -208,12 +218,12 @@ Note that the human and observation should appear in odd positions, while gpt an
         "value": "human instruction"
       },
       {
-        "from": "gpt",
-        "value": "model response"
+        "from": "function_call",
+        "value": "tool arguments"
       },
       {
-        "from": "human",
-        "value": "human instruction"
+        "from": "observation",
+        "value": "tool result"
       },
       {
         "from": "gpt",
@@ -242,6 +252,8 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
 
 ### Preference Dataset
 
+- [Example dataset](dpo_en_demo.json)
+
 Preference datasets in sharegpt format also require a better message in `chosen` column and a worse message in `rejected` column.
 
 ```json
diff --git a/data/README_zh.md b/data/README_zh.md
index 61d60312..d8a2419e 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -1,4 +1,4 @@
-`dataset_info.json` 包含了所有可用的数据集。如果您希望使用自定义数据集，请务必在 `dataset_info.json` 文件中添加*数据集描述*，并通过修改 `dataset: 数据集名称` 配置来使用数据集。
+[dataset_info.json](dataset_info.json) 包含了所有可用的数据集。如果您希望使用自定义数据集，请**务必**在 `dataset_info.json` 文件中添加*数据集描述*，并通过修改 `dataset: 数据集名称` 配置来使用数据集。
 
 目前我们支持 **alpaca** 格式和 **sharegpt** 格式的数据集。
 
@@ -41,6 +41,8 @@
 
 ### 指令监督微调数据集
 
+- [样例数据集](alpaca_zh_demo.json)
+
 在指令监督微调时，`instruction` 列对应的内容会与 `input` 列对应的内容拼接后作为人类指令，即人类指令为 `instruction\ninput`。而 `output` 列对应的内容为模型回答。
 
 如果指定，`system` 列对应的内容将被作为系统提示词。
@@ -79,7 +81,9 @@
 
 ### 预训练数据集
 
-对于**预训练数据集**，仅 `prompt` 列中的内容会用于模型学习，例如：
+- [样例数据集](c4_demo.json)
+
+在预训练时，只有 `text` 列中的内容会用于模型学习。
 
 ```json
 [
@@ -133,6 +137,8 @@
 
 ### KTO 数据集
 
+- [样例数据集](kto_en_demo.json)
+
 KTO 数据集需要额外添加一个 `kto_tag` 列，包含 bool 类型的人类反馈。
 
 ```json
@@ -162,6 +168,8 @@ KTO 数据集需要额外添加一个 `kto_tag` 列，包含 bool 类型的人
 
 ### 多模态数据集
 
+- [样例数据集](mllm_demo.json)
+
 多模态数据集需要额外添加一个 `images` 列，包含输入图像的路径。目前我们仅支持单张图像输入。
 
 ```json
@@ -195,9 +203,11 @@ KTO 数据集需要额外添加一个 `kto_tag` 列，包含 bool 类型的人
 
 ### 指令监督微调数据集
 
-相比 alpaca 格式的数据集，sharegpt 格式支持更多的**角色种类**，例如 human、gpt、observation、function 等等。它们构成一个对象列表呈现在 `conversations` 列中。
+- [样例数据集](glaive_toolcall_zh_demo.json)
 
-其中 human 和 observation 必须出现在奇数位置，gpt 和 function 必须出现在偶数位置。
+相比 alpaca 格式的数据集，sharegpt 格式支持**更多的角色种类**，例如 human、gpt、observation、function 等等。它们构成一个对象列表呈现在 `conversations` 列中。
+
+注意其中 human 和 observation 必须出现在奇数位置，gpt 和 function 必须出现在偶数位置。
 
 ```json
 [
@@ -208,12 +218,12 @@ KTO 数据集需要额外添加一个 `kto_tag` 列，包含 bool 类型的人
         "value": "人类指令"
       },
       {
-        "from": "gpt",
-        "value": "模型回答"
+        "from": "function_call",
+        "value": "工具参数"
       },
       {
-        "from": "human",
-        "value": "人类指令"
+        "from": "observation",
+        "value": "工具结果"
       },
       {
         "from": "gpt",
@@ -236,18 +246,14 @@ KTO 数据集需要额外添加一个 `kto_tag` 列，包含 bool 类型的人
     "messages": "conversations",
     "system": "system",
     "tools": "tools"
-  },
-  "tags": {
-    "role_tag": "from",
-    "content_tag": "value",
-    "user_tag": "human",
-    "assistant_tag": "gpt"
   }
 }
 ```
 
 ### 偏好数据集
 
+- [样例数据集](dpo_zh_demo.json)
+
 Sharegpt 格式的偏好数据集同样需要在 `chosen` 列中提供更优的消息，并在 `rejected` 列中提供更差的消息。
 
 ```json

From 0aa072a155e214ae074c0add214c94f39e32a910 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 18 May 2024 22:02:42 +0800
Subject: [PATCH 327/341] improve data process logger

Former-commit-id: 33d0b012b56dbafc9fff87b821c2d1bf1409dbb5
---
 src/llamafactory/data/aligner.py       | 2 +-
 src/llamafactory/data/preprocess.py    | 5 +++++
 src/llamafactory/train/kto/workflow.py | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py
index 6a74a843..2a382c60 100644
--- a/src/llamafactory/data/aligner.py
+++ b/src/llamafactory/data/aligner.py
@@ -149,7 +149,7 @@ def convert_sharegpt(
                 chosen[dataset_attr.role_tag] not in accept_tags[-1]
                 or rejected[dataset_attr.role_tag] not in accept_tags[-1]
             ):
-                logger.warning("Invalid role tag in {}.".format(messages))
+                logger.warning("Invalid role tag in {}.".format([chosen, rejected]))
                 broken_data = True
 
             prompt = aligned_messages
diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py
index a6fb0ddc..557678e6 100644
--- a/src/llamafactory/data/preprocess.py
+++ b/src/llamafactory/data/preprocess.py
@@ -77,6 +77,7 @@ def preprocess_supervised_dataset(
 
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
+            logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
             continue
 
         if processor is not None:
@@ -129,6 +130,7 @@ def preprocess_packed_supervised_dataset(
     input_ids, labels = [], []
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
+            logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
             continue
 
         messages = examples["prompt"][i] + examples["response"][i]
@@ -178,6 +180,7 @@ def preprocess_unsupervised_dataset(
 
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1:
+            logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
             continue
 
         if processor is not None:
@@ -224,6 +227,7 @@ def preprocess_pairwise_dataset(
 
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2:
+            logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
             continue
 
         if processor is not None:
@@ -285,6 +289,7 @@ def preprocess_kto_dataset(
 
     for i in range(len(examples["prompt"])):
         if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2:
+            logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
             continue
 
         if processor is not None:
diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py
index 615fdb62..26dc770c 100644
--- a/src/llamafactory/train/kto/workflow.py
+++ b/src/llamafactory/train/kto/workflow.py
@@ -62,7 +62,7 @@ def run_kto(
         trainer.save_metrics("train", train_result.metrics)
         trainer.save_state()
         if trainer.is_world_process_zero() and finetuning_args.plot_loss:
-            plot_loss(training_args.output_dir, keys=["loss", "eval_loss"])
+            plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "train/rewards/chosen"])
 
     # Evaluation
     if training_args.do_eval:

From d13b8bee8abeb69a148ec47ab0246a6e571564a2 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 18 May 2024 22:28:45 +0800
Subject: [PATCH 328/341] fix jetmoe z3 block

Former-commit-id: cb00a14d905395c4b8fadb955f0424a4c56668de
---
 src/llamafactory/model/utils/moe.py | 30 ++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/llamafactory/model/utils/moe.py b/src/llamafactory/model/utils/moe.py
index 64dcaba5..e554e45a 100644
--- a/src/llamafactory/model/utils/moe.py
+++ b/src/llamafactory/model/utils/moe.py
@@ -20,6 +20,21 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None:
     require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0")
     from deepspeed.utils import set_z3_leaf_modules  # type: ignore
 
+    if getattr(model.config, "model_type", None) == "dbrx":
+        from transformers.models.dbrx.modeling_dbrx import DbrxFFN
+
+        set_z3_leaf_modules(model, [DbrxFFN])
+
+    if getattr(model.config, "model_type", None) == "jamba":
+        from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock
+
+        set_z3_leaf_modules(model, [JambaSparseMoeBlock])
+
+    if getattr(model.config, "model_type", None) == "jetmoe":
+        from transformers.models.jetmoe.modeling_jetmoe import JetMoeMoA, JetMoeMoE
+
+        set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE])
+
     if getattr(model.config, "model_type", None) == "mixtral":
         from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
 
@@ -30,16 +45,6 @@ def add_z3_leaf_module(model: "PreTrainedModel") -> None:
 
         set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock])
 
-    if getattr(model.config, "model_type", None) == "jamba":
-        from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock
-
-        set_z3_leaf_modules(model, [JambaSparseMoeBlock])
-
-    if getattr(model.config, "model_type", None) == "dbrx":
-        from transformers.models.dbrx.modeling_dbrx import DbrxFFN
-
-        set_z3_leaf_modules(model, [DbrxFFN])
-
 
 def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
     if model_args.moe_aux_loss_coef is not None:
@@ -49,5 +54,8 @@ def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_t
         elif getattr(config, "model_type", None) == "deepseek":
             setattr(config, "aux_loss_alpha", model_args.moe_aux_loss_coef)
 
-    if getattr(config, "model_type", None) in ["dbrx", "jamba", "mixtral", "qwen2_moe"]:
+        elif getattr(config, "model_type", None) == "jetmoe":
+            setattr(config, "aux_loss_coef", model_args.moe_aux_loss_coef)
+
+    if getattr(config, "model_type", None) in ["dbrx", "jamba", "jetmoe", "mixtral", "qwen2_moe"]:
         setattr(config, "output_router_logits", is_trainable)

From fbe963a96a6784128f575f3869b2b92ae50c5958 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 18 May 2024 22:42:28 +0800
Subject: [PATCH 329/341] safe output path in webui

Former-commit-id: 23f14262e0d54631630c084ba71e0433ea1d4640
---
 src/llamafactory/webui/common.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py
index c63e9d74..ea82fd88 100644
--- a/src/llamafactory/webui/common.py
+++ b/src/llamafactory/webui/common.py
@@ -37,8 +37,9 @@ DEFAULT_SAVE_DIR = "saves"
 USER_CONFIG = "user_config.yaml"
 
 
-def get_save_dir(*args) -> os.PathLike:
-    return os.path.join(DEFAULT_SAVE_DIR, *args)
+def get_save_dir(*paths: str) -> os.PathLike:
+    paths = (path.replace(os.path.sep, "").replace(" ", "").strip() for path in paths)
+    return os.path.join(DEFAULT_SAVE_DIR, *paths)
 
 
 def get_config_path() -> os.PathLike:

From a810d1b98e0adb80f91dd3fac21c5a5c2f5a6403 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sat, 18 May 2024 23:09:03 +0800
Subject: [PATCH 330/341] update readme

Former-commit-id: e0beb67a417b13c818a09bd419d4e20dd44ca842
---
 README.md    | 2 +-
 README_zh.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index da81a929..601f67a9 100644
--- a/README.md
+++ b/README.md
@@ -320,7 +320,7 @@ huggingface-cli login
 > Installation is mandatory.
 
 ```bash
-git clone https://github.com/hiyouga/LLaMA-Factory.git
+git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
 cd LLaMA-Factory
 pip install -e .[torch,metrics]
 ```
diff --git a/README_zh.md b/README_zh.md
index b8f5e6ab..27b122b0 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -320,7 +320,7 @@ huggingface-cli login
 > 此步骤为必需。
 
 ```bash
-git clone https://github.com/hiyouga/LLaMA-Factory.git
+git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
 cd LLaMA-Factory
 pip install -e .[torch,metrics]
 ```

From 4f04e2ed934a45b858d9e0503ef6d8cf8c46a831 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sun, 19 May 2024 17:07:57 +0800
Subject: [PATCH 331/341] fix #3807

Former-commit-id: 08b695969049de8bf9bd3e90b9700736d90385ee
---
 src/llamafactory/model/utils/visual.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/model/utils/visual.py b/src/llamafactory/model/utils/visual.py
index e3b81bd1..c8260b7f 100644
--- a/src/llamafactory/model/utils/visual.py
+++ b/src/llamafactory/model/utils/visual.py
@@ -69,7 +69,7 @@ def autocast_projector_dtype(
     ) -> "torch.Tensor":
         return output.to(model_args.compute_dtype)
 
-    if hasattr(model, mm_projector_name) and getattr(model.config, "quantization_method", None):
+    if hasattr(model, mm_projector_name) and getattr(model, "quantization_method", None):
         logger.info("Casting multimodal projector outputs in {}.".format(model_args.compute_dtype))
         mm_projector: "torch.nn.Module" = getattr(model, mm_projector_name)
         mm_projector.register_forward_hook(_mm_projector_forward_post_hook)

From 351e80a656e25242dfb9746614d031953b19b725 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sun, 19 May 2024 18:27:18 +0800
Subject: [PATCH 332/341] fix envs

Former-commit-id: d5e150cfb98f8216713415564ab386b8320c88cb
---
 src/llamafactory/extras/misc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py
index 8955acd1..0addf315 100644
--- a/src/llamafactory/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
@@ -225,4 +225,4 @@ def try_download_model_from_ms(model_args: "ModelArguments") -> str:
 
 
 def use_modelscope() -> bool:
-    return bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0")))
+    return os.environ.get("USE_MODELSCOPE_HUB", "0").lower() in ["true", "1"]

From fd02c9f973198d95da74334f3fe76f0d849736db Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sun, 19 May 2024 19:39:32 +0800
Subject: [PATCH 333/341] fix hf gen args

Former-commit-id: 491a84976258cbb2a2647922420e2f84de1e38cd
---
 src/llamafactory/chat/hf_engine.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py
index 5cb8bfe4..90fe1b81 100644
--- a/src/llamafactory/chat/hf_engine.py
+++ b/src/llamafactory/chat/hf_engine.py
@@ -97,6 +97,9 @@ class HuggingfaceEngine(BaseEngine):
         if isinstance(num_return_sequences, int) and num_return_sequences > 1:
             generating_args["do_sample"] = True
 
+        if not generating_args["temperature"]:
+            generating_args["do_sample"] = False
+
         if not generating_args["do_sample"]:
             generating_args.pop("temperature", None)
             generating_args.pop("top_p", None)

From 516d83c9461b63a572f58472e7d62940542ab899 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sun, 19 May 2024 21:53:54 +0800
Subject: [PATCH 334/341] fix zero2 high ram usage

Former-commit-id: 01797126eb173250250e31f8e76b69ae0047745d
---
 src/llamafactory/model/adapter.py | 6 +++---
 src/llamafactory/model/patcher.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py
index e868afd6..f37f3bbb 100644
--- a/src/llamafactory/model/adapter.py
+++ b/src/llamafactory/model/adapter.py
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING
 
 import torch
 from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model
-from transformers.integrations import deepspeed_config, is_deepspeed_zero3_enabled
+from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.modeling_utils import is_fsdp_enabled
 
 from ..extras.logging import get_logger
@@ -43,8 +43,8 @@ def init_adapter(
     if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None):
         raise ValueError("You can only use lora for quantized models.")
 
-    if deepspeed_config() is not None or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam:
-        logger.info("DeepSpeed/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.")
+    if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam:
+        logger.info("ZeRO3/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.")
         cast_trainable_params_to_fp32 = False
     else:
         logger.info("Upcasting trainable params to float32.")
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index 9297ef00..1a8ce607 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict
 import torch
 from peft import PeftModel
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
-from transformers.integrations import deepspeed_config, is_deepspeed_zero3_enabled
+from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.modeling_utils import is_fsdp_enabled
 
 from ..extras.logging import get_logger
@@ -72,7 +72,7 @@ def patch_config(
     # deepspeed zero3 is not compatible with low_cpu_mem_usage
     init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage and (not is_deepspeed_zero3_enabled())
 
-    if deepspeed_config() is None and not is_fsdp_enabled():  # set dtype and device map if not use deepspeed or fsdp
+    if not is_deepspeed_zero3_enabled() and not is_fsdp_enabled():  # cast dtype and device if not use zero3 or fsdp
         init_kwargs["torch_dtype"] = model_args.compute_dtype
 
         if init_kwargs["low_cpu_mem_usage"]:  # device map requires low_cpu_mem_usage=True

From 77a089c35cd2a1ddb616257847761811ca070a6f Mon Sep 17 00:00:00 2001
From: ycjcl868 <chaolinjin@gmail.com>
Date: Sun, 19 May 2024 23:17:46 +0800
Subject: [PATCH 335/341] feat: cli chat support system_message

Former-commit-id: e3982bff596d01992733687a580c4f41c558061c
---
 src/llamafactory/chat/chat_model.py         | 2 ++
 src/llamafactory/hparams/generating_args.py | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py
index 281ef0c1..aa873127 100644
--- a/src/llamafactory/chat/chat_model.py
+++ b/src/llamafactory/chat/chat_model.py
@@ -29,6 +29,7 @@ class ChatModel:
         else:
             raise NotImplementedError("Unknown backend: {}".format(model_args.infer_backend))
 
+        self.system_message = generating_args.system_message or None
         self._loop = asyncio.new_event_loop()
         self._thread = Thread(target=_start_background_loop, args=(self._loop,), daemon=True)
         self._thread.start()
@@ -63,6 +64,7 @@ class ChatModel:
         image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> Generator[str, None, None]:
+        system = system or self.system_message
         generator = self.astream_chat(messages, system, tools, image, **input_kwargs)
         while True:
             try:
diff --git a/src/llamafactory/hparams/generating_args.py b/src/llamafactory/hparams/generating_args.py
index e792c003..17669a51 100644
--- a/src/llamafactory/hparams/generating_args.py
+++ b/src/llamafactory/hparams/generating_args.py
@@ -46,6 +46,11 @@ class GeneratingArguments:
         default=1.0,
         metadata={"help": "Exponential penalty to the length that is used with beam-based generation."},
     )
+    system_message: str = field(
+        default=None,
+        metadata={
+            "help": "System message is a message that the developer wrote to tell the bot how to interpret the conversation"},
+    )
 
     def to_dict(self) -> Dict[str, Any]:
         args = asdict(self)

From 3453a8eebb80b2451fac5b356bfff510f71de12f Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Sun, 19 May 2024 23:38:30 +0800
Subject: [PATCH 336/341] fix jinja template

Former-commit-id: 353561f0e3914de3f81499c4e4b831ae0a6383b6
---
 src/llamafactory/data/template.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py
index 66f6f651..c3b94bc6 100644
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -276,7 +276,7 @@ def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str)
 
 
 def _jinja_escape(content: str) -> str:
-    return content.replace("\n", r"\n").replace("'", r"\'")
+    return content.replace("'", r"\'")
 
 
 def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str:

From 17d398f41984f7a67b181b25dd80fe806b458d2d Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 20 May 2024 00:29:12 +0800
Subject: [PATCH 337/341] Update chat_model.py

Former-commit-id: 7736aafdc81d175e9fb484dbb7cae9263120a0fc
---
 src/llamafactory/chat/chat_model.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/llamafactory/chat/chat_model.py b/src/llamafactory/chat/chat_model.py
index aa873127..281ef0c1 100644
--- a/src/llamafactory/chat/chat_model.py
+++ b/src/llamafactory/chat/chat_model.py
@@ -29,7 +29,6 @@ class ChatModel:
         else:
             raise NotImplementedError("Unknown backend: {}".format(model_args.infer_backend))
 
-        self.system_message = generating_args.system_message or None
         self._loop = asyncio.new_event_loop()
         self._thread = Thread(target=_start_background_loop, args=(self._loop,), daemon=True)
         self._thread.start()
@@ -64,7 +63,6 @@ class ChatModel:
         image: Optional["NDArray"] = None,
         **input_kwargs,
     ) -> Generator[str, None, None]:
-        system = system or self.system_message
         generator = self.astream_chat(messages, system, tools, image, **input_kwargs)
         while True:
             try:

From 3578abc7a4c53c89b181ee3851afceb3e80d8512 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 20 May 2024 00:29:31 +0800
Subject: [PATCH 338/341] Update generating_args.py

Former-commit-id: 861c146fa7d9cb5b99372464bd068c20fa36415d
---
 src/llamafactory/hparams/generating_args.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/llamafactory/hparams/generating_args.py b/src/llamafactory/hparams/generating_args.py
index 17669a51..0ee17d1a 100644
--- a/src/llamafactory/hparams/generating_args.py
+++ b/src/llamafactory/hparams/generating_args.py
@@ -1,5 +1,5 @@
 from dataclasses import asdict, dataclass, field
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 
 @dataclass
@@ -46,10 +46,9 @@ class GeneratingArguments:
         default=1.0,
         metadata={"help": "Exponential penalty to the length that is used with beam-based generation."},
     )
-    system_message: str = field(
+    default_system: Optional[str] = field(
         default=None,
-        metadata={
-            "help": "System message is a message that the developer wrote to tell the bot how to interpret the conversation"},
+        metadata={"help": "Default system message to use in chat completion."},
     )
 
     def to_dict(self) -> Dict[str, Any]:

From b103a121f056595f905e7c4d0c62d38a91c05bd2 Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 20 May 2024 00:30:45 +0800
Subject: [PATCH 339/341] Update hf_engine.py

Former-commit-id: ce8b902e538c69d89f207db8a43c85072cd70265
---
 src/llamafactory/chat/hf_engine.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py
index 90fe1b81..1ef99d9f 100644
--- a/src/llamafactory/chat/hf_engine.py
+++ b/src/llamafactory/chat/hf_engine.py
@@ -59,6 +59,7 @@ class HuggingfaceEngine(BaseEngine):
             messages[0]["content"] = "<image>" + messages[0]["content"]
 
         paired_messages = messages + [{"role": "assistant", "content": ""}]
+        system = system or generating_args["default_system"]
         prompt_ids, _ = template.encode_oneturn(
             tokenizer=tokenizer, messages=paired_messages, system=system, tools=tools
         )

From e093dad7cb2a3d8b7e7fe59642cdfb2bc254a3fc Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Mon, 20 May 2024 00:31:04 +0800
Subject: [PATCH 340/341] Update vllm_engine.py

Former-commit-id: 0b8278bd21baf35d3f60c6ed24f110b391c92a47
---
 src/llamafactory/chat/vllm_engine.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py
index ba0cc1b3..2e8ecd0c 100644
--- a/src/llamafactory/chat/vllm_engine.py
+++ b/src/llamafactory/chat/vllm_engine.py
@@ -96,6 +96,7 @@ class VllmEngine(BaseEngine):
             messages[0]["content"] = "<image>" * self.image_feature_size + messages[0]["content"]
 
         paired_messages = messages + [{"role": "assistant", "content": ""}]
+        system = system or self.generating_args["default_system"]
         prompt_ids, _ = self.template.encode_oneturn(
             tokenizer=self.tokenizer, messages=paired_messages, system=system, tools=tools
         )

From ab48653e6394c9587eed15a7cd5e99d1a9054cc7 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Mon, 20 May 2024 00:36:43 +0800
Subject: [PATCH 341/341] fix chat engines

Do not use pop(key, default): the API passes unset options as explicit
None values, so the key exists and the default is never applied.


Former-commit-id: 3ebbd0b55ea07de2897c27ca54eeab5c3b319419
---
 src/llamafactory/chat/hf_engine.py   | 39 ++++++++++++------------
 src/llamafactory/chat/vllm_engine.py | 44 +++++++++++++++++-----------
 src/llamafactory/data/template.py    |  4 +--
 3 files changed, 50 insertions(+), 37 deletions(-)

diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py
index 1ef99d9f..57cdc89a 100644
--- a/src/llamafactory/chat/hf_engine.py
+++ b/src/llamafactory/chat/hf_engine.py
@@ -2,7 +2,7 @@ import asyncio
 import concurrent.futures
 import os
 from threading import Thread
-from typing import TYPE_CHECKING, Any, AsyncGenerator, Callable, Dict, List, Optional, Sequence, Tuple
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import torch
 from transformers import GenerationConfig, TextIteratorStreamer
@@ -66,16 +66,16 @@ class HuggingfaceEngine(BaseEngine):
         prompt_length = len(prompt_ids)
         inputs = torch.tensor([prompt_ids], device=model.device)
 
-        do_sample = input_kwargs.pop("do_sample", generating_args["do_sample"])
-        temperature = input_kwargs.pop("temperature", generating_args["temperature"])
-        top_p = input_kwargs.pop("top_p", generating_args["top_p"])
-        top_k = input_kwargs.pop("top_k", generating_args["top_k"])
-        num_return_sequences = input_kwargs.pop("num_return_sequences", 1)
-        repetition_penalty = input_kwargs.pop("repetition_penalty", generating_args["repetition_penalty"])
-        length_penalty = input_kwargs.pop("length_penalty", generating_args["length_penalty"])
-        max_length = input_kwargs.pop("max_length", None)
-        max_new_tokens = input_kwargs.pop("max_new_tokens", None)
-        stop = input_kwargs.pop("stop", None)
+        do_sample: Optional[bool] = input_kwargs.pop("do_sample", None)
+        temperature: Optional[float] = input_kwargs.pop("temperature", None)
+        top_p: Optional[float] = input_kwargs.pop("top_p", None)
+        top_k: Optional[float] = input_kwargs.pop("top_k", None)
+        num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1)
+        repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None)
+        length_penalty: Optional[float] = input_kwargs.pop("length_penalty", None)
+        max_length: Optional[int] = input_kwargs.pop("max_length", None)
+        max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+        stop: Optional[Union[str, List[str]]] = input_kwargs.pop("stop", None)
 
         if stop is not None:
             raise ValueError("Stop parameter is not supported in Huggingface engine yet.")
@@ -83,20 +83,23 @@ class HuggingfaceEngine(BaseEngine):
         generating_args = generating_args.copy()
         generating_args.update(
             dict(
-                do_sample=do_sample,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k,
+                do_sample=do_sample if do_sample is not None else generating_args["do_sample"],
+                temperature=temperature if temperature is not None else generating_args["temperature"],
+                top_p=top_p if top_p is not None else generating_args["top_p"],
+                top_k=top_k if top_k is not None else generating_args["top_k"],
                 num_return_sequences=num_return_sequences,
-                repetition_penalty=repetition_penalty,
-                length_penalty=length_penalty,
+                repetition_penalty=repetition_penalty
+                if repetition_penalty is not None
+                else generating_args["repetition_penalty"],
+                length_penalty=length_penalty if length_penalty is not None else generating_args["length_penalty"],
                 eos_token_id=[tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids,
                 pad_token_id=tokenizer.pad_token_id,
             )
         )
 
-        if isinstance(num_return_sequences, int) and num_return_sequences > 1:
+        if isinstance(num_return_sequences, int) and num_return_sequences > 1:  # do_sample needs temperature > 0
             generating_args["do_sample"] = True
+            generating_args["temperature"] = generating_args["temperature"] or 1.0
 
         if not generating_args["temperature"]:
             generating_args["do_sample"] = False
diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py
index 2e8ecd0c..44b9651f 100644
--- a/src/llamafactory/chat/vllm_engine.py
+++ b/src/llamafactory/chat/vllm_engine.py
@@ -1,5 +1,5 @@
 import uuid
-from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence
+from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence, Union
 
 from ..data import get_template_and_fix_tokenizer
 from ..extras.logging import get_logger
@@ -102,18 +102,25 @@ class VllmEngine(BaseEngine):
         )
         prompt_length = len(prompt_ids)
 
-        use_beam_search = self.generating_args["num_beams"] > 1
-        temperature = input_kwargs.pop("temperature", self.generating_args["temperature"])
-        top_p = input_kwargs.pop("top_p", self.generating_args["top_p"])
-        top_k = input_kwargs.pop("top_k", self.generating_args["top_k"])
-        num_return_sequences = input_kwargs.pop("num_return_sequences", 1)
-        repetition_penalty = input_kwargs.pop("repetition_penalty", self.generating_args["repetition_penalty"])
-        length_penalty = input_kwargs.pop("length_penalty", self.generating_args["length_penalty"])
-        max_length = input_kwargs.pop("max_length", None)
-        max_new_tokens = input_kwargs.pop("max_new_tokens", None)
-        stop = input_kwargs.pop("stop", None)
+        use_beam_search: bool = self.generating_args["num_beams"] > 1
+        temperature: Optional[float] = input_kwargs.pop("temperature", None)
+        top_p: Optional[float] = input_kwargs.pop("top_p", None)
+        top_k: Optional[float] = input_kwargs.pop("top_k", None)
+        num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1)
+        repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None)
+        length_penalty: Optional[float] = input_kwargs.pop("length_penalty", None)
+        max_length: Optional[int] = input_kwargs.pop("max_length", None)
+        max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+        stop: Optional[Union[str, List[str]]] = input_kwargs.pop("stop", None)
+
+        if "max_new_tokens" in self.generating_args:
+            max_tokens = self.generating_args["max_new_tokens"]
+        elif "max_length" in self.generating_args:
+            if self.generating_args["max_length"] > prompt_length:
+                max_tokens = self.generating_args["max_length"] - prompt_length
+            else:
+                max_tokens = 1
 
-        max_tokens = self.generating_args["max_new_tokens"] or self.generating_args["max_length"]
         if max_length:
             max_tokens = max_length - prompt_length if max_length > prompt_length else 1
 
@@ -122,12 +129,15 @@ class VllmEngine(BaseEngine):
 
         sampling_params = SamplingParams(
             n=num_return_sequences,
-            repetition_penalty=repetition_penalty,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
+            repetition_penalty=(
+                repetition_penalty if repetition_penalty is not None else self.generating_args["repetition_penalty"]
+            )
+            or 1.0,  # repetition_penalty must > 0
+            temperature=temperature if temperature is not None else self.generating_args["temperature"],
+            top_p=(top_p if top_p is not None else self.generating_args["top_p"]) or 1.0,  # top_p must > 0
+            top_k=top_k if top_k is not None else self.generating_args["top_k"],
             use_beam_search=use_beam_search,
-            length_penalty=length_penalty,
+            length_penalty=length_penalty if length_penalty is not None else self.generating_args["length_penalty"],
             stop=stop,
             stop_token_ids=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids,
             max_tokens=max_tokens,
diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py
index c3b94bc6..66e9dca5 100644
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -68,8 +68,8 @@ class Template:
         self,
         tokenizer: "PreTrainedTokenizer",
         messages: List[Dict[str, str]],
-        system: str,
-        tools: str,
+        system: Optional[str],
+        tools: Optional[str],
         cutoff_len: int,
         reserved_label_len: int,
     ) -> Sequence[Tuple[List[int], List[int]]]: