diff --git a/README.md b/README.md
index 5f5ab7f9..9d1ea252 100644
--- a/README.md
+++ b/README.md
@@ -47,10 +47,11 @@
 | [Falcon](https://huggingface.co/tiiuae/falcon-7b) | 7B/40B | query_key_value | - |
 | [Baichuan](https://github.com/baichuan-inc/Baichuan-13B) | 7B/13B | W_pack | baichuan |
 | [Baichuan2](https://github.com/baichuan-inc/Baichuan2) | 7B/13B | W_pack | baichuan2 |
-| [InternLM](https://github.com/InternLM/InternLM) | 7B | q_proj,v_proj | intern |
+| [InternLM](https://github.com/InternLM/InternLM) | 7B/20B | q_proj,v_proj | intern |
 | [Qwen](https://github.com/QwenLM/Qwen-7B) | 7B | c_attn | chatml |
 | [XVERSE](https://github.com/xverse-ai/XVERSE-13B) | 13B | q_proj,v_proj | xverse |
 | [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B) | 6B | query_key_value | chatglm2 |
+| [Phi-1.5](https://huggingface.co/microsoft/phi-1_5) | 1.5B | Wqkv | - |
 
 > [!NOTE]
 > **Default module** is used for the `--lora_target` argument, you can use `--lora_target all` to specify all the available modules.
@@ -157,7 +158,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 CUDA_VISIBLE_DEVICES=0 python src/train_web.py
 ```
 
-We strongly recommend using the all-in-one Web UI for newcomers since it can also generate training scripts **automatically**.
+We **strongly recommend** using the all-in-one Web UI for newcomers since it can also generate training scripts automatically, even without a GPU environment.
 
 > [!WARNING]
 > Currently the web UI only supports training on **a single GPU**.
@@ -457,6 +458,7 @@ Please follow the model licenses to use the corresponding model weights:
 - [Qwen](https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/LICENSE)
 - [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf)
 - [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B/blob/main/MODEL_LICENSE)
+- [Phi-1.5](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx)
 
 ## Citation
 
diff --git a/README_zh.md b/README_zh.md
index b5e94252..350e2ddf 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -47,10 +47,11 @@
 | [Falcon](https://huggingface.co/tiiuae/falcon-7b) | 7B/40B | query_key_value | - |
 | [Baichuan](https://github.com/baichuan-inc/Baichuan-13B) | 7B/13B | W_pack | baichuan |
 | [Baichuan2](https://github.com/baichuan-inc/Baichuan2) | 7B/13B | W_pack | baichuan2 |
-| [InternLM](https://github.com/InternLM/InternLM) | 7B | q_proj,v_proj | intern |
+| [InternLM](https://github.com/InternLM/InternLM) | 7B/20B | q_proj,v_proj | intern |
 | [Qwen](https://github.com/QwenLM/Qwen-7B) | 7B | c_attn | chatml |
 | [XVERSE](https://github.com/xverse-ai/XVERSE-13B) | 13B | q_proj,v_proj | xverse |
 | [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B) | 6B | query_key_value | chatglm2 |
+| [Phi-1.5](https://huggingface.co/microsoft/phi-1_5) | 1.5B | Wqkv | - |
 
 > [!NOTE]
 > **默认模块**应作为 `--lora_target` 参数的默认值,可使用 `--lora_target all` 参数指定全部模块。
@@ -157,7 +158,7 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl
 CUDA_VISIBLE_DEVICES=0 python src/train_web.py
 ```
 
-我们极力推荐新手使用浏览器一体化界面,因为它还可以**自动**生成运行所需的命令行脚本。
+我们**极力推荐**新手使用浏览器一体化界面,因为它还可以不依赖 GPU 环境自动生成在 GPU 上运行的命令行脚本。
 
 > [!WARNING]
 > 目前网页 UI 仅支持**单卡训练**。
@@ -456,6 +457,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 - [Qwen](https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/LICENSE)
 - [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf)
 - [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B/blob/main/MODEL_LICENSE)
+- [Phi-1.5](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx)
 
 ## 引用
 
diff --git a/src/llmtuner/tuner/core/loader.py b/src/llmtuner/tuner/core/loader.py
index 6cc40a33..a14e02a7 100644
--- a/src/llmtuner/tuner/core/loader.py
+++ b/src/llmtuner/tuner/core/loader.py
@@ -13,14 +13,14 @@ from transformers import (
     PreTrainedModel,
     PreTrainedTokenizerBase
 )
-from transformers.utils import check_min_version, is_torch_npu_available
+from transformers.utils import check_min_version
 from transformers.utils.versions import require_version
 from trl import AutoModelForCausalLMWithValueHead
 
 try:
-    from transformers.deepspeed import is_deepspeed_zero3_enabled
-except ImportError:
     from transformers.integrations import is_deepspeed_zero3_enabled
+except ImportError:
+    from transformers.deepspeed import is_deepspeed_zero3_enabled
 
 from llmtuner.extras.logging import reset_logging, get_logger
 from llmtuner.extras.misc import count_parameters
@@ -85,7 +85,7 @@ def load_model_and_tokenizer(
     config = AutoConfig.from_pretrained(model_to_load, **config_kwargs)
 
     # Fix config (for Qwen)
-    if is_trainable and hasattr(config, "fp16") and hasattr(config, "bf16"):
+    if hasattr(config, "fp16") and hasattr(config, "bf16"):
         setattr(config, "fp16", model_args.compute_dtype == torch.float16)
         setattr(config, "bf16", model_args.compute_dtype == torch.bfloat16)
 
@@ -215,11 +215,7 @@ def load_model_and_tokenizer(
     # Prepare model for inference
     if not is_trainable:
         model.requires_grad_(False) # fix all model params
-        if is_torch_npu_available():
-            infer_dtype = torch.float16
-        else:
-            infer_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 # detect cuda capability
-        model = model.to(infer_dtype) if model_args.quantization_bit is None else model
+        model = model.to(model_args.compute_dtype) if model_args.quantization_bit is None else model
 
     trainable_params, all_param = count_parameters(model)
     logger.info("trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
diff --git a/src/llmtuner/tuner/core/parser.py b/src/llmtuner/tuner/core/parser.py
index 2bb21325..47fb8b2d 100644
--- a/src/llmtuner/tuner/core/parser.py
+++ b/src/llmtuner/tuner/core/parser.py
@@ -8,6 +8,14 @@ from transformers import HfArgumentParser, Seq2SeqTrainingArguments
 from transformers.utils.versions import require_version
 from transformers.trainer_utils import get_last_checkpoint
 
+try:
+    from transformers.utils import is_torch_bf16_gpu_available, is_torch_npu_available
+    is_bf16_available = is_torch_bf16_gpu_available()
+    is_npu_available = is_torch_npu_available()
+except ImportError:
+    is_bf16_available = torch.cuda.is_bf16_supported()
+    is_npu_available = False
+
 from llmtuner.extras.logging import get_logger
 from llmtuner.hparams import (
     ModelArguments,
@@ -197,7 +205,7 @@ def get_train_args(
 
     # postprocess model_args
     if training_args.bf16:
-        if not torch.cuda.is_bf16_supported():
+        if not is_bf16_available:
             raise ValueError("Current device does not support bf16 training.")
         model_args.compute_dtype = torch.bfloat16
     elif training_args.fp16:
@@ -243,4 +251,12 @@ def get_infer_args(
     if model_args.quantization_bit is not None and len(model_args.checkpoint_dir) != 1:
         raise ValueError("Quantized model only accepts a single checkpoint. Merge them first.")
 
+    # auto-detect cuda capability
+    if is_npu_available:
+        model_args.compute_dtype = torch.float16
+    elif is_bf16_available:
+        model_args.compute_dtype = torch.bfloat16
+    else:
+        model_args.compute_dtype = torch.float16
+
     return model_args, data_args, finetuning_args, generating_args
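
For reference, the capability detection this patch centralizes in `parser.py` boils down to the pattern sketched below. This is a minimal, standalone illustration rather than the project's actual module: the `pick_compute_dtype` helper is an invented name, and the only API calls assumed are the `transformers`/`torch` probes that appear in the diff itself.

```python
import torch

try:
    # Newer transformers releases expose both probes in transformers.utils.
    from transformers.utils import is_torch_bf16_gpu_available, is_torch_npu_available
    is_bf16_available = is_torch_bf16_gpu_available()
    is_npu_available = is_torch_npu_available()
except ImportError:
    # Older releases lack the NPU probe; mirror the patch's fallback to the
    # plain CUDA check and assume no NPU is present.
    is_bf16_available = torch.cuda.is_bf16_supported()
    is_npu_available = False


def pick_compute_dtype() -> torch.dtype:
    # Hypothetical helper mirroring the fallback order used in get_infer_args:
    # NPU -> float16, bf16-capable GPU -> bfloat16, otherwise float16.
    if is_npu_available:
        return torch.float16
    if is_bf16_available:
        return torch.bfloat16
    return torch.float16


if __name__ == "__main__":
    print(f"inference compute dtype: {pick_compute_dtype()}")
```

Doing the detection once at argument-parsing time is what allows `loader.py` to drop its own `is_torch_npu_available` import and simply cast with `model.to(model_args.compute_dtype)` when a model is loaded for inference.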