diff --git a/requirements.txt b/requirements.txt
index fe60a183..67982a3a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,16 +3,14 @@ datasets>=2.16.0,<=3.5.0
 accelerate>=0.34.0,<=1.6.0
 peft>=0.14.0,<=0.15.1
 trl>=0.8.6,<=0.9.6
-tokenizers>=0.19.0,<=0.21.0
-gradio>=4.38.0,<=5.21.0
-pandas>=2.0.0
+tokenizers>=0.19.0,<=0.21.1
+gradio>=4.38.0,<=5.25.0
 scipy
 einops
 sentencepiece
 tiktoken
 protobuf
 uvicorn
-pydantic
 fastapi
 sse-starlette
 matplotlib>=3.7.0
@@ -21,6 +19,7 @@ packaging
 pyyaml
 numpy<2.0.0
 pydantic<=2.10.6
+pandas>=2.0.0
 av
 librosa
 tyro<0.9.0
diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py
index 075ef845..7e7e06cf 100644
--- a/src/llamafactory/cli.py
+++ b/src/llamafactory/cli.py
@@ -15,6 +15,7 @@
 import os
 import subprocess
 import sys
+from copy import deepcopy
 from enum import Enum, unique
 
 from . import launcher
@@ -96,6 +97,13 @@ def main():
             if int(nnodes) > 1:
                 print(f"Multi-node training enabled: num nodes: {nnodes}, node rank: {node_rank}")
 
+            env = deepcopy(os.environ)
+            if is_env_enabled("OPTIM_TORCH", "1"):
+                # optimize DDP, see https://zhuanlan.zhihu.com/p/671834539
+                env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+                env["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+            # NOTE: DO NOT USE shell=True to avoid security risk
             process = subprocess.run(
                 (
                     "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} "
@@ -110,7 +118,9 @@
                     file_name=launcher.__file__,
                     args=" ".join(sys.argv[1:]),
                 )
-                .split()
+                .split(),
+                env=env,
+                check=True,
             )
             sys.exit(process.returncode)
         else:
diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index 9291968f..e10b6b7f 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -727,23 +727,23 @@
         },
         "GLM-4-9B-Chat-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-9B-Chat-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-9B-Chat-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-9B-Chat-0414",
         },
         "GLM-4-32B-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-32B-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-0414",
         },
         "GLM-4-32B-Chat-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-32B-Chat-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-Chat-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-32B-Chat-0414",
         },
         "GLM-4-Z1-9B-Chat-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-Z1-9B-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-Z1-9B-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-Z1-9B-0414",
         },
         "GLM-4-Z1-32B-Chat-0414": {
             DownloadSource.DEFAULT: "THUDM/GLM-4-Z1-32B-0414",
-            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-Z1-32B-0414" ,
+            DownloadSource.MODELSCOPE: "ZhipuAI/GLM-4-Z1-32B-0414",
         },
     },
     template="glm4",
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index 1b9d7153..5be79ed1 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -390,8 +390,10 @@ def get_train_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _
 def get_infer_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _INFER_CLS:
     model_args, data_args, finetuning_args, generating_args = _parse_infer_args(args)
 
+    # Setup logging
     _set_transformers_logging()
 
+    # Check arguments
     if model_args.infer_backend == "vllm":
         if finetuning_args.stage != "sft":
             raise ValueError("vLLM engine only supports auto-regressive models.")
@@ -408,6 +410,7 @@ def get_infer_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _
     _verify_model_args(model_args, data_args, finetuning_args)
     _check_extra_dependencies(model_args, finetuning_args)
 
+    # Post-process model arguments
     if model_args.export_dir is not None and model_args.export_device == "cpu":
         model_args.device_map = {"": torch.device("cpu")}
     if data_args.cutoff_len != DataArguments().cutoff_len:  # override cutoff_len if it is not default
@@ -421,8 +424,10 @@ def get_infer_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _
 def get_eval_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _EVAL_CLS:
     model_args, data_args, eval_args, finetuning_args = _parse_eval_args(args)
 
+    # Setup logging
     _set_transformers_logging()
 
+    # Check arguments
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
 
diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py
index de0bd299..8be0e7bd 100644
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
@@ -96,6 +96,7 @@ def patch_config(
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
     if is_torch_npu_available():
+        # avoid JIT compile on NPU devices, see https://zhuanlan.zhihu.com/p/660875458
         torch.npu.set_compile_mode(jit_compile=is_env_enabled("JIT_COMPILE"))
 
     configure_attn_implementation(config, model_args, is_trainable)
diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py
index 8f9176d2..923a7105 100644
--- a/src/llamafactory/webui/runner.py
+++ b/src/llamafactory/webui/runner.py
@@ -368,6 +368,7 @@ class Runner:
         if args.get("deepspeed", None) is not None:
             env["FORCE_TORCHRUN"] = "1"
 
+        # NOTE: DO NOT USE shell=True to avoid security risk
         self.trainer = Popen(["llamafactory-cli", "train", save_cmd(args)], env=env)
         yield from self.monitor()
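A minimal standalone sketch of the launch pattern the cli.py hunks adopt, for context: copy the parent environment, opt into the allocator and NCCL settings when OPTIM_TORCH is enabled, then start torchrun through subprocess.run with an argument list (never shell=True) and check=True. Only the two environment variable names and the deepcopy come from the diff; the OPTIM_TORCH parsing, node/process counts, and the train_script.py entry point are illustrative assumptions.

    # sketch.py -- illustrative only, not part of the repository
    import os
    import subprocess
    import sys
    from copy import deepcopy

    env = deepcopy(os.environ)  # copy so the parent process environment is left untouched
    if os.getenv("OPTIM_TORCH", "1").lower() in ("true", "1", "y"):  # assumed truthy check
        # reduce CUDA memory fragmentation and NCCL memory overhead in DDP
        env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
        env["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

    command = [
        "torchrun",
        "--nnodes", os.getenv("NNODES", "1"),
        "--nproc_per_node", os.getenv("NPROC_PER_NODE", "1"),
        "train_script.py",  # placeholder entry point
        *sys.argv[1:],
    ]
    # passing a list (not a shell string) avoids shell injection; check=True raises on non-zero exit
    process = subprocess.run(command, env=env, check=True)
    sys.exit(process.returncode)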