diff --git a/.env.local b/.env.local
index fb423d75..8f361917 100644
--- a/.env.local
+++ b/.env.local
@@ -17,7 +17,7 @@ FORCE_TORCHRUN=
 MASTER_ADDR=
 MASTER_PORT=
 NNODES=
-RANK=
+NODE_RANK=
 NPROC_PER_NODE=
 # wandb
 WANDB_DISABLED=
diff --git a/Makefile b/Makefile
index c1c45951..030b39b1 100644
--- a/Makefile
+++ b/Makefile
@@ -18,4 +18,4 @@ style:
 	ruff format $(check_dirs)
 
 test:
-	CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest tests/
+	CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest -vv tests/
diff --git a/examples/README.md b/examples/README.md
index 5df1886f..3a98a088 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -89,8 +89,8 @@ llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml
 #### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
 ```
 
 #### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 46d43402..45e96bcf 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -89,8 +89,8 @@ llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml
 #### 多机指令监督微调
 
 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
 ```
 
 #### 使用 DeepSpeed ZeRO-3 平均分配显存
diff --git a/src/api.py b/src/api.py
index 25a7c2e7..ad2e8cbb 100644
--- a/src/api.py
+++ b/src/api.py
@@ -23,8 +23,8 @@ from llamafactory.chat import ChatModel
 def main():
     chat_model = ChatModel()
     app = create_app(chat_model)
-    api_host = os.environ.get("API_HOST", "0.0.0.0")
-    api_port = int(os.environ.get("API_PORT", "8000"))
+    api_host = os.getenv("API_HOST", "0.0.0.0")
+    api_port = int(os.getenv("API_PORT", "8000"))
     print(f"Visit http://localhost:{api_port}/docs for API document.")
     uvicorn.run(app, host=api_host, port=api_port)
 
diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py
index a39c3147..59db566a 100644
--- a/src/llamafactory/cli.py
+++ b/src/llamafactory/cli.py
@@ -86,19 +86,19 @@ def main():
     elif command == Command.EXPORT:
         export_model()
     elif command == Command.TRAIN:
-        force_torchrun = os.environ.get("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
+        force_torchrun = os.getenv("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
         if force_torchrun or get_device_count() > 1:
-            master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
-            master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
+            master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
+            master_port = os.getenv("MASTER_PORT", str(random.randint(20001, 29999)))
             logger.info(f"Initializing distributed tasks at: {master_addr}:{master_port}")
             process = subprocess.run(
                 (
                     "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} "
                     "--master_addr {master_addr} --master_port {master_port} {file_name} {args}"
                 ).format(
-                    nnodes=os.environ.get("NNODES", "1"),
-                    node_rank=os.environ.get("RANK", "0"),
-                    nproc_per_node=os.environ.get("NPROC_PER_NODE", str(get_device_count())),
+                    nnodes=os.getenv("NNODES", "1"),
+                    node_rank=os.getenv("NODE_RANK", "0"),
+                    nproc_per_node=os.getenv("NPROC_PER_NODE", str(get_device_count())),
                     master_addr=master_addr,
                     master_port=master_port,
                     file_name=launcher.__file__,
diff --git a/src/llamafactory/model/model_utils/checkpointing.py b/src/llamafactory/model/model_utils/checkpointing.py
index 88769ae9..bd75821c 100644
--- a/src/llamafactory/model/model_utils/checkpointing.py
+++ b/src/llamafactory/model/model_utils/checkpointing.py
@@ -19,7 +19,7 @@
 # limitations under the License.
 
 import inspect
-from functools import partial, wraps
+from functools import WRAPPER_ASSIGNMENTS, partial, wraps
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
 
@@ -81,7 +81,7 @@ def get_custom_gradient_checkpointing_func(gradient_checkpointing_func: Callable
     Only applies gradient checkpointing to trainable layers.
     """
 
-    @wraps(gradient_checkpointing_func)
+    @wraps(gradient_checkpointing_func, assigned=WRAPPER_ASSIGNMENTS + ("__self__",))
     def custom_gradient_checkpointing_func(func: Callable, *args: Union["torch.Tensor", Any], **kwargs):
         module: "torch.nn.Module" = func.__self__
 
@@ -92,9 +92,6 @@ def get_custom_gradient_checkpointing_func(gradient_checkpointing_func: Callable
 
         return gradient_checkpointing_func(func, *args, **kwargs)
 
-    if hasattr(gradient_checkpointing_func, "__self__"):  # fix unsloth gc test case
-        custom_gradient_checkpointing_func.__self__ = gradient_checkpointing_func.__self__
-
     return custom_gradient_checkpointing_func
 
 
diff --git a/src/llamafactory/train/test_utils.py b/src/llamafactory/train/test_utils.py
index 649a4795..55e6c199 100644
--- a/src/llamafactory/train/test_utils.py
+++ b/src/llamafactory/train/test_utils.py
@@ -80,18 +80,17 @@ def load_reference_model(
     is_trainable: bool = False,
     add_valuehead: bool = False,
 ) -> Union["PreTrainedModel", "LoraModel"]:
+    current_device = get_current_device()
     if add_valuehead:
         model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained(
-            model_path, torch_dtype=torch.float16, device_map=get_current_device()
+            model_path, torch_dtype=torch.float16, device_map=current_device
         )
         if not is_trainable:
             model.v_head = model.v_head.to(torch.float16)
 
         return model
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path, torch_dtype=torch.float16, device_map=get_current_device()
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map=current_device)
     if use_lora or use_pissa:
         model = PeftModel.from_pretrained(
             model, lora_path, subfolder="pissa_init" if use_pissa else None, is_trainable=is_trainable
@@ -110,7 +109,7 @@ def load_train_dataset(**kwargs) -> "Dataset":
     return dataset_module["train_dataset"]
 
 
-def patch_valuehead_model():
+def patch_valuehead_model() -> None:
     def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]) -> None:
         state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")}
         self.v_head.load_state_dict(state_dict, strict=False)
diff --git a/tests/data/processors/test_feedback.py b/tests/data/processors/test_feedback.py
index 2f87ccc2..c04e823b 100644
--- a/tests/data/processors/test_feedback.py
+++ b/tests/data/processors/test_feedback.py
@@ -23,9 +23,9 @@ from llamafactory.extras.constants import IGNORE_INDEX
 from llamafactory.train.test_utils import load_train_dataset
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/data/processors/test_pairwise.py b/tests/data/processors/test_pairwise.py
index 4d3f26bd..da50ca24 100644
--- a/tests/data/processors/test_pairwise.py
+++ b/tests/data/processors/test_pairwise.py
@@ -24,9 +24,9 @@ from llamafactory.extras.constants import IGNORE_INDEX
 from llamafactory.train.test_utils import load_train_dataset
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/data/processors/test_supervised.py b/tests/data/processors/test_supervised.py
index 8df9530a..965429a6 100644
--- a/tests/data/processors/test_supervised.py
+++ b/tests/data/processors/test_supervised.py
@@ -23,11 +23,11 @@ from llamafactory.extras.constants import IGNORE_INDEX
 from llamafactory.train.test_utils import load_train_dataset
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_DATA = os.environ.get("TINY_DATA", "llamafactory/tiny-supervised-dataset")
+TINY_DATA = os.getenv("TINY_DATA", "llamafactory/tiny-supervised-dataset")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/data/processors/test_unsupervised.py b/tests/data/processors/test_unsupervised.py
index 1bfab53e..c59fa5b2 100644
--- a/tests/data/processors/test_unsupervised.py
+++ b/tests/data/processors/test_unsupervised.py
@@ -22,11 +22,11 @@ from transformers import AutoTokenizer
 from llamafactory.train.test_utils import load_train_dataset
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_DATA = os.environ.get("TINY_DATA", "llamafactory/tiny-supervised-dataset")
+TINY_DATA = os.getenv("TINY_DATA", "llamafactory/tiny-supervised-dataset")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py
index 66e9b57c..5ab91d2e 100644
--- a/tests/data/test_mm_plugin.py
+++ b/tests/data/test_mm_plugin.py
@@ -31,9 +31,9 @@ if TYPE_CHECKING:
     from llamafactory.data.mm_plugin import BasePlugin
 
 
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
+HF_TOKEN = os.getenv("HF_TOKEN")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 MM_MESSAGES = [
     {"role": "user", "content": "What is in this image?"},
diff --git a/tests/data/test_template.py b/tests/data/test_template.py
index 18d03958..ba3a9953 100644
--- a/tests/data/test_template.py
+++ b/tests/data/test_template.py
@@ -27,9 +27,9 @@ if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer
 
 
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
+HF_TOKEN = os.getenv("HF_TOKEN")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 MESSAGES = [
     {"role": "user", "content": "How are you"},
diff --git a/tests/e2e/test_chat.py b/tests/e2e/test_chat.py
index 539a0ab9..b95646d7 100644
--- a/tests/e2e/test_chat.py
+++ b/tests/e2e/test_chat.py
@@ -17,7 +17,7 @@ import os
 from llamafactory.chat import ChatModel
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 INFER_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/e2e/test_train.py b/tests/e2e/test_train.py
index a8a24e42..71cda495 100644
--- a/tests/e2e/test_train.py
+++ b/tests/e2e/test_train.py
@@ -19,11 +19,11 @@ import pytest
 from llamafactory.train.tuner import export_model, run_exp
 
 
-DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
+DEMO_DATA = os.getenv("DEMO_DATA", "llamafactory/demo_data")
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_LLAMA_ADAPTER = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
+TINY_LLAMA_ADAPTER = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
@@ -46,7 +46,7 @@ INFER_ARGS = {
     "infer_dtype": "float16",
 }
 
-OS_NAME = os.environ.get("OS_NAME", "")
+OS_NAME = os.getenv("OS_NAME", "")
 
 
 @pytest.mark.parametrize(
diff --git a/tests/model/model_utils/test_attention.py b/tests/model/model_utils/test_attention.py
index e263e6da..3861f4bb 100644
--- a/tests/model/model_utils/test_attention.py
+++ b/tests/model/model_utils/test_attention.py
@@ -19,7 +19,7 @@ from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_availabl
 from llamafactory.train.test_utils import load_infer_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 INFER_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/model_utils/test_checkpointing.py b/tests/model/model_utils/test_checkpointing.py
index 9367eab2..0b171508 100644
--- a/tests/model/model_utils/test_checkpointing.py
+++ b/tests/model/model_utils/test_checkpointing.py
@@ -20,7 +20,7 @@ from llamafactory.extras.misc import get_current_device
 from llamafactory.train.test_utils import load_train_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
@@ -54,7 +54,7 @@ def test_checkpointing_disable():
 def test_unsloth_gradient_checkpointing():
     model = load_train_model(use_unsloth_gc=True, **TRAIN_ARGS)
     for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()):
-        assert module._gradient_checkpointing_func.__self__.__name__ == "UnslothGradientCheckpointing"  # classmethod
+        assert module._gradient_checkpointing_func.__self__.__name__ == "UnslothGradientCheckpointing"
 
 
 def test_upcast_layernorm():
diff --git a/tests/model/test_base.py b/tests/model/test_base.py
index 20298fa0..6b6aa8b8 100644
--- a/tests/model/test_base.py
+++ b/tests/model/test_base.py
@@ -16,17 +16,12 @@ import os
 
 import pytest
 
-from llamafactory.train.test_utils import (
-    compare_model,
-    load_infer_model,
-    load_reference_model,
-    patch_valuehead_model,
-)
+from llamafactory.train.test_utils import compare_model, load_infer_model, load_reference_model, patch_valuehead_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
+TINY_LLAMA_VALUEHEAD = os.getenv("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
 
 INFER_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/test_freeze.py b/tests/model/test_freeze.py
index 24a1c965..964f52c9 100644
--- a/tests/model/test_freeze.py
+++ b/tests/model/test_freeze.py
@@ -19,7 +19,7 @@ import torch
 from llamafactory.train.test_utils import load_infer_model, load_train_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/test_full.py b/tests/model/test_full.py
index 383f3b89..6990a0e9 100644
--- a/tests/model/test_full.py
+++ b/tests/model/test_full.py
@@ -19,7 +19,7 @@ import torch
 from llamafactory.train.test_utils import load_infer_model, load_train_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/test_lora.py b/tests/model/test_lora.py
index 8c014a15..e1d2148e 100644
--- a/tests/model/test_lora.py
+++ b/tests/model/test_lora.py
@@ -27,11 +27,11 @@ from llamafactory.train.test_utils import (
 )
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_LLAMA_ADAPTER = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
+TINY_LLAMA_ADAPTER = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
 
-TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
+TINY_LLAMA_VALUEHEAD = os.getenv("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
diff --git a/tests/model/test_pissa.py b/tests/model/test_pissa.py
index a0985f05..7bfdac51 100644
--- a/tests/model/test_pissa.py
+++ b/tests/model/test_pissa.py
@@ -19,9 +19,9 @@ import pytest
 from llamafactory.train.test_utils import compare_model, load_infer_model, load_reference_model, load_train_model
 
 
-TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA = os.getenv("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
 
-TINY_LLAMA_PISSA = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-pissa")
+TINY_LLAMA_PISSA = os.getenv("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-pissa")
 
 TRAIN_ARGS = {
     "model_name_or_path": TINY_LLAMA,
@@ -49,7 +49,7 @@ INFER_ARGS = {
     "infer_dtype": "float16",
 }
 
-OS_NAME = os.environ.get("OS_NAME", "")
+OS_NAME = os.getenv("OS_NAME", "")
 
 
 @pytest.mark.xfail(reason="PiSSA initialization is not stable in different platform.")
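
Note on the `functools.wraps` change in `checkpointing.py` above: with `assigned=WRAPPER_ASSIGNMENTS + ("__self__",)`, `wraps` copies `__self__` onto the wrapper only when the wrapped callable actually exposes it (attributes missing on the wrapped object are silently skipped), which is why the explicit `hasattr(...)` copy can be removed while the unsloth test still sees `__self__` on the wrapper. A minimal standalone sketch of that behavior; the class below is a stand-in for illustration, not the real Unsloth implementation:

```python
from functools import WRAPPER_ASSIGNMENTS, wraps


class UnslothGradientCheckpointing:  # stand-in class, not the real unsloth one
    @classmethod
    def apply(cls, func, *args):
        return func(*args)


def make_wrapper(gradient_checkpointing_func):
    # wraps() copies each attribute listed in `assigned`, but silently skips
    # attributes the wrapped callable lacks, so plain functions (no __self__)
    # keep working while bound (class)methods carry __self__ over.
    @wraps(gradient_checkpointing_func, assigned=WRAPPER_ASSIGNMENTS + ("__self__",))
    def custom_func(func, *args, **kwargs):
        return gradient_checkpointing_func(func, *args, **kwargs)

    return custom_func


bound = make_wrapper(UnslothGradientCheckpointing.apply)
print(bound.__self__.__name__)  # -> UnslothGradientCheckpointing

plain = make_wrapper(lambda func, *args: func(*args))
print(hasattr(plain, "__self__"))  # -> False
```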