simplify readme

hiyouga
2024-04-02 20:07:43 +08:00
parent b267aeb53f
commit 92dab8a90b
24 changed files with 244 additions and 890 deletions


@@ -193,6 +193,18 @@ def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype:
         return torch.float32


+def is_path_available(path: os.PathLike) -> bool:
+    r"""
+    Checks if the path is empty or does not exist.
+    """
+    if not os.path.exists(path):
+        return True
+    elif os.path.isdir(path) and not os.listdir(path):
+        return True
+    else:
+        return False
+
+
 def torch_gc() -> None:
     r"""
     Collects GPU memory.

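For context, a minimal usage sketch of the new helper (not part of this commit): the directory below is a hypothetical example, and is_path_available is assumed to be in scope as defined in the hunk above.

import os

output_dir = "outputs/example_run"  # hypothetical path, for illustration only
if is_path_available(output_dir):   # True when the path is missing or an empty directory
    os.makedirs(output_dir, exist_ok=True)
else:
    raise ValueError("{} already exists and is not empty.".format(output_dir))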

@@ -193,6 +193,6 @@ def llama_flash_attn_forward(
 def apply_llama_patch() -> None:
-    require_version("transformers==4.39.2", "To fix: pip install transformers==4.39.2")
+    require_version("transformers==4.39.3", "To fix: pip install transformers==4.39.3")
     LlamaAttention.forward = llama_torch_attn_forward
     LlamaFlashAttention2.forward = llama_flash_attn_forward

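For context only, a sketch of how a patch like this is applied: apply_llama_patch() must run before the model is instantiated so the replaced forward methods take effect, and the import path below is an assumption about the repository layout rather than something shown in this diff.

from transformers import AutoModelForCausalLM

from llmtuner.extras.patches.llama_patch import apply_llama_patch  # assumed module path

apply_llama_patch()  # swaps in llama_torch_attn_forward / llama_flash_attn_forward
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")  # example checkpoint

With the bumped pin, the require_version call raises unless transformers 4.39.3 is installed, so the patch refuses to run against any other version.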

@@ -1,38 +0,0 @@
-import torch
-import torch.nn.functional as F
-from transformers.models.mixtral.modeling_mixtral import MixtralBLockSparseTop2MLP, MixtralSparseMoeBlock
-
-
-def mlp_forward(self: "MixtralBLockSparseTop2MLP", hidden_states: torch.Tensor) -> torch.Tensor:
-    current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
-    current_hidden_states = self.w2(current_hidden_states)
-    return current_hidden_states
-
-
-# Modified from: https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py
-def moe_forward(self: "MixtralSparseMoeBlock", hidden_states: torch.Tensor) -> torch.Tensor:
-    batch_size, sequence_length, hidden_dim = hidden_states.shape
-    hidden_states = hidden_states.view(-1, hidden_dim)
-    # router_logits: (batch * sequence_length, n_experts)
-    router_logits = self.gate(hidden_states)
-    routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-    topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
-    topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
-    # we cast back to the input dtype
-    topk_weight = topk_weight.to(hidden_states.dtype)
-
-    hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
-    y = torch.empty_like(hidden_states)
-    flat_topk_idx = topk_idx.view(-1)
-    for i in range(self.num_experts):
-        expert = self.experts[i]
-        y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
-
-    y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
-    final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
-    return final_hidden_states, router_logits
-
-
-def patch_mixtral_replace_moe_impl() -> None:
-    MixtralBLockSparseTop2MLP.forward = mlp_forward
-    MixtralSparseMoeBlock.forward = moe_forward
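
The deleted moe_forward revolved around a dense top-k routing step; below is a self-contained sketch of just that gating arithmetic, with hypothetical shapes (4 tokens, 8 experts, top_k = 2) and no Mixtral classes involved.

import torch
import torch.nn.functional as F

top_k, num_experts = 2, 8
router_logits = torch.randn(4, num_experts)                           # (tokens, n_experts), random for illustration
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
topk_weight, topk_idx = torch.topk(routing_weights, top_k, dim=-1, sorted=False)
topk_weight = topk_weight / topk_weight.sum(dim=-1, keepdim=True)     # renormalize over the selected experts
print(topk_idx.shape)             # torch.Size([4, 2]): two experts chosen per token
print(topk_weight.sum(dim=-1))    # each row sums to 1.0 after renormalization

Each token is then dispatched to its selected experts and the expert outputs are recombined with these weights, which is what the removed per-expert loop did.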