diff --git a/README.md b/README.md
index e9d93daf..798b7bd4 100644
--- a/README.md
+++ b/README.md
@@ -145,7 +145,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
 | [ChatGLM3](https://huggingface.co/THUDM) | 6B | query_key_value | chatglm3 |
 | [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | q_proj,v_proj | cohere |
-| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B | q_proj,v_proj | deepseek |
+| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | q_proj,v_proj | deepseek |
 | [Falcon](https://huggingface.co/tiiuae) | 7B/40B/180B | query_key_value | falcon |
 | [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | q_proj,v_proj | gemma |
 | [InternLM2](https://huggingface.co/internlm) | 7B/20B | wqkv | intern2 |
diff --git a/README_zh.md b/README_zh.md
index 15758ae4..2c5b1aa1 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -145,7 +145,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd
 | [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
 | [ChatGLM3](https://huggingface.co/THUDM) | 6B | query_key_value | chatglm3 |
 | [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | q_proj,v_proj | cohere |
-| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B | q_proj,v_proj | deepseek |
+| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | q_proj,v_proj | deepseek |
 | [Falcon](https://huggingface.co/tiiuae) | 7B/40B/180B | query_key_value | falcon |
 | [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | q_proj,v_proj | gemma |
 | [InternLM2](https://huggingface.co/internlm) | 7B/20B | wqkv | intern2 |
diff --git a/assets/wechat.jpg b/assets/wechat.jpg
index 0e17295a..d251f405 100644
Binary files a/assets/wechat.jpg and b/assets/wechat.jpg differ
diff --git a/requirements.txt b/requirements.txt
index f4818ed2..67bd7033 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ uvicorn
 pydantic
 fastapi
 sse-starlette
-matplotlib
+matplotlib>=3.7.0
 fire
 packaging
 pyyaml
diff --git a/setup.py b/setup.py
index 7b849942..ddc3a594 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@ def get_requires():

 extra_require = {
     "metrics": ["nltk", "jieba", "rouge-chinese"],
-    "deepspeed": ["deepspeed>=0.10.0"],
+    "deepspeed": ["deepspeed>=0.10.0,<=0.14.0"],
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "vllm": ["vllm>=0.4.0"],
     "galore": ["galore-torch"],
diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py
index bf542e69..e055f1f3 100644
--- a/src/llmtuner/extras/constants.py
+++ b/src/llmtuner/extras/constants.py
@@ -324,6 +324,14 @@ register_model_group(
             DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat",
             DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat",
         },
+        "DeepSeek-MoE-236B": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2",
+        },
+        "DeepSeek-MoE-236B-Chat": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat",
+        },
     },
     template="deepseek",
 )
@@ -569,6 +577,10 @@ register_model_group(
             DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B-Instruct",
             DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B-Instruct",
         },
+        "LLaMA3-8B-Chinese-Chat": {
+            DownloadSource.DEFAULT: "shenzhi-wang/Llama3-8B-Chinese-Chat",
+            DownloadSource.MODELSCOPE: "LLM-Research/Llama3-8B-Chinese-Chat",
+        },
     },
     template="llama3",
 )
diff --git a/src/llmtuner/model/utils/valuehead.py b/src/llmtuner/model/utils/valuehead.py
index a192dcfa..a6180753 100644
--- a/src/llmtuner/model/utils/valuehead.py
+++ b/src/llmtuner/model/utils/valuehead.py
@@ -57,3 +57,7 @@ def prepare_valuehead_model(model: "PreTrainedModel") -> None:
     if getattr(model.config, "model_type", None) == "chatglm":
         setattr(model, "lm_head", model.transformer.output_layer)
         setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
+
+    if getattr(model.config, "model_type", None) == "internlm2":
+        setattr(model, "lm_head", model.output)
+        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
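
Note on the valuehead.py hunk: value-head wrappers such as trl's AutoModelForCausalLMWithValueHead look up a top-level lm_head attribute on the wrapped model, but InternLM2 exposes its output projection as model.output, so the new branch aliases it under the expected name and excludes the alias from checkpoints (its weight is the same tensor as model.output's). Below is a minimal, self-contained sketch of this pattern; FakeInternLM2 is a hypothetical stand-in for a real InternLM2 checkpoint, not code from this repository.

from types import SimpleNamespace

import torch.nn as nn


class FakeInternLM2(nn.Module):
    """Hypothetical stand-in: only the attributes the patch touches."""

    def __init__(self) -> None:
        super().__init__()
        self.config = SimpleNamespace(model_type="internlm2")
        self.output = nn.Linear(8, 16)  # InternLM2 keeps its LM head under `output`


def prepare_valuehead_model(model: nn.Module) -> None:
    # Mirrors the new hunk: alias the head under the name value-head
    # wrappers expect, and skip the alias when saving, since it would
    # otherwise duplicate an existing weight in the checkpoint.
    if getattr(model.config, "model_type", None) == "internlm2":
        setattr(model, "lm_head", model.output)
        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])


model = FakeInternLM2()
prepare_valuehead_model(model)
assert model.lm_head is model.output  # the wrapper can now find the head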