From f5350b103be3998b35e0899bba35af7cd5b6c80a Mon Sep 17 00:00:00 2001
From: qvlehao <74954167+Qwtdgh@users.noreply.github.com>
Date: Wed, 29 Jan 2025 12:16:26 +0800
Subject: [PATCH] [model] add deepseek-R1 & show think process (#6767)

Former-commit-id: 28417f862a1947a24663150ca55f421198b6d8eb
---
 README.md                            |  1 +
 README_zh.md                         |  1 +
 src/llamafactory/extras/constants.py | 32 ++++++++++++++++++++++++++++
 src/llamafactory/webui/chatter.py    |  1 +
 4 files changed, 35 insertions(+)

diff --git a/README.md b/README.md
index 7cf47d9e..0acf68c5 100644
--- a/README.md
+++ b/README.md
@@ -217,6 +217,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 | [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
 | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
 | [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/685B | deepseek3 |
+| [DeepSeek R1](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
 | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma |
 | [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 |
diff --git a/README_zh.md b/README_zh.md
index bf5c3fd7..a20c6bce 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -219,6 +219,7 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
 | [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
 | [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
 | [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/685B | deepseek3 |
+| [DeepSeek R1](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
 | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
 | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma |
 | [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 |
diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py
index 2cf00205..e98aadbd 100644
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
@@ -493,6 +493,38 @@ register_model_group(
             DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V3",
             DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V3",
         },
+        "DeepSeek-R1-1.5B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+        },
+        "DeepSeek-R1-7B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+        },
+        "DeepSeek-R1-8B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+        },
+        "DeepSeek-R1-14B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+        },
+        "DeepSeek-R1-32B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
+        },
+        "DeepSeek-R1-70B-Distill": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+        },
+        "DeepSeek-R1-671B-Zero": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1-Zero",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1-Zero",
+        },
+        "DeepSeek-R1-671B": {
+            DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-R1",
+            DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-R1",
+        },
     },
     template="deepseek3",
 )
diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py
index e9689df2..7abdf8b5 100644
--- a/src/llamafactory/webui/chatter.py
+++ b/src/llamafactory/webui/chatter.py
@@ -157,6 +157,7 @@ class WebChatModel(ChatModel):
             top_p=top_p,
             temperature=temperature,
         ):
+            new_text = '' if any(t in new_text for t in ('<think>', '</think>')) else new_text
             response += new_text
             if tools:
                 result = self.engine.template.extract_tool(response)