From 8d987b7af718ca5634120eb8c8c029a8f457ac17 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Mon, 1 Apr 2024 21:49:40 +0800 Subject: [PATCH] add qwen1.5 moe Former-commit-id: 54b7d349088a828d32551fde56b3467a82df1b9b --- README.md | 8 ++++---- README_zh.md | 8 ++++---- src/llmtuner/extras/constants.py | 20 ++++++++++++++++++-- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index e4716873..c90392c8 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/) [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/) -[![Citation](https://img.shields.io/badge/citation-27-green)](#projects-using-llama-factory) +[![Citation](https://img.shields.io/badge/citation-28-green)](#projects-using-llama-factory) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -138,12 +138,11 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [InternLM2](https://huggingface.co/internlm) | 7B/20B | wqkv | intern2 | | [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | q_proj,v_proj | - | | [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | q_proj,v_proj | llama2 | -| [Mistral](https://huggingface.co/mistralai) | 7B | q_proj,v_proj | mistral | -| [Mixtral](https://huggingface.co/mistralai) | 8x7B | q_proj,v_proj | mistral | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B | q_proj,v_proj | mistral | | [OLMo](https://huggingface.co/allenai) | 1B/7B | att_proj | olmo | | [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | q_proj,v_proj | - | | [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | c_attn | qwen | -| [Qwen1.5](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/72B | q_proj,v_proj | qwen | +| [Qwen1.5 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/72B | q_proj,v_proj | qwen | | [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | q_proj,v_proj | - | | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | q_proj,v_proj | xverse | | [Yi](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi | @@ -716,6 +715,7 @@ docker compose -f ./docker-compose.yml up -d 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333) 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419) 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228) +1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246) 1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B. 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge. diff --git a/README_zh.md b/README_zh.md index b13c0f19..d591e852 100644 --- a/README_zh.md +++ b/README_zh.md @@ -5,7 +5,7 @@ [![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) [![PyPI](https://img.shields.io/pypi/v/llmtuner)](https://pypi.org/project/llmtuner/) [![Downloads](https://static.pepy.tech/badge/llmtuner)](https://pypi.org/project/llmtuner/) -[![Citation](https://img.shields.io/badge/citation-27-green)](#使用了-llama-factory-的项目) +[![Citation](https://img.shields.io/badge/citation-28-green)](#使用了-llama-factory-的项目) [![GitHub pull request](https://img.shields.io/badge/PRs-welcome-blue)](https://github.com/hiyouga/LLaMA-Factory/pulls) [![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -138,12 +138,11 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | [InternLM2](https://huggingface.co/internlm) | 7B/20B | wqkv | intern2 | | [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | q_proj,v_proj | - | | [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | q_proj,v_proj | llama2 | -| [Mistral](https://huggingface.co/mistralai) | 7B | q_proj,v_proj | mistral | -| [Mixtral](https://huggingface.co/mistralai) | 8x7B | q_proj,v_proj | mistral | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B | q_proj,v_proj | mistral | | [OLMo](https://huggingface.co/allenai) | 1B/7B | att_proj | olmo | | [Phi-1.5/2](https://huggingface.co/microsoft) | 1.3B/2.7B | q_proj,v_proj | - | | [Qwen](https://huggingface.co/Qwen) | 1.8B/7B/14B/72B | c_attn | qwen | -| [Qwen1.5](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/72B | q_proj,v_proj | qwen | +| [Qwen1.5 (MoE)](https://huggingface.co/Qwen) | 0.5B/1.8B/4B/7B/14B/72B | q_proj,v_proj | qwen | | [StarCoder2](https://huggingface.co/bigcode) | 3B/7B/15B | q_proj,v_proj | - | | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | q_proj,v_proj | xverse | | [Yi](https://huggingface.co/01-ai) | 6B/9B/34B | q_proj,v_proj | yi | @@ -715,6 +714,7 @@ docker compose -f ./docker-compose.yml up -d 1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333) 1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419) 1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228) +1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246) 1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) 1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper,基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。 1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM,基于 Baichuan-13B 微调而得,具有法律推理和知识检索能力。 diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py index 8af8d8e8..6e46218b 100644 --- a/src/llmtuner/extras/constants.py +++ b/src/llmtuner/extras/constants.py @@ -463,14 +463,18 @@ register_model_group( register_model_group( models={ - "Mistral-7B": { + "Mistral-7B-v0.1": { DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.1", }, - "Mistral-7B-Chat": { + "Mistral-7B-v0.1-Chat": { DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.1", }, + "Mistral-7B-v0.2": { + DownloadSource.DEFAULT: "alpindale/Mistral-7B-v0.2-hf", + DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.2-hf", + }, "Mistral-7B-v0.2-Chat": { DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.2", DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.2", @@ -663,6 +667,10 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B", }, + "Qwen1.5-MoE-A2.7B": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B", + }, "Qwen1.5-0.5B-Chat": { DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat", @@ -687,6 +695,10 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat", }, + "Qwen1.5-MoE-A2.7B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat", + }, "Qwen1.5-0.5B-int8-Chat": { DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8", @@ -735,6 +747,10 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-AWQ", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-AWQ", }, + "Qwen1.5-MoE-A2.7B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4", + }, }, template="qwen", )