From 733b395822f100a3dafc10b30c9fbbdb4d40478d Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Mon, 7 Aug 2023 15:02:02 +0800
Subject: [PATCH] update readme

Former-commit-id: 20cf27976f24db2667955a8007e0ce2baa35fc82
---
 README.md                        | 25 +++++++++++++++----------
 README_zh.md                     | 25 +++++++++++++++----------
 src/llmtuner/chat/stream_chat.py |  2 +-
 src/llmtuner/dsets/preprocess.py | 11 +++++------
 src/llmtuner/extras/template.py  |  6 +++---
 5 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index f69bd7e7..cca8fdc3 100644
--- a/README.md
+++ b/README.md
@@ -41,16 +41,21 @@
 [23/05/31] Now we support training the **BLOOM & BLOOMZ** models in this repo. Try `--model_name_or_path bigscience/bloomz-7b1-mt` and `--lora_target query_key_value` arguments to use the BLOOMZ model.
 
 ## Supported Models
-| model | model size | model_name_or_path | lora_target | template |
-|-------------------------------------------------------------|-----------------------------|--------------------------------|-------------------|----------|
-| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | q_proj,v_proj | default |
-| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | meta-llama/Llama-2-7b-hf | q_proj,v_proj | llama2 |
-| [BLOOM](https://huggingface.co/bigscience/bloom) | 560M/1.1B/1.7B/3B/7.1B/176B | bigscience/bloom-7b1 | query_key_value | default |
-| [BLOOMZ](https://huggingface.co/bigscience/bloomz) | 560M/1.1B/1.7B/3B/7.1B/176B | bigscience/bloomz-7b1-mt | query_key_value | default |
-| [Falcon](https://huggingface.co/tiiuae/falcon-7b) | 7B/40B | tiiuae/falcon-7b | query_key_value | default |
-| [Baichuan](https://huggingface.co/baichuan-inc/baichuan-7B) | 7B/13B | baichuan-inc/Baichuan-13B-Chat | W_pack | baichuan |
-| [InternLM](https://github.com/InternLM/InternLM) | 7B | internlm/internlm-7b | q_proj,v_proj | intern |
-| [Qwen](https://github.com/QwenLM/Qwen-7B) | 7B | Qwen/Qwen-7B-Chat | c_attn | chatml |
+
+| Model | Model size | Default module | Template |
+| -------------------------------------------------------- | --------------------------- | ----------------- |----------|
+| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | q_proj,v_proj | - |
+| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | q_proj,v_proj | llama2 |
+| [BLOOM](https://huggingface.co/bigscience/bloom) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
+| [BLOOMZ](https://huggingface.co/bigscience/bloomz) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
+| [Falcon](https://huggingface.co/tiiuae/falcon-7b) | 7B/40B | query_key_value | - |
+| [Baichuan](https://github.com/baichuan-inc/baichuan-13B) | 7B/13B | W_pack | baichuan |
+| [InternLM](https://github.com/InternLM/InternLM) | 7B | q_proj,v_proj | intern |
+| [Qwen](https://github.com/QwenLM/Qwen-7B) | 7B | c_attn | chatml |
+| [XVERSE](https://github.com/xverse-ai/XVERSE-13B) | 13B | q_proj,v_proj | - |
+
+> * **Default module** is used for the `--lora_target` argument. Please use `python src/train_bash.py -h` to see all available options.
+> * For the "base" models, the `--template` argument can be chosen from `default`, `alpaca`, `vicuna` etc.
 
 ## Supported Training Approaches
 
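As an aside for readers of the new table: the "Default module" column lists the attention projection layers that LoRA adapts for each model family. The sketch below is illustrative only and is not part of this patch; it shows how such a module list would map onto a `peft` `LoraConfig` when working outside `src/train_bash.py`. The model name and the LoRA hyperparameters (`r`, `lora_alpha`, `lora_dropout`) are assumptions, not the repository's defaults.

```python
# Illustrative sketch (not from this patch): wiring the "Default module" list
# for LLaMA-2 into a peft LoraConfig by hand.
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # the table's "Default module" entry for LLaMA-2
    r=8,                                  # assumed rank, not necessarily the repo's default
    lora_alpha=32,                        # assumed scaling factor
    lora_dropout=0.1,                     # assumed dropout
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA matrices remain trainable
```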
diff --git a/README_zh.md b/README_zh.md
index 4acb2cf7..d5eca99d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -41,16 +41,21 @@
 [23/05/31] 现在我们支持了 **BLOOM & BLOOMZ** 模型的训练。请尝试使用 `--model_name_or_path bigscience/bloomz-7b1-mt` 和 `--lora_target query_key_value` 参数。
 
 ## 模型
-| model | model size | model_name_or_path | lora_target | template |
-|-------------------------------------------------------------|-----------------------------|--------------------------------|-------------------|----------|
-| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | q_proj,v_proj | default |
-| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | meta-llama/Llama-2-7b-hf | q_proj,v_proj | llama2 |
-| [BLOOM](https://huggingface.co/bigscience/bloom) | 560M/1.1B/1.7B/3B/7.1B/176B | bigscience/bloom-7b1 | query_key_value | default |
-| [BLOOMZ](https://huggingface.co/bigscience/bloomz) | 560M/1.1B/1.7B/3B/7.1B/176B | bigscience/bloomz-7b1-mt | query_key_value | default |
-| [Falcon](https://huggingface.co/tiiuae/falcon-7b) | 7B/40B | tiiuae/falcon-7b | query_key_value | default |
-| [Baichuan](https://huggingface.co/baichuan-inc/baichuan-7B) | 7B/13B | baichuan-inc/Baichuan-13B-Chat | W_pack | baichuan |
-| [InternLM](https://github.com/InternLM/InternLM) | 7B | internlm/internlm-7b | q_proj,v_proj | intern |
-| [Qwen](https://github.com/QwenLM/Qwen-7B) | 7B | Qwen/Qwen-7B-Chat | c_attn | chatml |
+
+| 模型名 | 模型大小 | 默认模块 | Template |
+| -------------------------------------------------------- | --------------------------- | ----------------- |----------|
+| [LLaMA](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | q_proj,v_proj | - |
+| [LLaMA-2](https://huggingface.co/meta-llama) | 7B/13B/70B | q_proj,v_proj | llama2 |
+| [BLOOM](https://huggingface.co/bigscience/bloom) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
+| [BLOOMZ](https://huggingface.co/bigscience/bloomz) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - |
+| [Falcon](https://huggingface.co/tiiuae/falcon-7b) | 7B/40B | query_key_value | - |
+| [Baichuan](https://github.com/baichuan-inc/baichuan-13B) | 7B/13B | W_pack | baichuan |
+| [InternLM](https://github.com/InternLM/InternLM) | 7B | q_proj,v_proj | intern |
+| [Qwen](https://github.com/QwenLM/Qwen-7B) | 7B | c_attn | chatml |
+| [XVERSE](https://github.com/xverse-ai/XVERSE-13B) | 13B | q_proj,v_proj | - |
+
+> * **默认模块**是 `--lora_target` 参数的默认值。请使用 `python src/train_bash.py -h` 查看全部可选项。
+> * 对于所有“基座”模型，`--template` 参数可以是 `default`, `alpaca`, `vicuna` 等值。
 
 ## 微调方法
 
diff --git a/src/llmtuner/chat/stream_chat.py b/src/llmtuner/chat/stream_chat.py
index ef30a324..c33a7e61 100644
--- a/src/llmtuner/chat/stream_chat.py
+++ b/src/llmtuner/chat/stream_chat.py
@@ -30,7 +30,7 @@ class ChatModel:
     ) -> Tuple[Dict[str, Any], int]:
         prefix = prefix or self.source_prefix
 
-        prompt, _ = self.template.get_prompt(
+        prompt, _ = self.template.encode_oneturn(
             tokenizer=self.tokenizer, query=query, resp="", history=history, prefix=prefix
         )
         input_ids = torch.tensor([prompt], device=self.model.device)
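The `stream_chat.py` hunk above only renames the call site, so here is a hedged sketch of the one-turn inference path it belongs to: `encode_oneturn` collapses the history plus the current query into a single prompt, whose token ids are then handed to `generate`. Only the `encode_oneturn` signature comes from the patch; the helper name and generation arguments are assumptions for illustration.

```python
# Hedged sketch (not from this patch) of how the renamed encode_oneturn feeds generation.
import torch

def chat_once(model, tokenizer, template, query, history=None, prefix=None, max_new_tokens=128):
    # encode_oneturn returns (prompt_ids, response_ids); with resp="" only the prompt matters here.
    prompt_ids, _ = template.encode_oneturn(
        tokenizer=tokenizer, query=query, resp="", history=history or [], prefix=prefix or ""
    )
    input_ids = torch.tensor([prompt_ids], device=model.device)
    output_ids = model.generate(input_ids=input_ids, max_new_tokens=max_new_tokens)
    # strip the prompt tokens and decode only the newly generated response
    return tokenizer.decode(output_ids[0][len(prompt_ids):], skip_special_tokens=True)
```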
diff --git a/src/llmtuner/dsets/preprocess.py b/src/llmtuner/dsets/preprocess.py
index ad481ffd..4be79bad 100644
--- a/src/llmtuner/dsets/preprocess.py
+++ b/src/llmtuner/dsets/preprocess.py
@@ -47,15 +47,14 @@ def preprocess_dataset(
 
     def preprocess_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
         # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
-        # for input with history, we build multiple input-label pairs just like:
-        # https://github.com/lm-sys/FastChat/blob/f17c092f64840fa6354ed52789dccb2daa793d0b/fastchat/train/train.py#L112
+        # for multiturn examples, we only mask the prompt part in each prompt-response pair.
         model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
         max_length = data_args.max_source_length + data_args.max_target_length
 
         for query, response, history, prefix in construct_example(examples):
             input_ids, labels = [], []
 
-            for source_ids, target_ids in template.get_dialog(tokenizer, query, response, history, prefix):
+            for source_ids, target_ids in template.encode_multiturn(tokenizer, query, response, history, prefix):
                 if len(source_ids) > data_args.max_source_length:
                     source_ids = source_ids[:data_args.max_source_length]
                 if len(target_ids) > data_args.max_target_length:
@@ -78,7 +77,7 @@ def preprocess_dataset(
         model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
 
         for query, response, history, prefix in construct_example(examples):
-            source_ids, target_ids = template.get_prompt(tokenizer, query, response, history, prefix)
+            source_ids, target_ids = template.encode_oneturn(tokenizer, query, response, history, prefix)
 
             if len(source_ids) > data_args.max_source_length:
                 source_ids = source_ids[:data_args.max_source_length]
@@ -95,8 +94,8 @@ def preprocess_dataset(
         # build input pairs with format `<bos> X Y1 <eos>` and `<bos> X Y2 <eos>`
         model_inputs = {"accept_ids": [], "reject_ids": []}
         for query, response, history, prefix in construct_example(examples):
-            source_ids, accept_ids = template.get_prompt(tokenizer, query, response[0], history, prefix)
-            source_ids, reject_ids = template.get_prompt(tokenizer, query, response[1], history, prefix)
+            source_ids, accept_ids = template.encode_oneturn(tokenizer, query, response[0], history, prefix)
+            source_ids, reject_ids = template.encode_oneturn(tokenizer, query, response[1], history, prefix)
 
             if len(source_ids) > data_args.max_source_length:
                 source_ids = source_ids[:data_args.max_source_length]
diff --git a/src/llmtuner/extras/template.py b/src/llmtuner/extras/template.py
index a6193744..0d114b43 100644
--- a/src/llmtuner/extras/template.py
+++ b/src/llmtuner/extras/template.py
@@ -14,7 +14,7 @@ class Template:
     stop_words: List[str]
     use_history: bool
 
-    def get_prompt(
+    def encode_oneturn(
         self,
         tokenizer: "PreTrainedTokenizer",
         query: str,
@@ -33,7 +33,7 @@ class Template:
         prompt_ids = prompt_ids + encoded_pairs[-1][0]
         return prompt_ids, encoded_pairs[-1][1]
 
-    def get_dialog(
+    def encode_multiturn(
         self,
         tokenizer: "PreTrainedTokenizer",
         query: str,
@@ -73,7 +73,7 @@ class Template:
         r"""
         Encodes formatted inputs to pairs of token ids.
         """
-        if tokenizer.bos_token and getattr(tokenizer, "add_bos_token", False): # bos token is optional
+        if tokenizer.bos_token_id and getattr(tokenizer, "add_bos_token", False): # bos token is optional
             bos_token_id = [tokenizer.bos_token_id]
         else:
             bos_token_id = []
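The updated comment in `preprocess_supervised_dataset` ("we only mask the prompt part in each prompt-response pair") is the behavioural core of the preprocessing change, so here is a hedged sketch of that label construction for a single example. It mirrors the hunk above rather than reproducing the repository's exact code; the `IGNORE_INDEX` value of -100 (the label PyTorch's cross-entropy ignores) and the helper's name are assumptions.

```python
# Hedged sketch (not the repository's exact code): building supervised labels so that
# only the response tokens of every turn contribute to the loss.
IGNORE_INDEX = -100  # assumed; the conventional ignore label for torch cross-entropy

def build_supervised_example(template, tokenizer, query, response, history, prefix,
                             max_source_length, max_target_length):
    input_ids, labels = [], []
    # encode_multiturn yields one (source_ids, target_ids) pair per prompt-response turn
    for source_ids, target_ids in template.encode_multiturn(tokenizer, query, response, history, prefix):
        source_ids = source_ids[:max_source_length]
        target_ids = target_ids[:max_target_length]
        input_ids += source_ids + target_ids
        labels += [IGNORE_INDEX] * len(source_ids) + target_ids  # prompt part masked out
    max_length = max_source_length + max_target_length
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": [1] * len(input_ids[:max_length]),
        "labels": labels[:max_length],
    }
```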