From 9ed4bb63d45de019c04d67f0bd90e213b52111f6 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Fri, 8 Sep 2023 20:04:31 +0800 Subject: [PATCH] change to right-padding, update reward score #803 Former-commit-id: 8ea32e4046d75ddfa9517669e9de9f48fea720c6 --- src/llmtuner/extras/template.py | 5 +-- src/llmtuner/hparams/data_args.py | 5 ++- src/llmtuner/hparams/model_args.py | 4 -- src/llmtuner/tuner/core/loader.py | 2 +- src/llmtuner/tuner/core/parser.py | 6 +++ src/llmtuner/tuner/core/trainer.py | 31 ++++++++++----- src/llmtuner/tuner/ppo/trainer.py | 16 ++++++-- src/llmtuner/tuner/ppo/workflow.py | 6 ++- src/llmtuner/tuner/rm/trainer.py | 53 ++++++++++++++++++++------ src/llmtuner/tuner/rm/workflow.py | 1 - src/llmtuner/tuner/sft/trainer.py | 7 ++-- src/llmtuner/tuner/sft/workflow.py | 4 ++ src/llmtuner/webui/components/train.py | 3 -- src/llmtuner/webui/locales.py | 10 ----- src/llmtuner/webui/runner.py | 3 -- 15 files changed, 97 insertions(+), 59 deletions(-) diff --git a/src/llmtuner/extras/template.py b/src/llmtuner/extras/template.py index 4cc3c40f..167ef222 100644 --- a/src/llmtuner/extras/template.py +++ b/src/llmtuner/extras/template.py @@ -214,10 +214,7 @@ def get_template_and_fix_tokenizer( logger.info("Add eos token: {}".format(tokenizer.eos_token)) if tokenizer.pad_token_id is None: - if tokenizer.unk_token_id is not None: - tokenizer.pad_token = tokenizer.unk_token - else: - tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token = tokenizer.eos_token logger.info("Add pad token: {}".format(tokenizer.pad_token)) tokenizer.add_special_tokens( diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py index 63e8dacb..2a044076 100644 --- a/src/llmtuner/hparams/data_args.py +++ b/src/llmtuner/hparams/data_args.py @@ -26,7 +26,8 @@ class DataArguments: r""" Arguments pertaining to what data we are going to input our model for training and evaluation. 
""" - template: str = field( + template: Optional[str] = field( + default=None, metadata={"help": "Which template to use for constructing prompts in training and inference."} ) dataset: Optional[str] = field( @@ -46,7 +47,7 @@ class DataArguments: metadata={"help": "Enable streaming mode."} ) buffer_size: Optional[int] = field( - default=16384, + default=1024, metadata={"help": "Size of the buffer to randomly sample examples from in streaming mode."} ) mix_strategy: Optional[Literal["concat", "interleave_under", "interleave_over"]] = field( diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index 8969b0c1..4638ae91 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -27,10 +27,6 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."} ) - padding_side: Optional[Literal["left", "right"]] = field( - default="left", - metadata={"help": "The side on which the model should have padding applied."} - ) quantization_bit: Optional[int] = field( default=None, metadata={"help": "The number of bits to quantize the model."} diff --git a/src/llmtuner/tuner/core/loader.py b/src/llmtuner/tuner/core/loader.py index f0e4afab..b924919c 100644 --- a/src/llmtuner/tuner/core/loader.py +++ b/src/llmtuner/tuner/core/loader.py @@ -68,7 +68,7 @@ def load_model_and_tokenizer( tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, use_fast=model_args.use_fast_tokenizer, - padding_side=model_args.padding_side, + padding_side="right", # training with left-padded tensors in fp16 precision may cause overflow **config_kwargs ) diff --git a/src/llmtuner/tuner/core/parser.py b/src/llmtuner/tuner/core/parser.py index 1a9d673c..e51acf7a 100644 --- a/src/llmtuner/tuner/core/parser.py +++ b/src/llmtuner/tuner/core/parser.py @@ -96,6 +96,9 @@ def get_train_args( # Check arguments (do not check finetuning_args since it may be loaded from checkpoints) data_args.init_for_training() + if general_args.stage != "pt" and data_args.template is None: + raise ValueError("Please specify which `template` to use.") + if general_args.stage != "sft" and training_args.predict_with_generate: raise ValueError("`predict_with_generate` cannot be set as True except SFT.") @@ -221,6 +224,9 @@ def get_infer_args( ]: model_args, data_args, finetuning_args, generating_args = parse_infer_args(args) + if data_args.template is None: + raise ValueError("Please specify which `template` to use.") + if model_args.quantization_bit is not None and finetuning_args.finetuning_type != "lora": raise ValueError("Quantization is only compatible with the LoRA method.") diff --git a/src/llmtuner/tuner/core/trainer.py b/src/llmtuner/tuner/core/trainer.py index 058bb740..9a46d59f 100644 --- a/src/llmtuner/tuner/core/trainer.py +++ b/src/llmtuner/tuner/core/trainer.py @@ -44,26 +44,37 @@ class PeftModelMixin: output_dir = output_dir if output_dir is not None else self.args.output_dir os.makedirs(output_dir, exist_ok=True) logger.info(f"Saving model checkpoint to {output_dir}") + model = self.model + model_unwrapped = unwrap_model(model) - model = unwrap_model(self.model) - if isinstance(model, PreTrainedModelWrapper): - # Custom state dict: https://github.com/lvwerra/trl/blob/v0.4.7/trl/models/modeling_value_head.py#L200 + if isinstance(model_unwrapped, PreTrainedModelWrapper): + # Custom state dict: https://github.com/lvwerra/trl/blob/v0.7.1/trl/models/modeling_value_head.py#L200 model_state_dict 
= state_dict or model.state_dict() v_head_state_dict = { name.replace("v_head.", ""): model_state_dict[name].cpu().clone().detach() for name in model_state_dict.keys() if name.startswith("v_head.") } - torch.save(v_head_state_dict, os.path.join(output_dir, VALUE_HEAD_FILE_NAME)) - model = model.pretrained_model + model = model_unwrapped.pretrained_model + model_unwrapped = unwrap_model(model) state_dict = state_dict or get_state_dict(model) - if isinstance(model, (PeftModel, PreTrainedModel)): - model.config.use_cache = True - model.save_pretrained(output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors) - model.config.use_cache = False + if not isinstance(model, (PeftModel, PreTrainedModel)): + if isinstance(model_unwrapped, (PeftModel, PreTrainedModel)): + model_unwrapped.config.use_cache = True + model_unwrapped.save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) + model_unwrapped.config.use_cache = False + else: + logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") + torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: - torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + model.config.use_cache = True + model.save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) + model.config.use_cache = False if self.finetuning_args.finetuning_type == "full" and self.tokenizer is not None: try: diff --git a/src/llmtuner/tuner/ppo/trainer.py b/src/llmtuner/tuner/ppo/trainer.py index 21c8350d..00fd5e41 100644 --- a/src/llmtuner/tuner/ppo/trainer.py +++ b/src/llmtuner/tuner/ppo/trainer.py @@ -102,6 +102,7 @@ class PPOPeftTrainer(PPOTrainer, PeftTrainer): # Get inputs queries, responses = self.get_inputs(batch, length_sampler, **gen_kwargs) + self.tokenizer.padding_side = "right" # change padding side rewards = self.get_rewards(queries, responses, unwrapped_model) # Cast to training mode @@ -110,6 +111,7 @@ class PPOPeftTrainer(PPOTrainer, PeftTrainer): # Run PPO step stats = self.step(queries, responses, rewards) + self.tokenizer.padding_side = "left" # restore padding side loss_meter.update(stats["ppo/loss/total"], n=len(rewards)) reward_meter.update(torch.stack(rewards).mean().item(), n=len(rewards)) @@ -169,7 +171,11 @@ class PPOPeftTrainer(PPOTrainer, PeftTrainer): query, response = batch["input_ids"].detach().cpu(), response[:, batch["input_ids"].size(-1):].detach().cpu() for i in range(len(query)): query_length = (query[i] != self.tokenizer.pad_token_id).nonzero()[0] - response_length = (response[i] != self.tokenizer.pad_token_id).nonzero()[-1] + 1 + response_index = (response[i] != self.tokenizer.pad_token_id).nonzero() + if len(response_index) == 0: + response_length = 1 # allow empty response + else: + response_length = response_index[-1] + 1 queries.append(query[i, query_length:]) # remove padding from left responses.append(response[i, :response_length]) # remove padding from right @@ -194,7 +200,11 @@ class PPOPeftTrainer(PPOTrainer, PeftTrainer): if values.size(0) != batch["input_ids"].size(0): # adapt to chatglm2 values = torch.transpose(values, 0, 1) - rewards = [reward for reward in values[:, -1].float().detach().cpu()] # use fp32 type + rewards = [] + for i in range(values.size(0)): + end_index = batch["attention_mask"][i].nonzero()[-1] + rewards.append(values[i, end_index].float().detach().cpu()) # use fp32 type + replace_model(unwrapped_model, target="default") return rewards @@ -241,7 +251,7 @@ 
class PPOPeftTrainer(PPOTrainer, PeftTrainer): for j in range(len(query_batch)): start = len(query_batch[j]) - 1 - if attention_mask[j, 0] == 0: # offset left padding + if attention_mask[j, 0] == 0: # offset left padding start += attention_mask[j, :].nonzero()[0] end = start + len(response_batch[j]) diff --git a/src/llmtuner/tuner/ppo/workflow.py b/src/llmtuner/tuner/ppo/workflow.py index a6b9a1f0..66daa99c 100644 --- a/src/llmtuner/tuner/ppo/workflow.py +++ b/src/llmtuner/tuner/ppo/workflow.py @@ -4,7 +4,7 @@ import math from trl import PPOConfig from torch.optim import AdamW from typing import TYPE_CHECKING, Optional, List -from transformers import DataCollatorForSeq2Seq +from transformers import DataCollatorWithPadding from transformers.optimization import get_scheduler from llmtuner.dsets import get_dataset, preprocess_dataset @@ -28,7 +28,9 @@ def run_ppo( dataset = get_dataset(model_args, data_args) model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo") dataset = preprocess_dataset(dataset, tokenizer, data_args, training_args, stage="ppo") - data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=tokenizer.pad_token_id) + + tokenizer.padding_side = "left" # use left-padding in generation while using right-padding in training + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) ppo_config = PPOConfig( model_name=model_args.model_name_or_path, diff --git a/src/llmtuner/tuner/rm/trainer.py b/src/llmtuner/tuner/rm/trainer.py index 08feda78..854f9792 100644 --- a/src/llmtuner/tuner/rm/trainer.py +++ b/src/llmtuner/tuner/rm/trainer.py @@ -32,21 +32,50 @@ class PairwisePeftTrainer(PeftTrainer): r""" Computes pairwise loss. The first n examples are chosen and the last n examples are rejected. - We use score on the EOS token to represent reward of the whole sentence. - - Subclass and override to inject custom behavior. It should not be directly used by external scripts. - - Note that the first element will be removed from the output tuple. + Subclass and override to inject custom behavior. + Note that the first element will be removed from the output tuple. See: https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/trainer.py#L3509 """ - batch_size = inputs["input_ids"].size(0) // 2 + # Compute rewards _, _, values = model(**inputs, output_hidden_states=True, return_dict=True) if values.size(0) != inputs["input_ids"].size(0): # adapt to chatglm2 values = torch.transpose(values, 0, 1) - r_accept, r_reject = values[:, -1].split(batch_size, dim=0) - loss = -torch.log(torch.sigmoid(r_accept - r_reject)).mean() - return (loss, [loss, r_accept, r_reject]) if return_outputs else loss + + # Split the inputs and rewards into two parts, chosen and rejected + batch_size = inputs["input_ids"].size(0) // 2 + chosen_input_ids, rejected_input_ids = inputs["input_ids"][:batch_size], inputs["input_ids"][batch_size:] + chosen_attn_mask, rejected_attn_mask = ( + inputs["attention_mask"][:batch_size], inputs["attention_mask"][batch_size:] + ) + chosen_rewards, rejected_rewards = values[:batch_size], values[batch_size:] + chosen_scores, rejected_scores = [], [] + + # Compute pairwise loss. 
Only backprop on the different tokens before padding + # Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py + loss = 0 + for i in range(batch_size): + chosen_length = chosen_attn_mask[i].nonzero()[-1] + 1 + rejected_length = rejected_attn_mask[i].nonzero()[-1] + 1 + check_divergence = (chosen_input_ids[i] != rejected_input_ids[i]).nonzero() + + if len(check_divergence) == 0: + end_index = chosen_length + div_index = end_index - 1 + else: + end_index = max(chosen_length, rejected_length) + div_index = check_divergence[0] + + assert div_index > 0 + chosen_trunc_rewards = chosen_rewards[i, div_index:end_index] + rejected_trunc_rewards = rejected_rewards[i, div_index:end_index] + chosen_scores.append(chosen_trunc_rewards[-1]) # use the end score for inference + rejected_scores.append(rejected_trunc_rewards[-1]) + loss += -torch.nn.functional.logsigmoid(chosen_trunc_rewards - rejected_trunc_rewards).mean() + + loss = loss / batch_size + chosen_scores, rejected_scores = torch.stack(chosen_scores), torch.stack(rejected_scores) + return (loss, [loss, chosen_scores, rejected_scores]) if return_outputs else loss def save_predictions( self, @@ -63,10 +92,10 @@ class PairwisePeftTrainer(PeftTrainer): output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl") logger.info(f"Saving prediction results to {output_prediction_file}") - acc_scores, rej_scores = predict_results.predictions + chosen_scores, rejected_scores = predict_results.predictions with open(output_prediction_file, "w", encoding="utf-8") as writer: res: List[str] = [] - for acc_score, rej_score in zip(acc_scores, rej_scores): - res.append(json.dumps({"accept": round(float(acc_score), 2), "reject": round(float(rej_score), 2)})) + for c_score, r_score in zip(chosen_scores, rejected_scores): + res.append(json.dumps({"chosen": round(float(c_score), 2), "rejected": round(float(r_score), 2)})) writer.write("\n".join(res)) diff --git a/src/llmtuner/tuner/rm/workflow.py b/src/llmtuner/tuner/rm/workflow.py index fd1e7a47..91441f70 100644 --- a/src/llmtuner/tuner/rm/workflow.py +++ b/src/llmtuner/tuner/rm/workflow.py @@ -1,5 +1,4 @@ # Inspired by: -# https://github.com/lvwerra/trl/blob/main/examples/summarization/scripts/reward_summarization.py # https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py from typing import TYPE_CHECKING, Optional, List diff --git a/src/llmtuner/tuner/sft/trainer.py b/src/llmtuner/tuner/sft/trainer.py index 17cb3949..66fe04a7 100644 --- a/src/llmtuner/tuner/sft/trainer.py +++ b/src/llmtuner/tuner/sft/trainer.py @@ -50,10 +50,9 @@ class Seq2SeqPeftTrainer(PeftTrainer): loss, generated_tokens, labels = super().prediction_step( model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys ) - if generated_tokens is not None: - generated_tokens[:, :max(prompt_len, label_len)] = ( - self.tokenizer.pad_token_id * torch.ones_like(generated_tokens[:, :max(prompt_len, label_len)]) - ) + generated_tokens = ( + generated_tokens[:, max(prompt_len, label_len):] if generated_tokens is not None else None + ) return loss, generated_tokens, labels diff --git a/src/llmtuner/tuner/sft/workflow.py b/src/llmtuner/tuner/sft/workflow.py index a89a7514..2ae86fbd 100644 --- a/src/llmtuner/tuner/sft/workflow.py +++ b/src/llmtuner/tuner/sft/workflow.py @@ -27,6 +27,10 @@ def run_sft( dataset = get_dataset(model_args, data_args) model, tokenizer = load_model_and_tokenizer(model_args, 
finetuning_args, training_args.do_train, stage="sft") dataset = preprocess_dataset(dataset, tokenizer, data_args, training_args, stage="sft") + + if training_args.predict_with_generate: + tokenizer.padding_side = "left" # use left-padding in generation + data_collator = DataCollatorForSeq2Seq( tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py index 90bf56bf..12fbade7 100644 --- a/src/llmtuner/webui/components/train.py +++ b/src/llmtuner/webui/components/train.py @@ -56,7 +56,6 @@ def create_train_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dic save_steps = gr.Slider(value=100, minimum=10, maximum=5000, step=10) warmup_steps = gr.Slider(value=0, minimum=0, maximum=5000, step=1) compute_type = gr.Radio(choices=["fp16", "bf16"], value="fp16") - padding_side = gr.Radio(choices=["left", "right"], value="left") with gr.Accordion(label="LoRA config", open=False) as lora_tab: with gr.Row(): @@ -122,7 +121,6 @@ def create_train_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dic save_steps, warmup_steps, compute_type, - padding_side, lora_rank, lora_dropout, lora_target, @@ -168,7 +166,6 @@ def create_train_tab(top_elems: Dict[str, "Component"], runner: "Runner") -> Dic save_steps=save_steps, warmup_steps=warmup_steps, compute_type=compute_type, - padding_side=padding_side, lora_tab=lora_tab, lora_rank=lora_rank, lora_dropout=lora_dropout, diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py index c4032f39..773ba03d 100644 --- a/src/llmtuner/webui/locales.py +++ b/src/llmtuner/webui/locales.py @@ -287,16 +287,6 @@ LOCALES = { "info": "是否启用 FP16 或 BF16 混合精度训练。" } }, - "padding_side": { - "en": { - "label": "Padding side", - "info": "The side on which the model should have padding applied." - }, - "zh": { - "label": "填充位置", - "info": "使用左填充或右填充。" - } - }, "lora_tab": { "en": { "label": "LoRA configurations" diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py index 9f127852..9b46ed9d 100644 --- a/src/llmtuner/webui/runner.py +++ b/src/llmtuner/webui/runner.py @@ -87,7 +87,6 @@ class Runner: save_steps: int, warmup_steps: int, compute_type: str, - padding_side: str, lora_rank: int, lora_dropout: float, lora_target: str, @@ -129,7 +128,6 @@ class Runner: logging_steps=logging_steps, save_steps=save_steps, warmup_steps=warmup_steps, - padding_side=padding_side, lora_rank=lora_rank, lora_dropout=lora_dropout, lora_target=lora_target or DEFAULT_MODULE.get(model_name.split("-")[0], "q_proj,v_proj"), @@ -142,7 +140,6 @@ class Runner: if args["stage"] == "ppo": args["reward_model"] = reward_model - args["padding_side"] = "left" val_size = 0 if args["stage"] == "dpo":
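
Note on the PPO reward extraction changed above: with right-padded batches the final column of the value-head output can be padding, so the hunk reads the score at the last attended position instead of values[:, -1]. A minimal standalone sketch of that idea (the function name and tensor layout are illustrative, assuming values is the (batch, seq_len) value-head output aligned with a right-padded attention_mask):

import torch

def sequence_rewards(values: torch.Tensor, attention_mask: torch.Tensor):
    # Read the reward at the last non-padded token of each right-padded sequence.
    rewards = []
    for i in range(values.size(0)):
        end_index = attention_mask[i].nonzero()[-1]  # position of the last real token
        rewards.append(values[i, end_index].float().detach().cpu())  # keep fp32 on CPU
    return rewards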
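
Likewise, a standalone sketch of the pairwise reward loss introduced in rm/trainer.py: the loss is averaged only over the tokens after the point where the chosen and rejected sequences diverge, up to the last non-padded position, and the end-token reward is kept as the inference score. Argument names and shapes are illustrative; chosen_rewards and rejected_rewards stand for the per-token value-head outputs of the two halves of the batch:

import torch

def pairwise_loss(chosen_ids, rejected_ids, chosen_mask, rejected_mask,
                  chosen_rewards, rejected_rewards):
    batch_size = chosen_ids.size(0)
    loss, chosen_scores, rejected_scores = 0.0, [], []
    for i in range(batch_size):
        chosen_length = chosen_mask[i].nonzero()[-1] + 1
        rejected_length = rejected_mask[i].nonzero()[-1] + 1
        divergence = (chosen_ids[i] != rejected_ids[i]).nonzero()
        if len(divergence) == 0:  # identical sequences: score the last token only
            end_index = chosen_length
            div_index = end_index - 1
        else:
            end_index = max(chosen_length, rejected_length)
            div_index = divergence[0]
        chosen_trunc = chosen_rewards[i, div_index:end_index]
        rejected_trunc = rejected_rewards[i, div_index:end_index]
        chosen_scores.append(chosen_trunc[-1])      # end score, reported at inference
        rejected_scores.append(rejected_trunc[-1])
        loss += -torch.nn.functional.logsigmoid(chosen_trunc - rejected_trunc).mean()
    return loss / batch_size, torch.stack(chosen_scores), torch.stack(rejected_scores)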