From ab6dc0ea30fd14c331d777109d5b5ca4d632fadf Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Tue, 23 Apr 2024 18:45:43 +0800 Subject: [PATCH 01/29] add multimodal LLM BLIP-2 and InstructBLIP Former-commit-id: a730f89a972f1a9d37c718c716f199cb8d4903b2 --- examples/mllm/sft_blip2.sh | 34 ++++++ examples/mllm/sft_instructblip.sh | 35 ++++++ src/llmtuner/data/__init__.py | 4 +- src/llmtuner/data/loader.py | 50 +++++++-- src/llmtuner/hparams/data_args.py | 4 + src/llmtuner/hparams/finetuning_args.py | 2 +- src/llmtuner/hparams/model_args.py | 4 + src/llmtuner/model/__init__.py | 5 +- src/llmtuner/model/adapter.py | 114 ++++++++++++++++++-- src/llmtuner/model/loader.py | 116 ++++++++++++++++++-- src/llmtuner/train/sftmm/__init__.py | 3 + src/llmtuner/train/sftmm/collator.py | 69 ++++++++++++ src/llmtuner/train/sftmm/metric.py | 61 +++++++++++ src/llmtuner/train/sftmm/trainer.py | 137 ++++++++++++++++++++++++ src/llmtuner/train/sftmm/workflow.py | 105 ++++++++++++++++++ src/llmtuner/train/tuner.py | 5 +- 16 files changed, 710 insertions(+), 38 deletions(-) create mode 100644 examples/mllm/sft_blip2.sh create mode 100644 examples/mllm/sft_instructblip.sh create mode 100644 src/llmtuner/train/sftmm/__init__.py create mode 100644 src/llmtuner/train/sftmm/collator.py create mode 100644 src/llmtuner/train/sftmm/metric.py create mode 100644 src/llmtuner/train/sftmm/trainer.py create mode 100644 src/llmtuner/train/sftmm/workflow.py diff --git a/examples/mllm/sft_blip2.sh b/examples/mllm/sft_blip2.sh new file mode 100644 index 00000000..416bb9cd --- /dev/null +++ b/examples/mllm/sft_blip2.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ + --stage sft_mm \ + --do_train \ + --model_name_or_path /home/LAB/fengzc/LLM/checkpoints/Salesforce/blip2-opt-2.7b \ + --dataset llava_instruct_100 \ + --dataset_dir data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,k_proj \ + --output_dir saves/blip2-opt-2.7b/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --preprocessing_num_workers 16 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 1 \ + --warmup_steps 20 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --quantization_bit 8 \ + --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 + diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh new file mode 100644 index 00000000..a4330a84 --- /dev/null +++ b/examples/mllm/sft_instructblip.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ + --stage sft_mm \ + --do_train \ + --model_name_or_path /home/LAB/fengzc/LLM/checkpoints/Salesforce/instructblip-vicuna-7b \ + --dataset llava_instruct_100 \ + --dataset_dir data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,k_proj \ + --output_dir saves/instructblip-vicuna-7b/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --preprocessing_num_workers 16 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 1 \ + --warmup_steps 20 \ + --save_steps 100 \ + --eval_steps 100 \ 
+ --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --quantization_bit 8 \ + --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 \ + --use_qformer + diff --git a/src/llmtuner/data/__init__.py b/src/llmtuner/data/__init__.py index 792e89d9..27a2f3b8 100644 --- a/src/llmtuner/data/__init__.py +++ b/src/llmtuner/data/__init__.py @@ -1,12 +1,12 @@ from .collator import PairwiseDataCollatorWithPadding -from .loader import get_dataset +from .loader import get_dataset, get_mm_dataset from .template import Template, get_template_and_fix_tokenizer, templates from .utils import Role, split_dataset - __all__ = [ "PairwiseDataCollatorWithPadding", "get_dataset", + "get_mm_dataset", "Template", "get_template_and_fix_tokenizer", "templates", diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 5414150e..b7377379 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -13,23 +13,21 @@ from .preprocess import get_preprocess_and_print_func from .template import get_template_and_fix_tokenizer from .utils import checksum, merge_dataset - if TYPE_CHECKING: from datasets import Dataset, IterableDataset - from transformers import Seq2SeqTrainingArguments + from transformers import Seq2SeqTrainingArguments, AutoProcessor from transformers.tokenization_utils import PreTrainedTokenizer from ..hparams import DataArguments, ModelArguments from .parser import DatasetAttr - logger = get_logger(__name__) def load_single_dataset( - dataset_attr: "DatasetAttr", - model_args: "ModelArguments", - data_args: "DataArguments", + dataset_attr: "DatasetAttr", + model_args: "ModelArguments", + data_args: "DataArguments", ) -> Union["Dataset", "IterableDataset"]: logger.info("Loading dataset {}...".format(dataset_attr)) data_path, data_name, data_dir, data_files = None, None, None, None @@ -115,11 +113,11 @@ def load_single_dataset( def get_dataset( - tokenizer: "PreTrainedTokenizer", - model_args: "ModelArguments", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - stage: Literal["pt", "sft", "rm", "ppo"], + tokenizer: "PreTrainedTokenizer", + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["pt", "sft", "rm", "ppo"], ) -> Union["Dataset", "IterableDataset"]: template = get_template_and_fix_tokenizer(tokenizer, data_args.template) if data_args.train_on_prompt and template.efficient_eos: @@ -177,3 +175,33 @@ def get_dataset( raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.") return dataset + + +def get_mm_dataset( + processor: "AutoProcessor", + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["pt", "sft", "rm", "ppo"], +) -> Union["Dataset", "IterableDataset"]: + tokenizer = processor.tokenizer + if data_args.tokenized_path is not None: + if has_tokenized_data(data_args.tokenized_path): + logger.warning("Loading dataset from disk will ignore other data arguments.") + dataset = load_from_disk(data_args.tokenized_path) + logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path)) + if data_args.streaming: + dataset = dataset.to_iterable_dataset() + return dataset + + if data_args.streaming: + raise ValueError("Turn off `streaming` when saving dataset to disk.") + + with 
training_args.main_process_first(desc="load dataset"): + all_datasets = [] + for dataset_attr in get_dataset_list(data_args): + local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name) + all_datasets.append(load_dataset("json", data_files=local_path)['train']) + dataset = merge_dataset(all_datasets, data_args, training_args) + + return dataset diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py index f5f75c77..3b52f1ea 100644 --- a/src/llmtuner/hparams/data_args.py +++ b/src/llmtuner/hparams/data_args.py @@ -88,6 +88,10 @@ class DataArguments: default=None, metadata={"help": "Path to save or load the tokenized datasets."}, ) + image_path: Optional[str] = field( + default=None, + metadata={"help": "Path to images."}, + ) def __post_init__(self): if self.reserved_label_len >= self.cutoff_len: diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py index f4f71bc5..cb525699 100644 --- a/src/llmtuner/hparams/finetuning_args.py +++ b/src/llmtuner/hparams/finetuning_args.py @@ -260,7 +260,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA default=False, metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."}, ) - stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo"] = field( + stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo", "sft_mm"] = field( default="sft", metadata={"help": "Which stage will be performed in training."}, ) diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index 0e42033f..32637f59 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -165,6 +165,10 @@ class ModelArguments: default=False, metadata={"help": "For debugging purposes, print the status of the parameters in the model."}, ) + use_qformer: bool = field( + default=False, + metadata={"help": "Whether use qformer for Multimodal LLM."}, + ) def __post_init__(self): self.compute_dtype = None diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py index 1eaf4271..cf54dafe 100644 --- a/src/llmtuner/model/__init__.py +++ b/src/llmtuner/model/__init__.py @@ -1,10 +1,11 @@ -from .loader import load_model, load_tokenizer +from .loader import load_model, load_tokenizer, load_processor, load_mm_model from .utils import find_all_linear_modules, load_valuehead_params - __all__ = [ "load_model", + "load_mm_model", "load_tokenizer", + "load_processor", "load_valuehead_params", "find_all_linear_modules", ] diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py index f73666d5..624d8a85 100644 --- a/src/llmtuner/model/adapter.py +++ b/src/llmtuner/model/adapter.py @@ -1,24 +1,25 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Union import torch from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model +from transformers import AutoModelForVision2Seq from transformers.integrations import is_deepspeed_zero3_enabled from ..extras.logging import get_logger from .utils import QuantizationMethod, find_all_linear_modules, find_expanded_modules - if TYPE_CHECKING: - from transformers.modeling_utils import PreTrainedModel + from transformers.modeling_utils import PreTrainedModel, AutoModelForVision2Seq from ..hparams import FinetuningArguments, ModelArguments - logger = get_logger(__name__) def init_adapter( - model: "PreTrainedModel", model_args: "ModelArguments", finetuning_args: "FinetuningArguments", is_trainable: bool + model: 
"PreTrainedModel", model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: bool ) -> "PreTrainedModel": r""" Initializes the adapters. @@ -43,9 +44,9 @@ def init_adapter( if finetuning_args.finetuning_type == "freeze" and is_trainable: logger.info("Fine-tuning method: Freeze") num_layers = ( - getattr(model.config, "num_hidden_layers", None) - or getattr(model.config, "num_layers", None) - or getattr(model.config, "n_layer", None) + getattr(model.config, "num_hidden_layers", None) + or getattr(model.config, "num_layers", None) + or getattr(model.config, "n_layer", None) ) if not num_layers: raise ValueError("Current model does not support freeze tuning.") @@ -135,9 +136,9 @@ def init_adapter( target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable) if ( - finetuning_args.use_dora - and getattr(model, "quantization_method", None) is not None - and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES + finetuning_args.use_dora + and getattr(model, "quantization_method", None) is not None + and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES ): raise ValueError("DoRA is not compatible with PTQ-quantized models.") @@ -176,3 +177,94 @@ def init_adapter( logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) return model + + +def init_mm_adapter( + model: "AutoModelForVision2Seq", model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: bool +) -> "AutoModelForVision2Seq": + if finetuning_args.finetuning_type == "lora": + logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA")) + adapter_to_resume = None + + if model_args.adapter_name_or_path is not None: + is_mergeable = True + if getattr(model, "quantization_method", None): # merge lora in quantized model is unstable + assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter." + is_mergeable = False + + if is_deepspeed_zero3_enabled(): + assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3." 
+ is_mergeable = False + + if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable): + adapter_to_merge = model_args.adapter_name_or_path[:-1] + adapter_to_resume = model_args.adapter_name_or_path[-1] + else: + adapter_to_merge = model_args.adapter_name_or_path + + for adapter in adapter_to_merge: + model: "LoraModel" = PeftModel.from_pretrained( + model, adapter, offload_folder=model_args.offload_folder + ) + model = model.merge_and_unload() + + if len(adapter_to_merge) > 0: + logger.info("Merged {} adapter(s).".format(len(adapter_to_merge))) + + if adapter_to_resume is not None: # resume lora training + model = PeftModel.from_pretrained( + model, adapter_to_resume, is_trainable=is_trainable, offload_folder=model_args.offload_folder + ) + + if is_trainable and adapter_to_resume is None: # create new lora weights while training + if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": + target_modules = find_all_linear_modules(model) + else: + target_modules = finetuning_args.lora_target + + if finetuning_args.use_llama_pro: + target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable) + + if ( + finetuning_args.use_dora + and getattr(model, "quantization_method", None) is not None + and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES + ): + raise ValueError("DoRA is not compatible with PTQ-quantized models.") + + peft_kwargs = { + "r": finetuning_args.lora_rank, + "target_modules": target_modules, + "lora_alpha": finetuning_args.lora_alpha, + "lora_dropout": finetuning_args.lora_dropout, + "use_rslora": finetuning_args.use_rslora, + "modules_to_save": finetuning_args.additional_target, + } + + if model_args.use_unsloth: + from unsloth import FastLanguageModel # type: ignore + + unsloth_peft_kwargs = { + "model": model, + "max_seq_length": model_args.model_max_length, + "use_gradient_checkpointing": "unsloth", + } + model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs) + else: + lora_config = LoraConfig( + # task_type=TaskType.CAUSAL_LM, + inference_mode=False, + use_dora=finetuning_args.use_dora, + **peft_kwargs, + ) + model = get_peft_model(model, lora_config) + + if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam): + for param in filter(lambda p: p.requires_grad, model.parameters()): + param.data = param.data.to(torch.float32) + + if model_args.adapter_name_or_path is not None: + logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) + return model diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index 4935dd52..eeee69a6 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -1,22 +1,20 @@ from typing import TYPE_CHECKING, Any, Dict -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModelForVision2Seq from trl import AutoModelForCausalLMWithValueHead from ..extras.constants import MOD_SUPPORTED_MODELS from ..extras.logging import get_logger from ..extras.misc import count_parameters, get_current_device, try_download_model_from_ms -from .adapter import init_adapter +from .adapter import init_adapter, init_mm_adapter from .patcher import patch_config, patch_model, patch_tokenizer, patch_valuehead_model from .utils import load_valuehead_params, register_autoclass - if TYPE_CHECKING: from transformers import PreTrainedModel, 
PreTrainedTokenizer from ..hparams import FinetuningArguments, ModelArguments - logger = get_logger(__name__) @@ -57,12 +55,38 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer": return tokenizer +def load_processor(model_args: "ModelArguments") -> "AutoProcessor": + r""" + Loads processor. Must before load_model. + + Note: including inplace operation of model_args. + """ + init_kwargs = _get_init_kwargs(model_args) + try: + processor = AutoProcessor.from_pretrained( + model_args.model_name_or_path, + use_fast=model_args.use_fast_tokenizer, + split_special_tokens=model_args.split_special_tokens, + padding_side="right", + **init_kwargs, + ) + except Exception: # try the fast one + processor = AutoProcessor.from_pretrained( + model_args.model_name_or_path, + use_fast=True, + padding_side="right", + **init_kwargs, + ) + + return processor + + def load_model( - tokenizer: "PreTrainedTokenizer", - model_args: "ModelArguments", - finetuning_args: "FinetuningArguments", - is_trainable: bool = False, - add_valuehead: bool = False, + tokenizer: "PreTrainedTokenizer", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: bool = False, + add_valuehead: bool = False, ) -> "PreTrainedModel": r""" Loads pretrained model. Must after load_tokenizer. @@ -159,3 +183,77 @@ def load_model( ) return model + + +def load_mm_model( + processor: "AutoProcessor", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: bool = False, + add_valuehead: bool = False, +) -> "AutoModelForVision2Seq": + r""" + Loads pretrained model. Must after load_tokenizer. + """ + tokenizer = processor.tokenizer + init_kwargs = _get_init_kwargs(model_args) + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) + patch_config(config, tokenizer, model_args, init_kwargs, is_trainable) + + model = None + if is_trainable and model_args.use_unsloth: + from unsloth import FastLanguageModel # type: ignore + + unsloth_kwargs = { + "model_name": model_args.model_name_or_path, + "max_seq_length": model_args.model_max_length, + "dtype": model_args.compute_dtype, + "load_in_4bit": model_args.quantization_bit == 4, + "token": model_args.hf_hub_token, + "device_map": {"": get_current_device()}, + "rope_scaling": getattr(config, "rope_scaling", None), + "fix_tokenizer": False, + "trust_remote_code": True, + } + try: + model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs) + except NotImplementedError: + logger.warning("Unsloth does not support model type {}.".format(getattr(config, "model_type", None))) + model_args.use_unsloth = False + + if model_args.adapter_name_or_path: + model_args.adapter_name_or_path = None + logger.warning("Unsloth does not support loading adapters.") + if model is None: + init_kwargs["config"] = config + init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path + model: "AutoModelForVision2Seq" = AutoModelForVision2Seq.from_pretrained(**init_kwargs) + patch_model(model, tokenizer, model_args, is_trainable) + register_autoclass(config, model, tokenizer) + + model = init_mm_adapter(model, model_args, finetuning_args, is_trainable) + + if not is_trainable: + model.requires_grad_(False) + model.eval() + else: + model.train() + + trainable_params, all_param = count_parameters(model) + if is_trainable: + param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( + trainable_params, all_param, 100 * trainable_params / all_param + ) + else: + param_stats 
= "all params: {:d}".format(all_param) + logger.info(param_stats) + + if model_args.print_param_status: + for name, param in model.named_parameters(): + print( + "name: {}, dtype: {}, device: {}, trainable: {}".format( + name, param.dtype, param.device, param.requires_grad + ) + ) + + return model diff --git a/src/llmtuner/train/sftmm/__init__.py b/src/llmtuner/train/sftmm/__init__.py new file mode 100644 index 00000000..3eb8b2e2 --- /dev/null +++ b/src/llmtuner/train/sftmm/__init__.py @@ -0,0 +1,3 @@ +from .workflow import run_sft_mm + +__all__ = ["run_sft_mm"] diff --git a/src/llmtuner/train/sftmm/collator.py b/src/llmtuner/train/sftmm/collator.py new file mode 100644 index 00000000..e91374bc --- /dev/null +++ b/src/llmtuner/train/sftmm/collator.py @@ -0,0 +1,69 @@ +import json +import os +from dataclasses import dataclass + +import torch +from torch.utils.data import Dataset as Dataset_torch +from datasets import Dataset +from PIL import Image +from transformers import AutoProcessor + + +class ImageCaptioningDataset(Dataset_torch): + def __init__(self, dataset: Dataset, image_path: str, processor: AutoProcessor): + self.processor = processor + self.dataset = dataset + self.image_path = image_path + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, idx): + source = self.dataset[idx] + image_id = source['image'] + image = Image.open(os.path.join(self.image_path, image_id)) + convs = source['conversations'] + prompt = convs[0]['value'] + label = convs[1]['value'] + image_inputs = self.processor(image, return_tensors="pt") + image_inputs = {k: v.squeeze() for k, v in image_inputs.items()} + inputs = { + "input_ids": prompt, + "labels": label, + } + for key in image_inputs: + inputs[key] = image_inputs[key] + return inputs + + +@dataclass +class DataCollatorForVis2Seq: + processor: AutoProcessor + use_qformer: bool = False + + def __call__(self, features, return_tensors=None): + processed_batch = {} + for key in features[0].keys(): + if key == 'pixel_values': + processed_batch[key] = torch.stack([example[key] for example in features]) + elif key == 'input_ids': + text_inputs = self.processor.tokenizer( + [example[key] for example in features], padding="max_length", return_tensors="pt", + max_length=512, + ) + processed_batch["input_ids"] = text_inputs["input_ids"] + processed_batch["attention_mask"] = text_inputs["attention_mask"] + if self.use_qformer: + qformer_text_inputs = self.processor.qformer_tokenizer( + [example[key] for example in features], padding="max_length", return_tensors="pt", + max_length=512, + ) + processed_batch["qformer_input_ids"] = qformer_text_inputs["input_ids"] + processed_batch["qformer_attention_mask"] = qformer_text_inputs["attention_mask"] + elif key == 'labels': + text_inputs = self.processor.tokenizer( + [example[key] for example in features], padding="max_length", return_tensors="pt", + max_length=512, + ) + processed_batch["labels"] = text_inputs["input_ids"] + return processed_batch diff --git a/src/llmtuner/train/sftmm/metric.py b/src/llmtuner/train/sftmm/metric.py new file mode 100644 index 00000000..d1af4c17 --- /dev/null +++ b/src/llmtuner/train/sftmm/metric.py @@ -0,0 +1,61 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union + +import numpy as np + +from ...extras.constants import IGNORE_INDEX +from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available + + +if TYPE_CHECKING: + from transformers.tokenization_utils import PreTrainedTokenizer + +if 
is_jieba_available(): + import jieba # type: ignore + +if is_nltk_available(): + from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu + +if is_rouge_available(): + from rouge_chinese import Rouge + + +@dataclass +class ComputeMetrics: + r""" + Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer. + """ + + tokenizer: "PreTrainedTokenizer" + + def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]: + r""" + Uses the model predictions to compute metrics. + """ + preds, labels = eval_preds + score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []} + + preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id) + labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id) + + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + + for pred, label in zip(decoded_preds, decoded_labels): + hypothesis = list(jieba.cut(pred)) + reference = list(jieba.cut(label)) + + if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0: + result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}} + else: + rouge = Rouge() + scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference)) + result = scores[0] + + for k, v in result.items(): + score_dict[k].append(round(v["f"] * 100, 4)) + + bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) + score_dict["bleu-4"].append(round(bleu_score * 100, 4)) + + return {k: float(np.mean(v)) for k, v in score_dict.items()} diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py new file mode 100644 index 00000000..96b86b44 --- /dev/null +++ b/src/llmtuner/train/sftmm/trainer.py @@ -0,0 +1,137 @@ +import json +import os +from types import MethodType +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from transformers import Seq2SeqTrainer + +from ...extras.constants import IGNORE_INDEX +from ...extras.logging import get_logger +from ..utils import create_custom_optimzer, create_custom_scheduler + +if TYPE_CHECKING: + from transformers.trainer import PredictionOutput + from peft import PeftModelForCausalLM + from ...hparams import FinetuningArguments + +logger = get_logger(__name__) + + +class CustomSeq2SeqTrainer(Seq2SeqTrainer): + r""" + Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE. 
+ """ + + def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None: + super().__init__(**kwargs) + self.finetuning_args = finetuning_args + if finetuning_args.use_badam: + from badam import clip_grad_norm_for_sparse_tensor + + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + + # def compute_loss(self, model, inputs, return_outputs=False): + # print(inputs.keys()) + # device = "cuda" + # input_ids = inputs.get("input_ids").to(device) + # pixel_values = inputs.get("pixel_values").to(device, torch.float16) + # attention_mask = inputs.get("attention_mask").to(device) + # labels = inputs.get("labels").to(device) + # + # outputs = model(input_ids=input_ids, + # pixel_values=pixel_values, + # labels=labels, + # # attention_mask=attention_mask, + # ) + # loss = outputs.loss + # print("Loss:", loss.item()) + # return (loss, outputs) if return_outputs else loss + + def create_optimizer(self) -> "torch.optim.Optimizer": + if self.optimizer is None: + self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) + return super().create_optimizer() + + def create_scheduler( + self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None + ) -> "torch.optim.lr_scheduler.LRScheduler": + create_custom_scheduler(self.args, num_training_steps, optimizer) + return super().create_scheduler(num_training_steps, optimizer) + + def prediction_step( + self, + model: "torch.nn.Module", + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + r""" + Removes the prompt part in the generated tokens. + + Subclass and override to inject custom behavior. + """ + labels = inputs["labels"].detach().clone() if "labels" in inputs else None # backup labels + if self.args.predict_with_generate: + assert self.tokenizer.padding_side == "left", "This method only accepts left-padded tensor." + prompt_len, label_len = inputs["input_ids"].size(-1), inputs["labels"].size(-1) + if prompt_len > label_len: + inputs["labels"] = self._pad_tensors_to_target_len(inputs["labels"], inputs["input_ids"]) + if label_len > prompt_len: # truncate the labels instead of padding the inputs (llama2 fp16 compatibility) + inputs["labels"] = inputs["labels"][:, :prompt_len] + + loss, generated_tokens, _ = super().prediction_step( # ignore the returned labels (may be truncated) + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + if generated_tokens is not None and self.args.predict_with_generate: + generated_tokens[:, :prompt_len] = self.tokenizer.pad_token_id + generated_tokens = generated_tokens.contiguous() + + return loss, generated_tokens, labels + + def _pad_tensors_to_target_len(self, src_tensor: torch.Tensor, tgt_tensor: torch.Tensor) -> torch.Tensor: + r""" + Pads the tensor to the same length as the target tensor. + """ + assert self.tokenizer.pad_token_id is not None, "Pad token is required." + padded_tensor = self.tokenizer.pad_token_id * torch.ones_like(tgt_tensor) + padded_tensor[:, -src_tensor.shape[-1]:] = src_tensor # adopt left-padding + return padded_tensor.contiguous() # in contiguous memory + + def save_predictions(self, predict_results: "PredictionOutput") -> None: + r""" + Saves model predictions to `output_dir`. + + A custom behavior that not contained in Seq2SeqTrainer. 
+ """ + if not self.is_world_process_zero(): + return + + output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl") + logger.info(f"Saving prediction results to {output_prediction_file}") + + labels = np.where( + predict_results.label_ids != IGNORE_INDEX, predict_results.label_ids, self.tokenizer.pad_token_id + ) + preds = np.where( + predict_results.predictions != IGNORE_INDEX, predict_results.predictions, self.tokenizer.pad_token_id + ) + + for i in range(len(preds)): + pad_len = np.nonzero(preds[i] != self.tokenizer.pad_token_id)[0] + if len(pad_len): + preds[i] = np.concatenate( + (preds[i][pad_len[0]:], preds[i][: pad_len[0]]), axis=-1 + ) # move pad token to last + + decoded_labels = self.tokenizer.batch_decode( + labels, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + with open(output_prediction_file, "w", encoding="utf-8") as writer: + res: List[str] = [] + for label, pred in zip(decoded_labels, decoded_preds): + res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False)) + writer.write("\n".join(res)) diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py new file mode 100644 index 00000000..9f952772 --- /dev/null +++ b/src/llmtuner/train/sftmm/workflow.py @@ -0,0 +1,105 @@ +# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py +import os +from typing import TYPE_CHECKING, List, Optional + +import torch +from PIL import Image +from torch.utils.data import Dataset +from transformers import DataCollatorForSeq2Seq, LlavaNextForConditionalGeneration, AutoModelForVision2Seq + +from ...data import split_dataset, get_mm_dataset +from ...extras.constants import IGNORE_INDEX +from ...extras.misc import get_logits_processor +from ...extras.ploting import plot_loss +from ...model import load_model, load_tokenizer, load_processor, load_mm_model +from ..utils import create_modelcard_and_push +from .metric import ComputeMetrics +from .trainer import CustomSeq2SeqTrainer +from .collator import DataCollatorForVis2Seq, ImageCaptioningDataset + +if TYPE_CHECKING: + from transformers import Seq2SeqTrainingArguments, TrainerCallback + + from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments + + +def run_sft_mm( + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + generating_args: "GeneratingArguments", + callbacks: Optional[List["TrainerCallback"]] = None, +): + processor = load_processor(model_args) + tokenizer = processor.tokenizer + model = load_mm_model(processor, model_args, finetuning_args, training_args.do_train) + dataset = get_mm_dataset(processor, model_args, data_args, training_args, stage="sft") + if training_args.predict_with_generate: + tokenizer.padding_side = "left" # use left-padding in generation + if getattr(model, "is_quantized", False) and not training_args.do_train: + setattr(model, "_hf_peft_config_loaded", True) # hack here: make model compatible with prediction + splited_dataset = split_dataset(dataset, data_args, training_args) + splited_dataset['train_dataset'].set_format(type=splited_dataset['train_dataset'].format["type"], + columns=list(splited_dataset['train_dataset'].features.keys())) + 
splited_dataset['eval_dataset'].set_format(type=splited_dataset['eval_dataset'].format["type"], + columns=list(splited_dataset['eval_dataset'].features.keys())) + train_dataset = ImageCaptioningDataset(splited_dataset['train_dataset'], data_args.image_path, processor) + eval_dataset = ImageCaptioningDataset(splited_dataset['eval_dataset'], data_args.image_path, processor) + data_collator = DataCollatorForVis2Seq( + processor=processor, + use_qformer=model_args.use_qformer, + ) + + # Override the decoding parameters of Seq2SeqTrainer + training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len + training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams + + # Initialize our Trainer + trainer = CustomSeq2SeqTrainer( + model=model, + args=training_args, + finetuning_args=finetuning_args, + tokenizer=tokenizer, + data_collator=data_collator, + callbacks=callbacks, + compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + ) + + # Keyword arguments for `model.generate` + gen_kwargs = generating_args.to_dict() + gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids + gen_kwargs["pad_token_id"] = tokenizer.pad_token_id + gen_kwargs["logits_processor"] = get_logits_processor() + + # Training + if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + if trainer.is_world_process_zero() and finetuning_args.plot_loss: + plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + + # Evaluation + if training_args.do_eval: + metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs) + if training_args.predict_with_generate: # eval_loss will be wrong if predict_with_generate is enabled + metrics.pop("eval_loss", None) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Predict + if training_args.do_predict: + predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs) + if training_args.predict_with_generate: # predict_loss will be wrong if predict_with_generate is enabled + predict_results.metrics.pop("predict_loss", None) + trainer.log_metrics("predict", predict_results.metrics) + trainer.save_metrics("predict", predict_results.metrics) + trainer.save_predictions(predict_results) + + # Create model card + create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py index a8a2b8e9..ac56289c 100644 --- a/src/llmtuner/train/tuner.py +++ b/src/llmtuner/train/tuner.py @@ -14,12 +14,11 @@ from .ppo import run_ppo from .pt import run_pt from .rm import run_rm from .sft import run_sft - +from .sftmm import run_sft_mm if TYPE_CHECKING: from transformers import TrainerCallback - logger = get_logger(__name__) @@ -31,6 +30,8 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["Tra run_pt(model_args, data_args, training_args, finetuning_args, callbacks) elif finetuning_args.stage == "sft": run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) + elif finetuning_args.stage == "sft_mm": + run_sft_mm(model_args, data_args, training_args, finetuning_args, 
generating_args, callbacks) elif finetuning_args.stage == "rm": run_rm(model_args, data_args, training_args, finetuning_args, callbacks) elif finetuning_args.stage == "ppo": From 0b99b13786f27ef7253332b0d820d68a0e304b31 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Tue, 23 Apr 2024 18:47:03 +0800 Subject: [PATCH 02/29] add multimodal LLM BLIP-2 and InstructBLIP Former-commit-id: b78b5f290aa38a7454e101ee9703fb6fac5064ac --- examples/mllm/sft_instructblip.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh index a4330a84..055c639a 100644 --- a/examples/mllm/sft_instructblip.sh +++ b/examples/mllm/sft_instructblip.sh @@ -31,5 +31,4 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ --plot_loss \ --quantization_bit 8 \ --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 \ - --use_qformer - + --use_qformer \ No newline at end of file From 1451297c78f4a75f0b05884ca1d02aa08d561d61 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Tue, 23 Apr 2024 19:22:42 +0800 Subject: [PATCH 03/29] add multimodal LLM BLIP-2 and InstructBLIP Former-commit-id: 67800c565b086f362b8cf131b0c9babaa7a7ebc7 --- examples/mllm/sft_blip2.sh | 5 ++--- examples/mllm/sft_instructblip.sh | 4 ++-- src/llmtuner/data/loader.py | 1 - 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/mllm/sft_blip2.sh b/examples/mllm/sft_blip2.sh index 416bb9cd..ac0a3f11 100644 --- a/examples/mllm/sft_blip2.sh +++ b/examples/mllm/sft_blip2.sh @@ -14,7 +14,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ --overwrite_output_dir \ --cutoff_len 1024 \ --preprocessing_num_workers 16 \ - --per_device_train_batch_size 1 \ + --per_device_train_batch_size 4 \ --per_device_eval_batch_size 1 \ --gradient_accumulation_steps 8 \ --lr_scheduler_type cosine \ @@ -30,5 +30,4 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ --val_size 0.1 \ --plot_loss \ --quantization_bit 8 \ - --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 - + --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 \ No newline at end of file diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh index 055c639a..92478500 100644 --- a/examples/mllm/sft_instructblip.sh +++ b/examples/mllm/sft_instructblip.sh @@ -14,7 +14,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ --overwrite_output_dir \ --cutoff_len 1024 \ --preprocessing_num_workers 16 \ - --per_device_train_batch_size 1 \ + --per_device_train_batch_size 4 \ --per_device_eval_batch_size 1 \ --gradient_accumulation_steps 8 \ --lr_scheduler_type cosine \ @@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ --eval_steps 100 \ --evaluation_strategy steps \ --load_best_model_at_end \ - --learning_rate 5e-5 \ + --learning_rate 1e-5 \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index b7377379..b3af434b 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -184,7 +184,6 @@ def get_mm_dataset( training_args: "Seq2SeqTrainingArguments", stage: Literal["pt", "sft", "rm", "ppo"], ) -> Union["Dataset", "IterableDataset"]: - tokenizer = processor.tokenizer if data_args.tokenized_path is not None: if has_tokenized_data(data_args.tokenized_path): logger.warning("Loading dataset from disk will 
ignore other data arguments.") From 12c51655cebff8e60bff48d092916c6e96348852 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 00:22:43 +0800 Subject: [PATCH 04/29] add llava and instructblip Former-commit-id: 142fb6f4541a1acfefe66ff2574dabde53b00c06 --- data/mllm_example_dataset/README.md | 25 +++++ data/mllm_example_dataset/data/test-0.parquet | Bin 0 -> 4580 bytes .../mllm_example_dataset/data/train-0.parquet | Bin 0 -> 4580 bytes examples/mllm/sft_instructblip.sh | 16 ++- examples/mllm/{sft_blip2.sh => sft_llava.sh} | 17 ++-- scripts/make_mllm_instruct.py | 95 ++++++++++++++++++ scripts/test_mllm.py | 84 ++++++++++++++++ src/llmtuner/data/loader.py | 3 +- src/llmtuner/data/preprocess.py | 2 +- src/llmtuner/hparams/data_args.py | 4 - src/llmtuner/hparams/model_args.py | 4 - src/llmtuner/model/adapter.py | 22 ++-- src/llmtuner/model/loader.py | 3 +- src/llmtuner/train/sftmm/collator.py | 82 ++++----------- src/llmtuner/train/sftmm/trainer.py | 95 +----------------- src/llmtuner/train/sftmm/workflow.py | 35 +++---- 16 files changed, 273 insertions(+), 214 deletions(-) create mode 100644 data/mllm_example_dataset/README.md create mode 100644 data/mllm_example_dataset/data/test-0.parquet create mode 100644 data/mllm_example_dataset/data/train-0.parquet rename examples/mllm/{sft_blip2.sh => sft_llava.sh} (58%) create mode 100644 scripts/make_mllm_instruct.py create mode 100644 scripts/test_mllm.py diff --git a/data/mllm_example_dataset/README.md b/data/mllm_example_dataset/README.md new file mode 100644 index 00000000..d5c8c0e6 --- /dev/null +++ b/data/mllm_example_dataset/README.md @@ -0,0 +1,25 @@ +--- +dataset_info: + features: + - name: messages + list: + - name: content + list: + - name: index + dtype: int64 + - name: text + dtype: string + - name: type + dtype: string + - name: role + dtype: string + - name: images + sequence: image +configs: +- config_name: default + data_files: + - split: train + path: data/train-* + - split: test + path: data/test-* +--- \ No newline at end of file diff --git a/data/mllm_example_dataset/data/test-0.parquet b/data/mllm_example_dataset/data/test-0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..42c20b192497168523c3d39447cdae4495085b84 GIT binary patch literal 4580 zcmdTIO>84qc_wRnlaN&LHsc*{qws2Fr_nZRJ5HP?tA%tXA^uq>&A)#Tg6ElM&t%6l zW@a46Sr)C3s!AL=RTT%s38AX0RKx*L#Q`A>sHgT&sS*OM5OAnaFCYOb-+N;_KV-Ko zDiw^9H}8G_|8J5_d3#m}2xG!{K^PFYD;zgC!F3;j6XHT@MwkS2NC-`cgFVd2E+S4} z00p|Mr68=q5`I~@OgC>ge)D7d89u~G5BUip5KVKTvFOoYgSalP6E|QQu6LQ3q(Ln2 zvT9o%yw3oGhNY1sVIVRYd6$oTz)OqL;FFjXodG{NVSs~W3|<@u=Z8bWuLlc)$UOfj z@aNnpz>B(#utSiilg{!C0Cr*X902ZMiy&-iDB}?C_%22@$8I16cZg%5^_FF*dVHJ- zz#a-K6G*cStG+xvnhjRwRdBBAU=JY3ws>P6xwfhj2h|K>YQv*1Yle$-vMhmsEP2jF zySm-LF32@a{>IujI8F>Ma-E^YaJf)-?3Sf1CxBETDsH(1>HNlsYZ}_kMpPvX5(fPocD;ve`lr& zf>9H9OdD%_PE-v{v(5kZ4?8-sj&-|*W*1Pya9zW;kYS;G3*wwgrcyJIgO-b`1T@gN zK}H~4j_aY7(Vyz7acGzZwvb(ejM%Lvnxouq6RX!(#^;h~9e4QkvAHu4)iVC5Rj~iDS@$#AeTYHxA`usg7>WF8Sb8`j{q@qsr)&I$Efy1`0_{8)E#Vgf z;2<@Gy2y$TqA3YCPDPWOW|oRUh6b{yv;BC`rk%XZ2bDqT!z=$`6go}9qVNMg@+L1m zxFWqBo>(}i^g=&w2=SgD!Y1_ty3QGbH-@@(J#2a43phIi)5M>bR4gSDhx#Ny9)9r> zticbTqTYxBKKR37>D!UMH`$9vV!*LYYPcBr5g+*(FTE;A?~F6U&uh5Qc$j+yFpnmI z&cAEI zCI&>0-9F_=VBpdeC;jr7$NLg!VoztTyg3m0)`0Z-HR%^o8rw4p9;x#pm!p3lOLJn# zRdv$9LV!yTi2c06JsT?A22;}kk=<}`83Dd``T5knEAW$uLvV{-AAoM5mm?>Pz@GKb zb*@0~@h$+0{tSQ?Qx^c5yZQYjRRLd`pZTt28o92ZNGLiHHUD>R_y4a!`B@LN|MJNB 
z6z9ih-@cKbKOG54gOS&+z{gy4M*N)Ks@LJ(u3?pkZw=gw8CK1X-9q)uVQOO&BDdBRcWjpRJP<}%1b)=X0=f{%pKTu*f%Q0wWL17zNC-8)&Nj5N{w3etSwT_2 z-=?Jqabcv4{3O!y7dR0$u><4OyQwytH?iZ`ZFEQ+_UH0!I-ZQDq9%Opo%`Wl8HT_5 I;r~1T1e%nlz5oCK literal 0 HcmV?d00001 diff --git a/data/mllm_example_dataset/data/train-0.parquet b/data/mllm_example_dataset/data/train-0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..42c20b192497168523c3d39447cdae4495085b84 GIT binary patch literal 4580 zcmdTIO>84qc_wRnlaN&LHsc*{qws2Fr_nZRJ5HP?tA%tXA^uq>&A)#Tg6ElM&t%6l zW@a46Sr)C3s!AL=RTT%s38AX0RKx*L#Q`A>sHgT&sS*OM5OAnaFCYOb-+N;_KV-Ko zDiw^9H}8G_|8J5_d3#m}2xG!{K^PFYD;zgC!F3;j6XHT@MwkS2NC-`cgFVd2E+S4} z00p|Mr68=q5`I~@OgC>ge)D7d89u~G5BUip5KVKTvFOoYgSalP6E|QQu6LQ3q(Ln2 zvT9o%yw3oGhNY1sVIVRYd6$oTz)OqL;FFjXodG{NVSs~W3|<@u=Z8bWuLlc)$UOfj z@aNnpz>B(#utSiilg{!C0Cr*X902ZMiy&-iDB}?C_%22@$8I16cZg%5^_FF*dVHJ- zz#a-K6G*cStG+xvnhjRwRdBBAU=JY3ws>P6xwfhj2h|K>YQv*1Yle$-vMhmsEP2jF zySm-LF32@a{>IujI8F>Ma-E^YaJf)-?3Sf1CxBETDsH(1>HNlsYZ}_kMpPvX5(fPocD;ve`lr& zf>9H9OdD%_PE-v{v(5kZ4?8-sj&-|*W*1Pya9zW;kYS;G3*wwgrcyJIgO-b`1T@gN zK}H~4j_aY7(Vyz7acGzZwvb(ejM%Lvnxouq6RX!(#^;h~9e4QkvAHu4)iVC5Rj~iDS@$#AeTYHxA`usg7>WF8Sb8`j{q@qsr)&I$Efy1`0_{8)E#Vgf z;2<@Gy2y$TqA3YCPDPWOW|oRUh6b{yv;BC`rk%XZ2bDqT!z=$`6go}9qVNMg@+L1m zxFWqBo>(}i^g=&w2=SgD!Y1_ty3QGbH-@@(J#2a43phIi)5M>bR4gSDhx#Ny9)9r> zticbTqTYxBKKR37>D!UMH`$9vV!*LYYPcBr5g+*(FTE;A?~F6U&uh5Qc$j+yFpnmI z&cAEI zCI&>0-9F_=VBpdeC;jr7$NLg!VoztTyg3m0)`0Z-HR%^o8rw4p9;x#pm!p3lOLJn# zRdv$9LV!yTi2c06JsT?A22;}kk=<}`83Dd``T5knEAW$uLvV{-AAoM5mm?>Pz@GKb zb*@0~@h$+0{tSQ?Qx^c5yZQYjRRLd`pZTt28o92ZNGLiHHUD>R_y4a!`B@LN|MJNB z6z9ih-@cKbKOG54gOS&+z{gy4M*N)Ks@LJ(u3?pkZw=gw8CK1X-9q)uVQOO&BDdBRcWjpRJP<}%1b)=X0=f{%pKTu*f%Q0wWL17zNC-8)&Nj5N{w3etSwT_2 z-=?Jqabcv4{3O!y7dR0$u><4OyQwytH?iZ`ZFEQ+_UH0!I-ZQDq9%Opo%`Wl8HT_5 I;r~1T1e%nlz5oCK literal 0 HcmV?d00001 diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh index 92478500..b3923655 100644 --- a/examples/mllm/sft_instructblip.sh +++ b/examples/mllm/sft_instructblip.sh @@ -3,20 +3,20 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ --stage sft_mm \ --do_train \ - --model_name_or_path /home/LAB/fengzc/LLM/checkpoints/Salesforce/instructblip-vicuna-7b \ - --dataset llava_instruct_100 \ + --model_name_or_path Salesforce/instructblip-vicuna-7b \ + --dataset mllm_instruct_example \ --dataset_dir data \ --template default \ --finetuning_type lora \ - --lora_target q_proj,k_proj \ + --lora_target all \ --output_dir saves/instructblip-vicuna-7b/lora/sft \ --overwrite_cache \ --overwrite_output_dir \ --cutoff_len 1024 \ --preprocessing_num_workers 16 \ - --per_device_train_batch_size 4 \ + --per_device_train_batch_size 3 \ --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 8 \ + --gradient_accumulation_steps 1 \ --lr_scheduler_type cosine \ --logging_steps 1 \ --warmup_steps 20 \ @@ -25,10 +25,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ --evaluation_strategy steps \ --load_best_model_at_end \ --learning_rate 1e-5 \ - --num_train_epochs 3.0 \ + --num_train_epochs 50 \ --max_samples 3000 \ --val_size 0.1 \ --plot_loss \ - --quantization_bit 8 \ - --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 \ - --use_qformer \ No newline at end of file + --bf16 \ No newline at end of file diff --git a/examples/mllm/sft_blip2.sh b/examples/mllm/sft_llava.sh similarity index 58% rename from examples/mllm/sft_blip2.sh rename to examples/mllm/sft_llava.sh index ac0a3f11..c1fce693 100644 --- 
a/examples/mllm/sft_blip2.sh +++ b/examples/mllm/sft_llava.sh @@ -3,20 +3,20 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ --stage sft_mm \ --do_train \ - --model_name_or_path /home/LAB/fengzc/LLM/checkpoints/Salesforce/blip2-opt-2.7b \ - --dataset llava_instruct_100 \ + --model_name_or_path llava-hf/llava-1.5-7b-hf \ + --dataset mllm_instruct_example \ --dataset_dir data \ --template default \ --finetuning_type lora \ - --lora_target q_proj,k_proj \ - --output_dir saves/blip2-opt-2.7b/lora/sft \ + --lora_target all \ + --output_dir saves/llava-1.5-7b/lora/sft \ --overwrite_cache \ --overwrite_output_dir \ --cutoff_len 1024 \ --preprocessing_num_workers 16 \ - --per_device_train_batch_size 4 \ + --per_device_train_batch_size 3 \ --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 8 \ + --gradient_accumulation_steps 1 \ --lr_scheduler_type cosine \ --logging_steps 1 \ --warmup_steps 20 \ @@ -25,9 +25,8 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ --evaluation_strategy steps \ --load_best_model_at_end \ --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ + --num_train_epochs 100 \ --max_samples 3000 \ --val_size 0.1 \ --plot_loss \ - --quantization_bit 8 \ - --image_path /home/LAB/fengzc/LLM/checkpoints/liuhaotian/LLaVA-Instruct-150K/images/coco/train2017 \ No newline at end of file + --bf16 \ No newline at end of file diff --git a/scripts/make_mllm_instruct.py b/scripts/make_mllm_instruct.py new file mode 100644 index 00000000..41e13b8e --- /dev/null +++ b/scripts/make_mllm_instruct.py @@ -0,0 +1,95 @@ +import json +import os.path + +import fire +from datasets import Dataset, concatenate_datasets, load_dataset, Value, Image, Features, Sequence + +"""usage +python3 scripts/make_mllm_instruct.py \ +--json_path data/llava_instruct_example.json \ +--image_path data/images \ +--output_path data/mllm_example_dataset +""" + + +def make_one_json(json_path, image_path) -> Dataset: + with open(json_path) as f: + raw_data_ls = json.loads(f.read()) + data_ls = [] + for i, data in enumerate(raw_data_ls): + for j, message in enumerate(data['messages']): + text = message['content'] + message['content'] = [{'index': None, 'text': text, 'type': 'text'}] + if j == 0: + message['content'].append({'index': 0, 'text': None, 'type': 'image'}) + image = data['image'] + if image_path: + image = os.path.join(image_path, data['image']) + data['images'] = [image] + del data['image'] + data_ls.append(data) + + def gen(): + for data in data_ls: + yield data + + features = Features({'messages': [{'content': [ + {'index': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), + 'type': Value(dtype='string', id=None)}], 'role': Value(dtype='string', id=None)}], + 'images': Sequence(feature=Image(decode=True, id=None), length=-1, id=None)}) + dataset = Dataset.from_generator(gen, features=features) + return dataset + + +yaml_content = """--- +dataset_info: + features: + - name: messages + list: + - name: content + list: + - name: index + dtype: int64 + - name: text + dtype: string + - name: type + dtype: string + - name: role + dtype: string + - name: images + sequence: image +configs: +- config_name: default + data_files: + - split: train + path: data/train-* + - split: test + path: data/test-* +---""" + + +def main( + json_path: str, + image_path: str, + output_path: str, +): + json_path_list = json_path.split() + dataset_list = [] + for json_path in json_path_list: + dataset = make_one_json(json_path, image_path) + dataset_list.append(dataset) + dataset = 
concatenate_datasets(dataset_list) + print(dataset[0]) + data_path = os.path.join(output_path, "data") + os.makedirs(os.path.join(data_path), exist_ok=True) + parquet_path = os.path.join(data_path, "train-0.parquet") + dataset.to_parquet(parquet_path) + parquet_path = os.path.join(data_path, "test-0.parquet") + dataset.to_parquet(parquet_path) + readme_path = os.path.join(output_path, "README.md") + with open(readme_path, 'w') as f: + f.write(yaml_content) + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py new file mode 100644 index 00000000..c03525b8 --- /dev/null +++ b/scripts/test_mllm.py @@ -0,0 +1,84 @@ +import os.path + +import fire +import torch +from datasets import load_dataset +from peft import PeftModel +from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor + +"""usage +python3 scripts/test_mllm.py \ +--base_model_path llava-hf/llava-1.5-7b-hf \ +--lora_model_path saves/llava-1.5-7b/lora/sft \ +--model_path saves/llava-1.5-7b/lora/merged \ +--dataset_name data/mllm_example_dataset \ +--do_merge 1 +""" + + +def get_processor(model_path): + CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}""" + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + tokenizer.chat_template = CHAT_TEMPLATE + processor = AutoProcessor.from_pretrained(model_path) + processor.tokenizer = tokenizer + return processor + + +def apply_lora(base_model_path, model_path, lora_path): + print(f"Loading the base model from {base_model_path}") + base_model = AutoModelForVision2Seq.from_pretrained( + base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="cuda", + ) + processor = get_processor(base_model_path) + tokenizer = processor.tokenizer + print(f"Loading the LoRA adapter from {lora_path}") + + lora_model = PeftModel.from_pretrained( + base_model, + lora_path, + torch_dtype=torch.float16, + ) + + print("Applying the LoRA") + model = lora_model.merge_and_unload() + + print(f"Saving the target model to {model_path}") + model.save_pretrained(model_path) + tokenizer.save_pretrained(model_path) + processor.image_processor.save_pretrained(model_path) + + +def main( + model_path: str, + dataset_name: str, + base_model_path: str = "", + lora_model_path: str = "", + do_merge: bool = False, +): + if not os.path.exists(model_path) or do_merge: + apply_lora(base_model_path, model_path, lora_model_path) + model = AutoModelForVision2Seq.from_pretrained( + model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, device_map="cuda" + ) + processor = get_processor(model_path) + raw_datasets = load_dataset(dataset_name) + train_dataset = raw_datasets['train'] + examples = train_dataset.select(range(3)) + texts = [] + images = [] + for example in examples: + messages = example["messages"][:1] + text = processor.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=False + ) + 
texts.append(text) + images.append(example["images"][0]) + batch = processor(texts, images, return_tensors="pt", padding=True).to("cuda") + output = model.generate(**batch, max_new_tokens=100) + res = processor.batch_decode(output, skip_special_tokens=True) + print(res) + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index b3af434b..18665731 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -199,8 +199,7 @@ def get_mm_dataset( with training_args.main_process_first(desc="load dataset"): all_datasets = [] for dataset_attr in get_dataset_list(data_args): - local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name) - all_datasets.append(load_dataset("json", data_files=local_path)['train']) + all_datasets.append(load_dataset(dataset_attr.dataset_name)['train']) dataset = merge_dataset(all_datasets, data_args, training_args) return dataset diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index b8edfa10..8494ba7e 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -275,4 +275,4 @@ def get_preprocess_and_print_func( ) print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) - return preprocess_func, print_function + return preprocess_func, print_function \ No newline at end of file diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py index 3b52f1ea..f5f75c77 100644 --- a/src/llmtuner/hparams/data_args.py +++ b/src/llmtuner/hparams/data_args.py @@ -88,10 +88,6 @@ class DataArguments: default=None, metadata={"help": "Path to save or load the tokenized datasets."}, ) - image_path: Optional[str] = field( - default=None, - metadata={"help": "Path to images."}, - ) def __post_init__(self): if self.reserved_label_len >= self.cutoff_len: diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index 32637f59..0e42033f 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -165,10 +165,6 @@ class ModelArguments: default=False, metadata={"help": "For debugging purposes, print the status of the parameters in the model."}, ) - use_qformer: bool = field( - default=False, - metadata={"help": "Whether use qformer for Multimodal LLM."}, - ) def __post_init__(self): self.compute_dtype = None diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py index 624d8a85..e66a984b 100644 --- a/src/llmtuner/model/adapter.py +++ b/src/llmtuner/model/adapter.py @@ -182,7 +182,8 @@ def init_adapter( def init_mm_adapter( model: "AutoModelForVision2Seq", model_args: "ModelArguments", finetuning_args: "FinetuningArguments", - is_trainable: bool + is_trainable: bool, + use_clm=True, ) -> "AutoModelForVision2Seq": if finetuning_args.finetuning_type == "lora": logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA")) @@ -253,12 +254,19 @@ def init_mm_adapter( } model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs) else: - lora_config = LoraConfig( - # task_type=TaskType.CAUSAL_LM, - inference_mode=False, - use_dora=finetuning_args.use_dora, - **peft_kwargs, - ) + if use_clm: + lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=False, + use_dora=finetuning_args.use_dora, + **peft_kwargs, + ) + else: + lora_config = LoraConfig( + inference_mode=False, + use_dora=finetuning_args.use_dora, + **peft_kwargs, + ) model = get_peft_model(model, 
lora_config) if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam): diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index eeee69a6..917f11c9 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -191,6 +191,7 @@ def load_mm_model( finetuning_args: "FinetuningArguments", is_trainable: bool = False, add_valuehead: bool = False, + use_clm=True, ) -> "AutoModelForVision2Seq": r""" Loads pretrained model. Must after load_tokenizer. @@ -231,7 +232,7 @@ def load_mm_model( patch_model(model, tokenizer, model_args, is_trainable) register_autoclass(config, model, tokenizer) - model = init_mm_adapter(model, model_args, finetuning_args, is_trainable) + model = init_mm_adapter(model, model_args, finetuning_args, is_trainable, use_clm) if not is_trainable: model.requires_grad_(False) diff --git a/src/llmtuner/train/sftmm/collator.py b/src/llmtuner/train/sftmm/collator.py index e91374bc..95dbd939 100644 --- a/src/llmtuner/train/sftmm/collator.py +++ b/src/llmtuner/train/sftmm/collator.py @@ -1,69 +1,29 @@ -import json -import os from dataclasses import dataclass - -import torch -from torch.utils.data import Dataset as Dataset_torch -from datasets import Dataset -from PIL import Image from transformers import AutoProcessor -class ImageCaptioningDataset(Dataset_torch): - def __init__(self, dataset: Dataset, image_path: str, processor: AutoProcessor): - self.processor = processor - self.dataset = dataset - self.image_path = image_path - - def __len__(self): - return len(self.dataset) - - def __getitem__(self, idx): - source = self.dataset[idx] - image_id = source['image'] - image = Image.open(os.path.join(self.image_path, image_id)) - convs = source['conversations'] - prompt = convs[0]['value'] - label = convs[1]['value'] - image_inputs = self.processor(image, return_tensors="pt") - image_inputs = {k: v.squeeze() for k, v in image_inputs.items()} - inputs = { - "input_ids": prompt, - "labels": label, - } - for key in image_inputs: - inputs[key] = image_inputs[key] - return inputs - - @dataclass class DataCollatorForVis2Seq: processor: AutoProcessor - use_qformer: bool = False - def __call__(self, features, return_tensors=None): - processed_batch = {} - for key in features[0].keys(): - if key == 'pixel_values': - processed_batch[key] = torch.stack([example[key] for example in features]) - elif key == 'input_ids': - text_inputs = self.processor.tokenizer( - [example[key] for example in features], padding="max_length", return_tensors="pt", - max_length=512, - ) - processed_batch["input_ids"] = text_inputs["input_ids"] - processed_batch["attention_mask"] = text_inputs["attention_mask"] - if self.use_qformer: - qformer_text_inputs = self.processor.qformer_tokenizer( - [example[key] for example in features], padding="max_length", return_tensors="pt", - max_length=512, - ) - processed_batch["qformer_input_ids"] = qformer_text_inputs["input_ids"] - processed_batch["qformer_attention_mask"] = qformer_text_inputs["attention_mask"] - elif key == 'labels': - text_inputs = self.processor.tokenizer( - [example[key] for example in features], padding="max_length", return_tensors="pt", - max_length=512, - ) - processed_batch["labels"] = text_inputs["input_ids"] - return processed_batch + def __call__(self, examples): + texts = [] + images = [] + for example in examples: + if len(example["images"]) > 1: + raise ValueError("This collator only supports one image per example") + messages = example["messages"] + text = 
self.processor.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=False + ) + texts.append(text) + images.append(example["images"][0]) + + batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True) + + labels = batch["input_ids"].clone() + if self.processor.tokenizer.pad_token_id is not None: + labels[labels == self.processor.tokenizer.pad_token_id] = -100 + batch["labels"] = labels + + return batch diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py index 96b86b44..f094e609 100644 --- a/src/llmtuner/train/sftmm/trainer.py +++ b/src/llmtuner/train/sftmm/trainer.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import numpy as np import torch -from transformers import Seq2SeqTrainer +from transformers import Seq2SeqTrainer, Trainer from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger @@ -32,23 +32,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - # def compute_loss(self, model, inputs, return_outputs=False): - # print(inputs.keys()) - # device = "cuda" - # input_ids = inputs.get("input_ids").to(device) - # pixel_values = inputs.get("pixel_values").to(device, torch.float16) - # attention_mask = inputs.get("attention_mask").to(device) - # labels = inputs.get("labels").to(device) - # - # outputs = model(input_ids=input_ids, - # pixel_values=pixel_values, - # labels=labels, - # # attention_mask=attention_mask, - # ) - # loss = outputs.loss - # print("Loss:", loss.item()) - # return (loss, outputs) if return_outputs else loss - def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) @@ -59,79 +42,3 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): ) -> "torch.optim.lr_scheduler.LRScheduler": create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) - - def prediction_step( - self, - model: "torch.nn.Module", - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: - r""" - Removes the prompt part in the generated tokens. - - Subclass and override to inject custom behavior. - """ - labels = inputs["labels"].detach().clone() if "labels" in inputs else None # backup labels - if self.args.predict_with_generate: - assert self.tokenizer.padding_side == "left", "This method only accepts left-padded tensor." 
- prompt_len, label_len = inputs["input_ids"].size(-1), inputs["labels"].size(-1) - if prompt_len > label_len: - inputs["labels"] = self._pad_tensors_to_target_len(inputs["labels"], inputs["input_ids"]) - if label_len > prompt_len: # truncate the labels instead of padding the inputs (llama2 fp16 compatibility) - inputs["labels"] = inputs["labels"][:, :prompt_len] - - loss, generated_tokens, _ = super().prediction_step( # ignore the returned labels (may be truncated) - model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys - ) - if generated_tokens is not None and self.args.predict_with_generate: - generated_tokens[:, :prompt_len] = self.tokenizer.pad_token_id - generated_tokens = generated_tokens.contiguous() - - return loss, generated_tokens, labels - - def _pad_tensors_to_target_len(self, src_tensor: torch.Tensor, tgt_tensor: torch.Tensor) -> torch.Tensor: - r""" - Pads the tensor to the same length as the target tensor. - """ - assert self.tokenizer.pad_token_id is not None, "Pad token is required." - padded_tensor = self.tokenizer.pad_token_id * torch.ones_like(tgt_tensor) - padded_tensor[:, -src_tensor.shape[-1]:] = src_tensor # adopt left-padding - return padded_tensor.contiguous() # in contiguous memory - - def save_predictions(self, predict_results: "PredictionOutput") -> None: - r""" - Saves model predictions to `output_dir`. - - A custom behavior that not contained in Seq2SeqTrainer. - """ - if not self.is_world_process_zero(): - return - - output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl") - logger.info(f"Saving prediction results to {output_prediction_file}") - - labels = np.where( - predict_results.label_ids != IGNORE_INDEX, predict_results.label_ids, self.tokenizer.pad_token_id - ) - preds = np.where( - predict_results.predictions != IGNORE_INDEX, predict_results.predictions, self.tokenizer.pad_token_id - ) - - for i in range(len(preds)): - pad_len = np.nonzero(preds[i] != self.tokenizer.pad_token_id)[0] - if len(pad_len): - preds[i] = np.concatenate( - (preds[i][pad_len[0]:], preds[i][: pad_len[0]]), axis=-1 - ) # move pad token to last - - decoded_labels = self.tokenizer.batch_decode( - labels, skip_special_tokens=True, clean_up_tokenization_spaces=False - ) - decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True) - - with open(output_prediction_file, "w", encoding="utf-8") as writer: - res: List[str] = [] - for label, pred in zip(decoded_labels, decoded_preds): - res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False)) - writer.write("\n".join(res)) diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py index 9f952772..21f4aebf 100644 --- a/src/llmtuner/train/sftmm/workflow.py +++ b/src/llmtuner/train/sftmm/workflow.py @@ -1,21 +1,14 @@ # Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py import os from typing import TYPE_CHECKING, List, Optional - -import torch -from PIL import Image -from torch.utils.data import Dataset -from transformers import DataCollatorForSeq2Seq, LlavaNextForConditionalGeneration, AutoModelForVision2Seq - from ...data import split_dataset, get_mm_dataset -from ...extras.constants import IGNORE_INDEX from ...extras.misc import get_logits_processor from ...extras.ploting import plot_loss -from ...model import load_model, load_tokenizer, load_processor, load_mm_model +from ...model import load_tokenizer, 
load_processor, load_mm_model from ..utils import create_modelcard_and_push from .metric import ComputeMetrics from .trainer import CustomSeq2SeqTrainer -from .collator import DataCollatorForVis2Seq, ImageCaptioningDataset +from .collator import DataCollatorForVis2Seq if TYPE_CHECKING: from transformers import Seq2SeqTrainingArguments, TrainerCallback @@ -32,28 +25,27 @@ def run_sft_mm( callbacks: Optional[List["TrainerCallback"]] = None, ): processor = load_processor(model_args) - tokenizer = processor.tokenizer - model = load_mm_model(processor, model_args, finetuning_args, training_args.do_train) + tokenizer = load_tokenizer(model_args) + CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}""" + tokenizer.chat_template = CHAT_TEMPLATE + processor.tokenizer = tokenizer + use_clm = True + if "blip" in model_args.model_name_or_path: + use_clm = False + model = load_mm_model(processor, model_args, finetuning_args, training_args.do_train, use_clm=use_clm) dataset = get_mm_dataset(processor, model_args, data_args, training_args, stage="sft") - if training_args.predict_with_generate: - tokenizer.padding_side = "left" # use left-padding in generation if getattr(model, "is_quantized", False) and not training_args.do_train: setattr(model, "_hf_peft_config_loaded", True) # hack here: make model compatible with prediction - splited_dataset = split_dataset(dataset, data_args, training_args) - splited_dataset['train_dataset'].set_format(type=splited_dataset['train_dataset'].format["type"], - columns=list(splited_dataset['train_dataset'].features.keys())) - splited_dataset['eval_dataset'].set_format(type=splited_dataset['eval_dataset'].format["type"], - columns=list(splited_dataset['eval_dataset'].features.keys())) - train_dataset = ImageCaptioningDataset(splited_dataset['train_dataset'], data_args.image_path, processor) - eval_dataset = ImageCaptioningDataset(splited_dataset['eval_dataset'], data_args.image_path, processor) + train_dataset = dataset + eval_dataset = dataset data_collator = DataCollatorForVis2Seq( processor=processor, - use_qformer=model_args.use_qformer, ) # Override the decoding parameters of Seq2SeqTrainer training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams + training_args.remove_unused_columns = False # Initialize our Trainer trainer = CustomSeq2SeqTrainer( @@ -67,7 +59,6 @@ def run_sft_mm( train_dataset=train_dataset, eval_dataset=eval_dataset, ) - # Keyword arguments for `model.generate` gen_kwargs = generating_args.to_dict() gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids From 0e3cc523278b0d10133699d682ca85a2b1dbdc14 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 00:56:06 +0800 Subject: [PATCH 05/29] remove conflicts Former-commit-id: 
e5750ee202eb67cf5fc54f464548e2eb43d00900 --- scripts/test_mllm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py index 882bf032..961f02bf 100644 --- a/scripts/test_mllm.py +++ b/scripts/test_mllm.py @@ -5,6 +5,7 @@ import torch from datasets import load_dataset from peft import PeftModel from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor +import shutil """usage python3 scripts/test_mllm.py \ @@ -47,15 +48,14 @@ def apply_lora(base_model_path, model_path, lora_path): model.save_pretrained(model_path) tokenizer.save_pretrained(model_path) processor.image_processor.save_pretrained(model_path) - if 'instructblip' in model_path: - processor.qformer_tokenizer.save_pretrained(model_path) + def main( - model_path: str, - dataset_name: str, - base_model_path: str = "", - lora_model_path: str = "", - do_merge: bool = False, + model_path: str, + dataset_name: str, + base_model_path: str = "", + lora_model_path: str = "", + do_merge: bool = False, ): if not os.path.exists(model_path) or do_merge: apply_lora(base_model_path, model_path, lora_model_path) From 5142349661fc7b9cbe5e30001878e2d68fa9f678 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 01:01:59 +0800 Subject: [PATCH 06/29] remove error Former-commit-id: 2bcd1c7dc3595f17ae4e2c4475196cc2d03d0e75 --- src/llmtuner/model/loader.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index f3856da7..a6c37922 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -202,29 +202,6 @@ def load_mm_model( patch_config(config, tokenizer, model_args, init_kwargs, is_trainable) model = None - if is_trainable and model_args.use_unsloth: - from unsloth import FastLanguageModel # type: ignore - - unsloth_kwargs = { - "model_name": model_args.model_name_or_path, - "max_seq_length": model_args.model_max_length, - "dtype": model_args.compute_dtype, - "load_in_4bit": model_args.quantization_bit == 4, - "token": model_args.hf_hub_token, - "device_map": {"": get_current_device()}, - "rope_scaling": getattr(config, "rope_scaling", None), - "fix_tokenizer": False, - "trust_remote_code": True, - } - try: - model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs) - except NotImplementedError: - logger.warning("Unsloth does not support model type {}.".format(getattr(config, "model_type", None))) - model_args.use_unsloth = False - - if model_args.adapter_name_or_path: - model_args.adapter_name_or_path = None - logger.warning("Unsloth does not support loading adapters.") if model is None: init_kwargs["config"] = config init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path From 00e2a272ef03fa33c6019f11fc0a588f6f0a82b9 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 08:20:41 +0800 Subject: [PATCH 07/29] merge model part to the text stream Former-commit-id: b6fcb832ddaed4647d6f2b926f3dfccd47f3ea84 --- src/llmtuner/hparams/model_args.py | 4 + src/llmtuner/model/__init__.py | 3 +- src/llmtuner/model/adapter.py | 107 +-------------------------- src/llmtuner/model/loader.py | 75 ++++--------------- src/llmtuner/train/sftmm/workflow.py | 7 +- 5 files changed, 24 insertions(+), 172 deletions(-) diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index b60492a0..a6e4b710 100644 --- a/src/llmtuner/hparams/model_args.py +++ 
b/src/llmtuner/hparams/model_args.py @@ -169,6 +169,10 @@ class ModelArguments: default=False, metadata={"help": "For debugging purposes, print the status of the parameters in the model."}, ) + use_mllm: bool = field( + default=False, + metadata={"help": "Whether use Multimodal LLM."}, + ) def __post_init__(self): self.compute_dtype = None diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py index 2bd73365..f6be60d8 100644 --- a/src/llmtuner/model/__init__.py +++ b/src/llmtuner/model/__init__.py @@ -1,10 +1,9 @@ -from .loader import load_config, load_model, load_tokenizer, load_mm_model +from .loader import load_config, load_model, load_tokenizer, load_processor from .utils.misc import find_all_linear_modules, load_valuehead_params __all__ = [ "load_config", "load_model", - "load_mm_model", "load_tokenizer", "load_processor", "load_valuehead_params", diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py index 8079c028..bcefee92 100644 --- a/src/llmtuner/model/adapter.py +++ b/src/llmtuner/model/adapter.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Union import torch from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model @@ -21,11 +21,11 @@ logger = get_logger(__name__) def init_adapter( config: "PretrainedConfig", - model: "PreTrainedModel", + model: Union["PreTrainedModel","AutoModelForVision2Seq"], model_args: "ModelArguments", finetuning_args: "FinetuningArguments", is_trainable: bool, -) -> "PreTrainedModel": +) -> Union["PreTrainedModel","AutoModelForVision2Seq"]: r""" Initializes the adapters. @@ -195,103 +195,4 @@ def init_adapter( if model_args.adapter_name_or_path is not None: logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) - return model - - -def init_mm_adapter( - model: "AutoModelForVision2Seq", model_args: "ModelArguments", - finetuning_args: "FinetuningArguments", - is_trainable: bool, - use_clm=True, -) -> "AutoModelForVision2Seq": - if finetuning_args.finetuning_type == "lora": - logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA")) - adapter_to_resume = None - - if model_args.adapter_name_or_path is not None: - is_mergeable = True - if getattr(model, "quantization_method", None): # merge lora in quantized model is unstable - assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter." - is_mergeable = False - - if is_deepspeed_zero3_enabled(): - assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3." 
- is_mergeable = False - - if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable): - adapter_to_merge = model_args.adapter_name_or_path[:-1] - adapter_to_resume = model_args.adapter_name_or_path[-1] - else: - adapter_to_merge = model_args.adapter_name_or_path - - for adapter in adapter_to_merge: - model: "LoraModel" = PeftModel.from_pretrained( - model, adapter, offload_folder=model_args.offload_folder - ) - model = model.merge_and_unload() - - if len(adapter_to_merge) > 0: - logger.info("Merged {} adapter(s).".format(len(adapter_to_merge))) - - if adapter_to_resume is not None: # resume lora training - model = PeftModel.from_pretrained( - model, adapter_to_resume, is_trainable=is_trainable, offload_folder=model_args.offload_folder - ) - - if is_trainable and adapter_to_resume is None: # create new lora weights while training - if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": - target_modules = find_all_linear_modules(model) - else: - target_modules = finetuning_args.lora_target - - if finetuning_args.use_llama_pro: - target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable) - - if ( - finetuning_args.use_dora - and getattr(model, "quantization_method", None) is not None - and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES - ): - raise ValueError("DoRA is not compatible with PTQ-quantized models.") - - peft_kwargs = { - "r": finetuning_args.lora_rank, - "target_modules": target_modules, - "lora_alpha": finetuning_args.lora_alpha, - "lora_dropout": finetuning_args.lora_dropout, - "use_rslora": finetuning_args.use_rslora, - "modules_to_save": finetuning_args.additional_target, - } - - if model_args.use_unsloth: - from unsloth import FastLanguageModel # type: ignore - - unsloth_peft_kwargs = { - "model": model, - "max_seq_length": model_args.model_max_length, - "use_gradient_checkpointing": "unsloth", - } - model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs) - else: - if use_clm: - lora_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, - inference_mode=False, - use_dora=finetuning_args.use_dora, - **peft_kwargs, - ) - else: - lora_config = LoraConfig( - inference_mode=False, - use_dora=finetuning_args.use_dora, - **peft_kwargs, - ) - model = get_peft_model(model, lora_config) - - if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam): - for param in filter(lambda p: p.requires_grad, model.parameters()): - param.data = param.data.to(torch.float32) - - if model_args.adapter_name_or_path is not None: - logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) - return model + return model \ No newline at end of file diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index a6c37922..3712a592 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -1,11 +1,11 @@ -from typing import TYPE_CHECKING, Any, Dict +from typing import TYPE_CHECKING, Any, Dict, Union from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModelForVision2Seq from trl import AutoModelForCausalLMWithValueHead from ..extras.logging import get_logger from ..extras.misc import count_parameters, try_download_model_from_ms -from .adapter import init_adapter, init_mm_adapter +from .adapter import init_adapter from .patcher import patch_config, patch_model, patch_tokenizer, patch_valuehead_model from .utils.misc import load_valuehead_params, 
register_autoclass from .utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model @@ -106,12 +106,12 @@ def load_config(model_args: "ModelArguments") -> "PretrainedConfig": def load_model( - tokenizer: "PreTrainedTokenizer", - model_args: "ModelArguments", - finetuning_args: "FinetuningArguments", - is_trainable: bool = False, - add_valuehead: bool = False, -) -> "PreTrainedModel": + tokenizer: "PreTrainedTokenizer", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: bool = False, + add_valuehead: bool = False, +) -> Union["PreTrainedModel", "AutoModelForVision2Seq"]: r""" Loads pretrained model. """ @@ -134,7 +134,10 @@ def load_model( if model_args.mixture_of_depths == "load": model = load_mod_pretrained_model(**init_kwargs) else: - model = AutoModelForCausalLM.from_pretrained(**init_kwargs) + if model_args.use_mllm: + model = AutoModelForVision2Seq.from_pretrained(**init_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained(**init_kwargs) if model_args.mixture_of_depths == "convert": model = convert_pretrained_model_to_mod(model, config, model_args) @@ -182,56 +185,4 @@ def load_model( ) ) - return model - - -def load_mm_model( - processor: "AutoProcessor", - model_args: "ModelArguments", - finetuning_args: "FinetuningArguments", - is_trainable: bool = False, - add_valuehead: bool = False, - use_clm=True, -) -> "AutoModelForVision2Seq": - r""" - Loads pretrained model. Must after load_tokenizer. - """ - tokenizer = processor.tokenizer - init_kwargs = _get_init_kwargs(model_args) - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) - patch_config(config, tokenizer, model_args, init_kwargs, is_trainable) - - model = None - if model is None: - init_kwargs["config"] = config - init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path - model: "AutoModelForVision2Seq" = AutoModelForVision2Seq.from_pretrained(**init_kwargs) - patch_model(model, tokenizer, model_args, is_trainable) - register_autoclass(config, model, tokenizer) - - model = init_mm_adapter(model, model_args, finetuning_args, is_trainable, use_clm) - - if not is_trainable: - model.requires_grad_(False) - model.eval() - else: - model.train() - - trainable_params, all_param = count_parameters(model) - if is_trainable: - param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( - trainable_params, all_param, 100 * trainable_params / all_param - ) - else: - param_stats = "all params: {:d}".format(all_param) - logger.info(param_stats) - - if model_args.print_param_status: - for name, param in model.named_parameters(): - print( - "name: {}, dtype: {}, device: {}, trainable: {}".format( - name, param.dtype, param.device, param.requires_grad - ) - ) - - return model + return model \ No newline at end of file diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py index 21f4aebf..7afd8f6f 100644 --- a/src/llmtuner/train/sftmm/workflow.py +++ b/src/llmtuner/train/sftmm/workflow.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, List, Optional from ...data import split_dataset, get_mm_dataset from ...extras.misc import get_logits_processor from ...extras.ploting import plot_loss -from ...model import load_tokenizer, load_processor, load_mm_model +from ...model import load_tokenizer, load_processor, load_model from ..utils import create_modelcard_and_push from .metric import ComputeMetrics from .trainer import CustomSeq2SeqTrainer @@ -29,10 +29,7 @@ def 
run_sft_mm( CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}""" tokenizer.chat_template = CHAT_TEMPLATE processor.tokenizer = tokenizer - use_clm = True - if "blip" in model_args.model_name_or_path: - use_clm = False - model = load_mm_model(processor, model_args, finetuning_args, training_args.do_train, use_clm=use_clm) + model = load_model(processor.tokenizer, model_args, finetuning_args, training_args.do_train) dataset = get_mm_dataset(processor, model_args, data_args, training_args, stage="sft") if getattr(model, "is_quantized", False) and not training_args.do_train: setattr(model, "_hf_peft_config_loaded", True) # hack here: make model compatible with prediction From 3c792174dbc46f2b92f5fe5075317f9c1a1956e7 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 19:19:59 +0800 Subject: [PATCH 08/29] merge data part to the text stream Former-commit-id: 7ee20286d9bcc2d5378bfd6bb02cd3648396d873 --- examples/mllm/sft_instructblip.sh | 32 ---- scripts/test_mllm.py | 14 +- src/llmtuner/data/__init__.py | 3 +- src/llmtuner/data/aligner.py | 118 +++++++++++-- src/llmtuner/data/loader.py | 103 +++++------ src/llmtuner/data/parser.py | 37 +++- src/llmtuner/data/preprocess.py | 185 +++++++++++++++++--- src/llmtuner/data/template.py | 248 +++++++++++++++++++++------ src/llmtuner/hparams/model_args.py | 115 ++++++++++--- src/llmtuner/model/adapter.py | 100 ++++++++--- src/llmtuner/model/loader.py | 26 ++- src/llmtuner/train/sftmm/collator.py | 15 +- src/llmtuner/train/sftmm/workflow.py | 90 +++++++--- 13 files changed, 802 insertions(+), 284 deletions(-) delete mode 100644 examples/mllm/sft_instructblip.sh diff --git a/examples/mllm/sft_instructblip.sh b/examples/mllm/sft_instructblip.sh deleted file mode 100644 index b3923655..00000000 --- a/examples/mllm/sft_instructblip.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage sft_mm \ - --do_train \ - --model_name_or_path Salesforce/instructblip-vicuna-7b \ - --dataset mllm_instruct_example \ - --dataset_dir data \ - --template default \ - --finetuning_type lora \ - --lora_target all \ - --output_dir saves/instructblip-vicuna-7b/lora/sft \ - --overwrite_cache \ - --overwrite_output_dir \ - --cutoff_len 1024 \ - --preprocessing_num_workers 16 \ - --per_device_train_batch_size 3 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 1 \ - --lr_scheduler_type cosine \ - --logging_steps 1 \ - --warmup_steps 20 \ - --save_steps 100 \ - --eval_steps 100 \ - --evaluation_strategy steps \ - --load_best_model_at_end \ - --learning_rate 1e-5 \ - --num_train_epochs 50 \ - --max_samples 3000 \ - --val_size 0.1 \ - --plot_loss \ - --bf16 \ No newline at end of file diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py index 961f02bf..94d8670b 100644 --- a/scripts/test_mllm.py +++ b/scripts/test_mllm.py @@ -29,7 +29,10 @@ def get_processor(model_path): def 
apply_lora(base_model_path, model_path, lora_path): print(f"Loading the base model from {base_model_path}") base_model = AutoModelForVision2Seq.from_pretrained( - base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="cuda", + base_model_path, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + device_map="cuda", ) processor = get_processor(base_model_path) tokenizer = processor.tokenizer @@ -60,11 +63,14 @@ def main( if not os.path.exists(model_path) or do_merge: apply_lora(base_model_path, model_path, lora_model_path) model = AutoModelForVision2Seq.from_pretrained( - model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, device_map="cuda" + model_path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + device_map="cuda", ) processor = get_processor(model_path) raw_datasets = load_dataset(dataset_name) - train_dataset = raw_datasets['train'] + train_dataset = raw_datasets["train"] examples = train_dataset.select(range(3)) texts = [] images = [] @@ -81,5 +87,5 @@ def main( print(res) -if __name__ == '__main__': +if __name__ == "__main__": fire.Fire(main) diff --git a/src/llmtuner/data/__init__.py b/src/llmtuner/data/__init__.py index 27a2f3b8..00a82d73 100644 --- a/src/llmtuner/data/__init__.py +++ b/src/llmtuner/data/__init__.py @@ -1,12 +1,11 @@ from .collator import PairwiseDataCollatorWithPadding -from .loader import get_dataset, get_mm_dataset +from .loader import get_dataset from .template import Template, get_template_and_fix_tokenizer, templates from .utils import Role, split_dataset __all__ = [ "PairwiseDataCollatorWithPadding", "get_dataset", - "get_mm_dataset", "Template", "get_template_and_fix_tokenizer", "templates", diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index 4de37e6d..85202ea8 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -13,7 +13,9 @@ if TYPE_CHECKING: from .parser import DatasetAttr -def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: +def convert_alpaca( + examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" +) -> Dict[str, List[Any]]: outputs = {"prompt": [], "response": [], "system": [], "tools": []} for i in range(len(examples[dataset_attr.prompt])): prompt = [] @@ -31,24 +33,38 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") prompt.append({"role": Role.USER.value, "content": "\n".join(content)}) - if dataset_attr.response and isinstance(examples[dataset_attr.response][i], list): + if dataset_attr.response and isinstance( + examples[dataset_attr.response][i], list + ): response = [ - {"role": Role.ASSISTANT.value, "content": content} for content in examples[dataset_attr.response][i] + {"role": Role.ASSISTANT.value, "content": content} + for content in examples[dataset_attr.response][i] + ] + elif dataset_attr.response and isinstance( + examples[dataset_attr.response][i], str + ): + response = [ + { + "role": Role.ASSISTANT.value, + "content": examples[dataset_attr.response][i], + } ] - elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str): - response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}] else: response = [] outputs["prompt"].append(prompt) outputs["response"].append(response) - outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") + outputs["system"].append( + examples[dataset_attr.system][i] if dataset_attr.system else "" + ) outputs["tools"].append("") 
- + outputs["images"].append([]) return outputs -def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: +def convert_sharegpt( + examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" +) -> Dict[str, List[Any]]: outputs = {"prompt": [], "response": [], "system": [], "tools": []} tag_mapping = { dataset_attr.user_tag: Role.USER.value, @@ -61,7 +77,10 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag) accept_tags = (odd_tags, even_tags) for i, messages in enumerate(examples[dataset_attr.messages]): - if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag: + if ( + dataset_attr.system_tag + and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag + ): system = messages[0][dataset_attr.content_tag] messages = messages[1:] else: @@ -77,19 +96,81 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" raise ValueError("Invalid role tag in {}.".format(messages)) aligned_messages.append( - {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} + { + "role": tag_mapping[message[dataset_attr.role_tag]], + "content": message[dataset_attr.content_tag], + } ) outputs["prompt"].append(aligned_messages[:-1]) outputs["response"].append(aligned_messages[-1:]) outputs["system"].append(system) - outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") + outputs["tools"].append( + examples[dataset_attr.tools][i] if dataset_attr.tools else "" + ) + outputs["images"].append([]) + + return outputs + + +def convert_llava( + examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" +) -> Dict[str, List[Any]]: + outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []} + tag_mapping = { + dataset_attr.user_tag: Role.USER.value, + dataset_attr.assistant_tag: Role.ASSISTANT.value, + dataset_attr.observation_tag: Role.OBSERVATION.value, + dataset_attr.function_tag: Role.FUNCTION.value, + dataset_attr.system_tag: Role.SYSTEM.value, + } + odd_tags = (dataset_attr.user_tag, dataset_attr.observation_tag) + even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag) + accept_tags = (odd_tags, even_tags) + for i, messages in enumerate(examples[dataset_attr.messages]): + if ( + dataset_attr.system_tag + and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag + ): + system = messages[0][dataset_attr.content_tag] + messages = messages[1:] + else: + system = examples[dataset_attr.system][i] if dataset_attr.system else "" + + messages = messages[: len(messages) // 2 * 2] # should be multiples of 2 + if len(messages) == 0: + continue + + aligned_messages = [] + for turn_idx, message in enumerate(messages): + if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]: + raise ValueError("Invalid role tag in {}.".format(messages)) + + aligned_messages.append( + { + "role": tag_mapping[message[dataset_attr.role_tag]], + "content": message[dataset_attr.content_tag], + } + ) + + outputs["prompt"].append(aligned_messages[:-1]) + outputs["response"].append(aligned_messages[-1:]) + outputs["system"].append(system) + outputs["tools"].append( + examples[dataset_attr.tools][i] if dataset_attr.tools else "" + ) + print(examples[dataset_attr.images][i]) + outputs["images"].append( + examples[dataset_attr.images][i] if dataset_attr.images else [] + ) return outputs def align_dataset( - 
dataset: Union["Dataset", "IterableDataset"], dataset_attr: "DatasetAttr", data_args: "DataArguments" + dataset: Union["Dataset", "IterableDataset"], + dataset_attr: "DatasetAttr", + data_args: "DataArguments", ) -> Union["Dataset", "IterableDataset"]: r""" Aligned dataset: @@ -100,6 +181,8 @@ def align_dataset( """ if dataset_attr.formatting == "alpaca": convert_func = partial(convert_alpaca, dataset_attr=dataset_attr) + elif dataset_attr.formatting == "llava": + convert_func = partial(convert_llava, dataset_attr=dataset_attr) else: convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr) @@ -107,13 +190,20 @@ def align_dataset( features = Features.from_dict( { "prompt": [ - {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}} + { + "role": {"dtype": "string", "_type": "Value"}, + "content": {"dtype": "string", "_type": "Value"}, + } ], "response": [ - {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}} + { + "role": {"dtype": "string", "_type": "Value"}, + "content": {"dtype": "string", "_type": "Value"}, + } ], "system": {"dtype": "string", "_type": "Value"}, "tools": {"dtype": "string", "_type": "Value"}, + "images": {"feature": {"_type": "Image"}, "_type": "Sequence"}, } ) kwargs = {} diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 18665731..c373e196 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -1,6 +1,6 @@ import inspect import os -from typing import TYPE_CHECKING, Literal, Union +from typing import TYPE_CHECKING, Literal, Union, Optional from datasets import load_dataset, load_from_disk @@ -25,9 +25,9 @@ logger = get_logger(__name__) def load_single_dataset( - dataset_attr: "DatasetAttr", - model_args: "ModelArguments", - data_args: "DataArguments", + dataset_attr: "DatasetAttr", + model_args: "ModelArguments", + data_args: "DataArguments", ) -> Union["Dataset", "IterableDataset"]: logger.info("Loading dataset {}...".format(dataset_attr)) data_path, data_name, data_dir, data_files = None, None, None, None @@ -78,14 +78,20 @@ def load_single_dataset( split=data_args.split, cache_dir=cache_dir, token=model_args.ms_hub_token, - use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")), + use_streaming=( + data_args.streaming and (dataset_attr.load_from != "file") + ), ) if isinstance(dataset, MsDataset): dataset = dataset.to_hf_dataset() except ImportError: - raise ImportError("Please install modelscope via `pip install modelscope -U`") + raise ImportError( + "Please install modelscope via `pip install modelscope -U`" + ) else: - if "trust_remote_code" in inspect.signature(load_dataset).parameters: # for datasets==2.16.0 + if ( + "trust_remote_code" in inspect.signature(load_dataset).parameters + ): # for datasets==2.16.0 kwargs = {"trust_remote_code": True} else: kwargs = {} @@ -102,7 +108,9 @@ def load_single_dataset( **kwargs, ) - if data_args.streaming and (dataset_attr.load_from == "file"): # faster than specifying streaming=True + if data_args.streaming and ( + dataset_attr.load_from == "file" + ): # faster than specifying streaming=True dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter if data_args.max_samples is not None: # truncate dataset @@ -113,11 +121,12 @@ def load_single_dataset( def get_dataset( - tokenizer: "PreTrainedTokenizer", - model_args: "ModelArguments", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - stage: Literal["pt", "sft", 
"rm", "ppo"], + tokenizer: "PreTrainedTokenizer", + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["pt", "sft", "rm", "ppo"], + processor: Optional["AutoProcessor"] = None, ) -> Union["Dataset", "IterableDataset"]: template = get_template_and_fix_tokenizer(tokenizer, data_args.template) if data_args.train_on_prompt and template.efficient_eos: @@ -126,9 +135,13 @@ def get_dataset( # Load tokenized dataset if data_args.tokenized_path is not None: if has_tokenized_data(data_args.tokenized_path): - logger.warning("Loading dataset from disk will ignore other data arguments.") + logger.warning( + "Loading dataset from disk will ignore other data arguments." + ) dataset = load_from_disk(data_args.tokenized_path) - logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path)) + logger.info( + "Loaded tokenized dataset from {}.".format(data_args.tokenized_path) + ) if data_args.streaming: dataset = dataset.to_iterable_dataset() return dataset @@ -139,15 +152,21 @@ def get_dataset( with training_args.main_process_first(desc="load dataset"): all_datasets = [] for dataset_attr in get_dataset_list(data_args): - if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True): - raise ValueError("The dataset is not applicable in the current training stage.") + if (stage == "rm" and dataset_attr.ranking is False) or ( + stage != "rm" and dataset_attr.ranking is True + ): + raise ValueError( + "The dataset is not applicable in the current training stage." + ) - all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args)) + all_datasets.append( + load_single_dataset(dataset_attr, model_args, data_args) + ) dataset = merge_dataset(all_datasets, data_args, training_args) with training_args.main_process_first(desc="pre-process dataset"): preprocess_func, print_function = get_preprocess_and_print_func( - tokenizer, template, data_args, training_args, stage + tokenizer, template, data_args, training_args, stage, processor ) column_names = list(next(iter(dataset)).keys()) kwargs = {} @@ -158,13 +177,21 @@ def get_dataset( desc="Running tokenizer on dataset", ) - dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs) + dataset = dataset.map( + preprocess_func, batched=True, remove_columns=column_names, **kwargs + ) if data_args.tokenized_path is not None: if training_args.should_save: dataset.save_to_disk(data_args.tokenized_path) - logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path)) - logger.info("Please restart the training with `--tokenized_path {}`.".format(data_args.tokenized_path)) + logger.info( + "Tokenized dataset saved at {}.".format(data_args.tokenized_path) + ) + logger.info( + "Please restart the training with `--tokenized_path {}`.".format( + data_args.tokenized_path + ) + ) exit(0) @@ -172,34 +199,8 @@ def get_dataset( try: print_function(next(iter(dataset))) except StopIteration: - raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.") + raise RuntimeError( + "Cannot find valid samples, check `data/README.md` for the data format." 
+ ) return dataset - - -def get_mm_dataset( - processor: "AutoProcessor", - model_args: "ModelArguments", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - stage: Literal["pt", "sft", "rm", "ppo"], -) -> Union["Dataset", "IterableDataset"]: - if data_args.tokenized_path is not None: - if has_tokenized_data(data_args.tokenized_path): - logger.warning("Loading dataset from disk will ignore other data arguments.") - dataset = load_from_disk(data_args.tokenized_path) - logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path)) - if data_args.streaming: - dataset = dataset.to_iterable_dataset() - return dataset - - if data_args.streaming: - raise ValueError("Turn off `streaming` when saving dataset to disk.") - - with training_args.main_process_first(desc="load dataset"): - all_datasets = [] - for dataset_attr in get_dataset_list(data_args): - all_datasets.append(load_dataset(dataset_attr.dataset_name)['train']) - dataset = merge_dataset(all_datasets, data_args, training_args) - - return dataset diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py index b9c8782a..79d6ed4e 100644 --- a/src/llmtuner/data/parser.py +++ b/src/llmtuner/data/parser.py @@ -25,7 +25,7 @@ class DatasetAttr: subset: Optional[str] = None folder: Optional[str] = None ranking: bool = False - formatting: Literal["alpaca", "sharegpt"] = "alpaca" + formatting: Literal["alpaca", "sharegpt", "llava"] = "alpaca" """ columns """ system: Optional[str] = None """ columns for the alpaca format """ @@ -44,11 +44,15 @@ class DatasetAttr: observation_tag: Optional[str] = "observation" function_tag: Optional[str] = "function_call" system_tag: Optional[str] = "system" + """ columns for the mllm format """ + images: Optional[str] = None def __repr__(self) -> str: return self.dataset_name - def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None: + def set_attr( + self, key: str, obj: Dict[str, Any], default: Optional[Any] = None + ) -> None: setattr(self, key, obj.get(key, default)) @@ -67,12 +71,16 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: except Exception as err: if len(dataset_names) != 0: raise ValueError( - "Cannot open {} due to {}.".format(os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err)) + "Cannot open {} due to {}.".format( + os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err) + ) ) dataset_info = None if data_args.interleave_probs is not None: - data_args.interleave_probs = [float(prob.strip()) for prob in data_args.interleave_probs.split(",")] + data_args.interleave_probs = [ + float(prob.strip()) for prob in data_args.interleave_probs.split(",") + ] dataset_list: List[DatasetAttr] = [] for name in dataset_names: @@ -90,31 +98,42 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: if has_hf_url or has_ms_url: if (use_modelscope() and has_ms_url) or (not has_hf_url): - dataset_attr = DatasetAttr("ms_hub", dataset_name=dataset_info[name]["ms_hub_url"]) + dataset_attr = DatasetAttr( + "ms_hub", dataset_name=dataset_info[name]["ms_hub_url"] + ) else: - dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"]) + dataset_attr = DatasetAttr( + "hf_hub", dataset_name=dataset_info[name]["hf_hub_url"] + ) elif "script_url" in dataset_info[name]: - dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"]) + dataset_attr = DatasetAttr( + "script", dataset_name=dataset_info[name]["script_url"] + ) else: - dataset_attr 
= DatasetAttr("file", dataset_name=dataset_info[name]["file_name"]) + dataset_attr = DatasetAttr( + "file", dataset_name=dataset_info[name]["file_name"] + ) dataset_attr.set_attr("file_sha1", dataset_info[name]) dataset_attr.set_attr("subset", dataset_info[name]) dataset_attr.set_attr("folder", dataset_info[name]) dataset_attr.set_attr("ranking", dataset_info[name], default=False) dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca") + dataset_attr.set_attr("images", dataset_info[name], default="") if "columns" in dataset_info[name]: column_names = ["system"] if dataset_attr.formatting == "alpaca": column_names.extend(["prompt", "query", "response", "history"]) + elif dataset_attr.formatting == "llava": + column_names.extend(["messages", "images"]) else: column_names.extend(["messages", "tools"]) for column_name in column_names: dataset_attr.set_attr(column_name, dataset_info[name]["columns"]) - if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]: + if dataset_attr.formatting != "alpaca" and "tags" in dataset_info[name]: tag_names = ( "role_tag", "content_tag", diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index 8494ba7e..dc72483f 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -1,6 +1,6 @@ from functools import partial from itertools import chain -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Tuple +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Tuple, Optional from ..extras.constants import IGNORE_INDEX from ..extras.logging import get_logger @@ -9,7 +9,7 @@ from .utils import Role if TYPE_CHECKING: from transformers import Seq2SeqTrainingArguments - from transformers.tokenization_utils import PreTrainedTokenizer + from transformers.tokenization_utils import PreTrainedTokenizer, AutoProcessor from ..hparams import DataArguments from .template import Template @@ -19,19 +19,27 @@ logger = get_logger(__name__) def preprocess_pretrain_dataset( - examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" + examples: Dict[str, List[Any]], + tokenizer: "PreTrainedTokenizer", + data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled - text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]] + text_examples = [ + messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"] + ] if not data_args.packing: if data_args.template == "gemma": text_examples = [tokenizer.bos_token + example for example in text_examples] - result = tokenizer(text_examples, add_special_tokens=False, max_length=data_args.cutoff_len) + result = tokenizer( + text_examples, add_special_tokens=False, max_length=data_args.cutoff_len + ) else: tokenized_examples = tokenizer(text_examples, add_special_tokens=False) - concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()} + concatenated_examples = { + k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys() + } total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]]) block_size = data_args.cutoff_len total_length = (total_length // block_size) * block_size @@ -54,7 +62,11 @@ def preprocess_supervised_dataset( ) -> Dict[str, List[List[int]]]: # build inputs with format ` X Y ` and labels with format ` ... 
Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. - model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} + model_inputs = { + "input_ids": [], + "attention_mask": [], + "labels": [], + } for i in range(len(examples["prompt"])): if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: @@ -75,7 +87,9 @@ def preprocess_supervised_dataset( if data_args.train_on_prompt: source_mask = source_ids elif turn_idx != 0 and template.efficient_eos: - source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) + source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * ( + len(source_ids) - 1 + ) else: source_mask = [IGNORE_INDEX] * len(source_ids) @@ -114,7 +128,9 @@ def preprocess_packed_supervised_dataset( if data_args.train_on_prompt: source_mask = source_ids elif len(input_ids) != 0 and template.efficient_eos: - source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) + source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * ( + len(source_ids) - 1 + ) else: source_mask = [IGNORE_INDEX] * len(source_ids) @@ -139,6 +155,64 @@ def preprocess_packed_supervised_dataset( return model_inputs +def preprocess_multimodal_supervised_dataset( + examples: Dict[str, List[Any]], + processor: "AutoProcessor", + template: "Template", + data_args: "DataArguments", +) -> Dict[str, List[List[int]]]: + # build inputs with format ` X Y ` and labels with format ` ... Y ` + # for multiturn examples, we only mask the prompt part in each prompt-response pair. + tokenizer = processor.tokenizer + model_inputs = { + "input_ids": [], + "attention_mask": [], + "labels": [], + "pixel_values": [], + } + + for i in range(len(examples["prompt"])): + if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: + continue + + messages = examples["prompt"][i] + examples["response"][i] + input_ids, labels = [], [] + for turn_idx, (source_ids, target_ids) in enumerate( + template.encode_multiturn( + tokenizer, + messages, + examples["system"][i], + examples["tools"][i], + data_args.cutoff_len, + data_args.reserved_label_len, + ) + ): + if data_args.train_on_prompt: + source_mask = source_ids + elif turn_idx != 0 and template.efficient_eos: + source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * ( + len(source_ids) - 1 + ) + else: + source_mask = [IGNORE_INDEX] * len(source_ids) + + input_ids += source_ids + target_ids + labels += source_mask + target_ids + + if template.efficient_eos: + input_ids += [tokenizer.eos_token_id] + labels += [tokenizer.eos_token_id] + + model_inputs["input_ids"].append(input_ids) + model_inputs["attention_mask"].append([1] * len(input_ids)) + model_inputs["labels"].append(labels) + pixel_values = processor.image_processor( + examples["images"][0], return_tensors="pt" + )["pixel_values"][0] + model_inputs["pixel_values"].append(pixel_values) + return model_inputs + + def preprocess_unsupervised_dataset( examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", @@ -155,7 +229,9 @@ def preprocess_unsupervised_dataset( if len(examples["response"][i]) == 1: messages = examples["prompt"][i] + examples["response"][i] else: - messages = examples["prompt"][i] + [{"role": Role.ASSISTANT.value, "content": ""}] + messages = examples["prompt"][i] + [ + {"role": Role.ASSISTANT.value, "content": ""} + ] input_ids, labels = template.encode_oneturn( tokenizer, @@ -218,29 +294,58 @@ def preprocess_pairwise_dataset( return model_inputs -def print_supervised_dataset_example(example: 
Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: +def print_supervised_dataset_example( + example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer" +) -> None: print("input_ids:\n{}".format(example["input_ids"])) - print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) + print( + "inputs:\n{}".format( + tokenizer.decode(example["input_ids"], skip_special_tokens=False) + ) + ) print("label_ids:\n{}".format(example["labels"])) print( "labels:\n{}".format( - tokenizer.decode(list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), skip_special_tokens=False) + tokenizer.decode( + list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), + skip_special_tokens=False, + ) ) ) -def print_pairwise_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: +def print_pairwise_dataset_example( + example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer" +) -> None: print("prompt_ids:\n{}".format(example["prompt_ids"])) - print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False))) + print( + "prompt:\n{}".format( + tokenizer.decode(example["prompt_ids"], skip_special_tokens=False) + ) + ) print("chosen_ids:\n{}".format(example["chosen_ids"])) - print("chosen:\n{}".format(tokenizer.decode(example["chosen_ids"], skip_special_tokens=False))) + print( + "chosen:\n{}".format( + tokenizer.decode(example["chosen_ids"], skip_special_tokens=False) + ) + ) print("rejected_ids:\n{}".format(example["rejected_ids"])) - print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False))) + print( + "rejected:\n{}".format( + tokenizer.decode(example["rejected_ids"], skip_special_tokens=False) + ) + ) -def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: +def print_unsupervised_dataset_example( + example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer" +) -> None: print("input_ids:\n{}".format(example["input_ids"])) - print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) + print( + "inputs:\n{}".format( + tokenizer.decode(example["input_ids"], skip_special_tokens=False) + ) + ) def get_preprocess_and_print_func( @@ -249,30 +354,56 @@ def get_preprocess_and_print_func( data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments", stage: Literal["pt", "sft", "rm", "ppo"], + processor: Optional["AutoProcessor"] = None, ) -> Tuple[Callable, Callable]: if stage == "pt": - preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args) - print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) + preprocess_func = partial( + preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args + ) + print_function = partial( + print_unsupervised_dataset_example, tokenizer=tokenizer + ) elif stage == "sft" and not training_args.predict_with_generate: if data_args.packing: preprocess_func = partial( - preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args + preprocess_packed_supervised_dataset, + tokenizer=tokenizer, + template=template, + data_args=data_args, + ) + elif processor is not None: + preprocess_func = partial( + preprocess_multimodal_supervised_dataset, + processor=processor, + template=template, + data_args=data_args, ) else: preprocess_func = partial( - preprocess_supervised_dataset, tokenizer=tokenizer, template=template, 
data_args=data_args + preprocess_supervised_dataset, + tokenizer=tokenizer, + template=template, + data_args=data_args, ) print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer) elif stage == "rm": preprocess_func = partial( - preprocess_pairwise_dataset, tokenizer=tokenizer, template=template, data_args=data_args + preprocess_pairwise_dataset, + tokenizer=tokenizer, + template=template, + data_args=data_args, ) print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer) else: preprocess_func = partial( - preprocess_unsupervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args + preprocess_unsupervised_dataset, + tokenizer=tokenizer, + template=template, + data_args=data_args, + ) + print_function = partial( + print_unsupervised_dataset_example, tokenizer=tokenizer ) - print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) - return preprocess_func, print_function \ No newline at end of file + return preprocess_func, print_function diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index 73b22eb7..311660aa 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -42,7 +42,9 @@ class Template: r""" Returns a single pair of token ids representing prompt and response respectively. """ - encoded_pairs = self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len) + encoded_pairs = self._encode( + tokenizer, messages, system, tools, cutoff_len, reserved_label_len + ) prompt_ids = [] for query_ids, resp_ids in encoded_pairs[:-1]: prompt_ids += query_ids + resp_ids @@ -62,7 +64,9 @@ class Template: r""" Returns multiple pairs of token ids representing prompts and responses respectively. """ - return self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len) + return self._encode( + tokenizer, messages, system, tools, cutoff_len, reserved_label_len + ) def _encode( self, @@ -89,7 +93,9 @@ class Template: elements += self.format_separator.apply() if message["role"] == Role.USER.value: - elements += self.format_user.apply(content=message["content"], idx=str(i // 2)) + elements += self.format_user.apply( + content=message["content"], idx=str(i // 2) + ) elif message["role"] == Role.ASSISTANT.value: elements += self.format_assistant.apply(content=message["content"]) elif message["role"] == Role.OBSERVATION.value: @@ -104,7 +110,9 @@ class Template: return self._make_pairs(encoded_messages, cutoff_len, reserved_label_len) def _convert_elements_to_ids( - self, tokenizer: "PreTrainedTokenizer", elements: List[Union[str, Dict[str, str]]] + self, + tokenizer: "PreTrainedTokenizer", + elements: List[Union[str, Dict[str, str]]], ) -> List[int]: r""" Converts elements to token ids. 
@@ -122,7 +130,11 @@ class Template: elif "eos_token" in elem and tokenizer.eos_token_id is not None: token_ids += [tokenizer.eos_token_id] else: - raise ValueError("Input must be string, set[str] or dict[str, str], got {}".format(type(elem))) + raise ValueError( + "Input must be string, set[str] or dict[str, str], got {}".format( + type(elem) + ) + ) return token_ids @@ -180,7 +192,9 @@ class Llama2Template(Template): elements += self.format_separator.apply() if message["role"] == Role.USER.value: - elements += self.format_user.apply(content=system_text + message["content"]) + elements += self.format_user.apply( + content=system_text + message["content"] + ) elif message["role"] == Role.ASSISTANT.value: elements += self.format_assistant.apply(content=message["content"]) elif message["role"] == Role.OBSERVATION.value: @@ -243,7 +257,9 @@ def _register_template( template_class = Llama2Template if name.startswith("llama2") else Template default_user_formatter = StringFormatter(slots=["{{content}}"]) default_assistant_formatter = StringFormatter(slots=["{{content}}"] + eos_slots) - default_function_formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots) + default_function_formatter = FunctionFormatter( + slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots + ) default_tool_formatter = ToolFormatter(tool_format="default") default_separator_formatter = EmptyFormatter() templates[name] = template_class( @@ -279,7 +295,9 @@ def _jinja_escape(content: str) -> str: return content.replace("\n", r"\n").replace("'", r"\'") -def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str: +def _convert_slots_to_jinja( + slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content" +) -> str: slot_items = [] for slot in slots: if isinstance(slot, str): @@ -293,7 +311,9 @@ def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", pl elif isinstance(slot, set): if "bos_token" in slot: slot_items.append("'" + tokenizer.bos_token + "'") - elif "eos_token" in slot: # do not use {{ eos_token }} since it may be replaced + elif ( + "eos_token" in slot + ): # do not use {{ eos_token }} since it may be replaced slot_items.append("'" + tokenizer.eos_token + "'") elif isinstance(slot, dict): raise ValueError("Dict is not supported.") @@ -305,25 +325,37 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") jinja_template = "" if template.default_system: - jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}" + jinja_template += ( + "{% set system_message = '" + + _jinja_escape(template.default_system) + + "' %}" + ) jinja_template += ( - "{% if messages[0]['role'] == 'system' %}" "{% set system_message = messages[0]['content'] %}" "{% endif %}" + "{% if messages[0]['role'] == 'system' %}" + "{% set system_message = messages[0]['content'] %}" + "{% endif %}" ) - system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message") + system_message = _convert_slots_to_jinja( + template.format_system.apply(), tokenizer, placeholder="system_message" + ) if isinstance(template, Llama2Template): pass elif template.force_system: jinja_template += "{{ " + system_message + " }}" else: - jinja_template += "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}" + jinja_template += ( + "{% if system_message is defined %}{{ " + system_message + " }}{% 
endif %}" + ) jinja_template += "{% for message in messages %}" jinja_template += "{% set content = message['content'] %}" if isinstance(template, Llama2Template): jinja_template += "{% if loop.index0 == 0 and system_message is defined %}" - jinja_template += "{% set content = " + system_message + " + message['content'] %}" + jinja_template += ( + "{% set content = " + system_message + " + message['content'] %}" + ) jinja_template += "{% endif %}" jinja_template += "{% if message['role'] == 'user' %}" user_message = _convert_slots_to_jinja(template.format_user.apply(), tokenizer) @@ -366,11 +398,14 @@ def get_template_and_fix_tokenizer( if stop_words: num_added_tokens = tokenizer.add_special_tokens( - dict(additional_special_tokens=stop_words), replace_additional_special_tokens=False + dict(additional_special_tokens=stop_words), + replace_additional_special_tokens=False, ) logger.info("Add {} to stop words.".format(",".join(stop_words))) if num_added_tokens > 0: - logger.warning("New tokens have been added, make sure `resize_vocab` is True.") + logger.warning( + "New tokens have been added, make sure `resize_vocab` is True." + ) try: tokenizer.chat_template = _get_jinja_template(template, tokenizer) @@ -382,7 +417,9 @@ def get_template_and_fix_tokenizer( _register_template( name="alpaca", - format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]), + format_user=StringFormatter( + slots=["### Instruction:\n{{content}}\n\n### Response:\n"] + ), format_separator=EmptyFormatter(slots=["\n\n"]), default_system=( "Below is an instruction that describes a task. " @@ -407,7 +444,13 @@ _register_template( _register_template( name="atom", format_user=StringFormatter( - slots=[{"bos_token"}, "Human: {{content}}\n", {"eos_token"}, {"bos_token"}, "Assistant:"] + slots=[ + {"bos_token"}, + "Human: {{content}}\n", + {"eos_token"}, + {"bos_token"}, + "Assistant:", + ] ), format_assistant=StringFormatter(slots=["{{content}}\n", {"eos_token"}]), ) @@ -415,7 +458,9 @@ _register_template( _register_template( name="baichuan", - format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]), + format_user=StringFormatter( + slots=[{"token": ""}, "{{content}}", {"token": ""}] + ), efficient_eos=True, ) @@ -438,7 +483,9 @@ _register_template( _register_template( name="bluelm", - format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]), + format_user=StringFormatter( + slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}] + ), ) @@ -457,7 +504,9 @@ _register_template( _register_template( name="chatglm2", format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]), - format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), + format_system=StringFormatter( + slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"] + ), format_separator=EmptyFormatter(slots=["\n\n"]), efficient_eos=True, force_system=True, @@ -466,12 +515,21 @@ _register_template( _register_template( name="chatglm3", - format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), + format_user=StringFormatter( + slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] + ), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), - format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), + format_system=StringFormatter( + slots=[{"token": "[gMASK]"}, {"token": "sop"}, 
"{{content}}"] + ), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter( - slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] + slots=[ + {"token": "<|observation|>"}, + "\n", + "{{content}}", + {"token": "<|assistant|>"}, + ] ), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, @@ -481,14 +539,27 @@ _register_template( _register_template( name="chatglm3_system", - format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), + format_user=StringFormatter( + slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] + ), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), format_system=StringFormatter( - slots=[{"token": "[gMASK]"}, {"token": "sop"}, {"token": "<|system|>"}, "\n", "{{content}}"] + slots=[ + {"token": "[gMASK]"}, + {"token": "sop"}, + {"token": "<|system|>"}, + "\n", + "{{content}}", + ] ), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter( - slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] + slots=[ + {"token": "<|observation|>"}, + "\n", + "{{content}}", + {"token": "<|assistant|>"}, + ] ), default_system=( "You are ChatGLM3, a large language model trained by Zhipu.AI. " @@ -501,9 +572,15 @@ _register_template( _register_template( name="chatml", - format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), - format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), - format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_user=StringFormatter( + slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), + format_system=StringFormatter( + slots=["<|im_start|>system\n{{content}}<|im_end|>\n"] + ), + format_observation=StringFormatter( + slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), format_separator=EmptyFormatter(slots=["\n"]), stop_words=["<|im_end|>", "<|im_start|>"], replace_eos=True, @@ -512,9 +589,15 @@ _register_template( _register_template( name="chatml_de", - format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), - format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), - format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_user=StringFormatter( + slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), + format_system=StringFormatter( + slots=["<|im_start|>system\n{{content}}<|im_end|>\n"] + ), + format_observation=StringFormatter( + slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), format_separator=EmptyFormatter(slots=["\n"]), default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.", stop_words=["<|im_end|>", "<|im_start|>"], @@ -524,7 +607,9 @@ _register_template( _register_template( name="codegeex2", - format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), + format_system=StringFormatter( + slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"] + ), force_system=True, ) @@ -554,9 +639,15 @@ _register_template( _register_template( name="dbrx", - 
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), - format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), - format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_user=StringFormatter( + slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), + format_system=StringFormatter( + slots=["<|im_start|>system\n{{content}}<|im_end|>\n"] + ), + format_observation=StringFormatter( + slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), format_separator=EmptyFormatter(slots=["\n"]), default_system=( "You are DBRX, created by Databricks. You were last updated in December 2023. " @@ -634,7 +725,9 @@ _register_template( _register_template( name="gemma", - format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]), + format_user=StringFormatter( + slots=["user\n{{content}}\nmodel\n"] + ), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), format_observation=StringFormatter( slots=["tool\n{{content}}\nmodel\n"] @@ -647,7 +740,9 @@ _register_template( _register_template( name="intern", - format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]), + format_user=StringFormatter( + slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"] + ), format_separator=EmptyFormatter(slots=[{"token": ""}, "\n"]), stop_words=[""], efficient_eos=True, @@ -656,8 +751,12 @@ _register_template( _register_template( name="intern2", - format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), - format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_user=StringFormatter( + slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), + format_system=StringFormatter( + slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"] + ), format_separator=EmptyFormatter(slots=["\n"]), default_system=( "You are an AI assistant whose name is InternLM (书生·浦语).\n" @@ -707,7 +806,10 @@ _register_template( ] ), format_system=StringFormatter( - slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"] + slots=[ + {"bos_token"}, + "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>", + ] ), format_observation=StringFormatter( slots=[ @@ -742,7 +844,13 @@ _register_template( _register_template( name="openchat", - format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]), + format_user=StringFormatter( + slots=[ + "GPT4 Correct User: {{content}}", + {"eos_token"}, + "GPT4 Correct Assistant:", + ] + ), format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), force_system=True, @@ -751,7 +859,9 @@ _register_template( _register_template( name="orion", - format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]), + format_user=StringFormatter( + slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}] + ), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), force_system=True, ) @@ -759,9 +869,15 @@ _register_template( _register_template( name="phi", - format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]), - format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]), - 
format_observation=StringFormatter(slots=["<|function_output|>\n{{content}}<|end|>\n<|assistant|>\n"]), + format_user=StringFormatter( + slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"] + ), + format_system=StringFormatter( + slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"] + ), + format_observation=StringFormatter( + slots=["<|function_output|>\n{{content}}<|end|>\n<|assistant|>\n"] + ), format_separator=EmptyFormatter(slots=["\n"]), default_system="You are a helpful AI assistant.", stop_words=["<|end|>"], @@ -771,9 +887,15 @@ _register_template( _register_template( name="qwen", - format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), - format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), - format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_user=StringFormatter( + slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), + format_system=StringFormatter( + slots=["<|im_start|>system\n{{content}}<|im_end|>\n"] + ), + format_observation=StringFormatter( + slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), format_separator=EmptyFormatter(slots=["\n"]), default_system="You are a helpful assistant.", stop_words=["<|im_end|>"], @@ -829,8 +951,12 @@ _register_template( _register_template( name="yayi", - format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]), - format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]), + format_user=StringFormatter( + slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"] + ), + format_system=StringFormatter( + slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"] + ), format_separator=EmptyFormatter(slots=["\n\n"]), default_system=( "You are a helpful, respectful and honest assistant named YaYi " @@ -849,7 +975,9 @@ _register_template( _register_template( name="yi", - format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_user=StringFormatter( + slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] + ), format_separator=EmptyFormatter(slots=["\n"]), stop_words=["<|im_end|>"], replace_eos=True, @@ -867,7 +995,9 @@ _register_template( _register_template( name="zephyr", - format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]), + format_user=StringFormatter( + slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"] + ), format_assistant=StringFormatter(slots=["\n{{content}}", {"eos_token"}]), format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]), default_system="You are a friendly chatbot who always responds in the style of a pirate", @@ -879,3 +1009,13 @@ _register_template( format_user=StringFormatter(slots=[":{{content}}\n:"]), format_separator=EmptyFormatter(slots=["\n"]), ) + +_register_template( + name="llava", + format_user=StringFormatter(slots=["USER: {{content}} "]), + format_assistant=StringFormatter(slots=["ASSISTANT: {{content}}"]), + default_system=( + "A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions." 
+ ), +) diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index a6e4b710..63fc7f02 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -15,23 +15,33 @@ class ModelArguments: ) adapter_name_or_path: Optional[str] = field( default=None, - metadata={"help": "Path to the adapter weight or identifier from huggingface.co/models."}, + metadata={ + "help": "Path to the adapter weight or identifier from huggingface.co/models." + }, ) cache_dir: Optional[str] = field( default=None, - metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."}, + metadata={ + "help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn." + }, ) use_fast_tokenizer: bool = field( default=True, - metadata={"help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."}, + metadata={ + "help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)." + }, ) resize_vocab: bool = field( default=False, - metadata={"help": "Whether or not to resize the tokenizer vocab and the embedding layers."}, + metadata={ + "help": "Whether or not to resize the tokenizer vocab and the embedding layers." + }, ) split_special_tokens: bool = field( default=False, - metadata={"help": "Whether or not the special tokens should be split during the tokenization process."}, + metadata={ + "help": "Whether or not the special tokens should be split during the tokenization process." + }, ) new_special_tokens: Optional[str] = field( default=None, @@ -39,7 +49,9 @@ class ModelArguments: ) model_revision: str = field( default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)." + }, ) low_cpu_mem_usage: bool = field( default=True, @@ -47,7 +59,9 @@ class ModelArguments: ) quantization_bit: Optional[int] = field( default=None, - metadata={"help": "The number of bits to quantize the model using bitsandbytes."}, + metadata={ + "help": "The number of bits to quantize the model using bitsandbytes." + }, ) quantization_type: Literal["fp4", "nf4"] = field( default="nf4", @@ -55,15 +69,21 @@ class ModelArguments: ) double_quantization: bool = field( default=True, - metadata={"help": "Whether or not to use double quantization in int4 training."}, + metadata={ + "help": "Whether or not to use double quantization in int4 training." + }, ) quantization_device_map: Optional[Literal["auto"]] = field( default=None, - metadata={"help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0."}, + metadata={ + "help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0." + }, ) rope_scaling: Optional[Literal["linear", "dynamic"]] = field( default=None, - metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."}, + metadata={ + "help": "Which scaling strategy should be adopted for the RoPE embeddings." + }, ) flash_attn: Literal["off", "sdpa", "fa2", "auto"] = field( default="auto", @@ -71,19 +91,27 @@ class ModelArguments: ) shift_attn: bool = field( default=False, - metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."}, + metadata={ + "help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA." 
+ }, ) mixture_of_depths: Optional[Literal["convert", "load"]] = field( default=None, - metadata={"help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."}, + metadata={ + "help": "Convert the model to mixture-of-depths (MoD) or load the MoD model." + }, ) use_unsloth: bool = field( default=False, - metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."}, + metadata={ + "help": "Whether or not to use unsloth's optimization for the LoRA training." + }, ) moe_aux_loss_coef: Optional[float] = field( default=None, - metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."}, + metadata={ + "help": "Coefficient of the auxiliary router loss in mixture-of-experts model." + }, ) disable_gradient_checkpointing: bool = field( default=False, @@ -107,7 +135,9 @@ class ModelArguments: ) vllm_gpu_util: float = field( default=0.9, - metadata={"help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine."}, + metadata={ + "help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine." + }, ) vllm_enforce_eager: bool = field( default=False, @@ -147,7 +177,9 @@ class ModelArguments: ) export_quantization_dataset: Optional[str] = field( default=None, - metadata={"help": "Path to the dataset or dataset name to use in quantizing the exported model."}, + metadata={ + "help": "Path to the dataset or dataset name to use in quantizing the exported model." + }, ) export_quantization_nsamples: int = field( default=128, @@ -155,19 +187,27 @@ class ModelArguments: ) export_quantization_maxlen: int = field( default=1024, - metadata={"help": "The maximum length of the model inputs used for quantization."}, + metadata={ + "help": "The maximum length of the model inputs used for quantization." + }, ) export_legacy_format: bool = field( default=False, - metadata={"help": "Whether or not to save the `.bin` files instead of `.safetensors`."}, + metadata={ + "help": "Whether or not to save the `.bin` files instead of `.safetensors`." + }, ) export_hub_model_id: Optional[str] = field( default=None, - metadata={"help": "The name of the repository if push the model to the Hugging Face hub."}, + metadata={ + "help": "The name of the repository if push the model to the Hugging Face hub." + }, ) print_param_status: bool = field( default=False, - metadata={"help": "For debugging purposes, print the status of the parameters in the model."}, + metadata={ + "help": "For debugging purposes, print the status of the parameters in the model." + }, ) use_mllm: bool = field( default=False, @@ -180,18 +220,39 @@ class ModelArguments: self.model_max_length = None if self.split_special_tokens and self.use_fast_tokenizer: - raise ValueError("`split_special_tokens` is only supported for slow tokenizers.") + raise ValueError( + "`split_special_tokens` is only supported for slow tokenizers." 
+ ) - if self.adapter_name_or_path is not None: # support merging multiple lora weights - self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")] + if ( + self.adapter_name_or_path is not None + ): # support merging multiple lora weights + self.adapter_name_or_path = [ + path.strip() for path in self.adapter_name_or_path.split(",") + ] if self.new_special_tokens is not None: # support multiple special tokens - self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] + self.new_special_tokens = [ + token.strip() for token in self.new_special_tokens.split(",") + ] - assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." - assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization." + assert self.quantization_bit in [ + None, + 8, + 4, + ], "We only accept 4-bit or 8-bit quantization." + assert self.export_quantization_bit in [ + None, + 8, + 4, + 3, + 2, + ], "We only accept 2/3/4/8-bit quantization." - if self.export_quantization_bit is not None and self.export_quantization_dataset is None: + if ( + self.export_quantization_bit is not None + and self.export_quantization_dataset is None + ): raise ValueError("Quantization dataset is necessary for exporting.") def to_dict(self) -> Dict[str, Any]: diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py index bcefee92..e65798b7 100644 --- a/src/llmtuner/model/adapter.py +++ b/src/llmtuner/model/adapter.py @@ -11,7 +11,7 @@ from .utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model if TYPE_CHECKING: - from transformers import PretrainedConfig, PreTrainedModel, AutoModelForVision2Seq + from transformers import PretrainedConfig, PreTrainedModel from ..hparams import FinetuningArguments, ModelArguments @@ -21,11 +21,11 @@ logger = get_logger(__name__) def init_adapter( config: "PretrainedConfig", - model: Union["PreTrainedModel","AutoModelForVision2Seq"], + model: Union["PreTrainedModel"], model_args: "ModelArguments", finetuning_args: "FinetuningArguments", is_trainable: bool, -) -> Union["PreTrainedModel","AutoModelForVision2Seq"]: +) -> Union["PreTrainedModel"]: r""" Initializes the adapters. 
@@ -38,7 +38,9 @@ def init_adapter( logger.info("Adapter is not found at evaluation, load the base model.") return model - if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None): + if finetuning_args.finetuning_type != "lora" and getattr( + model, "quantization_method", None + ): raise ValueError("You can only use lora for quantized models.") if finetuning_args.finetuning_type == "full" and is_trainable: @@ -49,9 +51,9 @@ def init_adapter( if finetuning_args.finetuning_type == "freeze" and is_trainable: logger.info("Fine-tuning method: Freeze") num_layers = ( - getattr(model.config, "num_hidden_layers", None) - or getattr(model.config, "num_layers", None) - or getattr(model.config, "n_layer", None) + getattr(model.config, "num_hidden_layers", None) + or getattr(model.config, "num_layers", None) + or getattr(model.config, "n_layer", None) ) if not num_layers: raise ValueError("Current model does not support freeze tuning.") @@ -66,8 +68,12 @@ def init_adapter( stride = num_layers // finetuning_args.num_layer_trainable trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride) - elif finetuning_args.num_layer_trainable > 0: # fine-tuning the last n layers if num_layer_trainable > 0 - trainable_layer_ids = range(num_layers - finetuning_args.num_layer_trainable, num_layers) + elif ( + finetuning_args.num_layer_trainable > 0 + ): # fine-tuning the last n layers if num_layer_trainable > 0 + trainable_layer_ids = range( + num_layers - finetuning_args.num_layer_trainable, num_layers + ) else: # fine-tuning the first n layers if num_layer_trainable < 0 trainable_layer_ids = range(-finetuning_args.num_layer_trainable) @@ -82,11 +88,15 @@ def init_adapter( for module_name in finetuning_args.name_module_trainable: if module_name not in freeze_modules: raise ValueError( - "Module {} is not found, please choose from {}".format(module_name, ", ".join(freeze_modules)) + "Module {} is not found, please choose from {}".format( + module_name, ", ".join(freeze_modules) + ) ) for idx in trainable_layer_ids: - trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else "")) + trainable_layers.append( + ".{:d}.{}".format(idx, module_name if module_name != "all" else "") + ) for name, param in model.named_parameters(): if any(trainable_layer in name for trainable_layer in trainable_layers): @@ -95,27 +105,43 @@ def init_adapter( else: param.requires_grad_(False) - logger.info("Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids)))) + logger.info( + "Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids))) + ) if finetuning_args.finetuning_type == "lora": - logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA")) + logger.info( + "Fine-tuning method: {}".format( + "DoRA" if finetuning_args.use_dora else "LoRA" + ) + ) adapter_to_resume = None if model_args.adapter_name_or_path is not None: is_mergeable = True - if getattr(model, "quantization_method", None): # merge lora in quantized model is unstable - assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter." + if getattr( + model, "quantization_method", None + ): # merge lora in quantized model is unstable + assert ( + len(model_args.adapter_name_or_path) == 1 + ), "Quantized model only accepts a single adapter." is_mergeable = False if is_deepspeed_zero3_enabled(): - assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3." 
+ assert ( + len(model_args.adapter_name_or_path) == 1 + ), "Cannot use multiple adapters in DeepSpeed ZeRO-3." is_mergeable = False if model_args.use_unsloth: - assert len(model_args.adapter_name_or_path) == 1, "Unsloth model only accepts a single adapter." + assert ( + len(model_args.adapter_name_or_path) == 1 + ), "Unsloth model only accepts a single adapter." is_mergeable = False - if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable): + if (is_trainable and not finetuning_args.create_new_adapter) or ( + not is_mergeable + ): adapter_to_merge = model_args.adapter_name_or_path[:-1] adapter_to_resume = model_args.adapter_name_or_path[-1] else: @@ -132,7 +158,9 @@ def init_adapter( if adapter_to_resume is not None: # resume lora training if model_args.use_unsloth: - model = load_unsloth_peft_model(config, model_args, is_trainable=is_trainable) + model = load_unsloth_peft_model( + config, model_args, is_trainable=is_trainable + ) else: model = PeftModel.from_pretrained( model, @@ -141,19 +169,27 @@ def init_adapter( offload_folder=model_args.offload_folder, ) - if is_trainable and adapter_to_resume is None: # create new lora weights while training - if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": + if ( + is_trainable and adapter_to_resume is None + ): # create new lora weights while training + if ( + len(finetuning_args.lora_target) == 1 + and finetuning_args.lora_target[0] == "all" + ): target_modules = find_all_linear_modules(model) else: target_modules = finetuning_args.lora_target if finetuning_args.use_llama_pro: - target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable) + target_modules = find_expanded_modules( + model, target_modules, finetuning_args.num_layer_trainable + ) if ( - finetuning_args.use_dora - and getattr(model, "quantization_method", None) is not None - and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES + finetuning_args.use_dora + and getattr(model, "quantization_method", None) is not None + and getattr(model, "quantization_method", None) + != QuantizationMethod.BITS_AND_BYTES ): raise ValueError("DoRA is not compatible with PTQ-quantized models.") @@ -166,7 +202,11 @@ def init_adapter( module_names.add(name.split(".")[-1]) finetuning_args.additional_target = module_names - logger.warning("Vocab has been resized, add {} to trainable params.".format(",".join(module_names))) + logger.warning( + "Vocab has been resized, add {} to trainable params.".format( + ",".join(module_names) + ) + ) peft_kwargs = { "r": finetuning_args.lora_rank, @@ -193,6 +233,10 @@ def init_adapter( param.data = param.data.to(torch.float32) if model_args.adapter_name_or_path is not None: - logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) + logger.info( + "Loaded adapter(s): {}".format( + ",".join(model_args.adapter_name_or_path) + ) + ) - return model \ No newline at end of file + return model diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index 3712a592..18b0cf79 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -1,6 +1,12 @@ from typing import TYPE_CHECKING, Any, Dict, Union -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModelForVision2Seq +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + AutoProcessor, + AutoModelForVision2Seq, +) from trl import 
AutoModelForCausalLMWithValueHead from ..extras.logging import get_logger @@ -62,10 +68,14 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer": dict(additional_special_tokens=model_args.new_special_tokens), replace_additional_special_tokens=False, ) - logger.info("Add {} to special tokens.".format(",".join(model_args.new_special_tokens))) + logger.info( + "Add {} to special tokens.".format(",".join(model_args.new_special_tokens)) + ) if num_added_tokens > 0 and not model_args.resize_vocab: model_args.resize_vocab = True - logger.warning("New tokens have been added, changed `resize_vocab` to True.") + logger.warning( + "New tokens have been added, changed `resize_vocab` to True." + ) patch_tokenizer(tokenizer) return tokenizer @@ -111,7 +121,7 @@ def load_model( finetuning_args: "FinetuningArguments", is_trainable: bool = False, add_valuehead: bool = False, -) -> Union["PreTrainedModel", "AutoModelForVision2Seq"]: +) -> Union["PreTrainedModel"]: r""" Loads pretrained model. """ @@ -170,8 +180,10 @@ def load_model( trainable_params, all_param = count_parameters(model) if is_trainable: - param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( - trainable_params, all_param, 100 * trainable_params / all_param + param_stats = ( + "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( + trainable_params, all_param, 100 * trainable_params / all_param + ) ) else: param_stats = "all params: {:d}".format(all_param) @@ -185,4 +197,4 @@ def load_model( ) ) - return model \ No newline at end of file + return model diff --git a/src/llmtuner/train/sftmm/collator.py b/src/llmtuner/train/sftmm/collator.py index 95dbd939..2931dd9c 100644 --- a/src/llmtuner/train/sftmm/collator.py +++ b/src/llmtuner/train/sftmm/collator.py @@ -19,7 +19,9 @@ class DataCollatorForVis2Seq: texts.append(text) images.append(example["images"][0]) - batch = self.processor(text=texts, images=images, return_tensors="pt", padding=True) + batch = self.processor( + text=texts, images=images, return_tensors="pt", padding=True + ) labels = batch["input_ids"].clone() if self.processor.tokenizer.pad_token_id is not None: @@ -27,3 +29,14 @@ class DataCollatorForVis2Seq: batch["labels"] = labels return batch + + +@dataclass +class DataCollatorForMLLM: + processor: AutoProcessor + + def __call__(self, examples): + print(examples[0].keys()) + print(examples[0]["input_ids"]) + batch = {} + return batch diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py index 7afd8f6f..3849a563 100644 --- a/src/llmtuner/train/sftmm/workflow.py +++ b/src/llmtuner/train/sftmm/workflow.py @@ -1,47 +1,66 @@ # Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py import os from typing import TYPE_CHECKING, List, Optional -from ...data import split_dataset, get_mm_dataset +from ...data import get_dataset from ...extras.misc import get_logits_processor from ...extras.ploting import plot_loss -from ...model import load_tokenizer, load_processor, load_model +from ...model import load_processor, load_model from ..utils import create_modelcard_and_push from .metric import ComputeMetrics from .trainer import CustomSeq2SeqTrainer -from .collator import DataCollatorForVis2Seq +from transformers import DataCollatorForSeq2Seq +from ...extras.constants import IGNORE_INDEX if TYPE_CHECKING: from transformers import Seq2SeqTrainingArguments, TrainerCallback - from ...hparams import DataArguments, 
FinetuningArguments, GeneratingArguments, ModelArguments + from ...hparams import ( + DataArguments, + FinetuningArguments, + GeneratingArguments, + ModelArguments, + ) def run_sft_mm( - model_args: "ModelArguments", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - finetuning_args: "FinetuningArguments", - generating_args: "GeneratingArguments", - callbacks: Optional[List["TrainerCallback"]] = None, + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + generating_args: "GeneratingArguments", + callbacks: Optional[List["TrainerCallback"]] = None, ): processor = load_processor(model_args) - tokenizer = load_tokenizer(model_args) - CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}""" - tokenizer.chat_template = CHAT_TEMPLATE - processor.tokenizer = tokenizer - model = load_model(processor.tokenizer, model_args, finetuning_args, training_args.do_train) - dataset = get_mm_dataset(processor, model_args, data_args, training_args, stage="sft") + tokenizer = processor.tokenizer + dataset = get_dataset( + tokenizer, model_args, data_args, training_args, "sft", processor + ) + model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) if getattr(model, "is_quantized", False) and not training_args.do_train: - setattr(model, "_hf_peft_config_loaded", True) # hack here: make model compatible with prediction + setattr( + model, "_hf_peft_config_loaded", True + ) # hack here: make model compatible with prediction train_dataset = dataset eval_dataset = dataset - data_collator = DataCollatorForVis2Seq( - processor=processor, + data_collator = DataCollatorForSeq2Seq( + tokenizer=tokenizer, + pad_to_multiple_of=( + 8 if tokenizer.padding_side == "right" else None + ), # for shift short attention + label_pad_token_id=( + IGNORE_INDEX + if data_args.ignore_pad_token_for_loss + else tokenizer.pad_token_id + ), ) # Override the decoding parameters of Seq2SeqTrainer - training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len - training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams + training_args.generation_max_length = ( + training_args.generation_max_length or data_args.cutoff_len + ) + training_args.generation_num_beams = ( + data_args.eval_num_beams or training_args.generation_num_beams + ) training_args.remove_unused_columns = False # Initialize our Trainer @@ -52,19 +71,26 @@ def run_sft_mm( tokenizer=tokenizer, data_collator=data_collator, callbacks=callbacks, - compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + compute_metrics=( + ComputeMetrics(tokenizer) if training_args.predict_with_generate else None + ), train_dataset=train_dataset, eval_dataset=eval_dataset, ) + # Keyword arguments for `model.generate` gen_kwargs = generating_args.to_dict() - 
gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids + gen_kwargs["eos_token_id"] = [ + tokenizer.eos_token_id + ] + tokenizer.additional_special_tokens_ids gen_kwargs["pad_token_id"] = tokenizer.pad_token_id gen_kwargs["logits_processor"] = get_logits_processor() # Training if training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + train_result = trainer.train( + resume_from_checkpoint=training_args.resume_from_checkpoint + ) trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) @@ -75,19 +101,27 @@ def run_sft_mm( # Evaluation if training_args.do_eval: metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs) - if training_args.predict_with_generate: # eval_loss will be wrong if predict_with_generate is enabled + if ( + training_args.predict_with_generate + ): # eval_loss will be wrong if predict_with_generate is enabled metrics.pop("eval_loss", None) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Predict if training_args.do_predict: - predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs) - if training_args.predict_with_generate: # predict_loss will be wrong if predict_with_generate is enabled + predict_results = trainer.predict( + dataset, metric_key_prefix="predict", **gen_kwargs + ) + if ( + training_args.predict_with_generate + ): # predict_loss will be wrong if predict_with_generate is enabled predict_results.metrics.pop("predict_loss", None) trainer.log_metrics("predict", predict_results.metrics) trainer.save_metrics("predict", predict_results.metrics) trainer.save_predictions(predict_results) # Create model card - create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) + create_modelcard_and_push( + trainer, model_args, data_args, training_args, finetuning_args + ) From d1d08d066a7245e56d1d25c7dda5600b2d0c4e6b Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 19:58:47 +0800 Subject: [PATCH 09/29] merge data part to the text stream Former-commit-id: 80537d580119d9d5a06ab236a5284aaae2f83b5b --- data/mllm_example_dataset/README.md | 25 ------------------ data/mllm_example_dataset/data/test-0.parquet | Bin 4580 -> 0 bytes .../mllm_example_dataset/data/train-0.parquet | Bin 4580 -> 0 bytes scripts/test_mllm.py | 24 +++++++++++------ src/llmtuner/data/template.py | 4 +-- 5 files changed, 18 insertions(+), 35 deletions(-) delete mode 100644 data/mllm_example_dataset/README.md delete mode 100644 data/mllm_example_dataset/data/test-0.parquet delete mode 100644 data/mllm_example_dataset/data/train-0.parquet diff --git a/data/mllm_example_dataset/README.md b/data/mllm_example_dataset/README.md deleted file mode 100644 index d5c8c0e6..00000000 --- a/data/mllm_example_dataset/README.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -dataset_info: - features: - - name: messages - list: - - name: content - list: - - name: index - dtype: int64 - - name: text - dtype: string - - name: type - dtype: string - - name: role - dtype: string - - name: images - sequence: image -configs: -- config_name: default - data_files: - - split: train - path: data/train-* - - split: test - path: data/test-* ---- \ No newline at end of file diff --git a/data/mllm_example_dataset/data/test-0.parquet b/data/mllm_example_dataset/data/test-0.parquet deleted file mode 100644 index 
42c20b192497168523c3d39447cdae4495085b84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4580 [base85 binary data omitted]
diff --git a/data/mllm_example_dataset/data/test-0.parquet... wait
saves/llava-1.5-7b/lora/sft \ --model_path saves/llava-1.5-7b/lora/merged \ ---dataset_name data/mllm_example_dataset \ +--dataset_name data/llava_instruct_example.json \ --do_merge 1 """ def get_processor(model_path): - CHAT_TEMPLATE = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}""" + processor = AutoProcessor.from_pretrained(model_path) + CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {{ message['content'] }} ASSISTANT: {% else %}{{ message['content'] }}{% endif %} {% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}""" tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) tokenizer.chat_template = CHAT_TEMPLATE - processor = AutoProcessor.from_pretrained(model_path) processor.tokenizer = tokenizer return processor @@ -69,7 +70,7 @@ def main( device_map="cuda", ) processor = get_processor(model_path) - raw_datasets = load_dataset(dataset_name) + raw_datasets = load_dataset("json", data_files=dataset_name) train_dataset = raw_datasets["train"] examples = train_dataset.select(range(3)) texts = [] @@ -80,11 +81,18 @@ def main( messages, tokenize=False, add_generation_prompt=False ) texts.append(text) - images.append(example["images"][0]) - batch = processor(texts, images, return_tensors="pt", padding=True).to("cuda") + images.append(Image.open(example["images"][0])) + batch = processor(text=texts, images=images, return_tensors="pt", padding=True).to( + "cuda" + ) output = model.generate(**batch, max_new_tokens=100) - res = processor.batch_decode(output, skip_special_tokens=True) - print(res) + res_list = processor.batch_decode(output, skip_special_tokens=True) + for i, prompt in enumerate(texts): + res = res_list[i] + print(f"#{i}") + print(f"prompt:{prompt}") + print(f"response:{res[len(prompt):].strip()}") + print() if __name__ == "__main__": diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index 311660aa..e6cdadd6 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -1012,8 +1012,8 @@ _register_template( _register_template( name="llava", - format_user=StringFormatter(slots=["USER: {{content}} "]), - format_assistant=StringFormatter(slots=["ASSISTANT: {{content}}"]), + format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT: "]), + format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]), default_system=( "A chat between a curious user and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the user's questions." 
From 9b210cf4b3bbd1faae0b36e3a4fd1587d1357057 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 20:09:43 +0800 Subject: [PATCH 10/29] rm some Former-commit-id: 2c85b4fabbebd8b51eee53f5d29184d4a6e97569 --- src/llmtuner/train/sftmm/collator.py | 42 ---------------------------- 1 file changed, 42 deletions(-) delete mode 100644 src/llmtuner/train/sftmm/collator.py diff --git a/src/llmtuner/train/sftmm/collator.py b/src/llmtuner/train/sftmm/collator.py deleted file mode 100644 index 2931dd9c..00000000 --- a/src/llmtuner/train/sftmm/collator.py +++ /dev/null @@ -1,42 +0,0 @@ -from dataclasses import dataclass -from transformers import AutoProcessor - - -@dataclass -class DataCollatorForVis2Seq: - processor: AutoProcessor - - def __call__(self, examples): - texts = [] - images = [] - for example in examples: - if len(example["images"]) > 1: - raise ValueError("This collator only supports one image per example") - messages = example["messages"] - text = self.processor.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=False - ) - texts.append(text) - images.append(example["images"][0]) - - batch = self.processor( - text=texts, images=images, return_tensors="pt", padding=True - ) - - labels = batch["input_ids"].clone() - if self.processor.tokenizer.pad_token_id is not None: - labels[labels == self.processor.tokenizer.pad_token_id] = -100 - batch["labels"] = labels - - return batch - - -@dataclass -class DataCollatorForMLLM: - processor: AutoProcessor - - def __call__(self, examples): - print(examples[0].keys()) - print(examples[0]["input_ids"]) - batch = {} - return batch From dbd905438b22bc74576900dc310ef5c506bd88ab Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 21:08:32 +0800 Subject: [PATCH 11/29] add some Former-commit-id: 8d035a849c4a441d457791aab073861adf69a09f --- src/llmtuner/model/__init__.py | 3 +- src/llmtuner/model/loader.py | 50 +++++------ src/llmtuner/train/sft/workflow.py | 72 +++++++++++---- src/llmtuner/train/sftmm/__init__.py | 3 - src/llmtuner/train/sftmm/metric.py | 61 ------------- src/llmtuner/train/sftmm/trainer.py | 44 ---------- src/llmtuner/train/sftmm/workflow.py | 127 --------------------------- src/llmtuner/train/tuner.py | 3 - 8 files changed, 80 insertions(+), 283 deletions(-) delete mode 100644 src/llmtuner/train/sftmm/__init__.py delete mode 100644 src/llmtuner/train/sftmm/metric.py delete mode 100644 src/llmtuner/train/sftmm/trainer.py delete mode 100644 src/llmtuner/train/sftmm/workflow.py diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py index f6be60d8..db81e1dc 100644 --- a/src/llmtuner/model/__init__.py +++ b/src/llmtuner/model/__init__.py @@ -1,11 +1,10 @@ -from .loader import load_config, load_model, load_tokenizer, load_processor +from .loader import load_config, load_model, load_tokenizer from .utils.misc import find_all_linear_modules, load_valuehead_params __all__ = [ "load_config", "load_model", "load_tokenizer", - "load_processor", "load_valuehead_params", "find_all_linear_modules", ] diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index 18b0cf79..99ad9adc 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -40,7 +40,9 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]: } -def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer": +def load_tokenizer( + model_args: "ModelArguments", +) -> Dict[str, Union["PreTrainedTokenizer", 
"AutoProcesser"]]: r""" Loads pretrained tokenizer. @@ -78,33 +80,25 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer": ) patch_tokenizer(tokenizer) - return tokenizer - - -def load_processor(model_args: "ModelArguments") -> "AutoProcessor": - r""" - Loads processor. Must before load_model. - - Note: including inplace operation of model_args. - """ - init_kwargs = _get_init_kwargs(model_args) - try: - processor = AutoProcessor.from_pretrained( - model_args.model_name_or_path, - use_fast=model_args.use_fast_tokenizer, - split_special_tokens=model_args.split_special_tokens, - padding_side="right", - **init_kwargs, - ) - except Exception: # try the fast one - processor = AutoProcessor.from_pretrained( - model_args.model_name_or_path, - use_fast=True, - padding_side="right", - **init_kwargs, - ) - - return processor + tokenizer_modules = {"tokenizer": tokenizer, "processor": None} + if model_args.use_mllm: + try: + processor = AutoProcessor.from_pretrained( + model_args.model_name_or_path, + use_fast=model_args.use_fast_tokenizer, + split_special_tokens=model_args.split_special_tokens, + padding_side="right", + **init_kwargs, + ) + except Exception: # try the fast one + processor = AutoProcessor.from_pretrained( + model_args.model_name_or_path, + use_fast=True, + padding_side="right", + **init_kwargs, + ) + tokenizer_modules["processor"] = processor + return tokenizer_modules def load_config(model_args: "ModelArguments") -> "PretrainedConfig": diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py index 9ab78850..6f887810 100644 --- a/src/llmtuner/train/sft/workflow.py +++ b/src/llmtuner/train/sft/workflow.py @@ -17,7 +17,12 @@ from .trainer import CustomSeq2SeqTrainer if TYPE_CHECKING: from transformers import Seq2SeqTrainingArguments, TrainerCallback - from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments + from ...hparams import ( + DataArguments, + FinetuningArguments, + GeneratingArguments, + ModelArguments, + ) def run_sft( @@ -28,25 +33,48 @@ def run_sft( generating_args: "GeneratingArguments", callbacks: Optional[List["TrainerCallback"]] = None, ): - tokenizer = load_tokenizer(model_args) - dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft") + tokenizer_modules = load_tokenizer(model_args) + tokenizer = tokenizer_modules["tokenizer"] + processor = tokenizer_modules["processor"] + dataset = get_dataset( + tokenizer, + model_args, + data_args, + training_args, + stage="sft", + processor=processor, + ) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) if training_args.predict_with_generate: tokenizer.padding_side = "left" # use left-padding in generation if getattr(model, "is_quantized", False) and not training_args.do_train: - setattr(model, "_hf_peft_config_loaded", True) # hack here: make model compatible with prediction + setattr( + model, "_hf_peft_config_loaded", True + ) # hack here: make model compatible with prediction data_collator = DataCollatorForSeq2Seq( tokenizer=tokenizer, - pad_to_multiple_of=8 if tokenizer.padding_side == "right" else None, # for shift short attention - label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id, + pad_to_multiple_of=( + 8 if tokenizer.padding_side == "right" else None + ), # for shift short attention + label_pad_token_id=( + IGNORE_INDEX + if data_args.ignore_pad_token_for_loss + else tokenizer.pad_token_id + ), ) # Override the decoding 
parameters of Seq2SeqTrainer - training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len - training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams + training_args.generation_max_length = ( + training_args.generation_max_length or data_args.cutoff_len + ) + training_args.generation_num_beams = ( + data_args.eval_num_beams or training_args.generation_num_beams + ) + if model_args.use_mllm: + training_args.remove_unused_columns = False # Initialize our Trainer trainer = CustomSeq2SeqTrainer( @@ -56,19 +84,25 @@ def run_sft( tokenizer=tokenizer, data_collator=data_collator, callbacks=callbacks, - compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + compute_metrics=( + ComputeMetrics(tokenizer) if training_args.predict_with_generate else None + ), **split_dataset(dataset, data_args, training_args), ) # Keyword arguments for `model.generate` gen_kwargs = generating_args.to_dict() - gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids + gen_kwargs["eos_token_id"] = [ + tokenizer.eos_token_id + ] + tokenizer.additional_special_tokens_ids gen_kwargs["pad_token_id"] = tokenizer.pad_token_id gen_kwargs["logits_processor"] = get_logits_processor() # Training if training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + train_result = trainer.train( + resume_from_checkpoint=training_args.resume_from_checkpoint + ) trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) @@ -79,19 +113,27 @@ def run_sft( # Evaluation if training_args.do_eval: metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs) - if training_args.predict_with_generate: # eval_loss will be wrong if predict_with_generate is enabled + if ( + training_args.predict_with_generate + ): # eval_loss will be wrong if predict_with_generate is enabled metrics.pop("eval_loss", None) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Predict if training_args.do_predict: - predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs) - if training_args.predict_with_generate: # predict_loss will be wrong if predict_with_generate is enabled + predict_results = trainer.predict( + dataset, metric_key_prefix="predict", **gen_kwargs + ) + if ( + training_args.predict_with_generate + ): # predict_loss will be wrong if predict_with_generate is enabled predict_results.metrics.pop("predict_loss", None) trainer.log_metrics("predict", predict_results.metrics) trainer.save_metrics("predict", predict_results.metrics) trainer.save_predictions(predict_results) # Create model card - create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) + create_modelcard_and_push( + trainer, model_args, data_args, training_args, finetuning_args + ) diff --git a/src/llmtuner/train/sftmm/__init__.py b/src/llmtuner/train/sftmm/__init__.py deleted file mode 100644 index 3eb8b2e2..00000000 --- a/src/llmtuner/train/sftmm/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .workflow import run_sft_mm - -__all__ = ["run_sft_mm"] diff --git a/src/llmtuner/train/sftmm/metric.py b/src/llmtuner/train/sftmm/metric.py deleted file mode 100644 index d1af4c17..00000000 --- a/src/llmtuner/train/sftmm/metric.py +++ /dev/null @@ -1,61 +0,0 @@ -from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, 
Sequence, Tuple, Union - -import numpy as np - -from ...extras.constants import IGNORE_INDEX -from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available - - -if TYPE_CHECKING: - from transformers.tokenization_utils import PreTrainedTokenizer - -if is_jieba_available(): - import jieba # type: ignore - -if is_nltk_available(): - from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu - -if is_rouge_available(): - from rouge_chinese import Rouge - - -@dataclass -class ComputeMetrics: - r""" - Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer. - """ - - tokenizer: "PreTrainedTokenizer" - - def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]: - r""" - Uses the model predictions to compute metrics. - """ - preds, labels = eval_preds - score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []} - - preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id) - labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id) - - decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) - decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) - - for pred, label in zip(decoded_preds, decoded_labels): - hypothesis = list(jieba.cut(pred)) - reference = list(jieba.cut(label)) - - if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0: - result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}} - else: - rouge = Rouge() - scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference)) - result = scores[0] - - for k, v in result.items(): - score_dict[k].append(round(v["f"] * 100, 4)) - - bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) - score_dict["bleu-4"].append(round(bleu_score * 100, 4)) - - return {k: float(np.mean(v)) for k, v in score_dict.items()} diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py deleted file mode 100644 index f094e609..00000000 --- a/src/llmtuner/train/sftmm/trainer.py +++ /dev/null @@ -1,44 +0,0 @@ -import json -import os -from types import MethodType -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from transformers import Seq2SeqTrainer, Trainer - -from ...extras.constants import IGNORE_INDEX -from ...extras.logging import get_logger -from ..utils import create_custom_optimzer, create_custom_scheduler - -if TYPE_CHECKING: - from transformers.trainer import PredictionOutput - from peft import PeftModelForCausalLM - from ...hparams import FinetuningArguments - -logger = get_logger(__name__) - - -class CustomSeq2SeqTrainer(Seq2SeqTrainer): - r""" - Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE. 
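The ComputeMetrics helper deleted here mirrors the one kept in train/sft/metric.py, so nothing is lost functionally. For readers unfamiliar with the scoring it performs, the following standalone sketch runs the same ROUGE and BLEU-4 computation on a single prediction/label pair; the example strings are illustrative, and the jieba, rouge-chinese and nltk packages are assumed to be installed.

    import jieba
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
    from rouge_chinese import Rouge

    pred = "the cat sits on the mat"
    label = "a cat is sitting on the mat"

    # ROUGE over whitespace-joined word segments (jieba also handles Chinese text).
    hypothesis = list(jieba.cut(pred))
    reference = list(jieba.cut(label))
    scores = Rouge().get_scores(" ".join(hypothesis), " ".join(reference))[0]

    # BLEU-4 over character lists, smoothed so short sequences do not score zero.
    bleu_4 = sentence_bleu(
        [list(label)], list(pred), smoothing_function=SmoothingFunction().method3
    )

    print({"rouge-l": round(scores["rouge-l"]["f"] * 100, 4), "bleu-4": round(bleu_4 * 100, 4)})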
- """ - - def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None: - super().__init__(**kwargs) - self.finetuning_args = finetuning_args - if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - def create_optimizer(self) -> "torch.optim.Optimizer": - if self.optimizer is None: - self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) - return super().create_optimizer() - - def create_scheduler( - self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None - ) -> "torch.optim.lr_scheduler.LRScheduler": - create_custom_scheduler(self.args, num_training_steps, optimizer) - return super().create_scheduler(num_training_steps, optimizer) diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py deleted file mode 100644 index 3849a563..00000000 --- a/src/llmtuner/train/sftmm/workflow.py +++ /dev/null @@ -1,127 +0,0 @@ -# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py -import os -from typing import TYPE_CHECKING, List, Optional -from ...data import get_dataset -from ...extras.misc import get_logits_processor -from ...extras.ploting import plot_loss -from ...model import load_processor, load_model -from ..utils import create_modelcard_and_push -from .metric import ComputeMetrics -from .trainer import CustomSeq2SeqTrainer -from transformers import DataCollatorForSeq2Seq -from ...extras.constants import IGNORE_INDEX - -if TYPE_CHECKING: - from transformers import Seq2SeqTrainingArguments, TrainerCallback - - from ...hparams import ( - DataArguments, - FinetuningArguments, - GeneratingArguments, - ModelArguments, - ) - - -def run_sft_mm( - model_args: "ModelArguments", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - finetuning_args: "FinetuningArguments", - generating_args: "GeneratingArguments", - callbacks: Optional[List["TrainerCallback"]] = None, -): - processor = load_processor(model_args) - tokenizer = processor.tokenizer - dataset = get_dataset( - tokenizer, model_args, data_args, training_args, "sft", processor - ) - model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) - if getattr(model, "is_quantized", False) and not training_args.do_train: - setattr( - model, "_hf_peft_config_loaded", True - ) # hack here: make model compatible with prediction - train_dataset = dataset - eval_dataset = dataset - data_collator = DataCollatorForSeq2Seq( - tokenizer=tokenizer, - pad_to_multiple_of=( - 8 if tokenizer.padding_side == "right" else None - ), # for shift short attention - label_pad_token_id=( - IGNORE_INDEX - if data_args.ignore_pad_token_for_loss - else tokenizer.pad_token_id - ), - ) - - # Override the decoding parameters of Seq2SeqTrainer - training_args.generation_max_length = ( - training_args.generation_max_length or data_args.cutoff_len - ) - training_args.generation_num_beams = ( - data_args.eval_num_beams or training_args.generation_num_beams - ) - training_args.remove_unused_columns = False - - # Initialize our Trainer - trainer = CustomSeq2SeqTrainer( - model=model, - args=training_args, - finetuning_args=finetuning_args, - tokenizer=tokenizer, - data_collator=data_collator, - callbacks=callbacks, - compute_metrics=( - ComputeMetrics(tokenizer) if training_args.predict_with_generate else None - ), - 
train_dataset=train_dataset, - eval_dataset=eval_dataset, - ) - - # Keyword arguments for `model.generate` - gen_kwargs = generating_args.to_dict() - gen_kwargs["eos_token_id"] = [ - tokenizer.eos_token_id - ] + tokenizer.additional_special_tokens_ids - gen_kwargs["pad_token_id"] = tokenizer.pad_token_id - gen_kwargs["logits_processor"] = get_logits_processor() - - # Training - if training_args.do_train: - train_result = trainer.train( - resume_from_checkpoint=training_args.resume_from_checkpoint - ) - trainer.save_model() - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - if trainer.is_world_process_zero() and finetuning_args.plot_loss: - plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) - - # Evaluation - if training_args.do_eval: - metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs) - if ( - training_args.predict_with_generate - ): # eval_loss will be wrong if predict_with_generate is enabled - metrics.pop("eval_loss", None) - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Predict - if training_args.do_predict: - predict_results = trainer.predict( - dataset, metric_key_prefix="predict", **gen_kwargs - ) - if ( - training_args.predict_with_generate - ): # predict_loss will be wrong if predict_with_generate is enabled - predict_results.metrics.pop("predict_loss", None) - trainer.log_metrics("predict", predict_results.metrics) - trainer.save_metrics("predict", predict_results.metrics) - trainer.save_predictions(predict_results) - - # Create model card - create_modelcard_and_push( - trainer, model_args, data_args, training_args, finetuning_args - ) diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py index ac56289c..5f691225 100644 --- a/src/llmtuner/train/tuner.py +++ b/src/llmtuner/train/tuner.py @@ -14,7 +14,6 @@ from .ppo import run_ppo from .pt import run_pt from .rm import run_rm from .sft import run_sft -from .sftmm import run_sft_mm if TYPE_CHECKING: from transformers import TrainerCallback @@ -30,8 +29,6 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["Tra run_pt(model_args, data_args, training_args, finetuning_args, callbacks) elif finetuning_args.stage == "sft": run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) - elif finetuning_args.stage == "sft_mm": - run_sft_mm(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) elif finetuning_args.stage == "rm": run_rm(model_args, data_args, training_args, finetuning_args, callbacks) elif finetuning_args.stage == "ppo": From c425436676d19434a662b6c4780030ec4bb52b8a Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 21:15:16 +0800 Subject: [PATCH 12/29] modify style Former-commit-id: 54b713d0c4ffdfc6a7faeb14471b58bb1cd8acf5 --- src/llmtuner/data/__init__.py | 1 + src/llmtuner/data/aligner.py | 49 ++----- src/llmtuner/data/loader.py | 59 +++------ src/llmtuner/data/parser.py | 28 +--- src/llmtuner/data/preprocess.py | 90 +++---------- src/llmtuner/data/template.py | 188 +++++++-------------------- src/llmtuner/hparams/model_args.py | 101 ++++---------- src/llmtuner/model/__init__.py | 1 + src/llmtuner/model/adapter.py | 82 +++--------- src/llmtuner/model/loader.py | 21 ++- src/llmtuner/train/sft/workflow.py | 50 ++----- src/llmtuner/train/sftmm/__init__.py | 4 + src/llmtuner/train/sftmm/metric.py | 61 +++++++++ src/llmtuner/train/sftmm/trainer.py 
| 39 ++++++ src/llmtuner/train/sftmm/workflow.py | 101 ++++++++++++++ src/llmtuner/train/tuner.py | 1 + 16 files changed, 374 insertions(+), 502 deletions(-) create mode 100644 src/llmtuner/train/sftmm/__init__.py create mode 100644 src/llmtuner/train/sftmm/metric.py create mode 100644 src/llmtuner/train/sftmm/trainer.py create mode 100644 src/llmtuner/train/sftmm/workflow.py diff --git a/src/llmtuner/data/__init__.py b/src/llmtuner/data/__init__.py index 00a82d73..792e89d9 100644 --- a/src/llmtuner/data/__init__.py +++ b/src/llmtuner/data/__init__.py @@ -3,6 +3,7 @@ from .loader import get_dataset from .template import Template, get_template_and_fix_tokenizer, templates from .utils import Role, split_dataset + __all__ = [ "PairwiseDataCollatorWithPadding", "get_dataset", diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index 85202ea8..9d440aff 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -13,9 +13,7 @@ if TYPE_CHECKING: from .parser import DatasetAttr -def convert_alpaca( - examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" -) -> Dict[str, List[Any]]: +def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: outputs = {"prompt": [], "response": [], "system": [], "tools": []} for i in range(len(examples[dataset_attr.prompt])): prompt = [] @@ -33,16 +31,11 @@ def convert_alpaca( prompt.append({"role": Role.USER.value, "content": "\n".join(content)}) - if dataset_attr.response and isinstance( - examples[dataset_attr.response][i], list - ): + if dataset_attr.response and isinstance(examples[dataset_attr.response][i], list): response = [ - {"role": Role.ASSISTANT.value, "content": content} - for content in examples[dataset_attr.response][i] + {"role": Role.ASSISTANT.value, "content": content} for content in examples[dataset_attr.response][i] ] - elif dataset_attr.response and isinstance( - examples[dataset_attr.response][i], str - ): + elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str): response = [ { "role": Role.ASSISTANT.value, @@ -54,17 +47,13 @@ def convert_alpaca( outputs["prompt"].append(prompt) outputs["response"].append(response) - outputs["system"].append( - examples[dataset_attr.system][i] if dataset_attr.system else "" - ) + outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") outputs["tools"].append("") outputs["images"].append([]) return outputs -def convert_sharegpt( - examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" -) -> Dict[str, List[Any]]: +def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: outputs = {"prompt": [], "response": [], "system": [], "tools": []} tag_mapping = { dataset_attr.user_tag: Role.USER.value, @@ -77,10 +66,7 @@ def convert_sharegpt( even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag) accept_tags = (odd_tags, even_tags) for i, messages in enumerate(examples[dataset_attr.messages]): - if ( - dataset_attr.system_tag - and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag - ): + if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag: system = messages[0][dataset_attr.content_tag] messages = messages[1:] else: @@ -105,17 +91,13 @@ def convert_sharegpt( outputs["prompt"].append(aligned_messages[:-1]) outputs["response"].append(aligned_messages[-1:]) outputs["system"].append(system) - outputs["tools"].append( - examples[dataset_attr.tools][i] if 
dataset_attr.tools else "" - ) + outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") outputs["images"].append([]) return outputs -def convert_llava( - examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" -) -> Dict[str, List[Any]]: +def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []} tag_mapping = { dataset_attr.user_tag: Role.USER.value, @@ -128,10 +110,7 @@ def convert_llava( even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag) accept_tags = (odd_tags, even_tags) for i, messages in enumerate(examples[dataset_attr.messages]): - if ( - dataset_attr.system_tag - and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag - ): + if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag: system = messages[0][dataset_attr.content_tag] messages = messages[1:] else: @@ -156,13 +135,9 @@ def convert_llava( outputs["prompt"].append(aligned_messages[:-1]) outputs["response"].append(aligned_messages[-1:]) outputs["system"].append(system) - outputs["tools"].append( - examples[dataset_attr.tools][i] if dataset_attr.tools else "" - ) + outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") print(examples[dataset_attr.images][i]) - outputs["images"].append( - examples[dataset_attr.images][i] if dataset_attr.images else [] - ) + outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else []) return outputs diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index c373e196..fa4aa9c1 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -1,6 +1,6 @@ import inspect import os -from typing import TYPE_CHECKING, Literal, Union, Optional +from typing import TYPE_CHECKING, Literal, Optional, Union from datasets import load_dataset, load_from_disk @@ -13,9 +13,10 @@ from .preprocess import get_preprocess_and_print_func from .template import get_template_and_fix_tokenizer from .utils import checksum, merge_dataset + if TYPE_CHECKING: from datasets import Dataset, IterableDataset - from transformers import Seq2SeqTrainingArguments, AutoProcessor + from transformers import AutoProcessor, Seq2SeqTrainingArguments from transformers.tokenization_utils import PreTrainedTokenizer from ..hparams import DataArguments, ModelArguments @@ -78,20 +79,14 @@ def load_single_dataset( split=data_args.split, cache_dir=cache_dir, token=model_args.ms_hub_token, - use_streaming=( - data_args.streaming and (dataset_attr.load_from != "file") - ), + use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")), ) if isinstance(dataset, MsDataset): dataset = dataset.to_hf_dataset() except ImportError: - raise ImportError( - "Please install modelscope via `pip install modelscope -U`" - ) + raise ImportError("Please install modelscope via `pip install modelscope -U`") else: - if ( - "trust_remote_code" in inspect.signature(load_dataset).parameters - ): # for datasets==2.16.0 + if "trust_remote_code" in inspect.signature(load_dataset).parameters: # for datasets==2.16.0 kwargs = {"trust_remote_code": True} else: kwargs = {} @@ -108,9 +103,7 @@ def load_single_dataset( **kwargs, ) - if data_args.streaming and ( - dataset_attr.load_from == "file" - ): # faster than specifying streaming=True + if data_args.streaming and (dataset_attr.load_from == "file"): # faster than specifying streaming=True dataset = 
dataset.to_iterable_dataset() # TODO: add num shards parameter if data_args.max_samples is not None: # truncate dataset @@ -135,13 +128,9 @@ def get_dataset( # Load tokenized dataset if data_args.tokenized_path is not None: if has_tokenized_data(data_args.tokenized_path): - logger.warning( - "Loading dataset from disk will ignore other data arguments." - ) + logger.warning("Loading dataset from disk will ignore other data arguments.") dataset = load_from_disk(data_args.tokenized_path) - logger.info( - "Loaded tokenized dataset from {}.".format(data_args.tokenized_path) - ) + logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path)) if data_args.streaming: dataset = dataset.to_iterable_dataset() return dataset @@ -152,16 +141,10 @@ def get_dataset( with training_args.main_process_first(desc="load dataset"): all_datasets = [] for dataset_attr in get_dataset_list(data_args): - if (stage == "rm" and dataset_attr.ranking is False) or ( - stage != "rm" and dataset_attr.ranking is True - ): - raise ValueError( - "The dataset is not applicable in the current training stage." - ) + if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True): + raise ValueError("The dataset is not applicable in the current training stage.") - all_datasets.append( - load_single_dataset(dataset_attr, model_args, data_args) - ) + all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args)) dataset = merge_dataset(all_datasets, data_args, training_args) with training_args.main_process_first(desc="pre-process dataset"): @@ -177,21 +160,13 @@ def get_dataset( desc="Running tokenizer on dataset", ) - dataset = dataset.map( - preprocess_func, batched=True, remove_columns=column_names, **kwargs - ) + dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs) if data_args.tokenized_path is not None: if training_args.should_save: dataset.save_to_disk(data_args.tokenized_path) - logger.info( - "Tokenized dataset saved at {}.".format(data_args.tokenized_path) - ) - logger.info( - "Please restart the training with `--tokenized_path {}`.".format( - data_args.tokenized_path - ) - ) + logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path)) + logger.info("Please restart the training with `--tokenized_path {}`.".format(data_args.tokenized_path)) exit(0) @@ -199,8 +174,6 @@ def get_dataset( try: print_function(next(iter(dataset))) except StopIteration: - raise RuntimeError( - "Cannot find valid samples, check `data/README.md` for the data format." 
- ) + raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.") return dataset diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py index 79d6ed4e..4d3d7741 100644 --- a/src/llmtuner/data/parser.py +++ b/src/llmtuner/data/parser.py @@ -50,9 +50,7 @@ class DatasetAttr: def __repr__(self) -> str: return self.dataset_name - def set_attr( - self, key: str, obj: Dict[str, Any], default: Optional[Any] = None - ) -> None: + def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None: setattr(self, key, obj.get(key, default)) @@ -71,16 +69,12 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: except Exception as err: if len(dataset_names) != 0: raise ValueError( - "Cannot open {} due to {}.".format( - os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err) - ) + "Cannot open {} due to {}.".format(os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err)) ) dataset_info = None if data_args.interleave_probs is not None: - data_args.interleave_probs = [ - float(prob.strip()) for prob in data_args.interleave_probs.split(",") - ] + data_args.interleave_probs = [float(prob.strip()) for prob in data_args.interleave_probs.split(",")] dataset_list: List[DatasetAttr] = [] for name in dataset_names: @@ -98,21 +92,13 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: if has_hf_url or has_ms_url: if (use_modelscope() and has_ms_url) or (not has_hf_url): - dataset_attr = DatasetAttr( - "ms_hub", dataset_name=dataset_info[name]["ms_hub_url"] - ) + dataset_attr = DatasetAttr("ms_hub", dataset_name=dataset_info[name]["ms_hub_url"]) else: - dataset_attr = DatasetAttr( - "hf_hub", dataset_name=dataset_info[name]["hf_hub_url"] - ) + dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"]) elif "script_url" in dataset_info[name]: - dataset_attr = DatasetAttr( - "script", dataset_name=dataset_info[name]["script_url"] - ) + dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"]) else: - dataset_attr = DatasetAttr( - "file", dataset_name=dataset_info[name]["file_name"] - ) + dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"]) dataset_attr.set_attr("file_sha1", dataset_info[name]) dataset_attr.set_attr("subset", dataset_info[name]) diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index dc72483f..1c8c64a6 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -1,6 +1,6 @@ from functools import partial from itertools import chain -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Tuple, Optional +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple from ..extras.constants import IGNORE_INDEX from ..extras.logging import get_logger @@ -9,7 +9,7 @@ from .utils import Role if TYPE_CHECKING: from transformers import Seq2SeqTrainingArguments - from transformers.tokenization_utils import PreTrainedTokenizer, AutoProcessor + from transformers.tokenization_utils import AutoProcessor, PreTrainedTokenizer from ..hparams import DataArguments from .template import Template @@ -24,22 +24,16 @@ def preprocess_pretrain_dataset( data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled - text_examples = [ - messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"] - ] + text_examples = [messages[0]["content"] + 
tokenizer.eos_token for messages in examples["prompt"]] if not data_args.packing: if data_args.template == "gemma": text_examples = [tokenizer.bos_token + example for example in text_examples] - result = tokenizer( - text_examples, add_special_tokens=False, max_length=data_args.cutoff_len - ) + result = tokenizer(text_examples, add_special_tokens=False, max_length=data_args.cutoff_len) else: tokenized_examples = tokenizer(text_examples, add_special_tokens=False) - concatenated_examples = { - k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys() - } + concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()} total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]]) block_size = data_args.cutoff_len total_length = (total_length // block_size) * block_size @@ -87,9 +81,7 @@ def preprocess_supervised_dataset( if data_args.train_on_prompt: source_mask = source_ids elif turn_idx != 0 and template.efficient_eos: - source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * ( - len(source_ids) - 1 - ) + source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) else: source_mask = [IGNORE_INDEX] * len(source_ids) @@ -128,9 +120,7 @@ def preprocess_packed_supervised_dataset( if data_args.train_on_prompt: source_mask = source_ids elif len(input_ids) != 0 and template.efficient_eos: - source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * ( - len(source_ids) - 1 - ) + source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) else: source_mask = [IGNORE_INDEX] * len(source_ids) @@ -190,9 +180,7 @@ def preprocess_multimodal_supervised_dataset( if data_args.train_on_prompt: source_mask = source_ids elif turn_idx != 0 and template.efficient_eos: - source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * ( - len(source_ids) - 1 - ) + source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) else: source_mask = [IGNORE_INDEX] * len(source_ids) @@ -206,9 +194,7 @@ def preprocess_multimodal_supervised_dataset( model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) - pixel_values = processor.image_processor( - examples["images"][0], return_tensors="pt" - )["pixel_values"][0] + pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0] model_inputs["pixel_values"].append(pixel_values) return model_inputs @@ -229,9 +215,7 @@ def preprocess_unsupervised_dataset( if len(examples["response"][i]) == 1: messages = examples["prompt"][i] + examples["response"][i] else: - messages = examples["prompt"][i] + [ - {"role": Role.ASSISTANT.value, "content": ""} - ] + messages = examples["prompt"][i] + [{"role": Role.ASSISTANT.value, "content": ""}] input_ids, labels = template.encode_oneturn( tokenizer, @@ -294,15 +278,9 @@ def preprocess_pairwise_dataset( return model_inputs -def print_supervised_dataset_example( - example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer" -) -> None: +def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: print("input_ids:\n{}".format(example["input_ids"])) - print( - "inputs:\n{}".format( - tokenizer.decode(example["input_ids"], skip_special_tokens=False) - ) - ) + print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) print("label_ids:\n{}".format(example["labels"])) print( "labels:\n{}".format( @@ -314,38 
+292,18 @@ def print_supervised_dataset_example( ) -def print_pairwise_dataset_example( - example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer" -) -> None: +def print_pairwise_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: print("prompt_ids:\n{}".format(example["prompt_ids"])) - print( - "prompt:\n{}".format( - tokenizer.decode(example["prompt_ids"], skip_special_tokens=False) - ) - ) + print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False))) print("chosen_ids:\n{}".format(example["chosen_ids"])) - print( - "chosen:\n{}".format( - tokenizer.decode(example["chosen_ids"], skip_special_tokens=False) - ) - ) + print("chosen:\n{}".format(tokenizer.decode(example["chosen_ids"], skip_special_tokens=False))) print("rejected_ids:\n{}".format(example["rejected_ids"])) - print( - "rejected:\n{}".format( - tokenizer.decode(example["rejected_ids"], skip_special_tokens=False) - ) - ) + print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False))) -def print_unsupervised_dataset_example( - example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer" -) -> None: +def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: print("input_ids:\n{}".format(example["input_ids"])) - print( - "inputs:\n{}".format( - tokenizer.decode(example["input_ids"], skip_special_tokens=False) - ) - ) + print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) def get_preprocess_and_print_func( @@ -357,12 +315,8 @@ def get_preprocess_and_print_func( processor: Optional["AutoProcessor"] = None, ) -> Tuple[Callable, Callable]: if stage == "pt": - preprocess_func = partial( - preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args - ) - print_function = partial( - print_unsupervised_dataset_example, tokenizer=tokenizer - ) + preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args) + print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) elif stage == "sft" and not training_args.predict_with_generate: if data_args.packing: preprocess_func = partial( @@ -402,8 +356,6 @@ def get_preprocess_and_print_func( template=template, data_args=data_args, ) - print_function = partial( - print_unsupervised_dataset_example, tokenizer=tokenizer - ) + print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) return preprocess_func, print_function diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index e6cdadd6..cf21e932 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -42,9 +42,7 @@ class Template: r""" Returns a single pair of token ids representing prompt and response respectively. """ - encoded_pairs = self._encode( - tokenizer, messages, system, tools, cutoff_len, reserved_label_len - ) + encoded_pairs = self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len) prompt_ids = [] for query_ids, resp_ids in encoded_pairs[:-1]: prompt_ids += query_ids + resp_ids @@ -64,9 +62,7 @@ class Template: r""" Returns multiple pairs of token ids representing prompts and responses respectively. 
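As shown in preprocess_multimodal_supervised_dataset above, the only multimodal addition to the tokenized features is a pixel_values tensor produced by the processor's image processor. A minimal sketch of that call in isolation follows; the checkpoint id and image path are placeholders.

    from PIL import Image
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")  # placeholder checkpoint
    image = Image.open("images/coco/train2017/000000000009.jpg").convert("RGB")  # placeholder path

    # The image processor alone yields the tensor stored next to input_ids/labels.
    pixel_values = processor.image_processor(image, return_tensors="pt")["pixel_values"][0]
    print(pixel_values.shape)  # e.g. torch.Size([3, 224, 224]) for BLIP-2's vision tower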
""" - return self._encode( - tokenizer, messages, system, tools, cutoff_len, reserved_label_len - ) + return self._encode(tokenizer, messages, system, tools, cutoff_len, reserved_label_len) def _encode( self, @@ -93,9 +89,7 @@ class Template: elements += self.format_separator.apply() if message["role"] == Role.USER.value: - elements += self.format_user.apply( - content=message["content"], idx=str(i // 2) - ) + elements += self.format_user.apply(content=message["content"], idx=str(i // 2)) elif message["role"] == Role.ASSISTANT.value: elements += self.format_assistant.apply(content=message["content"]) elif message["role"] == Role.OBSERVATION.value: @@ -130,11 +124,7 @@ class Template: elif "eos_token" in elem and tokenizer.eos_token_id is not None: token_ids += [tokenizer.eos_token_id] else: - raise ValueError( - "Input must be string, set[str] or dict[str, str], got {}".format( - type(elem) - ) - ) + raise ValueError("Input must be string, set[str] or dict[str, str], got {}".format(type(elem))) return token_ids @@ -192,9 +182,7 @@ class Llama2Template(Template): elements += self.format_separator.apply() if message["role"] == Role.USER.value: - elements += self.format_user.apply( - content=system_text + message["content"] - ) + elements += self.format_user.apply(content=system_text + message["content"]) elif message["role"] == Role.ASSISTANT.value: elements += self.format_assistant.apply(content=message["content"]) elif message["role"] == Role.OBSERVATION.value: @@ -257,9 +245,7 @@ def _register_template( template_class = Llama2Template if name.startswith("llama2") else Template default_user_formatter = StringFormatter(slots=["{{content}}"]) default_assistant_formatter = StringFormatter(slots=["{{content}}"] + eos_slots) - default_function_formatter = FunctionFormatter( - slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots - ) + default_function_formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots) default_tool_formatter = ToolFormatter(tool_format="default") default_separator_formatter = EmptyFormatter() templates[name] = template_class( @@ -295,9 +281,7 @@ def _jinja_escape(content: str) -> str: return content.replace("\n", r"\n").replace("'", r"\'") -def _convert_slots_to_jinja( - slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content" -) -> str: +def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str: slot_items = [] for slot in slots: if isinstance(slot, str): @@ -311,9 +295,7 @@ def _convert_slots_to_jinja( elif isinstance(slot, set): if "bos_token" in slot: slot_items.append("'" + tokenizer.bos_token + "'") - elif ( - "eos_token" in slot - ): # do not use {{ eos_token }} since it may be replaced + elif "eos_token" in slot: # do not use {{ eos_token }} since it may be replaced slot_items.append("'" + tokenizer.eos_token + "'") elif isinstance(slot, dict): raise ValueError("Dict is not supported.") @@ -325,37 +307,25 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") jinja_template = "" if template.default_system: - jinja_template += ( - "{% set system_message = '" - + _jinja_escape(template.default_system) - + "' %}" - ) + jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}" jinja_template += ( - "{% if messages[0]['role'] == 'system' %}" - "{% set system_message = messages[0]['content'] %}" - "{% endif %}" + "{% if messages[0]['role'] == 'system' %}" "{% set 
system_message = messages[0]['content'] %}" "{% endif %}" ) - system_message = _convert_slots_to_jinja( - template.format_system.apply(), tokenizer, placeholder="system_message" - ) + system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message") if isinstance(template, Llama2Template): pass elif template.force_system: jinja_template += "{{ " + system_message + " }}" else: - jinja_template += ( - "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}" - ) + jinja_template += "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}" jinja_template += "{% for message in messages %}" jinja_template += "{% set content = message['content'] %}" if isinstance(template, Llama2Template): jinja_template += "{% if loop.index0 == 0 and system_message is defined %}" - jinja_template += ( - "{% set content = " + system_message + " + message['content'] %}" - ) + jinja_template += "{% set content = " + system_message + " + message['content'] %}" jinja_template += "{% endif %}" jinja_template += "{% if message['role'] == 'user' %}" user_message = _convert_slots_to_jinja(template.format_user.apply(), tokenizer) @@ -403,9 +373,7 @@ def get_template_and_fix_tokenizer( ) logger.info("Add {} to stop words.".format(",".join(stop_words))) if num_added_tokens > 0: - logger.warning( - "New tokens have been added, make sure `resize_vocab` is True." - ) + logger.warning("New tokens have been added, make sure `resize_vocab` is True.") try: tokenizer.chat_template = _get_jinja_template(template, tokenizer) @@ -417,9 +385,7 @@ def get_template_and_fix_tokenizer( _register_template( name="alpaca", - format_user=StringFormatter( - slots=["### Instruction:\n{{content}}\n\n### Response:\n"] - ), + format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]), format_separator=EmptyFormatter(slots=["\n\n"]), default_system=( "Below is an instruction that describes a task. 
" @@ -458,9 +424,7 @@ _register_template( _register_template( name="baichuan", - format_user=StringFormatter( - slots=[{"token": ""}, "{{content}}", {"token": ""}] - ), + format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]), efficient_eos=True, ) @@ -483,9 +447,7 @@ _register_template( _register_template( name="bluelm", - format_user=StringFormatter( - slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}] - ), + format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]), ) @@ -504,9 +466,7 @@ _register_template( _register_template( name="chatglm2", format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]), - format_system=StringFormatter( - slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"] - ), + format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), format_separator=EmptyFormatter(slots=["\n\n"]), efficient_eos=True, force_system=True, @@ -515,13 +475,9 @@ _register_template( _register_template( name="chatglm3", - format_user=StringFormatter( - slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] - ), + format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), - format_system=StringFormatter( - slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"] - ), + format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter( slots=[ @@ -539,9 +495,7 @@ _register_template( _register_template( name="chatglm3_system", - format_user=StringFormatter( - slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] - ), + format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), format_system=StringFormatter( slots=[ @@ -572,15 +526,9 @@ _register_template( _register_template( name="chatml", - format_user=StringFormatter( - slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), - format_system=StringFormatter( - slots=["<|im_start|>system\n{{content}}<|im_end|>\n"] - ), - format_observation=StringFormatter( - slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_separator=EmptyFormatter(slots=["\n"]), stop_words=["<|im_end|>", "<|im_start|>"], replace_eos=True, @@ -589,15 +537,9 @@ _register_template( _register_template( name="chatml_de", - format_user=StringFormatter( - slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), - format_system=StringFormatter( - slots=["<|im_start|>system\n{{content}}<|im_end|>\n"] - ), - format_observation=StringFormatter( - slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + 
format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_separator=EmptyFormatter(slots=["\n"]), default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.", stop_words=["<|im_end|>", "<|im_start|>"], @@ -607,9 +549,7 @@ _register_template( _register_template( name="codegeex2", - format_system=StringFormatter( - slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"] - ), + format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), force_system=True, ) @@ -639,15 +579,9 @@ _register_template( _register_template( name="dbrx", - format_user=StringFormatter( - slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), - format_system=StringFormatter( - slots=["<|im_start|>system\n{{content}}<|im_end|>\n"] - ), - format_observation=StringFormatter( - slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_separator=EmptyFormatter(slots=["\n"]), default_system=( "You are DBRX, created by Databricks. You were last updated in December 2023. " @@ -725,9 +659,7 @@ _register_template( _register_template( name="gemma", - format_user=StringFormatter( - slots=["user\n{{content}}\nmodel\n"] - ), + format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), format_observation=StringFormatter( slots=["tool\n{{content}}\nmodel\n"] @@ -740,9 +672,7 @@ _register_template( _register_template( name="intern", - format_user=StringFormatter( - slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"] - ), + format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]), format_separator=EmptyFormatter(slots=[{"token": ""}, "\n"]), stop_words=[""], efficient_eos=True, @@ -751,12 +681,8 @@ _register_template( _register_template( name="intern2", - format_user=StringFormatter( - slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), - format_system=StringFormatter( - slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"] - ), + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]), format_separator=EmptyFormatter(slots=["\n"]), default_system=( "You are an AI assistant whose name is InternLM (书生·浦语).\n" @@ -859,9 +785,7 @@ _register_template( _register_template( name="orion", - format_user=StringFormatter( - slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}] - ), + format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), force_system=True, ) @@ -869,15 +793,9 @@ _register_template( _register_template( name="phi", - format_user=StringFormatter( - slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"] - ), - format_system=StringFormatter( - slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"] - ), - format_observation=StringFormatter( - slots=["<|function_output|>\n{{content}}<|end|>\n<|assistant|>\n"] - ), + 
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]), + format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]), + format_observation=StringFormatter(slots=["<|function_output|>\n{{content}}<|end|>\n<|assistant|>\n"]), format_separator=EmptyFormatter(slots=["\n"]), default_system="You are a helpful AI assistant.", stop_words=["<|end|>"], @@ -887,15 +805,9 @@ _register_template( _register_template( name="qwen", - format_user=StringFormatter( - slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), - format_system=StringFormatter( - slots=["<|im_start|>system\n{{content}}<|im_end|>\n"] - ), - format_observation=StringFormatter( - slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_separator=EmptyFormatter(slots=["\n"]), default_system="You are a helpful assistant.", stop_words=["<|im_end|>"], @@ -951,12 +863,8 @@ _register_template( _register_template( name="yayi", - format_user=StringFormatter( - slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"] - ), - format_system=StringFormatter( - slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"] - ), + format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]), + format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]), format_separator=EmptyFormatter(slots=["\n\n"]), default_system=( "You are a helpful, respectful and honest assistant named YaYi " @@ -975,9 +883,7 @@ _register_template( _register_template( name="yi", - format_user=StringFormatter( - slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"] - ), + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_separator=EmptyFormatter(slots=["\n"]), stop_words=["<|im_end|>"], replace_eos=True, @@ -995,9 +901,7 @@ _register_template( _register_template( name="zephyr", - format_user=StringFormatter( - slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"] - ), + format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]), format_assistant=StringFormatter(slots=["\n{{content}}", {"eos_token"}]), format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]), default_system="You are a friendly chatbot who always responds in the style of a pirate", diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index 66ac93cf..df1a5ec0 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -15,33 +15,23 @@ class ModelArguments: ) adapter_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": "Path to the adapter weight or identifier from huggingface.co/models." - }, + metadata={"help": "Path to the adapter weight or identifier from huggingface.co/models."}, ) cache_dir: Optional[str] = field( default=None, - metadata={ - "help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn." 
- }, + metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."}, ) use_fast_tokenizer: bool = field( default=True, - metadata={ - "help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)." - }, + metadata={"help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."}, ) resize_vocab: bool = field( default=False, - metadata={ - "help": "Whether or not to resize the tokenizer vocab and the embedding layers." - }, + metadata={"help": "Whether or not to resize the tokenizer vocab and the embedding layers."}, ) split_special_tokens: bool = field( default=False, - metadata={ - "help": "Whether or not the special tokens should be split during the tokenization process." - }, + metadata={"help": "Whether or not the special tokens should be split during the tokenization process."}, ) new_special_tokens: Optional[str] = field( default=None, @@ -49,9 +39,7 @@ class ModelArguments: ) model_revision: str = field( default="main", - metadata={ - "help": "The specific model version to use (can be a branch name, tag name or commit id)." - }, + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) low_cpu_mem_usage: bool = field( default=True, @@ -59,9 +47,7 @@ class ModelArguments: ) quantization_bit: Optional[int] = field( default=None, - metadata={ - "help": "The number of bits to quantize the model using bitsandbytes." - }, + metadata={"help": "The number of bits to quantize the model using bitsandbytes."}, ) quantization_type: Literal["fp4", "nf4"] = field( default="nf4", @@ -69,21 +55,15 @@ class ModelArguments: ) double_quantization: bool = field( default=True, - metadata={ - "help": "Whether or not to use double quantization in int4 training." - }, + metadata={"help": "Whether or not to use double quantization in int4 training."}, ) quantization_device_map: Optional[Literal["auto"]] = field( default=None, - metadata={ - "help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0." - }, + metadata={"help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0."}, ) rope_scaling: Optional[Literal["linear", "dynamic"]] = field( default=None, - metadata={ - "help": "Which scaling strategy should be adopted for the RoPE embeddings." - }, + metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."}, ) flash_attn: Literal["off", "sdpa", "fa2", "auto"] = field( default="auto", @@ -91,27 +71,19 @@ class ModelArguments: ) shift_attn: bool = field( default=False, - metadata={ - "help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA." - }, + metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."}, ) mixture_of_depths: Optional[Literal["convert", "load"]] = field( default=None, - metadata={ - "help": "Convert the model to mixture-of-depths (MoD) or load the MoD model." - }, + metadata={"help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."}, ) use_unsloth: bool = field( default=False, - metadata={ - "help": "Whether or not to use unsloth's optimization for the LoRA training." - }, + metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."}, ) moe_aux_loss_coef: Optional[float] = field( default=None, - metadata={ - "help": "Coefficient of the auxiliary router loss in mixture-of-experts model." 
- }, + metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."}, ) disable_gradient_checkpointing: bool = field( default=False, @@ -135,9 +107,7 @@ class ModelArguments: ) vllm_gpu_util: float = field( default=0.9, - metadata={ - "help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine." - }, + metadata={"help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine."}, ) vllm_enforce_eager: bool = field( default=False, @@ -177,9 +147,7 @@ class ModelArguments: ) export_quantization_dataset: Optional[str] = field( default=None, - metadata={ - "help": "Path to the dataset or dataset name to use in quantizing the exported model." - }, + metadata={"help": "Path to the dataset or dataset name to use in quantizing the exported model."}, ) export_quantization_nsamples: int = field( default=128, @@ -187,27 +155,19 @@ class ModelArguments: ) export_quantization_maxlen: int = field( default=1024, - metadata={ - "help": "The maximum length of the model inputs used for quantization." - }, + metadata={"help": "The maximum length of the model inputs used for quantization."}, ) export_legacy_format: bool = field( default=False, - metadata={ - "help": "Whether or not to save the `.bin` files instead of `.safetensors`." - }, + metadata={"help": "Whether or not to save the `.bin` files instead of `.safetensors`."}, ) export_hub_model_id: Optional[str] = field( default=None, - metadata={ - "help": "The name of the repository if push the model to the Hugging Face hub." - }, + metadata={"help": "The name of the repository if push the model to the Hugging Face hub."}, ) print_param_status: bool = field( default=False, - metadata={ - "help": "For debugging purposes, print the status of the parameters in the model." - }, + metadata={"help": "For debugging purposes, print the status of the parameters in the model."}, ) use_mllm: bool = field( default=False, @@ -220,21 +180,13 @@ class ModelArguments: self.model_max_length = None if self.split_special_tokens and self.use_fast_tokenizer: - raise ValueError( - "`split_special_tokens` is only supported for slow tokenizers." - ) + raise ValueError("`split_special_tokens` is only supported for slow tokenizers.") - if ( - self.adapter_name_or_path is not None - ): # support merging multiple lora weights - self.adapter_name_or_path = [ - path.strip() for path in self.adapter_name_or_path.split(",") - ] + if self.adapter_name_or_path is not None: # support merging multiple lora weights + self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")] if self.new_special_tokens is not None: # support multiple special tokens - self.new_special_tokens = [ - token.strip() for token in self.new_special_tokens.split(",") - ] + self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] assert self.quantization_bit in [ None, @@ -249,10 +201,7 @@ class ModelArguments: 2, ], "We only accept 2/3/4/8-bit quantization." 
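The new use_mllm flag added to ModelArguments is what later gates loading an AutoProcessor next to the tokenizer. The sketch below constructs the argument object directly instead of going through the CLI parser; the checkpoint path is a placeholder and the 8-bit setting simply mirrors the example scripts.

    from llmtuner.hparams import ModelArguments

    model_args = ModelArguments(
        model_name_or_path="Salesforce/blip2-opt-2.7b",  # placeholder checkpoint
        use_mllm=True,       # also load an AutoProcessor alongside the tokenizer
        quantization_bit=8,  # mirrors the example scripts; validated in __post_init__
    )
    print(model_args.use_mllm, model_args.quantization_bit)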
- if ( - self.export_quantization_bit is not None - and self.export_quantization_dataset is None - ): + if self.export_quantization_bit is not None and self.export_quantization_dataset is None: raise ValueError("Quantization dataset is necessary for exporting.") def to_dict(self) -> Dict[str, Any]: diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py index db81e1dc..1824f084 100644 --- a/src/llmtuner/model/__init__.py +++ b/src/llmtuner/model/__init__.py @@ -1,6 +1,7 @@ from .loader import load_config, load_model, load_tokenizer from .utils.misc import find_all_linear_modules, load_valuehead_params + __all__ = [ "load_config", "load_model", diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py index e65798b7..f3db4d1e 100644 --- a/src/llmtuner/model/adapter.py +++ b/src/llmtuner/model/adapter.py @@ -38,9 +38,7 @@ def init_adapter( logger.info("Adapter is not found at evaluation, load the base model.") return model - if finetuning_args.finetuning_type != "lora" and getattr( - model, "quantization_method", None - ): + if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None): raise ValueError("You can only use lora for quantized models.") if finetuning_args.finetuning_type == "full" and is_trainable: @@ -68,12 +66,8 @@ def init_adapter( stride = num_layers // finetuning_args.num_layer_trainable trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride) - elif ( - finetuning_args.num_layer_trainable > 0 - ): # fine-tuning the last n layers if num_layer_trainable > 0 - trainable_layer_ids = range( - num_layers - finetuning_args.num_layer_trainable, num_layers - ) + elif finetuning_args.num_layer_trainable > 0: # fine-tuning the last n layers if num_layer_trainable > 0 + trainable_layer_ids = range(num_layers - finetuning_args.num_layer_trainable, num_layers) else: # fine-tuning the first n layers if num_layer_trainable < 0 trainable_layer_ids = range(-finetuning_args.num_layer_trainable) @@ -88,15 +82,11 @@ def init_adapter( for module_name in finetuning_args.name_module_trainable: if module_name not in freeze_modules: raise ValueError( - "Module {} is not found, please choose from {}".format( - module_name, ", ".join(freeze_modules) - ) + "Module {} is not found, please choose from {}".format(module_name, ", ".join(freeze_modules)) ) for idx in trainable_layer_ids: - trainable_layers.append( - ".{:d}.{}".format(idx, module_name if module_name != "all" else "") - ) + trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else "")) for name, param in model.named_parameters(): if any(trainable_layer in name for trainable_layer in trainable_layers): @@ -105,43 +95,27 @@ def init_adapter( else: param.requires_grad_(False) - logger.info( - "Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids))) - ) + logger.info("Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids)))) if finetuning_args.finetuning_type == "lora": - logger.info( - "Fine-tuning method: {}".format( - "DoRA" if finetuning_args.use_dora else "LoRA" - ) - ) + logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA")) adapter_to_resume = None if model_args.adapter_name_or_path is not None: is_mergeable = True - if getattr( - model, "quantization_method", None - ): # merge lora in quantized model is unstable - assert ( - len(model_args.adapter_name_or_path) == 1 - ), "Quantized model only accepts a single adapter." 
+ if getattr(model, "quantization_method", None): # merge lora in quantized model is unstable + assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter." is_mergeable = False if is_deepspeed_zero3_enabled(): - assert ( - len(model_args.adapter_name_or_path) == 1 - ), "Cannot use multiple adapters in DeepSpeed ZeRO-3." + assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3." is_mergeable = False if model_args.use_unsloth: - assert ( - len(model_args.adapter_name_or_path) == 1 - ), "Unsloth model only accepts a single adapter." + assert len(model_args.adapter_name_or_path) == 1, "Unsloth model only accepts a single adapter." is_mergeable = False - if (is_trainable and not finetuning_args.create_new_adapter) or ( - not is_mergeable - ): + if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable): adapter_to_merge = model_args.adapter_name_or_path[:-1] adapter_to_resume = model_args.adapter_name_or_path[-1] else: @@ -158,9 +132,7 @@ def init_adapter( if adapter_to_resume is not None: # resume lora training if model_args.use_unsloth: - model = load_unsloth_peft_model( - config, model_args, is_trainable=is_trainable - ) + model = load_unsloth_peft_model(config, model_args, is_trainable=is_trainable) else: model = PeftModel.from_pretrained( model, @@ -169,27 +141,19 @@ def init_adapter( offload_folder=model_args.offload_folder, ) - if ( - is_trainable and adapter_to_resume is None - ): # create new lora weights while training - if ( - len(finetuning_args.lora_target) == 1 - and finetuning_args.lora_target[0] == "all" - ): + if is_trainable and adapter_to_resume is None: # create new lora weights while training + if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": target_modules = find_all_linear_modules(model) else: target_modules = finetuning_args.lora_target if finetuning_args.use_llama_pro: - target_modules = find_expanded_modules( - model, target_modules, finetuning_args.num_layer_trainable - ) + target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable) if ( finetuning_args.use_dora and getattr(model, "quantization_method", None) is not None - and getattr(model, "quantization_method", None) - != QuantizationMethod.BITS_AND_BYTES + and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES ): raise ValueError("DoRA is not compatible with PTQ-quantized models.") @@ -202,11 +166,7 @@ def init_adapter( module_names.add(name.split(".")[-1]) finetuning_args.additional_target = module_names - logger.warning( - "Vocab has been resized, add {} to trainable params.".format( - ",".join(module_names) - ) - ) + logger.warning("Vocab has been resized, add {} to trainable params.".format(",".join(module_names))) peft_kwargs = { "r": finetuning_args.lora_rank, @@ -233,10 +193,6 @@ def init_adapter( param.data = param.data.to(torch.float32) if model_args.adapter_name_or_path is not None: - logger.info( - "Loaded adapter(s): {}".format( - ",".join(model_args.adapter_name_or_path) - ) - ) + logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) return model diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index 99ad9adc..47298673 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -3,9 +3,9 @@ from typing import TYPE_CHECKING, Any, Dict, Union from transformers import ( AutoConfig, AutoModelForCausalLM, - 
AutoTokenizer, - AutoProcessor, AutoModelForVision2Seq, + AutoProcessor, + AutoTokenizer, ) from trl import AutoModelForCausalLMWithValueHead @@ -17,6 +17,7 @@ from .utils.misc import load_valuehead_params, register_autoclass from .utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model from .utils.unsloth import load_unsloth_pretrained_model + if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer @@ -42,7 +43,7 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]: def load_tokenizer( model_args: "ModelArguments", -) -> Dict[str, Union["PreTrainedTokenizer", "AutoProcesser"]]: +) -> Dict[str, Union["PreTrainedTokenizer", "AutoProcessor"]]: r""" Loads pretrained tokenizer. @@ -70,14 +71,10 @@ def load_tokenizer( dict(additional_special_tokens=model_args.new_special_tokens), replace_additional_special_tokens=False, ) - logger.info( - "Add {} to special tokens.".format(",".join(model_args.new_special_tokens)) - ) + logger.info("Add {} to special tokens.".format(",".join(model_args.new_special_tokens))) if num_added_tokens > 0 and not model_args.resize_vocab: model_args.resize_vocab = True - logger.warning( - "New tokens have been added, changed `resize_vocab` to True." - ) + logger.warning("New tokens have been added, changed `resize_vocab` to True.") patch_tokenizer(tokenizer) tokenizer_modules = {"tokenizer": tokenizer, "processor": None} @@ -174,10 +171,8 @@ def load_model( trainable_params, all_param = count_parameters(model) if is_trainable: - param_stats = ( - "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( - trainable_params, all_param, 100 * trainable_params / all_param - ) + param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( + trainable_params, all_param, 100 * trainable_params / all_param ) else: param_stats = "all params: {:d}".format(all_param) diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py index 6f887810..50833a99 100644 --- a/src/llmtuner/train/sft/workflow.py +++ b/src/llmtuner/train/sft/workflow.py @@ -50,29 +50,17 @@ def run_sft( tokenizer.padding_side = "left" # use left-padding in generation if getattr(model, "is_quantized", False) and not training_args.do_train: - setattr( - model, "_hf_peft_config_loaded", True - ) # hack here: make model compatible with prediction + setattr(model, "_hf_peft_config_loaded", True) # hack here: make model compatible with prediction data_collator = DataCollatorForSeq2Seq( tokenizer=tokenizer, - pad_to_multiple_of=( - 8 if tokenizer.padding_side == "right" else None - ), # for shift short attention - label_pad_token_id=( - IGNORE_INDEX - if data_args.ignore_pad_token_for_loss - else tokenizer.pad_token_id - ), + pad_to_multiple_of=(8 if tokenizer.padding_side == "right" else None), # for shift short attention + label_pad_token_id=(IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id), ) # Override the decoding parameters of Seq2SeqTrainer - training_args.generation_max_length = ( - training_args.generation_max_length or data_args.cutoff_len - ) - training_args.generation_num_beams = ( - data_args.eval_num_beams or training_args.generation_num_beams - ) + training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len + training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams if model_args.use_mllm: training_args.remove_unused_columns = False @@ -84,25 +72,19 
@@ def run_sft( tokenizer=tokenizer, data_collator=data_collator, callbacks=callbacks, - compute_metrics=( - ComputeMetrics(tokenizer) if training_args.predict_with_generate else None - ), + compute_metrics=(ComputeMetrics(tokenizer) if training_args.predict_with_generate else None), **split_dataset(dataset, data_args, training_args), ) # Keyword arguments for `model.generate` gen_kwargs = generating_args.to_dict() - gen_kwargs["eos_token_id"] = [ - tokenizer.eos_token_id - ] + tokenizer.additional_special_tokens_ids + gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids gen_kwargs["pad_token_id"] = tokenizer.pad_token_id gen_kwargs["logits_processor"] = get_logits_processor() # Training if training_args.do_train: - train_result = trainer.train( - resume_from_checkpoint=training_args.resume_from_checkpoint - ) + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) trainer.save_model() trainer.log_metrics("train", train_result.metrics) trainer.save_metrics("train", train_result.metrics) @@ -113,27 +95,19 @@ def run_sft( # Evaluation if training_args.do_eval: metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs) - if ( - training_args.predict_with_generate - ): # eval_loss will be wrong if predict_with_generate is enabled + if training_args.predict_with_generate: # eval_loss will be wrong if predict_with_generate is enabled metrics.pop("eval_loss", None) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Predict if training_args.do_predict: - predict_results = trainer.predict( - dataset, metric_key_prefix="predict", **gen_kwargs - ) - if ( - training_args.predict_with_generate - ): # predict_loss will be wrong if predict_with_generate is enabled + predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs) + if training_args.predict_with_generate: # predict_loss will be wrong if predict_with_generate is enabled predict_results.metrics.pop("predict_loss", None) trainer.log_metrics("predict", predict_results.metrics) trainer.save_metrics("predict", predict_results.metrics) trainer.save_predictions(predict_results) # Create model card - create_modelcard_and_push( - trainer, model_args, data_args, training_args, finetuning_args - ) + create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/src/llmtuner/train/sftmm/__init__.py b/src/llmtuner/train/sftmm/__init__.py new file mode 100644 index 00000000..9ebdf821 --- /dev/null +++ b/src/llmtuner/train/sftmm/__init__.py @@ -0,0 +1,4 @@ +from .workflow import run_sft_mm + + +__all__ = ["run_sft_mm"] diff --git a/src/llmtuner/train/sftmm/metric.py b/src/llmtuner/train/sftmm/metric.py new file mode 100644 index 00000000..d1af4c17 --- /dev/null +++ b/src/llmtuner/train/sftmm/metric.py @@ -0,0 +1,61 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union + +import numpy as np + +from ...extras.constants import IGNORE_INDEX +from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available + + +if TYPE_CHECKING: + from transformers.tokenization_utils import PreTrainedTokenizer + +if is_jieba_available(): + import jieba # type: ignore + +if is_nltk_available(): + from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu + +if is_rouge_available(): + from rouge_chinese import Rouge + + +@dataclass +class ComputeMetrics: + r""" + Wraps the tokenizer into metric functions, used in 
Seq2SeqPeftTrainer. + """ + + tokenizer: "PreTrainedTokenizer" + + def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]: + r""" + Uses the model predictions to compute metrics. + """ + preds, labels = eval_preds + score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []} + + preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id) + labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id) + + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + + for pred, label in zip(decoded_preds, decoded_labels): + hypothesis = list(jieba.cut(pred)) + reference = list(jieba.cut(label)) + + if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0: + result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}} + else: + rouge = Rouge() + scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference)) + result = scores[0] + + for k, v in result.items(): + score_dict[k].append(round(v["f"] * 100, 4)) + + bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) + score_dict["bleu-4"].append(round(bleu_score * 100, 4)) + + return {k: float(np.mean(v)) for k, v in score_dict.items()} diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py new file mode 100644 index 00000000..270e7169 --- /dev/null +++ b/src/llmtuner/train/sftmm/trainer.py @@ -0,0 +1,39 @@ +from types import MethodType +from typing import TYPE_CHECKING, Optional + +import torch +from transformers import Seq2SeqTrainer + +from ...extras.logging import get_logger +from ..utils import create_custom_optimzer, create_custom_scheduler + + +if TYPE_CHECKING: + from ...hparams import FinetuningArguments + +logger = get_logger(__name__) + + +class CustomSeq2SeqTrainer(Seq2SeqTrainer): + r""" + Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE. 
+ """ + + def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None: + super().__init__(**kwargs) + self.finetuning_args = finetuning_args + if finetuning_args.use_badam: + from badam import clip_grad_norm_for_sparse_tensor + + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + + def create_optimizer(self) -> "torch.optim.Optimizer": + if self.optimizer is None: + self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) + return super().create_optimizer() + + def create_scheduler( + self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None + ) -> "torch.optim.lr_scheduler.LRScheduler": + create_custom_scheduler(self.args, num_training_steps, optimizer) + return super().create_scheduler(num_training_steps, optimizer) diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py new file mode 100644 index 00000000..dbda2d05 --- /dev/null +++ b/src/llmtuner/train/sftmm/workflow.py @@ -0,0 +1,101 @@ +# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py +from typing import TYPE_CHECKING, List, Optional + +from transformers import DataCollatorForSeq2Seq + +from ...data import get_dataset +from ...extras.constants import IGNORE_INDEX +from ...extras.misc import get_logits_processor +from ...extras.ploting import plot_loss +from ...model import load_model, load_processor +from ..sft.metric import ComputeMetrics +from ..utils import create_modelcard_and_push +from .trainer import CustomSeq2SeqTrainer + + +if TYPE_CHECKING: + from transformers import Seq2SeqTrainingArguments, TrainerCallback + + from ...hparams import ( + DataArguments, + FinetuningArguments, + GeneratingArguments, + ModelArguments, + ) + + +def run_sft_mm( + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + generating_args: "GeneratingArguments", + callbacks: Optional[List["TrainerCallback"]] = None, +): + processor = load_processor(model_args) + tokenizer = processor.tokenizer + dataset = get_dataset(tokenizer, model_args, data_args, training_args, "sft", processor) + model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) + if getattr(model, "is_quantized", False) and not training_args.do_train: + setattr(model, "_hf_peft_config_loaded", True) # hack here: make model compatible with prediction + train_dataset = dataset + eval_dataset = dataset + data_collator = DataCollatorForSeq2Seq( + tokenizer=tokenizer, + pad_to_multiple_of=(8 if tokenizer.padding_side == "right" else None), # for shift short attention + label_pad_token_id=(IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id), + ) + + # Override the decoding parameters of Seq2SeqTrainer + training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len + training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams + training_args.remove_unused_columns = False + + # Initialize our Trainer + trainer = CustomSeq2SeqTrainer( + model=model, + args=training_args, + finetuning_args=finetuning_args, + tokenizer=tokenizer, + data_collator=data_collator, + callbacks=callbacks, + compute_metrics=(ComputeMetrics(tokenizer) if training_args.predict_with_generate else None), + train_dataset=train_dataset, + eval_dataset=eval_dataset, + ) + + # Keyword 
arguments for `model.generate` + gen_kwargs = generating_args.to_dict() + gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids + gen_kwargs["pad_token_id"] = tokenizer.pad_token_id + gen_kwargs["logits_processor"] = get_logits_processor() + + # Training + if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + if trainer.is_world_process_zero() and finetuning_args.plot_loss: + plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + + # Evaluation + if training_args.do_eval: + metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs) + if training_args.predict_with_generate: # eval_loss will be wrong if predict_with_generate is enabled + metrics.pop("eval_loss", None) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Predict + if training_args.do_predict: + predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs) + if training_args.predict_with_generate: # predict_loss will be wrong if predict_with_generate is enabled + predict_results.metrics.pop("predict_loss", None) + trainer.log_metrics("predict", predict_results.metrics) + trainer.save_metrics("predict", predict_results.metrics) + trainer.save_predictions(predict_results) + + # Create model card + create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py index 5f691225..e1999946 100644 --- a/src/llmtuner/train/tuner.py +++ b/src/llmtuner/train/tuner.py @@ -15,6 +15,7 @@ from .pt import run_pt from .rm import run_rm from .sft import run_sft + if TYPE_CHECKING: from transformers import TrainerCallback From 110c2ce2a522c7225f952c3cae6a5035569c0a86 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 21:27:48 +0800 Subject: [PATCH 13/29] modify style Former-commit-id: 3bffc1e1b8bcc4582cebea06d35e5146163c7bec --- src/llmtuner/hparams/model_args.py | 14 +--- src/llmtuner/train/sftmm/__init__.py | 4 -- src/llmtuner/train/sftmm/metric.py | 61 ---------------- src/llmtuner/train/sftmm/trainer.py | 39 ----------- src/llmtuner/train/sftmm/workflow.py | 101 --------------------------- 5 files changed, 2 insertions(+), 217 deletions(-) delete mode 100644 src/llmtuner/train/sftmm/__init__.py delete mode 100644 src/llmtuner/train/sftmm/metric.py delete mode 100644 src/llmtuner/train/sftmm/trainer.py delete mode 100644 src/llmtuner/train/sftmm/workflow.py diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index df1a5ec0..97b908e4 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -188,18 +188,8 @@ class ModelArguments: if self.new_special_tokens is not None: # support multiple special tokens self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] - assert self.quantization_bit in [ - None, - 8, - 4, - ], "We only accept 4-bit or 8-bit quantization." - assert self.export_quantization_bit in [ - None, - 8, - 4, - 3, - 2, - ], "We only accept 2/3/4/8-bit quantization." + assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." + assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization." 
if self.export_quantization_bit is not None and self.export_quantization_dataset is None: raise ValueError("Quantization dataset is necessary for exporting.") diff --git a/src/llmtuner/train/sftmm/__init__.py b/src/llmtuner/train/sftmm/__init__.py deleted file mode 100644 index 9ebdf821..00000000 --- a/src/llmtuner/train/sftmm/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .workflow import run_sft_mm - - -__all__ = ["run_sft_mm"] diff --git a/src/llmtuner/train/sftmm/metric.py b/src/llmtuner/train/sftmm/metric.py deleted file mode 100644 index d1af4c17..00000000 --- a/src/llmtuner/train/sftmm/metric.py +++ /dev/null @@ -1,61 +0,0 @@ -from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union - -import numpy as np - -from ...extras.constants import IGNORE_INDEX -from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available - - -if TYPE_CHECKING: - from transformers.tokenization_utils import PreTrainedTokenizer - -if is_jieba_available(): - import jieba # type: ignore - -if is_nltk_available(): - from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu - -if is_rouge_available(): - from rouge_chinese import Rouge - - -@dataclass -class ComputeMetrics: - r""" - Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer. - """ - - tokenizer: "PreTrainedTokenizer" - - def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]: - r""" - Uses the model predictions to compute metrics. - """ - preds, labels = eval_preds - score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []} - - preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id) - labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id) - - decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) - decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) - - for pred, label in zip(decoded_preds, decoded_labels): - hypothesis = list(jieba.cut(pred)) - reference = list(jieba.cut(label)) - - if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0: - result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}} - else: - rouge = Rouge() - scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference)) - result = scores[0] - - for k, v in result.items(): - score_dict[k].append(round(v["f"] * 100, 4)) - - bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) - score_dict["bleu-4"].append(round(bleu_score * 100, 4)) - - return {k: float(np.mean(v)) for k, v in score_dict.items()} diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py deleted file mode 100644 index 270e7169..00000000 --- a/src/llmtuner/train/sftmm/trainer.py +++ /dev/null @@ -1,39 +0,0 @@ -from types import MethodType -from typing import TYPE_CHECKING, Optional - -import torch -from transformers import Seq2SeqTrainer - -from ...extras.logging import get_logger -from ..utils import create_custom_optimzer, create_custom_scheduler - - -if TYPE_CHECKING: - from ...hparams import FinetuningArguments - -logger = get_logger(__name__) - - -class CustomSeq2SeqTrainer(Seq2SeqTrainer): - r""" - Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE. 
- """ - - def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None: - super().__init__(**kwargs) - self.finetuning_args = finetuning_args - if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - def create_optimizer(self) -> "torch.optim.Optimizer": - if self.optimizer is None: - self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) - return super().create_optimizer() - - def create_scheduler( - self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None - ) -> "torch.optim.lr_scheduler.LRScheduler": - create_custom_scheduler(self.args, num_training_steps, optimizer) - return super().create_scheduler(num_training_steps, optimizer) diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py deleted file mode 100644 index dbda2d05..00000000 --- a/src/llmtuner/train/sftmm/workflow.py +++ /dev/null @@ -1,101 +0,0 @@ -# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py -from typing import TYPE_CHECKING, List, Optional - -from transformers import DataCollatorForSeq2Seq - -from ...data import get_dataset -from ...extras.constants import IGNORE_INDEX -from ...extras.misc import get_logits_processor -from ...extras.ploting import plot_loss -from ...model import load_model, load_processor -from ..sft.metric import ComputeMetrics -from ..utils import create_modelcard_and_push -from .trainer import CustomSeq2SeqTrainer - - -if TYPE_CHECKING: - from transformers import Seq2SeqTrainingArguments, TrainerCallback - - from ...hparams import ( - DataArguments, - FinetuningArguments, - GeneratingArguments, - ModelArguments, - ) - - -def run_sft_mm( - model_args: "ModelArguments", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - finetuning_args: "FinetuningArguments", - generating_args: "GeneratingArguments", - callbacks: Optional[List["TrainerCallback"]] = None, -): - processor = load_processor(model_args) - tokenizer = processor.tokenizer - dataset = get_dataset(tokenizer, model_args, data_args, training_args, "sft", processor) - model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) - if getattr(model, "is_quantized", False) and not training_args.do_train: - setattr(model, "_hf_peft_config_loaded", True) # hack here: make model compatible with prediction - train_dataset = dataset - eval_dataset = dataset - data_collator = DataCollatorForSeq2Seq( - tokenizer=tokenizer, - pad_to_multiple_of=(8 if tokenizer.padding_side == "right" else None), # for shift short attention - label_pad_token_id=(IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id), - ) - - # Override the decoding parameters of Seq2SeqTrainer - training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len - training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams - training_args.remove_unused_columns = False - - # Initialize our Trainer - trainer = CustomSeq2SeqTrainer( - model=model, - args=training_args, - finetuning_args=finetuning_args, - tokenizer=tokenizer, - data_collator=data_collator, - callbacks=callbacks, - compute_metrics=(ComputeMetrics(tokenizer) if training_args.predict_with_generate else None), - train_dataset=train_dataset, - eval_dataset=eval_dataset, - ) - - # 
Keyword arguments for `model.generate` - gen_kwargs = generating_args.to_dict() - gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids - gen_kwargs["pad_token_id"] = tokenizer.pad_token_id - gen_kwargs["logits_processor"] = get_logits_processor() - - # Training - if training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - trainer.save_model() - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - if trainer.is_world_process_zero() and finetuning_args.plot_loss: - plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) - - # Evaluation - if training_args.do_eval: - metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs) - if training_args.predict_with_generate: # eval_loss will be wrong if predict_with_generate is enabled - metrics.pop("eval_loss", None) - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Predict - if training_args.do_predict: - predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs) - if training_args.predict_with_generate: # predict_loss will be wrong if predict_with_generate is enabled - predict_results.metrics.pop("predict_loss", None) - trainer.log_metrics("predict", predict_results.metrics) - trainer.save_metrics("predict", predict_results.metrics) - trainer.save_predictions(predict_results) - - # Create model card - create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) From 058ed5e607b4e6b6f24eddcb1b44de261a002ae9 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 21:29:50 +0800 Subject: [PATCH 14/29] modify style Former-commit-id: c1f1df99e4dc3d0aadf1207b4e9a16218187fd5a --- src/llmtuner/model/adapter.py | 6 +++--- src/llmtuner/model/loader.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py index f3db4d1e..d43e00f0 100644 --- a/src/llmtuner/model/adapter.py +++ b/src/llmtuner/model/adapter.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING import torch from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model @@ -21,11 +21,11 @@ logger = get_logger(__name__) def init_adapter( config: "PretrainedConfig", - model: Union["PreTrainedModel"], + model: "PreTrainedModel", model_args: "ModelArguments", finetuning_args: "FinetuningArguments", is_trainable: bool, -) -> Union["PreTrainedModel"]: +) -> "PreTrainedModel": r""" Initializes the adapters. diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index 47298673..dd7eb44c 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -112,7 +112,7 @@ def load_model( finetuning_args: "FinetuningArguments", is_trainable: bool = False, add_valuehead: bool = False, -) -> Union["PreTrainedModel"]: +) -> "PreTrainedModel": r""" Loads pretrained model. 
""" From 10d59e9e4a76d85e1fa9b401235dd8c6cc4e6e2e Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 21:32:01 +0800 Subject: [PATCH 15/29] make dataset script Former-commit-id: 25892f958da14976025a775febf628cd0e0a3d85 --- scripts/make_mllm_instruct.py | 95 ----------------------------------- 1 file changed, 95 deletions(-) delete mode 100644 scripts/make_mllm_instruct.py diff --git a/scripts/make_mllm_instruct.py b/scripts/make_mllm_instruct.py deleted file mode 100644 index 41e13b8e..00000000 --- a/scripts/make_mllm_instruct.py +++ /dev/null @@ -1,95 +0,0 @@ -import json -import os.path - -import fire -from datasets import Dataset, concatenate_datasets, load_dataset, Value, Image, Features, Sequence - -"""usage -python3 scripts/make_mllm_instruct.py \ ---json_path data/llava_instruct_example.json \ ---image_path data/images \ ---output_path data/mllm_example_dataset -""" - - -def make_one_json(json_path, image_path) -> Dataset: - with open(json_path) as f: - raw_data_ls = json.loads(f.read()) - data_ls = [] - for i, data in enumerate(raw_data_ls): - for j, message in enumerate(data['messages']): - text = message['content'] - message['content'] = [{'index': None, 'text': text, 'type': 'text'}] - if j == 0: - message['content'].append({'index': 0, 'text': None, 'type': 'image'}) - image = data['image'] - if image_path: - image = os.path.join(image_path, data['image']) - data['images'] = [image] - del data['image'] - data_ls.append(data) - - def gen(): - for data in data_ls: - yield data - - features = Features({'messages': [{'content': [ - {'index': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), - 'type': Value(dtype='string', id=None)}], 'role': Value(dtype='string', id=None)}], - 'images': Sequence(feature=Image(decode=True, id=None), length=-1, id=None)}) - dataset = Dataset.from_generator(gen, features=features) - return dataset - - -yaml_content = """--- -dataset_info: - features: - - name: messages - list: - - name: content - list: - - name: index - dtype: int64 - - name: text - dtype: string - - name: type - dtype: string - - name: role - dtype: string - - name: images - sequence: image -configs: -- config_name: default - data_files: - - split: train - path: data/train-* - - split: test - path: data/test-* ----""" - - -def main( - json_path: str, - image_path: str, - output_path: str, -): - json_path_list = json_path.split() - dataset_list = [] - for json_path in json_path_list: - dataset = make_one_json(json_path, image_path) - dataset_list.append(dataset) - dataset = concatenate_datasets(dataset_list) - print(dataset[0]) - data_path = os.path.join(output_path, "data") - os.makedirs(os.path.join(data_path), exist_ok=True) - parquet_path = os.path.join(data_path, "train-0.parquet") - dataset.to_parquet(parquet_path) - parquet_path = os.path.join(data_path, "test-0.parquet") - dataset.to_parquet(parquet_path) - readme_path = os.path.join(output_path, "README.md") - with open(readme_path, 'w') as f: - f.write(yaml_content) - - -if __name__ == '__main__': - fire.Fire(main) From 8b2a735c14f3193de414f6660af0efd25c1b7bf7 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 21:58:18 +0800 Subject: [PATCH 16/29] modify some style Former-commit-id: b016e6a671a2f228f0bdd9b8d5995b4669609655 --- src/llmtuner/data/aligner.py | 24 ++----- src/llmtuner/data/preprocess.py | 43 +++--------- src/llmtuner/data/template.py | 91 ++----------------------- src/llmtuner/hparams/finetuning_args.py | 2 +- 
src/llmtuner/model/loader.py | 4 +- src/llmtuner/train/sft/workflow.py | 20 ++---- 6 files changed, 26 insertions(+), 158 deletions(-) diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index 9d440aff..17b9fc6d 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -82,10 +82,7 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" raise ValueError("Invalid role tag in {}.".format(messages)) aligned_messages.append( - { - "role": tag_mapping[message[dataset_attr.role_tag]], - "content": message[dataset_attr.content_tag], - } + {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} ) outputs["prompt"].append(aligned_messages[:-1]) @@ -126,10 +123,7 @@ def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") - raise ValueError("Invalid role tag in {}.".format(messages)) aligned_messages.append( - { - "role": tag_mapping[message[dataset_attr.role_tag]], - "content": message[dataset_attr.content_tag], - } + {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} ) outputs["prompt"].append(aligned_messages[:-1]) @@ -143,9 +137,7 @@ def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") - def align_dataset( - dataset: Union["Dataset", "IterableDataset"], - dataset_attr: "DatasetAttr", - data_args: "DataArguments", + dataset: Union["Dataset", "IterableDataset"], dataset_attr: "DatasetAttr", data_args: "DataArguments" ) -> Union["Dataset", "IterableDataset"]: r""" Aligned dataset: @@ -165,16 +157,10 @@ def align_dataset( features = Features.from_dict( { "prompt": [ - { - "role": {"dtype": "string", "_type": "Value"}, - "content": {"dtype": "string", "_type": "Value"}, - } + {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}} ], "response": [ - { - "role": {"dtype": "string", "_type": "Value"}, - "content": {"dtype": "string", "_type": "Value"}, - } + {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}} ], "system": {"dtype": "string", "_type": "Value"}, "tools": {"dtype": "string", "_type": "Value"}, diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index 1c8c64a6..51af8060 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -14,14 +14,11 @@ if TYPE_CHECKING: from ..hparams import DataArguments from .template import Template - logger = get_logger(__name__) def preprocess_pretrain_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - data_args: "DataArguments", + examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]] @@ -56,11 +53,7 @@ def preprocess_supervised_dataset( ) -> Dict[str, List[List[int]]]: # build inputs with format ` X Y ` and labels with format ` ... Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. 
- model_inputs = { - "input_ids": [], - "attention_mask": [], - "labels": [], - } + model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} for i in range(len(examples["prompt"])): if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: @@ -154,12 +147,7 @@ def preprocess_multimodal_supervised_dataset( # build inputs with format ` X Y ` and labels with format ` ... Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. tokenizer = processor.tokenizer - model_inputs = { - "input_ids": [], - "attention_mask": [], - "labels": [], - "pixel_values": [], - } + model_inputs = {"input_ids": [], "attention_mask": [], "labels": [], "pixel_values": []} for i in range(len(examples["prompt"])): if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: @@ -284,10 +272,7 @@ def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: " print("label_ids:\n{}".format(example["labels"])) print( "labels:\n{}".format( - tokenizer.decode( - list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), - skip_special_tokens=False, - ) + tokenizer.decode(list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), skip_special_tokens=False) ) ) @@ -320,33 +305,21 @@ def get_preprocess_and_print_func( elif stage == "sft" and not training_args.predict_with_generate: if data_args.packing: preprocess_func = partial( - preprocess_packed_supervised_dataset, - tokenizer=tokenizer, - template=template, - data_args=data_args, + preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args ) elif processor is not None: preprocess_func = partial( - preprocess_multimodal_supervised_dataset, - processor=processor, - template=template, - data_args=data_args, + preprocess_multimodal_supervised_dataset, processor=processor, template=template, data_args=data_args ) else: preprocess_func = partial( - preprocess_supervised_dataset, - tokenizer=tokenizer, - template=template, - data_args=data_args, + preprocess_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args ) print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer) elif stage == "rm": preprocess_func = partial( - preprocess_pairwise_dataset, - tokenizer=tokenizer, - template=template, - data_args=data_args, + preprocess_pairwise_dataset, tokenizer=tokenizer, template=template, data_args=data_args ) print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer) else: diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index cf21e932..f798ba5a 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -11,7 +11,6 @@ if TYPE_CHECKING: from .formatter import SLOTS, Formatter - logger = get_logger(__name__) @@ -368,8 +367,7 @@ def get_template_and_fix_tokenizer( if stop_words: num_added_tokens = tokenizer.add_special_tokens( - dict(additional_special_tokens=stop_words), - replace_additional_special_tokens=False, + dict(additional_special_tokens=stop_words), replace_additional_special_tokens=False ) logger.info("Add {} to stop words.".format(",".join(stop_words))) if num_added_tokens > 0: @@ -393,7 +391,6 @@ _register_template( ), ) - _register_template( name="aquila", format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]), @@ -406,36 +403,26 @@ _register_template( efficient_eos=True, ) - _register_template( name="atom", format_user=StringFormatter( - slots=[ - {"bos_token"}, - "Human: {{content}}\n", - {"eos_token"}, 
- {"bos_token"}, - "Assistant:", - ] + slots=[{"bos_token"}, "Human: {{content}}\n", {"eos_token"}, {"bos_token"}, "Assistant:"] ), format_assistant=StringFormatter(slots=["{{content}}\n", {"eos_token"}]), ) - _register_template( name="baichuan", format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]), efficient_eos=True, ) - _register_template( name="baichuan2", format_user=StringFormatter(slots=["{{content}}"]), efficient_eos=True, ) - _register_template( name="belle", format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]), @@ -444,13 +431,11 @@ _register_template( force_system=True, ) - _register_template( name="bluelm", format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]), ) - _register_template( name="breeze", format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]), @@ -462,7 +447,6 @@ _register_template( efficient_eos=True, ) - _register_template( name="chatglm2", format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]), @@ -472,7 +456,6 @@ _register_template( force_system=True, ) - _register_template( name="chatglm3", format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), @@ -480,40 +463,23 @@ _register_template( format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter( - slots=[ - {"token": "<|observation|>"}, - "\n", - "{{content}}", - {"token": "<|assistant|>"}, - ] + slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] ), stop_words=["<|user|>", "<|observation|>"], efficient_eos=True, force_system=True, ) - _register_template( name="chatglm3_system", format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), format_system=StringFormatter( - slots=[ - {"token": "[gMASK]"}, - {"token": "sop"}, - {"token": "<|system|>"}, - "\n", - "{{content}}", - ] + slots=[{"token": "[gMASK]"}, {"token": "sop"}, {"token": "<|system|>"}, "\n", "{{content}}"] ), format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), format_observation=StringFormatter( - slots=[ - {"token": "<|observation|>"}, - "\n", - "{{content}}", - {"token": "<|assistant|>"}, - ] + slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}] ), default_system=( "You are ChatGLM3, a large language model trained by Zhipu.AI. 
" @@ -523,7 +489,6 @@ _register_template( efficient_eos=True, ) - _register_template( name="chatml", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -534,7 +499,6 @@ _register_template( replace_eos=True, ) - _register_template( name="chatml_de", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -546,14 +510,12 @@ _register_template( replace_eos=True, ) - _register_template( name="codegeex2", format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), force_system=True, ) - _register_template( name="cohere", format_user=StringFormatter( @@ -568,7 +530,6 @@ _register_template( force_system=True, ) - _register_template( name="cpm", format_user=StringFormatter(slots=["<用户>{{content}}"]), @@ -576,7 +537,6 @@ _register_template( force_system=True, ) - _register_template( name="dbrx", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -602,7 +562,6 @@ _register_template( replace_eos=True, ) - _register_template( name="deepseek", format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]), @@ -610,7 +569,6 @@ _register_template( force_system=True, ) - _register_template( name="deepseekcoder", format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]), @@ -626,7 +584,6 @@ _register_template( efficient_eos=True, ) - _register_template( name="default", format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]), @@ -634,14 +591,12 @@ _register_template( format_separator=EmptyFormatter(slots=["\n"]), ) - _register_template( name="empty", format_user=StringFormatter(slots=["{{content}}"]), format_assistant=StringFormatter(slots=["{{content}}"]), ) - _register_template( name="falcon", format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]), @@ -649,14 +604,12 @@ _register_template( efficient_eos=True, ) - _register_template( name="fewshot", format_separator=EmptyFormatter(slots=["\n\n"]), efficient_eos=True, ) - _register_template( name="gemma", format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]), @@ -669,7 +622,6 @@ _register_template( force_system=True, ) - _register_template( name="intern", format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]), @@ -678,7 +630,6 @@ _register_template( efficient_eos=True, ) - _register_template( name="intern2", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -695,7 +646,6 @@ _register_template( efficient_eos=True, # internlm2 tokenizer cannot set eos_token_id ) - _register_template( name="llama2", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), @@ -712,7 +662,6 @@ _register_template( ), ) - _register_template( name="llama2_zh", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), @@ -720,7 +669,6 @@ _register_template( default_system="You are a helpful assistant. 
你是一个乐于助人的助手。", ) - _register_template( name="llama3", format_user=StringFormatter( @@ -732,10 +680,7 @@ _register_template( ] ), format_system=StringFormatter( - slots=[ - {"bos_token"}, - "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>", - ] + slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"] ), format_observation=StringFormatter( slots=[ @@ -750,7 +695,6 @@ _register_template( replace_eos=True, ) - _register_template( name="mistral", format_user=StringFormatter(slots=[" [INST] {{content}} [/INST]"]), @@ -758,7 +702,6 @@ _register_template( force_system=True, ) - _register_template( name="olmo", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), @@ -767,22 +710,14 @@ _register_template( force_system=True, ) - _register_template( name="openchat", - format_user=StringFormatter( - slots=[ - "GPT4 Correct User: {{content}}", - {"eos_token"}, - "GPT4 Correct Assistant:", - ] - ), + format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]), format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), force_system=True, ) - _register_template( name="orion", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]), @@ -790,7 +725,6 @@ _register_template( force_system=True, ) - _register_template( name="phi", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]), @@ -802,7 +736,6 @@ _register_template( replace_eos=True, ) - _register_template( name="qwen", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -814,7 +747,6 @@ _register_template( replace_eos=True, ) - _register_template( name="solar", format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]), @@ -822,7 +754,6 @@ _register_template( efficient_eos=True, ) - _register_template( name="starchat", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>"]), @@ -833,7 +764,6 @@ _register_template( force_system=True, ) - _register_template( name="vicuna", format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]), @@ -843,7 +773,6 @@ _register_template( ), ) - _register_template( name="xuanyuan", format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]), @@ -854,13 +783,11 @@ _register_template( ), ) - _register_template( name="xverse", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]), ) - _register_template( name="yayi", format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]), @@ -880,7 +807,6 @@ _register_template( stop_words=["<|End|>"], ) - _register_template( name="yi", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -889,7 +815,6 @@ _register_template( replace_eos=True, ) - _register_template( name="yuan", format_user=StringFormatter(slots=["{{content}}", {"token": ""}]), @@ -898,7 +823,6 @@ _register_template( replace_eos=True, ) - _register_template( name="zephyr", format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]), @@ -907,7 +831,6 @@ _register_template( default_system="You are a friendly chatbot who always responds in the style of a pirate", ) - _register_template( name="ziya", format_user=StringFormatter(slots=[":{{content}}\n:"]), diff --git 
a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py index cb525699..f4f71bc5 100644 --- a/src/llmtuner/hparams/finetuning_args.py +++ b/src/llmtuner/hparams/finetuning_args.py @@ -260,7 +260,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA default=False, metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."}, ) - stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo", "sft_mm"] = field( + stage: Literal["pt", "sft", "rm", "ppo", "dpo", "orpo"] = field( default="sft", metadata={"help": "Which stage will be performed in training."}, ) diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index dd7eb44c..5b5c0a4d 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -41,9 +41,7 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]: } -def load_tokenizer( - model_args: "ModelArguments", -) -> Dict[str, Union["PreTrainedTokenizer", "AutoProcessor"]]: +def load_tokenizer(model_args: "ModelArguments") -> Dict[str, Union["PreTrainedTokenizer", "AutoProcessor"]]: r""" Loads pretrained tokenizer. diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py index 50833a99..205142e5 100644 --- a/src/llmtuner/train/sft/workflow.py +++ b/src/llmtuner/train/sft/workflow.py @@ -17,12 +17,7 @@ from .trainer import CustomSeq2SeqTrainer if TYPE_CHECKING: from transformers import Seq2SeqTrainingArguments, TrainerCallback - from ...hparams import ( - DataArguments, - FinetuningArguments, - GeneratingArguments, - ModelArguments, - ) + from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments def run_sft( @@ -36,14 +31,7 @@ def run_sft( tokenizer_modules = load_tokenizer(model_args) tokenizer = tokenizer_modules["tokenizer"] processor = tokenizer_modules["processor"] - dataset = get_dataset( - tokenizer, - model_args, - data_args, - training_args, - stage="sft", - processor=processor, - ) + dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft", processor=processor) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) if training_args.predict_with_generate: @@ -54,7 +42,7 @@ def run_sft( data_collator = DataCollatorForSeq2Seq( tokenizer=tokenizer, - pad_to_multiple_of=(8 if tokenizer.padding_side == "right" else None), # for shift short attention + pad_to_multiple_of=8 if tokenizer.padding_side == "right" else None, # for shift short attention label_pad_token_id=(IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id), ) @@ -72,7 +60,7 @@ def run_sft( tokenizer=tokenizer, data_collator=data_collator, callbacks=callbacks, - compute_metrics=(ComputeMetrics(tokenizer) if training_args.predict_with_generate else None), + compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, **split_dataset(dataset, data_args, training_args), ) From 514ffafc126bcd0b07978555167fe3eb93090cc2 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 22:04:09 +0800 Subject: [PATCH 17/29] modify some style Former-commit-id: 053062abc007014a7fde95c5ae9f4d859893d8ad --- src/llmtuner/data/preprocess.py | 5 +-- src/llmtuner/data/template.py | 50 ++++++++++++++++++++++++++++-- src/llmtuner/train/sft/workflow.py | 2 +- src/llmtuner/train/tuner.py | 1 + 4 files changed, 50 insertions(+), 8 deletions(-) diff --git a/src/llmtuner/data/preprocess.py 
b/src/llmtuner/data/preprocess.py index 51af8060..9cdcdfa2 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -324,10 +324,7 @@ def get_preprocess_and_print_func( print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer) else: preprocess_func = partial( - preprocess_unsupervised_dataset, - tokenizer=tokenizer, - template=template, - data_args=data_args, + preprocess_unsupervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args ) print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index f798ba5a..9a3673c3 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -11,6 +11,7 @@ if TYPE_CHECKING: from .formatter import SLOTS, Formatter + logger = get_logger(__name__) @@ -103,9 +104,7 @@ class Template: return self._make_pairs(encoded_messages, cutoff_len, reserved_label_len) def _convert_elements_to_ids( - self, - tokenizer: "PreTrainedTokenizer", - elements: List[Union[str, Dict[str, str]]], + self, tokenizer: "PreTrainedTokenizer", elements: List[Union[str, Dict[str, str]]] ) -> List[int]: r""" Converts elements to token ids. @@ -391,6 +390,7 @@ _register_template( ), ) + _register_template( name="aquila", format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]), @@ -403,6 +403,7 @@ _register_template( efficient_eos=True, ) + _register_template( name="atom", format_user=StringFormatter( @@ -411,18 +412,21 @@ _register_template( format_assistant=StringFormatter(slots=["{{content}}\n", {"eos_token"}]), ) + _register_template( name="baichuan", format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]), efficient_eos=True, ) + _register_template( name="baichuan2", format_user=StringFormatter(slots=["{{content}}"]), efficient_eos=True, ) + _register_template( name="belle", format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]), @@ -431,11 +435,13 @@ _register_template( force_system=True, ) + _register_template( name="bluelm", format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]), ) + _register_template( name="breeze", format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]), @@ -447,6 +453,7 @@ _register_template( efficient_eos=True, ) + _register_template( name="chatglm2", format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]), @@ -456,6 +463,7 @@ _register_template( force_system=True, ) + _register_template( name="chatglm3", format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), @@ -470,6 +478,7 @@ _register_template( force_system=True, ) + _register_template( name="chatglm3_system", format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), @@ -489,6 +498,7 @@ _register_template( efficient_eos=True, ) + _register_template( name="chatml", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -499,6 +509,7 @@ _register_template( replace_eos=True, ) + _register_template( name="chatml_de", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -510,12 +521,14 @@ _register_template( replace_eos=True, ) + _register_template( name="codegeex2", format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), force_system=True, ) + 
_register_template( name="cohere", format_user=StringFormatter( @@ -530,6 +543,7 @@ _register_template( force_system=True, ) + _register_template( name="cpm", format_user=StringFormatter(slots=["<用户>{{content}}"]), @@ -537,6 +551,7 @@ _register_template( force_system=True, ) + _register_template( name="dbrx", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -562,6 +577,7 @@ _register_template( replace_eos=True, ) + _register_template( name="deepseek", format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]), @@ -569,6 +585,7 @@ _register_template( force_system=True, ) + _register_template( name="deepseekcoder", format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]), @@ -584,6 +601,7 @@ _register_template( efficient_eos=True, ) + _register_template( name="default", format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]), @@ -591,12 +609,14 @@ _register_template( format_separator=EmptyFormatter(slots=["\n"]), ) + _register_template( name="empty", format_user=StringFormatter(slots=["{{content}}"]), format_assistant=StringFormatter(slots=["{{content}}"]), ) + _register_template( name="falcon", format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]), @@ -604,12 +624,14 @@ _register_template( efficient_eos=True, ) + _register_template( name="fewshot", format_separator=EmptyFormatter(slots=["\n\n"]), efficient_eos=True, ) + _register_template( name="gemma", format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]), @@ -622,6 +644,7 @@ _register_template( force_system=True, ) + _register_template( name="intern", format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]), @@ -630,6 +653,7 @@ _register_template( efficient_eos=True, ) + _register_template( name="intern2", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -646,6 +670,7 @@ _register_template( efficient_eos=True, # internlm2 tokenizer cannot set eos_token_id ) + _register_template( name="llama2", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), @@ -662,6 +687,7 @@ _register_template( ), ) + _register_template( name="llama2_zh", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), @@ -669,6 +695,7 @@ _register_template( default_system="You are a helpful assistant. 
你是一个乐于助人的助手。", ) + _register_template( name="llama3", format_user=StringFormatter( @@ -695,6 +722,7 @@ _register_template( replace_eos=True, ) + _register_template( name="mistral", format_user=StringFormatter(slots=[" [INST] {{content}} [/INST]"]), @@ -702,6 +730,7 @@ _register_template( force_system=True, ) + _register_template( name="olmo", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), @@ -710,6 +739,7 @@ _register_template( force_system=True, ) + _register_template( name="openchat", format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]), @@ -718,6 +748,7 @@ _register_template( force_system=True, ) + _register_template( name="orion", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]), @@ -725,6 +756,7 @@ _register_template( force_system=True, ) + _register_template( name="phi", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]), @@ -736,6 +768,7 @@ _register_template( replace_eos=True, ) + _register_template( name="qwen", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -747,6 +780,7 @@ _register_template( replace_eos=True, ) + _register_template( name="solar", format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]), @@ -754,6 +788,7 @@ _register_template( efficient_eos=True, ) + _register_template( name="starchat", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>"]), @@ -764,6 +799,7 @@ _register_template( force_system=True, ) + _register_template( name="vicuna", format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]), @@ -773,6 +809,7 @@ _register_template( ), ) + _register_template( name="xuanyuan", format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]), @@ -783,11 +820,13 @@ _register_template( ), ) + _register_template( name="xverse", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]), ) + _register_template( name="yayi", format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]), @@ -807,6 +846,7 @@ _register_template( stop_words=["<|End|>"], ) + _register_template( name="yi", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), @@ -815,6 +855,7 @@ _register_template( replace_eos=True, ) + _register_template( name="yuan", format_user=StringFormatter(slots=["{{content}}", {"token": ""}]), @@ -823,6 +864,7 @@ _register_template( replace_eos=True, ) + _register_template( name="zephyr", format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]), @@ -831,12 +873,14 @@ _register_template( default_system="You are a friendly chatbot who always responds in the style of a pirate", ) + _register_template( name="ziya", format_user=StringFormatter(slots=[":{{content}}\n:"]), format_separator=EmptyFormatter(slots=["\n"]), ) + _register_template( name="llava", format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT: "]), diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py index 205142e5..c5acb4bc 100644 --- a/src/llmtuner/train/sft/workflow.py +++ b/src/llmtuner/train/sft/workflow.py @@ -43,7 +43,7 @@ def run_sft( data_collator = DataCollatorForSeq2Seq( tokenizer=tokenizer, pad_to_multiple_of=8 if tokenizer.padding_side == "right" else None, # for shift short attention - label_pad_token_id=(IGNORE_INDEX if 
data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id), + label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id, ) # Override the decoding parameters of Seq2SeqTrainer diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py index e1999946..a8a2b8e9 100644 --- a/src/llmtuner/train/tuner.py +++ b/src/llmtuner/train/tuner.py @@ -19,6 +19,7 @@ from .sft import run_sft if TYPE_CHECKING: from transformers import TrainerCallback + logger = get_logger(__name__) From 759bee48d23f9a1a3549057b64160bfaa2598aa6 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 22:35:17 +0800 Subject: [PATCH 18/29] merge some func Former-commit-id: 3085107c44715e4b2ca96d73b20d90c172b95219 --- scripts/test_mllm.py | 99 --------------------------------- src/llmtuner/data/aligner.py | 51 +---------------- src/llmtuner/data/preprocess.py | 65 +++------------------- 3 files changed, 10 insertions(+), 205 deletions(-) delete mode 100644 scripts/test_mllm.py diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py deleted file mode 100644 index b8fe3e0f..00000000 --- a/scripts/test_mllm.py +++ /dev/null @@ -1,99 +0,0 @@ -import os.path - -import fire -import torch -from datasets import load_dataset -from peft import PeftModel -from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor -import shutil -from PIL import Image - -"""usage -python3 scripts/test_mllm.py \ ---base_model_path llava-hf/llava-1.5-7b-hf \ ---lora_model_path saves/llava-1.5-7b/lora/sft \ ---model_path saves/llava-1.5-7b/lora/merged \ ---dataset_name data/llava_instruct_example.json \ ---do_merge 1 -""" - - -def get_processor(model_path): - processor = AutoProcessor.from_pretrained(model_path) - CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
{% for message in messages %}{% if message['role'] == 'user' %}USER: {{ message['content'] }} ASSISTANT: {% else %}{{ message['content'] }}{% endif %} {% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}""" - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) - tokenizer.chat_template = CHAT_TEMPLATE - processor.tokenizer = tokenizer - return processor - - -def apply_lora(base_model_path, model_path, lora_path): - print(f"Loading the base model from {base_model_path}") - base_model = AutoModelForVision2Seq.from_pretrained( - base_model_path, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - device_map="cuda", - ) - processor = get_processor(base_model_path) - tokenizer = processor.tokenizer - print(f"Loading the LoRA adapter from {lora_path}") - - lora_model = PeftModel.from_pretrained( - base_model, - lora_path, - torch_dtype=torch.float16, - ) - - print("Applying the LoRA") - model = lora_model.merge_and_unload() - - print(f"Saving the target model to {model_path}") - model.save_pretrained(model_path) - tokenizer.save_pretrained(model_path) - processor.image_processor.save_pretrained(model_path) - - -def main( - model_path: str, - dataset_name: str, - base_model_path: str = "", - lora_model_path: str = "", - do_merge: bool = False, -): - if not os.path.exists(model_path) or do_merge: - apply_lora(base_model_path, model_path, lora_model_path) - model = AutoModelForVision2Seq.from_pretrained( - model_path, - torch_dtype=torch.bfloat16, - low_cpu_mem_usage=True, - device_map="cuda", - ) - processor = get_processor(model_path) - raw_datasets = load_dataset("json", data_files=dataset_name) - train_dataset = raw_datasets["train"] - examples = train_dataset.select(range(3)) - texts = [] - images = [] - for example in examples: - messages = example["messages"][:1] - text = processor.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=False - ) - texts.append(text) - images.append(Image.open(example["images"][0])) - batch = processor(text=texts, images=images, return_tensors="pt", padding=True).to( - "cuda" - ) - output = model.generate(**batch, max_new_tokens=100) - res_list = processor.batch_decode(output, skip_special_tokens=True) - for i, prompt in enumerate(texts): - res = res_list[i] - print(f"#{i}") - print(f"prompt:{prompt}") - print(f"response:{res[len(prompt):].strip()}") - print() - - -if __name__ == "__main__": - fire.Fire(main) diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index 17b9fc6d..6fd6f404 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -36,12 +36,7 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") {"role": Role.ASSISTANT.value, "content": content} for content in examples[dataset_attr.response][i] ] elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str): - response = [ - { - "role": Role.ASSISTANT.value, - "content": examples[dataset_attr.response][i], - } - ] + response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}] else: response = [] @@ -54,47 +49,6 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: - outputs = {"prompt": [], "response": [], "system": [], "tools": []} - tag_mapping = { - dataset_attr.user_tag: Role.USER.value, - dataset_attr.assistant_tag: Role.ASSISTANT.value, - 
dataset_attr.observation_tag: Role.OBSERVATION.value, - dataset_attr.function_tag: Role.FUNCTION.value, - dataset_attr.system_tag: Role.SYSTEM.value, - } - odd_tags = (dataset_attr.user_tag, dataset_attr.observation_tag) - even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag) - accept_tags = (odd_tags, even_tags) - for i, messages in enumerate(examples[dataset_attr.messages]): - if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag: - system = messages[0][dataset_attr.content_tag] - messages = messages[1:] - else: - system = examples[dataset_attr.system][i] if dataset_attr.system else "" - - messages = messages[: len(messages) // 2 * 2] # should be multiples of 2 - if len(messages) == 0: - continue - - aligned_messages = [] - for turn_idx, message in enumerate(messages): - if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]: - raise ValueError("Invalid role tag in {}.".format(messages)) - - aligned_messages.append( - {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} - ) - - outputs["prompt"].append(aligned_messages[:-1]) - outputs["response"].append(aligned_messages[-1:]) - outputs["system"].append(system) - outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - outputs["images"].append([]) - - return outputs - - -def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []} tag_mapping = { dataset_attr.user_tag: Role.USER.value, @@ -130,7 +84,6 @@ def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") - outputs["response"].append(aligned_messages[-1:]) outputs["system"].append(system) outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - print(examples[dataset_attr.images][i]) outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else []) return outputs @@ -148,8 +101,6 @@ def align_dataset( """ if dataset_attr.formatting == "alpaca": convert_func = partial(convert_alpaca, dataset_attr=dataset_attr) - elif dataset_attr.formatting == "llava": - convert_func = partial(convert_llava, dataset_attr=dataset_attr) else: convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr) diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index 9cdcdfa2..6108b245 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -50,6 +50,7 @@ def preprocess_supervised_dataset( tokenizer: "PreTrainedTokenizer", template: "Template", data_args: "DataArguments", + processor: "AutoProcessor" = None, ) -> Dict[str, List[List[int]]]: # build inputs with format ` X Y ` and labels with format ` ... Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. 
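To make the masking rule described in the comment above concrete, here is a toy illustration; the token ids are invented and IGNORE_INDEX is assumed to be -100, following the usual Hugging Face convention:

    # toy example of the supervised label masking described above (ids are made up)
    IGNORE_INDEX = -100
    source_ids = [101, 102, 103]            # encoded prompt turn
    target_ids = [201, 202, 2]              # encoded response turn + eos
    input_ids = source_ids + target_ids     # the model sees both prompt and response
    labels = [IGNORE_INDEX] * len(source_ids) + target_ids  # loss is computed on the response only
    assert len(input_ids) == len(labels)
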
@@ -88,7 +89,9 @@ def preprocess_supervised_dataset( model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) - + if processor is not None and "images" in examples: + pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0] + model_inputs["pixel_values"].append(pixel_values) return model_inputs @@ -138,55 +141,6 @@ def preprocess_packed_supervised_dataset( return model_inputs -def preprocess_multimodal_supervised_dataset( - examples: Dict[str, List[Any]], - processor: "AutoProcessor", - template: "Template", - data_args: "DataArguments", -) -> Dict[str, List[List[int]]]: - # build inputs with format ` X Y ` and labels with format ` ... Y ` - # for multiturn examples, we only mask the prompt part in each prompt-response pair. - tokenizer = processor.tokenizer - model_inputs = {"input_ids": [], "attention_mask": [], "labels": [], "pixel_values": []} - - for i in range(len(examples["prompt"])): - if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: - continue - - messages = examples["prompt"][i] + examples["response"][i] - input_ids, labels = [], [] - for turn_idx, (source_ids, target_ids) in enumerate( - template.encode_multiturn( - tokenizer, - messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, - ) - ): - if data_args.train_on_prompt: - source_mask = source_ids - elif turn_idx != 0 and template.efficient_eos: - source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) - else: - source_mask = [IGNORE_INDEX] * len(source_ids) - - input_ids += source_ids + target_ids - labels += source_mask + target_ids - - if template.efficient_eos: - input_ids += [tokenizer.eos_token_id] - labels += [tokenizer.eos_token_id] - - model_inputs["input_ids"].append(input_ids) - model_inputs["attention_mask"].append([1] * len(input_ids)) - model_inputs["labels"].append(labels) - pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0] - model_inputs["pixel_values"].append(pixel_values) - return model_inputs - - def preprocess_unsupervised_dataset( examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", @@ -307,15 +261,14 @@ def get_preprocess_and_print_func( preprocess_func = partial( preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args ) - elif processor is not None: - preprocess_func = partial( - preprocess_multimodal_supervised_dataset, processor=processor, template=template, data_args=data_args - ) else: preprocess_func = partial( - preprocess_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args + preprocess_supervised_dataset, + tokenizer=tokenizer, + template=template, + data_args=data_args, + processor=processor, ) - print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer) elif stage == "rm": preprocess_func = partial( From 59817c27e33a10e44197438ff746c18f0c34e0f9 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 22:40:25 +0800 Subject: [PATCH 19/29] modify some style Former-commit-id: d578a90cefa7ec813355795bdd6ead5ee558ce26 --- src/llmtuner/data/preprocess.py | 76 ++++++++++++++++----------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index 6108b245..3487b761 100644 --- 
a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -6,7 +6,6 @@ from ..extras.constants import IGNORE_INDEX from ..extras.logging import get_logger from .utils import Role - if TYPE_CHECKING: from transformers import Seq2SeqTrainingArguments from transformers.tokenization_utils import AutoProcessor, PreTrainedTokenizer @@ -18,7 +17,7 @@ logger = get_logger(__name__) def preprocess_pretrain_dataset( - examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" + examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]] @@ -35,7 +34,7 @@ def preprocess_pretrain_dataset( block_size = data_args.cutoff_len total_length = (total_length // block_size) * block_size result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + k: [t[i: i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } if data_args.template == "gemma": @@ -46,11 +45,11 @@ def preprocess_pretrain_dataset( def preprocess_supervised_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", - processor: "AutoProcessor" = None, + examples: Dict[str, List[Any]], + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", + processor: "AutoProcessor" = None, ) -> Dict[str, List[List[int]]]: # build inputs with format ` X Y ` and labels with format ` ... Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. @@ -63,14 +62,14 @@ def preprocess_supervised_dataset( messages = examples["prompt"][i] + examples["response"][i] input_ids, labels = [], [] for turn_idx, (source_ids, target_ids) in enumerate( - template.encode_multiturn( - tokenizer, - messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, - ) + template.encode_multiturn( + tokenizer, + messages, + examples["system"][i], + examples["tools"][i], + data_args.cutoff_len, + data_args.reserved_label_len, + ) ): if data_args.train_on_prompt: source_mask = source_ids @@ -96,10 +95,10 @@ def preprocess_supervised_dataset( def preprocess_packed_supervised_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", + examples: Dict[str, List[Any]], + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build inputs with format ` X1 Y1 X2 Y2 ` # and labels with format ` ... Y1 ... 
Y2 ` @@ -111,7 +110,7 @@ def preprocess_packed_supervised_dataset( messages = examples["prompt"][i] + examples["response"][i] for source_ids, target_ids in template.encode_multiturn( - tokenizer, messages, examples["system"][i], examples["tools"][i] + tokenizer, messages, examples["system"][i], examples["tools"][i] ): if data_args.train_on_prompt: source_mask = source_ids @@ -133,19 +132,19 @@ def preprocess_packed_supervised_dataset( total_length = (total_length // block_size) * block_size # split by chunks of cutoff_len for i in range(0, total_length, block_size): - if not all(label == IGNORE_INDEX for label in labels[i : i + block_size]): - model_inputs["input_ids"].append(input_ids[i : i + block_size]) + if not all(label == IGNORE_INDEX for label in labels[i: i + block_size]): + model_inputs["input_ids"].append(input_ids[i: i + block_size]) model_inputs["attention_mask"].append([1] * block_size) - model_inputs["labels"].append(labels[i : i + block_size]) + model_inputs["labels"].append(labels[i: i + block_size]) return model_inputs def preprocess_unsupervised_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", + examples: Dict[str, List[Any]], + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build inputs with format ` X` and labels with format `Y ` model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} @@ -179,10 +178,10 @@ def preprocess_unsupervised_dataset( def preprocess_pairwise_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", + examples: Dict[str, List[Any]], + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build input pairs with format ` X`, `Y1 ` and `Y2 ` model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []} @@ -246,12 +245,12 @@ def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: def get_preprocess_and_print_func( - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - stage: Literal["pt", "sft", "rm", "ppo"], - processor: Optional["AutoProcessor"] = None, + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["pt", "sft", "rm", "ppo"], + processor: Optional["AutoProcessor"] = None, ) -> Tuple[Callable, Callable]: if stage == "pt": preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args) @@ -280,5 +279,4 @@ def get_preprocess_and_print_func( preprocess_unsupervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args ) print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) - return preprocess_func, print_function From 5062ee547ea45f30faa3ee60048cbafec4ee458d Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 22:40:53 +0800 Subject: [PATCH 20/29] modify some style Former-commit-id: 1291c7ee39361dd75247c67f04dcf20b472faf83 --- src/llmtuner/data/preprocess.py | 75 +++++++++++++++++---------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index 3487b761..59b49b9d 100644 --- a/src/llmtuner/data/preprocess.py +++ 
b/src/llmtuner/data/preprocess.py @@ -6,6 +6,7 @@ from ..extras.constants import IGNORE_INDEX from ..extras.logging import get_logger from .utils import Role + if TYPE_CHECKING: from transformers import Seq2SeqTrainingArguments from transformers.tokenization_utils import AutoProcessor, PreTrainedTokenizer @@ -17,7 +18,7 @@ logger = get_logger(__name__) def preprocess_pretrain_dataset( - examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" + examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" ) -> Dict[str, List[List[int]]]: # build grouped texts with format `X1 X2 X3 ...` if packing is enabled text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]] @@ -34,7 +35,7 @@ def preprocess_pretrain_dataset( block_size = data_args.cutoff_len total_length = (total_length // block_size) * block_size result = { - k: [t[i: i + block_size] for i in range(0, total_length, block_size)] + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } if data_args.template == "gemma": @@ -45,11 +46,11 @@ def preprocess_pretrain_dataset( def preprocess_supervised_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", - processor: "AutoProcessor" = None, + examples: Dict[str, List[Any]], + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", + processor: "AutoProcessor" = None, ) -> Dict[str, List[List[int]]]: # build inputs with format ` X Y ` and labels with format ` ... Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. @@ -62,14 +63,14 @@ def preprocess_supervised_dataset( messages = examples["prompt"][i] + examples["response"][i] input_ids, labels = [], [] for turn_idx, (source_ids, target_ids) in enumerate( - template.encode_multiturn( - tokenizer, - messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, - ) + template.encode_multiturn( + tokenizer, + messages, + examples["system"][i], + examples["tools"][i], + data_args.cutoff_len, + data_args.reserved_label_len, + ) ): if data_args.train_on_prompt: source_mask = source_ids @@ -95,10 +96,10 @@ def preprocess_supervised_dataset( def preprocess_packed_supervised_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", + examples: Dict[str, List[Any]], + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build inputs with format ` X1 Y1 X2 Y2 ` # and labels with format ` ... Y1 ... 
Y2 ` @@ -110,7 +111,7 @@ def preprocess_packed_supervised_dataset( messages = examples["prompt"][i] + examples["response"][i] for source_ids, target_ids in template.encode_multiturn( - tokenizer, messages, examples["system"][i], examples["tools"][i] + tokenizer, messages, examples["system"][i], examples["tools"][i] ): if data_args.train_on_prompt: source_mask = source_ids @@ -132,19 +133,19 @@ def preprocess_packed_supervised_dataset( total_length = (total_length // block_size) * block_size # split by chunks of cutoff_len for i in range(0, total_length, block_size): - if not all(label == IGNORE_INDEX for label in labels[i: i + block_size]): - model_inputs["input_ids"].append(input_ids[i: i + block_size]) + if not all(label == IGNORE_INDEX for label in labels[i : i + block_size]): + model_inputs["input_ids"].append(input_ids[i : i + block_size]) model_inputs["attention_mask"].append([1] * block_size) - model_inputs["labels"].append(labels[i: i + block_size]) + model_inputs["labels"].append(labels[i : i + block_size]) return model_inputs def preprocess_unsupervised_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", + examples: Dict[str, List[Any]], + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build inputs with format ` X` and labels with format `Y ` model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} @@ -178,10 +179,10 @@ def preprocess_unsupervised_dataset( def preprocess_pairwise_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", + examples: Dict[str, List[Any]], + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build input pairs with format ` X`, `Y1 ` and `Y2 ` model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []} @@ -245,12 +246,12 @@ def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: def get_preprocess_and_print_func( - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - stage: Literal["pt", "sft", "rm", "ppo"], - processor: Optional["AutoProcessor"] = None, + tokenizer: "PreTrainedTokenizer", + template: "Template", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["pt", "sft", "rm", "ppo"], + processor: Optional["AutoProcessor"] = None, ) -> Tuple[Callable, Callable]: if stage == "pt": preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args) From 5d03ac642d9498870974d5cab9c24319bf7eb3e7 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Thu, 25 Apr 2024 22:59:46 +0800 Subject: [PATCH 21/29] modify some bug Former-commit-id: 593b7b004df74bd24361c9883401a656c08fb589 --- src/llmtuner/data/preprocess.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index 59b49b9d..be566a5b 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -91,6 +91,8 @@ def preprocess_supervised_dataset( model_inputs["labels"].append(labels) if processor is not None and "images" in examples: pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0] + if "pixel_values" not in model_inputs: + 
model_inputs["pixel_values"] = [] model_inputs["pixel_values"].append(pixel_values) return model_inputs From 13117b69d7e566c61ce277d949cd8c05c7258aa5 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Fri, 26 Apr 2024 02:20:47 +0800 Subject: [PATCH 22/29] delete llava template (use vicuna) Former-commit-id: 420e64970e5a0e45453041927e0366ee8beb73d5 --- src/llmtuner/data/template.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index 9a3673c3..73b22eb7 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -879,14 +879,3 @@ _register_template( format_user=StringFormatter(slots=[":{{content}}\n:"]), format_separator=EmptyFormatter(slots=["\n"]), ) - - -_register_template( - name="llava", - format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT: "]), - format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]), - default_system=( - "A chat between a curious user and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the user's questions." - ), -) From 279439abbeeea3891ae80f719ab86b440e22cc8c Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Fri, 26 Apr 2024 02:49:39 +0800 Subject: [PATCH 23/29] update hparam name Former-commit-id: 9941adfbf06db37f8ba32c4555f6e58e27188aaf --- src/llmtuner/hparams/model_args.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index 97b908e4..be65cd27 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -81,6 +81,10 @@ class ModelArguments: default=False, metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."}, ) + visual_inputs: bool = field( + default=False, + metadata={"help": "Whethor or not to use multimodal LLM that accepts visual inputs."}, + ) moe_aux_loss_coef: Optional[float] = field( default=None, metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."}, @@ -169,10 +173,6 @@ class ModelArguments: default=False, metadata={"help": "For debugging purposes, print the status of the parameters in the model."}, ) - use_mllm: bool = field( - default=False, - metadata={"help": "Whether use Multimodal LLM."}, - ) def __post_init__(self): self.compute_dtype = None From a6f6b406b3856544a3fb8dcfef528b092e6bb967 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Fri, 26 Apr 2024 03:22:40 +0800 Subject: [PATCH 24/29] Update loader.py Former-commit-id: 72d4817a15f6916706828ea2a61d808183c23773 --- src/llmtuner/model/loader.py | 54 ++++++++++++++---------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index 5b5c0a4d..0ff7a350 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -1,12 +1,6 @@ -from typing import TYPE_CHECKING, Any, Dict, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, TypedDict -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoModelForVision2Seq, - AutoProcessor, - AutoTokenizer, -) +from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer from trl import AutoModelForCausalLMWithValueHead from ..extras.logging import get_logger @@ -19,13 +13,19 @@ from .utils.unsloth import load_unsloth_pretrained_model if TYPE_CHECKING: - from transformers import PretrainedConfig, 
PreTrainedModel, PreTrainedTokenizer + from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, ProcessorMixin from ..hparams import FinetuningArguments, ModelArguments + logger = get_logger(__name__) +class TokenizerModule(TypedDict): + tokenizer: "PreTrainedTokenizer" + processor: Optional["ProcessorMixin"] + + def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]: r""" Gets arguments to load config/tokenizer/model. @@ -41,7 +41,7 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]: } -def load_tokenizer(model_args: "ModelArguments") -> Dict[str, Union["PreTrainedTokenizer", "AutoProcessor"]]: +def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule": r""" Loads pretrained tokenizer. @@ -75,25 +75,14 @@ def load_tokenizer(model_args: "ModelArguments") -> Dict[str, Union["PreTrainedT logger.warning("New tokens have been added, changed `resize_vocab` to True.") patch_tokenizer(tokenizer) - tokenizer_modules = {"tokenizer": tokenizer, "processor": None} - if model_args.use_mllm: - try: - processor = AutoProcessor.from_pretrained( - model_args.model_name_or_path, - use_fast=model_args.use_fast_tokenizer, - split_special_tokens=model_args.split_special_tokens, - padding_side="right", - **init_kwargs, - ) - except Exception: # try the fast one - processor = AutoProcessor.from_pretrained( - model_args.model_name_or_path, - use_fast=True, - padding_side="right", - **init_kwargs, - ) - tokenizer_modules["processor"] = processor - return tokenizer_modules + + if model_args.visual_inputs: + processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs) + setattr(processor, "tokenizer", tokenizer) + else: + processor = None + + return {"tokenizer": tokenizer, "processor": processor} def load_config(model_args: "ModelArguments") -> "PretrainedConfig": @@ -132,11 +121,10 @@ def load_model( if model_args.mixture_of_depths == "load": model = load_mod_pretrained_model(**init_kwargs) + elif model_args.visual_inputs: + model = AutoModelForVision2Seq.from_pretrained(**init_kwargs) else: - if model_args.use_mllm: - model = AutoModelForVision2Seq.from_pretrained(**init_kwargs) - else: - model = AutoModelForCausalLM.from_pretrained(**init_kwargs) + model = AutoModelForCausalLM.from_pretrained(**init_kwargs) if model_args.mixture_of_depths == "convert": model = convert_pretrained_model_to_mod(model, config, model_args) From 2eede9ffd643d6475bd3ec742d1fe8aeb46e0938 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Fri, 26 Apr 2024 03:29:12 +0800 Subject: [PATCH 25/29] Update workflow.py Former-commit-id: 5b8b5b975716d539ae2fae8536f79e106aa0b566 --- src/llmtuner/train/sft/workflow.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py index c5acb4bc..3ead9edf 100644 --- a/src/llmtuner/train/sft/workflow.py +++ b/src/llmtuner/train/sft/workflow.py @@ -28,11 +28,10 @@ def run_sft( generating_args: "GeneratingArguments", callbacks: Optional[List["TrainerCallback"]] = None, ): - tokenizer_modules = load_tokenizer(model_args) - tokenizer = tokenizer_modules["tokenizer"] - processor = tokenizer_modules["processor"] - dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft", processor=processor) - model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) + tokenizer_module = load_tokenizer(model_args) + dataset = get_dataset(model_args, data_args, training_args, stage="sft", 
**tokenizer_module) + tokenizer = tokenizer_module["tokenizer"] + model = load_model(tokenizer, model_args, finetuning_args, is_trainable=training_args.do_train) if training_args.predict_with_generate: tokenizer.padding_side = "left" # use left-padding in generation @@ -49,8 +48,7 @@ def run_sft( # Override the decoding parameters of Seq2SeqTrainer training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams - if model_args.use_mllm: - training_args.remove_unused_columns = False + training_args.remove_unused_columns = False if model_args.visual_inputs else training_args.remove_unused_columns # Initialize our Trainer trainer = CustomSeq2SeqTrainer( From e1838e76fe9f2f630b4899ee818c1ef5ee219bb0 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Fri, 26 Apr 2024 03:33:07 +0800 Subject: [PATCH 26/29] Update loader.py Former-commit-id: 6a5f2e2ab7304113ff71cb77aafff6a1f74831f8 --- src/llmtuner/data/loader.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index fa4aa9c1..ca0d5407 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -16,12 +16,13 @@ from .utils import checksum, merge_dataset if TYPE_CHECKING: from datasets import Dataset, IterableDataset - from transformers import AutoProcessor, Seq2SeqTrainingArguments + from transformers import ProcessorMixin, Seq2SeqTrainingArguments from transformers.tokenization_utils import PreTrainedTokenizer from ..hparams import DataArguments, ModelArguments from .parser import DatasetAttr + logger = get_logger(__name__) @@ -114,12 +115,12 @@ def load_single_dataset( def get_dataset( - tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments", data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments", stage: Literal["pt", "sft", "rm", "ppo"], - processor: Optional["AutoProcessor"] = None, + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"] = None, ) -> Union["Dataset", "IterableDataset"]: template = get_template_and_fix_tokenizer(tokenizer, data_args.template) if data_args.train_on_prompt and template.efficient_eos: @@ -149,7 +150,7 @@ def get_dataset( with training_args.main_process_first(desc="pre-process dataset"): preprocess_func, print_function = get_preprocess_and_print_func( - tokenizer, template, data_args, training_args, stage, processor + data_args, training_args, stage, template, tokenizer, processor ) column_names = list(next(iter(dataset)).keys()) kwargs = {} From ece67f8c7ff7f5e4eab2e5c4b4f269bdd61f1678 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Fri, 26 Apr 2024 03:35:39 +0800 Subject: [PATCH 27/29] Update parser.py Former-commit-id: 4df75e8a9a391565cc3eec69bc0ebf5d5192de61 --- src/llmtuner/data/parser.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/llmtuner/data/parser.py b/src/llmtuner/data/parser.py index 4d3d7741..01a417a9 100644 --- a/src/llmtuner/data/parser.py +++ b/src/llmtuner/data/parser.py @@ -25,9 +25,10 @@ class DatasetAttr: subset: Optional[str] = None folder: Optional[str] = None ranking: bool = False - formatting: Literal["alpaca", "sharegpt", "llava"] = "alpaca" + formatting: Literal["alpaca", "sharegpt"] = "alpaca" """ columns """ system: Optional[str] = None + images: Optional[str] = None """ columns for the alpaca format """ prompt: Optional[str] = "instruction" query: Optional[str] = "input" 
@@ -44,8 +45,6 @@ class DatasetAttr: observation_tag: Optional[str] = "observation" function_tag: Optional[str] = "function_call" system_tag: Optional[str] = "system" - """ columns for the mllm format """ - images: Optional[str] = None def __repr__(self) -> str: return self.dataset_name @@ -105,21 +104,18 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: dataset_attr.set_attr("folder", dataset_info[name]) dataset_attr.set_attr("ranking", dataset_info[name], default=False) dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca") - dataset_attr.set_attr("images", dataset_info[name], default="") if "columns" in dataset_info[name]: - column_names = ["system"] + column_names = ["system", "images"] if dataset_attr.formatting == "alpaca": column_names.extend(["prompt", "query", "response", "history"]) - elif dataset_attr.formatting == "llava": - column_names.extend(["messages", "images"]) else: column_names.extend(["messages", "tools"]) for column_name in column_names: dataset_attr.set_attr(column_name, dataset_info[name]["columns"]) - if dataset_attr.formatting != "alpaca" and "tags" in dataset_info[name]: + if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]: tag_names = ( "role_tag", "content_tag", From c37582af02f0e598545f18bf2659aabe996740f5 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Fri, 26 Apr 2024 03:48:34 +0800 Subject: [PATCH 28/29] Update aligner.py Former-commit-id: 855489074c469f47572153df0fa1e251b187b232 --- src/llmtuner/data/aligner.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py index 6fd6f404..dc1de865 100644 --- a/src/llmtuner/data/aligner.py +++ b/src/llmtuner/data/aligner.py @@ -1,3 +1,4 @@ +import os from functools import partial from typing import TYPE_CHECKING, Any, Dict, List, Union @@ -13,8 +14,10 @@ if TYPE_CHECKING: from .parser import DatasetAttr -def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: - outputs = {"prompt": [], "response": [], "system": [], "tools": []} +def convert_alpaca( + examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments" +) -> Dict[str, List[Any]]: + outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []} for i in range(len(examples[dataset_attr.prompt])): prompt = [] if dataset_attr.history and isinstance(examples[dataset_attr.history][i], list): @@ -44,11 +47,18 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") outputs["response"].append(response) outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") outputs["tools"].append("") - outputs["images"].append([]) + outputs["images"].append( + [os.path.join(data_args.dataset_dir, path) for path in examples[dataset_attr.images][i]] + if dataset_attr.images + else [] + ) + return outputs -def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: +def convert_sharegpt( + examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments" +) -> Dict[str, List[Any]]: outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []} tag_mapping = { dataset_attr.user_tag: Role.USER.value, @@ -84,7 +94,11 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr" outputs["response"].append(aligned_messages[-1:]) 
outputs["system"].append(system) outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else []) + outputs["images"].append( + [os.path.join(data_args.dataset_dir, path) for path in examples[dataset_attr.images][i]] + if dataset_attr.images + else [] + ) return outputs @@ -97,12 +111,13 @@ def align_dataset( prompt: [{"role": "user", "content": "..."}] * (2T - 1) response: [{"role": "assistant", "content": "..."}] * N (N > 1 for ranking dataset) system: "..." - tools: "..." + tools: "...", + images: [], """ if dataset_attr.formatting == "alpaca": - convert_func = partial(convert_alpaca, dataset_attr=dataset_attr) + convert_func = partial(convert_alpaca, dataset_attr=dataset_attr, data_args=data_args) else: - convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr) + convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr, data_args=data_args) column_names = list(next(iter(dataset)).keys()) features = Features.from_dict( @@ -115,7 +130,7 @@ def align_dataset( ], "system": {"dtype": "string", "_type": "Value"}, "tools": {"dtype": "string", "_type": "Value"}, - "images": {"feature": {"_type": "Image"}, "_type": "Sequence"}, + "images": [{"_type": "Image"}], } ) kwargs = {} From f9a7732a1fe3dd06c3cbd6fc6b1e1be466ca2b30 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Fri, 26 Apr 2024 04:10:28 +0800 Subject: [PATCH 29/29] Update preprocess.py Former-commit-id: 0e376eab23d38b8fca05f054f3cde308756ee3b1 --- src/llmtuner/data/preprocess.py | 66 +++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py index be566a5b..0b467724 100644 --- a/src/llmtuner/data/preprocess.py +++ b/src/llmtuner/data/preprocess.py @@ -8,15 +8,26 @@ from .utils import Role if TYPE_CHECKING: - from transformers import Seq2SeqTrainingArguments - from transformers.tokenization_utils import AutoProcessor, PreTrainedTokenizer + from PIL import Image + from transformers import ProcessorMixin, Seq2SeqTrainingArguments + from transformers.image_processing_utils import BaseImageProcessor + from transformers.tokenization_utils import PreTrainedTokenizer from ..hparams import DataArguments from .template import Template + logger = get_logger(__name__) +def _preprocess_visual_inputs(model_inputs: Dict[str, Any], processor: "ProcessorMixin", image: "Image") -> None: + image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") + pixel_values = image_processor(image, return_tensors="pt")["pixel_values"][0] + if "pixel_values" not in model_inputs: + model_inputs["pixel_values"] = [] + model_inputs["pixel_values"].append(pixel_values) + + def preprocess_pretrain_dataset( examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" ) -> Dict[str, List[List[int]]]: @@ -47,10 +58,10 @@ def preprocess_pretrain_dataset( def preprocess_supervised_dataset( examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], data_args: "DataArguments", - processor: "AutoProcessor" = None, ) -> Dict[str, List[List[int]]]: # build inputs with format ` X Y ` and labels with format ` ... Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. 
@@ -90,17 +101,15 @@ def preprocess_supervised_dataset( model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) if processor is not None and "images" in examples: - pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0] - if "pixel_values" not in model_inputs: - model_inputs["pixel_values"] = [] - model_inputs["pixel_values"].append(pixel_values) + _preprocess_visual_inputs(model_inputs, processor, examples["images"][i][0]) + return model_inputs def preprocess_packed_supervised_dataset( examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", template: "Template", + tokenizer: "PreTrainedTokenizer", data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build inputs with format ` X1 Y1 X2 Y2 ` @@ -145,8 +154,9 @@ def preprocess_packed_supervised_dataset( def preprocess_unsupervised_dataset( examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build inputs with format ` X` and labels with format `Y ` @@ -176,14 +186,17 @@ def preprocess_unsupervised_dataset( model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) + if processor is not None and "images" in examples: + _preprocess_visual_inputs(model_inputs, processor, examples["images"][i][0]) return model_inputs def preprocess_pairwise_dataset( examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], data_args: "DataArguments", ) -> Dict[str, List[List[int]]]: # build input pairs with format ` X`, `Y1 ` and `Y2 ` @@ -218,6 +231,8 @@ def preprocess_pairwise_dataset( model_inputs["prompt_ids"].append(prompt_ids) model_inputs["chosen_ids"].append(chosen_ids) model_inputs["rejected_ids"].append(rejected_ids) + if processor is not None and "images" in examples: + _preprocess_visual_inputs(model_inputs, processor, examples["images"][i][0]) return model_inputs @@ -248,12 +263,12 @@ def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: def get_preprocess_and_print_func( - tokenizer: "PreTrainedTokenizer", - template: "Template", data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments", stage: Literal["pt", "sft", "rm", "ppo"], - processor: Optional["AutoProcessor"] = None, + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], ) -> Tuple[Callable, Callable]: if stage == "pt": preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args) @@ -261,25 +276,38 @@ def get_preprocess_and_print_func( elif stage == "sft" and not training_args.predict_with_generate: if data_args.packing: preprocess_func = partial( - preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args + preprocess_packed_supervised_dataset, + template=template, + tokenizer=tokenizer, + data_args=data_args, ) else: preprocess_func = partial( preprocess_supervised_dataset, - tokenizer=tokenizer, template=template, - data_args=data_args, + tokenizer=tokenizer, processor=processor, + data_args=data_args, ) + print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer) elif stage == "rm": preprocess_func = partial( - 
preprocess_pairwise_dataset, tokenizer=tokenizer, template=template, data_args=data_args + preprocess_pairwise_dataset, + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, ) print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer) else: preprocess_func = partial( - preprocess_unsupervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args + preprocess_unsupervised_dataset, + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, ) print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) + return preprocess_func, print_function
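
Taken together, these patches leave the text-only and multimodal paths sharing a single SFT workflow. A rough sketch of the resulting call flow, assuming the model_args/data_args/training_args/finetuning_args objects come from the existing argument-parsing pipeline and are not constructed here:

    # rough sketch of the post-patch API, using the exports already imported in workflow.py
    from llmtuner.data import get_dataset
    from llmtuner.model import load_model, load_tokenizer

    tokenizer_module = load_tokenizer(model_args)      # {"tokenizer": ..., "processor": ... or None}
    dataset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
    # when model_args.visual_inputs is True, the processor is populated and each supervised
    # example carries a "pixel_values" entry produced during preprocessing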