diff --git a/scripts/test_mllm.py b/scripts/test_mllm.py
deleted file mode 100644
index b8fe3e0f..00000000
--- a/scripts/test_mllm.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import os.path
-
-import fire
-import torch
-from datasets import load_dataset
-from peft import PeftModel
-from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoProcessor
-import shutil
-from PIL import Image
-
-"""usage
-python3 scripts/test_mllm.py \
----base_model_path llava-hf/llava-1.5-7b-hf \
----lora_model_path saves/llava-1.5-7b/lora/sft \
----model_path saves/llava-1.5-7b/lora/merged \
----dataset_name data/llava_instruct_example.json \
----do_merge 1
-"""
-
-
-def get_processor(model_path):
-    processor = AutoProcessor.from_pretrained(model_path)
-    CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for message in messages %}{% if message['role'] == 'user' %}USER: {{ message['content'] }} ASSISTANT: {% else %}{{ message['content'] }}{% endif %} {% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""
-    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
-    tokenizer.chat_template = CHAT_TEMPLATE
-    processor.tokenizer = tokenizer
-    return processor
-
-
-def apply_lora(base_model_path, model_path, lora_path):
-    print(f"Loading the base model from {base_model_path}")
-    base_model = AutoModelForVision2Seq.from_pretrained(
-        base_model_path,
-        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True,
-        device_map="cuda",
-    )
-    processor = get_processor(base_model_path)
-    tokenizer = processor.tokenizer
-    print(f"Loading the LoRA adapter from {lora_path}")
-
-    lora_model = PeftModel.from_pretrained(
-        base_model,
-        lora_path,
-        torch_dtype=torch.float16,
-    )
-
-    print("Applying the LoRA")
-    model = lora_model.merge_and_unload()
-
-    print(f"Saving the target model to {model_path}")
-    model.save_pretrained(model_path)
-    tokenizer.save_pretrained(model_path)
-    processor.image_processor.save_pretrained(model_path)
-
-
-def main(
-    model_path: str,
-    dataset_name: str,
-    base_model_path: str = "",
-    lora_model_path: str = "",
-    do_merge: bool = False,
-):
-    if not os.path.exists(model_path) or do_merge:
-        apply_lora(base_model_path, model_path, lora_model_path)
-    model = AutoModelForVision2Seq.from_pretrained(
-        model_path,
-        torch_dtype=torch.bfloat16,
-        low_cpu_mem_usage=True,
-        device_map="cuda",
-    )
-    processor = get_processor(model_path)
-    raw_datasets = load_dataset("json", data_files=dataset_name)
-    train_dataset = raw_datasets["train"]
-    examples = train_dataset.select(range(3))
-    texts = []
-    images = []
-    for example in examples:
-        messages = example["messages"][:1]
-        text = processor.tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=False
-        )
-        texts.append(text)
-        images.append(Image.open(example["images"][0]))
-    batch = processor(text=texts, images=images, return_tensors="pt", padding=True).to(
-        "cuda"
-    )
-    output = model.generate(**batch, max_new_tokens=100)
-    res_list = processor.batch_decode(output, skip_special_tokens=True)
-    for i, prompt in enumerate(texts):
-        res = res_list[i]
-        print(f"#{i}")
-        print(f"prompt:{prompt}")
-        print(f"response:{res[len(prompt):].strip()}")
-        print()
-
-
-if __name__ == "__main__":
-    fire.Fire(main)
diff --git a/src/llmtuner/data/aligner.py b/src/llmtuner/data/aligner.py
index 17b9fc6d..6fd6f404 100644
--- a/src/llmtuner/data/aligner.py
+++ b/src/llmtuner/data/aligner.py
@@ -36,12 +36,7 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr")
                 {"role": Role.ASSISTANT.value, "content": content} for content in examples[dataset_attr.response][i]
             ]
         elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str):
-            response = [
-                {
-                    "role": Role.ASSISTANT.value,
-                    "content": examples[dataset_attr.response][i],
-                }
-            ]
+            response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}]
         else:
             response = []
 
@@ -54,47 +49,6 @@ def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr")
 
 
 def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
-    outputs = {"prompt": [], "response": [], "system": [], "tools": []}
-    tag_mapping = {
-        dataset_attr.user_tag: Role.USER.value,
-        dataset_attr.assistant_tag: Role.ASSISTANT.value,
-        dataset_attr.observation_tag: Role.OBSERVATION.value,
-        dataset_attr.function_tag: Role.FUNCTION.value,
-        dataset_attr.system_tag: Role.SYSTEM.value,
-    }
-    odd_tags = (dataset_attr.user_tag, dataset_attr.observation_tag)
-    even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag)
-    accept_tags = (odd_tags, even_tags)
-    for i, messages in enumerate(examples[dataset_attr.messages]):
-        if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag:
-            system = messages[0][dataset_attr.content_tag]
-            messages = messages[1:]
-        else:
-            system = examples[dataset_attr.system][i] if dataset_attr.system else ""
-
-        messages = messages[: len(messages) // 2 * 2]  # should be multiples of 2
-        if len(messages) == 0:
-            continue
-
-        aligned_messages = []
-        for turn_idx, message in enumerate(messages):
-            if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]:
-                raise ValueError("Invalid role tag in {}.".format(messages))
-
-            aligned_messages.append(
-                {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
-            )
-
-        outputs["prompt"].append(aligned_messages[:-1])
-        outputs["response"].append(aligned_messages[-1:])
-        outputs["system"].append(system)
-        outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
-        outputs["images"].append([])
-
-    return outputs
-
-
-def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
     outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
     tag_mapping = {
         dataset_attr.user_tag: Role.USER.value,
@@ -130,7 +84,6 @@ def convert_llava(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]:
         outputs["response"].append(aligned_messages[-1:])
         outputs["system"].append(system)
         outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
-        print(examples[dataset_attr.images][i])
         outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else [])
 
     return outputs
@@ -148,8 +101,6 @@ def align_dataset(
     """
    if dataset_attr.formatting == "alpaca":
         convert_func = partial(convert_alpaca, dataset_attr=dataset_attr)
-    elif dataset_attr.formatting == "llava":
-        convert_func = partial(convert_llava, dataset_attr=dataset_attr)
     else:
         convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr)
 
diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index 9cdcdfa2..6108b245 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -50,6 +50,7 @@ def preprocess_supervised_dataset(
     tokenizer: "PreTrainedTokenizer",
     template: "Template",
     data_args: "DataArguments",
+    processor: "AutoProcessor" = None,
 ) -> Dict[str, List[List[int]]]:
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
@@ -88,7 +89,9 @@ def preprocess_supervised_dataset(
         model_inputs["input_ids"].append(input_ids)
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
-
+        if processor is not None and "images" in examples:
+            pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0]
+            model_inputs["pixel_values"].append(pixel_values)
     return model_inputs
 
 
@@ -138,55 +141,6 @@ def preprocess_packed_supervised_dataset(
     return model_inputs
 
 
-def preprocess_multimodal_supervised_dataset(
-    examples: Dict[str, List[Any]],
-    processor: "AutoProcessor",
-    template: "Template",
-    data_args: "DataArguments",
-) -> Dict[str, List[List[int]]]:
-    # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
-    # for multiturn examples, we only mask the prompt part in each prompt-response pair.
-    tokenizer = processor.tokenizer
-    model_inputs = {"input_ids": [], "attention_mask": [], "labels": [], "pixel_values": []}
-
-    for i in range(len(examples["prompt"])):
-        if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
-            continue
-
-        messages = examples["prompt"][i] + examples["response"][i]
-        input_ids, labels = [], []
-        for turn_idx, (source_ids, target_ids) in enumerate(
-            template.encode_multiturn(
-                tokenizer,
-                messages,
-                examples["system"][i],
-                examples["tools"][i],
-                data_args.cutoff_len,
-                data_args.reserved_label_len,
-            )
-        ):
-            if data_args.train_on_prompt:
-                source_mask = source_ids
-            elif turn_idx != 0 and template.efficient_eos:
-                source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
-            else:
-                source_mask = [IGNORE_INDEX] * len(source_ids)
-
-            input_ids += source_ids + target_ids
-            labels += source_mask + target_ids
-
-        if template.efficient_eos:
-            input_ids += [tokenizer.eos_token_id]
-            labels += [tokenizer.eos_token_id]
-
-        model_inputs["input_ids"].append(input_ids)
-        model_inputs["attention_mask"].append([1] * len(input_ids))
-        model_inputs["labels"].append(labels)
-        pixel_values = processor.image_processor(examples["images"][0], return_tensors="pt")["pixel_values"][0]
-        model_inputs["pixel_values"].append(pixel_values)
-    return model_inputs
-
-
 def preprocess_unsupervised_dataset(
     examples: Dict[str, List[Any]],
     tokenizer: "PreTrainedTokenizer",
     template: "Template",
     data_args: "DataArguments",
@@ -307,15 +261,14 @@ def get_preprocess_and_print_func(
             preprocess_func = partial(
                 preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
             )
-        elif processor is not None:
-            preprocess_func = partial(
-                preprocess_multimodal_supervised_dataset, processor=processor, template=template, data_args=data_args
-            )
         else:
             preprocess_func = partial(
-                preprocess_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args
+                preprocess_supervised_dataset,
+                tokenizer=tokenizer,
+                template=template,
+                data_args=data_args,
+                processor=processor,
             )
-
         print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
     elif stage == "rm":
         preprocess_func = partial(