From 1f093334d137058807a8bbaba7ef14dc5332933e Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Tue, 21 May 2024 08:57:14 +0800
Subject: [PATCH 1/3] support pretraining of llava

Former-commit-id: 6a4c8cf0a6a1674c693b9337f018ff8df7477f8f
---
 src/llamafactory/hparams/model_args.py | 4 ++++
 src/llamafactory/model/loader.py       | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index 5885bb09..255051dc 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -85,6 +85,10 @@ class ModelArguments:
         default=False,
         metadata={"help": "Whethor or not to use multimodal LLM that accepts visual inputs."},
     )
+    tune_mm_proj: bool = field(
+        default=False,
+        metadata={"help": "Whethor or not only finetune mm_projector for MLLM."},
+    )
     moe_aux_loss_coef: Optional[float] = field(
         default=None,
         metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."},
diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py
index 49b347d5..d9784593 100644
--- a/src/llamafactory/model/loader.py
+++ b/src/llamafactory/model/loader.py
@@ -163,6 +163,11 @@ def load_model(
     else:
         model.train()
 
+    if model_args.visual_inputs and model_args.tune_mm_proj:
+        lm_params = [param for name, param in model.named_parameters() if "language_model" in name]
+        for param in lm_params:
+            param.requires_grad_(False)
+
     trainable_params, all_param = count_parameters(model)
     if is_trainable:
         param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(

From 932f0d5c20df491422343ddd05c09a28ecd9b169 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Mon, 27 May 2024 18:59:00 +0800
Subject: [PATCH 2/3] add regex of only tune lm and mm_proj

Former-commit-id: 38d540b3e69bceabafafab524fcfc78aeb05612d
---
 sites/paligemma-pt.yaml                | 49 ++++++++++++++++++++++++++
 sites/paligemma.yaml                   | 49 ++++++++++++++++++++++++++
 sites/paligemma_lora.yaml              | 40 +++++++++++++++++++++
 src/llamafactory/model/adapter.py      |  7 ++++
 src/llamafactory/model/loader.py       |  5 ---
 src/llamafactory/model/utils/visual.py |  7 +++-
 6 files changed, 151 insertions(+), 6 deletions(-)
 create mode 100644 sites/paligemma-pt.yaml
 create mode 100644 sites/paligemma.yaml
 create mode 100644 sites/paligemma_lora.yaml

diff --git a/sites/paligemma-pt.yaml b/sites/paligemma-pt.yaml
new file mode 100644
index 00000000..4305cf5f
--- /dev/null
+++ b/sites/paligemma-pt.yaml
@@ -0,0 +1,49 @@
+# model
+model_name_or_path: google/paligemma-3b-mix-448
+visual_inputs: true
+tune_mm_proj: true
+#print_param_status: true
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z2_offload_config.json
+
+# dataset
+dataset: mllm_pt_demo
+dataset_dir: data
+template: gemma
+cutoff_len: 2048
+max_samples: 3
+#val_size: 0.0001
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/paligemma/full/sft_llava_pt_test
+logging_steps: 1
+save_steps: 50
+plot_loss: true
+overwrite_output_dir: true
+#save_strategy: epoch
+#save_total_limit: 2
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 16
+learning_rate: 0.00001
+num_train_epochs: 100
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+#bf16: true
+pure_bf16: true
+
+# eval
+do_eval: false
+#per_device_eval_batch_size: 1
+#evaluation_strategy: steps
+#eval_steps: 500
diff --git a/sites/paligemma.yaml b/sites/paligemma.yaml
new file mode 100644
index 00000000..f3257cfc
--- /dev/null
+++ b/sites/paligemma.yaml
@@ -0,0 +1,49 @@
+# model
+model_name_or_path: google/paligemma-3b-mix-448
+visual_inputs: true
+#print_param_status: true
+use_fast_tokenizer: false
+
+# method
+stage: sft
+do_train: true
+finetuning_type: full
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z2_offload_config.json
+
+# dataset
+dataset: mllm_demo
+dataset_dir: data
+template: gemma
+cutoff_len: 2048
+max_samples: 3
+#val_size: 0.0001
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/paligemma/full/sft_llava_1k
+logging_steps: 1
+save_steps: 50
+plot_loss: true
+overwrite_output_dir: true
+#save_strategy: epoch
+#save_total_limit: 2
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 16
+learning_rate: 0.00001
+num_train_epochs: 100
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+#bf16: true
+pure_bf16: true
+
+# eval
+do_eval: false
+#per_device_eval_batch_size: 1
+#evaluation_strategy: steps
+#eval_steps: 500
diff --git a/sites/paligemma_lora.yaml b/sites/paligemma_lora.yaml
new file mode 100644
index 00000000..0693a6ae
--- /dev/null
+++ b/sites/paligemma_lora.yaml
@@ -0,0 +1,40 @@
+### model
+model_name_or_path: google/paligemma-3b-mix-448
+visual_inputs: true
+use_fast_tokenizer: false
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+### dataset
+dataset: mllm_demo
+template: gemma
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/paligemma/lora/sft_mllm
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py
index f37f3bbb..015db8a0 100644
--- a/src/llamafactory/model/adapter.py
+++ b/src/llamafactory/model/adapter.py
@@ -10,6 +10,7 @@ from ..extras.logging import get_logger
 from .utils.misc import find_all_linear_modules, find_expanded_modules
 from .utils.quantization import QuantizationMethod
 from .utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model
+from .utils.visual import filter_vision_tower_linear
 
 
 if TYPE_CHECKING:
@@ -58,6 +59,9 @@ def init_adapter(
     if model_args.visual_inputs and hasattr(model, "vision_tower"):  # freeze vision model
         model.vision_tower.requires_grad_(False)
 
+    if model_args.visual_inputs and hasattr(model, "language_model") and model_args.tune_mm_proj:  # freeze language model if only tune mm_proj
+        model.language_model.requires_grad_(False)
+
     if finetuning_args.finetuning_type == "freeze" and is_trainable:
         logger.info("Fine-tuning method: Freeze")
         num_layers = (
@@ -180,6 +184,9 @@
         if finetuning_args.use_llama_pro:
             target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable)
 
+        if model_args.visual_inputs:
+            target_modules = filter_vision_tower_linear(target_modules)
+
         if (
             finetuning_args.use_dora
             and getattr(model, "quantization_method", None) is not None
diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py
index d9784593..49b347d5 100644
--- a/src/llamafactory/model/loader.py
+++ b/src/llamafactory/model/loader.py
@@ -163,11 +163,6 @@ def load_model(
     else:
         model.train()
 
-    if model_args.visual_inputs and model_args.tune_mm_proj:
-        lm_params = [param for name, param in model.named_parameters() if "language_model" in name]
-        for param in lm_params:
-            param.requires_grad_(False)
-
     trainable_params, all_param = count_parameters(model)
     if is_trainable:
         param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format(
diff --git a/src/llamafactory/model/utils/visual.py b/src/llamafactory/model/utils/visual.py
index c8260b7f..a91777ba 100644
--- a/src/llamafactory/model/utils/visual.py
+++ b/src/llamafactory/model/utils/visual.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Tuple
+from typing import TYPE_CHECKING, Tuple, List
 
 import torch
 import transformers.models
@@ -82,3 +82,8 @@ def configure_visual_model(config: "PretrainedConfig") -> None:
     if getattr(config, "is_yi_vl_derived_model", None):
         logger.info("Detected Yi-VL model, applying projector patch.")
         transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVL
+
+
+def filter_vision_tower_linear(target_modules: List[str]) -> str:
+    target_modules = f"^(?!.*vision_tower).*(?:{'|'.join(target_modules)}).*"
+    return target_modules

From dc7c54067e12ad45e2bd37b0e92a64a407fb41d8 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Mon, 27 May 2024 19:00:15 +0800
Subject: [PATCH 3/3] add only tune lm and mm_proj

Former-commit-id: ba12ca430ec527fbfe4cd1eace0adb5c7712146a
---
 sites/paligemma-pt.yaml   | 49 ---------------------------------------
 sites/paligemma.yaml      | 49 ---------------------------------------
 sites/paligemma_lora.yaml | 40 --------------------------------
 3 files changed, 138 deletions(-)
 delete mode 100644 sites/paligemma-pt.yaml
 delete mode 100644 sites/paligemma.yaml
 delete mode 100644 sites/paligemma_lora.yaml

diff --git a/sites/paligemma-pt.yaml b/sites/paligemma-pt.yaml
deleted file mode 100644
index 4305cf5f..00000000
--- a/sites/paligemma-pt.yaml
+++ /dev/null
@@ -1,49 +0,0 @@
-# model
-model_name_or_path: google/paligemma-3b-mix-448
-visual_inputs: true
-tune_mm_proj: true
-#print_param_status: true
-
-# method
-stage: sft
-do_train: true
-finetuning_type: full
-
-# ddp
-ddp_timeout: 180000000
-deepspeed: examples/deepspeed/ds_z2_offload_config.json
-
-# dataset
-dataset: mllm_pt_demo
-dataset_dir: data
-template: gemma
-cutoff_len: 2048
-max_samples: 3
-#val_size: 0.0001
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-# output
-output_dir: saves/paligemma/full/sft_llava_pt_test
-logging_steps: 1
-save_steps: 50
-plot_loss: true
-overwrite_output_dir: true
-#save_strategy: epoch
-#save_total_limit: 2
-
-# train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 16
-learning_rate: 0.00001
-num_train_epochs: 100
-lr_scheduler_type: cosine
-warmup_steps: 0.1
-#bf16: true
-pure_bf16: true
-
-# eval
-do_eval: false
-#per_device_eval_batch_size: 1
-#evaluation_strategy: steps
-#eval_steps: 500
diff --git a/sites/paligemma.yaml b/sites/paligemma.yaml
deleted file mode 100644
index f3257cfc..00000000
--- a/sites/paligemma.yaml
+++ /dev/null
@@ -1,49 +0,0 @@
-# model
-model_name_or_path: google/paligemma-3b-mix-448
-visual_inputs: true
-#print_param_status: true
-use_fast_tokenizer: false
-
-# method
-stage: sft
-do_train: true
-finetuning_type: full
-
-# ddp
-ddp_timeout: 180000000
-deepspeed: examples/deepspeed/ds_z2_offload_config.json
-
-# dataset
-dataset: mllm_demo
-dataset_dir: data
-template: gemma
-cutoff_len: 2048
-max_samples: 3
-#val_size: 0.0001
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-# output
-output_dir: saves/paligemma/full/sft_llava_1k
-logging_steps: 1
-save_steps: 50
-plot_loss: true
-overwrite_output_dir: true
-#save_strategy: epoch
-#save_total_limit: 2
-
-# train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 16
-learning_rate: 0.00001
-num_train_epochs: 100
-lr_scheduler_type: cosine
-warmup_steps: 0.1
-#bf16: true
-pure_bf16: true
-
-# eval
-do_eval: false
-#per_device_eval_batch_size: 1
-#evaluation_strategy: steps
-#eval_steps: 500
diff --git a/sites/paligemma_lora.yaml b/sites/paligemma_lora.yaml
deleted file mode 100644
index 0693a6ae..00000000
--- a/sites/paligemma_lora.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-### model
-model_name_or_path: google/paligemma-3b-mix-448
-visual_inputs: true
-use_fast_tokenizer: false
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: q_proj,v_proj
-
-### dataset
-dataset: mllm_demo
-template: gemma
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/paligemma/lora/sft_mllm
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 0.0001
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_steps: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-evaluation_strategy: steps
-eval_steps: 500
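
A quick illustration of the regex introduced in src/llamafactory/model/utils/visual.py: filter_vision_tower_linear collapses the list of LoRA target modules into a single pattern whose negative lookahead rejects any module name containing "vision_tower", so the frozen vision encoder never receives LoRA adapters. The sketch below is illustrative only; it assumes PEFT-style regex matching of module names, and the module names are hypothetical examples rather than names taken from the patch.

import re

# Pattern as produced by filter_vision_tower_linear(["q_proj", "v_proj"]):
# the negative lookahead drops anything under the vision tower, then the
# usual target-module keywords are matched anywhere in the remaining names.
target_modules = ["q_proj", "v_proj"]
pattern = f"^(?!.*vision_tower).*(?:{'|'.join(target_modules)}).*"

# Hypothetical module names, for illustration only.
candidates = [
    "language_model.model.layers.0.self_attn.q_proj",               # matches -> gets LoRA
    "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj",  # rejected by the lookahead
    "multi_modal_projector.linear",                                  # no q_proj/v_proj keyword -> skipped
]
for name in candidates:
    print(name, bool(re.match(pattern, name)))

Because PEFT treats a string target_modules as a regular expression, passing this single pattern keeps the existing lora_target configuration unchanged while still excluding the vision tower that init_adapter freezes.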