Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-10-14 23:58:11 +08:00)

Commit 8185eb1890: fix incorrect loss value for vlms
Parent: 03213984ec
Former-commit-id: 0aa29a71ce958343a2086090d647eb63b8f5f5be
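In short, the commit does two things: it widens the supported transformers range to 4.46.1 (and datasets to 3.0.2), and it adds a compute_loss override to the pt and sft trainers so that, on transformers 4.46.0/4.46.1, models that do not accept loss kwargs (the VLM path) no longer report a loss scaled by the gradient accumulation steps. A condensed, hedged sketch of that pattern, standalone and outside this repo (FixedLossTrainer is a made-up name; the helper mirrors the one updated in the diff below):

# Condensed sketch of the fix in this commit; assumes transformers and packaging are installed.
from functools import lru_cache
from importlib.metadata import version as installed_version

from packaging import version
from transformers import Trainer


@lru_cache
def is_transformers_version_equal_to_4_46() -> bool:
    # Now an inclusive range: 4.46.0 and 4.46.1 both need the fix.
    v = version.parse(installed_version("transformers"))
    return version.parse("4.46.0") <= v <= version.parse("4.46.1")


class FixedLossTrainer(Trainer):  # hypothetical name; the diff patches the project's own trainers
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        loss = super().compute_loss(model, inputs, return_outputs, **kwargs)
        # On 4.46.x, models that do not accept loss kwargs (e.g. the VLM path)
        # would otherwise report a loss scaled by gradient_accumulation_steps; undo that here.
        if is_transformers_version_equal_to_4_46() and not getattr(self, "model_accepts_loss_kwargs", False):
            loss /= self.args.gradient_accumulation_steps
        return loss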
@@ -1,5 +1,5 @@
-transformers>=4.41.2,<=4.46.0
-datasets>=2.16.0,<=2.21.0
+transformers>=4.41.2,<=4.46.1
+datasets>=2.16.0,<=3.0.2
 accelerate>=0.34.0,<=1.0.1
 peft>=0.11.1,<=0.12.0
 trl>=0.8.6,<=0.9.6
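Only the transformers and datasets upper bounds change here. To confirm a local environment against the new ranges, a small check along these lines works (illustrative; the requirement strings are copied from the diff):

# Illustrative only: check installed versions against the updated requirement ranges.
from importlib.metadata import version
from packaging.specifiers import SpecifierSet

requirements = {
    "transformers": SpecifierSet(">=4.41.2,<=4.46.1"),
    "datasets": SpecifierSet(">=2.16.0,<=3.0.2"),
    "accelerate": SpecifierSet(">=0.34.0,<=1.0.1"),
}

for name, spec in requirements.items():
    installed = version(name)
    print(f"{name} {installed}: {'ok' if installed in spec else 'outside ' + str(spec)}")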
@@ -20,17 +20,17 @@ Level:

 Dependency graph:
   main:
-    transformers>=4.41.2,<=4.46.0
-    datasets>=2.16.0,<=2.21.0
+    transformers>=4.41.2,<=4.46.1
+    datasets>=2.16.0,<=3.0.2
     accelerate>=0.34.0,<=1.0.1
     peft>=0.11.1,<=0.12.0
     trl>=0.8.6,<=0.9.6
   attention:
     transformers>=4.42.4 (gemma+fa2)
   longlora:
-    transformers>=4.41.2,<=4.46.0
+    transformers>=4.41.2,<=4.46.1
   packing:
-    transformers>=4.41.2,<=4.46.0
+    transformers>=4.41.2,<=4.46.1

 Disable version checking: DISABLE_VERSION_CHECK=1
 Enable VRAM recording: RECORD_VRAM=1
@@ -79,8 +79,8 @@ def check_dependencies() -> None:
     if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]:
         logger.warning("Version checking has been disabled, may lead to unexpected behaviors.")
     else:
-        require_version("transformers>=4.41.2,<=4.46.0", "To fix: pip install transformers>=4.41.2,<=4.46.0")
-        require_version("datasets>=2.16.0,<=2.21.0", "To fix: pip install datasets>=2.16.0,<=2.21.0")
+        require_version("transformers>=4.41.2,<=4.46.1", "To fix: pip install transformers>=4.41.2,<=4.46.1")
+        require_version("datasets>=2.16.0,<=3.0.2", "To fix: pip install datasets>=2.16.0,<=3.0.2")
         require_version("accelerate>=0.34.0,<=1.0.1", "To fix: pip install accelerate>=0.34.0,<=1.0.1")
         require_version("peft>=0.11.1,<=0.12.0", "To fix: pip install peft>=0.11.1,<=0.12.0")
         require_version("trl>=0.8.6,<=0.9.6", "To fix: pip install trl>=0.8.6,<=0.9.6")
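These checks rely on the require_version helper that ships with transformers; a minimal standalone sketch of how it behaves (illustrative only):

# Illustrative only: the version guard above, run on its own.
import os

from transformers.utils.versions import require_version

if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() not in ["true", "1"]:
    # Raises an error (with the hint appended) if the installed version falls
    # outside the declared range; otherwise it is a no-op.
    require_version("transformers>=4.41.2,<=4.46.1", "To fix: pip install transformers>=4.41.2,<=4.46.1")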
@@ -237,7 +237,7 @@ def try_download_model_from_other_hub(model_args: "ModelArguments") -> str:

     if use_modelscope():
         require_version("modelscope>=1.11.0", "To fix: pip install modelscope>=1.11.0")
-        from modelscope import snapshot_download
+        from modelscope import snapshot_download  # type: ignore

         revision = "master" if model_args.model_revision == "main" else model_args.model_revision
         return snapshot_download(
@@ -248,7 +248,7 @@ def try_download_model_from_other_hub(model_args: "ModelArguments") -> str:

     if use_openmind():
         require_version("openmind>=0.8.0", "To fix: pip install openmind>=0.8.0")
-        from openmind.utils.hub import snapshot_download
+        from openmind.utils.hub import snapshot_download  # type: ignore

         return snapshot_download(
             model_args.model_name_or_path,
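The `# type: ignore` comments added to these optional imports (modelscope, openmind, badam) only silence static type checkers when the optional package or its stubs are missing; runtime behavior is unchanged. A hedged sketch of the same pattern with a hypothetical optional package:

# `some_optional_hub` is a hypothetical package, used only to illustrate the pattern.
def download_snapshot(repo_id: str) -> str:
    from some_optional_hub import snapshot_download  # type: ignore  # quiets mypy/pyright when stubs are absent

    return snapshot_download(repo_id)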
@@ -81,7 +81,7 @@ def is_transformers_version_greater_than_4_43():

 @lru_cache
 def is_transformers_version_equal_to_4_46():
-    return _get_package_version("transformers") == version.parse("4.46.0")
+    return version.parse("4.46.0") <= _get_package_version("transformers") <= version.parse("4.46.1")


 def is_uvicorn_available():
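Despite its name, the helper now accepts an inclusive range rather than a single version: the old equality check missed 4.46.1, which presumably needs the same workaround. A standalone check of the comparison logic (packaging is already a transformers dependency):

# Illustrative only: the widened check matches both 4.46.0 and 4.46.1.
from packaging import version

def in_4_46_range(ver: str) -> bool:
    return version.parse("4.46.0") <= version.parse(ver) <= version.parse("4.46.1")

assert in_4_46_range("4.46.0")
assert in_4_46_range("4.46.1")        # missed by the old `== 4.46.0` check
assert not in_4_46_range("4.45.2")
assert not in_4_46_range("4.47.0.dev0")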
@@ -353,7 +353,7 @@ def llama_sdpa_attention_forward(


 def _apply_llama_patch() -> None:
-    require_version("transformers>=4.41.2,<=4.46.0", "To fix: pip install transformers>=4.41.2,<=4.46.0")
+    require_version("transformers>=4.41.2,<=4.46.1", "To fix: pip install transformers>=4.41.2,<=4.46.1")
     LlamaAttention.forward = llama_attention_forward
     LlamaFlashAttention2.forward = llama_flash_attention_2_forward
     LlamaSdpaAttention.forward = llama_sdpa_attention_forward
@@ -114,7 +114,7 @@ def get_unpad_data(attention_mask: "torch.Tensor") -> Tuple["torch.Tensor", "tor


 def _patch_for_block_diag_attn(model_type: str) -> None:
-    require_version("transformers>=4.41.2,<=4.46.0", "To fix: pip install transformers>=4.41.2,<=4.46.0")
+    require_version("transformers>=4.41.2,<=4.46.1", "To fix: pip install transformers>=4.41.2,<=4.46.1")
     if is_transformers_version_greater_than_4_43():
         import transformers.modeling_flash_attention_utils

@@ -101,7 +101,7 @@ class CustomDPOTrainer(DPOTrainer):
             self.callback_handler.add_callback(PissaConvertCallback)

         if finetuning_args.use_badam:
-            from badam import BAdamCallback, clip_grad_norm_old_version
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore

             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.add_callback(BAdamCallback)
@@ -274,7 +274,7 @@ class CustomDPOTrainer(DPOTrainer):
         https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/trainer.py#L3605
         """
         loss = super().compute_loss(model, inputs, return_outputs)
-        if kwargs.pop("num_items_in_batch", False) and is_transformers_version_equal_to_4_46():
+        if is_transformers_version_equal_to_4_46() and kwargs.pop("num_items_in_batch", False):
             loss /= self.args.gradient_accumulation_steps

         return loss
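In these compute_loss methods the operands of the `and` are swapped so the version check runs first; because Python's `and` short-circuits, `kwargs.pop("num_items_in_batch", ...)` is then only evaluated (and `kwargs` only mutated) on the affected transformers versions. A minimal illustration:

# Illustrative only: with `and`, the right-hand pop() runs only if the left side is truthy.
kwargs = {"num_items_in_batch": 8}

def is_affected_version() -> bool:
    return False  # pretend we are on an unaffected transformers release

if is_affected_version() and kwargs.pop("num_items_in_batch", False):
    raise AssertionError("not reached")

print(kwargs)  # {'num_items_in_batch': 8} -- untouched because the left side was False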
@@ -96,7 +96,7 @@ class CustomKTOTrainer(KTOTrainer):
             self.add_callback(SaveProcessorCallback(processor))

         if finetuning_args.use_badam:
-            from badam import BAdamCallback, clip_grad_norm_old_version
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore

             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.add_callback(BAdamCallback)
@@ -247,7 +247,7 @@ class CustomKTOTrainer(KTOTrainer):
         https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/trainer.py#L3605
         """
         loss = super().compute_loss(model, inputs, return_outputs)
-        if kwargs.pop("num_items_in_batch", False) and is_transformers_version_equal_to_4_46():
+        if is_transformers_version_equal_to_4_46() and kwargs.pop("num_items_in_batch", False):
             loss /= self.args.gradient_accumulation_steps

         return loss
@@ -181,7 +181,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             self.add_callback(SaveProcessorCallback(processor))

         if finetuning_args.use_badam:
-            from badam import BAdamCallback, clip_grad_norm_old_version
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore

             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.add_callback(BAdamCallback)
@@ -19,6 +19,7 @@ from transformers import Trainer
 from typing_extensions import override

 from ...extras.logging import get_logger
+from ...extras.packages import is_transformers_version_equal_to_4_46
 from ..callbacks import PissaConvertCallback, SaveProcessorCallback
 from ..trainer_utils import create_custom_optimizer, create_custom_scheduler

@@ -51,7 +52,7 @@ class CustomTrainer(Trainer):
             self.add_callback(PissaConvertCallback)

         if finetuning_args.use_badam:
-            from badam import BAdamCallback, clip_grad_norm_old_version
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore

             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.add_callback(BAdamCallback)
@@ -68,3 +69,15 @@ class CustomTrainer(Trainer):
     ) -> "torch.optim.lr_scheduler.LRScheduler":
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
+
+    @override
+    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+        r"""
+        Fixes the loss value for transformers 4.46.0.
+        https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/trainer.py#L3605
+        """
+        loss = super().compute_loss(model, inputs, return_outputs, **kwargs)
+        if is_transformers_version_equal_to_4_46() and not getattr(self, "model_accepts_loss_kwargs", False):
+            loss /= self.args.gradient_accumulation_steps  # other model should not scale the loss
+
+        return loss
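This new override (mirrored in the Seq2Seq trainer further down) is the core of the "incorrect loss value for vlms" fix: on transformers 4.46.0/4.46.1, when the model does not accept loss kwargs, the value the trainer would otherwise report is larger than the corrected one by exactly the gradient accumulation factor, which is what the division undoes. A toy illustration with made-up numbers:

# Made-up numbers, for illustration only.
gradient_accumulation_steps = 8
corrected_loss = 1.25                                              # value reported after this commit
uncorrected_loss = corrected_loss * gradient_accumulation_steps   # 10.0, the inflated value before the fix
assert uncorrected_loss / gradient_accumulation_steps == corrected_loss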
@@ -60,7 +60,7 @@ class PairwiseTrainer(Trainer):
             self.add_callback(PissaConvertCallback)

         if finetuning_args.use_badam:
-            from badam import BAdamCallback, clip_grad_norm_old_version
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore

             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.add_callback(BAdamCallback)
@@ -100,7 +100,7 @@ class PairwiseTrainer(Trainer):

         loss = -torch.nn.functional.logsigmoid(chosen_scores.float() - rejected_scores.float()).mean()

-        if kwargs.pop("num_items_in_batch", False) and is_transformers_version_equal_to_4_46():
+        if is_transformers_version_equal_to_4_46() and kwargs.pop("num_items_in_batch", False):
             loss /= self.args.gradient_accumulation_steps  # fixes the loss value for transformers 4.46.0

         if return_outputs:
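For reference, the loss in the context line above is the standard pairwise (Bradley-Terry style) reward-model objective: it rewards the chosen response scoring higher than the rejected one. A standalone toy example (tensor values are made up):

# Toy scores, for illustration only.
import torch

chosen_scores = torch.tensor([2.0, 0.5, 1.0])
rejected_scores = torch.tensor([1.0, 1.5, -0.5])

# -log(sigmoid(chosen - rejected)), averaged over the batch; small when chosen > rejected.
loss = -torch.nn.functional.logsigmoid(chosen_scores.float() - rejected_scores.float()).mean()
print(round(loss.item(), 2))  # ~0.61 for these numbers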
@@ -27,6 +27,7 @@ from typing_extensions import override

 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
+from ...extras.packages import is_transformers_version_equal_to_4_46
 from ..callbacks import PissaConvertCallback, SaveProcessorCallback
 from ..trainer_utils import create_custom_optimizer, create_custom_scheduler

@@ -60,7 +61,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
             self.add_callback(PissaConvertCallback)

         if finetuning_args.use_badam:
-            from badam import BAdamCallback, clip_grad_norm_old_version
+            from badam import BAdamCallback, clip_grad_norm_old_version  # type: ignore

             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.add_callback(BAdamCallback)
@@ -78,6 +79,18 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)

+    @override
+    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+        r"""
+        Fixes the loss value for transformers 4.46.0.
+        https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/trainer.py#L3605
+        """
+        loss = super().compute_loss(model, inputs, return_outputs, **kwargs)
+        if is_transformers_version_equal_to_4_46() and not getattr(self, "model_accepts_loss_kwargs", False):
+            loss /= self.args.gradient_accumulation_steps  # other model should not scale the loss
+
+        return loss
+
     @override
     def prediction_step(
         self,