Support distributed BAdam.

Former-commit-id: 0f72aac8c9
commit 3a5eacb4cf (parent 5d59f6562a)
Author: Jonery
Date: 2024-06-18 12:27:47 +08:00

7 changed files with 46 additions and 30 deletions
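Every trainer receives the same addition: when BAdam is enabled and the run uses DeepSpeed ZeRO stage 3, the `BAdamZeRO3Callback` shipped in `badam.utils` is registered with the trainer's callback handler. For orientation, here is that shared block in context, as a consolidated sketch assembled from the hunks below (not itself a diff hunk); the surrounding `use_badam` guard and the gradient-clipping patch already exist in each trainer, and the comments are interpretation rather than part of the diff.

# Consolidated sketch of the pattern added to each trainer's __init__
# (DPO, KTO, PPO, pre-train/SFT, reward model, Seq2Seq).
if finetuning_args.use_badam:
    from badam import clip_grad_norm_for_sparse_tensor

    # BAdam updates only one block of parameters per step, so gradient clipping
    # must tolerate the frozen (sparse) tensors of the inactive blocks.
    self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)

    # New in this commit: under DeepSpeed ZeRO-3, register the callback that the
    # badam package provides for coordinating with the sharded optimizer.
    if (self.args.deepspeed_plugin is not None
        and self.args.deepspeed_plugin.zero_stage == 3
    ):
        from badam.utils import BAdamZeRO3Callback

        self.callback_handler.add_callback(BAdamZeRO3Callback)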


@@ -100,6 +100,12 @@ class CustomDPOTrainer(DPOTrainer):
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)


@@ -95,6 +95,12 @@ class CustomKTOTrainer(KTOTrainer):
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)


@@ -170,6 +170,12 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
 
     def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None:
         r"""
         Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer.


@@ -52,6 +52,12 @@ class CustomTrainer(Trainer):
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)


@@ -76,6 +76,12 @@ class PairwiseTrainer(Trainer):
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)


@@ -57,9 +57,14 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
         if finetuning_args.use_badam:
             from badam import clip_grad_norm_for_sparse_tensor
 
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
@@ -80,21 +85,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
         if self.processor is not None:
             getattr(self.processor, "image_processor").save_pretrained(output_dir)
 
-    def training_step(self, *args, **kwargs):
-        r"""
-        Update the reference to deepspeed optimizer
-        """
-        if self.finetuning_args.use_badam and \
-            self.args.deepspeed_plugin is not None and \
-            self.args.deepspeed_plugin.zero_stage == 3:
-            ds_optim = self.optimizer.optimizer
-            badam_optim = ds_optim.optimizer
-            badam_optim.ds_optimizer = ds_optim
-
-        return super().training_step(*args, **kwargs)
-
     def prediction_step(
         self,
         model: "torch.nn.Module",