Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-08-22 22:02:51 +08:00)

Support distributed BAdam.

Allow layer-wise BAdam in distributed training on top of DeepSpeed ZeRO stage 3: rework the checks in get_train_args (ratio-wise BAdam is now the unsupported distributed mode), register BAdamZeRO3Callback in every trainer when a ZeRO-3 DeepSpeed plugin is active, and drop the per-step ds_optimizer re-wiring from the SFT trainer's training_step override.

Former-commit-id: 0f72aac8c9227e33ad20d2b1641b1c9faae16a5f
Parent: 5d59f6562a
Commit: 3a5eacb4cf
@@ -209,24 +209,20 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     ):
         raise ValueError("Distributed training does not support layer-wise GaLore.")
 
-<<<<<<< HEAD
-    # if (
-    #     finetuning_args.use_badam
-    #     and finetuning_args.badam_mode == "layer"
-    #     and training_args.parallel_mode.value == "distributed"
-    # ):
-    #     raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.")
-=======
     if (
         finetuning_args.use_badam
-        and finetuning_args.badam_mode == "layer"
-        and training_args.parallel_mode == ParallelMode.DISTRIBUTED
+        and training_args.parallel_mode.value == "distributed"
     ):
-        raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.")
->>>>>>> upstream/main
+        if finetuning_args.badam_mode == "ratio":
+            raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer")
+        if (finetuning_args.badam_mode == "layer"
+            and training_args.deepspeed_plugin is not None
+            and training_args.deepspeed_plugin.zero_stage < 3
+        ):
+            raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO stage 3, got stage {training_args.deepspeed_plugin.zero_stage}")
 
-    if (finetuning_args.use_galore or finetuning_args.use_badam) and training_args.deepspeed is not None:
-        raise ValueError("GaLore and BAdam are incompatible with DeepSpeed yet.")
+    if finetuning_args.use_galore and training_args.deepspeed is not None:
+        raise ValueError("GaLore is incompatible with DeepSpeed yet.")
 
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
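Taken together, the checks after this change read: if BAdam is enabled in a distributed run, ratio-wise mode is rejected outright, and layer-wise mode is rejected only when a DeepSpeed plugin is configured with a ZeRO stage below 3. A minimal stand-alone sketch of that decision logic; check_badam_args and the SimpleNamespace stand-ins are illustrative and not part of the patch (the real code reads an enum via training_args.parallel_mode):

from types import SimpleNamespace

def check_badam_args(finetuning_args, training_args) -> None:
    # Hypothetical restatement of the BAdam checks added in the hunk above.
    if finetuning_args.use_badam and training_args.parallel_mode == "distributed":
        if finetuning_args.badam_mode == "ratio":
            raise ValueError(
                "Ratio-wise BAdam does not yet support distributed training, "
                "use layer-wise BAdam: --badam_mode layer"
            )
        if (finetuning_args.badam_mode == "layer"
            and training_args.deepspeed_plugin is not None
            and training_args.deepspeed_plugin.zero_stage < 3
        ):
            raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO stage 3.")

# Layer-wise BAdam with ZeRO-3 passes; ratio-wise BAdam in the same setup raises.
zero3 = SimpleNamespace(zero_stage=3)
run = SimpleNamespace(parallel_mode="distributed", deepspeed_plugin=zero3)
check_badam_args(SimpleNamespace(use_badam=True, badam_mode="layer"), run)
try:
    check_badam_args(SimpleNamespace(use_badam=True, badam_mode="ratio"), run)
except ValueError as err:
    print(err)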
@@ -100,6 +100,12 @@ class CustomDPOTrainer(DPOTrainer):
 
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
+
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
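The same five-line guard is added to every trainer in this commit: when a DeepSpeed plugin is present and running ZeRO stage 3, the trainer registers BAdamZeRO3Callback from the badam package with its callback handler. A rough sketch of the pattern, using a hypothetical logging callback in place of BAdamZeRO3Callback (whose implementation is not part of this diff), and register_if_zero3 as an illustrative helper:

from transformers import TrainerCallback

class SketchZeRO3Callback(TrainerCallback):
    # Hypothetical stand-in for badam.utils.BAdamZeRO3Callback.
    def on_train_begin(self, args, state, control, **kwargs):
        print("BAdam ZeRO-3 specific setup would run here.")

def register_if_zero3(trainer) -> None:
    # Mirrors the guard added inside each trainer's __init__.
    plugin = trainer.args.deepspeed_plugin
    if plugin is not None and plugin.zero_stage == 3:
        trainer.callback_handler.add_callback(SketchZeRO3Callback)

Passing the class rather than an instance is fine: the Hugging Face callback handler instantiates a callback class when it receives a type, which is why the patch can register BAdamZeRO3Callback directly.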
@@ -95,6 +95,12 @@ class CustomKTOTrainer(KTOTrainer):
 
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
+
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
@@ -170,6 +170,12 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
 
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
+
     def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None:
         r"""
         Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer.
@@ -52,6 +52,12 @@ class CustomTrainer(Trainer):
 
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
+
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
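For context, the create_optimizer override that appears (unchanged) in each of these trainers follows one pattern: ask the project's custom optimizer factory first, and presumably fall back to the stock Trainer optimizer when the factory declines. A minimal sketch of that pattern under those assumptions, with build_custom_optimizer as a hypothetical stand-in for create_custom_optimzer:

from typing import Optional

import torch
from transformers import Trainer

def build_custom_optimizer(model, args, finetuning_args) -> Optional["torch.optim.Optimizer"]:
    # Hypothetical stand-in: return a GaLore/BAdam optimizer here, or None for the default.
    return None

class SketchTrainer(Trainer):
    def __init__(self, finetuning_args=None, **kwargs):
        super().__init__(**kwargs)
        self.finetuning_args = finetuning_args

    def create_optimizer(self) -> "torch.optim.Optimizer":
        if self.optimizer is None:
            self.optimizer = build_custom_optimizer(self.model, self.args, self.finetuning_args)
        # If the factory returned None, the parent implementation builds the usual optimizer;
        # otherwise it simply returns the one set above.
        return super().create_optimizer()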
@@ -76,6 +76,12 @@ class PairwiseTrainer(Trainer):
 
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
+
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
@@ -57,9 +57,14 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
 
         if finetuning_args.use_badam:
             from badam import clip_grad_norm_for_sparse_tensor
 
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
 
+            if (self.args.deepspeed_plugin is not None
+                and self.args.deepspeed_plugin.zero_stage == 3
+            ):
+                from badam.utils import BAdamZeRO3Callback
+                self.callback_handler.add_callback(BAdamZeRO3Callback)
+
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
             self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
@@ -80,21 +85,6 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
         if self.processor is not None:
             getattr(self.processor, "image_processor").save_pretrained(output_dir)
 
-    def training_step(self, *args, **kwargs):
-        r"""
-        Update the reference to deepspeed optimizer
-        """
-        if self.finetuning_args.use_badam and \
-            self.args.deepspeed_plugin is not None and \
-            self.args.deepspeed_plugin.zero_stage == 3:
-
-            ds_optim = self.optimizer.optimizer
-            badam_optim = ds_optim.optimizer
-            badam_optim.ds_optimizer = ds_optim
-
-        return super().training_step(*args, **kwargs)
-
     def prediction_step(
         self,
         model: "torch.nn.Module",
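The training_step override removed above had, on every step, walked the optimizer wrapper chain and handed the inner BAdam optimizer a reference back to DeepSpeed's wrapper; with BAdamZeRO3Callback registered at construction time, that per-step wiring is presumably handled by the callback instead. A sketch of the chain the deleted code walked, using hypothetical placeholder classes rather than the real accelerate/DeepSpeed/BAdam objects:

class BAdamOptimizer:
    # Innermost object: the actual BAdam optimizer (placeholder).
    ds_optimizer = None

class DeepSpeedWrapper:
    # Middle layer: DeepSpeed's ZeRO optimizer wrapper (placeholder).
    def __init__(self, inner):
        self.optimizer = inner

class AcceleratedWrapper:
    # Outermost layer: what the Trainer holds as self.optimizer (placeholder).
    def __init__(self, inner):
        self.optimizer = inner

badam = BAdamOptimizer()
trainer_optimizer = AcceleratedWrapper(DeepSpeedWrapper(badam))

# What the deleted training_step did on each call:
ds_optim = trainer_optimizer.optimizer    # accelerate wrapper -> DeepSpeed optimizer
badam_optim = ds_optim.optimizer          # DeepSpeed optimizer -> BAdam optimizer
badam_optim.ds_optimizer = ds_optim       # give BAdam a handle back to the DeepSpeed wrapper
assert badam.ds_optimizer is ds_optim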