From ed8d9e0881025ba3d39ebf9184ead88677161d30 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Thu, 2 May 2024 02:47:04 +0800
Subject: [PATCH] fix badam configs

Former-commit-id: 9433c8c215881692f318b89df03af97b4eda4dd5
---
 src/llmtuner/hparams/finetuning_args.py | 15 +++--
 src/llmtuner/train/utils.py             |  4 +-
 src/llmtuner/webui/components/train.py  |  8 +--
 src/llmtuner/webui/locales.py           | 80 ++++++++-----------------
 src/llmtuner/webui/runner.py            |  6 +-
 5 files changed, 44 insertions(+), 69 deletions(-)

diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index f4f71bc5..03bf52af 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -221,16 +221,18 @@ class BAdamArgument:
         default=None,
         metadata={"help": "The starting block index for layer-wise BAdam."},
     )
-    badam_switch_block_every: Optional[int] = field(
-        default=50,
-        metadata={"help": "How often to switch model's block update. Set to -1 to disable the block update."},
-    )
     badam_switch_mode: Optional[Literal["ascending", "descending", "random", "fixed"]] = field(
         default="ascending",
         metadata={"help": "the strategy of picking block to update for layer-wise BAdam."},
     )
+    badam_switch_interval: Optional[int] = field(
+        default=50,
+        metadata={
+            "help": "Number of steps to update the block for layer-wise BAdam. Use -1 to disable the block update."
+        },
+    )
     badam_update_ratio: float = field(
-        default=0.0,
+        default=0.05,
         metadata={"help": "The ratio of the update for ratio-wise BAdam."},
     )
     badam_mask_mode: Literal["adjacent", "scatter"] = field(
@@ -308,6 +310,9 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
         if self.use_galore and self.finetuning_type == "lora":
             raise ValueError("Cannot use LoRA with GaLore together.")
 
+        if self.use_galore and self.use_badam:
+            raise ValueError("Cannot use GaLore with BAdam together.")
+
         if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora":
             raise ValueError("`loraplus_lr_ratio` is only valid for the LoRA training.")
 
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index d9fc363d..21dac461 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -317,14 +317,14 @@ def _create_badam_optimizer(
             base_optimizer=base_optimizer,
             named_parameters_list=list(model.named_parameters()),
             block_prefix_list=None,
-            switch_block_every=finetuning_args.badam_switch_block_every,
+            switch_block_every=finetuning_args.badam_switch_interval,
             start_block=finetuning_args.badam_start_block,
             switch_mode=finetuning_args.badam_switch_mode,
             verbose=finetuning_args.badam_verbose,
         )
         logger.info(
             f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
-            f"switch block every {finetuning_args.badam_switch_block_every} steps, "
+            f"switch block every {finetuning_args.badam_switch_interval} steps, "
             f"default start block is {finetuning_args.badam_start_block}"
         )
 
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index be070869..c9671289 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -215,17 +215,17 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             use_badam = gr.Checkbox()
             badam_mode = gr.Dropdown(choices=["layer", "ratio"], value="layer")
             badam_switch_mode = gr.Dropdown(choices=["ascending", "descending", "random", "fixed"], value="ascending")
-            badam_switch_block_every = gr.Slider(value=50, minimum=-1, maximum=200, step=1)
-            badam_update_ratio = gr.Slider(value=0, minimum=0, maximum=1, step=0.01)
+            badam_switch_interval = gr.Slider(value=50, minimum=1, maximum=1024, step=1)
+            badam_update_ratio = gr.Slider(value=0.05, minimum=0, maximum=1, step=0.01)
 
-    input_elems.update({use_badam, badam_mode, badam_switch_mode, badam_switch_block_every, badam_update_ratio})
+    input_elems.update({use_badam, badam_mode, badam_switch_mode, badam_switch_interval, badam_update_ratio})
     elem_dict.update(
         dict(
             badam_tab=badam_tab,
             use_badam=use_badam,
             badam_mode=badam_mode,
             badam_switch_mode=badam_switch_mode,
-            badam_switch_block_every=badam_switch_block_every,
+            badam_switch_interval=badam_switch_interval,
             badam_update_ratio=badam_update_ratio,
         )
     )
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index d3dd4dc2..1c474f34 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -905,99 +905,71 @@ LOCALES = {
     "use_badam": {
         "en": {
             "label": "Use BAdam",
-            "info": "Enable the block coordinate optimization with Adam.",
+            "info": "Enable the BAdam optimizer.",
         },
         "ru": {
             "label": "Использовать BAdam",
-            "info": "Включите блочную оптимизацию координат с Adam.",
+            "info": "Включите оптимизатор BAdam.",
         },
         "zh": {
             "label": "使用 BAdam",
-            "info": "使用多Block协同的Adam优化器。",
+            "info": "使用 BAdam 优化器。",
         },
     },
     "badam_mode": {
         "en": {
             "label": "BAdam mode",
             "info": "Whether to use layer-wise or ratio-wise BAdam optimizer.",
         },
         "ru": {
             "label": "Режим BAdam",
-            "info": "Использовать оптимизатор BAdam с обработкой слоев или с обработкой коэффициентов.",
+            "info": "Использовать ли оптимизатор BAdam с послойной или пропорциональной настройкой.",
         },
         "zh": {
             "label": "BAdam 模式",
-            "info": "使用layer或者ratio比例模式。",
-        },
-    },
-    "badam_switch_block_every": {
-        "en": {
-            "label": "Switch block frequency",
-            "info": "How often to switch model's block update. Set to -1 to disable the block update.",
-        },
-        "ru": {
-            "label": "Частота переключения",
-            "info": "Как часто переключать обновление блока модели. Установите -1, чтобы отключить обновление блока.",
-        },
-        "zh": {
-            "label": "切换block的频率",
-            "info": "控制切换block切换的频率,如果是-1,则不切换。",
+            "info": "使用 layer-wise 或 ratio-wise BAdam 优化器。",
         },
     },
     "badam_switch_mode": {
         "en": {
             "label": "Switch mode",
             "info": "The strategy of picking block to update for layer-wise BAdam.",
         },
         "ru": {
-            "label": "Переключить режим",
-            "info": "Стратегия выбора блока для обновления в методе BAdam по слоям.",
+            "label": "Режим переключения",
+            "info": "Стратегия выбора блока для обновления для послойного BAdam.",
         },
         "zh": {
-            "label": "Block切换策略",
-            "info": "如果是layer类型的训练模式,如何切换block。",
+            "label": "切换策略",
+            "info": "Layer-wise BAdam 优化器的块切换策略。",
         },
     },
+    "badam_switch_interval": {
+        "en": {
+            "label": "Switch interval",
+            "info": "Number of steps to update the block for layer-wise BAdam.",
+        },
+        "ru": {
+            "label": "Интервал переключения",
+            "info": "Количество шагов для обновления блока для послойного BAdam.",
+        },
+        "zh": {
+            "label": "切换频率",
+            "info": "Layer-wise BAdam 优化器的块切换频率。",
+        },
+    },
     "badam_update_ratio": {
         "en": {
             "label": "Update ratio",
             "info": "The ratio of the update for ratio-wise BAdam.",
         },
         "ru": {
             "label": "Коэффициент обновления",
-            "info": "Коэффициент обновления для метода BAdam, основанного на коэффициентах.",
+            "info": "Коэффициент обновления для BAdam с учётом соотношений.",
         },
         "zh": {
-            "label": "Block更新比例",
-            "info": "如果是比例类型的训练模式,block每次更新的范围比例。",
-        },
-    },
-    "badam_mask_mode": {
-        "en": {
-            "label": "Mask mode",
-            "info": "The mode of the mask for BAdam optimizer.",
-        },
-        "ru": {
-            "label": "Режим маски",
-            "info": "Режим маски для оптимизатора BAdam.",
-        },
-        "zh": {
-            "label": "Mask模式",
-            "info": "BAdam优化器内训练参数的mask关系。",
-        },
-    },
-    "badam_verbose": {
-        "en": {
-            "label": "Verbosity level",
-            "info": "0 for no print, 1 for print the block prefix, 2 for print trainable parameters.",
-        },
-        "ru": {
-            "label": "Уровень многословности",
-            "info": "0 для отсутствия печати, 1 для печати префикса блока, 2 для печати обучаемых параметров.",
-        },
-        "zh": {
-            "label": "输出日志级别",
-            "info": "0:不输出,1:输出block前缀, 1:输出可训练的参数。",
+            "label": "Block 更新比例",
+            "info": "Ratio-wise BAdam 优化器的更新比例。",
         },
     },
     "cmd_preview_btn": {
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 52584f31..d53a4dfe 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -147,11 +147,11 @@ class Runner:
             shift_attn=get("train.shift_attn"),
             report_to="all" if get("train.report_to") else "none",
             use_galore=get("train.use_galore"),
+            use_badam=get("train.use_badam"),
             output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("train.output_dir")),
             fp16=(get("train.compute_type") == "fp16"),
             bf16=(get("train.compute_type") == "bf16"),
             pure_bf16=(get("train.compute_type") == "pure_bf16"),
-            use_badam=get("train.use_badam"),
         )
 
         args["disable_tqdm"] = True
@@ -201,11 +201,9 @@ class Runner:
 
         if args["use_badam"]:
             args["badam_mode"] = get("train.badam_mode")
-            args["badam_switch_block_every"] = get("train.badam_switch_block_every")
             args["badam_switch_mode"] = get("train.badam_switch_mode")
+            args["badam_switch_interval"] = get("train.badam_switch_interval")
             args["badam_update_ratio"] = get("train.badam_update_ratio")
-            args["badam_mask_mode"] = get("train.badam_mask_mode")
-            args["badam_verbose"] = get("train.badam_verbose")
 
         return args
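For reference, a minimal sketch of the plumbing this patch renames: `_create_badam_optimizer` forwards the new `finetuning_args.badam_switch_interval` (formerly `badam_switch_block_every`) into the `switch_block_every` argument of BAdam's `BlockOptimizer`. The keyword arguments below mirror the call shown in the src/llmtuner/train/utils.py hunk; the toy model and the concrete hyperparameter values are illustrative assumptions, not part of the patch.

import torch
from badam import BlockOptimizer  # the package _create_badam_optimizer imports

# Illustrative stand-ins for the real model and finetuning_args values.
model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.Linear(16, 4))
base_optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

optimizer = BlockOptimizer(
    base_optimizer=base_optimizer,
    named_parameters_list=list(model.named_parameters()),
    block_prefix_list=None,   # None: let BAdam derive the block partition
    switch_block_every=50,    # finetuning_args.badam_switch_interval (default 50)
    start_block=None,         # finetuning_args.badam_start_block
    switch_mode="ascending",  # finetuning_args.badam_switch_mode
    verbose=1,                # finetuning_args.badam_verbose
)

Note that after this change the web UI surfaces only badam_mode, badam_switch_mode, badam_switch_interval, and badam_update_ratio; badam_mask_mode and badam_verbose remain fields of BAdamArgument and stay configurable from the command line.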