Move load_stats to TrainingLoop

Summary:
Stats are logically connected to the training loop, not to the model. Hence, moving to the training loop.

Also removing resume_epoch from OptimizerFactory in favor of a single place - ModelFactory. This removes the need for config consistency checks etc.

Reviewed By: kjchalup

Differential Revision: D38313475

fbshipit-source-id: a1d188a63e28459df381ff98ad8acdcdb14887b7
This commit is contained in:
David Novotny
2022-08-02 15:40:53 -07:00
committed by Facebook GitHub Bot
parent 760305e044
commit c3f8dad55c
11 changed files with 259 additions and 189 deletions

View File

@@ -124,14 +124,32 @@ data_source_ImplicitronDataSource_args:
dataset_length_val: 0
dataset_length_test: 0
model_factory_ImplicitronModelFactory_args:
force_load: false
resume: true
model_class_type: GenericModel
resume: false
resume_epoch: -1
visdom_env: ''
visdom_port: 8097
visdom_server: http://127.0.0.1
force_resume: false
model_GenericModel_args:
log_vars:
- loss_rgb_psnr_fg
- loss_rgb_psnr
- loss_rgb_mse
- loss_rgb_huber
- loss_depth_abs
- loss_depth_abs_fg
- loss_mask_neg_iou
- loss_mask_bce
- loss_mask_beta_prior
- loss_eikonal
- loss_density_tv
- loss_depth_neg_penalty
- loss_autodecoder_norm
- loss_prev_stage_rgb_mse
- loss_prev_stage_rgb_psnr_fg
- loss_prev_stage_rgb_psnr
- loss_prev_stage_mask_bce
- objective
- epoch
- sec/it
mask_images: true
mask_depths: true
render_image_width: 400
@@ -162,27 +180,6 @@ model_factory_ImplicitronModelFactory_args:
loss_prev_stage_rgb_mse: 1.0
loss_mask_bce: 0.0
loss_prev_stage_mask_bce: 0.0
log_vars:
- loss_rgb_psnr_fg
- loss_rgb_psnr
- loss_rgb_mse
- loss_rgb_huber
- loss_depth_abs
- loss_depth_abs_fg
- loss_mask_neg_iou
- loss_mask_bce
- loss_mask_beta_prior
- loss_eikonal
- loss_density_tv
- loss_depth_neg_penalty
- loss_autodecoder_norm
- loss_prev_stage_rgb_mse
- loss_prev_stage_rgb_psnr_fg
- loss_prev_stage_rgb_psnr
- loss_prev_stage_mask_bce
- objective
- epoch
- sec/it
global_encoder_HarmonicTimeEncoder_args:
n_harmonic_functions: 10
append_input: true
@@ -422,8 +419,6 @@ optimizer_factory_ImplicitronOptimizerFactory_args:
lr_policy: MultiStepLR
momentum: 0.9
multistep_lr_milestones: []
resume: false
resume_epoch: -1
weight_decay: 0.0
training_loop_ImplicitronTrainingLoop_args:
eval_only: false
@@ -437,6 +432,9 @@ training_loop_ImplicitronTrainingLoop_args:
clip_grad: 0.0
metric_print_interval: 5
visualize_interval: 1000
visdom_env: ''
visdom_port: 8097
visdom_server: http://127.0.0.1
evaluator_ImplicitronEvaluator_args:
camera_difficulty_bin_breaks:
- 0.97

View File

@@ -5,6 +5,7 @@
# LICENSE file in the root directory of this source tree.
import os
import tempfile
import unittest
from pathlib import Path
@@ -172,3 +173,48 @@ class TestNerfRepro(unittest.TestCase):
experiment_runner = experiment.Experiment(**cfg)
experiment.dump_cfg(cfg)
experiment_runner.run()
@unittest.skip("This test checks resuming of the NeRF training.")
def test_nerf_blender_resume(self):
# Train one train batch of NeRF, then resume for one more batch.
# Set env vars BLENDER_DATASET_ROOT and BLENDER_SINGLESEQ_CLASS first!
if not interactive_testing_requested():
return
with initialize_config_dir(config_dir=str(IMPLICITRON_CONFIGS_DIR)):
with tempfile.TemporaryDirectory() as exp_dir:
cfg = compose(config_name="repro_singleseq_nerf_blender", overrides=[])
cfg.exp_dir = exp_dir
# set dataset len to 1
# fmt: off
(
cfg
.data_source_ImplicitronDataSource_args
.data_loader_map_provider_SequenceDataLoaderMapProvider_args
.dataset_length_train
) = 1
# fmt: on
# run for one epoch
cfg.training_loop_ImplicitronTrainingLoop_args.max_epochs = 1
experiment_runner = experiment.Experiment(**cfg)
experiment.dump_cfg(cfg)
experiment_runner.run()
# update num epochs + 2, let the optimizer resume
cfg.training_loop_ImplicitronTrainingLoop_args.max_epochs = 3
experiment_runner = experiment.Experiment(**cfg)
experiment_runner.run()
# start from scratch
cfg.model_factory_ImplicitronModelFactory_args.resume = False
experiment_runner = experiment.Experiment(**cfg)
experiment_runner.run()
# force resume from epoch 1
cfg.model_factory_ImplicitronModelFactory_args.resume = True
cfg.model_factory_ImplicitronModelFactory_args.force_resume = True
cfg.model_factory_ImplicitronModelFactory_args.resume_epoch = 1
experiment_runner = experiment.Experiment(**cfg)
experiment_runner.run()