diff --git a/projects/implicitron_trainer/experiment.py b/projects/implicitron_trainer/experiment.py
index 02566288..1f65b5f1 100755
--- a/projects/implicitron_trainer/experiment.py
+++ b/projects/implicitron_trainer/experiment.py
@@ -53,6 +53,7 @@ import warnings
 from dataclasses import field
 
 import hydra
+
 import torch
 from accelerate import Accelerator
 from omegaconf import DictConfig, OmegaConf
@@ -78,6 +79,7 @@ from pytorch3d.implicitron.tools.config import (
 from .impl.model_factory import ModelFactoryBase
 from .impl.optimizer_factory import OptimizerFactoryBase
 from .impl.training_loop import TrainingLoopBase
+from .impl.utils import seed_all_random_engines
 
 logger = logging.getLogger(__name__)
 
@@ -110,6 +112,7 @@ class Experiment(Configurable):  # pyre-ignore: 13
             scheduler.
         training_loop: An object that runs training given the outputs produced
             by the data_source, model_factory and optimizer_factory.
+        seed: A random seed to ensure reproducibility.
         detect_anomaly: Whether torch.autograd should detect anomalies. Useful
             for debugging, but might slow down the training.
         exp_dir: Root experimentation directory. Checkpoints and training stats
@@ -125,6 +128,7 @@ class Experiment(Configurable):  # pyre-ignore: 13
     training_loop: TrainingLoopBase
     training_loop_class_type: str = "ImplicitronTrainingLoop"
 
+    seed: int = 42
     detect_anomaly: bool = False
     exp_dir: str = "./data/default_experiment/"
 
@@ -136,6 +140,10 @@ class Experiment(Configurable):  # pyre-ignore: 13
     )
 
     def __post_init__(self):
+        seed_all_random_engines(
+            self.seed
+        )  # Set all random engine seeds for reproducibility
+
         run_auto_creation(self)
 
     def run(self) -> None:
@@ -214,6 +222,7 @@ class Experiment(Configurable):  # pyre-ignore: 13
             device=device,
             exp_dir=self.exp_dir,
             stats=stats,
+            seed=self.seed,
             task=task,
         )
 
diff --git a/projects/implicitron_trainer/impl/training_loop.py b/projects/implicitron_trainer/impl/training_loop.py
index 9a0601a6..17b38a18 100644
--- a/projects/implicitron_trainer/impl/training_loop.py
+++ b/projects/implicitron_trainer/impl/training_loop.py
@@ -5,11 +5,9 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-import random
 import time
 from typing import Any, Optional
 
-import numpy as np
 import torch
 from accelerate import Accelerator
 from pytorch3d.implicitron.dataset.data_source import Task
@@ -26,6 +24,8 @@ from pytorch3d.implicitron.tools.stats import Stats
 from pytorch3d.renderer.cameras import CamerasBase
 from torch.utils.data import DataLoader
 
+from .utils import seed_all_random_engines
+
 logger = logging.getLogger(__name__)
 
 
@@ -52,7 +52,6 @@ class ImplicitronTrainingLoop(TrainingLoopBase):  # pyre-ignore [13]
         max_epochs: Train for this many epochs. Note that if the model was
             loaded from a checkpoint, we will restart training at the appropriate
             epoch and run for (max_epochs - checkpoint_epoch) epochs.
-        seed: A random seed to ensure reproducibility.
         store_checkpoints: If True, store model and optimizer state checkpoints.
         store_checkpoints_purge: If >= 0, remove any checkpoints older or equal
             to this many epochs.
@@ -73,7 +72,6 @@ class ImplicitronTrainingLoop(TrainingLoopBase):  # pyre-ignore [13]
     evaluator: EvaluatorBase
     evaluator_class_type: str = "ImplicitronEvaluator"
     max_epochs: int = 1000
-    seed: int = 0
     store_checkpoints: bool = True
     store_checkpoints_purge: int = 1
     test_interval: int = -1
@@ -102,6 +100,7 @@ class ImplicitronTrainingLoop(TrainingLoopBase):  # pyre-ignore [13]
         device: torch.device,
         exp_dir: str,
         stats: Stats,
+        seed: int,
         task: Task,
         **kwargs,
     ):
@@ -109,7 +108,6 @@ class ImplicitronTrainingLoop(TrainingLoopBase):  # pyre-ignore [13]
         Entry point to run the training and validation loops
         based on the specified config file.
         """
-        _seed_all_random_engines(self.seed)
         start_epoch = stats.epoch + 1
         assert scheduler.last_epoch == stats.epoch + 1
         assert scheduler.last_epoch == start_epoch
@@ -140,7 +138,7 @@ class ImplicitronTrainingLoop(TrainingLoopBase):  # pyre-ignore [13]
 
             # Make sure to re-seed random generators to ensure reproducibility
             # even after restart.
-            _seed_all_random_engines(self.seed + epoch)
+            seed_all_random_engines(seed + epoch)
 
             cur_lr = float(scheduler.get_last_lr()[-1])
             logger.debug(f"scheduler lr = {cur_lr:1.2e}")
@@ -357,9 +355,3 @@ class ImplicitronTrainingLoop(TrainingLoopBase):  # pyre-ignore [13]
             model_io.safe_save_model(
                 unwrapped_model, stats, outfile, optimizer=optimizer
             )
-
-
-def _seed_all_random_engines(seed: int) -> None:
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    random.seed(seed)
diff --git a/projects/implicitron_trainer/impl/utils.py b/projects/implicitron_trainer/impl/utils.py
new file mode 100644
index 00000000..4fac4463
--- /dev/null
+++ b/projects/implicitron_trainer/impl/utils.py
@@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import random
+
+import numpy as np
+import torch
+
+
+def seed_all_random_engines(seed: int) -> None:
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    random.seed(seed)
diff --git a/projects/implicitron_trainer/tests/experiment.yaml b/projects/implicitron_trainer/tests/experiment.yaml
index 2ecba628..a97246b7 100644
--- a/projects/implicitron_trainer/tests/experiment.yaml
+++ b/projects/implicitron_trainer/tests/experiment.yaml
@@ -2,6 +2,7 @@ data_source_class_type: ImplicitronDataSource
 model_factory_class_type: ImplicitronModelFactory
 optimizer_factory_class_type: ImplicitronOptimizerFactory
 training_loop_class_type: ImplicitronTrainingLoop
+seed: 42
 detect_anomaly: false
 exp_dir: ./data/default_experiment/
 hydra:
@@ -429,7 +430,6 @@ training_loop_ImplicitronTrainingLoop_args:
   eval_only: false
   evaluator_class_type: ImplicitronEvaluator
   max_epochs: 1000
-  seed: 0
   store_checkpoints: true
   store_checkpoints_purge: 1
   test_interval: -1
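
Reviewer note on the refactor above: `seed_all_random_engines` moves from a private helper in training_loop.py to impl/utils.py, the seed moves from the training loop config to the top-level Experiment config, and the loop re-seeds with `seed + epoch` at the start of every epoch, so a run restarted from a checkpoint at epoch N consumes the same random stream as an uninterrupted run entering epoch N. Below is a minimal standalone sketch of that behavior; the helper body is restated from the new impl/utils.py so the snippet runs without the repo, and the `draw` function is a hypothetical stand-in for per-epoch randomness, not part of the patch.

import random

import numpy as np
import torch


def seed_all_random_engines(seed: int) -> None:
    # Same body as projects/implicitron_trainer/impl/utils.py:
    # seeds NumPy, PyTorch, and Python's stdlib RNG in one call.
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)


def draw():
    # Hypothetical helper: one sample from each engine the utility seeds.
    return (random.random(), float(np.random.rand()), torch.rand(1).item())


seed_all_random_engines(42)
first = draw()
seed_all_random_engines(42)
assert draw() == first  # all three engines replay identically

# Mirrors the per-epoch re-seeding in ImplicitronTrainingLoop.run:
# resuming at epoch 2 re-derives the same per-epoch seeds (42 + epoch)
# that a fresh run would have used when it reached epoch 2.
for epoch in range(2, 5):
    seed_all_random_engines(42 + epoch)
    # ... epoch body sees the same random stream as an uninterrupted run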