From 80fc0ee0b643f2d6b24091b4ec722d617ffba165 Mon Sep 17 00:00:00 2001 From: David Novotny Date: Mon, 1 Aug 2022 10:03:09 -0700 Subject: [PATCH] Better seeding of random engines Summary: Currently, seeds are set only inside the train loop. But this does not ensure that the model weights are initialized the same way everywhere which makes all experiments irreproducible. This diff fixes it. Reviewed By: bottler Differential Revision: D38315840 fbshipit-source-id: 3d2ecebbc36072c2b68dd3cd8c5e30708e7dd808 --- projects/implicitron_trainer/experiment.py | 9 +++++++++ .../implicitron_trainer/impl/training_loop.py | 16 ++++------------ projects/implicitron_trainer/impl/utils.py | 17 +++++++++++++++++ .../implicitron_trainer/tests/experiment.yaml | 2 +- 4 files changed, 31 insertions(+), 13 deletions(-) create mode 100644 projects/implicitron_trainer/impl/utils.py diff --git a/projects/implicitron_trainer/experiment.py b/projects/implicitron_trainer/experiment.py index 02566288..1f65b5f1 100755 --- a/projects/implicitron_trainer/experiment.py +++ b/projects/implicitron_trainer/experiment.py @@ -53,6 +53,7 @@ import warnings from dataclasses import field import hydra + import torch from accelerate import Accelerator from omegaconf import DictConfig, OmegaConf @@ -78,6 +79,7 @@ from pytorch3d.implicitron.tools.config import ( from .impl.model_factory import ModelFactoryBase from .impl.optimizer_factory import OptimizerFactoryBase from .impl.training_loop import TrainingLoopBase +from .impl.utils import seed_all_random_engines logger = logging.getLogger(__name__) @@ -110,6 +112,7 @@ class Experiment(Configurable): # pyre-ignore: 13 scheduler. training_loop: An object that runs training given the outputs produced by the data_source, model_factory and optimizer_factory. + seed: A random seed to ensure reproducibility. detect_anomaly: Whether torch.autograd should detect anomalies. Useful for debugging, but might slow down the training. exp_dir: Root experimentation directory. Checkpoints and training stats @@ -125,6 +128,7 @@ class Experiment(Configurable): # pyre-ignore: 13 training_loop: TrainingLoopBase training_loop_class_type: str = "ImplicitronTrainingLoop" + seed: int = 42 detect_anomaly: bool = False exp_dir: str = "./data/default_experiment/" @@ -136,6 +140,10 @@ class Experiment(Configurable): # pyre-ignore: 13 ) def __post_init__(self): + seed_all_random_engines( + self.seed + ) # Set all random engine seeds for reproducibility + run_auto_creation(self) def run(self) -> None: @@ -214,6 +222,7 @@ class Experiment(Configurable): # pyre-ignore: 13 device=device, exp_dir=self.exp_dir, stats=stats, + seed=self.seed, task=task, ) diff --git a/projects/implicitron_trainer/impl/training_loop.py b/projects/implicitron_trainer/impl/training_loop.py index 9a0601a6..17b38a18 100644 --- a/projects/implicitron_trainer/impl/training_loop.py +++ b/projects/implicitron_trainer/impl/training_loop.py @@ -5,11 +5,9 @@ # LICENSE file in the root directory of this source tree. import logging -import random import time from typing import Any, Optional -import numpy as np import torch from accelerate import Accelerator from pytorch3d.implicitron.dataset.data_source import Task @@ -26,6 +24,8 @@ from pytorch3d.implicitron.tools.stats import Stats from pytorch3d.renderer.cameras import CamerasBase from torch.utils.data import DataLoader +from .utils import seed_all_random_engines + logger = logging.getLogger(__name__) @@ -52,7 +52,6 @@ class ImplicitronTrainingLoop(TrainingLoopBase): # pyre-ignore [13] max_epochs: Train for this many epochs. Note that if the model was loaded from a checkpoint, we will restart training at the appropriate epoch and run for (max_epochs - checkpoint_epoch) epochs. - seed: A random seed to ensure reproducibility. store_checkpoints: If True, store model and optimizer state checkpoints. store_checkpoints_purge: If >= 0, remove any checkpoints older or equal to this many epochs. @@ -73,7 +72,6 @@ class ImplicitronTrainingLoop(TrainingLoopBase): # pyre-ignore [13] evaluator: EvaluatorBase evaluator_class_type: str = "ImplicitronEvaluator" max_epochs: int = 1000 - seed: int = 0 store_checkpoints: bool = True store_checkpoints_purge: int = 1 test_interval: int = -1 @@ -102,6 +100,7 @@ class ImplicitronTrainingLoop(TrainingLoopBase): # pyre-ignore [13] device: torch.device, exp_dir: str, stats: Stats, + seed: int, task: Task, **kwargs, ): @@ -109,7 +108,6 @@ class ImplicitronTrainingLoop(TrainingLoopBase): # pyre-ignore [13] Entry point to run the training and validation loops based on the specified config file. """ - _seed_all_random_engines(self.seed) start_epoch = stats.epoch + 1 assert scheduler.last_epoch == stats.epoch + 1 assert scheduler.last_epoch == start_epoch @@ -140,7 +138,7 @@ class ImplicitronTrainingLoop(TrainingLoopBase): # pyre-ignore [13] # Make sure to re-seed random generators to ensure reproducibility # even after restart. - _seed_all_random_engines(self.seed + epoch) + seed_all_random_engines(seed + epoch) cur_lr = float(scheduler.get_last_lr()[-1]) logger.debug(f"scheduler lr = {cur_lr:1.2e}") @@ -357,9 +355,3 @@ class ImplicitronTrainingLoop(TrainingLoopBase): # pyre-ignore [13] model_io.safe_save_model( unwrapped_model, stats, outfile, optimizer=optimizer ) - - -def _seed_all_random_engines(seed: int) -> None: - np.random.seed(seed) - torch.manual_seed(seed) - random.seed(seed) diff --git a/projects/implicitron_trainer/impl/utils.py b/projects/implicitron_trainer/impl/utils.py new file mode 100644 index 00000000..4fac4463 --- /dev/null +++ b/projects/implicitron_trainer/impl/utils.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import random + +import numpy as np +import torch + + +def seed_all_random_engines(seed: int) -> None: + np.random.seed(seed) + torch.manual_seed(seed) + random.seed(seed) diff --git a/projects/implicitron_trainer/tests/experiment.yaml b/projects/implicitron_trainer/tests/experiment.yaml index 2ecba628..a97246b7 100644 --- a/projects/implicitron_trainer/tests/experiment.yaml +++ b/projects/implicitron_trainer/tests/experiment.yaml @@ -2,6 +2,7 @@ data_source_class_type: ImplicitronDataSource model_factory_class_type: ImplicitronModelFactory optimizer_factory_class_type: ImplicitronOptimizerFactory training_loop_class_type: ImplicitronTrainingLoop +seed: 42 detect_anomaly: false exp_dir: ./data/default_experiment/ hydra: @@ -429,7 +430,6 @@ training_loop_ImplicitronTrainingLoop_args: eval_only: false evaluator_class_type: ImplicitronEvaluator max_epochs: 1000 - seed: 0 store_checkpoints: true store_checkpoints_purge: 1 test_interval: -1