Revert "Fix CUDA kernel index data type in vision/fair/pytorch3d/pytorch3d/csrc/compositing/alpha_composite.cu +10"

This reverts commit 3987612062.
2026-04-08 13:35:59 +08:00 · 2025-03-27 05:28:03 -07:00
169 changed files with 1570 additions and 832 deletions
--- a/dev/linter.sh
+++ b/dev/linter.sh
@@ -10,7 +10,7 @@
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 DIR=$(dirname "${DIR}")

-if [[ -f "${DIR}/BUCK" ]]
+if [[ -f "${DIR}/TARGETS" ]]
 then
  pyfmt "${DIR}"
 else
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -19,6 +19,7 @@
 #
 import os
 import sys
+
 import unittest.mock as mock

 from recommonmark.parser import CommonMarkParser
--- a/docs/modules/implicitron/datasets.rst
+++ b/docs/modules/implicitron/datasets.rst
@@ -3,6 +3,11 @@ pytorch3d.implicitron.dataset specific datasets

 specific datasets

+.. automodule:: pytorch3d.implicitron.dataset.blender_dataset_map_provider
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 .. automodule:: pytorch3d.implicitron.dataset.json_index_dataset_map_provider
    :members:
    :undoc-members:
@@ -13,6 +18,11 @@ specific datasets
    :undoc-members:
    :show-inheritance:

+.. automodule:: pytorch3d.implicitron.dataset.llff_dataset_map_provider
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 .. automodule:: pytorch3d.implicitron.dataset.rendered_mesh_dataset_map_provider
    :members:
    :undoc-members:
--- a/projects/implicitron_trainer/configs/overfit_singleseq_nerf_blender.yaml
+++ b/projects/implicitron_trainer/configs/overfit_singleseq_nerf_blender.yaml
@@ -0,0 +1,56 @@
+defaults:
+- overfit_singleseq_base
+- _self_
+exp_dir: "./data/overfit_nerf_blender_repro/${oc.env:BLENDER_SINGLESEQ_CLASS}"
+data_source_ImplicitronDataSource_args:
+  data_loader_map_provider_SequenceDataLoaderMapProvider_args:
+    dataset_length_train: 100
+  dataset_map_provider_class_type: BlenderDatasetMapProvider
+  dataset_map_provider_BlenderDatasetMapProvider_args:
+    base_dir: ${oc.env:BLENDER_DATASET_ROOT}/${oc.env:BLENDER_SINGLESEQ_CLASS}
+    n_known_frames_for_test: null
+    object_name: ${oc.env:BLENDER_SINGLESEQ_CLASS}
+    path_manager_factory_class_type: PathManagerFactory
+    path_manager_factory_PathManagerFactory_args:
+      silence_logs: true
+
+model_factory_ImplicitronModelFactory_args:
+  model_class_type: "OverfitModel"
+  model_OverfitModel_args:
+    mask_images: false
+    raysampler_class_type: AdaptiveRaySampler
+    raysampler_AdaptiveRaySampler_args:
+      n_pts_per_ray_training: 64
+      n_pts_per_ray_evaluation: 64
+      n_rays_per_image_sampled_from_mask: 4096
+      stratified_point_sampling_training: true
+      stratified_point_sampling_evaluation: false
+      scene_extent: 2.0
+      scene_center:
+      - 0.0
+      - 0.0
+      - 0.0
+    renderer_MultiPassEmissionAbsorptionRenderer_args:
+      density_noise_std_train: 0.0
+      n_pts_per_ray_fine_training: 128
+      n_pts_per_ray_fine_evaluation: 128
+      raymarcher_EmissionAbsorptionRaymarcher_args:
+        blend_output: false
+    loss_weights:
+      loss_rgb_mse: 1.0
+      loss_prev_stage_rgb_mse: 1.0
+      loss_mask_bce: 0.0
+      loss_prev_stage_mask_bce: 0.0
+      loss_autodecoder_norm: 0.00
+
+optimizer_factory_ImplicitronOptimizerFactory_args:
+  exponential_lr_step_size: 3001
+  lr_policy: LinearExponential
+  linear_exponential_lr_milestone: 200
+
+training_loop_ImplicitronTrainingLoop_args:
+  max_epochs: 6000
+  metric_print_interval: 10
+  store_checkpoints_purge: 3
+  test_when_finished: true
+  validation_interval: 100
--- a/projects/implicitron_trainer/configs/repro_singleseq_nerf_blender.yaml
+++ b/projects/implicitron_trainer/configs/repro_singleseq_nerf_blender.yaml
@@ -0,0 +1,55 @@
+defaults:
+- repro_singleseq_base
+- _self_
+exp_dir: "./data/nerf_blender_repro/${oc.env:BLENDER_SINGLESEQ_CLASS}"
+data_source_ImplicitronDataSource_args:
+  data_loader_map_provider_SequenceDataLoaderMapProvider_args:
+    dataset_length_train: 100
+  dataset_map_provider_class_type: BlenderDatasetMapProvider
+  dataset_map_provider_BlenderDatasetMapProvider_args:
+    base_dir: ${oc.env:BLENDER_DATASET_ROOT}/${oc.env:BLENDER_SINGLESEQ_CLASS}
+    n_known_frames_for_test: null
+    object_name: ${oc.env:BLENDER_SINGLESEQ_CLASS}
+    path_manager_factory_class_type: PathManagerFactory
+    path_manager_factory_PathManagerFactory_args:
+      silence_logs: true
+
+model_factory_ImplicitronModelFactory_args:
+  model_GenericModel_args:
+    mask_images: false
+    raysampler_class_type: AdaptiveRaySampler
+    raysampler_AdaptiveRaySampler_args:
+      n_pts_per_ray_training: 64
+      n_pts_per_ray_evaluation: 64
+      n_rays_per_image_sampled_from_mask: 4096
+      stratified_point_sampling_training: true
+      stratified_point_sampling_evaluation: false
+      scene_extent: 2.0
+      scene_center:
+      - 0.0
+      - 0.0
+      - 0.0
+    renderer_MultiPassEmissionAbsorptionRenderer_args:
+      density_noise_std_train: 0.0
+      n_pts_per_ray_fine_training: 128
+      n_pts_per_ray_fine_evaluation: 128
+      raymarcher_EmissionAbsorptionRaymarcher_args:
+        blend_output: false
+    loss_weights:
+      loss_rgb_mse: 1.0
+      loss_prev_stage_rgb_mse: 1.0
+      loss_mask_bce: 0.0
+      loss_prev_stage_mask_bce: 0.0
+      loss_autodecoder_norm: 0.00
+
+optimizer_factory_ImplicitronOptimizerFactory_args:
+  exponential_lr_step_size: 3001
+  lr_policy: LinearExponential
+  linear_exponential_lr_milestone: 200
+
+training_loop_ImplicitronTrainingLoop_args:
+  max_epochs: 6000
+  metric_print_interval: 10
+  store_checkpoints_purge: 3
+  test_when_finished: true
+  validation_interval: 100
--- a/projects/implicitron_trainer/experiment.py
+++ b/projects/implicitron_trainer/experiment.py
@@ -48,18 +48,22 @@ The outputs of the experiment are saved and logged in multiple ways:
 import logging
 import os
 import warnings
+
 from dataclasses import field

 import hydra
+
 import torch
 from accelerate import Accelerator
 from omegaconf import DictConfig, OmegaConf
 from packaging import version
+
 from pytorch3d.implicitron.dataset.data_source import (
    DataSourceBase,
    ImplicitronDataSource,
 )
 from pytorch3d.implicitron.models.base_model import ImplicitronModelBase
+
 from pytorch3d.implicitron.models.renderer.multipass_ea import (
    MultiPassEmissionAbsorptionRenderer,
 )
--- a/projects/implicitron_trainer/impl/model_factory.py
+++ b/projects/implicitron_trainer/impl/model_factory.py
@@ -11,6 +11,7 @@ import os
 from typing import Optional

 import torch.optim
+
 from accelerate import Accelerator
 from pytorch3d.implicitron.models.base_model import ImplicitronModelBase
 from pytorch3d.implicitron.tools import model_io
--- a/projects/implicitron_trainer/impl/optimizer_factory.py
+++ b/projects/implicitron_trainer/impl/optimizer_factory.py
@@ -14,7 +14,9 @@ from dataclasses import field
 from typing import Any, Dict, List, Optional, Tuple

 import torch.optim
+
 from accelerate import Accelerator
+
 from pytorch3d.implicitron.models.base_model import ImplicitronModelBase
 from pytorch3d.implicitron.tools import model_io
 from pytorch3d.implicitron.tools.config import (
--- a/projects/implicitron_trainer/tests/experiment.yaml
+++ b/projects/implicitron_trainer/tests/experiment.yaml
@@ -13,6 +13,13 @@ hydra:
 data_source_ImplicitronDataSource_args:
  dataset_map_provider_class_type: ???
  data_loader_map_provider_class_type: SequenceDataLoaderMapProvider
+  dataset_map_provider_BlenderDatasetMapProvider_args:
+    base_dir: ???
+    object_name: ???
+    path_manager_factory_class_type: PathManagerFactory
+    n_known_frames_for_test: null
+    path_manager_factory_PathManagerFactory_args:
+      silence_logs: true
  dataset_map_provider_JsonIndexDatasetMapProvider_args:
    category: ???
    task_str: singlesequence
@@ -84,6 +91,14 @@ data_source_ImplicitronDataSource_args:
      sort_frames: false
    path_manager_factory_PathManagerFactory_args:
      silence_logs: true
+  dataset_map_provider_LlffDatasetMapProvider_args:
+    base_dir: ???
+    object_name: ???
+    path_manager_factory_class_type: PathManagerFactory
+    n_known_frames_for_test: null
+    path_manager_factory_PathManagerFactory_args:
+      silence_logs: true
+    downscale_factor: 4
  dataset_map_provider_RenderedMeshDatasetMapProvider_args:
    num_views: 40
    data_file: null
--- a/projects/implicitron_trainer/tests/test_experiment.py
+++ b/projects/implicitron_trainer/tests/test_experiment.py
@@ -12,6 +12,7 @@ import unittest
 from pathlib import Path

 import torch
+
 from hydra import compose, initialize_config_dir
 from omegaconf import OmegaConf
 from projects.implicitron_trainer.impl.optimizer_factory import (
--- a/pytorch3d/__init__.py
+++ b/pytorch3d/__init__.py
@@ -6,4 +6,4 @@

 # pyre-unsafe

-__version__ = "0.7.9"
+__version__ = "0.7.8"
--- a/pytorch3d/common/workaround/symeig3x3.py
+++ b/pytorch3d/common/workaround/symeig3x3.py
@@ -82,12 +82,10 @@ class _SymEig3x3(nn.Module):
        q = inputs_trace / 3.0

        # Calculate squared sum of elements outside the main diagonal / 2
-        p1 = (
-            torch.square(inputs).sum(dim=(-1, -2)) - torch.square(inputs_diag).sum(-1)
-        ) / 2
-        p2 = torch.square(inputs_diag - q[..., None]).sum(dim=-1) + 2.0 * p1.clamp(
-            self._eps
-        )
+        # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+        p1 = ((inputs**2).sum(dim=(-1, -2)) - (inputs_diag**2).sum(-1)) / 2
+        # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+        p2 = ((inputs_diag - q[..., None]) ** 2).sum(dim=-1) + 2.0 * p1.clamp(self._eps)

        p = torch.sqrt(p2 / 6.0)
        B = (inputs - q[..., None, None] * self._identity) / p[..., None, None]
@@ -106,9 +104,7 @@ class _SymEig3x3(nn.Module):
        # Soft dispatch between the degenerate case (diagonal A) and general.
        # diag_soft_cond -> 1.0 when p1 < 6 * eps and diag_soft_cond -> 0.0 otherwise.
        # We use 6 * eps to take into account the error accumulated during the p1 summation
-        diag_soft_cond = torch.exp(-torch.square(p1 / (6 * self._eps))).detach()[
-            ..., None
-        ]
+        diag_soft_cond = torch.exp(-((p1 / (6 * self._eps)) ** 2)).detach()[..., None]

        # Eigenvalues are the ordered elements of main diagonal in the degenerate case
        diag_eigenvals, _ = torch.sort(inputs_diag, dim=-1)
@@ -203,7 +199,8 @@ class _SymEig3x3(nn.Module):
            cross_products[..., :1, :]
        )

-        norms_sq = torch.square(cross_products).sum(dim=-1)
+        # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+        norms_sq = (cross_products**2).sum(dim=-1)
        max_norms_index = norms_sq.argmax(dim=-1)

        # Pick only the cross-product with highest squared norm for each input
--- a/pytorch3d/csrc/ball_query/ball_query.cu
+++ b/pytorch3d/csrc/ball_query/ball_query.cu
@@ -32,9 +32,7 @@ __global__ void BallQueryKernel(
    at::PackedTensorAccessor64<int64_t, 3, at::RestrictPtrTraits> idxs,
    at::PackedTensorAccessor64<scalar_t, 3, at::RestrictPtrTraits> dists,
    const int64_t K,
-    const float radius,
-    const float radius2,
-    const bool skip_points_outside_cube) {
+    const float radius2) {
  const int64_t N = p1.size(0);
  const int64_t chunks_per_cloud = (1 + (p1.size(1) - 1) / blockDim.x);
  const int64_t chunks_to_do = N * chunks_per_cloud;
@@ -53,19 +51,7 @@ __global__ void BallQueryKernel(
    // Iterate over points in p2 until desired count is reached or
    // all points have been considered
    for (int64_t j = 0, count = 0; j < lengths2[n] && count < K; ++j) {
-      if (skip_points_outside_cube) {
-        bool is_within_radius = true;
-        // Filter when any one coordinate is already outside the radius
-        for (int d = 0; is_within_radius && d < D; ++d) {
-          scalar_t abs_diff = fabs(p1[n][i][d] - p2[n][j][d]);
-          is_within_radius = (abs_diff <= radius);
-        }
-        if (!is_within_radius) {
-          continue;
-        }
-      }
-
-      // Else, calculate the distance between the points and compare
+      // Calculate the distance between the points
      scalar_t dist2 = 0.0;
      for (int d = 0; d < D; ++d) {
        scalar_t diff = p1[n][i][d] - p2[n][j][d];
@@ -91,8 +77,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
    const at::Tensor& lengths1, // (N,)
    const at::Tensor& lengths2, // (N,)
    int K,
-    float radius,
-    bool skip_points_outside_cube) {
+    float radius) {
  // Check inputs are on the same device
  at::TensorArg p1_t{p1, "p1", 1}, p2_t{p2, "p2", 2},
      lengths1_t{lengths1, "lengths1", 3}, lengths2_t{lengths2, "lengths2", 4};
@@ -135,9 +120,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
            idxs.packed_accessor64<int64_t, 3, at::RestrictPtrTraits>(),
            dists.packed_accessor64<float, 3, at::RestrictPtrTraits>(),
            K_64,
-            radius,
-            radius2,
-            skip_points_outside_cube);
+            radius2);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
--- a/pytorch3d/csrc/ball_query/ball_query.h
+++ b/pytorch3d/csrc/ball_query/ball_query.h
@@ -25,9 +25,6 @@
 //      within the radius
 //    radius: the radius around each point within which the neighbors need to be
 //      located
-//    skip_points_outside_cube: If true, reduce multiplications of float values
-//      by not explicitly calculating distances to points that fall outside the
-//      D-cube with side length (2*radius) centered at each point in p1.
 //
 // Returns:
 //    p1_neighbor_idx: LongTensor of shape (N, P1, K), where
@@ -49,8 +46,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    const int K,
-    const float radius,
-    const bool skip_points_outside_cube);
+    const float radius);

 // CUDA implementation
 std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
@@ -59,8 +55,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    const int K,
-    const float radius,
-    const bool skip_points_outside_cube);
+    const float radius);

 // Implementation which is exposed
 // Note: the backward pass reuses the KNearestNeighborBackward kernel
@@ -70,8 +65,7 @@ inline std::tuple<at::Tensor, at::Tensor> BallQuery(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    int K,
-    float radius,
-    bool skip_points_outside_cube) {
+    float radius) {
  if (p1.is_cuda() || p2.is_cuda()) {
 #ifdef WITH_CUDA
    CHECK_CUDA(p1);
@@ -82,20 +76,16 @@ inline std::tuple<at::Tensor, at::Tensor> BallQuery(
        lengths1.contiguous(),
        lengths2.contiguous(),
        K,
-        radius,
-        skip_points_outside_cube);
+        radius);
 #else
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(p1);
-  CHECK_CPU(p2);
  return BallQueryCpu(
      p1.contiguous(),
      p2.contiguous(),
      lengths1.contiguous(),
      lengths2.contiguous(),
      K,
-      radius,
-      skip_points_outside_cube);
+      radius);
 }
--- a/pytorch3d/csrc/ball_query/ball_query_cpu.cpp
+++ b/pytorch3d/csrc/ball_query/ball_query_cpu.cpp
@@ -6,7 +6,6 @@
 * LICENSE file in the root directory of this source tree.
 */

-#include <math.h>
 #include <torch/extension.h>
 #include <tuple>

@@ -16,8 +15,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    int K,
-    float radius,
-    bool skip_points_outside_cube) {
+    float radius) {
  const int N = p1.size(0);
  const int P1 = p1.size(1);
  const int D = p1.size(2);
@@ -39,16 +37,6 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
    const int64_t length2 = lengths2_a[n];
    for (int64_t i = 0; i < length1; ++i) {
      for (int64_t j = 0, count = 0; j < length2 && count < K; ++j) {
-        if (skip_points_outside_cube) {
-          bool is_within_radius = true;
-          for (int d = 0; is_within_radius && d < D; ++d) {
-            float abs_diff = fabs(p1_a[n][i][d] - p2_a[n][j][d]);
-            is_within_radius = (abs_diff <= radius);
-          }
-          if (!is_within_radius) {
-            continue;
-          }
-        }
        float dist2 = 0;
        for (int d = 0; d < D; ++d) {
          float diff = p1_a[n][i][d] - p2_a[n][j][d];
--- a/pytorch3d/csrc/blending/sigmoid_alpha_blend.h
+++ b/pytorch3d/csrc/blending/sigmoid_alpha_blend.h
@@ -98,11 +98,6 @@ at::Tensor SigmoidAlphaBlendBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(distances);
-  CHECK_CPU(pix_to_face);
-  CHECK_CPU(alphas);
-  CHECK_CPU(grad_alphas);
-
  return SigmoidAlphaBlendBackwardCpu(
      grad_alphas, alphas, distances, pix_to_face, sigma);
 }
--- a/pytorch3d/csrc/compositing/alpha_composite.cu
+++ b/pytorch3d/csrc/compositing/alpha_composite.cu
@@ -33,11 +33,11 @@ __global__ void alphaCompositeCudaForwardKernel(
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Iterate over each feature in each pixel
  for (int pid = tid; pid < num_pixels; pid += num_threads) {
@@ -83,11 +83,11 @@ __global__ void alphaCompositeCudaBackwardKernel(
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
--- a/pytorch3d/csrc/compositing/alpha_composite.h
+++ b/pytorch3d/csrc/compositing/alpha_composite.h
@@ -74,9 +74,6 @@ torch::Tensor alphaCompositeForward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
    return alphaCompositeCpuForward(features, alphas, points_idx);
  }
 }
@@ -104,11 +101,6 @@ std::tuple<torch::Tensor, torch::Tensor> alphaCompositeBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(grad_outputs);
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
-
    return alphaCompositeCpuBackward(
        grad_outputs, features, alphas, points_idx);
  }
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.cu
@@ -33,11 +33,11 @@ __global__ void weightedSumNormCudaForwardKernel(
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
@@ -96,11 +96,11 @@ __global__ void weightedSumNormCudaBackwardKernel(
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * W * H;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.h
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.h
@@ -73,10 +73,6 @@ torch::Tensor weightedSumNormForward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
-
    return weightedSumNormCpuForward(features, alphas, points_idx);
  }
 }
@@ -104,11 +100,6 @@ std::tuple<torch::Tensor, torch::Tensor> weightedSumNormBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(grad_outputs);
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
-
    return weightedSumNormCpuBackward(
        grad_outputs, features, alphas, points_idx);
  }
--- a/pytorch3d/csrc/compositing/weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/weighted_sum.cu
@@ -31,11 +31,11 @@ __global__ void weightedSumCudaForwardKernel(
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
@@ -78,11 +78,11 @@ __global__ void weightedSumCudaBackwardKernel(
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Iterate over each pixel to compute the contribution to the
  // gradient for the features and weights
--- a/pytorch3d/csrc/compositing/weighted_sum.h
+++ b/pytorch3d/csrc/compositing/weighted_sum.h
@@ -72,9 +72,6 @@ torch::Tensor weightedSumForward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
    return weightedSumCpuForward(features, alphas, points_idx);
  }
 }
@@ -101,11 +98,6 @@ std::tuple<torch::Tensor, torch::Tensor> weightedSumBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(grad_outputs);
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
-
    return weightedSumCpuBackward(grad_outputs, features, alphas, points_idx);
  }
 }
--- a/pytorch3d/csrc/ext.cpp
+++ b/pytorch3d/csrc/ext.cpp
@@ -8,6 +8,7 @@

 // clang-format off
 #include "./pulsar/global.h" // Include before <torch/extension.h>.
+#include <torch/extension.h>
 // clang-format on
 #include "./pulsar/pytorch/renderer.h"
 #include "./pulsar/pytorch/tensor_util.h"
@@ -105,8 +106,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  py::class_<
      pulsar::pytorch::Renderer,
      std::shared_ptr<pulsar::pytorch::Renderer>>(m, "PulsarRenderer")
-      .def(
-          py::init<
+      .def(py::init<
           const uint&,
           const uint&,
           const uint&,
--- a/pytorch3d/csrc/face_areas_normals/face_areas_normals.h
+++ b/pytorch3d/csrc/face_areas_normals/face_areas_normals.h
@@ -60,8 +60,6 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(verts);
-  CHECK_CPU(faces);
  return FaceAreasNormalsForwardCpu(verts, faces);
 }

@@ -82,9 +80,5 @@ at::Tensor FaceAreasNormalsBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(grad_areas);
-  CHECK_CPU(grad_normals);
-  CHECK_CPU(verts);
-  CHECK_CPU(faces);
  return FaceAreasNormalsBackwardCpu(grad_areas, grad_normals, verts, faces);
 }
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.cu
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
@@ -20,14 +20,14 @@ __global__ void GatherScatterCudaKernel(
    const size_t V,
    const size_t D,
    const size_t E) {
-  const auto tid = threadIdx.x;
+  const int tid = threadIdx.x;

  // Reverse the vertex order if backward.
  const int v0_idx = backward ? 1 : 0;
  const int v1_idx = backward ? 0 : 1;

  // Edges are split evenly across the blocks.
-  for (auto e = blockIdx.x; e < E; e += gridDim.x) {
+  for (int e = blockIdx.x; e < E; e += gridDim.x) {
    // Get indices of vertices which form the edge.
    const int64_t v0 = edges[2 * e + v0_idx];
    const int64_t v1 = edges[2 * e + v1_idx];
@@ -35,7 +35,7 @@ __global__ void GatherScatterCudaKernel(
    // Split vertex features evenly across threads.
    // This implementation will be quite wasteful when D<128 since there will be
    // a lot of threads doing nothing.
-    for (auto d = tid; d < D; d += blockDim.x) {
+    for (int d = tid; d < D; d += blockDim.x) {
      const float val = input[v1 * D + d];
      float* address = output + v0 * D + d;
      atomicAdd(address, val);
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.h
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.h
@@ -53,7 +53,5 @@ at::Tensor GatherScatter(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(input);
-  CHECK_CPU(edges);
  return GatherScatterCpu(input, edges, directed, backward);
 }
--- a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
+++ b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
@@ -20,8 +20,8 @@ __global__ void InterpFaceAttrsForwardKernel(
    const size_t P,
    const size_t F,
    const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
  for (int pd = tid; pd < P * D; pd += num_threads) {
    const int p = pd / D;
    const int d = pd % D;
@@ -93,8 +93,8 @@ __global__ void InterpFaceAttrsBackwardKernel(
    const size_t P,
    const size_t F,
    const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
  for (int pd = tid; pd < P * D; pd += num_threads) {
    const int p = pd / D;
    const int d = pd % D;
--- a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.h
+++ b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.h
@@ -57,8 +57,6 @@ at::Tensor InterpFaceAttrsForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(face_attrs);
-  CHECK_CPU(barycentric_coords);
  return InterpFaceAttrsForwardCpu(pix_to_face, barycentric_coords, face_attrs);
 }

@@ -108,9 +106,6 @@ std::tuple<at::Tensor, at::Tensor> InterpFaceAttrsBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(face_attrs);
-  CHECK_CPU(barycentric_coords);
-  CHECK_CPU(grad_pix_attrs);
  return InterpFaceAttrsBackwardCpu(
      pix_to_face, barycentric_coords, face_attrs, grad_pix_attrs);
 }
--- a/pytorch3d/csrc/iou_box3d/iou_box3d.h
+++ b/pytorch3d/csrc/iou_box3d/iou_box3d.h
@@ -44,7 +44,5 @@ inline std::tuple<at::Tensor, at::Tensor> IoUBox3D(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(boxes1);
-  CHECK_CPU(boxes2);
  return IoUBox3DCpu(boxes1.contiguous(), boxes2.contiguous());
 }
--- a/pytorch3d/csrc/knn/knn.h
+++ b/pytorch3d/csrc/knn/knn.h
@@ -74,8 +74,6 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdx(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(p1);
-  CHECK_CPU(p2);
  return KNearestNeighborIdxCpu(p1, p2, lengths1, lengths2, norm, K);
 }

@@ -142,8 +140,6 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(p1);
-  CHECK_CPU(p2);
  return KNearestNeighborBackwardCpu(
      p1, p2, lengths1, lengths2, idxs, norm, grad_dists);
 }
--- a/pytorch3d/csrc/marching_cubes/marching_cubes.h
+++ b/pytorch3d/csrc/marching_cubes/marching_cubes.h
@@ -58,6 +58,5 @@ inline std::tuple<at::Tensor, at::Tensor, at::Tensor> MarchingCubes(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(vol);
  return MarchingCubesCpu(vol.contiguous(), isolevel);
 }
--- a/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.h
+++ b/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.h
@@ -88,8 +88,6 @@ at::Tensor PackedToPadded(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(inputs_packed);
-  CHECK_CPU(first_idxs);
  return PackedToPaddedCpu(inputs_packed, first_idxs, max_size);
 }

@@ -107,7 +105,5 @@ at::Tensor PaddedToPacked(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(inputs_padded);
-  CHECK_CPU(first_idxs);
  return PaddedToPackedCpu(inputs_padded, first_idxs, num_inputs);
 }
--- a/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
@@ -110,7 +110,7 @@ __global__ void DistanceForwardKernel(
    __syncthreads();

    // Perform reduction in shared memory.
-    for (auto s = blockDim.x / 2; s > 32; s >>= 1) {
+    for (int s = blockDim.x / 2; s > 32; s >>= 1) {
      if (tid < s) {
        if (min_dists[tid] > min_dists[tid + s]) {
          min_dists[tid] = min_dists[tid + s];
@@ -502,8 +502,8 @@ __global__ void PointFaceArrayForwardKernel(
  const float3* tris_f3 = (float3*)tris;

  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int t_i = tid; t_i < P * T; t_i += num_threads) {
    const int t = t_i / P; // segment index.
@@ -576,8 +576,8 @@ __global__ void PointFaceArrayBackwardKernel(
  const float3* tris_f3 = (float3*)tris;

  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int t_i = tid; t_i < P * T; t_i += num_threads) {
    const int t = t_i / P; // triangle index.
@@ -683,8 +683,8 @@ __global__ void PointEdgeArrayForwardKernel(
  float3* segms_f3 = (float3*)segms;

  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int t_i = tid; t_i < P * S; t_i += num_threads) {
    const int s = t_i / P; // segment index.
@@ -752,8 +752,8 @@ __global__ void PointEdgeArrayBackwardKernel(
  float3* segms_f3 = (float3*)segms;

  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int t_i = tid; t_i < P * S; t_i += num_threads) {
    const int s = t_i / P; // segment index.
--- a/pytorch3d/csrc/point_mesh/point_mesh_cuda.h
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cuda.h
@@ -88,10 +88,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(points_first_idx);
-  CHECK_CPU(tris);
-  CHECK_CPU(tris_first_idx);
  return PointFaceDistanceForwardCpu(
      points, points_first_idx, tris, tris_first_idx, min_triangle_area);
 }
@@ -147,10 +143,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(tris);
-  CHECK_CPU(idx_points);
-  CHECK_CPU(grad_dists);
  return PointFaceDistanceBackwardCpu(
      points, tris, idx_points, grad_dists, min_triangle_area);
 }
@@ -229,10 +221,6 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(points_first_idx);
-  CHECK_CPU(tris);
-  CHECK_CPU(tris_first_idx);
  return FacePointDistanceForwardCpu(
      points, points_first_idx, tris, tris_first_idx, min_triangle_area);
 }
@@ -289,10 +277,6 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(tris);
-  CHECK_CPU(idx_tris);
-  CHECK_CPU(grad_dists);
  return FacePointDistanceBackwardCpu(
      points, tris, idx_tris, grad_dists, min_triangle_area);
 }
@@ -362,10 +346,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(points_first_idx);
-  CHECK_CPU(segms);
-  CHECK_CPU(segms_first_idx);
  return PointEdgeDistanceForwardCpu(
      points, points_first_idx, segms, segms_first_idx, max_points);
 }
@@ -416,10 +396,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(segms);
-  CHECK_CPU(idx_points);
-  CHECK_CPU(grad_dists);
  return PointEdgeDistanceBackwardCpu(points, segms, idx_points, grad_dists);
 }

@@ -488,10 +464,6 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(points_first_idx);
-  CHECK_CPU(segms);
-  CHECK_CPU(segms_first_idx);
  return EdgePointDistanceForwardCpu(
      points, points_first_idx, segms, segms_first_idx, max_segms);
 }
@@ -542,10 +514,6 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(segms);
-  CHECK_CPU(idx_segms);
-  CHECK_CPU(grad_dists);
  return EdgePointDistanceBackwardCpu(points, segms, idx_segms, grad_dists);
 }

@@ -599,8 +567,6 @@ torch::Tensor PointFaceArrayDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(tris);
  return PointFaceArrayDistanceForwardCpu(points, tris, min_triangle_area);
 }

@@ -647,9 +613,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(tris);
-  CHECK_CPU(grad_dists);
  return PointFaceArrayDistanceBackwardCpu(
      points, tris, grad_dists, min_triangle_area);
 }
@@ -698,8 +661,6 @@ torch::Tensor PointEdgeArrayDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(segms);
  return PointEdgeArrayDistanceForwardCpu(points, segms);
 }

@@ -742,8 +703,5 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeArrayDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(segms);
-  CHECK_CPU(grad_dists);
  return PointEdgeArrayDistanceBackwardCpu(points, segms, grad_dists);
 }
--- a/pytorch3d/csrc/points_to_volumes/points_to_volumes.h
+++ b/pytorch3d/csrc/points_to_volumes/points_to_volumes.h
@@ -104,12 +104,6 @@ inline void PointsToVolumesForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points_3d);
-  CHECK_CPU(points_features);
-  CHECK_CPU(volume_densities);
-  CHECK_CPU(volume_features);
-  CHECK_CPU(grid_sizes);
-  CHECK_CPU(mask);
  PointsToVolumesForwardCpu(
      points_3d,
      points_features,
@@ -189,14 +183,6 @@ inline void PointsToVolumesBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points_3d);
-  CHECK_CPU(points_features);
-  CHECK_CPU(grid_sizes);
-  CHECK_CPU(mask);
-  CHECK_CPU(grad_volume_densities);
-  CHECK_CPU(grad_volume_features);
-  CHECK_CPU(grad_points_3d);
-  CHECK_CPU(grad_points_features);
  PointsToVolumesBackwardCpu(
      points_3d,
      points_features,
--- a/pytorch3d/csrc/pulsar/global.h
+++ b/pytorch3d/csrc/pulsar/global.h
@@ -15,8 +15,8 @@
 #endif

 #if defined(_WIN64) || defined(_WIN32)
-using uint = unsigned int;
-using ushort = unsigned short;
+#define uint unsigned int
+#define ushort unsigned short
 #endif

 #include "./logging.h" // <- include before torch/extension.h
--- a/pytorch3d/csrc/pulsar/gpu/commands.h
+++ b/pytorch3d/csrc/pulsar/gpu/commands.h
@@ -417,7 +417,7 @@ __device__ static float atomicMin(float* address, float val) {
      (OUT_PTR),              \
      (NUM_SELECTED_PTR),     \
      (NUM_ITEMS),            \
-      (STREAM));
+      stream = (STREAM));

 #define COPY_HOST_DEV(PTR_D, PTR_H, TYPE, SIZE) \
  HANDLECUDA(cudaMemcpy(                        \
--- a/pytorch3d/csrc/pulsar/include/camera.h
+++ b/pytorch3d/csrc/pulsar/include/camera.h
@@ -70,6 +70,11 @@ struct CamGradInfo {
  float3 pixel_dir_y;
 };

+// TODO: remove once https://github.com/NVlabs/cub/issues/172 is resolved.
+struct IntWrapper { // cast target for SUM_WS in renderer.backward.device.h
+  int val; // the wrapped integer; added by the IntWrapper operator+ in math.h
+};
+
 } // namespace pulsar

 #endif
--- a/pytorch3d/csrc/pulsar/include/math.h
+++ b/pytorch3d/csrc/pulsar/include/math.h
@@ -149,6 +149,11 @@ IHD CamGradInfo operator*(const CamGradInfo& a, const float& b) {
  return res;
 }

+IHD IntWrapper operator+(const IntWrapper& a, const IntWrapper& b) {
+  IntWrapper res;
+  res.val = a.val + b.val; // adds the wrapped ints (presumably used by SUM_WS)
+  return res;
+}
 } // namespace pulsar

 #endif
--- a/pytorch3d/csrc/pulsar/include/renderer.backward.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.backward.device.h
@@ -155,8 +155,8 @@ void backward(
        stream);
    CHECKLAUNCH();
    SUM_WS(
-        self->ids_sorted_d,
-        self->n_grad_contributions_d,
+        (IntWrapper*)(self->ids_sorted_d),
+        (IntWrapper*)(self->n_grad_contributions_d),
        static_cast<int>(num_balls),
        self->workspace_d,
        self->workspace_size,
--- a/pytorch3d/csrc/pulsar/include/renderer.destruct.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.destruct.device.h
@@ -18,89 +18,68 @@ namespace Renderer {

 template <bool DEV>
 HOST void destruct(Renderer* self) {
-  if (self->result_d != NULL) {
+  if (self->result_d != NULL)
    FREE(self->result_d);
-  }
  self->result_d = NULL;
-  if (self->min_depth_d != NULL) {
+  if (self->min_depth_d != NULL)
    FREE(self->min_depth_d);
-  }
  self->min_depth_d = NULL;
-  if (self->min_depth_sorted_d != NULL) {
+  if (self->min_depth_sorted_d != NULL)
    FREE(self->min_depth_sorted_d);
-  }
  self->min_depth_sorted_d = NULL;
-  if (self->ii_d != NULL) {
+  if (self->ii_d != NULL)
    FREE(self->ii_d);
-  }
  self->ii_d = NULL;
-  if (self->ii_sorted_d != NULL) {
+  if (self->ii_sorted_d != NULL)
    FREE(self->ii_sorted_d);
-  }
  self->ii_sorted_d = NULL;
-  if (self->ids_d != NULL) {
+  if (self->ids_d != NULL)
    FREE(self->ids_d);
-  }
  self->ids_d = NULL;
-  if (self->ids_sorted_d != NULL) {
+  if (self->ids_sorted_d != NULL)
    FREE(self->ids_sorted_d);
-  }
  self->ids_sorted_d = NULL;
-  if (self->workspace_d != NULL) {
+  if (self->workspace_d != NULL)
    FREE(self->workspace_d);
-  }
  self->workspace_d = NULL;
-  if (self->di_d != NULL) {
+  if (self->di_d != NULL)
    FREE(self->di_d);
-  }
  self->di_d = NULL;
-  if (self->di_sorted_d != NULL) {
+  if (self->di_sorted_d != NULL)
    FREE(self->di_sorted_d);
-  }
  self->di_sorted_d = NULL;
-  if (self->region_flags_d != NULL) {
+  if (self->region_flags_d != NULL)
    FREE(self->region_flags_d);
-  }
  self->region_flags_d = NULL;
-  if (self->num_selected_d != NULL) {
+  if (self->num_selected_d != NULL)
    FREE(self->num_selected_d);
-  }
  self->num_selected_d = NULL;
-  if (self->forw_info_d != NULL) {
+  if (self->forw_info_d != NULL)
    FREE(self->forw_info_d);
-  }
  self->forw_info_d = NULL;
-  if (self->min_max_pixels_d != NULL) {
+  if (self->min_max_pixels_d != NULL)
    FREE(self->min_max_pixels_d);
-  }
  self->min_max_pixels_d = NULL;
-  if (self->grad_pos_d != NULL) {
+  if (self->grad_pos_d != NULL)
    FREE(self->grad_pos_d);
-  }
  self->grad_pos_d = NULL;
-  if (self->grad_col_d != NULL) {
+  if (self->grad_col_d != NULL)
    FREE(self->grad_col_d);
-  }
  self->grad_col_d = NULL;
-  if (self->grad_rad_d != NULL) {
+  if (self->grad_rad_d != NULL)
    FREE(self->grad_rad_d);
-  }
  self->grad_rad_d = NULL;
-  if (self->grad_cam_d != NULL) {
+  if (self->grad_cam_d != NULL)
    FREE(self->grad_cam_d);
-  }
  self->grad_cam_d = NULL;
-  if (self->grad_cam_buf_d != NULL) {
+  if (self->grad_cam_buf_d != NULL)
    FREE(self->grad_cam_buf_d);
-  }
  self->grad_cam_buf_d = NULL;
-  if (self->grad_opy_d != NULL) {
+  if (self->grad_opy_d != NULL)
    FREE(self->grad_opy_d);
-  }
  self->grad_opy_d = NULL;
-  if (self->n_grad_contributions_d != NULL) {
+  if (self->n_grad_contributions_d != NULL)
    FREE(self->n_grad_contributions_d);
-  }
  self->n_grad_contributions_d = NULL;
 }

--- a/pytorch3d/csrc/pulsar/include/renderer.norm_sphere_gradients.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.norm_sphere_gradients.device.h
@@ -64,9 +64,8 @@ GLOBAL void norm_sphere_gradients(Renderer renderer, const int num_balls) {
  // The sphere only contributes to the camera gradients if it is
  // large enough in screen space.
  if (renderer.ids_sorted_d[idx] > 0 && ii.max.x >= ii.min.x + 3 &&
-      ii.max.y >= ii.min.y + 3) {
+      ii.max.y >= ii.min.y + 3)
    renderer.ids_sorted_d[idx] = 1;
-  }
  END_PARALLEL_NORET();
 };

--- a/pytorch3d/csrc/pulsar/include/renderer.render.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.render.device.h
@@ -139,9 +139,8 @@ GLOBAL void render(
      coord_y < cam_norm.film_border_top + cam_norm.film_height) {
    // Initialize the result.
    if (mode == 0u) {
-      for (uint c_id = 0; c_id < cam_norm.n_channels; ++c_id) {
+      for (uint c_id = 0; c_id < cam_norm.n_channels; ++c_id)
        result[c_id] = bg_col[c_id];
-      }
    } else {
      result[0] = 0.f;
    }
@@ -191,22 +190,20 @@ GLOBAL void render(
            "render|found intersection with sphere %u.\n",
            sphere_id_l[write_idx]);
      }
-      if (ii.min.x == MAX_USHORT) {
+      if (ii.min.x == MAX_USHORT)
        // This is an invalid sphere (out of image). These spheres have
        // maximum depth. Since we ordered the spheres by earliest possible
        // intersection depth we re certain that there will no other sphere
        // that is relevant after this one.
        loading_done = true;
    }
-    }
    // Reset n_pixels_done.
    n_pixels_done = 0;
    thread_block.sync(); // Make sure n_loaded is updated.
    if (n_loaded > RENDER_BUFFER_LOAD_THRESH) {
      // The load buffer is full enough. Draw.
-      if (thread_block.thread_rank() == 0) {
+      if (thread_block.thread_rank() == 0)
        n_balls_loaded += n_loaded;
-      }
      max_closest_possible_intersection = 0.f;
      // This excludes threads outside of the image boundary. Also, it reduces
      // block artifacts.
@@ -293,9 +290,8 @@ GLOBAL void render(
      uint warp_done = thread_warp.ballot(done);
      int warp_done_bit_cnt = POPC(warp_done);
 #endif //__CUDACC__ && __HIP_PLATFORM_AMD__
-      if (thread_warp.thread_rank() == 0) {
+      if (thread_warp.thread_rank() == 0)
        ATOMICADD_B(&n_pixels_done, warp_done_bit_cnt);
-      }
      // This sync is necessary to keep n_loaded until all threads are done with
      // painting.
      thread_block.sync();
@@ -303,9 +299,8 @@ GLOBAL void render(
    }
    thread_block.sync();
  }
-  if (thread_block.thread_rank() == 0) {
+  if (thread_block.thread_rank() == 0)
    n_balls_loaded += n_loaded;
-  }
  PULSAR_LOG_DEV_PIX(
      PULSAR_LOG_RENDER_PIX,
      "render|loaded %d balls in total.\n",
@@ -391,9 +386,8 @@ GLOBAL void render(
            static_cast<float>(tracker.get_n_hits());
  } else {
    float sm_d_normfac = FRCP(FMAX(sm_d, FEPS));
-    for (uint c_id = 0; c_id < cam_norm.n_channels; ++c_id) {
+    for (uint c_id = 0; c_id < cam_norm.n_channels; ++c_id)
      result[c_id] *= sm_d_normfac;
-    }
    int write_loc = (coord_y - cam_norm.film_border_top) * cam_norm.film_width *
            (3 + 2 * n_track) +
        (coord_x - cam_norm.film_border_left) * (3 + 2 * n_track);
--- a/pytorch3d/csrc/pulsar/pytorch/renderer.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/renderer.cpp
@@ -860,9 +860,8 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
            ? (cudaStream_t) nullptr
 #endif
            : (cudaStream_t) nullptr);
-    if (mode == 1) {
+    if (mode == 1)
      results[batch_i] = results[batch_i].slice(2, 0, 1, 1);
-    }
    forw_infos[batch_i] = from_blob(
        this->renderer_vec[batch_i].forw_info_d,
        {this->renderer_vec[0].cam.film_height,
--- a/pytorch3d/csrc/pulsar/pytorch/renderer.h
+++ b/pytorch3d/csrc/pulsar/pytorch/renderer.h
@@ -128,9 +128,8 @@ struct Renderer {
    stream << "pulsar::Renderer[";
    // Device info.
    stream << self.device_type;
-    if (self.device_index != -1) {
+    if (self.device_index != -1)
      stream << ", ID " << self.device_index;
-    }
    stream << "]";
    return stream;
  }
--- a/pytorch3d/csrc/pulsar/warnings.cpp
+++ b/pytorch3d/csrc/pulsar/warnings.cpp
@@ -6,6 +6,9 @@
 * LICENSE file in the root directory of this source tree.
 */

+#include "./global.h"
+#include "./logging.h"
+
 /**
 * A compilation unit to provide warnings about the code and avoid
 * repeated messages.
--- a/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
+++ b/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
@@ -25,7 +25,7 @@ class BitMask {

  // Use all threads in the current block to clear all bits of this BitMask
  __device__ void block_clear() {
-    for (auto i = threadIdx.x; i < H * W * D; i += blockDim.x) {
+    for (int i = threadIdx.x; i < H * W * D; i += blockDim.x) {
      data[i] = 0;
    }
    __syncthreads();
--- a/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu
+++ b/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu
@@ -23,8 +23,8 @@ __global__ void TriangleBoundingBoxKernel(
    const float blur_radius,
    float* bboxes, // (4, F)
    bool* skip_face) { // (F,)
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = blockDim.x * gridDim.x;
  const float sqrt_radius = sqrt(blur_radius);
  for (int f = tid; f < F; f += num_threads) {
    const float v0x = face_verts[f * 9 + 0 * 3 + 0];
@@ -56,8 +56,8 @@ __global__ void PointBoundingBoxKernel(
    const int P,
    float* bboxes, // (4, P)
    bool* skip_points) {
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = blockDim.x * gridDim.x;
  for (int p = tid; p < P; p += num_threads) {
    const float x = points[p * 3 + 0];
    const float y = points[p * 3 + 1];
@@ -113,7 +113,7 @@ __global__ void RasterizeCoarseCudaKernel(
  const int chunks_per_batch = 1 + (E - 1) / chunk_size;
  const int num_chunks = N * chunks_per_batch;

-  for (auto chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
+  for (int chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
    const int batch_idx = chunk / chunks_per_batch; // batch index
    const int chunk_idx = chunk % chunks_per_batch;
    const int elem_chunk_start_idx = chunk_idx * chunk_size;
@@ -123,7 +123,7 @@ __global__ void RasterizeCoarseCudaKernel(
    const int64_t elem_stop_idx = elem_start_idx + elems_per_batch[batch_idx];

    // Have each thread handle a different face within the chunk
-    for (auto e = threadIdx.x; e < chunk_size; e += blockDim.x) {
+    for (int e = threadIdx.x; e < chunk_size; e += blockDim.x) {
      const int e_idx = elem_chunk_start_idx + e;

      // Check that we are still within the same element of the batch
@@ -170,7 +170,7 @@ __global__ void RasterizeCoarseCudaKernel(
    // Now we have processed every elem in the current chunk. We need to
    // count the number of elems in each bin so we can write the indices
    // out to global memory. We have each thread handle a different bin.
-    for (auto byx = threadIdx.x; byx < num_bins_y * num_bins_x;
+    for (int byx = threadIdx.x; byx < num_bins_y * num_bins_x;
         byx += blockDim.x) {
      const int by = byx / num_bins_x;
      const int bx = byx % num_bins_x;
--- a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
+++ b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
@@ -260,8 +260,8 @@ __global__ void RasterizeMeshesNaiveCudaKernel(
    float* pix_dists,
    float* bary) {
  // Simple version: One thread per output pixel
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;

  for (int i = tid; i < N * H * W; i += num_threads) {
    // Convert linear index to 3D index
@@ -446,8 +446,8 @@ __global__ void RasterizeMeshesBackwardCudaKernel(

  // Parallelize over each pixel in images of
  // size H * W, for each image in the batch of size N.
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int t_i = tid; t_i < N * H * W; t_i += num_threads) {
    // Convert linear index to 3D index
@@ -650,8 +650,8 @@ __global__ void RasterizeMeshesFineCudaKernel(
 ) {
  // This can be more than H * W if H or W are not divisible by bin_size.
  int num_pixels = N * BH * BW * bin_size * bin_size;
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int pid = tid; pid < num_pixels; pid += num_threads) {
    // Convert linear index into bin and pixel indices. We make the within
--- a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.h
+++ b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.h
@@ -138,9 +138,6 @@ RasterizeMeshesNaive(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(face_verts);
-    CHECK_CPU(mesh_to_face_first_idx);
-    CHECK_CPU(num_faces_per_mesh);
    return RasterizeMeshesNaiveCpu(
        face_verts,
        mesh_to_face_first_idx,
@@ -235,11 +232,6 @@ torch::Tensor RasterizeMeshesBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(face_verts);
-    CHECK_CPU(pix_to_face);
-    CHECK_CPU(grad_zbuf);
-    CHECK_CPU(grad_bary);
-    CHECK_CPU(grad_dists);
    return RasterizeMeshesBackwardCpu(
        face_verts,
        pix_to_face,
@@ -314,9 +306,6 @@ torch::Tensor RasterizeMeshesCoarse(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(face_verts);
-    CHECK_CPU(mesh_to_face_first_idx);
-    CHECK_CPU(num_faces_per_mesh);
    return RasterizeMeshesCoarseCpu(
        face_verts,
        mesh_to_face_first_idx,
@@ -434,8 +423,6 @@ RasterizeMeshesFine(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(face_verts);
-    CHECK_CPU(bin_faces);
    AT_ERROR("NOT IMPLEMENTED");
  }
 }
--- a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes_cpu.cpp
+++ b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes_cpu.cpp
@@ -106,8 +106,6 @@ auto ComputeFaceAreas(const torch::Tensor& face_verts) {
  return face_areas;
 }

-namespace {
-
 // Helper function to use with std::find_if to find the index of any
 // values in the top k struct which match a given idx.
 struct IsNeighbor {
@@ -120,6 +118,7 @@ struct IsNeighbor {
  int neighbor_idx;
 };

+namespace {
 void RasterizeMeshesNaiveCpu_worker(
    const int start_yi,
    const int end_yi,
--- a/pytorch3d/csrc/rasterize_points/rasterize_points.cu
+++ b/pytorch3d/csrc/rasterize_points/rasterize_points.cu
@@ -97,8 +97,8 @@ __global__ void RasterizePointsNaiveCudaKernel(
    float* zbuf, // (N, H, W, K)
    float* pix_dists) { // (N, H, W, K)
  // Simple version: One thread per output pixel
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  for (int i = tid; i < N * H * W; i += num_threads) {
    // Convert linear index to 3D index
    const int n = i / (H * W); // Batch index
@@ -237,8 +237,8 @@ __global__ void RasterizePointsFineCudaKernel(
    float* pix_dists) { // (N, H, W, K)
  // This can be more than H * W if H or W are not divisible by bin_size.
  const int num_pixels = N * BH * BW * bin_size * bin_size;
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int pid = tid; pid < num_pixels; pid += num_threads) {
    // Convert linear index into bin and pixel indices. We make the within
@@ -376,8 +376,8 @@ __global__ void RasterizePointsBackwardCudaKernel(
    float* grad_points) { // (P, 3)
  // Parallelized over each of K points per pixel, for each pixel in images of
  // size H * W, for each image in the batch of size N.
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = tid; i < N * H * W * K; i += num_threads) {
    // const int n = i / (H * W * K); // batch index (not needed).
    const int yxk = i % (H * W * K);
--- a/pytorch3d/csrc/rasterize_points/rasterize_points.h
+++ b/pytorch3d/csrc/rasterize_points/rasterize_points.h
@@ -91,10 +91,6 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaive(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(points);
-    CHECK_CPU(cloud_to_packed_first_idx);
-    CHECK_CPU(num_points_per_cloud);
-    CHECK_CPU(radius);
    return RasterizePointsNaiveCpu(
        points,
        cloud_to_packed_first_idx,
@@ -170,10 +166,6 @@ torch::Tensor RasterizePointsCoarse(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(points);
-    CHECK_CPU(cloud_to_packed_first_idx);
-    CHECK_CPU(num_points_per_cloud);
-    CHECK_CPU(radius);
    return RasterizePointsCoarseCpu(
        points,
        cloud_to_packed_first_idx,
@@ -240,8 +232,6 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsFine(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(points);
-    CHECK_CPU(bin_points);
    AT_ERROR("NOT IMPLEMENTED");
  }
 }
@@ -294,10 +284,6 @@ torch::Tensor RasterizePointsBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(points);
-    CHECK_CPU(idxs);
-    CHECK_CPU(grad_zbuf);
-    CHECK_CPU(grad_dists);
    return RasterizePointsBackwardCpu(points, idxs, grad_zbuf, grad_dists);
  }
 }
--- a/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.cu
+++ b/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.cu
@@ -107,8 +107,7 @@ at::Tensor FarthestPointSamplingCuda(
    const at::Tensor& points, // (N, P, 3)
    const at::Tensor& lengths, // (N,)
    const at::Tensor& K, // (N,)
-    const at::Tensor& start_idxs,
-    const int64_t max_K_known = -1) {
+    const at::Tensor& start_idxs) {
  // Check inputs are on the same device
  at::TensorArg p_t{points, "points", 1}, lengths_t{lengths, "lengths", 2},
      k_t{K, "K", 3}, start_idxs_t{start_idxs, "start_idxs", 4};
@@ -130,12 +129,7 @@ at::Tensor FarthestPointSamplingCuda(

  const int64_t N = points.size(0);
  const int64_t P = points.size(1);
-  int64_t max_K;
-  if (max_K_known > 0) {
-    max_K = max_K_known;
-  } else {
-    max_K = at::max(K).item<int64_t>();
-  }
+  const int64_t max_K = at::max(K).item<int64_t>();

  // Initialize the output tensor with the sampled indices
  auto idxs = at::full({N, max_K}, -1, lengths.options());
--- a/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.h
+++ b/pytorch3d/csrc/sample_farthest_points/sample_farthest_points.h
@@ -43,8 +43,7 @@ at::Tensor FarthestPointSamplingCuda(
    const at::Tensor& points,
    const at::Tensor& lengths,
    const at::Tensor& K,
-    const at::Tensor& start_idxs,
-    const int64_t max_K_known = -1);
+    const at::Tensor& start_idxs);

 at::Tensor FarthestPointSamplingCpu(
    const at::Tensor& points,
@@ -57,23 +56,17 @@ at::Tensor FarthestPointSampling(
    const at::Tensor& points,
    const at::Tensor& lengths,
    const at::Tensor& K,
-    const at::Tensor& start_idxs,
-    const int64_t max_K_known = -1) {
+    const at::Tensor& start_idxs) {
  if (points.is_cuda() || lengths.is_cuda() || K.is_cuda()) {
 #ifdef WITH_CUDA
    CHECK_CUDA(points);
    CHECK_CUDA(lengths);
    CHECK_CUDA(K);
    CHECK_CUDA(start_idxs);
-    return FarthestPointSamplingCuda(
-        points, lengths, K, start_idxs, max_K_known);
+    return FarthestPointSamplingCuda(points, lengths, K, start_idxs);
 #else
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(lengths);
-  CHECK_CPU(K);
-  CHECK_CPU(start_idxs);
  return FarthestPointSamplingCpu(points, lengths, K, start_idxs);
 }
--- a/pytorch3d/csrc/sample_pdf/sample_pdf.h
+++ b/pytorch3d/csrc/sample_pdf/sample_pdf.h
@@ -71,8 +71,6 @@ inline void SamplePdf(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(weights);
-  CHECK_CPU(outputs);
  CHECK_CONTIGUOUS(outputs);
  SamplePdfCpu(bins, weights, outputs, eps);
 }
--- a/pytorch3d/csrc/utils/dispatch.cuh
+++ b/pytorch3d/csrc/utils/dispatch.cuh
@@ -99,7 +99,8 @@ namespace {
 // and increment it via template recursion until it is equal to the run-time
 // argument N.
 template <
-    template <typename, int64_t> class Kernel,
+    template <typename, int64_t>
+    class Kernel,
    typename T,
    int64_t minN,
    int64_t maxN,
@@ -123,7 +124,8 @@ struct DispatchKernelHelper1D {
 // 1D dispatch: Specialization when curN == maxN
 // We need this base case to avoid infinite template recursion.
 template <
-    template <typename, int64_t> class Kernel,
+    template <typename, int64_t>
+    class Kernel,
    typename T,
    int64_t minN,
    int64_t maxN,
@@ -143,7 +145,8 @@ struct DispatchKernelHelper1D<Kernel, T, minN, maxN, maxN, Args...> {
 // the run-time values of N and M, at which point we dispatch to the run
 // method of the kernel.
 template <
-    template <typename, int64_t, int64_t> class Kernel,
+    template <typename, int64_t, int64_t>
+    class Kernel,
    typename T,
    int64_t minN,
    int64_t maxN,
@@ -200,7 +203,8 @@ struct DispatchKernelHelper2D {

 // 2D dispatch, specialization for curN == maxN
 template <
-    template <typename, int64_t, int64_t> class Kernel,
+    template <typename, int64_t, int64_t>
+    class Kernel,
    typename T,
    int64_t minN,
    int64_t maxN,
@@ -239,7 +243,8 @@ struct DispatchKernelHelper2D<

 // 2D dispatch, specialization for curM == maxM
 template <
-    template <typename, int64_t, int64_t> class Kernel,
+    template <typename, int64_t, int64_t>
+    class Kernel,
    typename T,
    int64_t minN,
    int64_t maxN,
@@ -278,7 +283,8 @@ struct DispatchKernelHelper2D<

 // 2D dispatch, specialization for curN == maxN, curM == maxM
 template <
-    template <typename, int64_t, int64_t> class Kernel,
+    template <typename, int64_t, int64_t>
+    class Kernel,
    typename T,
    int64_t minN,
    int64_t maxN,
@@ -307,7 +313,8 @@ struct DispatchKernelHelper2D<

 // This is the function we expect users to call to dispatch to 1D functions
 template <
-    template <typename, int64_t> class Kernel,
+    template <typename, int64_t>
+    class Kernel,
    typename T,
    int64_t minN,
    int64_t maxN,
@@ -323,7 +330,8 @@ void DispatchKernel1D(const int64_t N, Args... args) {

 // This is the function we expect users to call to dispatch to 2D functions
 template <
-    template <typename, int64_t, int64_t> class Kernel,
+    template <typename, int64_t, int64_t>
+    class Kernel,
    typename T,
    int64_t minN,
    int64_t maxN,
--- a/pytorch3d/csrc/utils/pytorch3d_cutils.h
+++ b/pytorch3d/csrc/utils/pytorch3d_cutils.h
@@ -15,7 +15,3 @@
 #define CHECK_CONTIGUOUS_CUDA(x) \
  CHECK_CUDA(x);                 \
  CHECK_CONTIGUOUS(x)
-#define CHECK_CPU(x)                    \
-  TORCH_CHECK(                          \
-      x.device().type() == torch::kCPU, \
-      "Cannot use CPU implementation: " #x " not on CPU.")
--- a/pytorch3d/csrc/utils/vec2.h
+++ b/pytorch3d/csrc/utils/vec2.h
@@ -19,7 +19,7 @@ template <
        std::is_same<T, double>::value || std::is_same<T, float>::value>>
 struct vec2 {
  T x, y;
-  using scalar_t = T;
+  typedef T scalar_t;
  vec2(T x, T y) : x(x), y(y) {}
 };

--- a/pytorch3d/csrc/utils/vec3.h
+++ b/pytorch3d/csrc/utils/vec3.h
@@ -18,7 +18,7 @@ template <
        std::is_same<T, double>::value || std::is_same<T, float>::value>>
 struct vec3 {
  T x, y, z;
-  using scalar_t = T;
+  typedef T scalar_t;
  vec3(T x, T y, T z) : x(x), y(y), z(z) {}
 };

--- a/pytorch3d/implicitron/dataset/blender_dataset_map_provider.py
+++ b/pytorch3d/implicitron/dataset/blender_dataset_map_provider.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+
+import torch
+from pytorch3d.implicitron.tools.config import registry
+
+from .load_blender import load_blender_data
+from .single_sequence_dataset import (
+    _interpret_blender_cameras,
+    SingleSceneDatasetMapProviderBase,
+)
+
+
+@registry.register
+class BlenderDatasetMapProvider(SingleSceneDatasetMapProviderBase):
+    """
+    Provides data for one scene from the Blender synthetic dataset.
+    The data is read with load_blender_data from load_blender.py.
+
+    Members:
+        base_dir: directory holding the data for the scene.
+        object_name: The name of the scene (e.g. "lego"). This is just used as a label.
+            It will typically be equal to the name of the directory self.base_dir.
+        path_manager_factory: Creates path manager which may be used for
+            interpreting paths.
+        n_known_frames_for_test: If set, training frames are included in the val
+            and test datasets, and this many random training frames are added to
+            each test batch. If not set, test batches each contain just a single
+            testing frame.
+    """
+
+    def _load_data(self) -> None:  # sets poses, images, fg_probabilities, i_split
+        path_manager = self.path_manager_factory.get()
+        images, poses, _, hwf, i_split = load_blender_data(
+            self.base_dir,
+            testskip=1,
+            path_manager=path_manager,
+        )
+        H, W, focal = hwf  # only focal is used below
+        images_masks = torch.from_numpy(images).permute(0, 3, 1, 2)  # axis 3 -> axis 1
+
+        # pyre-ignore[16]
+        self.poses = _interpret_blender_cameras(poses, focal)
+        # pyre-ignore[16]
+        self.images = images_masks[:, :3]  # first 3 channels (presumably RGB)
+        # pyre-ignore[16]
+        self.fg_probabilities = images_masks[:, 3:4]  # 4th channel (presumably alpha)
+        # pyre-ignore[16]
+        self.i_split = i_split
--- a/pytorch3d/implicitron/dataset/data_source.py
+++ b/pytorch3d/implicitron/dataset/data_source.py
@@ -64,12 +64,16 @@ class ImplicitronDataSource(DataSourceBase):
    def pre_expand(cls) -> None:
        # use try/finally to bypass cinder's lazy imports
        try:
+            from .blender_dataset_map_provider import (  # noqa: F401
+                BlenderDatasetMapProvider,
+            )
            from .json_index_dataset_map_provider import (  # noqa: F401
                JsonIndexDatasetMapProvider,
            )
            from .json_index_dataset_map_provider_v2 import (  # noqa: F401
                JsonIndexDatasetMapProviderV2,
            )
+            from .llff_dataset_map_provider import LlffDatasetMapProvider  # noqa: F401
            from .rendered_mesh_dataset_map_provider import (  # noqa: F401
                RenderedMeshDatasetMapProvider,
            )
--- a/pytorch3d/implicitron/dataset/dataset_base.py
+++ b/pytorch3d/implicitron/dataset/dataset_base.py
@@ -21,6 +21,7 @@ from typing import (
 )

 import torch
+
 from pytorch3d.implicitron.dataset.frame_data import FrameData
 from pytorch3d.implicitron.dataset.utils import GenericWorkaround

--- a/pytorch3d/implicitron/dataset/frame_data.py
+++ b/pytorch3d/implicitron/dataset/frame_data.py
@@ -25,6 +25,7 @@ from typing import (

 import numpy as np
 import torch
+
 from pytorch3d.implicitron.dataset import orm_types, types
 from pytorch3d.implicitron.dataset.utils import (
    adjust_camera_to_bbox_crop_,
--- a/pytorch3d/implicitron/dataset/json_index_dataset.py
+++ b/pytorch3d/implicitron/dataset/json_index_dataset.py
@@ -38,6 +38,7 @@ from pytorch3d.implicitron.dataset.utils import is_known_frame_scalar
 from pytorch3d.implicitron.tools.config import registry, ReplaceableBase
 from pytorch3d.renderer.camera_utils import join_cameras_as_batch
 from pytorch3d.renderer.cameras import CamerasBase
+
 from tqdm import tqdm


@@ -326,9 +327,9 @@ class JsonIndexDataset(DatasetBase, ReplaceableBase):
                assert os.path.normpath(
                    # pyre-ignore[16]
                    self.frame_annots[idx]["frame_annotation"].image.path
-                ) == os.path.normpath(path), (
-                    f"Inconsistent frame indices {seq_name, frame_no, path}."
-                )
+                ) == os.path.normpath(
+                    path
+                ), f"Inconsistent frame indices {seq_name, frame_no, path}."
            return idx

        dataset_idx = [
--- a/pytorch3d/implicitron/dataset/json_index_dataset_map_provider.py
+++ b/pytorch3d/implicitron/dataset/json_index_dataset_map_provider.py
@@ -21,6 +21,7 @@ from pytorch3d.renderer.cameras import CamerasBase

 from .dataset_map_provider import DatasetMap, DatasetMapProviderBase, PathManagerFactory
 from .json_index_dataset import JsonIndexDataset
+
 from .utils import (
    DATASET_TYPE_KNOWN,
    DATASET_TYPE_TEST,
--- a/pytorch3d/implicitron/dataset/json_index_dataset_map_provider_v2.py
+++ b/pytorch3d/implicitron/dataset/json_index_dataset_map_provider_v2.py
@@ -18,6 +18,7 @@ from typing import Dict, List, Optional, Tuple, Type, Union

 import numpy as np
 from iopath.common.file_io import PathManager
+
 from omegaconf import DictConfig
 from pytorch3d.implicitron.dataset.dataset_map_provider import (
    DatasetMap,
@@ -30,6 +31,7 @@ from pytorch3d.implicitron.tools.config import (
    registry,
    run_auto_creation,
 )
+
 from pytorch3d.renderer.cameras import CamerasBase
 from tqdm import tqdm

--- a/pytorch3d/implicitron/dataset/llff_dataset_map_provider.py
+++ b/pytorch3d/implicitron/dataset/llff_dataset_map_provider.py
@@ -0,0 +1,69 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+
+import numpy as np
+import torch
+from pytorch3d.implicitron.tools.config import registry
+
+from .load_llff import load_llff_data
+
+from .single_sequence_dataset import (
+    _interpret_blender_cameras,
+    SingleSceneDatasetMapProviderBase,
+)
+
+
+@registry.register
+class LlffDatasetMapProvider(SingleSceneDatasetMapProviderBase):
+    """
+    Dataset map provider for a single scene of the LLFF dataset.
+
+    Members:
+        base_dir: directory holding the data for the scene.
+        object_name: The name of the scene (e.g. "fern"). This is just used as a label.
+            It will typically be equal to the name of the directory self.base_dir.
+        path_manager_factory: Creates path manager which may be used for
+            interpreting paths.
+        n_known_frames_for_test: If set, training frames are included in the val
+            and test datasets, and this many random training frames are added to
+            each test batch. If not set, test batches each contain just a single
+            testing frame.
+        downscale_factor: determines image sizes.
+    """
+
+    downscale_factor: int = 4
+
+    def _load_data(self) -> None:
+        """
+        Read the scene from self.base_dir and fill in self.poses, self.images,
+        self.fg_probabilities (always None here) and self.i_split.
+        """
+        raw_images, raw_poses, _ = load_llff_data(
+            self.base_dir,
+            factor=self.downscale_factor,
+            path_manager=self.path_manager_factory.get(),
+        )
+
+        # The last column of the first pose carries height, width and focal
+        # length; the focal length is rescaled by the shorter image side.
+        height, width, focal = raw_poses[0, :3, -1]
+        focal_ndc = 2 * focal / min(height, width)
+
+        # Every llffhold-th frame is held out. The held-out indices fill both
+        # the second and the third slot of i_split.
+        llffhold = 8
+        n_frames = raw_images.shape[0]
+        i_test = np.arange(n_frames)[::llffhold]
+        i_train = np.array([i for i in range(n_frames) if i % llffhold != 0])
+
+        camera_matrices = torch.from_numpy(raw_poses[:, :3, :4])
+
+        # pyre-ignore[16]
+        self.poses = _interpret_blender_cameras(camera_matrices, focal_ndc)
+        # Channels first: (N, H, W, C) -> (N, C, H, W).
+        # pyre-ignore[16]
+        self.images = torch.from_numpy(raw_images).permute(0, 3, 1, 2)
+        # pyre-ignore[16]
+        self.fg_probabilities = None
+        # pyre-ignore[16]
+        self.i_split = (i_train, i_test, i_test)
--- a/pytorch3d/implicitron/dataset/load_blender.py
+++ b/pytorch3d/implicitron/dataset/load_blender.py
@@ -0,0 +1,143 @@
+# @lint-ignore-every LICENSELINT
+# Adapted from https://github.com/bmild/nerf/blob/master/load_blender.py
+# Copyright (c) 2020 bmild
+
+# pyre-unsafe
+import json
+import os
+
+import numpy as np
+import torch
+from PIL import Image
+
+
+def translate_by_t_along_z(t):
+    """Return a 4x4 float32 homogeneous matrix that translates by ``t`` along z."""
+    matrix = np.eye(4, dtype=np.float32)
+    matrix[2, 3] = t
+    return matrix
+
+
+def rotate_by_phi_along_x(phi):
+    """Return a 4x4 float32 homogeneous rotation by ``phi`` radians about the x axis."""
+    cos_phi, sin_phi = np.cos(phi), np.sin(phi)
+    matrix = np.eye(4, dtype=np.float32)
+    matrix[1, 1] = cos_phi
+    matrix[2, 2] = cos_phi
+    matrix[1, 2] = -sin_phi
+    matrix[2, 1] = sin_phi
+    return matrix
+
+
+def rotate_by_theta_along_y(theta):
+    """Return a 4x4 float32 homogeneous rotation by ``theta`` radians about the y axis."""
+    cos_theta, sin_theta = np.cos(theta), np.sin(theta)
+    matrix = np.eye(4, dtype=np.float32)
+    matrix[0, 0] = cos_theta
+    matrix[2, 2] = cos_theta
+    matrix[0, 2] = -sin_theta
+    matrix[2, 0] = sin_theta
+    return matrix
+
+
+def pose_spherical(theta, phi, radius):
+    """
+    Compose a 4x4 pose from two angles given in degrees and a radius.
+
+    The pose is a translation by ``radius`` along z, followed by a rotation
+    by ``phi`` about x, a rotation by ``theta`` about y and a final fixed
+    axis permutation / flip.
+    """
+    phi_rad = phi / 180.0 * np.pi
+    theta_rad = theta / 180 * np.pi
+
+    pose = translate_by_t_along_z(radius)
+    pose = rotate_by_phi_along_x(phi_rad) @ pose
+    pose = rotate_by_theta_along_y(theta_rad) @ pose
+
+    # Negates x and swaps the y and z axes.
+    axis_swap = np.array([[-1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
+    return axis_swap @ pose
+
+
+def _local_path(path_manager, path):
+    """
+    Return ``path`` itself when there is no path manager, otherwise
+    whatever ``path_manager.get_local_path(path)`` returns.
+    """
+    return path if path_manager is None else path_manager.get_local_path(path)
+
+
+def load_blender_data(
+    basedir,
+    half_res=False,
+    testskip=1,
+    debug=False,
+    path_manager=None,
+    focal_length_in_screen_space=False,
+):
+    """
+    Load one scene of the Blender synthetic dataset.
+
+    Reads transforms_train.json, transforms_val.json and transforms_test.json
+    from ``basedir`` together with the PNG frames they reference.
+
+    Args:
+        basedir: directory of the scene.
+        half_res: if True, images are resized to half their height and width.
+        testskip: stride applied to the val and test frames; 0 is treated
+            like 1. All train frames are always read.
+        debug: if True, images are resized to 1/32 of their height and width.
+            Takes precedence over half_res.
+        path_manager: optional path manager used to obtain local paths.
+        focal_length_in_screen_space: if True the focal length is returned in
+            pixels, otherwise as 1 / tan(camera_angle_x / 2).
+
+    Returns:
+        Tuple (imgs, poses, render_poses, [H, W, focal], i_split) where
+        imgs holds all frames in train, val, test order with values divided
+        by 255 (a float32 numpy array by default, a torch tensor when
+        half_res or debug is set), poses is a torch tensor of the per-frame
+        transform matrices, render_poses is a tensor of 40 poses generated
+        with pose_spherical, and i_split is a list of three index arrays
+        (train, val, test) into imgs.
+    """
+    splits = ["train", "val", "test"]
+    metas = {}
+    for s in splits:
+        path = os.path.join(basedir, f"transforms_{s}.json")
+        with open(_local_path(path_manager, path)) as fp:
+            metas[s] = json.load(fp)
+
+    all_imgs = []
+    all_poses = []
+    # counts[k] is the index of the first frame of split k in the
+    # concatenated arrays.
+    counts = [0]
+    for s in splits:
+        meta = metas[s]
+        imgs = []
+        poses = []
+        if s == "train" or testskip == 0:
+            skip = 1
+        else:
+            skip = testskip
+
+        for frame in meta["frames"][::skip]:
+            fname = os.path.join(basedir, frame["file_path"] + ".png")
+            imgs.append(np.array(Image.open(_local_path(path_manager, fname))))
+            poses.append(np.array(frame["transform_matrix"]))
+        imgs = (np.array(imgs) / 255.0).astype(np.float32)
+        poses = np.array(poses).astype(np.float32)
+        counts.append(counts[-1] + imgs.shape[0])
+        all_imgs.append(imgs)
+        all_poses.append(poses)
+
+    i_split = [np.arange(counts[i], counts[i + 1]) for i in range(3)]
+
+    imgs = np.concatenate(all_imgs, 0)
+    poses = np.concatenate(all_poses, 0)
+
+    H, W = imgs[0].shape[:2]
+    # ``meta`` is the metadata of the last split read above ("test").
+    camera_angle_x = float(meta["camera_angle_x"])
+    if focal_length_in_screen_space:
+        focal = 0.5 * W / np.tan(0.5 * camera_angle_x)
+    else:
+        focal = 1 / np.tan(0.5 * camera_angle_x)
+
+    render_poses = torch.stack(
+        [
+            torch.from_numpy(pose_spherical(angle, -30.0, 4.0))
+            for angle in np.linspace(-180, 180, 40 + 1)[:-1]
+        ],
+        0,
+    )
+
+    # In debug mode, return extremely tiny images
+    if debug:
+        import cv2
+
+        H = H // 32
+        W = W // 32
+        if focal_length_in_screen_space:
+            focal = focal / 32.0
+        # Resize to the reported (W, H) rather than a hard-coded 25x25 so the
+        # images always agree with the returned H and W (cv2 takes
+        # dsize as (width, height)). Identical for 800x800 sources.
+        imgs = [
+            torch.from_numpy(
+                cv2.resize(imgs[i], dsize=(W, H), interpolation=cv2.INTER_AREA)
+            )
+            for i in range(imgs.shape[0])
+        ]
+        imgs = torch.stack(imgs, 0)
+        poses = torch.from_numpy(poses)
+        return imgs, poses, render_poses, [H, W, focal], i_split
+
+    if half_res:
+        import cv2
+
+        H = H // 2
+        W = W // 2
+        if focal_length_in_screen_space:
+            focal = focal / 2.0
+        # Same reasoning as in debug mode: resize to (W, H), not to a
+        # hard-coded 400x400.
+        imgs = [
+            torch.from_numpy(
+                cv2.resize(imgs[i], dsize=(W, H), interpolation=cv2.INTER_AREA)
+            )
+            for i in range(imgs.shape[0])
+        ]
+        imgs = torch.stack(imgs, 0)
+
+    poses = torch.from_numpy(poses)
+
+    return imgs, poses, render_poses, [H, W, focal], i_split
--- a/pytorch3d/implicitron/dataset/load_llff.py
+++ b/pytorch3d/implicitron/dataset/load_llff.py
@@ -0,0 +1,336 @@
+# @lint-ignore-every LICENSELINT
+# Adapted from https://github.com/bmild/nerf/blob/master/load_llff.py
+# Copyright (c) 2020 bmild
+
+# pyre-unsafe
+import logging
+import os
+import warnings
+
+import numpy as np
+
+from PIL import Image
+
+
+# Slightly modified version of LLFF data loading code
+#  see https://github.com/Fyusion/LLFF for original
+
+logger = logging.getLogger(__name__)
+
+
+def _minify(basedir, path_manager, factors=(), resolutions=()):
+    """
+    Create downsized copies of ``basedir/images`` that do not exist yet.
+
+    Args:
+        basedir: scene directory containing an "images" subdirectory.
+        path_manager: optional path manager used for the existence checks.
+            Generating missing directories is only supported when it is None.
+        factors: integer downscale factors; each one corresponds to a
+            directory "images_{factor}".
+        resolutions: (height, width) pairs; each one corresponds to a
+            directory "images_{width}x{height}".
+
+    Nothing happens when every requested directory already exists. Otherwise
+    the originals are copied and resized with the external ``cp``, ``mogrify``
+    and ``rm`` commands.
+    """
+    needtoload = False
+    for r in factors:
+        imgdir = os.path.join(basedir, "images_{}".format(r))
+        if not _exists(path_manager, imgdir):
+            needtoload = True
+    for r in resolutions:
+        imgdir = os.path.join(basedir, "images_{}x{}".format(r[1], r[0]))
+        if not _exists(path_manager, imgdir):
+            needtoload = True
+    if not needtoload:
+        return
+    assert path_manager is None
+
+    from subprocess import check_output
+
+    imgdir = os.path.join(basedir, "images")
+    imgs = [os.path.join(imgdir, f) for f in sorted(_ls(path_manager, imgdir))]
+    # str.endswith takes its alternatives as ONE tuple; passing them as
+    # separate positional arguments raises TypeError.
+    imgs = [f for f in imgs if f.endswith(("JPG", "jpg", "png", "jpeg", "PNG"))]
+    imgdir_orig = imgdir
+
+    # Callers pass a list for one argument while the other keeps its tuple
+    # default, and list + tuple raises TypeError, so normalise both first.
+    for r in list(factors) + list(resolutions):
+        if isinstance(r, int):
+            name = "images_{}".format(r)
+            resizearg = "{}%".format(100.0 / r)
+        else:
+            name = "images_{}x{}".format(r[1], r[0])
+            resizearg = "{}x{}".format(r[1], r[0])
+        imgdir = os.path.join(basedir, name)
+        if os.path.exists(imgdir):
+            continue
+
+        logger.info(f"Minifying {r}, {basedir}")
+
+        os.makedirs(imgdir)
+        check_output("cp {}/* {}".format(imgdir_orig, imgdir), shell=True)
+
+        ext = imgs[0].split(".")[-1]
+        args = " ".join(
+            ["mogrify", "-resize", resizearg, "-format", "png", "*.{}".format(ext)]
+        )
+        logger.info(args)
+        # Run inside imgdir via cwd= instead of os.chdir, so the process
+        # working directory is untouched even if mogrify fails.
+        check_output(args, shell=True, cwd=imgdir)
+
+        if ext != "png":
+            check_output("rm {}/*.{}".format(imgdir, ext), shell=True)
+            logger.info("Removed duplicates")
+        logger.info("Done")
+
+
+def _load_data(
+    basedir, factor=None, width=None, height=None, load_imgs=True, path_manager=None
+):
+    """
+    Read the poses, the bounds and optionally the images of one LLFF scene.
+
+    At most one of factor, height and width is used to select the image
+    directory ("images_{factor}" or "images_{width}x{height}"): factor takes
+    precedence over height, which takes precedence over width. If none is
+    given, the original "images" directory is read.
+
+    Returns:
+        (poses, bds) if load_imgs is False, otherwise (poses, bds, imgs).
+        The frame index is the last axis of each returned array.
+
+    Raises:
+        ValueError: if the selected image directory does not exist or the
+            number of images differs from the number of poses.
+    """
+
+    def imread(f):
+        return np.array(Image.open(f))
+
+    def _is_image(name):
+        return name.endswith(("JPG", "jpg", "png"))
+
+    poses_file = os.path.join(basedir, "poses_bounds.npy")
+    poses_arr = np.load(_local_path(path_manager, poses_file))
+    # Each row is a flattened 3x5 pose followed by two bound values.
+    poses = poses_arr[:, :-2].reshape([-1, 3, 5]).transpose([1, 2, 0])
+    bds = poses_arr[:, -2:].transpose([1, 0])
+
+    # The first original image provides the full-resolution shape.
+    orig_dir = os.path.join(basedir, "images")
+    orig_names = [f for f in sorted(_ls(path_manager, orig_dir)) if _is_image(f)]
+    img0 = os.path.join(orig_dir, orig_names[0])
+    sh = imread(_local_path(path_manager, img0)).shape
+
+    if factor is not None:
+        sfx = "_{}".format(factor)
+        _minify(basedir, path_manager, factors=[factor])
+    elif height is not None:
+        factor = sh[0] / float(height)
+        width = int(sh[1] / factor)
+        _minify(basedir, path_manager, resolutions=[[height, width]])
+        sfx = "_{}x{}".format(width, height)
+    elif width is not None:
+        factor = sh[1] / float(width)
+        height = int(sh[0] / factor)
+        _minify(basedir, path_manager, resolutions=[[height, width]])
+        sfx = "_{}x{}".format(width, height)
+    else:
+        sfx = ""
+        factor = 1
+
+    imgdir = os.path.join(basedir, "images" + sfx)
+    if not _exists(path_manager, imgdir):
+        raise ValueError(f"{imgdir} does not exist, returning")
+
+    imgfiles = [
+        _local_path(path_manager, os.path.join(imgdir, f))
+        for f in sorted(_ls(path_manager, imgdir))
+        if _is_image(f)
+    ]
+    if poses.shape[-1] != len(imgfiles):
+        raise ValueError(
+            "Mismatch between imgs {} and poses {} !!!!".format(
+                len(imgfiles), poses.shape[-1]
+            )
+        )
+
+    # Overwrite the last pose column: rows 0-1 get the shape of the images
+    # actually loaded, row 2 is divided by the downscale factor.
+    sh = imread(imgfiles[0]).shape
+    poses[:2, 4, :] = np.array(sh[:2]).reshape([2, 1])
+    poses[2, 4, :] = poses[2, 4, :] * 1.0 / factor
+
+    if not load_imgs:
+        return poses, bds
+
+    # Keep the first three channels and stack the frames along a new last axis.
+    imgs = np.stack([imread(f)[..., :3] / 255.0 for f in imgfiles], -1)
+
+    logger.info(f"Loaded image data, shape {imgs.shape}")
+    return poses, bds, imgs
+
+
+def normalize(x):
+    """
+    Divide ``x`` by its norm (np.linalg.norm).
+
+    A warning is issued, but the division is still performed, when the norm
+    is below 0.001.
+    """
+    length = np.linalg.norm(x)
+    if length < 0.001:
+        warnings.warn("unsafe normalize()")
+    return x / length
+
+
+def viewmatrix(z, up, pos):
+    """
+    Stack three unit axes and ``pos`` as the columns of one matrix.
+
+    The third axis is ``z`` normalized, the first is the normalized cross
+    product of ``up`` with it, and the second is the normalized cross product
+    of the third with the first.
+    """
+    axis2 = normalize(z)
+    axis0 = normalize(np.cross(up, axis2))
+    axis1 = normalize(np.cross(axis2, axis0))
+    return np.stack([axis0, axis1, axis2, pos], 1)
+
+
+def ptstocam(pts, c2w):
+    """
+    Subtract the fourth column of ``c2w`` from ``pts`` and multiply the
+    result by the transpose of the top-left 3x3 block of ``c2w``.
+    """
+    rotation = c2w[:3, :3]
+    origin = c2w[:3, 3]
+    offsets = (pts - origin)[..., np.newaxis]
+    return np.matmul(rotation.T, offsets)[..., 0]
+
+
+def poses_avg(poses):
+    """
+    Build one average pose from a stack of poses.
+
+    The position is the mean of the fourth columns, the z direction is the
+    normalized sum of the third columns and the up hint is the sum of the
+    second columns. The last column of the first pose is appended unchanged.
+    """
+    last_column = poses[0, :3, -1:]
+    mean_position = poses[:, :3, 3].mean(0)
+    z_direction = normalize(poses[:, :3, 2].sum(0))
+    up_hint = poses[:, :3, 1].sum(0)
+
+    frame = viewmatrix(z_direction, up_hint, mean_position)
+    return np.concatenate([frame, last_column], 1)
+
+
+def render_path_spiral(c2w, up, rads, focal, zdelta, zrate, rots, N):
+    """
+    Generate ``N`` poses along a spiral defined relative to ``c2w``.
+
+    The angle runs over ``rots`` full turns; ``rads`` scales the offsets and
+    ``zrate`` sets the frequency of the third offset component. Each pose is
+    oriented relative to the point at distance ``focal`` along the negative
+    third axis of ``c2w``. ``zdelta`` is not used.
+
+    Returns:
+        List of N arrays, each a viewmatrix with column 4 of c2w appended.
+    """
+    frame = c2w[:3, :4]
+    last_column = c2w[:, 4:5]
+    scale = np.array(list(rads) + [1.0])
+
+    spiral = []
+    for theta in np.linspace(0.0, 2.0 * np.pi * rots, N + 1)[:-1]:
+        offset = np.array(
+            [np.cos(theta), -np.sin(theta), -np.sin(theta * zrate), 1.0]
+        )
+        position = np.dot(frame, offset * scale)
+        target = np.dot(frame, np.array([0, 0, -focal, 1.0]))
+        z = normalize(position - target)
+        spiral.append(np.concatenate([viewmatrix(z, up, position), last_column], 1))
+    return spiral
+
+
+def recenter_poses(poses):
+    """
+    Re-express every pose relative to the average pose from poses_avg.
+
+    Only the leading 3x4 part of each pose is transformed; any further
+    columns are copied through. The input array is not modified.
+    """
+    last_row = np.reshape([0, 0, 0, 1.0], [1, 4])
+    # Homogeneous 4x4 version of the average pose.
+    average_44 = np.concatenate([poses_avg(poses)[:3, :4], last_row], -2)
+    # Homogeneous 4x4 versions of all poses.
+    last_rows = np.tile(np.reshape(last_row, [1, 1, 4]), [poses.shape[0], 1, 1])
+    poses_44 = np.concatenate([poses[:, :3, :4], last_rows], -2)
+
+    recentered = poses + 0
+    recentered[:, :3, :4] = (np.linalg.inv(average_44) @ poses_44)[:, :3, :4]
+    return recentered
+
+
+def spherify_poses(poses, bds):
+    """
+    Recenter and rescale poses around a common point and generate a circular
+    path of new poses.
+
+    Args:
+        poses: stack of poses; columns 0-3 of the first three rows are read
+            as a 3x4 transform and the last column is carried through.
+        bds: array of bounds. NOTE: it is scaled IN PLACE, so the caller's
+            array is modified.
+
+    Returns:
+        Tuple (poses_reset, new_poses, bds): the input poses in the new
+        frame, 120 generated poses on a circle, and the rescaled bounds.
+        Both pose arrays get the last column of poses[0] appended.
+    """
+
+    def add_row_to_homogenize_transform(p):
+        r"""Add the last row to homogenize 3 x 4 transformation matrices."""
+        return np.concatenate(
+            [p, np.tile(np.reshape(np.eye(4)[-1, :], [1, 1, 4]), [p.shape[0], 1, 1])], 1
+        )
+
+    # p34_to_44 = lambda p: np.concatenate(
+    #     [p, np.tile(np.reshape(np.eye(4)[-1, :], [1, 1, 4]), [p.shape[0], 1, 1])], 1
+    # )
+
+    p34_to_44 = add_row_to_homogenize_transform
+
+    # Third and fourth pose columns, kept as column vectors.
+    rays_d = poses[:, :3, 2:3]
+    rays_o = poses[:, :3, 3:4]
+
+    def min_line_dist(rays_o, rays_d):
+        # A_i = I - d d^T for each ray; presumably this yields the
+        # least-squares point closest to all rays (assumes unit-length
+        # rays_d) -- TODO confirm.
+        A_i = np.eye(3) - rays_d * np.transpose(rays_d, [0, 2, 1])
+        b_i = -A_i @ rays_o
+        pt_mindist = np.squeeze(
+            -np.linalg.inv((np.transpose(A_i, [0, 2, 1]) @ A_i).mean(0)) @ (b_i).mean(0)
+        )
+        return pt_mindist
+
+    pt_mindist = min_line_dist(rays_o, rays_d)
+
+    center = pt_mindist
+    # Mean offset of the pose positions from the center.
+    up = (poses[:, :3, 3] - center).mean(0)
+
+    # Build a frame at ``center`` whose third axis is ``up``; the fixed
+    # vector [0.1, 0.2, 0.3] only serves to pick the remaining two axes.
+    vec0 = normalize(up)
+    vec1 = normalize(np.cross([0.1, 0.2, 0.3], vec0))
+    vec2 = normalize(np.cross(vec0, vec1))
+    pos = center
+    c2w = np.stack([vec1, vec2, vec0, pos], 1)
+
+    # Express all poses in that frame.
+    poses_reset = np.linalg.inv(p34_to_44(c2w[None])) @ p34_to_44(poses[:, :3, :4])
+
+    # Root-mean-square distance of the pose positions from the new origin.
+    rad = np.sqrt(np.mean(np.sum(np.square(poses_reset[:, :3, 3]), -1)))
+
+    # Rescale so that this distance becomes 1; bds is modified in place.
+    sc = 1.0 / rad
+    poses_reset[:, :3, 3] *= sc
+    bds *= sc
+    rad *= sc
+
+    centroid = np.mean(poses_reset[:, :3, 3], 0)
+    zh = centroid[2]
+    # Radius of the circle at height zh on the sphere of radius rad.
+    radcircle = np.sqrt(rad**2 - zh**2)
+    new_poses = []
+
+    # 120 poses on that circle, each with its third axis along its position.
+    for th in np.linspace(0.0, 2.0 * np.pi, 120):
+        camorigin = np.array([radcircle * np.cos(th), radcircle * np.sin(th), zh])
+        up = np.array([0, 0, -1.0])
+
+        vec2 = normalize(camorigin)
+        vec0 = normalize(np.cross(vec2, up))
+        vec1 = normalize(np.cross(vec2, vec0))
+        pos = camorigin
+        p = np.stack([vec0, vec1, vec2, pos], 1)
+
+        new_poses.append(p)
+
+    new_poses = np.stack(new_poses, 0)
+
+    # Append the last column of the first input pose to every output pose.
+    new_poses = np.concatenate(
+        [new_poses, np.broadcast_to(poses[0, :3, -1:], new_poses[:, :3, -1:].shape)], -1
+    )
+    poses_reset = np.concatenate(
+        [
+            poses_reset[:, :3, :4],
+            np.broadcast_to(poses[0, :3, -1:], poses_reset[:, :3, -1:].shape),
+        ],
+        -1,
+    )
+
+    return poses_reset, new_poses, bds
+
+
+def _local_path(path_manager, path):
+    """
+    Return ``path`` itself when there is no path manager, otherwise
+    whatever ``path_manager.get_local_path(path)`` returns.
+    """
+    return path if path_manager is None else path_manager.get_local_path(path)
+
+
+def _ls(path_manager, path):
+    """
+    List the entries of ``path`` with os.listdir when there is no path
+    manager, otherwise with ``path_manager.ls``.
+    """
+    return os.listdir(path) if path_manager is None else path_manager.ls(path)
+
+
+def _exists(path_manager, path):
+    """
+    Check whether ``path`` exists with os.path.exists when there is no path
+    manager, otherwise with ``path_manager.exists``.
+    """
+    return os.path.exists(path) if path_manager is None else path_manager.exists(path)
+
+
+def load_llff_data(
+    basedir,
+    factor=8,
+    recenter=True,
+    bd_factor=0.75,
+    spherify=False,
+    path_zflat=False,
+    path_manager=None,
+):
+    """
+    Load the images, poses and bounds of one LLFF scene.
+
+    Args:
+        basedir: directory of the scene.
+        factor: image downscale factor passed on to _load_data.
+        recenter: if True, poses are passed through recenter_poses.
+        bd_factor: if not None, positions and bounds are divided by
+            (smallest bound * bd_factor).
+        spherify: if True, poses and bounds are passed through spherify_poses.
+        path_zflat: not used.
+        path_manager: optional path manager passed on to _load_data.
+
+    Returns:
+        Tuple (images, poses, bds) with the frame index as the first axis;
+        images and poses are float32 numpy arrays.
+    """
+    poses, bds, imgs = _load_data(
+        basedir, factor=factor, path_manager=path_manager
+    )  # factor=8 downsamples original imgs by 8x
+    logger.info(f"Loaded {basedir}, {bds.min()}, {bds.max()}")
+
+    # Correct rotation matrix ordering (swap the first two columns, negating
+    # one of them) ...
+    reordered = np.concatenate(
+        [poses[:, 1:2, :], -poses[:, 0:1, :], poses[:, 2:, :]], 1
+    )
+    # ... and move the frame axis to the front of every array.
+    poses = np.moveaxis(reordered, -1, 0).astype(np.float32)
+    images = np.moveaxis(imgs, -1, 0).astype(np.float32)
+    bds = np.moveaxis(bds, -1, 0).astype(np.float32)
+
+    # Rescale if bd_factor is provided
+    if bd_factor is None:
+        sc = 1.0
+    else:
+        sc = 1.0 / (bds.min() * bd_factor)
+    poses[:, :3, 3] *= sc
+    bds *= sc
+
+    if recenter:
+        poses = recenter_poses(poses)
+
+    if spherify:
+        # The generated render path is not returned.
+        poses, _render_poses, bds = spherify_poses(poses, bds)
+
+    return images.astype(np.float32), poses.astype(np.float32), bds
--- a/pytorch3d/implicitron/dataset/orm_types.py
+++ b/pytorch3d/implicitron/dataset/orm_types.py
@@ -13,6 +13,7 @@ import struct
 from typing import Optional, Tuple

 import numpy as np
+
 from pytorch3d.implicitron.dataset.types import (
    DepthAnnotation,
    ImageAnnotation,
@@ -21,6 +22,7 @@ from pytorch3d.implicitron.dataset.types import (
    VideoAnnotation,
    ViewpointAnnotation,
 )
+
 from sqlalchemy import LargeBinary
 from sqlalchemy.orm import (
    composite,
--- a/pytorch3d/implicitron/dataset/single_sequence_dataset.py
+++ b/pytorch3d/implicitron/dataset/single_sequence_dataset.py
@@ -85,7 +85,7 @@ class SingleSceneDataset(DatasetBase, Configurable):

 class SingleSceneDatasetMapProviderBase(DatasetMapProviderBase):
    """
-    Base for provider of data for one scene.
+    Base for provider of data for one scene from LLFF or blender datasets.

    Members:
        base_dir: directory holding the data for the scene.
@@ -171,3 +171,40 @@ class SingleSceneDatasetMapProviderBase(DatasetMapProviderBase):
        # pyre-ignore[16]
        cameras = [self.poses[i] for i in self.i_split[0]]
        return join_cameras_as_batch(cameras)
+
+
+def _interpret_blender_cameras(
+    poses: torch.Tensor, focal: float
+) -> List[PerspectiveCameras]:
+    """
+    Convert camera matrices in blender format to PyTorch3D cameras.
+
+    Args:
+        poses: N x 3 x 4 camera matrices. Only the leading 3 x 4 part of
+            each matrix is read, so N x 4 x 4 input is accepted as well.
+        focal: ndc space focal length
+
+    Returns:
+        A list with one single-camera PerspectiveCameras object per pose.
+    """
+    converted = []
+    for pose in poses:
+        rotation = pose[:3, :3]
+        translation = pose[:3, 3]
+
+        # Assemble the transposed transform (translation in the last row)
+        # and invert it.
+        mtx = torch.eye(4, dtype=pose.dtype)
+        mtx[:3, :3] = rotation.t()
+        mtx[3, :3] = translation
+        mtx = mtx.inverse()
+
+        # flip the XZ coordinates.
+        mtx[:, [0, 2]] *= -1.0
+
+        # The first three rows form R, the last row is T.
+        Rpt3, Tpt3 = mtx[:, :3].split([3, 1], dim=0)
+
+        converted.append(
+            PerspectiveCameras(
+                focal_length=torch.FloatTensor([[focal, focal]]),
+                principal_point=torch.FloatTensor([[0.0, 0.0]]),
+                R=Rpt3[None],
+                T=Tpt3,
+            )
+        )
+    return converted
--- a/pytorch3d/implicitron/dataset/sql_dataset.py
+++ b/pytorch3d/implicitron/dataset/sql_dataset.py
@@ -10,6 +10,7 @@ import hashlib
 import json
 import logging
 import os
+
 import urllib
 from dataclasses import dataclass, Field, field
 from typing import (
@@ -31,11 +32,13 @@ import pandas as pd
 import sqlalchemy as sa
 import torch
 from pytorch3d.implicitron.dataset.dataset_base import DatasetBase
+
 from pytorch3d.implicitron.dataset.frame_data import (
    FrameData,
    FrameDataBuilder,  # noqa
    FrameDataBuilderBase,
 )
+
 from pytorch3d.implicitron.tools.config import (
    registry,
    ReplaceableBase,
@@ -483,10 +486,9 @@ class SqlIndexDataset(DatasetBase, ReplaceableBase):
            *self._get_pick_filters(),
            *self._get_exclude_filters(),
        ]
-        if pick_sequences_sql_clause := self.pick_sequences_sql_clause:
+        if self.pick_sequences_sql_clause:
            print("Applying the custom SQL clause.")
-            # pyre-ignore[6]: TextClause is compatible with where conditions
-            where_conditions.append(sa.text(pick_sequences_sql_clause))
+            where_conditions.append(sa.text(self.pick_sequences_sql_clause))

        def add_where(stmt):
            return stmt.where(*where_conditions) if where_conditions else stmt
@@ -506,7 +508,6 @@ class SqlIndexDataset(DatasetBase, ReplaceableBase):

            subquery = add_where(subquery).subquery()
            stmt = sa.select(subquery.c.sequence_name).where(
-                # pyre-ignore[6]: SQLAlchemy column comparison returns ColumnElement, not bool
                subquery.c.row_number <= self.limit_sequences_per_category_to
            )

@@ -635,10 +636,9 @@ class SqlIndexDataset(DatasetBase, ReplaceableBase):
                    )
                )

-        if pick_frames_sql_clause := self.pick_frames_sql_clause:
+        if self.pick_frames_sql_clause:
            logger.info("Applying the custom SQL clause.")
-            # pyre-ignore[6]: TextClause is compatible with where conditions
-            pick_frames_criteria.append(sa.text(pick_frames_sql_clause))
+            pick_frames_criteria.append(sa.text(self.pick_frames_sql_clause))

        if pick_frames_criteria:
            index = self._pick_frames_by_criteria(index, pick_frames_criteria)
@@ -701,10 +701,9 @@ class SqlIndexDataset(DatasetBase, ReplaceableBase):
                )
            )

-        if pick_frames_sql_clause := self.pick_frames_sql_clause:
+        if self.pick_frames_sql_clause:
            logger.info("  applying custom SQL clause")
-            # pyre-ignore[6]: TextClause is compatible with where conditions
-            where_conditions.append(sa.text(pick_frames_sql_clause))
+            where_conditions.append(sa.text(self.pick_frames_sql_clause))

        if where_conditions:
            stmt = stmt.where(*where_conditions)
@@ -756,7 +755,7 @@ class SqlIndexDataset(DatasetBase, ReplaceableBase):
        if pick_sequences:
            old_len = len(eval_batches)
            eval_batches = [b for b in eval_batches if b[0][0] in pick_sequences]
-            logger.warning(
+            logger.warn(
                f"Picked eval batches by sequence/cat: {old_len} -> {len(eval_batches)}"
            )

@@ -764,7 +763,7 @@ class SqlIndexDataset(DatasetBase, ReplaceableBase):
            old_len = len(eval_batches)
            exclude_sequences = set(self.exclude_sequences)
            eval_batches = [b for b in eval_batches if b[0][0] not in exclude_sequences]
-            logger.warning(
+            logger.warn(
                f"Excluded eval batches by sequence: {old_len} -> {len(eval_batches)}"
            )

--- a/pytorch3d/implicitron/dataset/sql_dataset_provider.py
+++ b/pytorch3d/implicitron/dataset/sql_dataset_provider.py
@@ -12,7 +12,9 @@ import os
 from typing import List, Optional, Tuple, Type

 import numpy as np
+
 from omegaconf import DictConfig, OmegaConf
+
 from pytorch3d.implicitron.dataset.dataset_map_provider import (
    DatasetMap,
    DatasetMapProviderBase,
--- a/pytorch3d/implicitron/dataset/train_eval_data_loader_provider.py
+++ b/pytorch3d/implicitron/dataset/train_eval_data_loader_provider.py
@@ -18,6 +18,7 @@ from pytorch3d.implicitron.dataset.dataset_base import DatasetBase
 from pytorch3d.implicitron.dataset.dataset_map_provider import DatasetMap
 from pytorch3d.implicitron.dataset.frame_data import FrameData
 from pytorch3d.implicitron.tools.config import registry, run_auto_creation
+
 from torch.utils.data import DataLoader

 logger = logging.getLogger(__name__)
--- a/pytorch3d/implicitron/dataset/utils.py
+++ b/pytorch3d/implicitron/dataset/utils.py
@@ -15,6 +15,7 @@ from typing import List, Optional, Tuple, TypeVar, Union
 import numpy as np
 import torch
 from PIL import Image
+
 from pytorch3d.io import IO
 from pytorch3d.renderer.cameras import PerspectiveCameras
 from pytorch3d.structures.pointclouds import Pointclouds
--- a/pytorch3d/implicitron/evaluation/evaluator.py
+++ b/pytorch3d/implicitron/evaluation/evaluator.py
@@ -14,6 +14,7 @@ import warnings
 from typing import Any, Dict, List, Optional, Tuple

 import torch
+
 import tqdm
 from pytorch3d.implicitron.evaluation import evaluate_new_view_synthesis as evaluate
 from pytorch3d.implicitron.models.base_model import EvaluationMode, ImplicitronModelBase
--- a/pytorch3d/implicitron/models/base_model.py
+++ b/pytorch3d/implicitron/models/base_model.py
@@ -10,6 +10,7 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional

 import torch
+
 from pytorch3d.implicitron.models.renderer.base import EvaluationMode
 from pytorch3d.implicitron.tools.config import ReplaceableBase
 from pytorch3d.renderer.cameras import CamerasBase
--- a/pytorch3d/implicitron/models/generic_model.py
+++ b/pytorch3d/implicitron/models/generic_model.py
@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING, Union

 import torch
 from omegaconf import DictConfig
+
 from pytorch3d.implicitron.models.base_model import (
    ImplicitronModelBase,
    ImplicitronRender,
@@ -27,6 +28,7 @@ from pytorch3d.implicitron.models.metrics import (
    RegularizationMetricsBase,
    ViewMetricsBase,
 )
+
 from pytorch3d.implicitron.models.renderer.base import (
    BaseRenderer,
    EvaluationMode,
@@ -36,6 +38,7 @@ from pytorch3d.implicitron.models.renderer.base import (
    RenderSamplingMode,
 )
 from pytorch3d.implicitron.models.renderer.ray_sampler import RaySamplerBase
+
 from pytorch3d.implicitron.models.utils import (
    apply_chunked,
    chunk_generator,
@@ -50,6 +53,7 @@ from pytorch3d.implicitron.tools.config import (
    registry,
    run_auto_creation,
 )
+
 from pytorch3d.implicitron.tools.rasterize_mc import rasterize_sparse_ray_bundle
 from pytorch3d.renderer import utils as rend_utils
 from pytorch3d.renderer.cameras import CamerasBase
--- a/pytorch3d/implicitron/models/implicit_function/base.py
+++ b/pytorch3d/implicitron/models/implicit_function/base.py
@@ -10,6 +10,7 @@ from abc import ABC, abstractmethod
 from typing import Optional

 from pytorch3d.implicitron.models.renderer.base import ImplicitronRayBundle
+
 from pytorch3d.implicitron.tools.config import ReplaceableBase
 from pytorch3d.renderer.cameras import CamerasBase

--- a/pytorch3d/implicitron/models/implicit_function/decoding_functions.py
+++ b/pytorch3d/implicitron/models/implicit_function/decoding_functions.py
@@ -16,11 +16,14 @@ This file contains

 import logging
 from dataclasses import field
+
 from enum import Enum
 from typing import Dict, Optional, Tuple

 import torch
+
 from omegaconf import DictConfig
+
 from pytorch3d.implicitron.tools.config import (
    Configurable,
    registry,
--- a/pytorch3d/implicitron/models/implicit_function/idr_feature_field.py
+++ b/pytorch3d/implicitron/models/implicit_function/idr_feature_field.py
@@ -11,6 +11,7 @@ import torch
 from pytorch3d.implicitron.models.renderer.base import ImplicitronRayBundle
 from pytorch3d.implicitron.tools.config import registry
 from pytorch3d.renderer.implicit import HarmonicEmbedding
+
 from torch import nn

 from .base import ImplicitFunctionBase
--- a/pytorch3d/implicitron/models/implicit_function/neural_radiance_field.py
+++ b/pytorch3d/implicitron/models/implicit_function/neural_radiance_field.py
@@ -21,6 +21,7 @@ from pytorch3d.renderer.implicit import HarmonicEmbedding
 from pytorch3d.renderer.implicit.utils import ray_bundle_to_ray_points

 from .base import ImplicitFunctionBase
+
 from .decoding_functions import (  # noqa
    _xavier_init,
    MLPWithInputSkips,
--- a/pytorch3d/implicitron/models/implicit_function/utils.py
+++ b/pytorch3d/implicitron/models/implicit_function/utils.py
@@ -9,6 +9,7 @@
 from typing import Callable, Optional

 import torch
+
 import torch.nn.functional as F
 from pytorch3d.common.compat import prod
 from pytorch3d.implicitron.models.renderer.base import ImplicitronRayBundle
--- a/pytorch3d/implicitron/models/implicit_function/voxel_grid.py
+++ b/pytorch3d/implicitron/models/implicit_function/voxel_grid.py
@@ -21,6 +21,8 @@ import logging
 import warnings
 from collections.abc import Mapping
 from dataclasses import dataclass, field
+
+from distutils.version import LooseVersion
 from typing import Any, Callable, ClassVar, Dict, Iterator, List, Optional, Tuple, Type

 import torch
@@ -220,8 +222,7 @@ class VoxelGridBase(ReplaceableBase, torch.nn.Module):
                + "| 'bicubic' | 'linear' | 'area' | 'nearest-exact'"
            )

-        # We assume PyTorch 1.11 and newer.
-        interpolate_has_antialias = True
+        interpolate_has_antialias = LooseVersion(torch.__version__) >= "1.11"

        if antialias and not interpolate_has_antialias:
            warnings.warn("Antialiased interpolation requires PyTorch 1.11+; ignoring")
--- a/pytorch3d/implicitron/models/implicit_function/voxel_grid_implicit_function.py
+++ b/pytorch3d/implicitron/models/implicit_function/voxel_grid_implicit_function.py
@@ -13,7 +13,9 @@ from dataclasses import fields
 from typing import Callable, Dict, Optional, Tuple

 import torch
+
 from omegaconf import DictConfig
+
 from pytorch3d.implicitron.models.implicit_function.base import ImplicitFunctionBase
 from pytorch3d.implicitron.models.implicit_function.decoding_functions import (
    DecoderFunctionBase,
--- a/pytorch3d/implicitron/models/overfit_model.py
+++ b/pytorch3d/implicitron/models/overfit_model.py
@@ -17,6 +17,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, TYPE_CHECKING, Un

 import torch
 from omegaconf import DictConfig
+
 from pytorch3d.implicitron.models.base_model import (
    ImplicitronModelBase,
    ImplicitronRender,
@@ -27,6 +28,7 @@ from pytorch3d.implicitron.models.metrics import (
    RegularizationMetricsBase,
    ViewMetricsBase,
 )
+
 from pytorch3d.implicitron.models.renderer.base import (
    BaseRenderer,
    EvaluationMode,
@@ -48,6 +50,7 @@ from pytorch3d.implicitron.tools.config import (
    registry,
    run_auto_creation,
 )
+
 from pytorch3d.implicitron.tools.rasterize_mc import rasterize_sparse_ray_bundle
 from pytorch3d.renderer import utils as rend_utils
 from pytorch3d.renderer.cameras import CamerasBase
--- a/pytorch3d/implicitron/models/renderer/ray_point_refiner.py
+++ b/pytorch3d/implicitron/models/renderer/ray_point_refiner.py
@@ -11,6 +11,7 @@ import copy
 import torch
 from pytorch3d.implicitron.models.renderer.base import ImplicitronRayBundle
 from pytorch3d.implicitron.tools.config import Configurable, expand_args_fields
+
 from pytorch3d.renderer.implicit.sample_pdf import sample_pdf


--- a/pytorch3d/implicitron/models/renderer/rgb_net.py
+++ b/pytorch3d/implicitron/models/renderer/rgb_net.py
@@ -12,6 +12,7 @@ import torch
 from pytorch3d.implicitron.models.renderer.base import ImplicitronRayBundle
 from pytorch3d.implicitron.tools.config import enable_get_default_args
 from pytorch3d.renderer.implicit import HarmonicEmbedding
+
 from torch import nn


--- a/pytorch3d/implicitron/models/utils.py
+++ b/pytorch3d/implicitron/models/utils.py
@@ -17,8 +17,11 @@ from typing import Any, Dict, Optional, Tuple
 import torch
 import tqdm
 from pytorch3d.common.compat import prod
+
 from pytorch3d.implicitron.models.renderer.base import ImplicitronRayBundle
+
 from pytorch3d.implicitron.tools import image_utils
+
 from pytorch3d.implicitron.tools.utils import cat_dataclass


@@ -80,9 +83,9 @@ def preprocess_input(

    if mask_depths and fg_mask is not None and depth_map is not None:
        # mask the depths
-        assert mask_threshold > 0.0, (
-            "Depths should be masked only with thresholded masks"
-        )
+        assert (
+            mask_threshold > 0.0
+        ), "Depths should be masked only with thresholded masks"
        warnings.warn("Masking depths!")
        depth_map = depth_map * fg_mask

--- a/pytorch3d/implicitron/models/visualization/render_flyaround.py
+++ b/pytorch3d/implicitron/models/visualization/render_flyaround.py
@@ -304,11 +304,11 @@ def _show_predictions(
    assert isinstance(preds, list)

    pred_all = []
-    # Randomly choose a subset of the rendered images, sort by order in the sequence
+    # Randomly choose a subset of the rendered images, sort by ordr in the sequence
    n_samples = min(n_samples, len(preds))
    pred_idx = sorted(random.sample(list(range(len(preds))), n_samples))
    for predi in pred_idx:
-        # Make the concatenation for the same camera vertically
+        # Make the concatentation for the same camera vertically
        pred_all.append(
            torch.cat(
                [
@@ -359,7 +359,7 @@ def _generate_prediction_videos(
    vws = {}
    for k in predicted_keys:
        if k not in preds[0]:
-            logger.warning(f"Cannot generate video for prediction key '{k}'")
+            logger.warn(f"Cannot generate video for prediction key '{k}'")
            continue
        cache_dir = (
            None
--- a/pytorch3d/implicitron/tools/rasterize_mc.py
+++ b/pytorch3d/implicitron/tools/rasterize_mc.py
@@ -10,6 +10,7 @@ import math
 from typing import Optional, Tuple

 import pytorch3d
+
 import torch
 from pytorch3d.ops import packed_to_padded
 from pytorch3d.renderer import PerspectiveCameras
--- a/pytorch3d/implicitron/tools/stats.py
+++ b/pytorch3d/implicitron/tools/stats.py
@@ -499,7 +499,7 @@ class StatsJSONEncoder(json.JSONEncoder):
            return enc
        else:
            raise TypeError(
-                f"Object of type {o.__class__.__name__} is not JSON serializable"
+                f"Object of type {o.__class__.__name__} " f"is not JSON serializable"
            )


--- a/pytorch3d/implicitron/tools/video_writer.py
+++ b/pytorch3d/implicitron/tools/video_writer.py
@@ -17,6 +17,7 @@ import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
+
 from PIL import Image

 _NO_TORCHVISION = False
--- a/pytorch3d/io/obj_io.py
+++ b/pytorch3d/io/obj_io.py
@@ -796,7 +796,7 @@ def save_obj(
        # Create .mtl file with the material name and texture map filename
        # TODO: enable material properties to also be saved.
        with _open_file(mtl_path, path_manager, "w") as f_mtl:
-            lines = f"newmtl mesh\nmap_Kd {output_path.stem}.png\n"
+            lines = f"newmtl mesh\n" f"map_Kd {output_path.stem}.png\n"
            f_mtl.write(lines)


--- a/pytorch3d/loss/__init__.py
+++ b/pytorch3d/loss/__init__.py
@@ -8,8 +8,11 @@


 from .chamfer import chamfer_distance
+
 from .mesh_edge_loss import mesh_edge_loss
+
 from .mesh_laplacian_smoothing import mesh_laplacian_smoothing
+
 from .mesh_normal_consistency import mesh_normal_consistency
 from .point_mesh_distance import point_mesh_edge_distance, point_mesh_face_distance

--- a/pytorch3d/loss/mesh_laplacian_smoothing.py
+++ b/pytorch3d/loss/mesh_laplacian_smoothing.py
@@ -114,7 +114,9 @@ def mesh_laplacian_smoothing(meshes, method: str = "uniform"):
            if method == "cot":
                norm_w = torch.sparse.sum(L, dim=1).to_dense().view(-1, 1)
                idx = norm_w > 0
-                norm_w[idx] = torch.reciprocal(norm_w[idx])
+                # pyre-fixme[58]: `/` is not supported for operand types `float` and
+                #  `Tensor`.
+                norm_w[idx] = 1.0 / norm_w[idx]
            else:
                L_sum = torch.sparse.sum(L, dim=1).to_dense().view(-1, 1)
                norm_w = 0.25 * inv_areas
--- a/pytorch3d/loss/point_mesh_distance.py
+++ b/pytorch3d/loss/point_mesh_distance.py
@@ -6,7 +6,6 @@

 # pyre-unsafe

-import torch
 from pytorch3d import _C
 from pytorch3d.structures import Meshes, Pointclouds
 from torch.autograd import Function
@@ -303,7 +302,8 @@ def point_mesh_edge_distance(meshes: Meshes, pcls: Pointclouds):
    point_to_cloud_idx = pcls.packed_to_cloud_idx()  # (sum(P_i), )
    num_points_per_cloud = pcls.num_points_per_cloud()  # (N,)
    weights_p = num_points_per_cloud.gather(0, point_to_cloud_idx)
-    weights_p = torch.reciprocal(weights_p.float())
+    # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
+    weights_p = 1.0 / weights_p.float()
    point_to_edge = point_to_edge * weights_p
    point_dist = point_to_edge.sum() / N

@@ -377,7 +377,8 @@ def point_mesh_face_distance(
    point_to_cloud_idx = pcls.packed_to_cloud_idx()  # (sum(P_i),)
    num_points_per_cloud = pcls.num_points_per_cloud()  # (N,)
    weights_p = num_points_per_cloud.gather(0, point_to_cloud_idx)
-    weights_p = torch.reciprocal(weights_p.float())
+    # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
+    weights_p = 1.0 / weights_p.float()
    point_to_face = point_to_face * weights_p
    point_dist = point_to_face.sum() / N

--- a/pytorch3d/ops/__init__.py
+++ b/pytorch3d/ops/__init__.py
@@ -8,14 +8,17 @@

 from .ball_query import ball_query
 from .cameras_alignment import corresponding_cameras_alignment
+
 from .cubify import cubify
 from .graph_conv import GraphConv
 from .interp_face_attrs import interpolate_face_attributes
 from .iou_box3d import box3d_overlap
 from .knn import knn_gather, knn_points
 from .laplacian_matrices import cot_laplacian, laplacian, norm_laplacian
+
 from .mesh_face_areas_normals import mesh_face_areas_normals
 from .mesh_filtering import taubin_smoothing
+
 from .packed_to_padded import packed_to_padded, padded_to_packed
 from .perspective_n_points import efficient_pnp
 from .points_alignment import corresponding_points_alignment, iterative_closest_point
@@ -27,7 +30,9 @@ from .points_to_volumes import (
    add_pointclouds_to_volumes,
    add_points_features_to_volume_densities_features,
 )
+
 from .sample_farthest_points import sample_farthest_points
+
 from .sample_points_from_meshes import sample_points_from_meshes
 from .subdivide_meshes import SubdivideMeshes
 from .utils import (
@@ -37,6 +42,7 @@ from .utils import (
    is_pointclouds,
    wmean,
 )
+
 from .vert_align import vert_align


--- a/pytorch3d/ops/ball_query.py
+++ b/pytorch3d/ops/ball_query.py
@@ -23,13 +23,11 @@ class _ball_query(Function):
    """

    @staticmethod
-    def forward(ctx, p1, p2, lengths1, lengths2, K, radius, skip_points_outside_cube):
+    def forward(ctx, p1, p2, lengths1, lengths2, K, radius):
        """
        Arguments defintions the same as in the ball_query function
        """
-        idx, dists = _C.ball_query(
-            p1, p2, lengths1, lengths2, K, radius, skip_points_outside_cube
-        )
+        idx, dists = _C.ball_query(p1, p2, lengths1, lengths2, K, radius)
        ctx.save_for_backward(p1, p2, lengths1, lengths2, idx)
        ctx.mark_non_differentiable(idx)
        return dists, idx
@@ -51,7 +49,7 @@ class _ball_query(Function):
        grad_p1, grad_p2 = _C.knn_points_backward(
            p1, p2, lengths1, lengths2, idx, 2, grad_dists
        )
-        return grad_p1, grad_p2, None, None, None, None, None
+        return grad_p1, grad_p2, None, None, None, None


 def ball_query(
@@ -62,7 +60,6 @@ def ball_query(
    K: int = 500,
    radius: float = 0.2,
    return_nn: bool = True,
-    skip_points_outside_cube: bool = False,
 ):
    """
    Ball Query is an alternative to KNN. It can be
@@ -101,9 +98,6 @@ def ball_query(
            within the radius
        radius: the radius around each point within which the neighbors need to be located
        return_nn: If set to True returns the K neighbor points in p2 for each point in p1.
-        skip_points_outside_cube: If set to True, reduce multiplications of float values
-            by not explicitly calculating distances to points that fall outside the
-            D-cube with side length (2*radius) centered at each point in p1.

    Returns:
        dists: Tensor of shape (N, P1, K) giving the squared distances to
@@ -140,9 +134,7 @@ def ball_query(
    if lengths2 is None:
        lengths2 = torch.full((N,), P2, dtype=torch.int64, device=p1.device)

-    dists, idx = _ball_query.apply(
-        p1, p2, lengths1, lengths2, K, radius, skip_points_outside_cube
-    )
+    dists, idx = _ball_query.apply(p1, p2, lengths1, lengths2, K, radius)

    # Gather the neighbors if needed
    points_nn = masked_gather(p2, idx) if return_nn else None
--- a/Show More
+++ b/Show More