MKL version fix in CI (#1820)

Summary: Fix for the "undefined symbol: iJIT_NotifyEvent" build issue. Pull Request resolved: https://github.com/facebookresearch/pytorch3d/pull/1820 Differential Revision: D58685326
2026-06-17 12:38:53 +08:00 · 2024-06-20 09:24:07 -07:00
252 changed files with 1203 additions and 1981 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -162,6 +162,90 @@ workflows:
    jobs:
      # - main:
      #     context: DOCKERHUB_TOKEN
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda113
+          context: DOCKERHUB_TOKEN
+          cu_version: cu113
+          name: linux_conda_py38_cu113_pyt1120
+          python_version: '3.8'
+          pytorch_version: 1.12.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py38_cu116_pyt1120
+          python_version: '3.8'
+          pytorch_version: 1.12.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda113
+          context: DOCKERHUB_TOKEN
+          cu_version: cu113
+          name: linux_conda_py38_cu113_pyt1121
+          python_version: '3.8'
+          pytorch_version: 1.12.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py38_cu116_pyt1121
+          python_version: '3.8'
+          pytorch_version: 1.12.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py38_cu116_pyt1130
+          python_version: '3.8'
+          pytorch_version: 1.13.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py38_cu117_pyt1130
+          python_version: '3.8'
+          pytorch_version: 1.13.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py38_cu116_pyt1131
+          python_version: '3.8'
+          pytorch_version: 1.13.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py38_cu117_pyt1131
+          python_version: '3.8'
+          pytorch_version: 1.13.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py38_cu117_pyt200
+          python_version: '3.8'
+          pytorch_version: 2.0.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda118
+          context: DOCKERHUB_TOKEN
+          cu_version: cu118
+          name: linux_conda_py38_cu118_pyt200
+          python_version: '3.8'
+          pytorch_version: 2.0.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py38_cu117_pyt201
+          python_version: '3.8'
+          pytorch_version: 2.0.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda118
+          context: DOCKERHUB_TOKEN
+          cu_version: cu118
+          name: linux_conda_py38_cu118_pyt201
+          python_version: '3.8'
+          pytorch_version: 2.0.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -247,33 +331,89 @@ workflows:
          python_version: '3.8'
          pytorch_version: 2.3.1
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
+          conda_docker_image: pytorch/conda-builder:cuda113
          context: DOCKERHUB_TOKEN
-          cu_version: cu118
-          name: linux_conda_py38_cu118_pyt240
-          python_version: '3.8'
-          pytorch_version: 2.4.0
+          cu_version: cu113
+          name: linux_conda_py39_cu113_pyt1120
+          python_version: '3.9'
+          pytorch_version: 1.12.0
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
+          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py38_cu121_pyt240
-          python_version: '3.8'
-          pytorch_version: 2.4.0
+          cu_version: cu116
+          name: linux_conda_py39_cu116_pyt1120
+          python_version: '3.9'
+          pytorch_version: 1.12.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda113
+          context: DOCKERHUB_TOKEN
+          cu_version: cu113
+          name: linux_conda_py39_cu113_pyt1121
+          python_version: '3.9'
+          pytorch_version: 1.12.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py39_cu116_pyt1121
+          python_version: '3.9'
+          pytorch_version: 1.12.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py39_cu116_pyt1130
+          python_version: '3.9'
+          pytorch_version: 1.13.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py39_cu117_pyt1130
+          python_version: '3.9'
+          pytorch_version: 1.13.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py39_cu116_pyt1131
+          python_version: '3.9'
+          pytorch_version: 1.13.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py39_cu117_pyt1131
+          python_version: '3.9'
+          pytorch_version: 1.13.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py39_cu117_pyt200
+          python_version: '3.9'
+          pytorch_version: 2.0.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
-          name: linux_conda_py38_cu118_pyt241
-          python_version: '3.8'
-          pytorch_version: 2.4.1
+          name: linux_conda_py39_cu118_pyt200
+          python_version: '3.9'
+          pytorch_version: 2.0.0
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
+          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py38_cu121_pyt241
-          python_version: '3.8'
-          pytorch_version: 2.4.1
+          cu_version: cu117
+          name: linux_conda_py39_cu117_pyt201
+          python_version: '3.9'
+          pytorch_version: 2.0.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda118
+          context: DOCKERHUB_TOKEN
+          cu_version: cu118
+          name: linux_conda_py39_cu118_pyt201
+          python_version: '3.9'
+          pytorch_version: 2.0.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -359,33 +499,89 @@ workflows:
          python_version: '3.9'
          pytorch_version: 2.3.1
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
+          conda_docker_image: pytorch/conda-builder:cuda113
          context: DOCKERHUB_TOKEN
-          cu_version: cu118
-          name: linux_conda_py39_cu118_pyt240
-          python_version: '3.9'
-          pytorch_version: 2.4.0
+          cu_version: cu113
+          name: linux_conda_py310_cu113_pyt1120
+          python_version: '3.10'
+          pytorch_version: 1.12.0
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
+          conda_docker_image: pytorch/conda-builder:cuda116
          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py39_cu121_pyt240
-          python_version: '3.9'
-          pytorch_version: 2.4.0
+          cu_version: cu116
+          name: linux_conda_py310_cu116_pyt1120
+          python_version: '3.10'
+          pytorch_version: 1.12.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda113
+          context: DOCKERHUB_TOKEN
+          cu_version: cu113
+          name: linux_conda_py310_cu113_pyt1121
+          python_version: '3.10'
+          pytorch_version: 1.12.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py310_cu116_pyt1121
+          python_version: '3.10'
+          pytorch_version: 1.12.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py310_cu116_pyt1130
+          python_version: '3.10'
+          pytorch_version: 1.13.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py310_cu117_pyt1130
+          python_version: '3.10'
+          pytorch_version: 1.13.0
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda116
+          context: DOCKERHUB_TOKEN
+          cu_version: cu116
+          name: linux_conda_py310_cu116_pyt1131
+          python_version: '3.10'
+          pytorch_version: 1.13.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py310_cu117_pyt1131
+          python_version: '3.10'
+          pytorch_version: 1.13.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda117
+          context: DOCKERHUB_TOKEN
+          cu_version: cu117
+          name: linux_conda_py310_cu117_pyt200
+          python_version: '3.10'
+          pytorch_version: 2.0.0
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
          cu_version: cu118
-          name: linux_conda_py39_cu118_pyt241
-          python_version: '3.9'
-          pytorch_version: 2.4.1
+          name: linux_conda_py310_cu118_pyt200
+          python_version: '3.10'
+          pytorch_version: 2.0.0
      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
+          conda_docker_image: pytorch/conda-builder:cuda117
          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py39_cu121_pyt241
-          python_version: '3.9'
-          pytorch_version: 2.4.1
+          cu_version: cu117
+          name: linux_conda_py310_cu117_pyt201
+          python_version: '3.10'
+          pytorch_version: 2.0.1
+      - binary_linux_conda:
+          conda_docker_image: pytorch/conda-builder:cuda118
+          context: DOCKERHUB_TOKEN
+          cu_version: cu118
+          name: linux_conda_py310_cu118_pyt201
+          python_version: '3.10'
+          pytorch_version: 2.0.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -470,34 +666,6 @@ workflows:
          name: linux_conda_py310_cu121_pyt231
          python_version: '3.10'
          pytorch_version: 2.3.1
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
-          context: DOCKERHUB_TOKEN
-          cu_version: cu118
-          name: linux_conda_py310_cu118_pyt240
-          python_version: '3.10'
-          pytorch_version: 2.4.0
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
-          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py310_cu121_pyt240
-          python_version: '3.10'
-          pytorch_version: 2.4.0
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
-          context: DOCKERHUB_TOKEN
-          cu_version: cu118
-          name: linux_conda_py310_cu118_pyt241
-          python_version: '3.10'
-          pytorch_version: 2.4.1
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
-          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py310_cu121_pyt241
-          python_version: '3.10'
-          pytorch_version: 2.4.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -582,34 +750,6 @@ workflows:
          name: linux_conda_py311_cu121_pyt231
          python_version: '3.11'
          pytorch_version: 2.3.1
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
-          context: DOCKERHUB_TOKEN
-          cu_version: cu118
-          name: linux_conda_py311_cu118_pyt240
-          python_version: '3.11'
-          pytorch_version: 2.4.0
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
-          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py311_cu121_pyt240
-          python_version: '3.11'
-          pytorch_version: 2.4.0
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
-          context: DOCKERHUB_TOKEN
-          cu_version: cu118
-          name: linux_conda_py311_cu118_pyt241
-          python_version: '3.11'
-          pytorch_version: 2.4.1
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
-          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py311_cu121_pyt241
-          python_version: '3.11'
-          pytorch_version: 2.4.1
      - binary_linux_conda:
          conda_docker_image: pytorch/conda-builder:cuda118
          context: DOCKERHUB_TOKEN
@@ -652,34 +792,6 @@ workflows:
          name: linux_conda_py312_cu121_pyt231
          python_version: '3.12'
          pytorch_version: 2.3.1
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
-          context: DOCKERHUB_TOKEN
-          cu_version: cu118
-          name: linux_conda_py312_cu118_pyt240
-          python_version: '3.12'
-          pytorch_version: 2.4.0
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
-          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py312_cu121_pyt240
-          python_version: '3.12'
-          pytorch_version: 2.4.0
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda118
-          context: DOCKERHUB_TOKEN
-          cu_version: cu118
-          name: linux_conda_py312_cu118_pyt241
-          python_version: '3.12'
-          pytorch_version: 2.4.1
-      - binary_linux_conda:
-          conda_docker_image: pytorch/conda-builder:cuda121
-          context: DOCKERHUB_TOKEN
-          cu_version: cu121
-          name: linux_conda_py312_cu121_pyt241
-          python_version: '3.12'
-          pytorch_version: 2.4.1
      - binary_linux_conda_cuda:
          name: testrun_conda_cuda_py310_cu117_pyt201
          context: DOCKERHUB_TOKEN
--- a/.circleci/regenerate.py
+++ b/.circleci/regenerate.py
@@ -19,14 +19,18 @@ from packaging import version
 # The CUDA versions which have pytorch conda packages available for linux for each
 # version of pytorch.
 CONDA_CUDA_VERSIONS = {
+    "1.12.0": ["cu113", "cu116"],
+    "1.12.1": ["cu113", "cu116"],
+    "1.13.0": ["cu116", "cu117"],
+    "1.13.1": ["cu116", "cu117"],
+    "2.0.0": ["cu117", "cu118"],
+    "2.0.1": ["cu117", "cu118"],
    "2.1.0": ["cu118", "cu121"],
    "2.1.1": ["cu118", "cu121"],
    "2.1.2": ["cu118", "cu121"],
    "2.2.0": ["cu118", "cu121"],
    "2.2.2": ["cu118", "cu121"],
    "2.3.1": ["cu118", "cu121"],
-    "2.4.0": ["cu118", "cu121"],
-    "2.4.1": ["cu118", "cu121"],
 }


@@ -88,6 +92,7 @@ def workflow_pair(
    upload=False,
    filter_branch,
 ):
+
    w = []
    py = python_version.replace(".", "")
    pyt = pytorch_version.replace(".", "")
@@ -126,6 +131,7 @@ def generate_base_workflow(
    btype,
    filter_branch=None,
 ):
+
    d = {
        "name": base_workflow_name,
        "python_version": python_version,
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,23 +0,0 @@
-name: facebookresearch/pytorch3d/build_and_test
-on:
-  pull_request:
-    branches:
-      - main
-  push:
-    branches:
-      - main
-jobs:
-  binary_linux_conda_cuda:
-    runs-on: 4-core-ubuntu-gpu-t4
-    env:
-      PYTHON_VERSION: "3.12"
-      BUILD_VERSION: "${{ github.run_number }}"
-      PYTORCH_VERSION: "2.4.1"
-      CU_VERSION: "cu121"
-      JUST_TESTRUN: 1
-    steps:
-    - uses: actions/checkout@v4
-    - name: Build and run tests
-      run: |-
-        conda create --name env --yes --quiet conda-build
-        conda run --no-capture-output --name env python3 ./packaging/build_conda.py --use-conda-cuda
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -8,10 +8,11 @@
 The core library is written in PyTorch. Several components have underlying implementation in CUDA for improved performance. A subset of these components have CPU implementations in C++/PyTorch. It is advised to use PyTorch3D with GPU support in order to use all the features.

 - Linux or macOS or Windows
- Python
- PyTorch 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0 or 2.4.1.
+- Python 3.8, 3.9 or 3.10
+- PyTorch 1.12.0, 1.12.1, 1.13.0, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0 or 2.3.1.
 - torchvision that matches the PyTorch installation. You can install them together as explained at pytorch.org to make sure of this.
 - gcc & g++ ≥ 4.9
+- [fvcore](https://github.com/facebookresearch/fvcore)
 - [ioPath](https://github.com/facebookresearch/iopath)
 - If CUDA is to be used, use a version which is supported by the corresponding pytorch version and at least version 9.2.
 - If CUDA older than 11.7 is to be used and you are building from source, the CUB library must be available. We recommend version 1.10.0.
@@ -21,7 +22,7 @@ The runtime dependencies can be installed by running:
 conda create -n pytorch3d python=3.9
 conda activate pytorch3d
 conda install pytorch=1.13.0 torchvision pytorch-cuda=11.6 -c pytorch -c nvidia
-conda install -c iopath iopath
+conda install -c fvcore -c iopath -c conda-forge fvcore iopath
 ```

 For the CUB build time dependency, which you only need if you have CUDA older than 11.7, if you are using conda, you can continue with
@@ -48,7 +49,6 @@ For developing on top of PyTorch3D or contributing, you will need to run the lin
 - tdqm
 - jupyter
 - imageio
- fvcore
 - plotly
 - opencv-python

@@ -59,7 +59,6 @@ conda install jupyter
 pip install scikit-image matplotlib imageio plotly opencv-python

 # Tests/Linting
-conda install -c fvcore -c conda-forge fvcore
 pip install black usort flake8 flake8-bugbear flake8-comprehensions
 ```

@@ -98,7 +97,7 @@ version_str="".join([
    torch.version.cuda.replace(".",""),
    f"_pyt{pyt_version_str}"
 ])
-!pip install iopath
+!pip install fvcore iopath
 !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html
 ```

--- a/dev/linter.sh
+++ b/dev/linter.sh
@@ -10,7 +10,7 @@
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 DIR=$(dirname "${DIR}")

-if [[ -f "${DIR}/BUCK" ]]
+if [[ -f "${DIR}/TARGETS" ]]
 then
  pyfmt "${DIR}"
 else
@@ -36,5 +36,5 @@ then

  echo "Running pyre..."
  echo "To restart/kill pyre server, run 'pyre restart' or 'pyre kill' in fbcode/"
-  ( cd ~/fbsource/fbcode; arc pyre check //vision/fair/pytorch3d/... )
+  ( cd ~/fbsource/fbcode; pyre -l vision/fair/pytorch3d/ )
 fi
--- a/dev/run_tutorials.sh
+++ b/dev/run_tutorials.sh
@@ -23,7 +23,7 @@ conda init bash
 source ~/.bashrc
 conda create -y -n myenv python=3.8 matplotlib ipython ipywidgets nbconvert
 conda activate myenv
-conda install -y -c iopath iopath
+conda install -y -c fvcore -c iopath -c conda-forge fvcore iopath
 conda install -y -c pytorch pytorch=1.6.0 cudatoolkit=10.1 torchvision
 conda install -y -c pytorch3d-nightly pytorch3d
 pip install plotly scikit-image
--- a/docs/examples/pulsar_basic.py
+++ b/docs/examples/pulsar_basic.py
@@ -10,7 +10,6 @@ This example demonstrates the most trivial, direct interface of the pulsar
 sphere renderer. It renders and saves an image with 10 random spheres.
 Output: basic.png.
 """
-
 import logging
 import math
 from os import path
--- a/docs/examples/pulsar_basic_unified.py
+++ b/docs/examples/pulsar_basic_unified.py
@@ -11,7 +11,6 @@ interface for sphere renderering. It renders and saves an image with
 10 random spheres.
 Output: basic-pt3d.png.
 """
-
 import logging
 from os import path

--- a/docs/examples/pulsar_cam.py
+++ b/docs/examples/pulsar_cam.py
@@ -14,7 +14,6 @@ distorted. Gradient-based optimization is used to converge towards the
 original camera parameters.
 Output: cam.gif.
 """
-
 import logging
 import math
 from os import path
--- a/docs/examples/pulsar_cam_unified.py
+++ b/docs/examples/pulsar_cam_unified.py
@@ -14,7 +14,6 @@ distorted. Gradient-based optimization is used to converge towards the
 original camera parameters.
 Output: cam-pt3d.gif
 """
-
 import logging
 from os import path

--- a/docs/examples/pulsar_multiview.py
+++ b/docs/examples/pulsar_multiview.py
@@ -18,7 +18,6 @@ This example is not available yet through the 'unified' interface,
 because opacity support has not landed in PyTorch3D for general data
 structures yet.
 """
-
 import logging
 import math
 from os import path
--- a/docs/examples/pulsar_optimization.py
+++ b/docs/examples/pulsar_optimization.py
@@ -13,7 +13,6 @@ The scene is initialized with random spheres. Gradient-based
 optimization is used to converge towards a faithful
 scene representation.
 """
-
 import logging
 import math

--- a/docs/examples/pulsar_optimization_unified.py
+++ b/docs/examples/pulsar_optimization_unified.py
@@ -13,7 +13,6 @@ The scene is initialized with random spheres. Gradient-based
 optimization is used to converge towards a faithful
 scene representation.
 """
-
 import logging
 import math

--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -5,6 +5,7 @@ sphinx_rtd_theme
 sphinx_markdown_tables
 numpy
 iopath
+fvcore
 https://download.pytorch.org/whl/cpu/torchvision-0.15.2%2Bcpu-cp311-cp311-linux_x86_64.whl
 https://download.pytorch.org/whl/cpu/torch-2.0.1%2Bcpu-cp311-cp311-linux_x86_64.whl
 omegaconf
--- a/docs/tutorials/bundle_adjustment.ipynb
+++ b/docs/tutorials/bundle_adjustment.ipynb
@@ -96,7 +96,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/camera_position_optimization_with_differentiable_rendering.ipynb
+++ b/docs/tutorials/camera_position_optimization_with_differentiable_rendering.ipynb
@@ -83,7 +83,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/dataloaders_ShapeNetCore_R2N2.ipynb
+++ b/docs/tutorials/dataloaders_ShapeNetCore_R2N2.ipynb
@@ -58,7 +58,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/deform_source_mesh_to_target_mesh.ipynb
+++ b/docs/tutorials/deform_source_mesh_to_target_mesh.ipynb
@@ -97,7 +97,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/fit_simple_neural_radiance_field.ipynb
+++ b/docs/tutorials/fit_simple_neural_radiance_field.ipynb
@@ -63,7 +63,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/fit_textured_mesh.ipynb
+++ b/docs/tutorials/fit_textured_mesh.ipynb
@@ -75,7 +75,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/fit_textured_volume.ipynb
+++ b/docs/tutorials/fit_textured_volume.ipynb
@@ -54,7 +54,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/implicitron_config_system.ipynb
+++ b/docs/tutorials/implicitron_config_system.ipynb
@@ -85,7 +85,7 @@
        "        torch.version.cuda.replace(\".\",\"\"),\n",
        "        f\"_pyt{pyt_version_str}\"\n",
        "    ])\n",
-        "    !pip install iopath\n",
+        "    !pip install fvcore iopath\n",
        "    if sys.platform.startswith(\"linux\"):\n",
        "        print(\"Trying to install wheel for PyTorch3D\")\n",
        "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/implicitron_volumes.ipynb
+++ b/docs/tutorials/implicitron_volumes.ipynb
@@ -79,7 +79,7 @@
        "        torch.version.cuda.replace(\".\",\"\"),\n",
        "        f\"_pyt{pyt_version_str}\"\n",
        "    ])\n",
-        "    !pip install iopath\n",
+        "    !pip install fvcore iopath\n",
        "    if sys.platform.startswith(\"linux\"):\n",
        "        print(\"Trying to install wheel for PyTorch3D\")\n",
        "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/render_colored_points.ipynb
+++ b/docs/tutorials/render_colored_points.ipynb
@@ -57,7 +57,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/render_densepose.ipynb
+++ b/docs/tutorials/render_densepose.ipynb
@@ -64,7 +64,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/docs/tutorials/render_textured_meshes.ipynb
+++ b/docs/tutorials/render_textured_meshes.ipynb
@@ -80,7 +80,7 @@
    "        torch.version.cuda.replace(\".\",\"\"),\n",
    "        f\"_pyt{pyt_version_str}\"\n",
    "    ])\n",
-    "    !pip install iopath\n",
+    "    !pip install fvcore iopath\n",
    "    if sys.platform.startswith(\"linux\"):\n",
    "        print(\"Trying to install wheel for PyTorch3D\")\n",
    "        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html\n",
--- a/packaging/build_conda.py
+++ b/packaging/build_conda.py
@@ -4,11 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-import argparse
 import os.path
 import runpy
 import subprocess
-from typing import List, Tuple
+from typing import List

 # required env vars:
 # CU_VERSION: E.g. cu112
@@ -24,7 +23,7 @@ pytorch_major_minor = tuple(int(i) for i in PYTORCH_VERSION.split(".")[:2])
 source_root_dir = os.environ["PWD"]


-def version_constraint(version) -> str:
+def version_constraint(version):
    """
    Given version "11.3" returns " >=11.3,<11.4"
    """
@@ -33,7 +32,7 @@ def version_constraint(version) -> str:
    return f" >={version},<{upper}"


-def get_cuda_major_minor() -> Tuple[str, str]:
+def get_cuda_major_minor():
    if CU_VERSION == "cpu":
        raise ValueError("fn only for cuda builds")
    if len(CU_VERSION) != 5 or CU_VERSION[:2] != "cu":
@@ -43,10 +42,11 @@ def get_cuda_major_minor() -> Tuple[str, str]:
    return major, minor


-def setup_cuda(use_conda_cuda: bool) -> List[str]:
+def setup_cuda():
    if CU_VERSION == "cpu":
-        return []
+        return
    major, minor = get_cuda_major_minor()
+    os.environ["CUDA_HOME"] = f"/usr/local/cuda-{major}.{minor}/"
    os.environ["FORCE_CUDA"] = "1"

    basic_nvcc_flags = (
@@ -75,15 +75,6 @@ def setup_cuda(use_conda_cuda: bool) -> List[str]:

    if os.environ.get("JUST_TESTRUN", "0") != "1":
        os.environ["NVCC_FLAGS"] = nvcc_flags
-    if use_conda_cuda:
-        os.environ["CONDA_CUDA_TOOLKIT_BUILD_CONSTRAINT1"] = "- cuda-toolkit"
-        os.environ["CONDA_CUDA_TOOLKIT_BUILD_CONSTRAINT2"] = (
-            f"- cuda-version={major}.{minor}"
-        )
-        return ["-c", f"nvidia/label/cuda-{major}.{minor}.0"]
-    else:
-        os.environ["CUDA_HOME"] = f"/usr/local/cuda-{major}.{minor}/"
-        return []


 def setup_conda_pytorch_constraint() -> List[str]:
@@ -104,7 +95,7 @@ def setup_conda_pytorch_constraint() -> List[str]:
        return ["-c", "pytorch", "-c", "nvidia"]


-def setup_conda_cudatoolkit_constraint() -> None:
+def setup_conda_cudatoolkit_constraint():
    if CU_VERSION == "cpu":
        os.environ["CONDA_CPUONLY_FEATURE"] = "- cpuonly"
        os.environ["CONDA_CUDATOOLKIT_CONSTRAINT"] = ""
@@ -125,14 +116,14 @@ def setup_conda_cudatoolkit_constraint() -> None:
    os.environ["CONDA_CUDATOOLKIT_CONSTRAINT"] = toolkit


-def do_build(start_args: List[str]) -> None:
+def do_build(start_args: List[str]):
    args = start_args.copy()

    test_flag = os.environ.get("TEST_FLAG")
    if test_flag is not None:
        args.append(test_flag)

-    args.extend(["-c", "bottler", "-c", "iopath", "-c", "conda-forge"])
+    args.extend(["-c", "bottler", "-c", "fvcore", "-c", "iopath", "-c", "conda-forge"])
    args.append("--no-anaconda-upload")
    args.extend(["--python", os.environ["PYTHON_VERSION"]])
    args.append("packaging/pytorch3d")
@@ -141,16 +132,8 @@ def do_build(start_args: List[str]) -> None:


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Build the conda package.")
-    parser.add_argument(
-        "--use-conda-cuda",
-        action="store_true",
-        help="get cuda from conda ignoring local cuda",
-    )
-    our_args = parser.parse_args()
-
    args = ["conda", "build"]
-    args += setup_cuda(use_conda_cuda=our_args.use_conda_cuda)
+    setup_cuda()

    init_path = source_root_dir + "/pytorch3d/__init__.py"
    build_version = runpy.run_path(init_path)["__version__"]
--- a/packaging/linux_wheels/README.md
+++ b/packaging/linux_wheels/README.md
@@ -26,6 +26,6 @@ version_str="".join([
    torch.version.cuda.replace(".",""),
    f"_pyt{pyt_version_str}"
 ])
-!pip install iopath
+!pip install fvcore iopath
 !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html
 ```
--- a/packaging/linux_wheels/inside.sh
+++ b/packaging/linux_wheels/inside.sh
@@ -144,7 +144,7 @@ do
            conda activate "$tag"
            # shellcheck disable=SC2086
            conda install -y -c pytorch $extra_channel "pytorch=$pytorch_version" "$cudatools=$CUDA_TAG"
-            pip install iopath
+            pip install fvcore iopath
            echo "python version" "$python_version" "pytorch version" "$pytorch_version" "cuda version" "$cu_version" "tag" "$tag"

            rm -rf dist
--- a/packaging/pytorch3d/meta.yaml
+++ b/packaging/pytorch3d/meta.yaml
@@ -8,13 +8,10 @@ source:
 requirements:
  build:
    - {{ compiler('c') }} # [win]
-    {{ environ.get('CONDA_CUDA_TOOLKIT_BUILD_CONSTRAINT1', '') }}
-    {{ environ.get('CONDA_CUDA_TOOLKIT_BUILD_CONSTRAINT2', '') }}
    {{ environ.get('CONDA_CUB_CONSTRAINT') }}

  host:
    - python
-    - mkl =2023  # [x86_64]
    {{ environ.get('SETUPTOOLS_CONSTRAINT') }}
    {{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT') }}
    {{ environ.get('CONDA_PYTORCH_MKL_CONSTRAINT') }}
@@ -25,7 +22,7 @@ requirements:
    - python
    - numpy >=1.11
    - torchvision >=0.5
-    - mkl =2023  # [x86_64]
+    - fvcore
    - iopath
    {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}
@@ -51,11 +48,8 @@ test:
    - imageio
    - hydra-core
    - accelerate
-    - matplotlib
-    - tabulate
-    - pandas
-    - sqlalchemy
  commands:
+    #pytest .
    python -m unittest discover -v -s tests -t .


--- a/projects/implicitron_trainer/experiment.py
+++ b/projects/implicitron_trainer/experiment.py
@@ -7,7 +7,7 @@

 # pyre-unsafe

-""" "
+""""
 This file is the entry point for launching experiments with Implicitron.

 Launch Training
@@ -44,7 +44,6 @@ The outputs of the experiment are saved and logged in multiple ways:
        config file.

 """
-
 import logging
 import os
 import warnings
@@ -100,7 +99,7 @@ except ModuleNotFoundError:
 no_accelerate = os.environ.get("PYTORCH3D_NO_ACCELERATE") is not None


-class Experiment(Configurable):
+class Experiment(Configurable):  # pyre-ignore: 13
    """
    This class is at the top level of Implicitron's config hierarchy. Its
    members are high-level components necessary for training an implicit rende-
@@ -121,16 +120,12 @@ class Experiment(Configurable):
            will be saved here.
    """

-    # pyre-fixme[13]: Attribute `data_source` is never initialized.
    data_source: DataSourceBase
    data_source_class_type: str = "ImplicitronDataSource"
-    # pyre-fixme[13]: Attribute `model_factory` is never initialized.
    model_factory: ModelFactoryBase
    model_factory_class_type: str = "ImplicitronModelFactory"
-    # pyre-fixme[13]: Attribute `optimizer_factory` is never initialized.
    optimizer_factory: OptimizerFactoryBase
    optimizer_factory_class_type: str = "ImplicitronOptimizerFactory"
-    # pyre-fixme[13]: Attribute `training_loop` is never initialized.
    training_loop: TrainingLoopBase
    training_loop_class_type: str = "ImplicitronTrainingLoop"

--- a/projects/implicitron_trainer/impl/model_factory.py
+++ b/projects/implicitron_trainer/impl/model_factory.py
@@ -26,6 +26,7 @@ logger = logging.getLogger(__name__)


 class ModelFactoryBase(ReplaceableBase):
+
    resume: bool = True  # resume from the last checkpoint

    def __call__(self, **kwargs) -> ImplicitronModelBase:
@@ -44,7 +45,7 @@ class ModelFactoryBase(ReplaceableBase):


@registry.register
-class ImplicitronModelFactory(ModelFactoryBase):
+class ImplicitronModelFactory(ModelFactoryBase):  # pyre-ignore [13]
    """
    A factory class that initializes an implicit rendering model.

@@ -60,7 +61,6 @@ class ImplicitronModelFactory(ModelFactoryBase):

    """

-    # pyre-fixme[13]: Attribute `model` is never initialized.
    model: ImplicitronModelBase
    model_class_type: str = "GenericModel"
    resume: bool = True
@@ -115,9 +115,7 @@ class ImplicitronModelFactory(ModelFactoryBase):
                        "cuda:%d" % 0: "cuda:%d" % accelerator.local_process_index
                    }
                model_state_dict = torch.load(
-                    model_io.get_model_path(model_path),
-                    map_location=map_location,
-                    weights_only=True,
+                    model_io.get_model_path(model_path), map_location=map_location
                )

                try:
--- a/projects/implicitron_trainer/impl/optimizer_factory.py
+++ b/projects/implicitron_trainer/impl/optimizer_factory.py
@@ -123,7 +123,6 @@ class ImplicitronOptimizerFactory(OptimizerFactoryBase):
        """
        # Get the parameters to optimize
        if hasattr(model, "_get_param_groups"):  # use the model function
-            # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
            p_groups = model._get_param_groups(self.lr, wd=self.weight_decay)
        else:
            p_groups = [
@@ -242,7 +241,7 @@ class ImplicitronOptimizerFactory(OptimizerFactoryBase):
                    map_location = {
                        "cuda:%d" % 0: "cuda:%d" % accelerator.local_process_index
                    }
-                optimizer_state = torch.load(opt_path, map_location, weights_only=True)
+                optimizer_state = torch.load(opt_path, map_location)
            else:
                raise FileNotFoundError(f"Optimizer state {opt_path} does not exist.")
        return optimizer_state
--- a/projects/implicitron_trainer/impl/training_loop.py
+++ b/projects/implicitron_trainer/impl/training_loop.py
@@ -30,13 +30,13 @@ from .utils import seed_all_random_engines
 logger = logging.getLogger(__name__)


+# pyre-fixme[13]: Attribute `evaluator` is never initialized.
 class TrainingLoopBase(ReplaceableBase):
    """
    Members:
        evaluator: An EvaluatorBase instance, used to evaluate training results.
    """

-    # pyre-fixme[13]: Attribute `evaluator` is never initialized.
    evaluator: Optional[EvaluatorBase]
    evaluator_class_type: Optional[str] = "ImplicitronEvaluator"

@@ -161,6 +161,7 @@ class ImplicitronTrainingLoop(TrainingLoopBase):
        for epoch in range(start_epoch, self.max_epochs):
            # automatic new_epoch and plotting of stats at every epoch start
            with stats:
+
                # Make sure to re-seed random generators to ensure reproducibility
                # even after restart.
                seed_all_random_engines(seed + epoch)
@@ -394,7 +395,6 @@ class ImplicitronTrainingLoop(TrainingLoopBase):
            ):
                prefix = f"e{stats.epoch}_it{stats.it[trainmode]}"
                if hasattr(model, "visualize"):
-                    # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
                    model.visualize(
                        viz,
                        visdom_env_imgs,
--- a/projects/implicitron_trainer/tests/test_experiment.py
+++ b/projects/implicitron_trainer/tests/test_experiment.py
@@ -53,8 +53,12 @@ class TestExperiment(unittest.TestCase):
        cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_class_type = (
            "JsonIndexDatasetMapProvider"
        )
-        dataset_args = cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_JsonIndexDatasetMapProvider_args
-        dataloader_args = cfg.data_source_ImplicitronDataSource_args.data_loader_map_provider_SequenceDataLoaderMapProvider_args
+        dataset_args = (
+            cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_JsonIndexDatasetMapProvider_args
+        )
+        dataloader_args = (
+            cfg.data_source_ImplicitronDataSource_args.data_loader_map_provider_SequenceDataLoaderMapProvider_args
+        )
        dataset_args.category = "skateboard"
        dataset_args.test_restrict_sequence_id = 0
        dataset_args.dataset_root = "manifold://co3d/tree/extracted"
@@ -90,8 +94,12 @@ class TestExperiment(unittest.TestCase):
        cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_class_type = (
            "JsonIndexDatasetMapProvider"
        )
-        dataset_args = cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_JsonIndexDatasetMapProvider_args
-        dataloader_args = cfg.data_source_ImplicitronDataSource_args.data_loader_map_provider_SequenceDataLoaderMapProvider_args
+        dataset_args = (
+            cfg.data_source_ImplicitronDataSource_args.dataset_map_provider_JsonIndexDatasetMapProvider_args
+        )
+        dataloader_args = (
+            cfg.data_source_ImplicitronDataSource_args.data_loader_map_provider_SequenceDataLoaderMapProvider_args
+        )
        dataset_args.category = "skateboard"
        dataset_args.test_restrict_sequence_id = 0
        dataset_args.dataset_root = "manifold://co3d/tree/extracted"
@@ -103,7 +111,9 @@ class TestExperiment(unittest.TestCase):
        cfg.training_loop_ImplicitronTrainingLoop_args.max_epochs = 2
        cfg.training_loop_ImplicitronTrainingLoop_args.store_checkpoints = False
        cfg.optimizer_factory_ImplicitronOptimizerFactory_args.lr_policy = "Exponential"
-        cfg.optimizer_factory_ImplicitronOptimizerFactory_args.exponential_lr_step_size = 2
+        cfg.optimizer_factory_ImplicitronOptimizerFactory_args.exponential_lr_step_size = (
+            2
+        )

        if DEBUG:
            experiment.dump_cfg(cfg)
--- a/projects/implicitron_trainer/tests/test_optimizer_factory.py
+++ b/projects/implicitron_trainer/tests/test_optimizer_factory.py
@@ -81,9 +81,8 @@ class TestOptimizerFactory(unittest.TestCase):

    def test_param_overrides_self_param_group_assignment(self):
        pa, pb, pc = [torch.nn.Parameter(data=torch.tensor(i * 1.0)) for i in range(3)]
-        na, nb = (
-            Node(params=[pa]),
-            Node(params=[pb], param_groups={"self": "pb_self", "p1": "pb_param"}),
+        na, nb = Node(params=[pa]), Node(
+            params=[pb], param_groups={"self": "pb_self", "p1": "pb_param"}
        )
        root = Node(children=[na, nb], params=[pc], param_groups={"m1": "pb_member"})
        param_groups = self._get_param_groups(root)
--- a/projects/nerf/nerf/dataset.py
+++ b/projects/nerf/nerf/dataset.py
@@ -84,9 +84,9 @@ def get_nerf_datasets(

    if autodownload and any(not os.path.isfile(p) for p in (cameras_path, image_path)):
        # Automatically download the data files if missing.
-        download_data([dataset_name], data_root=data_root)
+        download_data((dataset_name,), data_root=data_root)

-    train_data = torch.load(cameras_path, weights_only=True)
+    train_data = torch.load(cameras_path)
    n_cameras = train_data["cameras"]["R"].shape[0]

    _image_max_image_pixels = Image.MAX_IMAGE_PIXELS
--- a/projects/nerf/nerf/stats.py
+++ b/projects/nerf/nerf/stats.py
@@ -194,6 +194,7 @@ class Stats:
        it = self.it[stat_set]

        for stat in self.log_vars:
+
            if stat not in self.stats[stat_set]:
                self.stats[stat_set][stat] = AverageMeter()

--- a/projects/nerf/test_nerf.py
+++ b/projects/nerf/test_nerf.py
@@ -24,6 +24,7 @@ CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs"

@hydra.main(config_path=CONFIG_DIR, config_name="lego")
 def main(cfg: DictConfig):
+
    # Device on which to run.
    if torch.cuda.is_available():
        device = "cuda"
@@ -62,7 +63,7 @@ def main(cfg: DictConfig):
        raise ValueError(f"Model checkpoint {checkpoint_path} does not exist!")

    print(f"Loading checkpoint {checkpoint_path}.")
-    loaded_data = torch.load(checkpoint_path, weights_only=True)
+    loaded_data = torch.load(checkpoint_path)
    # Do not load the cached xy grid.
    # - this allows setting an arbitrary evaluation image size.
    state_dict = {
--- a/projects/nerf/tests/test_raysampler.py
+++ b/projects/nerf/tests/test_raysampler.py
@@ -42,6 +42,7 @@ class TestRaysampler(unittest.TestCase):
        cameras, rays = [], []

        for _ in range(batch_size):
+
            R = random_rotations(1)
            T = torch.randn(1, 3)
            focal_length = torch.rand(1, 2) + 0.5
--- a/projects/nerf/train_nerf.py
+++ b/projects/nerf/train_nerf.py
@@ -25,6 +25,7 @@ CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs"

@hydra.main(config_path=CONFIG_DIR, config_name="lego")
 def main(cfg: DictConfig):
+
    # Set the relevant seeds for reproducibility.
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)
@@ -76,7 +77,7 @@ def main(cfg: DictConfig):
        # Resume training if requested.
        if cfg.resume and os.path.isfile(checkpoint_path):
            print(f"Resuming from checkpoint {checkpoint_path}.")
-            loaded_data = torch.load(checkpoint_path, weights_only=True)
+            loaded_data = torch.load(checkpoint_path)
            model.load_state_dict(loaded_data["model"])
            stats = pickle.loads(loaded_data["stats"])
            print(f"   => resuming from epoch {stats.epoch}.")
@@ -218,6 +219,7 @@ def main(cfg: DictConfig):

        # Validation
        if epoch % cfg.validation_epoch_interval == 0 and epoch > 0:
+
            # Sample a validation camera/image.
            val_batch = next(val_dataloader.__iter__())
            val_image, val_camera, camera_idx = val_batch[0].values()
--- a/pytorch3d/__init__.py
+++ b/pytorch3d/__init__.py
@@ -6,4 +6,4 @@

 # pyre-unsafe

-__version__ = "0.7.9"
+__version__ = "0.7.6"
--- a/pytorch3d/common/compat.py
+++ b/pytorch3d/common/compat.py
@@ -17,7 +17,7 @@ Some functions which depend on PyTorch or Python versions.


 def meshgrid_ij(
-    *A: Union[torch.Tensor, Sequence[torch.Tensor]],
+    *A: Union[torch.Tensor, Sequence[torch.Tensor]]
 ) -> Tuple[torch.Tensor, ...]:  # pragma: no cover
    """
    Like torch.meshgrid was before PyTorch 1.10.0, i.e. with indexing set to ij
--- a/pytorch3d/csrc/ball_query/ball_query.cu
+++ b/pytorch3d/csrc/ball_query/ball_query.cu
@@ -32,9 +32,7 @@ __global__ void BallQueryKernel(
    at::PackedTensorAccessor64<int64_t, 3, at::RestrictPtrTraits> idxs,
    at::PackedTensorAccessor64<scalar_t, 3, at::RestrictPtrTraits> dists,
    const int64_t K,
-    const float radius,
-    const float radius2,
-    const bool skip_points_outside_cube) {
+    const float radius2) {
  const int64_t N = p1.size(0);
  const int64_t chunks_per_cloud = (1 + (p1.size(1) - 1) / blockDim.x);
  const int64_t chunks_to_do = N * chunks_per_cloud;
@@ -53,19 +51,7 @@ __global__ void BallQueryKernel(
    // Iterate over points in p2 until desired count is reached or
    // all points have been considered
    for (int64_t j = 0, count = 0; j < lengths2[n] && count < K; ++j) {
-      if (skip_points_outside_cube) {
-        bool is_within_radius = true;
-        // Filter when any one coordinate is already outside the radius
-        for (int d = 0; is_within_radius && d < D; ++d) {
-          scalar_t abs_diff = fabs(p1[n][i][d] - p2[n][j][d]);
-          is_within_radius = (abs_diff <= radius);
-        }
-        if (!is_within_radius) {
-          continue;
-        }
-      }
-
-      // Else, calculate the distance between the points and compare
+      // Calculate the distance between the points
      scalar_t dist2 = 0.0;
      for (int d = 0; d < D; ++d) {
        scalar_t diff = p1[n][i][d] - p2[n][j][d];
@@ -91,8 +77,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
    const at::Tensor& lengths1, // (N,)
    const at::Tensor& lengths2, // (N,)
    int K,
-    float radius,
-    bool skip_points_outside_cube) {
+    float radius) {
  // Check inputs are on the same device
  at::TensorArg p1_t{p1, "p1", 1}, p2_t{p2, "p2", 2},
      lengths1_t{lengths1, "lengths1", 3}, lengths2_t{lengths2, "lengths2", 4};
@@ -135,9 +120,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
            idxs.packed_accessor64<int64_t, 3, at::RestrictPtrTraits>(),
            dists.packed_accessor64<float, 3, at::RestrictPtrTraits>(),
            K_64,
-            radius,
-            radius2,
-            skip_points_outside_cube);
+            radius2);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
--- a/pytorch3d/csrc/ball_query/ball_query.h
+++ b/pytorch3d/csrc/ball_query/ball_query.h
@@ -25,9 +25,6 @@
 //      within the radius
 //    radius: the radius around each point within which the neighbors need to be
 //      located
-//    skip_points_outside_cube: If true, reduce multiplications of float values
-//      by not explicitly calculating distances to points that fall outside the
-//      D-cube with side length (2*radius) centered at each point in p1.
 //
 // Returns:
 //    p1_neighbor_idx: LongTensor of shape (N, P1, K), where
@@ -49,8 +46,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    const int K,
-    const float radius,
-    const bool skip_points_outside_cube);
+    const float radius);

 // CUDA implementation
 std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
@@ -59,8 +55,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCuda(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    const int K,
-    const float radius,
-    const bool skip_points_outside_cube);
+    const float radius);

 // Implementation which is exposed
 // Note: the backward pass reuses the KNearestNeighborBackward kernel
@@ -70,8 +65,7 @@ inline std::tuple<at::Tensor, at::Tensor> BallQuery(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    int K,
-    float radius,
-    bool skip_points_outside_cube) {
+    float radius) {
  if (p1.is_cuda() || p2.is_cuda()) {
 #ifdef WITH_CUDA
    CHECK_CUDA(p1);
@@ -82,20 +76,16 @@ inline std::tuple<at::Tensor, at::Tensor> BallQuery(
        lengths1.contiguous(),
        lengths2.contiguous(),
        K,
-        radius,
-        skip_points_outside_cube);
+        radius);
 #else
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(p1);
-  CHECK_CPU(p2);
  return BallQueryCpu(
      p1.contiguous(),
      p2.contiguous(),
      lengths1.contiguous(),
      lengths2.contiguous(),
      K,
-      radius,
-      skip_points_outside_cube);
+      radius);
 }
--- a/pytorch3d/csrc/ball_query/ball_query_cpu.cpp
+++ b/pytorch3d/csrc/ball_query/ball_query_cpu.cpp
@@ -6,8 +6,8 @@
 * LICENSE file in the root directory of this source tree.
 */

-#include <math.h>
 #include <torch/extension.h>
+#include <queue>
 #include <tuple>

 std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
@@ -16,8 +16,7 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
    const at::Tensor& lengths1,
    const at::Tensor& lengths2,
    int K,
-    float radius,
-    bool skip_points_outside_cube) {
+    float radius) {
  const int N = p1.size(0);
  const int P1 = p1.size(1);
  const int D = p1.size(2);
@@ -39,16 +38,6 @@ std::tuple<at::Tensor, at::Tensor> BallQueryCpu(
    const int64_t length2 = lengths2_a[n];
    for (int64_t i = 0; i < length1; ++i) {
      for (int64_t j = 0, count = 0; j < length2 && count < K; ++j) {
-        if (skip_points_outside_cube) {
-          bool is_within_radius = true;
-          for (int d = 0; is_within_radius && d < D; ++d) {
-            float abs_diff = fabs(p1_a[n][i][d] - p2_a[n][j][d]);
-            is_within_radius = (abs_diff <= radius);
-          }
-          if (!is_within_radius) {
-            continue;
-          }
-        }
        float dist2 = 0;
        for (int d = 0; d < D; ++d) {
          float diff = p1_a[n][i][d] - p2_a[n][j][d];
--- a/pytorch3d/csrc/blending/sigmoid_alpha_blend.h
+++ b/pytorch3d/csrc/blending/sigmoid_alpha_blend.h
@@ -98,11 +98,6 @@ at::Tensor SigmoidAlphaBlendBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(distances);
-  CHECK_CPU(pix_to_face);
-  CHECK_CPU(alphas);
-  CHECK_CPU(grad_alphas);
-
  return SigmoidAlphaBlendBackwardCpu(
      grad_alphas, alphas, distances, pix_to_face, sigma);
 }
--- a/pytorch3d/csrc/compositing/alpha_composite.cu
+++ b/pytorch3d/csrc/compositing/alpha_composite.cu
@@ -28,16 +28,17 @@ __global__ void alphaCompositeCudaForwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
+  const int64_t batch_size = result.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Iterate over each feature in each pixel
  for (int pid = tid; pid < num_pixels; pid += num_threads) {
@@ -78,16 +79,17 @@ __global__ void alphaCompositeCudaBackwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
+  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
--- a/pytorch3d/csrc/compositing/alpha_composite.h
+++ b/pytorch3d/csrc/compositing/alpha_composite.h
@@ -74,9 +74,6 @@ torch::Tensor alphaCompositeForward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
    return alphaCompositeCpuForward(features, alphas, points_idx);
  }
 }
@@ -104,11 +101,6 @@ std::tuple<torch::Tensor, torch::Tensor> alphaCompositeBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(grad_outputs);
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
-
    return alphaCompositeCpuBackward(
        grad_outputs, features, alphas, points_idx);
  }
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.cu
@@ -28,16 +28,17 @@ __global__ void weightedSumNormCudaForwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
+  const int64_t batch_size = result.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
@@ -91,16 +92,17 @@ __global__ void weightedSumNormCudaBackwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
+  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * W * H;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.h
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.h
@@ -73,10 +73,6 @@ torch::Tensor weightedSumNormForward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
-
    return weightedSumNormCpuForward(features, alphas, points_idx);
  }
 }
@@ -104,11 +100,6 @@ std::tuple<torch::Tensor, torch::Tensor> weightedSumNormBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(grad_outputs);
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
-
    return weightedSumNormCpuBackward(
        grad_outputs, features, alphas, points_idx);
  }
--- a/pytorch3d/csrc/compositing/weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/weighted_sum.cu
@@ -26,16 +26,17 @@ __global__ void weightedSumCudaForwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
+  const int64_t batch_size = result.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Parallelize over each feature in each pixel in images of size H * W,
  // for each image in the batch of size batch_size
@@ -73,16 +74,17 @@ __global__ void weightedSumCudaBackwardKernel(
    const at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const at::PackedTensorAccessor64<int64_t, 4, at::RestrictPtrTraits> points_idx) {
  // clang-format on
+  const int64_t batch_size = points_idx.size(0);
  const int64_t C = features.size(0);
  const int64_t H = points_idx.size(2);
  const int64_t W = points_idx.size(3);

  // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;

  const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  // Iterate over each pixel to compute the contribution to the
  // gradient for the features and weights
--- a/pytorch3d/csrc/compositing/weighted_sum.h
+++ b/pytorch3d/csrc/compositing/weighted_sum.h
@@ -72,9 +72,6 @@ torch::Tensor weightedSumForward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
    return weightedSumCpuForward(features, alphas, points_idx);
  }
 }
@@ -101,11 +98,6 @@ std::tuple<torch::Tensor, torch::Tensor> weightedSumBackward(
    AT_ERROR("Not compiled with GPU support");
 #endif
  } else {
-    CHECK_CPU(grad_outputs);
-    CHECK_CPU(features);
-    CHECK_CPU(alphas);
-    CHECK_CPU(points_idx);
-
    return weightedSumCpuBackward(grad_outputs, features, alphas, points_idx);
  }
 }
--- a/pytorch3d/csrc/ext.cpp
+++ b/pytorch3d/csrc/ext.cpp
@@ -8,6 +8,7 @@

 // clang-format off
 #include "./pulsar/global.h" // Include before <torch/extension.h>.
+#include <torch/extension.h>
 // clang-format on
 #include "./pulsar/pytorch/renderer.h"
 #include "./pulsar/pytorch/tensor_util.h"
@@ -98,23 +99,21 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("marching_cubes", &MarchingCubes);

  // Pulsar.
-  // Pulsar not enabled on AMD.
 #ifdef PULSAR_LOGGING_ENABLED
  c10::ShowLogInfoToStderr();
 #endif
  py::class_<
      pulsar::pytorch::Renderer,
      std::shared_ptr<pulsar::pytorch::Renderer>>(m, "PulsarRenderer")
-      .def(
-          py::init<
-              const uint&,
-              const uint&,
-              const uint&,
-              const bool&,
-              const bool&,
-              const float&,
-              const uint&,
-              const uint&>())
+      .def(py::init<
+           const uint&,
+           const uint&,
+           const uint&,
+           const bool&,
+           const bool&,
+           const float&,
+           const uint&,
+           const uint&>())
      .def(
          "__eq__",
          [](const pulsar::pytorch::Renderer& a,
@@ -149,10 +148,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
          py::arg("gamma"),
          py::arg("max_depth"),
          py::arg("min_depth") /* = 0.f*/,
-          py::arg("bg_col") /* = std::nullopt not exposed properly in
-                               pytorch 1.1. */
+          py::arg(
+              "bg_col") /* = at::nullopt not exposed properly in pytorch 1.1. */
          ,
-          py::arg("opacity") /* = std::nullopt ... */,
+          py::arg("opacity") /* = at::nullopt ... */,
          py::arg("percent_allowed_difference") = 0.01f,
          py::arg("max_n_hits") = MAX_UINT,
          py::arg("mode") = 0)
--- a/pytorch3d/csrc/face_areas_normals/face_areas_normals.h
+++ b/pytorch3d/csrc/face_areas_normals/face_areas_normals.h
@@ -60,8 +60,6 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(verts);
-  CHECK_CPU(faces);
  return FaceAreasNormalsForwardCpu(verts, faces);
 }

@@ -82,9 +80,5 @@ at::Tensor FaceAreasNormalsBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(grad_areas);
-  CHECK_CPU(grad_normals);
-  CHECK_CPU(verts);
-  CHECK_CPU(faces);
  return FaceAreasNormalsBackwardCpu(grad_areas, grad_normals, verts, faces);
 }
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.cu
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
@@ -20,14 +20,14 @@ __global__ void GatherScatterCudaKernel(
    const size_t V,
    const size_t D,
    const size_t E) {
-  const auto tid = threadIdx.x;
+  const int tid = threadIdx.x;

  // Reverse the vertex order if backward.
  const int v0_idx = backward ? 1 : 0;
  const int v1_idx = backward ? 0 : 1;

  // Edges are split evenly across the blocks.
-  for (auto e = blockIdx.x; e < E; e += gridDim.x) {
+  for (int e = blockIdx.x; e < E; e += gridDim.x) {
    // Get indices of vertices which form the edge.
    const int64_t v0 = edges[2 * e + v0_idx];
    const int64_t v1 = edges[2 * e + v1_idx];
@@ -35,7 +35,7 @@ __global__ void GatherScatterCudaKernel(
    // Split vertex features evenly across threads.
    // This implementation will be quite wasteful when D<128 since there will be
    // a lot of threads doing nothing.
-    for (auto d = tid; d < D; d += blockDim.x) {
+    for (int d = tid; d < D; d += blockDim.x) {
      const float val = input[v1 * D + d];
      float* address = output + v0 * D + d;
      atomicAdd(address, val);
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.h
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.h
@@ -53,7 +53,5 @@ at::Tensor GatherScatter(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(input);
-  CHECK_CPU(edges);
  return GatherScatterCpu(input, edges, directed, backward);
 }
--- a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
+++ b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
@@ -20,8 +20,8 @@ __global__ void InterpFaceAttrsForwardKernel(
    const size_t P,
    const size_t F,
    const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
  for (int pd = tid; pd < P * D; pd += num_threads) {
    const int p = pd / D;
    const int d = pd % D;
@@ -93,8 +93,8 @@ __global__ void InterpFaceAttrsBackwardKernel(
    const size_t P,
    const size_t F,
    const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
  for (int pd = tid; pd < P * D; pd += num_threads) {
    const int p = pd / D;
    const int d = pd % D;
--- a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.h
+++ b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.h
@@ -57,8 +57,6 @@ at::Tensor InterpFaceAttrsForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(face_attrs);
-  CHECK_CPU(barycentric_coords);
  return InterpFaceAttrsForwardCpu(pix_to_face, barycentric_coords, face_attrs);
 }

@@ -108,9 +106,6 @@ std::tuple<at::Tensor, at::Tensor> InterpFaceAttrsBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(face_attrs);
-  CHECK_CPU(barycentric_coords);
-  CHECK_CPU(grad_pix_attrs);
  return InterpFaceAttrsBackwardCpu(
      pix_to_face, barycentric_coords, face_attrs, grad_pix_attrs);
 }
--- a/pytorch3d/csrc/iou_box3d/iou_box3d.h
+++ b/pytorch3d/csrc/iou_box3d/iou_box3d.h
@@ -44,7 +44,5 @@ inline std::tuple<at::Tensor, at::Tensor> IoUBox3D(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(boxes1);
-  CHECK_CPU(boxes2);
  return IoUBox3DCpu(boxes1.contiguous(), boxes2.contiguous());
 }
--- a/pytorch3d/csrc/iou_box3d/iou_box3d_cpu.cpp
+++ b/pytorch3d/csrc/iou_box3d/iou_box3d_cpu.cpp
@@ -7,7 +7,10 @@
 */

 #include <torch/extension.h>
+#include <torch/torch.h>
 #include <list>
+#include <numeric>
+#include <queue>
 #include <tuple>
 #include "iou_box3d/iou_utils.h"

--- a/pytorch3d/csrc/iou_box3d/iou_utils.cuh
+++ b/pytorch3d/csrc/iou_box3d/iou_utils.cuh
@@ -461,8 +461,10 @@ __device__ inline std::tuple<float3, float3> ArgMaxVerts(
 __device__ inline bool IsCoplanarTriTri(
    const FaceVerts& tri1,
    const FaceVerts& tri2) {
+  const float3 tri1_ctr = FaceCenter({tri1.v0, tri1.v1, tri1.v2});
  const float3 tri1_n = FaceNormal({tri1.v0, tri1.v1, tri1.v2});

+  const float3 tri2_ctr = FaceCenter({tri2.v0, tri2.v1, tri2.v2});
  const float3 tri2_n = FaceNormal({tri2.v0, tri2.v1, tri2.v2});

  // Check if parallel
@@ -498,6 +500,7 @@ __device__ inline bool IsCoplanarTriPlane(
    const FaceVerts& tri,
    const FaceVerts& plane,
    const float3& normal) {
+  const float3 tri_ctr = FaceCenter({tri.v0, tri.v1, tri.v2});
  const float3 nt = FaceNormal({tri.v0, tri.v1, tri.v2});

  // check if parallel
@@ -725,7 +728,7 @@ __device__ inline int BoxIntersections(
      }
    }
    // Update the face_verts_out tris
-    num_tris = min(MAX_TRIS, offset);
+    num_tris = offset;
    for (int j = 0; j < num_tris; ++j) {
      face_verts_out[j] = tri_verts_updated[j];
    }
--- a/pytorch3d/csrc/knn/knn.h
+++ b/pytorch3d/csrc/knn/knn.h
@@ -74,8 +74,6 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdx(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(p1);
-  CHECK_CPU(p2);
  return KNearestNeighborIdxCpu(p1, p2, lengths1, lengths2, norm, K);
 }

@@ -142,8 +140,6 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(p1);
-  CHECK_CPU(p2);
  return KNearestNeighborBackwardCpu(
      p1, p2, lengths1, lengths2, idxs, norm, grad_dists);
 }
--- a/pytorch3d/csrc/marching_cubes/marching_cubes.h
+++ b/pytorch3d/csrc/marching_cubes/marching_cubes.h
@@ -58,6 +58,5 @@ inline std::tuple<at::Tensor, at::Tensor, at::Tensor> MarchingCubes(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(vol);
  return MarchingCubesCpu(vol.contiguous(), isolevel);
 }
--- a/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.h
+++ b/pytorch3d/csrc/packed_to_padded_tensor/packed_to_padded_tensor.h
@@ -88,8 +88,6 @@ at::Tensor PackedToPadded(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(inputs_packed);
-  CHECK_CPU(first_idxs);
  return PackedToPaddedCpu(inputs_packed, first_idxs, max_size);
 }

@@ -107,7 +105,5 @@ at::Tensor PaddedToPacked(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(inputs_padded);
-  CHECK_CPU(first_idxs);
  return PaddedToPackedCpu(inputs_padded, first_idxs, num_inputs);
 }
--- a/pytorch3d/csrc/point_mesh/point_mesh_cpu.cpp
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cpu.cpp
@@ -174,8 +174,8 @@ std::tuple<at::Tensor, at::Tensor> HullHullDistanceForwardCpu(
  at::Tensor idxs = at::zeros({A_N,}, as_first_idx.options());
  // clang-format on

-  auto as_a = as.accessor<float, H1 == 1 ? 2 : 3>();
-  auto bs_a = bs.accessor<float, H2 == 1 ? 2 : 3>();
+  auto as_a = as.accessor < float, H1 == 1 ? 2 : 3 > ();
+  auto bs_a = bs.accessor < float, H2 == 1 ? 2 : 3 > ();
  auto as_first_idx_a = as_first_idx.accessor<int64_t, 1>();
  auto bs_first_idx_a = bs_first_idx.accessor<int64_t, 1>();
  auto dists_a = dists.accessor<float, 1>();
@@ -230,10 +230,10 @@ std::tuple<at::Tensor, at::Tensor> HullHullDistanceBackwardCpu(
  at::Tensor grad_as = at::zeros_like(as);
  at::Tensor grad_bs = at::zeros_like(bs);

-  auto as_a = as.accessor<float, H1 == 1 ? 2 : 3>();
-  auto bs_a = bs.accessor<float, H2 == 1 ? 2 : 3>();
-  auto grad_as_a = grad_as.accessor<float, H1 == 1 ? 2 : 3>();
-  auto grad_bs_a = grad_bs.accessor<float, H2 == 1 ? 2 : 3>();
+  auto as_a = as.accessor < float, H1 == 1 ? 2 : 3 > ();
+  auto bs_a = bs.accessor < float, H2 == 1 ? 2 : 3 > ();
+  auto grad_as_a = grad_as.accessor < float, H1 == 1 ? 2 : 3 > ();
+  auto grad_bs_a = grad_bs.accessor < float, H2 == 1 ? 2 : 3 > ();
  auto idx_bs_a = idx_bs.accessor<int64_t, 1>();
  auto grad_dists_a = grad_dists.accessor<float, 1>();

--- a/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
@@ -110,7 +110,7 @@ __global__ void DistanceForwardKernel(
    __syncthreads();

    // Perform reduction in shared memory.
-    for (auto s = blockDim.x / 2; s > 32; s >>= 1) {
+    for (int s = blockDim.x / 2; s > 32; s >>= 1) {
      if (tid < s) {
        if (min_dists[tid] > min_dists[tid + s]) {
          min_dists[tid] = min_dists[tid + s];
@@ -502,8 +502,8 @@ __global__ void PointFaceArrayForwardKernel(
  const float3* tris_f3 = (float3*)tris;

  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int t_i = tid; t_i < P * T; t_i += num_threads) {
    const int t = t_i / P; // segment index.
@@ -576,8 +576,8 @@ __global__ void PointFaceArrayBackwardKernel(
  const float3* tris_f3 = (float3*)tris;

  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int t_i = tid; t_i < P * T; t_i += num_threads) {
    const int t = t_i / P; // triangle index.
@@ -683,8 +683,8 @@ __global__ void PointEdgeArrayForwardKernel(
  float3* segms_f3 = (float3*)segms;

  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int t_i = tid; t_i < P * S; t_i += num_threads) {
    const int s = t_i / P; // segment index.
@@ -752,8 +752,8 @@ __global__ void PointEdgeArrayBackwardKernel(
  float3* segms_f3 = (float3*)segms;

  // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  for (int t_i = tid; t_i < P * S; t_i += num_threads) {
    const int s = t_i / P; // segment index.
--- a/pytorch3d/csrc/point_mesh/point_mesh_cuda.h
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cuda.h
@@ -88,10 +88,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(points_first_idx);
-  CHECK_CPU(tris);
-  CHECK_CPU(tris_first_idx);
  return PointFaceDistanceForwardCpu(
      points, points_first_idx, tris, tris_first_idx, min_triangle_area);
 }
@@ -147,10 +143,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(tris);
-  CHECK_CPU(idx_points);
-  CHECK_CPU(grad_dists);
  return PointFaceDistanceBackwardCpu(
      points, tris, idx_points, grad_dists, min_triangle_area);
 }
@@ -229,10 +221,6 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(points_first_idx);
-  CHECK_CPU(tris);
-  CHECK_CPU(tris_first_idx);
  return FacePointDistanceForwardCpu(
      points, points_first_idx, tris, tris_first_idx, min_triangle_area);
 }
@@ -289,10 +277,6 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(tris);
-  CHECK_CPU(idx_tris);
-  CHECK_CPU(grad_dists);
  return FacePointDistanceBackwardCpu(
      points, tris, idx_tris, grad_dists, min_triangle_area);
 }
@@ -362,10 +346,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(points_first_idx);
-  CHECK_CPU(segms);
-  CHECK_CPU(segms_first_idx);
  return PointEdgeDistanceForwardCpu(
      points, points_first_idx, segms, segms_first_idx, max_points);
 }
@@ -416,10 +396,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(segms);
-  CHECK_CPU(idx_points);
-  CHECK_CPU(grad_dists);
  return PointEdgeDistanceBackwardCpu(points, segms, idx_points, grad_dists);
 }

@@ -488,10 +464,6 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(points_first_idx);
-  CHECK_CPU(segms);
-  CHECK_CPU(segms_first_idx);
  return EdgePointDistanceForwardCpu(
      points, points_first_idx, segms, segms_first_idx, max_segms);
 }
@@ -542,10 +514,6 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(segms);
-  CHECK_CPU(idx_segms);
-  CHECK_CPU(grad_dists);
  return EdgePointDistanceBackwardCpu(points, segms, idx_segms, grad_dists);
 }

@@ -599,8 +567,6 @@ torch::Tensor PointFaceArrayDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(tris);
  return PointFaceArrayDistanceForwardCpu(points, tris, min_triangle_area);
 }

@@ -647,9 +613,6 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(tris);
-  CHECK_CPU(grad_dists);
  return PointFaceArrayDistanceBackwardCpu(
      points, tris, grad_dists, min_triangle_area);
 }
@@ -698,8 +661,6 @@ torch::Tensor PointEdgeArrayDistanceForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(segms);
  return PointEdgeArrayDistanceForwardCpu(points, segms);
 }

@@ -742,8 +703,5 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeArrayDistanceBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points);
-  CHECK_CPU(segms);
-  CHECK_CPU(grad_dists);
  return PointEdgeArrayDistanceBackwardCpu(points, segms, grad_dists);
 }
--- a/pytorch3d/csrc/points_to_volumes/points_to_volumes.h
+++ b/pytorch3d/csrc/points_to_volumes/points_to_volumes.h
@@ -104,12 +104,6 @@ inline void PointsToVolumesForward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points_3d);
-  CHECK_CPU(points_features);
-  CHECK_CPU(volume_densities);
-  CHECK_CPU(volume_features);
-  CHECK_CPU(grid_sizes);
-  CHECK_CPU(mask);
  PointsToVolumesForwardCpu(
      points_3d,
      points_features,
@@ -189,14 +183,6 @@ inline void PointsToVolumesBackward(
    AT_ERROR("Not compiled with GPU support.");
 #endif
  }
-  CHECK_CPU(points_3d);
-  CHECK_CPU(points_features);
-  CHECK_CPU(grid_sizes);
-  CHECK_CPU(mask);
-  CHECK_CPU(grad_volume_densities);
-  CHECK_CPU(grad_volume_features);
-  CHECK_CPU(grad_points_3d);
-  CHECK_CPU(grad_points_features);
  PointsToVolumesBackwardCpu(
      points_3d,
      points_features,
--- a/pytorch3d/csrc/points_to_volumes/points_to_volumes_cpu.cpp
+++ b/pytorch3d/csrc/points_to_volumes/points_to_volumes_cpu.cpp
@@ -8,7 +8,9 @@

 #include <torch/csrc/autograd/VariableTypeUtils.h>
 #include <torch/extension.h>
+#include <algorithm>
 #include <cmath>
+#include <thread>
 #include <vector>

 // In the x direction, the location {0, ..., grid_size_x - 1} correspond to
--- a/pytorch3d/csrc/pulsar/cuda/README.md
+++ b/pytorch3d/csrc/pulsar/cuda/README.md
--- a/pytorch3d/csrc/pulsar/cuda/commands.h
+++ b/pytorch3d/csrc/pulsar/cuda/commands.h
@@ -59,11 +59,6 @@ getLastCudaError(const char* errorMessage, const char* file, const int line) {
 #define SHARED __shared__
 #define ACTIVEMASK() __activemask()
 #define BALLOT(mask, val) __ballot_sync((mask), val)
-
-/* TODO (ROCM-6.2): None of the WARP_* are used anywhere and ROCM-6.2 natively
- * supports __shfl_*. Disabling until the move to ROCM-6.2.
- */
-#if !defined(USE_ROCM)
 /**
 * Find the cumulative sum within a warp up to the current
 * thread lane, with each mask thread contributing base.
@@ -120,7 +115,6 @@ INLINE DEVICE float3 WARP_SUM_FLOAT3(
  ret.z = WARP_SUM(group, mask, base.z);
  return ret;
 }
-#endif //! USE_ROCM

 // Floating point.
 // #define FMUL(a, b) __fmul_rn((a), (b))
@@ -148,7 +142,6 @@ INLINE DEVICE float3 WARP_SUM_FLOAT3(
 #define FMA(x, y, z) __fmaf_rn((x), (y), (z))
 #define I2F(a) __int2float_rn(a)
 #define FRCP(x) __frcp_rn(x)
-#if !defined(USE_ROCM)
 __device__ static float atomicMax(float* address, float val) {
  int* address_as_i = (int*)address;
  int old = *address_as_i, assumed;
@@ -173,7 +166,6 @@ __device__ static float atomicMin(float* address, float val) {
  } while (assumed != old);
  return __int_as_float(old);
 }
-#endif //! USE_ROCM
 #define DMAX(a, b) FMAX(a, b)
 #define DMIN(a, b) FMIN(a, b)
 #define DSQRT(a) sqrt(a)
@@ -417,7 +409,7 @@ __device__ static float atomicMin(float* address, float val) {
      (OUT_PTR),              \
      (NUM_SELECTED_PTR),     \
      (NUM_ITEMS),            \
-      (STREAM));
+      stream = (STREAM));

 #define COPY_HOST_DEV(PTR_D, PTR_H, TYPE, SIZE) \
  HANDLECUDA(cudaMemcpy(                        \
--- a/pytorch3d/csrc/pulsar/cuda/renderer.backward.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.backward.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.backward_dbg.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.backward_dbg.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.calc_gradients.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.calc_gradients.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.calc_signature.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.calc_signature.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.construct.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.construct.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.create_selector.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.create_selector.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.destruct.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.destruct.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.fill_bg.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.fill_bg.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.forward.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.forward.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.norm_cam_gradients.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.norm_cam_gradients.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.norm_sphere_gradients.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.norm_sphere_gradients.gpu.cu
--- a/pytorch3d/csrc/pulsar/cuda/renderer.render.gpu.cu
+++ b/pytorch3d/csrc/pulsar/cuda/renderer.render.gpu.cu
--- a/pytorch3d/csrc/pulsar/global.h
+++ b/pytorch3d/csrc/pulsar/global.h
@@ -15,8 +15,8 @@
 #endif

 #if defined(_WIN64) || defined(_WIN32)
-using uint = unsigned int;
-using ushort = unsigned short;
+#define uint unsigned int
+#define ushort unsigned short
 #endif

 #include "./logging.h" // <- include before torch/extension.h
@@ -36,13 +36,11 @@ using ushort = unsigned short;
 #pragma nv_diag_suppress 2951
 #pragma nv_diag_suppress 2967
 #else
-#if !defined(USE_ROCM)
 #pragma diag_suppress = attribute_not_allowed
 #pragma diag_suppress = 1866
 #pragma diag_suppress = 2941
 #pragma diag_suppress = 2951
 #pragma diag_suppress = 2967
-#endif //! USE_ROCM
 #endif
 #else // __CUDACC__
 #define INLINE inline
@@ -58,9 +56,7 @@ using ushort = unsigned short;
 #pragma clang diagnostic pop
 #ifdef WITH_CUDA
 #include <ATen/cuda/CUDAContext.h>
-#if !defined(USE_ROCM)
 #include <vector_functions.h>
-#endif //! USE_ROCM
 #else
 #ifndef cudaStream_t
 typedef void* cudaStream_t;
--- a/pytorch3d/csrc/pulsar/host/commands.h
+++ b/pytorch3d/csrc/pulsar/host/commands.h
@@ -357,11 +357,11 @@ void MAX_WS(
 //
 //
 #define END_PARALLEL() \
-  end_parallel:;       \
+  end_parallel :;      \
  }
 #define END_PARALLEL_NORET() }
 #define END_PARALLEL_2D() \
-  end_parallel:;          \
+  end_parallel :;         \
  }                       \
  }
 #define END_PARALLEL_2D_NORET() \
--- a/pytorch3d/csrc/pulsar/include/camera.device.h
+++ b/pytorch3d/csrc/pulsar/include/camera.device.h
@@ -14,7 +14,7 @@
 #include "./commands.h"

 namespace pulsar {
-IHD CamGradInfo::CamGradInfo(int x) {
+IHD CamGradInfo::CamGradInfo() {
  cam_pos = make_float3(0.f, 0.f, 0.f);
  pixel_0_0_center = make_float3(0.f, 0.f, 0.f);
  pixel_dir_x = make_float3(0.f, 0.f, 0.f);
--- a/pytorch3d/csrc/pulsar/include/camera.h
+++ b/pytorch3d/csrc/pulsar/include/camera.h
@@ -63,13 +63,18 @@ inline bool operator==(const CamInfo& a, const CamInfo& b) {
 };

 struct CamGradInfo {
-  HOST DEVICE CamGradInfo(int = 0);
+  HOST DEVICE CamGradInfo();
  float3 cam_pos;
  float3 pixel_0_0_center;
  float3 pixel_dir_x;
  float3 pixel_dir_y;
 };

+// TODO: remove once https://github.com/NVlabs/cub/issues/172 is resolved.
+struct IntWrapper {
+  int val;
+};
+
 } // namespace pulsar

 #endif
--- a/pytorch3d/csrc/pulsar/include/commands.h
+++ b/pytorch3d/csrc/pulsar/include/commands.h
@@ -24,7 +24,7 @@
 // #pragma diag_suppress = 68
 #include <ATen/cuda/CUDAContext.h>
 // #pragma pop
-#include "../gpu/commands.h"
+#include "../cuda/commands.h"
 #else
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Weverything"
--- a/pytorch3d/csrc/pulsar/include/math.h
+++ b/pytorch3d/csrc/pulsar/include/math.h
@@ -46,7 +46,6 @@ IHD float3 outer_product_sum(const float3& a) {
 }

 // TODO: put intrinsics here.
-#if !defined(USE_ROCM)
 IHD float3 operator+(const float3& a, const float3& b) {
  return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
 }
@@ -94,7 +93,6 @@ IHD float3 operator*(const float3& a, const float3& b) {
 IHD float3 operator*(const float& a, const float3& b) {
  return b * a;
 }
-#endif //! USE_ROCM

 INLINE DEVICE float length(const float3& v) {
  // TODO: benchmark what's faster.
@@ -149,6 +147,11 @@ IHD CamGradInfo operator*(const CamGradInfo& a, const float& b) {
  return res;
 }

+IHD IntWrapper operator+(const IntWrapper& a, const IntWrapper& b) {
+  IntWrapper res;
+  res.val = a.val + b.val;
+  return res;
+}
 } // namespace pulsar

 #endif
--- a/pytorch3d/csrc/pulsar/include/renderer.backward.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.backward.device.h
@@ -155,8 +155,8 @@ void backward(
        stream);
    CHECKLAUNCH();
    SUM_WS(
-        self->ids_sorted_d,
-        self->n_grad_contributions_d,
+        (IntWrapper*)(self->ids_sorted_d),
+        (IntWrapper*)(self->n_grad_contributions_d),
        static_cast<int>(num_balls),
        self->workspace_d,
        self->workspace_size,
--- a/pytorch3d/csrc/pulsar/include/renderer.construct.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.construct.device.h
@@ -52,7 +52,7 @@ HOST void construct(
  self->cam.film_width = width;
  self->cam.film_height = height;
  self->max_num_balls = max_num_balls;
-  MALLOC(self->result_d, float, width * height * n_channels);
+  MALLOC(self->result_d, float, width* height* n_channels);
  self->cam.orthogonal_projection = orthogonal_projection;
  self->cam.right_handed = right_handed_system;
  self->cam.background_normalization_depth = background_normalization_depth;
@@ -93,7 +93,7 @@ HOST void construct(
  MALLOC(self->di_sorted_d, DrawInfo, max_num_balls);
  MALLOC(self->region_flags_d, char, max_num_balls);
  MALLOC(self->num_selected_d, size_t, 1);
-  MALLOC(self->forw_info_d, float, width * height * (3 + 2 * n_track));
+  MALLOC(self->forw_info_d, float, width* height * (3 + 2 * n_track));
  MALLOC(self->min_max_pixels_d, IntersectInfo, 1);
  MALLOC(self->grad_pos_d, float3, max_num_balls);
  MALLOC(self->grad_col_d, float, max_num_balls* n_channels);
--- a/pytorch3d/csrc/pulsar/include/renderer.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.h
@@ -255,7 +255,7 @@ GLOBAL void calc_signature(
 * for every iteration through the loading loop every thread could add a
 * 'hit' to the buffer.
 */
-#define RENDER_BUFFER_SIZE RENDER_BLOCK_SIZE * RENDER_BLOCK_SIZE * 2
+#define RENDER_BUFFER_SIZE RENDER_BLOCK_SIZE* RENDER_BLOCK_SIZE * 2
 /**
 * The threshold after which the spheres that are in the render buffer
 * are rendered and the buffer is flushed.
--- a/pytorch3d/csrc/pulsar/include/renderer.render.device.h
+++ b/pytorch3d/csrc/pulsar/include/renderer.render.device.h
@@ -283,15 +283,9 @@ GLOBAL void render(
          (percent_allowed_difference > 0.f &&
           max_closest_possible_intersection > depth_threshold) ||
          tracker.get_n_hits() >= max_n_hits;
-#if defined(__CUDACC__) && defined(__HIP_PLATFORM_AMD__)
-      unsigned long long warp_done = __ballot(done);
-      int warp_done_bit_cnt = __popcll(warp_done);
-#else
      uint warp_done = thread_warp.ballot(done);
-      int warp_done_bit_cnt = POPC(warp_done);
-#endif //__CUDACC__ && __HIP_PLATFORM_AMD__
      if (thread_warp.thread_rank() == 0)
-        ATOMICADD_B(&n_pixels_done, warp_done_bit_cnt);
+        ATOMICADD_B(&n_pixels_done, POPC(warp_done));
      // This sync is necessary to keep n_loaded until all threads are done with
      // painting.
      thread_block.sync();
--- a/pytorch3d/csrc/pulsar/pytorch/renderer.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/renderer.cpp
@@ -213,8 +213,8 @@ std::tuple<size_t, size_t, bool, torch::Tensor> Renderer::arg_check(
    const float& gamma,
    const float& max_depth,
    float& min_depth,
-    const std::optional<torch::Tensor>& bg_col,
-    const std::optional<torch::Tensor>& opacity,
+    const c10::optional<torch::Tensor>& bg_col,
+    const c10::optional<torch::Tensor>& opacity,
    const float& percent_allowed_difference,
    const uint& max_n_hits,
    const uint& mode) {
@@ -668,8 +668,8 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
    const float& gamma,
    const float& max_depth,
    float min_depth,
-    const std::optional<torch::Tensor>& bg_col,
-    const std::optional<torch::Tensor>& opacity,
+    const c10::optional<torch::Tensor>& bg_col,
+    const c10::optional<torch::Tensor>& opacity,
    const float& percent_allowed_difference,
    const uint& max_n_hits,
    const uint& mode) {
@@ -888,14 +888,14 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
 };

 std::tuple<
-    std::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
-    std::optional<torch::Tensor>,
-    std::optional<torch::Tensor>>
+    at::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
+    at::optional<torch::Tensor>,
+    at::optional<torch::Tensor>>
 Renderer::backward(
    const torch::Tensor& grad_im,
    const torch::Tensor& image,
@@ -912,8 +912,8 @@ Renderer::backward(
    const float& gamma,
    const float& max_depth,
    float min_depth,
-    const std::optional<torch::Tensor>& bg_col,
-    const std::optional<torch::Tensor>& opacity,
+    const c10::optional<torch::Tensor>& bg_col,
+    const c10::optional<torch::Tensor>& opacity,
    const float& percent_allowed_difference,
    const uint& max_n_hits,
    const uint& mode,
@@ -922,7 +922,7 @@ Renderer::backward(
    const bool& dif_rad,
    const bool& dif_cam,
    const bool& dif_opy,
-    const std::optional<std::pair<uint, uint>>& dbg_pos) {
+    const at::optional<std::pair<uint, uint>>& dbg_pos) {
  this->ensure_on_device(this->device_tracker.device());
  size_t batch_size;
  size_t n_points;
@@ -1045,14 +1045,14 @@ Renderer::backward(
  }
  // Prepare the return value.
  std::tuple<
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>>
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>>
      ret;
  if (mode == 1 || (!dif_pos && !dif_col && !dif_rad && !dif_cam && !dif_opy)) {
    return ret;
--- a/pytorch3d/csrc/pulsar/pytorch/renderer.h
+++ b/pytorch3d/csrc/pulsar/pytorch/renderer.h
@@ -44,21 +44,21 @@ struct Renderer {
      const float& gamma,
      const float& max_depth,
      float min_depth,
-      const std::optional<torch::Tensor>& bg_col,
-      const std::optional<torch::Tensor>& opacity,
+      const c10::optional<torch::Tensor>& bg_col,
+      const c10::optional<torch::Tensor>& opacity,
      const float& percent_allowed_difference,
      const uint& max_n_hits,
      const uint& mode);

  std::tuple<
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>,
-      std::optional<torch::Tensor>>
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>,
+      at::optional<torch::Tensor>>
  backward(
      const torch::Tensor& grad_im,
      const torch::Tensor& image,
@@ -75,8 +75,8 @@ struct Renderer {
      const float& gamma,
      const float& max_depth,
      float min_depth,
-      const std::optional<torch::Tensor>& bg_col,
-      const std::optional<torch::Tensor>& opacity,
+      const c10::optional<torch::Tensor>& bg_col,
+      const c10::optional<torch::Tensor>& opacity,
      const float& percent_allowed_difference,
      const uint& max_n_hits,
      const uint& mode,
@@ -85,7 +85,7 @@ struct Renderer {
      const bool& dif_rad,
      const bool& dif_cam,
      const bool& dif_opy,
-      const std::optional<std::pair<uint, uint>>& dbg_pos);
+      const at::optional<std::pair<uint, uint>>& dbg_pos);

  // Infrastructure.
  /**
@@ -115,8 +115,8 @@ struct Renderer {
      const float& gamma,
      const float& max_depth,
      float& min_depth,
-      const std::optional<torch::Tensor>& bg_col,
-      const std::optional<torch::Tensor>& opacity,
+      const c10::optional<torch::Tensor>& bg_col,
+      const c10::optional<torch::Tensor>& opacity,
      const float& percent_allowed_difference,
      const uint& max_n_hits,
      const uint& mode);
--- a/pytorch3d/csrc/pulsar/pytorch/tensor_util.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/tensor_util.cpp
@@ -8,7 +8,6 @@

 #ifdef WITH_CUDA
 #include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAException.h>
 #include <cuda_runtime_api.h>
 #endif
 #include <torch/extension.h>
@@ -34,13 +33,13 @@ torch::Tensor sphere_ids_from_result_info_nograd(
          .contiguous();
  if (forw_info.device().type() == c10::DeviceType::CUDA) {
 #ifdef WITH_CUDA
-    C10_CUDA_CHECK(cudaMemcpyAsync(
+    cudaMemcpyAsync(
        result.data_ptr(),
        tmp.data_ptr(),
        sizeof(uint32_t) * tmp.size(0) * tmp.size(1) * tmp.size(2) *
            tmp.size(3),
        cudaMemcpyDeviceToDevice,
-        at::cuda::getCurrentCUDAStream()));
+        at::cuda::getCurrentCUDAStream());
 #else
    throw std::runtime_error(
        "Copy on CUDA device initiated but built "
--- a/pytorch3d/csrc/pulsar/pytorch/util.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/util.cpp
@@ -7,7 +7,6 @@
 */

 #ifdef WITH_CUDA
-#include <c10/cuda/CUDAException.h>
 #include <cuda_runtime_api.h>

 namespace pulsar {
@@ -18,8 +17,7 @@ void cudaDevToDev(
    const void* src,
    const int& size,
    const cudaStream_t& stream) {
-  C10_CUDA_CHECK(
-      cudaMemcpyAsync(trg, src, size, cudaMemcpyDeviceToDevice, stream));
+  cudaMemcpyAsync(trg, src, size, cudaMemcpyDeviceToDevice, stream);
 }

 void cudaDevToHost(
@@ -27,8 +25,7 @@ void cudaDevToHost(
    const void* src,
    const int& size,
    const cudaStream_t& stream) {
-  C10_CUDA_CHECK(
-      cudaMemcpyAsync(trg, src, size, cudaMemcpyDeviceToHost, stream));
+  cudaMemcpyAsync(trg, src, size, cudaMemcpyDeviceToHost, stream);
 }

 } // namespace pytorch
--- a/pytorch3d/csrc/pulsar/warnings.cpp
+++ b/pytorch3d/csrc/pulsar/warnings.cpp
@@ -6,6 +6,9 @@
 * LICENSE file in the root directory of this source tree.
 */

+#include "./global.h"
+#include "./logging.h"
+
 /**
 * A compilation unit to provide warnings about the code and avoid
 * repeated messages.
--- a/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
+++ b/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
@@ -25,7 +25,7 @@ class BitMask {

  // Use all threads in the current block to clear all bits of this BitMask
  __device__ void block_clear() {
-    for (auto i = threadIdx.x; i < H * W * D; i += blockDim.x) {
+    for (int i = threadIdx.x; i < H * W * D; i += blockDim.x) {
      data[i] = 0;
    }
    __syncthreads();
--- a/Show More
+++ b/Show More