From d220ee2f66659c828422cb55344973e64fc33a55 Mon Sep 17 00:00:00 2001
From: Jeremy Reizenstein <reizenstein@fb.com>
Date: Tue, 10 Nov 2020 09:36:29 -0800
Subject: [PATCH] pulsar build and CI changes

Summary:
Changes to CI and some minor fixes now that pulsar is part of pytorch3d. Most significantly, add CUB to CI builds.

Make CUB_HOME override the CUB already in cudatoolkit (important for cuda11.0, which uses cub 1.9.9, a version pulsar doesn't work well with).
Make imageio available for testing.
Lint fixes.
Fix some test verbosity.
Avoid use of atomicAdd_block on older GPUs.

Reviewed By: nikhilaravi, classner

Differential Revision: D24773716

fbshipit-source-id: 2428356bb2e62735f2bc0c15cbe4cff35b1b24b8
---
 .circleci/config.in.yml                      | 24 +++++++++++++-----
 .circleci/config.yml                         | 26 +++++++++++++++++---
 docs/examples/pulsar_basic.py                |  2 +-
 docs/examples/pulsar_cam_unified.py          |  2 +-
 docs/examples/pulsar_multiview.py            |  2 +-
 docs/examples/pulsar_optimization.py         |  2 +-
 docs/examples/pulsar_optimization_unified.py |  2 +-
 packaging/build_conda.sh                     |  2 +-
 packaging/build_wheel.sh                     |  1 +
 packaging/pkg_helpers.bash                   | 22 +++++++++++++++++
 packaging/pytorch3d/meta.yaml                |  4 ++-
 pytorch3d/csrc/pulsar/cuda/commands.h        |  2 +-
 setup.py                                     | 22 +++++++++++------
 tests/test_build.py                          |  1 -
 14 files changed, 88 insertions(+), 26 deletions(-)

diff --git a/.circleci/config.in.yml b/.circleci/config.in.yml
index 7360aa17..00288912 100644
--- a/.circleci/config.in.yml
+++ b/.circleci/config.in.yml
@@ -68,14 +68,26 @@ jobs:
     steps:
       - checkout
       - <<: *setupcuda
-      - run: pip3 install --progress-bar off wheel matplotlib 'pillow<7'
+      - run: pip3 install --progress-bar off imageio wheel matplotlib 'pillow<7'
       - run: pip3 install --progress-bar off torch torchvision
       # - run: conda create -p ~/conda_env python=3.7 numpy
       # - run: conda activate ~/conda_env
       # - run: conda install -c pytorch pytorch torchvision
 
       - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/fvcore'
-      - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64 python3 setup.py build_ext --inplace
+      - run:
+          name: get cub
+          command: |
+            cd ..
+            wget --no-verbose https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
+            tar xzf 1.10.0.tar.gz
+            # This expands to a directory called cub-1.10.0
+      - run:
+          name: build
+          command: |
+            export LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64
+            export CUB_HOME=$(realpath ../cub-1.10.0)
+            python3 setup.py build_ext --inplace
       - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64 python -m unittest discover -v -s tests
       - run: python3 setup.py bdist_wheel
 
@@ -89,7 +101,7 @@ jobs:
     resource_class: 2xlarge+
     steps:
       - checkout
-      - run: packaging/build_wheel.sh
+      - run: MAX_JOBS=15 packaging/build_wheel.sh
       - store_artifacts:
           path: dist
       - persist_to_workspace:
@@ -109,7 +121,7 @@ jobs:
       - checkout
       # This is building with cuda but no gpu present,
       # so we aren't running the tests.
-      - run: TEST_FLAG=--no-test packaging/build_conda.sh
+      - run: MAX_JOBS=15 TEST_FLAG=--no-test packaging/build_conda.sh
       - store_artifacts:
           path: /opt/conda/conda-bld/linux-64
       - persist_to_workspace:
@@ -215,9 +227,9 @@ workflows:
           context: DOCKERHUB_TOKEN
       {{workflows()}}
       - binary_linux_conda_cuda:
-          name: testrun_conda_cuda_py37_cu101_pyt14
+          name: testrun_conda_cuda_py36_cu101_pyt14
           context: DOCKERHUB_TOKEN
-          python_version: "3.7"
+          python_version: "3.6"
           pytorch_version: "1.4"
           cu_version: "cu101"
       - binary_linux_conda_cuda:
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 1d121fa2..f1bc3b1e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -68,14 +68,26 @@ jobs:
     steps:
       - checkout
       - <<: *setupcuda
-      - run: pip3 install --progress-bar off wheel matplotlib 'pillow<7'
+      - run: pip3 install --progress-bar off imageio wheel matplotlib 'pillow<7'
       - run: pip3 install --progress-bar off torch torchvision
       # - run: conda create -p ~/conda_env python=3.7 numpy
       # - run: conda activate ~/conda_env
       # - run: conda install -c pytorch pytorch torchvision
 
       - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/fvcore'
-      - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64 python3 setup.py build_ext --inplace
+      - run:
+          name: get cub
+          command: |
+            cd ..
+            wget --no-verbose https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
+            tar xzf 1.10.0.tar.gz
+            # This expands to a directory called cub-1.10.0
+      - run:
+          name: build
+          command: |
+            export LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64
+            export CUB_HOME=$(realpath ../cub-1.10.0)
+            python3 setup.py build_ext --inplace
       - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64 python -m unittest discover -v -s tests
       - run: python3 setup.py bdist_wheel
 
@@ -89,7 +101,7 @@ jobs:
     resource_class: 2xlarge+
     steps:
       - checkout
-      - run: packaging/build_wheel.sh
+      - run: MAX_JOBS=15 packaging/build_wheel.sh
       - store_artifacts:
           path: dist
       - persist_to_workspace:
@@ -109,7 +121,7 @@ jobs:
       - checkout
       # This is building with cuda but no gpu present,
       # so we aren't running the tests.
-      - run: TEST_FLAG=--no-test packaging/build_conda.sh
+      - run: MAX_JOBS=15 TEST_FLAG=--no-test packaging/build_conda.sh
       - store_artifacts:
           path: /opt/conda/conda-bld/linux-64
       - persist_to_workspace:
@@ -489,6 +501,12 @@ workflows:
           python_version: "3.6"
           pytorch_version: "1.4"
           cu_version: "cu101"
+      - binary_linux_conda_cuda:
+          name: testrun_conda_cuda_py37_cu102_pyt160
+          context: DOCKERHUB_TOKEN
+          python_version: "3.7"
+          pytorch_version: '1.6.0'
+          cu_version: "cu102"
       - binary_linux_conda_cuda:
           name: testrun_conda_cuda_py37_cu110_pyt170
           context: DOCKERHUB_TOKEN
diff --git a/docs/examples/pulsar_basic.py b/docs/examples/pulsar_basic.py
index 43733d6d..1cd5f8cd 100755
--- a/docs/examples/pulsar_basic.py
+++ b/docs/examples/pulsar_basic.py
@@ -5,9 +5,9 @@ This example demonstrates the most trivial, direct interface of the pulsar
 sphere renderer. It renders and saves an image with 10 random spheres.
 Output: basic.png.
 """
+import logging
 import math
 from os import path
-import logging
 
 import imageio
 import torch
diff --git a/docs/examples/pulsar_cam_unified.py b/docs/examples/pulsar_cam_unified.py
index 265c204c..47affc4d 100755
--- a/docs/examples/pulsar_cam_unified.py
+++ b/docs/examples/pulsar_cam_unified.py
@@ -9,8 +9,8 @@ distorted. Gradient-based optimization is used to converge towards the
 original camera parameters.
 Output: cam-pt3d.gif
 """
-from os import path
 import logging
+from os import path
 
 import cv2
 import imageio
diff --git a/docs/examples/pulsar_multiview.py b/docs/examples/pulsar_multiview.py
index ad487234..26889a11 100755
--- a/docs/examples/pulsar_multiview.py
+++ b/docs/examples/pulsar_multiview.py
@@ -13,9 +13,9 @@ This example is not available yet through the 'unified' interface,
 because opacity support has not landed in PyTorch3D for general data
 structures yet.
 """
+import logging
 import math
 from os import path
-import logging
 
 import cv2
 import imageio
diff --git a/docs/examples/pulsar_optimization.py b/docs/examples/pulsar_optimization.py
index 50a2ac43..5bd64424 100755
--- a/docs/examples/pulsar_optimization.py
+++ b/docs/examples/pulsar_optimization.py
@@ -8,8 +8,8 @@ The scene is initialized with random spheres. Gradient-based
 optimization is used to converge towards a faithful
 scene representation.
 """
-import math
 import logging
+import math
 
 import cv2
 import imageio
diff --git a/docs/examples/pulsar_optimization_unified.py b/docs/examples/pulsar_optimization_unified.py
index 268a501e..69517e1a 100755
--- a/docs/examples/pulsar_optimization_unified.py
+++ b/docs/examples/pulsar_optimization_unified.py
@@ -8,8 +8,8 @@ The scene is initialized with random spheres. Gradient-based
 optimization is used to converge towards a faithful
 scene representation.
 """
-import math
 import logging
+import math
 
 import cv2
 import imageio
diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh
index 81b5f924..ef50a95e 100755
--- a/packaging/build_conda.sh
+++ b/packaging/build_conda.sh
@@ -17,4 +17,4 @@ setup_conda_pytorch_constraint
 setup_conda_cudatoolkit_constraint
 setup_visual_studio_constraint
 # shellcheck disable=SC2086
-conda build $CONDA_CHANNEL_FLAGS ${TEST_FLAG:-} -c defaults -c conda-forge --no-anaconda-upload -c fvcore --python "$PYTHON_VERSION" packaging/pytorch3d
+conda build $CONDA_CHANNEL_FLAGS ${TEST_FLAG:-} -c bottler -c defaults -c conda-forge --no-anaconda-upload -c fvcore --python "$PYTHON_VERSION" packaging/pytorch3d
diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh
index 2d256baf..68c1ab3c 100755
--- a/packaging/build_wheel.sh
+++ b/packaging/build_wheel.sh
@@ -12,5 +12,6 @@ setup_env "$VERSION"
 setup_wheel_python
 pip_install numpy
 setup_pip_pytorch_version
+download_nvidiacub_if_needed
 python setup.py clean
 IS_WHEEL=1 python setup.py bdist_wheel
diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash
index da6c220e..80f58100 100644
--- a/packaging/pkg_helpers.bash
+++ b/packaging/pkg_helpers.bash
@@ -251,24 +251,32 @@ setup_conda_pytorch_constraint() {
 # Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT
 setup_conda_cudatoolkit_constraint() {
   export CONDA_CPUONLY_FEATURE=""
+  export CONDA_CUB_CONSTRAINT=""
   if [[ "$(uname)" == Darwin ]]; then
     export CONDA_CUDATOOLKIT_CONSTRAINT=""
   else
     case "$CU_VERSION" in
       cu110)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.0,<11.1 # [not osx]"
+        # Even though cudatoolkit 11.0 provides CUB we need our own, to control the
+        # version, because the built-in 1.9.9 in the cudatoolkit causes problems.
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cu102)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.2,<10.3 # [not osx]"
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cu101)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.1,<10.2 # [not osx]"
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cu100)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.0,<10.1 # [not osx]"
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cu92)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=9.2,<9.3 # [not osx]"
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cpu)
         export CONDA_CUDATOOLKIT_CONSTRAINT=""
@@ -292,3 +300,17 @@ setup_visual_studio_constraint() {
       cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/pytorch3d/conda_build_config.yaml
   fi
 }
+
+download_nvidiacub_if_needed() {  # For CUDA builds: fetch CUB 1.10.0 into the current directory and export CUB_HOME
+  case "$CU_VERSION" in
+    cu110|cu102|cu101|cu100|cu92)
+      echo "Downloading cub"
+      wget --no-verbose https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
+      tar xzf 1.10.0.tar.gz  # unpacks into ./cub-1.10.0
+      CUB_HOME=$(realpath ./cub-1.10.0)  # absolute path to the unpacked directory
+      export CUB_HOME
+      echo "CUB_HOME is now $CUB_HOME"
+      ;;
+  esac
+  # Any other CU_VERSION is a no-op: we don't need CUB for a cpu build or if cuda is 11.1 or higher
+}
diff --git a/packaging/pytorch3d/meta.yaml b/packaging/pytorch3d/meta.yaml
index 0c818c45..f7f853ad 100644
--- a/packaging/pytorch3d/meta.yaml
+++ b/packaging/pytorch3d/meta.yaml
@@ -8,6 +8,7 @@ source:
 requirements:
   build:
     - {{ compiler('c') }} # [win]
+    {{ environ.get('CONDA_CUB_CONSTRAINT') }}
 
   host:
     - python
@@ -31,6 +32,7 @@ build:
     - CUDA_HOME
     - FORCE_CUDA
     - NVCC_FLAGS
+    - MAX_JOBS
   features:
     {{ environ.get('CONDA_CPUONLY_FEATURE') }}
 
@@ -41,7 +43,7 @@ test:
     - tests
     - docs
   requires:
-    - ca-certificates
+    - imageio
   commands:
     #pytest .
     python -m unittest discover -v -s tests
diff --git a/pytorch3d/csrc/pulsar/cuda/commands.h b/pytorch3d/csrc/pulsar/cuda/commands.h
index 6c5f2a35..2ea94a50 100644
--- a/pytorch3d/csrc/pulsar/cuda/commands.h
+++ b/pytorch3d/csrc/pulsar/cuda/commands.h
@@ -186,7 +186,7 @@ __device__ static float atomicMin(float* address, float val) {
   ATOMICADD(&((PTR)->x), VAL.x); \
   ATOMICADD(&((PTR)->y), VAL.y); \
   ATOMICADD(&((PTR)->z), VAL.z);
-#if (CUDART_VERSION >= 10000)
+#if (CUDART_VERSION >= 10000) && (__CUDA_ARCH__ >= 600)
 #define ATOMICADD_B(PTR, VAL) atomicAdd_block((PTR), (VAL))
 #else
 #define ATOMICADD_B(PTR, VAL) ATOMICADD(PTR, VAL)
diff --git a/setup.py b/setup.py
index 3bfcd280..b9925893 100755
--- a/setup.py
+++ b/setup.py
@@ -20,12 +20,18 @@ def get_extensions():
 
     extra_compile_args = {"cxx": ["-std=c++14"]}
     define_macros = []
+    include_dirs = [extensions_dir]
 
     force_cuda = os.getenv("FORCE_CUDA", "0") == "1"
     if (torch.cuda.is_available() and CUDA_HOME is not None) or force_cuda:
         extension = CUDAExtension
         sources += source_cuda
         define_macros += [("WITH_CUDA", None)]
+        # Thrust is only used for its tuple objects.
+        # With CUDA 11.0 we can't use the cudatoolkit's version of cub.
+        # We take the risk that CUB and Thrust are incompatible, because
+        # we aren't using parts of Thrust which actually use CUB.
+        define_macros += [("THRUST_IGNORE_CUB_VERSION_CHECK", None)]
         cub_home = os.environ.get("CUB_HOME", None)
         nvcc_args = [
             "-std=c++14",
@@ -34,6 +40,11 @@ def get_extensions():
             "-D__CUDA_NO_HALF_CONVERSIONS__",
             "-D__CUDA_NO_HALF2_OPERATORS__",
         ]
+        if cub_home is None:
+            prefix = os.environ.get("CONDA_PREFIX", None)
+            if prefix is not None and os.path.isdir(prefix + "/include/cub"):
+                cub_home = prefix + "/include"
+
         if cub_home is None:
             warnings.warn(
                 "The environment variable `CUB_HOME` was not found. "
@@ -43,14 +54,13 @@ def get_extensions():
                 "`CUB_HOME` to the folder containing the `CMakeListst.txt` file."
             )
         else:
-            nvcc_args.insert(
-                0, "-I%s" % (os.path.realpath(cub_home).replace("\\ ", " "))
-            )
+            include_dirs.append(os.path.realpath(cub_home).replace("\\ ", " "))
         nvcc_flags_env = os.getenv("NVCC_FLAGS", "")
         if nvcc_flags_env != "":
             nvcc_args.extend(nvcc_flags_env.split(" "))
 
-        # It's better if pytorch can do this by default ..
+        # This is needed for pytorch 1.6 and earlier. See e.g.
+        # https://github.com/facebookresearch/pytorch3d/issues/436
         CC = os.environ.get("CC", None)
         if CC is not None:
             CC_arg = "-ccbin={}".format(CC)
@@ -63,8 +73,6 @@ def get_extensions():
 
     sources = [os.path.join(extensions_dir, s) for s in sources]
 
-    include_dirs = [extensions_dir]
-
     ext_modules = [
         extension(
             "pytorch3d._C",
@@ -100,7 +108,7 @@ setup(
     url="https://github.com/facebookresearch/pytorch3d",
     description="PyTorch3D is FAIR's library of reusable components "
     "for deep Learning with 3D data.",
-    packages=find_packages(exclude=("configs", "tests")),
+    packages=find_packages(exclude=("configs", "tests", "tests.*")),
     install_requires=["torchvision>=0.4", "fvcore"],
     extras_require={
         "all": ["matplotlib", "tqdm>4.29.0", "imageio", "ipywidgets"],
diff --git a/tests/test_build.py b/tests/test_build.py
index 1b30607b..e93e3db2 100644
--- a/tests/test_build.py
+++ b/tests/test_build.py
@@ -41,7 +41,6 @@ class TestBuild(unittest.TestCase):
 
         for extension in extensions:
             for i in root_dir.glob(f"**/*.{extension}"):
-                print(i)
                 if str(i).endswith(
                     "pytorch3d/transforms/external/kornia_angle_axis_to_rotation_matrix.py"
                 ):