diff --git a/.circleci/config.in.yml b/.circleci/config.in.yml
index 7360aa17..00288912 100644
--- a/.circleci/config.in.yml
+++ b/.circleci/config.in.yml
@@ -68,14 +68,26 @@ jobs:
     steps:
       - checkout
       - <<: *setupcuda
-      - run: pip3 install --progress-bar off wheel matplotlib 'pillow<7'
+      - run: pip3 install --progress-bar off imageio wheel matplotlib 'pillow<7'
       - run: pip3 install --progress-bar off torch torchvision
       # - run: conda create -p ~/conda_env python=3.7 numpy
       # - run: conda activate ~/conda_env
       # - run: conda install -c pytorch pytorch torchvision
 
       - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/fvcore'
-      - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64 python3 setup.py build_ext --inplace
+      - run:
+          name: get cub
+          command: |
+            cd ..
+            wget --no-verbose https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
+            tar xzf 1.10.0.tar.gz
+            # This expands to a directory called cub-1.10.0
+      - run:
+          name: build
+          command: |
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.2/lib64
+            export CUB_HOME=$(realpath ../cub-1.10.0)
+            python3 setup.py build_ext --inplace
       - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64 python -m unittest discover -v -s tests
       - run: python3 setup.py bdist_wheel
 
@@ -89,7 +101,7 @@ jobs:
     resource_class: 2xlarge+
     steps:
       - checkout
-      - run: packaging/build_wheel.sh
+      - run: MAX_JOBS=15 packaging/build_wheel.sh
       - store_artifacts:
           path: dist
       - persist_to_workspace:
@@ -109,7 +121,7 @@ jobs:
       - checkout
       # This is building with cuda but no gpu present,
       # so we aren't running the tests.
-      - run: TEST_FLAG=--no-test packaging/build_conda.sh
+      - run: MAX_JOBS=15 TEST_FLAG=--no-test packaging/build_conda.sh
       - store_artifacts:
           path: /opt/conda/conda-bld/linux-64
       - persist_to_workspace:
@@ -215,9 +227,9 @@ workflows:
           context: DOCKERHUB_TOKEN
       {{workflows()}}
       - binary_linux_conda_cuda:
-          name: testrun_conda_cuda_py37_cu101_pyt14
+          name: testrun_conda_cuda_py36_cu101_pyt14
           context: DOCKERHUB_TOKEN
-          python_version: "3.7"
+          python_version: "3.6"
           pytorch_version: "1.4"
           cu_version: "cu101"
       - binary_linux_conda_cuda:
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 1d121fa2..f1bc3b1e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -68,14 +68,26 @@ jobs:
     steps:
       - checkout
       - <<: *setupcuda
-      - run: pip3 install --progress-bar off wheel matplotlib 'pillow<7'
+      - run: pip3 install --progress-bar off imageio wheel matplotlib 'pillow<7'
       - run: pip3 install --progress-bar off torch torchvision
       # - run: conda create -p ~/conda_env python=3.7 numpy
       # - run: conda activate ~/conda_env
       # - run: conda install -c pytorch pytorch torchvision
 
       - run: pip3 install --progress-bar off 'git+https://github.com/facebookresearch/fvcore'
-      - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64 python3 setup.py build_ext --inplace
+      - run:
+          name: get cub
+          command: |
+            cd ..
+            wget --no-verbose https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
+            tar xzf 1.10.0.tar.gz
+            # This expands to a directory called cub-1.10.0
+      - run:
+          name: build
+          command: |
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.2/lib64
+            export CUB_HOME=$(realpath ../cub-1.10.0)
+            python3 setup.py build_ext --inplace
       - run: LD_LIBRARY_PATH=$LD_LIBARY_PATH:/usr/local/cuda-10.2/lib64 python -m unittest discover -v -s tests
       - run: python3 setup.py bdist_wheel
 
@@ -89,7 +101,7 @@ jobs:
     resource_class: 2xlarge+
     steps:
       - checkout
-      - run: packaging/build_wheel.sh
+      - run: MAX_JOBS=15 packaging/build_wheel.sh
       - store_artifacts:
           path: dist
       - persist_to_workspace:
@@ -109,7 +121,7 @@ jobs:
       - checkout
       # This is building with cuda but no gpu present,
       # so we aren't running the tests.
-      - run: TEST_FLAG=--no-test packaging/build_conda.sh
+      - run: MAX_JOBS=15 TEST_FLAG=--no-test packaging/build_conda.sh
       - store_artifacts:
           path: /opt/conda/conda-bld/linux-64
       - persist_to_workspace:
@@ -489,6 +501,12 @@ workflows:
           python_version: "3.6"
           pytorch_version: "1.4"
           cu_version: "cu101"
+      - binary_linux_conda_cuda:
+          name: testrun_conda_cuda_py37_cu102_pyt160
+          context: DOCKERHUB_TOKEN
+          python_version: "3.7"
+          pytorch_version: '1.6.0'
+          cu_version: "cu102"
       - binary_linux_conda_cuda:
           name: testrun_conda_cuda_py37_cu110_pyt170
           context: DOCKERHUB_TOKEN
diff --git a/docs/examples/pulsar_basic.py b/docs/examples/pulsar_basic.py
index 43733d6d..1cd5f8cd 100755
--- a/docs/examples/pulsar_basic.py
+++ b/docs/examples/pulsar_basic.py
@@ -5,9 +5,9 @@ This example demonstrates the most trivial, direct interface of the pulsar
 sphere renderer. It renders and saves an image with 10 random spheres.
 Output: basic.png.
 """
+import logging
 import math
 from os import path
-import logging
 
 import imageio
 import torch
diff --git a/docs/examples/pulsar_cam_unified.py b/docs/examples/pulsar_cam_unified.py
index 265c204c..47affc4d 100755
--- a/docs/examples/pulsar_cam_unified.py
+++ b/docs/examples/pulsar_cam_unified.py
@@ -9,8 +9,8 @@ distorted. Gradient-based optimization is used to converge towards the
 original camera parameters.
 Output: cam-pt3d.gif
 """
-from os import path
 import logging
+from os import path
 
 import cv2
 import imageio
diff --git a/docs/examples/pulsar_multiview.py b/docs/examples/pulsar_multiview.py
index ad487234..26889a11 100755
--- a/docs/examples/pulsar_multiview.py
+++ b/docs/examples/pulsar_multiview.py
@@ -13,9 +13,9 @@ This example is not available yet through the 'unified' interface,
 because opacity support has not landed in PyTorch3D for general data
 structures yet.
 """
+import logging
 import math
 from os import path
-import logging
 
 import cv2
 import imageio
diff --git a/docs/examples/pulsar_optimization.py b/docs/examples/pulsar_optimization.py
index 50a2ac43..5bd64424 100755
--- a/docs/examples/pulsar_optimization.py
+++ b/docs/examples/pulsar_optimization.py
@@ -8,8 +8,8 @@ The scene is initialized with random spheres. Gradient-based
 optimization is used to converge towards a faithful
 scene representation.
 """
-import math
 import logging
+import math
 
 import cv2
 import imageio
diff --git a/docs/examples/pulsar_optimization_unified.py b/docs/examples/pulsar_optimization_unified.py
index 268a501e..69517e1a 100755
--- a/docs/examples/pulsar_optimization_unified.py
+++ b/docs/examples/pulsar_optimization_unified.py
@@ -8,8 +8,8 @@ The scene is initialized with random spheres. Gradient-based
 optimization is used to converge towards a faithful
 scene representation.
 """
-import math
 import logging
+import math
 
 import cv2
 import imageio
diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh
index 81b5f924..ef50a95e 100755
--- a/packaging/build_conda.sh
+++ b/packaging/build_conda.sh
@@ -17,4 +17,4 @@ setup_conda_pytorch_constraint
 setup_conda_cudatoolkit_constraint
 setup_visual_studio_constraint
 # shellcheck disable=SC2086
-conda build $CONDA_CHANNEL_FLAGS ${TEST_FLAG:-} -c defaults -c conda-forge --no-anaconda-upload -c fvcore --python "$PYTHON_VERSION" packaging/pytorch3d
+conda build $CONDA_CHANNEL_FLAGS ${TEST_FLAG:-} -c bottler -c defaults -c conda-forge --no-anaconda-upload -c fvcore --python "$PYTHON_VERSION" packaging/pytorch3d
diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh
index 2d256baf..68c1ab3c 100755
--- a/packaging/build_wheel.sh
+++ b/packaging/build_wheel.sh
@@ -12,5 +12,6 @@ setup_env "$VERSION"
 setup_wheel_python
 pip_install numpy
 setup_pip_pytorch_version
+download_nvidiacub_if_needed
 python setup.py clean
 IS_WHEEL=1 python setup.py bdist_wheel
diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash
index da6c220e..80f58100 100644
--- a/packaging/pkg_helpers.bash
+++ b/packaging/pkg_helpers.bash
@@ -251,24 +251,32 @@ setup_conda_pytorch_constraint() {
 # Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT
 setup_conda_cudatoolkit_constraint() {
   export CONDA_CPUONLY_FEATURE=""
+  export CONDA_CUB_CONSTRAINT=""
   if [[ "$(uname)" == Darwin ]]; then
     export CONDA_CUDATOOLKIT_CONSTRAINT=""
   else
     case "$CU_VERSION" in
       cu110)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.0,<11.1 # [not osx]"
+        # Even though cudatoolkit 11.0 provides CUB we need our own, to control the
+        # version, because the built-in 1.9.9 in the cudatoolkit causes problems.
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cu102)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.2,<10.3 # [not osx]"
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cu101)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.1,<10.2 # [not osx]"
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cu100)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.0,<10.1 # [not osx]"
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cu92)
         export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=9.2,<9.3 # [not osx]"
+        export CONDA_CUB_CONSTRAINT="- nvidiacub"
         ;;
       cpu)
         export CONDA_CUDATOOLKIT_CONSTRAINT=""
@@ -292,3 +300,17 @@ setup_visual_studio_constraint() {
       cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/pytorch3d/conda_build_config.yaml
   fi
 }
+
+download_nvidiacub_if_needed() {
+  case "$CU_VERSION" in
+    cu110|cu102|cu101|cu100|cu92)
+      echo "Downloading cub"
+      wget --no-verbose https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
+      tar xzf 1.10.0.tar.gz
+      CUB_HOME=$(realpath ./cub-1.10.0)
+      export CUB_HOME
+      echo "CUB_HOME is now $CUB_HOME"
+      ;;
+  esac
+  # Other CU_VERSION values fall through: CUB isn't needed for a CPU build or for CUDA 11.1 or higher.
+}
diff --git a/packaging/pytorch3d/meta.yaml b/packaging/pytorch3d/meta.yaml
index 0c818c45..f7f853ad 100644
--- a/packaging/pytorch3d/meta.yaml
+++ b/packaging/pytorch3d/meta.yaml
@@ -8,6 +8,7 @@ source:
 requirements:
   build:
     - {{ compiler('c') }} # [win]
+    {{ environ.get('CONDA_CUB_CONSTRAINT', '') }}
 
   host:
     - python
@@ -31,6 +32,7 @@ build:
     - CUDA_HOME
     - FORCE_CUDA
     - NVCC_FLAGS
+    - MAX_JOBS
   features:
     {{ environ.get('CONDA_CPUONLY_FEATURE') }}
 
@@ -41,7 +43,7 @@ test:
     - tests
     - docs
   requires:
-    - ca-certificates
+    - imageio
   commands:
     #pytest .
     python -m unittest discover -v -s tests
diff --git a/pytorch3d/csrc/pulsar/cuda/commands.h b/pytorch3d/csrc/pulsar/cuda/commands.h
index 6c5f2a35..2ea94a50 100644
--- a/pytorch3d/csrc/pulsar/cuda/commands.h
+++ b/pytorch3d/csrc/pulsar/cuda/commands.h
@@ -186,7 +186,7 @@ __device__ static float atomicMin(float* address, float val) {
   ATOMICADD(&((PTR)->x), VAL.x); \
   ATOMICADD(&((PTR)->y), VAL.y); \
   ATOMICADD(&((PTR)->z), VAL.z);
-#if (CUDART_VERSION >= 10000)
+#if (CUDART_VERSION >= 10000) && (__CUDA_ARCH__ >= 600)
 #define ATOMICADD_B(PTR, VAL) atomicAdd_block((PTR), (VAL))
 #else
 #define ATOMICADD_B(PTR, VAL) ATOMICADD(PTR, VAL)
diff --git a/setup.py b/setup.py
index 3bfcd280..b9925893 100755
--- a/setup.py
+++ b/setup.py
@@ -20,12 +20,18 @@ def get_extensions():
 
     extra_compile_args = {"cxx": ["-std=c++14"]}
     define_macros = []
+    include_dirs = [extensions_dir]
 
     force_cuda = os.getenv("FORCE_CUDA", "0") == "1"
     if (torch.cuda.is_available() and CUDA_HOME is not None) or force_cuda:
         extension = CUDAExtension
         sources += source_cuda
         define_macros += [("WITH_CUDA", None)]
+        # Thrust is only used for its tuple objects.
+        # With CUDA 11.0 we can't use the cudatoolkit's version of cub.
+        # We take the risk that CUB and Thrust are incompatible, because
+        # we aren't using parts of Thrust which actually use CUB.
+        define_macros += [("THRUST_IGNORE_CUB_VERSION_CHECK", None)]
         cub_home = os.environ.get("CUB_HOME", None)
         nvcc_args = [
             "-std=c++14",
@@ -34,6 +40,11 @@ def get_extensions():
             "-D__CUDA_NO_HALF_CONVERSIONS__",
             "-D__CUDA_NO_HALF2_OPERATORS__",
         ]
+        if cub_home is None:
+            prefix = os.environ.get("CONDA_PREFIX", None)
+            if prefix is not None and os.path.isdir(prefix + "/include/cub"):
+                cub_home = prefix + "/include"
+
         if cub_home is None:
             warnings.warn(
                 "The environment variable `CUB_HOME` was not found. "
@@ -43,14 +54,13 @@ def get_extensions():
                 "`CUB_HOME` to the folder containing the `CMakeListst.txt` file."
             )
         else:
-            nvcc_args.insert(
-                0, "-I%s" % (os.path.realpath(cub_home).replace("\\ ", " "))
-            )
+            include_dirs.append(os.path.realpath(cub_home).replace("\\ ", " "))
         nvcc_flags_env = os.getenv("NVCC_FLAGS", "")
         if nvcc_flags_env != "":
             nvcc_args.extend(nvcc_flags_env.split(" "))
 
-        # It's better if pytorch can do this by default ..
+        # This is needed for pytorch 1.6 and earlier. See e.g.
+        # https://github.com/facebookresearch/pytorch3d/issues/436
         CC = os.environ.get("CC", None)
         if CC is not None:
             CC_arg = "-ccbin={}".format(CC)
@@ -63,8 +73,6 @@ def get_extensions():
 
     sources = [os.path.join(extensions_dir, s) for s in sources]
 
-    include_dirs = [extensions_dir]
-
     ext_modules = [
         extension(
             "pytorch3d._C",
@@ -100,7 +108,7 @@ setup(
     url="https://github.com/facebookresearch/pytorch3d",
     description="PyTorch3D is FAIR's library of reusable components "
     "for deep Learning with 3D data.",
-    packages=find_packages(exclude=("configs", "tests")),
+    packages=find_packages(exclude=("configs", "tests", "tests.*")),
     install_requires=["torchvision>=0.4", "fvcore"],
     extras_require={
         "all": ["matplotlib", "tqdm>4.29.0", "imageio", "ipywidgets"],
diff --git a/tests/test_build.py b/tests/test_build.py
index 1b30607b..e93e3db2 100644
--- a/tests/test_build.py
+++ b/tests/test_build.py
@@ -41,7 +41,6 @@ class TestBuild(unittest.TestCase):
 
         for extension in extensions:
             for i in root_dir.glob(f"**/*.{extension}"):
-                print(i)
                 if str(i).endswith(
                     "pytorch3d/transforms/external/kornia_angle_axis_to_rotation_matrix.py"
                 ):