examples and docs.

Summary: This diff updates the documentation and tutorials with information about the new pulsar backend. For more information about the pulsar backend, see the release notes and the paper (https://arxiv.org/abs/2004.07484). For information on how to use the backend, see the point cloud rendering notebook and the examples in the folder docs/examples. Reviewed By: nikhilaravi Differential Revision: D24498129 fbshipit-source-id: e312b0169a72b13590df6e4db36bfe6190d742f9
2025-12-20 14:20:38 +08:00 · 2020-11-03 13:05:02 -08:00
parent 960fd6d8b6
commit 039e02601d
21 changed files with 759 additions and 60 deletions
--- a/pytorch3d/csrc/pulsar/global.h
+++ b/pytorch3d/csrc/pulsar/global.h
@@ -38,9 +38,27 @@
 // Don't care about pytorch warnings; they shouldn't clutter our warnings.
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Weverything"
-#include <ATen/cuda/CUDAContext.h>
 #include <torch/extension.h>
 #pragma clang diagnostic pop
+#ifdef WITH_CUDA
+#include <ATen/cuda/CUDAContext.h>
+#else
+#ifndef cudaStream_t
+typedef void* cudaStream_t;
+#endif
+struct int2 {
+  int x, y;
+};
+struct ushort2 {
+  unsigned short x, y;
+};
+struct float2 {
+  float x, y;
+};
+struct float3 {
+  float x, y, z;
+};
+#endif
 namespace py = pybind11;
 inline float3 make_float3(const float& x, const float& y, const float& z) {
  float3 res;
--- a/pytorch3d/csrc/pulsar/pytorch/renderer.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/renderer.cpp
@@ -5,8 +5,10 @@
 #include "./util.h"

 #include <ATen/ATen.h>
+#ifdef WITH_CUDA
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
+#endif

 namespace PRE = ::pulsar::Renderer;

@@ -58,10 +60,13 @@ Renderer::Renderer(

 Renderer::~Renderer() {
  if (this->device_type == c10::DeviceType::CUDA) {
+// Can't happen in the case that not compiled with CUDA.
+#ifdef WITH_CUDA
    at::cuda::CUDAGuard device_guard(this->device_tracker.device());
    for (auto nrend : this->renderer_vec) {
      PRE::destruct<true>(&nrend);
    }
+#endif
  } else {
    for (auto nrend : this->renderer_vec) {
      PRE::destruct<false>(&nrend);
@@ -87,6 +92,7 @@ void Renderer::ensure_on_device(torch::Device device, bool /*non_blocking*/) {
      "Only CPU and CUDA device types are supported.");
  if (device.type() != this->device_type ||
      device.index() != this->device_index) {
+#ifdef WITH_CUDA
    LOG_IF(INFO, PULSAR_LOG_INIT)
        << "Transferring render buffers between devices.";
    int prev_active;
@@ -136,6 +142,11 @@ void Renderer::ensure_on_device(torch::Device device, bool /*non_blocking*/) {
    cudaSetDevice(prev_active);
    this->device_type = device.type();
    this->device_index = device.index();
+#else
+    throw std::runtime_error(
+        "pulsar was built without CUDA "
+        "but a device move to a CUDA device was initiated.");
+#endif
  }
 };

@@ -148,6 +159,7 @@ void Renderer::ensure_n_renderers_gte(const size_t& batch_size) {
    for (ptrdiff_t i = 0; i < diff; ++i) {
      this->renderer_vec.emplace_back();
      if (this->device_type == c10::DeviceType::CUDA) {
+#ifdef WITH_CUDA
        PRE::construct<true>(
            &this->renderer_vec[this->renderer_vec.size() - 1],
            this->max_num_balls(),
@@ -158,6 +170,7 @@ void Renderer::ensure_n_renderers_gte(const size_t& batch_size) {
            this->renderer_vec[0].cam.background_normalization_depth,
            this->renderer_vec[0].cam.n_channels,
            this->n_track());
+#endif
      } else {
        PRE::construct<false>(
            &this->renderer_vec[this->renderer_vec.size() - 1],
@@ -708,6 +721,10 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
    opacity_ptr = opacity_contiguous.data_ptr<float>();
  }
  if (this->device_type == c10::DeviceType::CUDA) {
+// No else check necessary - if not compiled with CUDA
+// we can't even reach this code (the renderer can't be
+// moved to a CUDA device).
+#ifdef WITH_CUDA
    int prev_active;
    cudaGetDevice(&prev_active);
    cudaSetDevice(this->device_index);
@@ -756,6 +773,7 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
              << time_ms / static_cast<float>(batch_size) << "ms" << std::endl;
 #endif
    cudaSetDevice(prev_active);
+#endif
  } else {
 #ifdef PULSAR_TIMINGS_BATCHED_ENABLED
    START_TIME(batch_forward);
@@ -816,7 +834,11 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
        this->device_index,
        torch::kFloat,
        this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
            ? at::cuda::getCurrentCUDAStream()
+#else
+            ? (cudaStream_t) nullptr
+#endif
            : (cudaStream_t) nullptr);
    if (mode == 1)
      results[batch_i] = results[batch_i].slice(2, 0, 1, 1);
@@ -829,7 +851,11 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
        this->device_index,
        torch::kFloat,
        this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
            ? at::cuda::getCurrentCUDAStream()
+#else
+            ? (cudaStream_t) nullptr
+#endif
            : (cudaStream_t) nullptr);
  }
  LOG_IF(INFO, PULSAR_LOG_FORWARD) << "Forward render complete.";
@@ -1048,6 +1074,9 @@ Renderer::backward(
    opacity_ptr = opacity_contiguous.data_ptr<float>();
  }
  if (this->device_type == c10::DeviceType::CUDA) {
+// No else check necessary - it's not possible to move
+// the renderer to a CUDA device if not built with CUDA.
+#ifdef WITH_CUDA
    int prev_active;
    cudaGetDevice(&prev_active);
    cudaSetDevice(this->device_index);
@@ -1162,6 +1191,7 @@ Renderer::backward(
    std::cout << "Backward render batched time per example: "
              << time_ms / static_cast<float>(batch_size) << "ms" << std::endl;
 #endif
+#endif // WITH_CUDA
  } else {
 #ifdef PULSAR_TIMINGS_BATCHED_ENABLED
    START_TIME(batch_backward);
@@ -1285,7 +1315,11 @@ Renderer::backward(
            this->device_index,
            torch::kFloat,
            this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
                ? at::cuda::getCurrentCUDAStream()
+#else
+                ? (cudaStream_t) nullptr
+#endif
                : (cudaStream_t) nullptr);
      }
      std::get<0>(ret) = torch::stack(results);
@@ -1297,7 +1331,11 @@ Renderer::backward(
          this->device_index,
          torch::kFloat,
          this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
              ? at::cuda::getCurrentCUDAStream()
+#else
+              ? (cudaStream_t) nullptr
+#endif
              : (cudaStream_t) nullptr);
    }
  }
@@ -1313,7 +1351,11 @@ Renderer::backward(
            this->device_index,
            torch::kFloat,
            this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
                ? at::cuda::getCurrentCUDAStream()
+#else
+                ? (cudaStream_t) nullptr
+#endif
                : (cudaStream_t) nullptr);
      }
      std::get<1>(ret) = torch::stack(results);
@@ -1326,7 +1368,11 @@ Renderer::backward(
          this->device_index,
          torch::kFloat,
          this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
              ? at::cuda::getCurrentCUDAStream()
+#else
+              ? (cudaStream_t) nullptr
+#endif
              : (cudaStream_t) nullptr);
    }
  }
@@ -1341,7 +1387,11 @@ Renderer::backward(
            this->device_index,
            torch::kFloat,
            this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
                ? at::cuda::getCurrentCUDAStream()
+#else
+                ? (cudaStream_t) nullptr
+#endif
                : (cudaStream_t) nullptr);
      }
      std::get<2>(ret) = torch::stack(results);
@@ -1353,7 +1403,11 @@ Renderer::backward(
          this->device_index,
          torch::kFloat,
          this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
              ? at::cuda::getCurrentCUDAStream()
+#else
+              ? (cudaStream_t) nullptr
+#endif
              : (cudaStream_t) nullptr);
    }
  }
@@ -1371,7 +1425,11 @@ Renderer::backward(
            this->device_index,
            torch::kFloat,
            this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
                ? at::cuda::getCurrentCUDAStream()
+#else
+                ? (cudaStream_t) nullptr
+#endif
                : (cudaStream_t) nullptr);
        res_p2[batch_i] = from_blob(
            reinterpret_cast<float*>(
@@ -1381,7 +1439,11 @@ Renderer::backward(
            this->device_index,
            torch::kFloat,
            this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
                ? at::cuda::getCurrentCUDAStream()
+#else
+                ? (cudaStream_t) nullptr
+#endif
                : (cudaStream_t) nullptr);
        res_p3[batch_i] = from_blob(
            reinterpret_cast<float*>(
@@ -1391,7 +1453,11 @@ Renderer::backward(
            this->device_index,
            torch::kFloat,
            this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
                ? at::cuda::getCurrentCUDAStream()
+#else
+                ? (cudaStream_t) nullptr
+#endif
                : (cudaStream_t) nullptr);
        res_p4[batch_i] = from_blob(
            reinterpret_cast<float*>(
@@ -1401,7 +1467,11 @@ Renderer::backward(
            this->device_index,
            torch::kFloat,
            this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
                ? at::cuda::getCurrentCUDAStream()
+#else
+                ? (cudaStream_t) nullptr
+#endif
                : (cudaStream_t) nullptr);
      }
      std::get<3>(ret) = torch::stack(res_p1);
@@ -1416,7 +1486,11 @@ Renderer::backward(
          this->device_index,
          torch::kFloat,
          this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
              ? at::cuda::getCurrentCUDAStream()
+#else
+              ? (cudaStream_t) nullptr
+#endif
              : (cudaStream_t) nullptr);
      std::get<4>(ret) = from_blob(
          reinterpret_cast<float*>(this->renderer_vec[0].grad_cam_d + 3),
@@ -1425,7 +1499,11 @@ Renderer::backward(
          this->device_index,
          torch::kFloat,
          this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
              ? at::cuda::getCurrentCUDAStream()
+#else
+              ? (cudaStream_t) nullptr
+#endif
              : (cudaStream_t) nullptr);
      std::get<5>(ret) = from_blob(
          reinterpret_cast<float*>(this->renderer_vec[0].grad_cam_d + 6),
@@ -1434,7 +1512,11 @@ Renderer::backward(
          this->device_index,
          torch::kFloat,
          this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
              ? at::cuda::getCurrentCUDAStream()
+#else
+              ? (cudaStream_t) nullptr
+#endif
              : (cudaStream_t) nullptr);
      std::get<6>(ret) = from_blob(
          reinterpret_cast<float*>(this->renderer_vec[0].grad_cam_d + 9),
@@ -1443,7 +1525,11 @@ Renderer::backward(
          this->device_index,
          torch::kFloat,
          this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
              ? at::cuda::getCurrentCUDAStream()
+#else
+              ? (cudaStream_t) nullptr
+#endif
              : (cudaStream_t) nullptr);
    }
  }
@@ -1458,7 +1544,11 @@ Renderer::backward(
            this->device_index,
            torch::kFloat,
            this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
                ? at::cuda::getCurrentCUDAStream()
+#else
+                ? (cudaStream_t) nullptr
+#endif
                : (cudaStream_t) nullptr);
      }
      std::get<7>(ret) = torch::stack(results);
@@ -1470,7 +1560,11 @@ Renderer::backward(
          this->device_index,
          torch::kFloat,
          this->device_type == c10::DeviceType::CUDA
+#ifdef WITH_CUDA
              ? at::cuda::getCurrentCUDAStream()
+#else
+              ? (cudaStream_t) nullptr
+#endif
              : (cudaStream_t) nullptr);
    }
  }
--- a/pytorch3d/csrc/pulsar/pytorch/tensor_util.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/tensor_util.cpp
@@ -1,6 +1,8 @@
 // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifdef WITH_CUDA
 #include <ATen/cuda/CUDAContext.h>
 #include <cuda_runtime_api.h>
+#endif
 #include <torch/extension.h>

 #include "./tensor_util.h"
@@ -23,6 +25,7 @@ torch::Tensor sphere_ids_from_result_info_nograd(
              /*dim=*/3, /*start=*/3, /*end=*/forw_info.size(3), /*step=*/2)
          .contiguous();
  if (forw_info.device().type() == c10::DeviceType::CUDA) {
+#ifdef WITH_CUDA
    cudaMemcpyAsync(
        result.data_ptr(),
        tmp.data_ptr(),
@@ -30,6 +33,11 @@ torch::Tensor sphere_ids_from_result_info_nograd(
            tmp.size(3),
        cudaMemcpyDeviceToDevice,
        at::cuda::getCurrentCUDAStream());
+#else
+    throw std::runtime_error(
+        "Copy on CUDA device initiated but built "
+        "without CUDA support.");
+#endif
  } else {
    memcpy(
        result.data_ptr(),
--- a/pytorch3d/csrc/pulsar/pytorch/util.cpp
+++ b/pytorch3d/csrc/pulsar/pytorch/util.cpp
@@ -1,4 +1,5 @@
 // Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#ifdef WITH_CUDA
 #include <cuda_runtime_api.h>

 namespace pulsar {
@@ -22,3 +23,4 @@ void cudaDevToHost(

 } // namespace pytorch
 } // namespace pulsar
+#endif
--- a/pytorch3d/csrc/pulsar/pytorch/util.h
+++ b/pytorch3d/csrc/pulsar/pytorch/util.h
@@ -41,11 +41,16 @@ torch::Tensor from_blob(
  const int num_elements =
      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
  if (device_type == c10::DeviceType::CUDA) {
+#ifdef WITH_CUDA
    cudaDevToDev(
        ret.data_ptr(),
        static_cast<const void*>(ptr),
        sizeof(T) * num_elements,
        stream);
+#else
+    throw std::runtime_error(
+        "Initiating devToDev copy on a build without CUDA.");
+#endif
    // TODO: check for synchronization.
  } else {
    memcpy(ret.data_ptr(), ptr, sizeof(T) * num_elements);