examples and docs.

Summary: This diff updates the documentation and tutorials with information about the new pulsar backend. For more information about the pulsar backend, see the release notes and the paper (https://arxiv.org/abs/2004.07484). For information on how to use the backend, see the point cloud rendering notebook and the examples in the folder docs/examples.

Reviewed By: nikhilaravi

Differential Revision: D24498129

fbshipit-source-id: e312b0169a72b13590df6e4db36bfe6190d742f9
This commit is contained in:
Christoph Lassner
2020-11-03 13:05:02 -08:00
committed by Facebook GitHub Bot
parent 960fd6d8b6
commit 039e02601d
21 changed files with 759 additions and 60 deletions

View File

@@ -38,9 +38,27 @@
// Don't care about pytorch warnings; they shouldn't clutter our warnings.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Weverything"
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#pragma clang diagnostic pop
#ifdef WITH_CUDA
#include <ATen/cuda/CUDAContext.h>
#else
#ifndef cudaStream_t
typedef void* cudaStream_t;
#endif
struct int2 {
int x, y;
};
struct ushort2 {
unsigned short x, y;
};
struct float2 {
float x, y;
};
struct float3 {
float x, y, z;
};
#endif
namespace py = pybind11;
inline float3 make_float3(const float& x, const float& y, const float& z) {
float3 res;

View File

@@ -5,8 +5,10 @@
#include "./util.h"
#include <ATen/ATen.h>
#ifdef WITH_CUDA
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#endif
namespace PRE = ::pulsar::Renderer;
@@ -58,10 +60,13 @@ Renderer::Renderer(
Renderer::~Renderer() {
if (this->device_type == c10::DeviceType::CUDA) {
// Can't happen in the case that not compiled with CUDA.
#ifdef WITH_CUDA
at::cuda::CUDAGuard device_guard(this->device_tracker.device());
for (auto nrend : this->renderer_vec) {
PRE::destruct<true>(&nrend);
}
#endif
} else {
for (auto nrend : this->renderer_vec) {
PRE::destruct<false>(&nrend);
@@ -87,6 +92,7 @@ void Renderer::ensure_on_device(torch::Device device, bool /*non_blocking*/) {
"Only CPU and CUDA device types are supported.");
if (device.type() != this->device_type ||
device.index() != this->device_index) {
#ifdef WITH_CUDA
LOG_IF(INFO, PULSAR_LOG_INIT)
<< "Transferring render buffers between devices.";
int prev_active;
@@ -136,6 +142,11 @@ void Renderer::ensure_on_device(torch::Device device, bool /*non_blocking*/) {
cudaSetDevice(prev_active);
this->device_type = device.type();
this->device_index = device.index();
#else
throw std::runtime_error(
"pulsar was built without CUDA "
"but a device move to a CUDA device was initiated.");
#endif
}
};
@@ -148,6 +159,7 @@ void Renderer::ensure_n_renderers_gte(const size_t& batch_size) {
for (ptrdiff_t i = 0; i < diff; ++i) {
this->renderer_vec.emplace_back();
if (this->device_type == c10::DeviceType::CUDA) {
#ifdef WITH_CUDA
PRE::construct<true>(
&this->renderer_vec[this->renderer_vec.size() - 1],
this->max_num_balls(),
@@ -158,6 +170,7 @@ void Renderer::ensure_n_renderers_gte(const size_t& batch_size) {
this->renderer_vec[0].cam.background_normalization_depth,
this->renderer_vec[0].cam.n_channels,
this->n_track());
#endif
} else {
PRE::construct<false>(
&this->renderer_vec[this->renderer_vec.size() - 1],
@@ -708,6 +721,10 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
opacity_ptr = opacity_contiguous.data_ptr<float>();
}
if (this->device_type == c10::DeviceType::CUDA) {
// No else check necessary - if not compiled with CUDA
// we can't even reach this code (the renderer can't be
// moved to a CUDA device).
#ifdef WITH_CUDA
int prev_active;
cudaGetDevice(&prev_active);
cudaSetDevice(this->device_index);
@@ -756,6 +773,7 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
<< time_ms / static_cast<float>(batch_size) << "ms" << std::endl;
#endif
cudaSetDevice(prev_active);
#endif
} else {
#ifdef PULSAR_TIMINGS_BATCHED_ENABLED
START_TIME(batch_forward);
@@ -816,7 +834,11 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
if (mode == 1)
results[batch_i] = results[batch_i].slice(2, 0, 1, 1);
@@ -829,7 +851,11 @@ std::tuple<torch::Tensor, torch::Tensor> Renderer::forward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
LOG_IF(INFO, PULSAR_LOG_FORWARD) << "Forward render complete.";
@@ -1048,6 +1074,9 @@ Renderer::backward(
opacity_ptr = opacity_contiguous.data_ptr<float>();
}
if (this->device_type == c10::DeviceType::CUDA) {
// No else check necessary - it's not possible to move
// the renderer to a CUDA device if not built with CUDA.
#ifdef WITH_CUDA
int prev_active;
cudaGetDevice(&prev_active);
cudaSetDevice(this->device_index);
@@ -1162,6 +1191,7 @@ Renderer::backward(
std::cout << "Backward render batched time per example: "
<< time_ms / static_cast<float>(batch_size) << "ms" << std::endl;
#endif
#endif // WITH_CUDA
} else {
#ifdef PULSAR_TIMINGS_BATCHED_ENABLED
START_TIME(batch_backward);
@@ -1285,7 +1315,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
std::get<0>(ret) = torch::stack(results);
@@ -1297,7 +1331,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
}
@@ -1313,7 +1351,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
std::get<1>(ret) = torch::stack(results);
@@ -1326,7 +1368,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
}
@@ -1341,7 +1387,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
std::get<2>(ret) = torch::stack(results);
@@ -1353,7 +1403,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
}
@@ -1371,7 +1425,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
res_p2[batch_i] = from_blob(
reinterpret_cast<float*>(
@@ -1381,7 +1439,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
res_p3[batch_i] = from_blob(
reinterpret_cast<float*>(
@@ -1391,7 +1453,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
res_p4[batch_i] = from_blob(
reinterpret_cast<float*>(
@@ -1401,7 +1467,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
std::get<3>(ret) = torch::stack(res_p1);
@@ -1416,7 +1486,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
std::get<4>(ret) = from_blob(
reinterpret_cast<float*>(this->renderer_vec[0].grad_cam_d + 3),
@@ -1425,7 +1499,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
std::get<5>(ret) = from_blob(
reinterpret_cast<float*>(this->renderer_vec[0].grad_cam_d + 6),
@@ -1434,7 +1512,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
std::get<6>(ret) = from_blob(
reinterpret_cast<float*>(this->renderer_vec[0].grad_cam_d + 9),
@@ -1443,7 +1525,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
}
@@ -1458,7 +1544,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
std::get<7>(ret) = torch::stack(results);
@@ -1470,7 +1560,11 @@ Renderer::backward(
this->device_index,
torch::kFloat,
this->device_type == c10::DeviceType::CUDA
#ifdef WITH_CUDA
? at::cuda::getCurrentCUDAStream()
#else
? (cudaStream_t) nullptr
#endif
: (cudaStream_t) nullptr);
}
}

View File

@@ -1,6 +1,8 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#ifdef WITH_CUDA
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime_api.h>
#endif
#include <torch/extension.h>
#include "./tensor_util.h"
@@ -23,6 +25,7 @@ torch::Tensor sphere_ids_from_result_info_nograd(
/*dim=*/3, /*start=*/3, /*end=*/forw_info.size(3), /*step=*/2)
.contiguous();
if (forw_info.device().type() == c10::DeviceType::CUDA) {
#ifdef WITH_CUDA
cudaMemcpyAsync(
result.data_ptr(),
tmp.data_ptr(),
@@ -30,6 +33,11 @@ torch::Tensor sphere_ids_from_result_info_nograd(
tmp.size(3),
cudaMemcpyDeviceToDevice,
at::cuda::getCurrentCUDAStream());
#else
throw std::runtime_error(
"Copy on CUDA device initiated but built "
"without CUDA support.");
#endif
} else {
memcpy(
result.data_ptr(),

View File

@@ -1,4 +1,5 @@
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#ifdef WITH_CUDA
#include <cuda_runtime_api.h>
namespace pulsar {
@@ -22,3 +23,4 @@ void cudaDevToHost(
} // namespace pytorch
} // namespace pulsar
#endif

View File

@@ -41,11 +41,16 @@ torch::Tensor from_blob(
const int num_elements =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
if (device_type == c10::DeviceType::CUDA) {
#ifdef WITH_CUDA
cudaDevToDev(
ret.data_ptr(),
static_cast<const void*>(ptr),
sizeof(T) * num_elements,
stream);
#else
throw std::runtime_error(
"Initiating devToDev copy on a build without CUDA.");
#endif
// TODO: check for synchronization.
} else {
memcpy(ret.data_ptr(), ptr, sizeof(T) * num_elements);