Fixed Windows MSVC build compatibility (#9)

Summary:
Fixed a few compatibility issues with the MSVC compiler (Visual Studio 2019, MSVC 19.16.27034):
1. Replaced long with int64_t. aten::data_ptr<long> is not supported by MSVC: on Windows (LLP64) long is only 32 bits wide, so it does not match the 64-bit element type of Long tensors; see the sketch after this list.
2. pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp: the inline function is not correctly recognized by MSVC, so it was changed to static.
3. pytorch3d/csrc/rasterize_meshes/geometry_utils.cuh:
   const auto kEpsilon = 1e-30;
   MSVC does not compile this const for both host and device code, so it was changed to a macro.
4. pytorch3d/csrc/rasterize_meshes/geometry_utils.cuh:
   const float area2 = pow(area, 2.0);
   MSVC treats the literal 2.0 as a double and raised an error, so it was changed to 2.0f.
5. pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp:
   the return type of std::tuple<torch::Tensor, torch::Tensor> RasterizePointsCoarseCpu() did not match the declaration in rasterize_points_cpu.h, so the definition was changed to return a single torch::Tensor.
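
For illustration, a minimal sketch of the portable pattern behind item 1 (the helper function and tensor name are hypothetical, not part of this change). On LP64 Linux, long happens to be 64 bits so data_ptr<long> compiles; on LLP64 Windows it is 32 bits and the template instantiation does not exist, so the fixed-width int64_t must be used:

    #include <cstdint>
    #include <torch/extension.h>

    // Hypothetical helper: sum the entries of an int64 (torch::kLong) tensor.
    int64_t SumIndices(const torch::Tensor& idx) {
      // data_ptr<int64_t> compiles under both MSVC and gcc/clang;
      // data_ptr<long> fails on MSVC because long is 32-bit there.
      const int64_t* p = idx.data_ptr<int64_t>();
      int64_t total = 0;
      for (int64_t i = 0; i < idx.numel(); ++i) {
        total += p[i];
      }
      return total;
    }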
Pull Request resolved: https://github.com/facebookresearch/pytorch3d/pull/9

Reviewed By: nikhilaravi

Differential Revision: D19986567

Pulled By: yuanluxu

fbshipit-source-id: f4d98525d088c99c513b85193db6f0fc69c7f017
Author: merayxu
Date: 2020-02-20 18:41:41 -08:00
Committed by: Facebook GitHub Bot
Parent: a3baa367e3
Commit: 9e21659fc5

5 changed files with 60 additions and 18 deletions

pytorch3d/csrc/gather_scatter/gather_scatter.cu

@@ -5,7 +5,7 @@
 // TODO(T47953967) to make this cuda kernel support all datatypes.
 __global__ void gather_scatter_kernel(
     const float* __restrict__ input,
-    const long* __restrict__ edges,
+    const int64_t* __restrict__ edges,
     float* __restrict__ output,
     bool directed,
     bool backward,
@@ -21,8 +21,8 @@ __global__ void gather_scatter_kernel(
   // Edges are split evenly across the blocks.
   for (int e = blockIdx.x; e < E; e += gridDim.x) {
     // Get indices of vertices which form the edge.
-    const long v0 = edges[2 * e + v0_idx];
-    const long v1 = edges[2 * e + v1_idx];
+    const int64_t v0 = edges[2 * e + v0_idx];
+    const int64_t v1 = edges[2 * e + v1_idx];
 
     // Split vertex features evenly across threads.
     // This implementation will be quite wasteful when D<128 since there will be
@@ -57,7 +57,7 @@ at::Tensor gather_scatter_cuda(
   gather_scatter_kernel<<<blocks, threads>>>(
       input.data_ptr<float>(),
-      edges.data_ptr<long>(),
+      edges.data_ptr<int64_t>(),
       output.data_ptr<float>(),
       directed,
       backward,
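
(A hedged usage sketch, not part of this diff: the kernel now assumes its edges argument is a 64-bit integer tensor, whose element width is the same on Linux and Windows. Tensor names and sizes below are illustrative only.)

    // Hypothetical host-side setup matching the kernel's int64_t* parameter.
    auto opts = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA);
    torch::Tensor edges = torch::zeros({16, 2}, opts); // 16 edges, 2 vertex ids each
    // edges.data_ptr<int64_t>() is valid under both MSVC and gcc/clang.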

pytorch3d/csrc/nearest_neighbor_points/nearest_neighbor_points.cu

@@ -6,7 +6,7 @@
 template <typename scalar_t>
 __device__ void WarpReduce(
     volatile scalar_t* min_dists,
-    volatile long* min_idxs,
+    volatile int64_t* min_idxs,
     const size_t tid) {
   // s = 32
   if (min_dists[tid] > min_dists[tid + 32]) {
@@ -57,7 +57,7 @@ template <typename scalar_t>
 __global__ void NearestNeighborKernel(
     const scalar_t* __restrict__ points1,
     const scalar_t* __restrict__ points2,
-    long* __restrict__ idx,
+    int64_t* __restrict__ idx,
     const size_t N,
     const size_t P1,
     const size_t P2,
@@ -74,7 +74,7 @@ __global__ void NearestNeighborKernel(
   extern __shared__ char shared_buf[];
   scalar_t* x = (scalar_t*)shared_buf; // scalar_t[DD]
   scalar_t* min_dists = &x[D_2]; // scalar_t[NUM_THREADS]
-  long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
+  int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
 
   const size_t n = blockIdx.y; // index of batch element.
   const size_t i = blockIdx.x; // index of point within batch element.
@@ -147,14 +147,14 @@ template <typename scalar_t>
 __global__ void NearestNeighborKernelD3(
     const scalar_t* __restrict__ points1,
     const scalar_t* __restrict__ points2,
-    long* __restrict__ idx,
+    int64_t* __restrict__ idx,
     const size_t N,
     const size_t P1,
     const size_t P2) {
   // Single shared memory buffer which is split and cast to different types.
   extern __shared__ char shared_buf[];
   scalar_t* min_dists = (scalar_t*)shared_buf; // scalar_t[NUM_THREADS]
-  long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
+  int64_t* min_idxs = (int64_t*)&min_dists[blockDim.x]; // int64_t[NUM_THREADS]
 
   const size_t D = 3;
   const size_t n = blockIdx.y; // index of batch element.
@@ -230,12 +230,12 @@ at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2) {
     // Use the specialized kernel for D=3.
     AT_DISPATCH_FLOATING_TYPES(p1.type(), "nearest_neighbor_v3_cuda", ([&] {
       size_t shared_size = threads * sizeof(size_t) +
-          threads * sizeof(long);
+          threads * sizeof(int64_t);
       NearestNeighborKernelD3<scalar_t>
           <<<blocks, threads, shared_size>>>(
              p1.data_ptr<scalar_t>(),
              p2.data_ptr<scalar_t>(),
-             idx.data_ptr<long>(),
+             idx.data_ptr<int64_t>(),
              N,
              P1,
              P2);
@@ -248,11 +248,11 @@ at::Tensor NearestNeighborIdxCuda(at::Tensor p1, at::Tensor p2) {
       // need to be rounded to the next even size.
       size_t D_2 = D + (D % 2);
       size_t shared_size = (D_2 + threads) * sizeof(size_t);
-      shared_size += threads * sizeof(long);
+      shared_size += threads * sizeof(int64_t);
       NearestNeighborKernel<scalar_t><<<blocks, threads, shared_size>>>(
           p1.data_ptr<scalar_t>(),
           p2.data_ptr<scalar_t>(),
-          idx.data_ptr<long>(),
+          idx.data_ptr<int64_t>(),
           N,
           P1,
           P2,
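
(A side note on why the sizeof change above matters; the check below is illustrative and not part of the diff. The shared-memory buffer is sized on the host, so a platform-dependent sizeof(long) would disagree with the kernel's int64_t indices on Windows.)

    #include <cstdint>
    // int64_t is 8 bytes on every platform, while long is 8 bytes on LP64
    // Linux/macOS but only 4 bytes on LLP64 Windows; sizing shared memory with
    // sizeof(long) would therefore under-allocate the index buffer under MSVC.
    static_assert(sizeof(int64_t) == 8, "fixed-width type, same on all platforms");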

pytorch3d/csrc/rasterize_meshes/geometry_utils.cuh

@@ -7,7 +7,11 @@
 #include "float_math.cuh"
 
 // Set epsilon for preventing floating point errors and division by 0.
+#ifdef _MSC_VER
+#define kEpsilon 1e-30f
+#else
 const auto kEpsilon = 1e-30;
+#endif
 
 // Determines whether a point p is on the right side of a 2D line segment
 // given by the end points v0, v1.
@@ -93,7 +97,7 @@ BarycentricCoordsBackward(
     const float2& v2,
     const float3& grad_bary_upstream) {
   const float area = EdgeFunctionForward(v2, v0, v1) + kEpsilon;
-  const float area2 = pow(area, 2.0);
+  const float area2 = pow(area, 2.0f);
   const float e0 = EdgeFunctionForward(p, v1, v2);
   const float e1 = EdgeFunctionForward(p, v2, v0);
   const float e2 = EdgeFunctionForward(p, v0, v1);
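
(As an aside, a hedged alternative sketch, not what this diff does: if the build passed --expt-relaxed-constexpr to nvcc, a typed constexpr constant could serve both host and device code without resorting to a macro.)

    // Hypothetical alternative, assuming nvcc is invoked with
    // --expt-relaxed-constexpr so constexpr variables are usable in device code.
    constexpr float kEpsilonAlt = 1e-30f;

    __host__ __device__ inline float GuardedArea(float area) {
      return area + kEpsilonAlt; // same division-by-zero guard as kEpsilon
    }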

pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp

@@ -7,7 +7,7 @@
 // Given a pixel coordinate 0 <= i < S, convert it to a normalized device
 // coordinate in the range [-1, 1]. The NDC range is divided into S evenly-sized
 // pixels, and assume that each pixel falls in the *center* of its range.
-inline float PixToNdc(const int i, const int S) {
+static float PixToNdc(const int i, const int S) {
   // NDC x-offset + (i * pixel_width + half_pixel_width)
   return -1 + (2 * i + 1.0f) / S;
 }
@@ -74,7 +74,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaiveCpu(
   return std::make_tuple(point_idxs, zbuf, pix_dists);
 }
 
-std::tuple<torch::Tensor, torch::Tensor> RasterizePointsCoarseCpu(
+torch::Tensor RasterizePointsCoarseCpu(
     const torch::Tensor& points,
     const int image_size,
     const float radius,
@@ -140,7 +140,7 @@ std::tuple<torch::Tensor, torch::Tensor> RasterizePointsCoarseCpu(
         bin_y_max = bin_y_min + bin_width;
       }
     }
-  return std::make_tuple(points_per_bin, bin_points);
+  return bin_points;
 }
 
 torch::Tensor RasterizePointsBackwardCpu(
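
(Per item 5 in the summary, the definition above was brought in line with the existing declaration in rasterize_points_cpu.h; that header is not shown in this excerpt, but it presumably reads along the lines of this hypothetical reconstruction.)

    // Presumed declaration in rasterize_points_cpu.h; only the return type
    // differs from the old definition, the parameters stay as shown above.
    torch::Tensor RasterizePointsCoarseCpu(
        const torch::Tensor& points,
        const int image_size,
        const float radius,
        /* ...remaining parameters as in the definition... */);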