Make CUDA tensors contiguous in the host function and remove contiguous checks

Summary:
Update the CUDA kernels to:
- remove the contiguous checks for the grad tensors and for the CPU functions, which use accessors
- for the CUDA implementations, call `.contiguous()` on all tensors in the host function before invoking the kernel
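
For context, the `CHECK_CUDA`, `CHECK_CONTIGUOUS`, and `CHECK_CONTIGUOUS_CUDA` macros referenced throughout the diffs below follow the usual PyTorch C++ extension convention. A sketch of their likely definitions (the real ones live in the project's shared utils header and may differ in exact wording):

#include <torch/extension.h>

#define CHECK_CUDA(x) \
  TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor.")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous.")
#define CHECK_CONTIGUOUS_CUDA(x) \
  CHECK_CUDA(x);                 \
  CHECK_CONTIGUOUS(x)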

Reviewed By: gkioxari

Differential Revision: D21598008

fbshipit-source-id: 9b97bda4582fd4269c8a00999874d4552a1aea2d
Authored by Nikhila Ravi on 2020-05-15 14:58:04 -07:00; committed by Facebook GitHub Bot
parent a8377f1f06
commit 3fef506895
21 changed files with 219 additions and 233 deletions

View File

@ -168,6 +168,8 @@ at::Tensor alphaCompositeCudaForward(
// doubles. Currently, support is for floats only.
alphaCompositeCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
// clang-format off
// As we are using packed accessors here the tensors
// do not need to be made contiguous.
result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@ -211,6 +213,8 @@ std::tuple<at::Tensor, at::Tensor> alphaCompositeCudaBackward(
// doubles. Currently, support is for floats only.
alphaCompositeCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
// clang-format off
// As we are using packed accessors here the tensors
// do not need to be made contiguous.
grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
grad_outputs.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
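
To illustrate the comment added above: a packed accessor copies the tensor's sizes and strides to the device, so indexing through it resolves the right element even when the underlying tensor is not contiguous. A hypothetical kernel (not the PyTorch3D compositing kernel) showing the pattern, with an illustrative host-side launch:

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>

// Hypothetical example: scale a (N, K, H, W) float tensor in place. The
// packed accessor carries sizes and strides, so alphas[n][k][h][w] is
// correct for non-contiguous inputs.
__global__ void ScaleAlphasKernel(
    at::PackedTensorAccessor64<float, 4, at::RestrictPtrTraits> alphas,
    const float scale) {
  const int64_t N = alphas.size(0), K = alphas.size(1);
  const int64_t H = alphas.size(2), W = alphas.size(3);
  const int64_t num = N * K * H * W;
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += (int64_t)gridDim.x * blockDim.x) {
    const int64_t w = i % W;
    const int64_t h = (i / W) % H;
    const int64_t k = (i / (W * H)) % K;
    const int64_t n = i / (W * H * K);
    alphas[n][k][h][w] *= scale;  // strided access handled by the accessor
  }
}

void ScaleAlphasCuda(at::Tensor& alphas, const float scale) {
  const int threads = 256;
  const int blocks = 64;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  ScaleAlphasKernel<<<blocks, threads, 0, stream>>>(
      alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(), scale);
  AT_CUDA_CHECK(cudaGetLastError());
}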

View File

@ -60,18 +60,14 @@ torch::Tensor alphaCompositeForward(
if (features.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
CHECK_CUDA(features);
CHECK_CUDA(alphas);
CHECK_CUDA(points_idx);
return alphaCompositeCudaForward(features, alphas, points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else {
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return alphaCompositeCpuForward(features, alphas, points_idx);
}
}
@ -88,10 +84,10 @@ std::tuple<torch::Tensor, torch::Tensor> alphaCompositeBackward(
if (grad_outputs.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(grad_outputs);
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
CHECK_CUDA(grad_outputs);
CHECK_CUDA(features);
CHECK_CUDA(alphas);
CHECK_CUDA(points_idx);
return alphaCompositeCudaBackward(
grad_outputs, features, alphas, points_idx);
@ -99,11 +95,6 @@ std::tuple<torch::Tensor, torch::Tensor> alphaCompositeBackward(
AT_ERROR("Not compiled with GPU support");
#endif
} else {
CHECK_CONTIGUOUS(grad_outputs);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return alphaCompositeCpuBackward(
grad_outputs, features, alphas, points_idx);
}
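
The other half of the change, calling `.contiguous()` inside the CUDA host functions instead of rejecting non-contiguous inputs, relies on `.contiguous()` being a no-op for tensors that are already contiguous and producing a single copy otherwise. A minimal standalone check of that behaviour (plain LibTorch, nothing project-specific):

#include <torch/torch.h>

int main() {
  auto a = torch::rand({4, 8});
  auto b = a.t();                      // transposed view: non-contiguous
  TORCH_CHECK(a.is_contiguous());
  TORCH_CHECK(!b.is_contiguous());
  auto c = b.contiguous();             // materializes a contiguous copy
  TORCH_CHECK(c.is_contiguous());
  // Already-contiguous tensors are returned as-is: same storage, no copy.
  TORCH_CHECK(a.contiguous().data_ptr() == a.data_ptr());
  return 0;
}

So callers that already pass contiguous tensors pay nothing extra, while non-contiguous inputs are copied once instead of hitting the old CHECK_CONTIGUOUS_CUDA error.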

View File

@ -183,6 +183,8 @@ at::Tensor weightedSumNormCudaForward(
// doubles. Currently, support is for floats only.
// clang-format off
weightedSumNormCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
// As we are using packed accessors here the tensors
// do not need to be made contiguous.
result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@ -227,6 +229,8 @@ std::tuple<at::Tensor, at::Tensor> weightedSumNormCudaBackward(
// doubles. Currently, support is for floats only.
weightedSumNormCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
// clang-format off
// As we are using packed accessors here the tensors
// do not need to be made contiguous.
grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
grad_outputs.packed_accessor64<float, 4, at::RestrictPtrTraits>(),

View File

@ -58,19 +58,15 @@ torch::Tensor weightedSumNormForward(
if (features.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
CHECK_CUDA(features);
CHECK_CUDA(alphas);
CHECK_CUDA(points_idx);
return weightedSumNormCudaForward(features, alphas, points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else {
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumNormCpuForward(features, alphas, points_idx);
}
}
@ -87,10 +83,10 @@ std::tuple<torch::Tensor, torch::Tensor> weightedSumNormBackward(
if (grad_outputs.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(grad_outputs);
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
CHECK_CUDA(grad_outputs);
CHECK_CUDA(features);
CHECK_CUDA(alphas);
CHECK_CUDA(points_idx);
return weightedSumNormCudaBackward(
grad_outputs, features, alphas, points_idx);
@ -98,11 +94,6 @@ std::tuple<torch::Tensor, torch::Tensor> weightedSumNormBackward(
AT_ERROR("Not compiled with GPU support");
#endif
} else {
CHECK_CONTIGUOUS(grad_outputs);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumNormCpuBackward(
grad_outputs, features, alphas, points_idx);
}

View File

@ -142,6 +142,8 @@ at::Tensor weightedSumCudaForward(
// doubles. Currently, support is for floats only.
weightedSumCudaForwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
// clang-format off
// As we are using packed accessors here the tensors
// do not need to be made contiguous.
result.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
@ -185,6 +187,8 @@ std::tuple<at::Tensor, at::Tensor> weightedSumCudaBackward(
// doubles. Currently, support is for floats only.
weightedSumCudaBackwardKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(
// clang-format off
// As we are using packed accessors here the tensors
// do not need to be made contiguous.
grad_features.packed_accessor64<float, 2, at::RestrictPtrTraits>(),
grad_alphas.packed_accessor64<float, 4, at::RestrictPtrTraits>(),
grad_outputs.packed_accessor64<float, 4, at::RestrictPtrTraits>(),

View File

@ -58,18 +58,14 @@ torch::Tensor weightedSumForward(
if (features.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
CHECK_CUDA(features);
CHECK_CUDA(alphas);
CHECK_CUDA(points_idx);
return weightedSumCudaForward(features, alphas, points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else {
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumCpuForward(features, alphas, points_idx);
}
}
@ -86,21 +82,16 @@ std::tuple<torch::Tensor, torch::Tensor> weightedSumBackward(
if (grad_outputs.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(grad_outputs);
CHECK_CONTIGUOUS_CUDA(features);
CHECK_CONTIGUOUS_CUDA(alphas);
CHECK_CONTIGUOUS_CUDA(points_idx);
CHECK_CUDA(grad_outputs);
CHECK_CUDA(features);
CHECK_CUDA(alphas);
CHECK_CUDA(points_idx);
return weightedSumCudaBackward(grad_outputs, features, alphas, points_idx);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else {
CHECK_CONTIGUOUS(grad_outputs);
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(alphas);
CHECK_CONTIGUOUS(points_idx);
return weightedSumCpuBackward(grad_outputs, features, alphas, points_idx);
}
}

View File

@ -239,8 +239,8 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForwardCuda(
AT_DISPATCH_FLOATING_TYPES(
verts.scalar_type(), "face_areas_normals_forward_cuda", ([&] {
FaceAreasNormalsForwardKernel<scalar_t><<<blocks, threads, 0, stream>>>(
verts.data_ptr<scalar_t>(),
faces.data_ptr<int64_t>(),
verts.contiguous().data_ptr<scalar_t>(),
faces.contiguous().data_ptr<int64_t>(),
areas.data_ptr<scalar_t>(),
normals.data_ptr<scalar_t>(),
V,
@ -282,10 +282,10 @@ at::Tensor FaceAreasNormalsBackwardCuda(
// TODO(gkioxari) add AT_DISPATCH_FLOATING_TYPES once atomicAdd supports
// doubles. Currently, support is for floats only.
FaceAreasNormalsBackwardKernel<<<blocks, threads, 0, stream>>>(
grad_areas.data_ptr<float>(),
grad_normals.data_ptr<float>(),
verts.data_ptr<float>(),
faces.data_ptr<int64_t>(),
grad_areas.contiguous().data_ptr<float>(),
grad_normals.contiguous().data_ptr<float>(),
verts.contiguous().data_ptr<float>(),
faces.contiguous().data_ptr<int64_t>(),
grad_verts.data_ptr<float>(),
V,
F);
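
Unlike the compositing kernels above, the kernels in this file take raw pointers, so they index under the assumption of a packed row-major layout; that is why the host code now calls `.contiguous()` before `.data_ptr<T>()`. A hypothetical kernel (not the actual FaceAreasNormals kernel) showing the kind of flat indexing that breaks on strided memory:

#include <cstdint>

// Hypothetical example: per-face centroids from packed (V, 3) verts and
// (F, 3) faces. The 3 * f + d arithmetic is only valid if both buffers are
// contiguous row-major, which the .contiguous() calls above guarantee.
__global__ void FaceCentroidsKernel(
    const float* __restrict__ verts,
    const int64_t* __restrict__ faces,
    float* __restrict__ centroids,
    const int64_t F) {
  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int64_t stride = (int64_t)gridDim.x * blockDim.x;
  for (int64_t f = tid; f < F; f += stride) {
    const int64_t v0 = faces[3 * f + 0];
    const int64_t v1 = faces[3 * f + 1];
    const int64_t v2 = faces[3 * f + 2];
    for (int d = 0; d < 3; ++d) {
      centroids[3 * f + d] =
          (verts[3 * v0 + d] + verts[3 * v1 + d] + verts[3 * v2 + d]) / 3.0f;
    }
  }
}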

View File

@ -47,8 +47,8 @@ std::tuple<at::Tensor, at::Tensor> FaceAreasNormalsForward(
const at::Tensor faces) {
if (verts.is_cuda() && faces.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(verts);
CHECK_CONTIGUOUS_CUDA(faces);
CHECK_CUDA(verts);
CHECK_CUDA(faces);
return FaceAreasNormalsForwardCuda(verts, faces);
#else
AT_ERROR("Not compiled with GPU support.");
@ -65,10 +65,10 @@ at::Tensor FaceAreasNormalsBackward(
const at::Tensor faces) {
if (verts.is_cuda() && faces.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(verts);
CHECK_CONTIGUOUS_CUDA(faces);
CHECK_CONTIGUOUS_CUDA(grad_areas);
CHECK_CONTIGUOUS_CUDA(grad_normals);
CHECK_CUDA(verts);
CHECK_CUDA(faces);
CHECK_CUDA(grad_areas);
CHECK_CUDA(grad_normals);
return FaceAreasNormalsBackwardCuda(grad_areas, grad_normals, verts, faces);
#else
AT_ERROR("Not compiled with GPU support.");

View File

@ -72,8 +72,8 @@ at::Tensor GatherScatterCuda(
}
GatherScatterCudaKernel<<<blocks, threads, 0, stream>>>(
input.data_ptr<float>(),
edges.data_ptr<int64_t>(),
input.contiguous().data_ptr<float>(),
edges.contiguous().data_ptr<int64_t>(),
output.data_ptr<float>(),
directed,
backward,

View File

@ -35,8 +35,8 @@ at::Tensor GatherScatter(
bool backward) {
if (input.is_cuda() && edges.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(input);
CHECK_CONTIGUOUS_CUDA(edges);
CHECK_CUDA(input);
CHECK_CUDA(edges);
return GatherScatterCuda(input, edges, directed, backward);
#else
AT_ERROR("Not compiled with GPU support.");

View File

@ -347,21 +347,21 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdxCuda(
const size_t threads = 256;
const size_t blocks = 256;
if (version == 0) {
AT_DISPATCH_FLOATING_TYPES(p1.scalar_type(), "knn_kernel_cuda", ([&] {
KNearestNeighborKernelV0<scalar_t>
<<<blocks, threads, 0, stream>>>(
p1.data_ptr<scalar_t>(),
p2.data_ptr<scalar_t>(),
lengths1.data_ptr<int64_t>(),
lengths2.data_ptr<int64_t>(),
dists.data_ptr<scalar_t>(),
idxs.data_ptr<int64_t>(),
N,
P1,
P2,
D,
K);
}));
AT_DISPATCH_FLOATING_TYPES(
p1.scalar_type(), "knn_kernel_cuda", ([&] {
KNearestNeighborKernelV0<scalar_t><<<blocks, threads, 0, stream>>>(
p1.contiguous().data_ptr<scalar_t>(),
p2.contiguous().data_ptr<scalar_t>(),
lengths1.contiguous().data_ptr<int64_t>(),
lengths2.contiguous().data_ptr<int64_t>(),
dists.data_ptr<scalar_t>(),
idxs.data_ptr<int64_t>(),
N,
P1,
P2,
D,
K);
}));
} else if (version == 1) {
AT_DISPATCH_FLOATING_TYPES(p1.scalar_type(), "knn_kernel_cuda", ([&] {
DispatchKernel1D<
@ -372,10 +372,10 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdxCuda(
D,
blocks,
threads,
p1.data_ptr<scalar_t>(),
p2.data_ptr<scalar_t>(),
lengths1.data_ptr<int64_t>(),
lengths2.data_ptr<int64_t>(),
p1.contiguous().data_ptr<scalar_t>(),
p2.contiguous().data_ptr<scalar_t>(),
lengths1.contiguous().data_ptr<int64_t>(),
lengths2.contiguous().data_ptr<int64_t>(),
dists.data_ptr<scalar_t>(),
idxs.data_ptr<int64_t>(),
N,
@ -396,10 +396,10 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdxCuda(
K_64,
blocks,
threads,
p1.data_ptr<scalar_t>(),
p2.data_ptr<scalar_t>(),
lengths1.data_ptr<int64_t>(),
lengths2.data_ptr<int64_t>(),
p1.contiguous().data_ptr<scalar_t>(),
p2.contiguous().data_ptr<scalar_t>(),
lengths1.contiguous().data_ptr<int64_t>(),
lengths2.contiguous().data_ptr<int64_t>(),
dists.data_ptr<scalar_t>(),
idxs.data_ptr<int64_t>(),
N,
@ -419,10 +419,10 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdxCuda(
K_64,
blocks,
threads,
p1.data_ptr<scalar_t>(),
p2.data_ptr<scalar_t>(),
lengths1.data_ptr<int64_t>(),
lengths2.data_ptr<int64_t>(),
p1.contiguous().data_ptr<scalar_t>(),
p2.contiguous().data_ptr<scalar_t>(),
lengths1.contiguous().data_ptr<int64_t>(),
lengths2.contiguous().data_ptr<int64_t>(),
dists.data_ptr<scalar_t>(),
idxs.data_ptr<int64_t>(),
N,
@ -525,12 +525,12 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborBackwardCuda(
const int threads = 512;
KNearestNeighborBackwardKernel<<<blocks, threads, 0, stream>>>(
p1.data_ptr<float>(),
p2.data_ptr<float>(),
lengths1.data_ptr<int64_t>(),
lengths2.data_ptr<int64_t>(),
idxs.data_ptr<int64_t>(),
grad_dists.data_ptr<float>(),
p1.contiguous().data_ptr<float>(),
p2.contiguous().data_ptr<float>(),
lengths1.contiguous().data_ptr<int64_t>(),
lengths2.contiguous().data_ptr<int64_t>(),
idxs.contiguous().data_ptr<int64_t>(),
grad_dists.contiguous().data_ptr<float>(),
grad_p1.data_ptr<float>(),
grad_p2.data_ptr<float>(),
N,

View File

@ -56,8 +56,8 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborIdx(
int version) {
if (p1.is_cuda() || p2.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(p1);
CHECK_CONTIGUOUS_CUDA(p2);
CHECK_CUDA(p1);
CHECK_CUDA(p2);
return KNearestNeighborIdxCuda(p1, p2, lengths1, lengths2, K, version);
#else
AT_ERROR("Not compiled with GPU support.");
@ -117,8 +117,8 @@ std::tuple<at::Tensor, at::Tensor> KNearestNeighborBackward(
const at::Tensor& grad_dists) {
if (p1.is_cuda() || p2.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(p1);
CHECK_CONTIGUOUS_CUDA(p2);
CHECK_CUDA(p1);
CHECK_CUDA(p2);
return KNearestNeighborBackwardCuda(
p1, p2, lengths1, lengths2, idxs, grad_dists);
#else

View File

@ -146,8 +146,8 @@ at::Tensor PackedToPaddedCuda(
AT_DISPATCH_FLOATING_TYPES(
inputs_packed.scalar_type(), "packed_to_padded_d1_kernel", ([&] {
PackedToPaddedKernelD1<scalar_t><<<blocks, threads, 0, stream>>>(
inputs_packed.data_ptr<scalar_t>(),
first_idxs.data_ptr<int64_t>(),
inputs_packed.contiguous().data_ptr<scalar_t>(),
first_idxs.contiguous().data_ptr<int64_t>(),
inputs_padded.data_ptr<scalar_t>(),
batch_size,
max_size,
@ -157,8 +157,8 @@ at::Tensor PackedToPaddedCuda(
AT_DISPATCH_FLOATING_TYPES(
inputs_packed.scalar_type(), "packed_to_padded_kernel", ([&] {
PackedToPaddedKernel<scalar_t><<<blocks, threads, 0, stream>>>(
inputs_packed.data_ptr<scalar_t>(),
first_idxs.data_ptr<int64_t>(),
inputs_packed.contiguous().data_ptr<scalar_t>(),
first_idxs.contiguous().data_ptr<int64_t>(),
inputs_padded.data_ptr<scalar_t>(),
batch_size,
max_size,
@ -209,8 +209,8 @@ at::Tensor PaddedToPackedCuda(
AT_DISPATCH_FLOATING_TYPES(
inputs_padded.scalar_type(), "padded_to_packed_d1_kernel", ([&] {
PaddedToPackedKernelD1<scalar_t><<<blocks, threads, 0, stream>>>(
inputs_padded.data_ptr<scalar_t>(),
first_idxs.data_ptr<int64_t>(),
inputs_padded.contiguous().data_ptr<scalar_t>(),
first_idxs.contiguous().data_ptr<int64_t>(),
inputs_packed.data_ptr<scalar_t>(),
batch_size,
max_size,
@ -220,8 +220,8 @@ at::Tensor PaddedToPackedCuda(
AT_DISPATCH_FLOATING_TYPES(
inputs_padded.scalar_type(), "padded_to_packed_kernel", ([&] {
PaddedToPackedKernel<scalar_t><<<blocks, threads, 0, stream>>>(
inputs_padded.data_ptr<scalar_t>(),
first_idxs.data_ptr<int64_t>(),
inputs_padded.contiguous().data_ptr<scalar_t>(),
first_idxs.contiguous().data_ptr<int64_t>(),
inputs_packed.data_ptr<scalar_t>(),
batch_size,
max_size,
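
The launches above all follow the same dispatch pattern: AT_DISPATCH_FLOATING_TYPES instantiates the lambda for float and double (binding `scalar_t`), and the `.contiguous()` calls guarantee the raw pointers index a packed layout. A self-contained sketch of that pattern with a hypothetical kernel:

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>

// Hypothetical example: elementwise square of a tensor of any shape.
template <typename scalar_t>
__global__ void SquareKernel(
    const scalar_t* __restrict__ in,
    scalar_t* __restrict__ out,
    const int64_t n) {
  const int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = in[i] * in[i];
  }
}

at::Tensor SquareCuda(const at::Tensor& input) {
  // Make the input contiguous once in the host function, as in this commit,
  // so the kernel can treat the raw pointer as a packed buffer.
  at::Tensor input_c = input.contiguous();
  at::Tensor output = at::empty_like(input_c);
  const int64_t n = input_c.numel();
  if (n == 0) {
    return output;
  }
  const int threads = 256;
  const int blocks = static_cast<int>((n + threads - 1) / threads);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES(
      input_c.scalar_type(), "square_kernel_cuda", ([&] {
        SquareKernel<scalar_t><<<blocks, threads, 0, stream>>>(
            input_c.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), n);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
  return output;
}

Binding the contiguous tensor to a local (`input_c`) is just for readability here; the inline `x.contiguous().data_ptr<T>()` form used in the diffs is also fine, since the temporary lives until the end of the full launch statement and the CUDA caching allocator only hands its freed block to later allocations on the same stream, which are ordered after the kernel.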

View File

@ -75,8 +75,8 @@ at::Tensor PackedToPadded(
const int64_t max_size) {
if (inputs_packed.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(inputs_packed);
CHECK_CONTIGUOUS_CUDA(first_idxs);
CHECK_CUDA(inputs_packed);
CHECK_CUDA(first_idxs);
return PackedToPaddedCuda(inputs_packed, first_idxs, max_size);
#else
AT_ERROR("Not compiled with GPU support.");
@ -92,8 +92,8 @@ at::Tensor PaddedToPacked(
const int64_t num_inputs) {
if (inputs_padded.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(inputs_padded);
CHECK_CONTIGUOUS_CUDA(first_idxs);
CHECK_CUDA(inputs_padded);
CHECK_CUDA(first_idxs);
return PaddedToPackedCuda(inputs_padded, first_idxs, num_inputs);
#else
AT_ERROR("Not compiled with GPU support.");

View File

@ -144,15 +144,16 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceForwardCuda(
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
PointEdgeForwardKernel<<<blocks, threads, shared_size, stream>>>(
points.data_ptr<float>(),
points_first_idx.data_ptr<int64_t>(),
segms.data_ptr<float>(),
segms_first_idx.data_ptr<int64_t>(),
points.contiguous().data_ptr<float>(),
points_first_idx.contiguous().data_ptr<int64_t>(),
segms.contiguous().data_ptr<float>(),
segms_first_idx.contiguous().data_ptr<int64_t>(),
dists.data_ptr<float>(),
idxs.data_ptr<int64_t>(),
B,
P,
S);
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(dists, idxs);
}
@ -240,10 +241,10 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceBackwardCuda(
const int threads = 512;
PointEdgeBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(),
segms.data_ptr<float>(),
idx_points.data_ptr<int64_t>(),
grad_dists.data_ptr<float>(),
points.contiguous().data_ptr<float>(),
segms.contiguous().data_ptr<float>(),
idx_points.contiguous().data_ptr<int64_t>(),
grad_dists.contiguous().data_ptr<float>(),
grad_points.data_ptr<float>(),
grad_segms.data_ptr<float>(),
P);
@ -386,10 +387,10 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceForwardCuda(
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
EdgePointForwardKernel<<<blocks, threads, shared_size, stream>>>(
points.data_ptr<float>(),
points_first_idx.data_ptr<int64_t>(),
segms.data_ptr<float>(),
segms_first_idx.data_ptr<int64_t>(),
points.contiguous().data_ptr<float>(),
points_first_idx.contiguous().data_ptr<int64_t>(),
segms.contiguous().data_ptr<float>(),
segms_first_idx.contiguous().data_ptr<int64_t>(),
dists.data_ptr<float>(),
idxs.data_ptr<int64_t>(),
B,
@ -478,10 +479,10 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceBackwardCuda(
const int threads = 512;
EdgePointBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(),
segms.data_ptr<float>(),
idx_segms.data_ptr<int64_t>(),
grad_dists.data_ptr<float>(),
points.contiguous().data_ptr<float>(),
segms.contiguous().data_ptr<float>(),
idx_segms.contiguous().data_ptr<int64_t>(),
grad_dists.contiguous().data_ptr<float>(),
grad_points.data_ptr<float>(),
grad_segms.data_ptr<float>(),
S);
@ -550,8 +551,8 @@ at::Tensor PointEdgeArrayDistanceForwardCuda(
const size_t threads = 64;
PointEdgeArrayForwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(),
segms.data_ptr<float>(),
points.contiguous().data_ptr<float>(),
segms.contiguous().data_ptr<float>(),
dists.data_ptr<float>(),
P,
S);
@ -638,9 +639,9 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeArrayDistanceBackwardCuda(
const size_t threads = 64;
PointEdgeArrayBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(),
segms.data_ptr<float>(),
grad_dists.data_ptr<float>(),
points.contiguous().data_ptr<float>(),
segms.contiguous().data_ptr<float>(),
grad_dists.contiguous().data_ptr<float>(),
grad_points.data_ptr<float>(),
grad_segms.data_ptr<float>(),
P,

View File

@ -54,10 +54,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceForward(
const int64_t max_points) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(points_first_idx);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(segms_first_idx);
CHECK_CUDA(points);
CHECK_CUDA(points_first_idx);
CHECK_CUDA(segms);
CHECK_CUDA(segms_first_idx);
return PointEdgeDistanceForwardCuda(
points, points_first_idx, segms, segms_first_idx, max_points);
#else
@ -98,10 +98,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceBackward(
const torch::Tensor& grad_dists) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(idx_points);
CHECK_CONTIGUOUS_CUDA(grad_dists);
CHECK_CUDA(points);
CHECK_CUDA(segms);
CHECK_CUDA(idx_points);
CHECK_CUDA(grad_dists);
return PointEdgeDistanceBackwardCuda(points, segms, idx_points, grad_dists);
#else
AT_ERROR("Not compiled with GPU support.");
@ -158,10 +158,10 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceForward(
const int64_t max_segms) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(points_first_idx);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(segms_first_idx);
CHECK_CUDA(points);
CHECK_CUDA(points_first_idx);
CHECK_CUDA(segms);
CHECK_CUDA(segms_first_idx);
return EdgePointDistanceForwardCuda(
points, points_first_idx, segms, segms_first_idx, max_segms);
#else
@ -202,10 +202,10 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward(
const torch::Tensor& grad_dists) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(idx_segms);
CHECK_CONTIGUOUS_CUDA(grad_dists);
CHECK_CUDA(points);
CHECK_CUDA(segms);
CHECK_CUDA(idx_segms);
CHECK_CUDA(grad_dists);
return EdgePointDistanceBackwardCuda(points, segms, idx_segms, grad_dists);
#else
AT_ERROR("Not compiled with GPU support.");
@ -247,8 +247,8 @@ torch::Tensor PointEdgeArrayDistanceForward(
const torch::Tensor& segms) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CUDA(points);
CHECK_CUDA(segms);
return PointEdgeArrayDistanceForwardCuda(points, segms);
#else
AT_ERROR("Not compiled with GPU support.");
@ -283,9 +283,9 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeArrayDistanceBackward(
const torch::Tensor& grad_dists) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(segms);
CHECK_CONTIGUOUS_CUDA(grad_dists);
CHECK_CUDA(points);
CHECK_CUDA(segms);
CHECK_CUDA(grad_dists);
return PointEdgeArrayDistanceBackwardCuda(points, segms, grad_dists);
#else
AT_ERROR("Not compiled with GPU support.");

View File

@ -145,10 +145,10 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceForwardCuda(
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
PointFaceForwardKernel<<<blocks, threads, shared_size, stream>>>(
points.data_ptr<float>(),
points_first_idx.data_ptr<int64_t>(),
tris.data_ptr<float>(),
tris_first_idx.data_ptr<int64_t>(),
points.contiguous().data_ptr<float>(),
points_first_idx.contiguous().data_ptr<int64_t>(),
tris.contiguous().data_ptr<float>(),
tris_first_idx.contiguous().data_ptr<int64_t>(),
dists.data_ptr<float>(),
idxs.data_ptr<int64_t>(),
B,
@ -249,10 +249,10 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceBackwardCuda(
const int threads = 512;
PointFaceBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(),
tris.data_ptr<float>(),
idx_points.data_ptr<int64_t>(),
grad_dists.data_ptr<float>(),
points.contiguous().data_ptr<float>(),
tris.contiguous().data_ptr<float>(),
idx_points.contiguous().data_ptr<int64_t>(),
grad_dists.contiguous().data_ptr<float>(),
grad_points.data_ptr<float>(),
grad_tris.data_ptr<float>(),
P);
@ -396,10 +396,10 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceForwardCuda(
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
FacePointForwardKernel<<<blocks, threads, shared_size, stream>>>(
points.data_ptr<float>(),
points_first_idx.data_ptr<int64_t>(),
tris.data_ptr<float>(),
tris_first_idx.data_ptr<int64_t>(),
points.contiguous().data_ptr<float>(),
points_first_idx.contiguous().data_ptr<int64_t>(),
tris.contiguous().data_ptr<float>(),
tris_first_idx.contiguous().data_ptr<int64_t>(),
dists.data_ptr<float>(),
idxs.data_ptr<int64_t>(),
B,
@ -501,10 +501,10 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceBackwardCuda(
const int threads = 512;
FacePointBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(),
tris.data_ptr<float>(),
idx_tris.data_ptr<int64_t>(),
grad_dists.data_ptr<float>(),
points.contiguous().data_ptr<float>(),
tris.contiguous().data_ptr<float>(),
idx_tris.contiguous().data_ptr<int64_t>(),
grad_dists.contiguous().data_ptr<float>(),
grad_points.data_ptr<float>(),
grad_tris.data_ptr<float>(),
T);
@ -575,8 +575,8 @@ at::Tensor PointFaceArrayDistanceForwardCuda(
const size_t threads = 64;
PointFaceArrayForwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(),
tris.data_ptr<float>(),
points.contiguous().data_ptr<float>(),
tris.contiguous().data_ptr<float>(),
dists.data_ptr<float>(),
P,
T);
@ -672,9 +672,9 @@ std::tuple<at::Tensor, at::Tensor> PointFaceArrayDistanceBackwardCuda(
const size_t threads = 64;
PointFaceArrayBackwardKernel<<<blocks, threads, 0, stream>>>(
points.data_ptr<float>(),
tris.data_ptr<float>(),
grad_dists.data_ptr<float>(),
points.contiguous().data_ptr<float>(),
tris.contiguous().data_ptr<float>(),
grad_dists.contiguous().data_ptr<float>(),
grad_points.data_ptr<float>(),
grad_tris.data_ptr<float>(),
P,

View File

@ -56,10 +56,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceForward(
const int64_t max_points) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(points_first_idx);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(tris_first_idx);
CHECK_CUDA(points);
CHECK_CUDA(points_first_idx);
CHECK_CUDA(tris);
CHECK_CUDA(tris_first_idx);
return PointFaceDistanceForwardCuda(
points, points_first_idx, tris, tris_first_idx, max_points);
#else
@ -100,10 +100,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceBackward(
const torch::Tensor& grad_dists) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(idx_points);
CHECK_CONTIGUOUS_CUDA(grad_dists);
CHECK_CUDA(points);
CHECK_CUDA(tris);
CHECK_CUDA(idx_points);
CHECK_CUDA(grad_dists);
return PointFaceDistanceBackwardCuda(points, tris, idx_points, grad_dists);
#else
AT_ERROR("Not compiled with GPU support.");
@ -160,10 +160,10 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceForward(
const int64_t max_tris) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(points_first_idx);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(tris_first_idx);
CHECK_CUDA(points);
CHECK_CUDA(points_first_idx);
CHECK_CUDA(tris);
CHECK_CUDA(tris_first_idx);
return FacePointDistanceForwardCuda(
points, points_first_idx, tris, tris_first_idx, max_tris);
#else
@ -204,10 +204,10 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceBackward(
const torch::Tensor& grad_dists) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(idx_tris);
CHECK_CONTIGUOUS_CUDA(grad_dists);
CHECK_CUDA(points);
CHECK_CUDA(tris);
CHECK_CUDA(idx_tris);
CHECK_CUDA(grad_dists);
return FacePointDistanceBackwardCuda(points, tris, idx_tris, grad_dists);
#else
AT_ERROR("Not compiled with GPU support.");
@ -250,8 +250,8 @@ torch::Tensor PointFaceArrayDistanceForward(
const torch::Tensor& tris) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CUDA(points);
CHECK_CUDA(tris);
return PointFaceArrayDistanceForwardCuda(points, tris);
#else
AT_ERROR("Not compiled with GPU support.");
@ -285,9 +285,9 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackward(
const torch::Tensor& grad_dists) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(tris);
CHECK_CONTIGUOUS_CUDA(grad_dists);
CHECK_CUDA(points);
CHECK_CUDA(tris);
CHECK_CUDA(grad_dists);
return PointFaceArrayDistanceBackwardCuda(points, tris, grad_dists);
#else
AT_ERROR("Not compiled with GPU support.");

View File

@ -348,10 +348,10 @@ RasterizeMeshesNaiveCuda(
H,
W,
K,
face_idxs.contiguous().data_ptr<int64_t>(),
zbuf.contiguous().data_ptr<float>(),
pix_dists.contiguous().data_ptr<float>(),
bary.contiguous().data_ptr<float>());
face_idxs.data_ptr<int64_t>(),
zbuf.data_ptr<float>(),
pix_dists.data_ptr<float>(),
bary.data_ptr<float>());
AT_CUDA_CHECK(cudaGetLastError());
return std::make_tuple(face_idxs, zbuf, bary, pix_dists);
@ -530,7 +530,7 @@ at::Tensor RasterizeMeshesBackwardCuda(
grad_zbuf.contiguous().data_ptr<float>(),
grad_bary.contiguous().data_ptr<float>(),
grad_dists.contiguous().data_ptr<float>(),
grad_face_verts.contiguous().data_ptr<float>());
grad_face_verts.data_ptr<float>());
AT_CUDA_CHECK(cudaGetLastError());
return grad_face_verts;
@ -727,8 +727,8 @@ at::Tensor RasterizeMeshesCoarseCuda(
bin_size,
chunk_size,
M,
faces_per_bin.contiguous().data_ptr<int32_t>(),
bin_faces.contiguous().data_ptr<int32_t>());
faces_per_bin.data_ptr<int32_t>(),
bin_faces.data_ptr<int32_t>());
AT_CUDA_CHECK(cudaGetLastError());
return bin_faces;
@ -897,10 +897,10 @@ RasterizeMeshesFineCuda(
H,
W,
K,
face_idxs.contiguous().data_ptr<int64_t>(),
zbuf.contiguous().data_ptr<float>(),
pix_dists.contiguous().data_ptr<float>(),
bary.contiguous().data_ptr<float>());
face_idxs.data_ptr<int64_t>(),
zbuf.data_ptr<float>(),
pix_dists.data_ptr<float>(),
bary.data_ptr<float>());
return std::make_tuple(face_idxs, zbuf, bary, pix_dists);
}
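
Note that in the rasterizer the `.contiguous()` calls are being removed from output tensors rather than added to inputs: `face_idxs`, `zbuf`, `pix_dists`, `bary`, `grad_face_verts`, `faces_per_bin`, and `bin_faces` are allocated inside these host functions (outside the hunks shown), so they are already contiguous and the calls were no-ops. A quick standalone check, with made-up shapes for illustration:

#include <ATen/ATen.h>

void CheckFreshOutputsAreContiguous() {
  // Freshly allocated tensors are contiguous by construction.
  auto zbuf = at::zeros({2, 64, 64, 8}, at::kFloat);
  auto face_idxs = at::full({2, 64, 64, 8}, -1, at::kLong);
  TORCH_CHECK(zbuf.is_contiguous());
  TORCH_CHECK(face_idxs.is_contiguous());
  // .contiguous() on such a tensor returns it unchanged: same storage.
  TORCH_CHECK(zbuf.contiguous().data_ptr() == zbuf.data_ptr());
}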

View File

@ -96,9 +96,9 @@ RasterizeMeshesNaive(
// TODO: Better type checking.
if (face_verts.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(face_verts);
CHECK_CONTIGUOUS_CUDA(mesh_to_face_first_idx);
CHECK_CONTIGUOUS_CUDA(num_faces_per_mesh);
CHECK_CUDA(face_verts);
CHECK_CUDA(mesh_to_face_first_idx);
CHECK_CUDA(num_faces_per_mesh);
return RasterizeMeshesNaiveCuda(
face_verts,
mesh_to_face_first_idx,
@ -179,11 +179,11 @@ torch::Tensor RasterizeMeshesBackward(
const bool perspective_correct) {
if (face_verts.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(face_verts);
CHECK_CONTIGUOUS_CUDA(pix_to_face);
CHECK_CONTIGUOUS_CUDA(grad_zbuf);
CHECK_CONTIGUOUS_CUDA(grad_bary);
CHECK_CONTIGUOUS_CUDA(grad_dists);
CHECK_CUDA(face_verts);
CHECK_CUDA(pix_to_face);
CHECK_CUDA(grad_zbuf);
CHECK_CUDA(grad_bary);
CHECK_CUDA(grad_dists);
return RasterizeMeshesBackwardCuda(
face_verts,
pix_to_face,
@ -260,9 +260,9 @@ torch::Tensor RasterizeMeshesCoarse(
const int max_faces_per_bin) {
if (face_verts.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(face_verts);
CHECK_CONTIGUOUS_CUDA(mesh_to_face_first_idx);
CHECK_CONTIGUOUS_CUDA(num_faces_per_mesh);
CHECK_CUDA(face_verts);
CHECK_CUDA(mesh_to_face_first_idx);
CHECK_CUDA(num_faces_per_mesh);
return RasterizeMeshesCoarseCuda(
face_verts,
mesh_to_face_first_idx,
@ -359,8 +359,8 @@ RasterizeMeshesFine(
const bool cull_backfaces) {
if (face_verts.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(face_verts);
CHECK_CONTIGUOUS_CUDA(bin_faces);
CHECK_CUDA(face_verts);
CHECK_CUDA(bin_faces);
return RasterizeMeshesFineCuda(
face_verts,
bin_faces,

View File

@ -67,9 +67,9 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaive(
if (points.is_cuda() && cloud_to_packed_first_idx.is_cuda() &&
num_points_per_cloud.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(cloud_to_packed_first_idx);
CHECK_CONTIGUOUS_CUDA(num_points_per_cloud);
CHECK_CUDA(points);
CHECK_CUDA(cloud_to_packed_first_idx);
CHECK_CUDA(num_points_per_cloud);
return RasterizePointsNaiveCuda(
points,
cloud_to_packed_first_idx,
@ -144,9 +144,9 @@ torch::Tensor RasterizePointsCoarse(
if (points.is_cuda() && cloud_to_packed_first_idx.is_cuda() &&
num_points_per_cloud.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(cloud_to_packed_first_idx);
CHECK_CONTIGUOUS_CUDA(num_points_per_cloud);
CHECK_CUDA(points);
CHECK_CUDA(cloud_to_packed_first_idx);
CHECK_CUDA(num_points_per_cloud);
return RasterizePointsCoarseCuda(
points,
cloud_to_packed_first_idx,
@ -215,8 +215,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsFine(
const int points_per_pixel) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(bin_points);
CHECK_CUDA(points);
CHECK_CUDA(bin_points);
return RasterizePointsFineCuda(
points, bin_points, image_size, radius, bin_size, points_per_pixel);
#else
@ -266,10 +266,10 @@ torch::Tensor RasterizePointsBackward(
const torch::Tensor& grad_dists) {
if (points.is_cuda()) {
#ifdef WITH_CUDA
CHECK_CONTIGUOUS_CUDA(points);
CHECK_CONTIGUOUS_CUDA(idxs);
CHECK_CONTIGUOUS_CUDA(grad_zbuf);
CHECK_CONTIGUOUS_CUDA(grad_dists);
CHECK_CUDA(points);
CHECK_CUDA(idxs);
CHECK_CUDA(grad_zbuf);
CHECK_CUDA(grad_dists);
return RasterizePointsBackwardCuda(points, idxs, grad_zbuf, grad_dists);
#else
AT_ERROR("Not compiled with GPU support");