mirror of
https://github.com/facebookresearch/pytorch3d.git
synced 2026-03-03 18:55:59 +08:00
Cuda updates
Summary:
Updates to:
- enable CUDA kernel launches on any GPU (not just the default)
- add CUDA and contiguous checks for all kernels
- add checks to ensure all tensors are on the same device
- add error reporting in the CUDA kernels
- run CUDA tests on a random device, not just the default

Reviewed By: jcjohnson, gkioxari
Differential Revision: D21215280
fbshipit-source-id: 1bedc9fe6c35e9e920bdc4d78ed12865b1005519
This commit is contained in:
committed by
Facebook GitHub Bot
parent
c9267ab7af
commit
c3d636dc8c
@@ -1,6 +1,8 @@
|
||||
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <algorithm>
|
||||
#include <list>
|
||||
#include <queue>
|
||||
@@ -103,26 +105,45 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceForwardCuda(
|
||||
const at::Tensor& segms,
|
||||
const at::Tensor& segms_first_idx,
|
||||
const int64_t max_points) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1},
|
||||
points_first_idx_t{points_first_idx, "points_first_idx", 2},
|
||||
segms_t{segms, "segms", 3},
|
||||
segms_first_idx_t{segms_first_idx, "segms_first_idx", 4};
|
||||
at::CheckedFrom c = "PointEdgeDistanceForwardCuda";
|
||||
at::checkAllSameGPU(
|
||||
c, {points_t, points_first_idx_t, segms_t, segms_first_idx_t});
|
||||
at::checkAllSameType(c, {points_t, segms_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t S = segms.size(0);
|
||||
const int64_t B = points_first_idx.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(segms.size(1) == 2) && (segms.size(2) == 3),
|
||||
"segms must be of shape Sx2x3");
|
||||
AT_ASSERTM(segms_first_idx.size(0) == B);
|
||||
TORCH_CHECK(segms_first_idx.size(0) == B);
|
||||
|
||||
// clang-format off
|
||||
at::Tensor dists = at::zeros({P,}, points.options());
|
||||
at::Tensor idxs = at::zeros({P,}, points_first_idx.options());
|
||||
// clang-format on
|
||||
|
||||
if (dists.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(dists, idxs);
|
||||
}
|
||||
|
||||
const int threads = 128;
|
||||
const dim3 blocks(max_points, B);
|
||||
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
|
||||
|
||||
PointEdgeForwardKernel<<<blocks, threads, shared_size>>>(
|
||||
PointEdgeForwardKernel<<<blocks, threads, shared_size, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
points_first_idx.data_ptr<int64_t>(),
|
||||
segms.data_ptr<float>(),
|
||||
@@ -132,7 +153,7 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceForwardCuda(
|
||||
B,
|
||||
P,
|
||||
S);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(dists, idxs);
|
||||
}
|
||||
|
||||
@@ -183,25 +204,42 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceBackwardCuda(
|
||||
const at::Tensor& segms,
|
||||
const at::Tensor& idx_points,
|
||||
const at::Tensor& grad_dists) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1},
|
||||
idx_points_t{idx_points, "idx_points", 2}, segms_t{segms, "segms", 3},
|
||||
grad_dists_t{grad_dists, "grad_dists", 4};
|
||||
at::CheckedFrom c = "PointEdgeDistanceBackwardCuda";
|
||||
at::checkAllSameGPU(c, {points_t, idx_points_t, segms_t, grad_dists_t});
|
||||
at::checkAllSameType(c, {points_t, segms_t, grad_dists_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t S = segms.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(segms.size(1) == 2) && (segms.size(2) == 3),
|
||||
"segms must be of shape Sx2x3");
|
||||
AT_ASSERTM(idx_points.size(0) == P);
|
||||
AT_ASSERTM(grad_dists.size(0) == P);
|
||||
TORCH_CHECK(idx_points.size(0) == P);
|
||||
TORCH_CHECK(grad_dists.size(0) == P);
|
||||
|
||||
// clang-format off
|
||||
at::Tensor grad_points = at::zeros({P, 3}, points.options());
|
||||
at::Tensor grad_segms = at::zeros({S, 2, 3}, segms.options());
|
||||
// clang-format on
|
||||
|
||||
if (grad_points.numel() == 0 || grad_segms.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_segms);
|
||||
}
|
||||
|
||||
const int blocks = 64;
|
||||
const int threads = 512;
|
||||
|
||||
PointEdgeBackwardKernel<<<blocks, threads>>>(
|
||||
PointEdgeBackwardKernel<<<blocks, threads, 0, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
segms.data_ptr<float>(),
|
||||
idx_points.data_ptr<int64_t>(),
|
||||
@@ -210,6 +248,7 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeDistanceBackwardCuda(
|
||||
grad_segms.data_ptr<float>(),
|
||||
P);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_segms);
|
||||
}
|
||||
|
||||
@@ -308,26 +347,45 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceForwardCuda(
|
||||
const at::Tensor& segms,
|
||||
const at::Tensor& segms_first_idx,
|
||||
const int64_t max_segms) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1},
|
||||
points_first_idx_t{points_first_idx, "points_first_idx", 2},
|
||||
segms_t{segms, "segms", 3},
|
||||
segms_first_idx_t{segms_first_idx, "segms_first_idx", 4};
|
||||
at::CheckedFrom c = "EdgePointDistanceForwardCuda";
|
||||
at::checkAllSameGPU(
|
||||
c, {points_t, points_first_idx_t, segms_t, segms_first_idx_t});
|
||||
at::checkAllSameType(c, {points_t, segms_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t S = segms.size(0);
|
||||
const int64_t B = points_first_idx.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(segms.size(1) == 2) && (segms.size(2) == 3),
|
||||
"segms must be of shape Sx2x3");
|
||||
AT_ASSERTM(segms_first_idx.size(0) == B);
|
||||
TORCH_CHECK(segms_first_idx.size(0) == B);
|
||||
|
||||
// clang-format off
|
||||
at::Tensor dists = at::zeros({S,}, segms.options());
|
||||
at::Tensor idxs = at::zeros({S,}, segms_first_idx.options());
|
||||
// clang-format on
|
||||
|
||||
if (dists.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(dists, idxs);
|
||||
}
|
||||
|
||||
const int threads = 128;
|
||||
const dim3 blocks(max_segms, B);
|
||||
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
|
||||
|
||||
EdgePointForwardKernel<<<blocks, threads, shared_size>>>(
|
||||
EdgePointForwardKernel<<<blocks, threads, shared_size, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
points_first_idx.data_ptr<int64_t>(),
|
||||
segms.data_ptr<float>(),
|
||||
@@ -337,7 +395,7 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceForwardCuda(
|
||||
B,
|
||||
P,
|
||||
S);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(dists, idxs);
|
||||
}
|
||||
|
||||
@@ -389,15 +447,27 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceBackwardCuda(
|
||||
const at::Tensor& segms,
|
||||
const at::Tensor& idx_segms,
|
||||
const at::Tensor& grad_dists) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1},
|
||||
idx_segms_t{idx_segms, "idx_segms", 2}, segms_t{segms, "segms", 3},
|
||||
grad_dists_t{grad_dists, "grad_dists", 4};
|
||||
// Caller label handed to the at::check* calls below. This hunk belongs to
// EdgePointDistanceBackwardCuda, so the label must carry that name rather
// than the copy-pasted PointEdge one.
at::CheckedFrom c = "EdgePointDistanceBackwardCuda";
|
||||
at::checkAllSameGPU(c, {points_t, idx_segms_t, segms_t, grad_dists_t});
|
||||
at::checkAllSameType(c, {points_t, segms_t, grad_dists_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t S = segms.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(segms.size(1) == 2) && (segms.size(2) == 3),
|
||||
"segms must be of shape Sx2x3");
|
||||
AT_ASSERTM(idx_segms.size(0) == S);
|
||||
AT_ASSERTM(grad_dists.size(0) == S);
|
||||
TORCH_CHECK(idx_segms.size(0) == S);
|
||||
TORCH_CHECK(grad_dists.size(0) == S);
|
||||
|
||||
// clang-format off
|
||||
at::Tensor grad_points = at::zeros({P, 3}, points.options());
|
||||
@@ -407,7 +477,7 @@ std::tuple<at::Tensor, at::Tensor> EdgePointDistanceBackwardCuda(
|
||||
const int blocks = 64;
|
||||
const int threads = 512;
|
||||
|
||||
EdgePointBackwardKernel<<<blocks, threads>>>(
|
||||
EdgePointBackwardKernel<<<blocks, threads, 0, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
segms.data_ptr<float>(),
|
||||
idx_segms.data_ptr<int64_t>(),
|
||||
@@ -451,26 +521,42 @@ __global__ void PointEdgeArrayForwardKernel(
|
||||
at::Tensor PointEdgeArrayDistanceForwardCuda(
|
||||
const at::Tensor& points,
|
||||
const at::Tensor& segms) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1}, segms_t{segms, "segms", 2};
|
||||
at::CheckedFrom c = "PointEdgeArrayDistanceForwardCuda";
|
||||
at::checkAllSameGPU(c, {points_t, segms_t});
|
||||
at::checkAllSameType(c, {points_t, segms_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t S = segms.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(segms.size(1) == 2) && (segms.size(2) == 3),
|
||||
"segms must be of shape Sx2x3");
|
||||
|
||||
at::Tensor dists = at::zeros({P, S}, points.options());
|
||||
|
||||
if (dists.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return dists;
|
||||
}
|
||||
|
||||
const size_t blocks = 1024;
|
||||
const size_t threads = 64;
|
||||
|
||||
PointEdgeArrayForwardKernel<<<blocks, threads>>>(
|
||||
PointEdgeArrayForwardKernel<<<blocks, threads, 0, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
segms.data_ptr<float>(),
|
||||
dists.data_ptr<float>(),
|
||||
P,
|
||||
S);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return dists;
|
||||
}
|
||||
|
||||
@@ -520,22 +606,38 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeArrayDistanceBackwardCuda(
|
||||
const at::Tensor& points,
|
||||
const at::Tensor& segms,
|
||||
const at::Tensor& grad_dists) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1}, segms_t{segms, "segms", 2},
|
||||
grad_dists_t{grad_dists, "grad_dists", 3};
|
||||
at::CheckedFrom c = "PointEdgeArrayDistanceBackwardCuda";
|
||||
at::checkAllSameGPU(c, {points_t, segms_t, grad_dists_t});
|
||||
at::checkAllSameType(c, {points_t, segms_t, grad_dists_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t S = segms.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(segms.size(1) == 2) && (segms.size(2) == 3),
|
||||
"segms must be of shape Sx2x3");
|
||||
AT_ASSERTM((grad_dists.size(0) == P) && (grad_dists.size(1) == S));
|
||||
TORCH_CHECK((grad_dists.size(0) == P) && (grad_dists.size(1) == S));
|
||||
|
||||
at::Tensor grad_points = at::zeros({P, 3}, points.options());
|
||||
at::Tensor grad_segms = at::zeros({S, 2, 3}, segms.options());
|
||||
|
||||
if (grad_points.numel() == 0 || grad_segms.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_segms);
|
||||
}
|
||||
|
||||
const size_t blocks = 1024;
|
||||
const size_t threads = 64;
|
||||
|
||||
PointEdgeArrayBackwardKernel<<<blocks, threads>>>(
|
||||
PointEdgeArrayBackwardKernel<<<blocks, threads, 0, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
segms.data_ptr<float>(),
|
||||
grad_dists.data_ptr<float>(),
|
||||
@@ -543,6 +645,6 @@ std::tuple<at::Tensor, at::Tensor> PointEdgeArrayDistanceBackwardCuda(
|
||||
grad_segms.data_ptr<float>(),
|
||||
P,
|
||||
S);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_segms);
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <torch/extension.h>
|
||||
#include <cstdio>
|
||||
#include <tuple>
|
||||
#include "utils/pytorch3d_cutils.h"
|
||||
|
||||
// ****************************************************************************
|
||||
// * PointEdgeDistance *
|
||||
@@ -53,6 +54,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceForward(
|
||||
const int64_t max_points) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(points_first_idx);
|
||||
CHECK_CONTIGUOUS_CUDA(segms);
|
||||
CHECK_CONTIGUOUS_CUDA(segms_first_idx);
|
||||
return PointEdgeDistanceForwardCuda(
|
||||
points, points_first_idx, segms, segms_first_idx, max_points);
|
||||
#else
|
||||
@@ -93,6 +98,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeDistanceBackward(
|
||||
const torch::Tensor& grad_dists) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(segms);
|
||||
CHECK_CONTIGUOUS_CUDA(idx_points);
|
||||
CHECK_CONTIGUOUS_CUDA(grad_dists);
|
||||
return PointEdgeDistanceBackwardCuda(points, segms, idx_points, grad_dists);
|
||||
#else
|
||||
AT_ERROR("Not compiled with GPU support.");
|
||||
@@ -149,6 +158,10 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceForward(
|
||||
const int64_t max_segms) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(points_first_idx);
|
||||
CHECK_CONTIGUOUS_CUDA(segms);
|
||||
CHECK_CONTIGUOUS_CUDA(segms_first_idx);
|
||||
return EdgePointDistanceForwardCuda(
|
||||
points, points_first_idx, segms, segms_first_idx, max_segms);
|
||||
#else
|
||||
@@ -189,6 +202,10 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward(
|
||||
const torch::Tensor& grad_dists) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(segms);
|
||||
CHECK_CONTIGUOUS_CUDA(idx_segms);
|
||||
CHECK_CONTIGUOUS_CUDA(grad_dists);
|
||||
return EdgePointDistanceBackwardCuda(points, segms, idx_segms, grad_dists);
|
||||
#else
|
||||
AT_ERROR("Not compiled with GPU support.");
|
||||
@@ -220,7 +237,6 @@ std::tuple<torch::Tensor, torch::Tensor> EdgePointDistanceBackward(
|
||||
// will require for the forward pass 5.8G of memory to store dists.
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
|
||||
torch::Tensor PointEdgeArrayDistanceForwardCuda(
|
||||
const torch::Tensor& points,
|
||||
const torch::Tensor& segms);
|
||||
@@ -231,6 +247,8 @@ torch::Tensor PointEdgeArrayDistanceForward(
|
||||
const torch::Tensor& segms) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(segms);
|
||||
return PointEdgeArrayDistanceForwardCuda(points, segms);
|
||||
#else
|
||||
AT_ERROR("Not compiled with GPU support.");
|
||||
@@ -265,6 +283,9 @@ std::tuple<torch::Tensor, torch::Tensor> PointEdgeArrayDistanceBackward(
|
||||
const torch::Tensor& grad_dists) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(segms);
|
||||
CHECK_CONTIGUOUS_CUDA(grad_dists);
|
||||
return PointEdgeArrayDistanceBackwardCuda(points, segms, grad_dists);
|
||||
#else
|
||||
AT_ERROR("Not compiled with GPU support.");
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <algorithm>
|
||||
#include <list>
|
||||
#include <queue>
|
||||
@@ -104,26 +106,45 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceForwardCuda(
|
||||
const at::Tensor& tris,
|
||||
const at::Tensor& tris_first_idx,
|
||||
const int64_t max_points) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1},
|
||||
points_first_idx_t{points_first_idx, "points_first_idx", 2},
|
||||
tris_t{tris, "tris", 3},
|
||||
tris_first_idx_t{tris_first_idx, "tris_first_idx", 4};
|
||||
at::CheckedFrom c = "PointFaceDistanceForwardCuda";
|
||||
at::checkAllSameGPU(
|
||||
c, {points_t, points_first_idx_t, tris_t, tris_first_idx_t});
|
||||
at::checkAllSameType(c, {points_t, tris_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t T = tris.size(0);
|
||||
const int64_t B = points_first_idx.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(tris.size(1) == 3) && (tris.size(2) == 3),
|
||||
"tris must be of shape Tx3x3");
|
||||
AT_ASSERTM(tris_first_idx.size(0) == B);
|
||||
TORCH_CHECK(tris_first_idx.size(0) == B);
|
||||
|
||||
// clang-format off
|
||||
at::Tensor dists = at::zeros({P,}, points.options());
|
||||
at::Tensor idxs = at::zeros({P,}, points_first_idx.options());
|
||||
// clang-format on
|
||||
|
||||
if (dists.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(dists, idxs);
|
||||
}
|
||||
|
||||
const int threads = 128;
|
||||
const dim3 blocks(max_points, B);
|
||||
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
|
||||
|
||||
PointFaceForwardKernel<<<blocks, threads, shared_size>>>(
|
||||
PointFaceForwardKernel<<<blocks, threads, shared_size, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
points_first_idx.data_ptr<int64_t>(),
|
||||
tris.data_ptr<float>(),
|
||||
@@ -134,6 +155,7 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceForwardCuda(
|
||||
P,
|
||||
T);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(dists, idxs);
|
||||
}
|
||||
|
||||
@@ -191,25 +213,42 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceBackwardCuda(
|
||||
const at::Tensor& tris,
|
||||
const at::Tensor& idx_points,
|
||||
const at::Tensor& grad_dists) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1},
|
||||
idx_points_t{idx_points, "idx_points", 2}, tris_t{tris, "tris", 3},
|
||||
grad_dists_t{grad_dists, "grad_dists", 4};
|
||||
at::CheckedFrom c = "PointFaceDistanceBackwardCuda";
|
||||
at::checkAllSameGPU(c, {points_t, idx_points_t, tris_t, grad_dists_t});
|
||||
at::checkAllSameType(c, {points_t, tris_t, grad_dists_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t T = tris.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(tris.size(1) == 3) && (tris.size(2) == 3),
|
||||
"tris must be of shape Tx3x3");
|
||||
AT_ASSERTM(idx_points.size(0) == P);
|
||||
AT_ASSERTM(grad_dists.size(0) == P);
|
||||
TORCH_CHECK(idx_points.size(0) == P);
|
||||
TORCH_CHECK(grad_dists.size(0) == P);
|
||||
|
||||
// clang-format off
|
||||
at::Tensor grad_points = at::zeros({P, 3}, points.options());
|
||||
at::Tensor grad_tris = at::zeros({T, 3, 3}, tris.options());
|
||||
// clang-format on
|
||||
|
||||
if (grad_points.numel() == 0 || grad_tris.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_tris);
|
||||
}
|
||||
|
||||
const int blocks = 64;
|
||||
const int threads = 512;
|
||||
|
||||
PointFaceBackwardKernel<<<blocks, threads>>>(
|
||||
PointFaceBackwardKernel<<<blocks, threads, 0, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
tris.data_ptr<float>(),
|
||||
idx_points.data_ptr<int64_t>(),
|
||||
@@ -218,6 +257,7 @@ std::tuple<at::Tensor, at::Tensor> PointFaceDistanceBackwardCuda(
|
||||
grad_tris.data_ptr<float>(),
|
||||
P);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_tris);
|
||||
}
|
||||
|
||||
@@ -317,26 +357,45 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceForwardCuda(
|
||||
const at::Tensor& tris,
|
||||
const at::Tensor& tris_first_idx,
|
||||
const int64_t max_tris) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1},
|
||||
points_first_idx_t{points_first_idx, "points_first_idx", 2},
|
||||
tris_t{tris, "tris", 3},
|
||||
tris_first_idx_t{tris_first_idx, "tris_first_idx", 4};
|
||||
at::CheckedFrom c = "FacePointDistanceForwardCuda";
|
||||
at::checkAllSameGPU(
|
||||
c, {points_t, points_first_idx_t, tris_t, tris_first_idx_t});
|
||||
at::checkAllSameType(c, {points_t, tris_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t T = tris.size(0);
|
||||
const int64_t B = points_first_idx.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(tris.size(1) == 3) && (tris.size(2) == 3),
|
||||
"tris must be of shape Tx3x3");
|
||||
AT_ASSERTM(tris_first_idx.size(0) == B);
|
||||
TORCH_CHECK(tris_first_idx.size(0) == B);
|
||||
|
||||
// clang-format off
|
||||
at::Tensor dists = at::zeros({T,}, tris.options());
|
||||
at::Tensor idxs = at::zeros({T,}, tris_first_idx.options());
|
||||
// clang-format on
|
||||
|
||||
if (dists.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(dists, idxs);
|
||||
}
|
||||
|
||||
const int threads = 128;
|
||||
const dim3 blocks(max_tris, B);
|
||||
size_t shared_size = threads * sizeof(size_t) + threads * sizeof(int64_t);
|
||||
|
||||
FacePointForwardKernel<<<blocks, threads, shared_size>>>(
|
||||
FacePointForwardKernel<<<blocks, threads, shared_size, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
points_first_idx.data_ptr<int64_t>(),
|
||||
tris.data_ptr<float>(),
|
||||
@@ -347,6 +406,7 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceForwardCuda(
|
||||
P,
|
||||
T);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(dists, idxs);
|
||||
}
|
||||
|
||||
@@ -405,25 +465,42 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceBackwardCuda(
|
||||
const at::Tensor& tris,
|
||||
const at::Tensor& idx_tris,
|
||||
const at::Tensor& grad_dists) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1},
|
||||
idx_tris_t{idx_tris, "idx_tris", 2}, tris_t{tris, "tris", 3},
|
||||
grad_dists_t{grad_dists, "grad_dists", 4};
|
||||
at::CheckedFrom c = "FacePointDistanceBackwardCuda";
|
||||
at::checkAllSameGPU(c, {points_t, idx_tris_t, tris_t, grad_dists_t});
|
||||
at::checkAllSameType(c, {points_t, tris_t, grad_dists_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t T = tris.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(tris.size(1) == 3) && (tris.size(2) == 3),
|
||||
"tris must be of shape Tx3x3");
|
||||
AT_ASSERTM(idx_tris.size(0) == T);
|
||||
AT_ASSERTM(grad_dists.size(0) == T);
|
||||
TORCH_CHECK(idx_tris.size(0) == T);
|
||||
TORCH_CHECK(grad_dists.size(0) == T);
|
||||
|
||||
// clang-format off
|
||||
at::Tensor grad_points = at::zeros({P, 3}, points.options());
|
||||
at::Tensor grad_tris = at::zeros({T, 3, 3}, tris.options());
|
||||
// clang-format on
|
||||
|
||||
if (grad_points.numel() == 0 || grad_tris.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_tris);
|
||||
}
|
||||
|
||||
const int blocks = 64;
|
||||
const int threads = 512;
|
||||
|
||||
FacePointBackwardKernel<<<blocks, threads>>>(
|
||||
FacePointBackwardKernel<<<blocks, threads, 0, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
tris.data_ptr<float>(),
|
||||
idx_tris.data_ptr<int64_t>(),
|
||||
@@ -432,6 +509,7 @@ std::tuple<at::Tensor, at::Tensor> FacePointDistanceBackwardCuda(
|
||||
grad_tris.data_ptr<float>(),
|
||||
T);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_tris);
|
||||
}
|
||||
|
||||
@@ -468,26 +546,42 @@ __global__ void PointFaceArrayForwardKernel(
|
||||
at::Tensor PointFaceArrayDistanceForwardCuda(
|
||||
const at::Tensor& points,
|
||||
const at::Tensor& tris) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1}, tris_t{tris, "tris", 2};
|
||||
at::CheckedFrom c = "PointFaceArrayDistanceForwardCuda";
|
||||
at::checkAllSameGPU(c, {points_t, tris_t});
|
||||
at::checkAllSameType(c, {points_t, tris_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t T = tris.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(tris.size(1) == 3) && (tris.size(2) == 3),
|
||||
"tris must be of shape Tx3x3");
|
||||
|
||||
at::Tensor dists = at::zeros({P, T}, points.options());
|
||||
|
||||
if (dists.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return dists;
|
||||
}
|
||||
|
||||
const size_t blocks = 1024;
|
||||
const size_t threads = 64;
|
||||
|
||||
PointFaceArrayForwardKernel<<<blocks, threads>>>(
|
||||
PointFaceArrayForwardKernel<<<blocks, threads, 0, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
tris.data_ptr<float>(),
|
||||
dists.data_ptr<float>(),
|
||||
P,
|
||||
T);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return dists;
|
||||
}
|
||||
|
||||
@@ -546,22 +640,38 @@ std::tuple<at::Tensor, at::Tensor> PointFaceArrayDistanceBackwardCuda(
|
||||
const at::Tensor& points,
|
||||
const at::Tensor& tris,
|
||||
const at::Tensor& grad_dists) {
|
||||
// Check inputs are on the same device
|
||||
at::TensorArg points_t{points, "points", 1}, tris_t{tris, "tris", 2},
|
||||
grad_dists_t{grad_dists, "grad_dists", 3};
|
||||
at::CheckedFrom c = "PointFaceArrayDistanceBackwardCuda";
|
||||
at::checkAllSameGPU(c, {points_t, tris_t, grad_dists_t});
|
||||
at::checkAllSameType(c, {points_t, tris_t, grad_dists_t});
|
||||
|
||||
// Set the device for the kernel launch based on the device of the input
|
||||
at::cuda::CUDAGuard device_guard(points.device());
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t P = points.size(0);
|
||||
const int64_t T = tris.size(0);
|
||||
|
||||
AT_ASSERTM(points.size(1) == 3, "points must be of shape Px3");
|
||||
AT_ASSERTM(
|
||||
TORCH_CHECK(points.size(1) == 3, "points must be of shape Px3");
|
||||
TORCH_CHECK(
|
||||
(tris.size(1) == 3) && (tris.size(2) == 3),
|
||||
"tris must be of shape Tx3x3");
|
||||
AT_ASSERTM((grad_dists.size(0) == P) && (grad_dists.size(1) == T));
|
||||
TORCH_CHECK((grad_dists.size(0) == P) && (grad_dists.size(1) == T));
|
||||
|
||||
at::Tensor grad_points = at::zeros({P, 3}, points.options());
|
||||
at::Tensor grad_tris = at::zeros({T, 3, 3}, tris.options());
|
||||
|
||||
if (grad_points.numel() == 0 || grad_tris.numel() == 0) {
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_tris);
|
||||
}
|
||||
|
||||
const size_t blocks = 1024;
|
||||
const size_t threads = 64;
|
||||
|
||||
PointFaceArrayBackwardKernel<<<blocks, threads>>>(
|
||||
PointFaceArrayBackwardKernel<<<blocks, threads, 0, stream>>>(
|
||||
points.data_ptr<float>(),
|
||||
tris.data_ptr<float>(),
|
||||
grad_dists.data_ptr<float>(),
|
||||
@@ -570,5 +680,6 @@ std::tuple<at::Tensor, at::Tensor> PointFaceArrayDistanceBackwardCuda(
|
||||
P,
|
||||
T);
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
return std::make_tuple(grad_points, grad_tris);
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <torch/extension.h>
|
||||
#include <cstdio>
|
||||
#include <tuple>
|
||||
#include "utils/pytorch3d_cutils.h"
|
||||
|
||||
// ****************************************************************************
|
||||
// * PointFaceDistance *
|
||||
@@ -55,6 +56,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceForward(
|
||||
const int64_t max_points) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(points_first_idx);
|
||||
CHECK_CONTIGUOUS_CUDA(tris);
|
||||
CHECK_CONTIGUOUS_CUDA(tris_first_idx);
|
||||
return PointFaceDistanceForwardCuda(
|
||||
points, points_first_idx, tris, tris_first_idx, max_points);
|
||||
#else
|
||||
@@ -95,6 +100,10 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceDistanceBackward(
|
||||
const torch::Tensor& grad_dists) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(tris);
|
||||
CHECK_CONTIGUOUS_CUDA(idx_points);
|
||||
CHECK_CONTIGUOUS_CUDA(grad_dists);
|
||||
return PointFaceDistanceBackwardCuda(points, tris, idx_points, grad_dists);
|
||||
#else
|
||||
AT_ERROR("Not compiled with GPU support.");
|
||||
@@ -151,6 +160,10 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceForward(
|
||||
const int64_t max_tris) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(points_first_idx);
|
||||
CHECK_CONTIGUOUS_CUDA(tris);
|
||||
CHECK_CONTIGUOUS_CUDA(tris_first_idx);
|
||||
return FacePointDistanceForwardCuda(
|
||||
points, points_first_idx, tris, tris_first_idx, max_tris);
|
||||
#else
|
||||
@@ -191,6 +204,10 @@ std::tuple<torch::Tensor, torch::Tensor> FacePointDistanceBackward(
|
||||
const torch::Tensor& grad_dists) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(tris);
|
||||
CHECK_CONTIGUOUS_CUDA(idx_tris);
|
||||
CHECK_CONTIGUOUS_CUDA(grad_dists);
|
||||
return FacePointDistanceBackwardCuda(points, tris, idx_tris, grad_dists);
|
||||
#else
|
||||
AT_ERROR("Not compiled with GPU support.");
|
||||
@@ -233,6 +250,8 @@ torch::Tensor PointFaceArrayDistanceForward(
|
||||
const torch::Tensor& tris) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(tris);
|
||||
return PointFaceArrayDistanceForwardCuda(points, tris);
|
||||
#else
|
||||
AT_ERROR("Not compiled with GPU support.");
|
||||
@@ -254,7 +273,6 @@ torch::Tensor PointFaceArrayDistanceForward(
|
||||
//
|
||||
|
||||
#ifdef WITH_CUDA
|
||||
|
||||
std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackwardCuda(
|
||||
const torch::Tensor& points,
|
||||
const torch::Tensor& tris,
|
||||
@@ -267,6 +285,9 @@ std::tuple<torch::Tensor, torch::Tensor> PointFaceArrayDistanceBackward(
|
||||
const torch::Tensor& grad_dists) {
|
||||
if (points.is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
CHECK_CONTIGUOUS_CUDA(points);
|
||||
CHECK_CONTIGUOUS_CUDA(tris);
|
||||
CHECK_CONTIGUOUS_CUDA(grad_dists);
|
||||
return PointFaceArrayDistanceBackwardCuda(points, tris, grad_dists);
|
||||
#else
|
||||
AT_ERROR("Not compiled with GPU support.");
|
||||
|
||||
Reference in New Issue
Block a user