CUDA marching_cubes fix
Summary: Fix an inclusive vs. exclusive scan mix-up that was accidentally introduced when the Thrust dependency (`thrust::exclusive_scan`) was removed and the scan was reimplemented with `at::cumsum` (which does an inclusive scan).

This fixes two GitHub-reported issues:
* https://github.com/facebookresearch/pytorch3d/issues/1731
* https://github.com/facebookresearch/pytorch3d/issues/1751

Reviewed By: bottler

Differential Revision: D54605545

fbshipit-source-id: da9e92f3f8a9a35f7b7191428d0b9a9ca03e0d4d
commit 7566530669
parent a27755db41
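Background sketch (not part of the commit): thrust::exclusive_scan puts at index i the sum of everything before i, while at::cumsum is an inclusive scan that also folds element i in. The standalone LibTorch snippet below, with made-up counts, shows the difference and the shifted-view trick the fix relies on.

// Standalone LibTorch sketch (not part of the commit) with made-up counts.
#include <torch/torch.h>
#include <iostream>

int main() {
  auto counts = torch::tensor({3, 0, 12, 6}, torch::kLong);

  // at::cumsum is an inclusive scan: element i includes counts[i].
  auto inclusive = at::cumsum(counts, 0); // [3, 3, 15, 21]

  // thrust::exclusive_scan puts at index i the sum of everything before i.
  // The fix emulates it by running cumsum into a right-shifted view of a
  // buffer that is one element longer than the input.
  auto exclusive = torch::zeros({counts.size(0) + 1}, torch::kLong);
  auto shifted = exclusive.index(
      {torch::indexing::Slice(1, torch::indexing::None)});
  at::cumsum_out(shifted, counts, 0, torch::kLong);
  // exclusive == [0, 3, 3, 15, 21]; its last element is the grand total.

  std::cout << inclusive << "\n" << exclusive << std::endl;
  return 0;
}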
@@ -382,6 +382,44 @@ __global__ void GenerateFacesKernel(
   } // end for grid-strided kernel
 }
 
+// ATen/Torch does not have an exclusive-scan operator. Additionally, in the
+// code below we need to get the "total number of items to work on" after
+// a scan, which with an inclusive-scan would simply be the value of the last
+// element in the tensor.
+//
+// This utility function hits two birds with one stone, by running
+// an inclusive-scan into a right-shifted view of a tensor that's
+// allocated to be one element bigger than the input tensor.
+//
+// Note: the return tensor is `int64_t` per element, even if the input
+// tensor is only 32-bit. Also, the return tensor is one element bigger
+// than the input one.
+//
+// A secondary optional argument is an output argument that gets the
+// value of the last element of the return tensor (because you almost
+// always need this CPU-side right after this function anyway).
+static at::Tensor ExclusiveScanAndTotal(
+    const at::Tensor& inTensor,
+    int64_t* optTotal = nullptr) {
+  const auto inSize = inTensor.sizes()[0];
+  auto retTensor = at::zeros({inSize + 1}, at::kLong).to(inTensor.device());
+
+  using at::indexing::None;
+  using at::indexing::Slice;
+  auto rightShiftedView = retTensor.index({Slice(1, None)});
+
+  // Do an (inclusive-scan) cumulative sum into the view that's
+  // shifted one element to the right...
+  at::cumsum_out(rightShiftedView, inTensor, 0, at::kLong);
+
+  if (optTotal) {
+    *optTotal = retTensor[inSize].cpu().item<int64_t>();
+  }
+
+  // ...so that the not-shifted tensor holds the exclusive-scan.
+  return retTensor;
+}
+
 // Entrance for marching cubes cuda extension. Marching Cubes is an algorithm to
 // create triangle meshes from an implicit function (one of the form f(x, y, z)
 // = 0). It works by iteratively checking a grid of cubes superimposed over a
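A hypothetical call site, with invented values, illustrating the helper's contract described in the comments above (assumes ExclusiveScanAndTotal is in scope and a CUDA device is available):

// Hypothetical check of ExclusiveScanAndTotal's contract; the counts are
// made up. Not part of the commit.
#include <torch/torch.h>

void SketchExclusiveScanAndTotal() {
  auto counts = torch::tensor({3, 0, 12, 6}, torch::kInt).to(torch::kCUDA);

  int64_t total = 0;
  auto offsets = ExclusiveScanAndTotal(counts, &total);

  // offsets has one more element than counts, is int64, and holds the
  // exclusive scan [0, 3, 3, 15, 21]; total mirrors the last element (21),
  // already on the CPU so it can size the output buffers.
  TORCH_CHECK(offsets.size(0) == counts.size(0) + 1);
  TORCH_CHECK(offsets.scalar_type() == torch::kLong);
  TORCH_CHECK(total == 21);
}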
@@ -444,20 +482,18 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> MarchingCubesCuda(
   using at::indexing::Slice;
 
   auto d_voxelVerts =
-      at::zeros({numVoxels + 1}, at::TensorOptions().dtype(at::kInt))
+      at::zeros({numVoxels}, at::TensorOptions().dtype(at::kInt))
           .to(vol.device());
-  auto d_voxelVerts_ = d_voxelVerts.index({Slice(1, None)});
   auto d_voxelOccupied =
-      at::zeros({numVoxels + 1}, at::TensorOptions().dtype(at::kInt))
+      at::zeros({numVoxels}, at::TensorOptions().dtype(at::kInt))
          .to(vol.device());
-  auto d_voxelOccupied_ = d_voxelOccupied.index({Slice(1, None)});
 
   // Execute "ClassifyVoxelKernel" kernel to precompute
   // two arrays - d_voxelOccupied and d_voxelVertices to global memory,
   // which stores the occupancy state and number of voxel vertices per voxel.
   ClassifyVoxelKernel<<<grid, threads, 0, stream>>>(
-      d_voxelVerts_.packed_accessor32<int, 1, at::RestrictPtrTraits>(),
-      d_voxelOccupied_.packed_accessor32<int, 1, at::RestrictPtrTraits>(),
+      d_voxelVerts.packed_accessor32<int, 1, at::RestrictPtrTraits>(),
+      d_voxelOccupied.packed_accessor32<int, 1, at::RestrictPtrTraits>(),
       vol.packed_accessor32<float, 3, at::RestrictPtrTraits>(),
       isolevel);
   AT_CUDA_CHECK(cudaGetLastError());
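The arguments above use PyTorch's packed-accessor pattern. Below is a stripped-down, illustrative kernel (not the real ClassifyVoxelKernel; the name and body are hypothetical) showing how a 1-D int buffer like d_voxelVerts is received and filled directly, now that no right-shifted view is involved.

#include <ATen/ATen.h>

// Illustrative only: a grid-strided kernel that writes one entry per voxel
// straight into a full-size int buffer passed via packed_accessor32, the
// same calling pattern ClassifyVoxelKernel uses above.
__global__ void FillPerVoxelCountsKernel(
    at::PackedTensorAccessor32<int, 1, at::RestrictPtrTraits> voxelVerts,
    int numVoxels) {
  for (int v = blockIdx.x * blockDim.x + threadIdx.x; v < numVoxels;
       v += gridDim.x * blockDim.x) {
    voxelVerts[v] = 0; // placeholder; the real kernel derives counts from vol
  }
}
// Launched like the kernels in this file, e.g.:
//   FillPerVoxelCountsKernel<<<grid, threads, 0, stream>>>(
//       d_voxelVerts.packed_accessor32<int, 1, at::RestrictPtrTraits>(),
//       numVoxels);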
@@ -467,12 +503,9 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> MarchingCubesCuda(
   // count for voxels in the grid and compute the number of active voxels.
   // If the number of active voxels is 0, return zero tensor for verts and
   // faces.
-  auto d_voxelOccupiedScan = at::cumsum(d_voxelOccupied, 0);
-  auto d_voxelOccupiedScan_ = d_voxelOccupiedScan.index({Slice(1, None)});
-
-  // number of active voxels
-  int64_t activeVoxels = d_voxelOccupiedScan[numVoxels].cpu().item<int64_t>();
+  int64_t activeVoxels = 0;
+  auto d_voxelOccupiedScan =
+      ExclusiveScanAndTotal(d_voxelOccupied, &activeVoxels);
 
   const int device_id = vol.device().index();
   auto opt = at::TensorOptions().dtype(at::kInt).device(at::kCUDA, device_id);
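A worked example with invented occupancy flags of what d_voxelOccupiedScan now holds and why its total is the active-voxel count (assumes the ExclusiveScanAndTotal helper added above):

// Worked example with invented occupancy flags; not part of the commit.
#include <torch/torch.h>

void SketchOccupancyScan() {
  auto occupied = torch::tensor({1, 0, 1, 1}, torch::kInt);

  int64_t activeVoxels = 0;
  auto scan = ExclusiveScanAndTotal(occupied, &activeVoxels);
  // scan == [0, 1, 1, 2, 3] and activeVoxels == 3.

  // The compaction step below can then place occupied voxel v at compacted
  // slot scan[v]: voxel 0 -> slot 0, voxel 2 -> slot 1, voxel 3 -> slot 2.
}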
@@ -487,24 +520,21 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> MarchingCubesCuda(
     return std::make_tuple(verts, faces, ids);
   }
 
-  // Execute "CompactVoxelsKernel" kernel to compress voxels for accleration.
+  // Execute "CompactVoxelsKernel" kernel to compress voxels for acceleration.
   // This allows us to run triangle generation on only the occupied voxels.
   auto d_compVoxelArray = at::zeros({activeVoxels}, opt);
   CompactVoxelsKernel<<<grid, threads, 0, stream>>>(
       d_compVoxelArray.packed_accessor32<int, 1, at::RestrictPtrTraits>(),
       d_voxelOccupied.packed_accessor32<int, 1, at::RestrictPtrTraits>(),
-      d_voxelOccupiedScan_
+      d_voxelOccupiedScan
           .packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
       numVoxels);
   AT_CUDA_CHECK(cudaGetLastError());
   cudaDeviceSynchronize();
 
   // Scan d_voxelVerts array to generate offsets of vertices for each voxel
-  auto d_voxelVertsScan = at::cumsum(d_voxelVerts, 0);
-  auto d_voxelVertsScan_ = d_voxelVertsScan.index({Slice(1, None)});
-
-  // total number of vertices
-  int64_t totalVerts = d_voxelVertsScan[numVoxels].cpu().item<int64_t>();
+  int64_t totalVerts = 0;
+  auto d_voxelVertsScan = ExclusiveScanAndTotal(d_voxelVerts, &totalVerts);
 
   // Execute "GenerateFacesKernel" kernel
   // This runs only on the occupied voxels.
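To see how the two scans combine downstream, here is a plain host-side sketch with invented counts (not the actual kernel logic): compacted index c maps back to voxel v = d_compVoxelArray[c], and that voxel presumably writes its vertices starting at row d_voxelVertsScan[v], with totalVerts sizing the verts tensor.

// Host-side sketch with invented numbers; not the actual kernel code.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> counts = {3, 0, 12, 6};            // vertices per voxel
  std::vector<int> compVoxelArray = {0, 2, 3};        // occupied voxels, compacted
  std::vector<int64_t> vertsScan = {0, 3, 3, 15, 21}; // exclusive scan of counts
  int64_t totalVerts = vertsScan.back();              // 21 rows in the verts tensor

  for (std::size_t c = 0; c < compVoxelArray.size(); ++c) {
    int v = compVoxelArray[c];
    // Voxel v's vertices occupy output rows [vertsScan[v], vertsScan[v] + counts[v]).
    std::printf("compacted %zu -> voxel %d, rows %lld..%lld of %lld\n",
                c, v, (long long)vertsScan[v],
                (long long)(vertsScan[v] + counts[v]), (long long)totalVerts);
  }
  return 0;
}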
@@ -524,7 +554,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> MarchingCubesCuda(
       faces.packed_accessor<int64_t, 2, at::RestrictPtrTraits>(),
       ids.packed_accessor<int64_t, 1, at::RestrictPtrTraits>(),
       d_compVoxelArray.packed_accessor32<int, 1, at::RestrictPtrTraits>(),
-      d_voxelVertsScan_.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
+      d_voxelVertsScan.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
       activeVoxels,
       vol.packed_accessor32<float, 3, at::RestrictPtrTraits>(),
       faceTable.packed_accessor32<int, 2, at::RestrictPtrTraits>(),