From 62a2031dd4456b4b0cae2caa24e950c221eddc58 Mon Sep 17 00:00:00 2001
From: bottler
Date: Thu, 27 Mar 2025 05:28:03 -0700
Subject: [PATCH] Revert "Fix CUDA kernel index data type in
 vision/fair/pytorch3d/pytorch3d/csrc/compositing/alpha_composite.cu +10"

This reverts commit 3987612062f3db5dba609df3552768dcd97b410f.
---
 pytorch3d/csrc/compositing/alpha_composite.cu | 12 ++++++------
 .../csrc/compositing/norm_weighted_sum.cu     | 12 ++++++------
 pytorch3d/csrc/compositing/weighted_sum.cu    | 12 ++++++------
 .../csrc/gather_scatter/gather_scatter.cu     |  6 +++---
 .../interp_face_attrs/interp_face_attrs.cu    |  8 ++++----
 pytorch3d/csrc/point_mesh/point_mesh_cuda.cu  | 18 +++++++++---------
 pytorch3d/csrc/rasterize_coarse/bitmask.cuh   |  2 +-
 .../csrc/rasterize_coarse/rasterize_coarse.cu | 14 +++++++-------
 .../csrc/rasterize_meshes/rasterize_meshes.cu | 12 ++++++------
 .../csrc/rasterize_points/rasterize_points.cu | 12 ++++++------
 10 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/pytorch3d/csrc/compositing/alpha_composite.cu b/pytorch3d/csrc/compositing/alpha_composite.cu
index 2bfe79dc..b5d512e8 100644
--- a/pytorch3d/csrc/compositing/alpha_composite.cu
+++ b/pytorch3d/csrc/compositing/alpha_composite.cu
@@ -33,11 +33,11 @@ __global__ void alphaCompositeCudaForwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Iterate over each feature in each pixel
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
@@ -83,11 +83,11 @@ __global__ void alphaCompositeCudaBackwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
diff --git a/pytorch3d/csrc/compositing/norm_weighted_sum.cu b/pytorch3d/csrc/compositing/norm_weighted_sum.cu
index e21617d2..455bdb7f 100644
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.cu
@@ -33,11 +33,11 @@ __global__ void weightedSumNormCudaForwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
@@ -96,11 +96,11 @@ __global__ void weightedSumNormCudaBackwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * W * H;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
diff --git a/pytorch3d/csrc/compositing/weighted_sum.cu b/pytorch3d/csrc/compositing/weighted_sum.cu
index 2e0904e7..125688a1 100644
--- a/pytorch3d/csrc/compositing/weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/weighted_sum.cu
@@ -31,11 +31,11 @@ __global__ void weightedSumCudaForwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
@@ -78,11 +78,11 @@ __global__ void weightedSumCudaBackwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Iterate over each pixel to compute the contribution to the
   // gradient for the features and weights
diff --git a/pytorch3d/csrc/gather_scatter/gather_scatter.cu b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
index d4affd4b..1ec1a6f2 100644
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.cu
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
@@ -20,14 +20,14 @@ __global__ void GatherScatterCudaKernel(
     const size_t V,
     const size_t D,
     const size_t E) {
-  const auto tid = threadIdx.x;
+  const int tid = threadIdx.x;
 
   // Reverse the vertex order if backward.
   const int v0_idx = backward ? 1 : 0;
   const int v1_idx = backward ? 0 : 1;
 
   // Edges are split evenly across the blocks.
-  for (auto e = blockIdx.x; e < E; e += gridDim.x) {
+  for (int e = blockIdx.x; e < E; e += gridDim.x) {
     // Get indices of vertices which form the edge.
     const int64_t v0 = edges[2 * e + v0_idx];
     const int64_t v1 = edges[2 * e + v1_idx];
@@ -35,7 +35,7 @@ __global__ void GatherScatterCudaKernel(
     // Split vertex features evenly across threads.
     // This implementation will be quite wasteful when D<128 since there will be
     // a lot of threads doing nothing.
-    for (auto d = tid; d < D; d += blockDim.x) {
+    for (int d = tid; d < D; d += blockDim.x) {
       const float val = input[v1 * D + d];
      float* address = output + v0 * D + d;
       atomicAdd(address, val);
diff --git a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
index 8fe292ae..6bd2a80d 100644
--- a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
+++ b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
@@ -20,8 +20,8 @@ __global__ void InterpFaceAttrsForwardKernel(
     const size_t P,
     const size_t F,
     const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
   for (int pd = tid; pd < P * D; pd += num_threads) {
     const int p = pd / D;
     const int d = pd % D;
@@ -93,8 +93,8 @@ __global__ void InterpFaceAttrsBackwardKernel(
     const size_t P,
     const size_t F,
     const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
   for (int pd = tid; pd < P * D; pd += num_threads) {
     const int p = pd / D;
     const int d = pd % D;
diff --git a/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu b/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
index 606ec9e6..3788d405 100644
--- a/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
@@ -110,7 +110,7 @@ __global__ void DistanceForwardKernel(
   __syncthreads();
 
   // Perform reduction in shared memory.
-  for (auto s = blockDim.x / 2; s > 32; s >>= 1) {
+  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
     if (tid < s) {
       if (min_dists[tid] > min_dists[tid + s]) {
         min_dists[tid] = min_dists[tid + s];
@@ -502,8 +502,8 @@ __global__ void PointFaceArrayForwardKernel(
   const float3* tris_f3 = (float3*)tris;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * T; t_i += num_threads) {
     const int t = t_i / P; // segment index.
@@ -576,8 +576,8 @@ __global__ void PointFaceArrayBackwardKernel(
   const float3* tris_f3 = (float3*)tris;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * T; t_i += num_threads) {
     const int t = t_i / P; // triangle index.
@@ -683,8 +683,8 @@ __global__ void PointEdgeArrayForwardKernel(
   float3* segms_f3 = (float3*)segms;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * S; t_i += num_threads) {
     const int s = t_i / P; // segment index.
@@ -752,8 +752,8 @@ __global__ void PointEdgeArrayBackwardKernel(
   float3* segms_f3 = (float3*)segms;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * S; t_i += num_threads) {
     const int s = t_i / P; // segment index.
diff --git a/pytorch3d/csrc/rasterize_coarse/bitmask.cuh b/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
index 729650ba..6ffcac87 100644
--- a/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
+++ b/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
@@ -25,7 +25,7 @@ class BitMask {
 
   // Use all threads in the current block to clear all bits of this BitMask
   __device__ void block_clear() {
-    for (auto i = threadIdx.x; i < H * W * D; i += blockDim.x) {
+    for (int i = threadIdx.x; i < H * W * D; i += blockDim.x) {
       data[i] = 0;
     }
     __syncthreads();
diff --git a/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu b/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu
index f093ef05..aed57d21 100644
--- a/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu
+++ b/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu
@@ -23,8 +23,8 @@ __global__ void TriangleBoundingBoxKernel(
     const float blur_radius,
     float* bboxes, // (4, F)
     bool* skip_face) { // (F,)
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = blockDim.x * gridDim.x;
   const float sqrt_radius = sqrt(blur_radius);
   for (int f = tid; f < F; f += num_threads) {
     const float v0x = face_verts[f * 9 + 0 * 3 + 0];
@@ -56,8 +56,8 @@ __global__ void PointBoundingBoxKernel(
     const int P,
     float* bboxes, // (4, P)
     bool* skip_points) {
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = blockDim.x * gridDim.x;
   for (int p = tid; p < P; p += num_threads) {
     const float x = points[p * 3 + 0];
     const float y = points[p * 3 + 1];
@@ -113,7 +113,7 @@ __global__ void RasterizeCoarseCudaKernel(
   const int chunks_per_batch = 1 + (E - 1) / chunk_size;
   const int num_chunks = N * chunks_per_batch;
 
-  for (auto chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
+  for (int chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
     const int batch_idx = chunk / chunks_per_batch; // batch index
     const int chunk_idx = chunk % chunks_per_batch;
     const int elem_chunk_start_idx = chunk_idx * chunk_size;
@@ -123,7 +123,7 @@ __global__ void RasterizeCoarseCudaKernel(
     const int64_t elem_stop_idx = elem_start_idx + elems_per_batch[batch_idx];
 
     // Have each thread handle a different face within the chunk
-    for (auto e = threadIdx.x; e < chunk_size; e += blockDim.x) {
+    for (int e = threadIdx.x; e < chunk_size; e += blockDim.x) {
       const int e_idx = elem_chunk_start_idx + e;
 
       // Check that we are still within the same element of the batch
@@ -170,7 +170,7 @@ __global__ void RasterizeCoarseCudaKernel(
     // Now we have processed every elem in the current chunk. We need to
     // count the number of elems in each bin so we can write the indices
     // out to global memory. We have each thread handle a different bin.
-    for (auto byx = threadIdx.x; byx < num_bins_y * num_bins_x;
+    for (int byx = threadIdx.x; byx < num_bins_y * num_bins_x;
          byx += blockDim.x) {
       const int by = byx / num_bins_x;
       const int bx = byx % num_bins_x;
diff --git a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
index 28c546c6..9dd3e266 100644
--- a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
+++ b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
@@ -260,8 +260,8 @@ __global__ void RasterizeMeshesNaiveCudaKernel(
     float* pix_dists,
     float* bary) {
   // Simple version: One thread per output pixel
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   for (int i = tid; i < N * H * W; i += num_threads) {
     // Convert linear index to 3D index
@@ -446,8 +446,8 @@ __global__ void RasterizeMeshesBackwardCudaKernel(
 
   // Parallelize over each pixel in images of
   // size H * W, for each image in the batch of size N.
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < N * H * W; t_i += num_threads) {
     // Convert linear index to 3D index
@@ -650,8 +650,8 @@ __global__ void RasterizeMeshesFineCudaKernel(
     ) {
   // This can be more than H * W if H or W are not divisible by bin_size.
   int num_pixels = N * BH * BW * bin_size * bin_size;
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
     // Convert linear index into bin and pixel indices. We make the within
diff --git a/pytorch3d/csrc/rasterize_points/rasterize_points.cu b/pytorch3d/csrc/rasterize_points/rasterize_points.cu
index 20bf0de7..5b18d833 100644
--- a/pytorch3d/csrc/rasterize_points/rasterize_points.cu
+++ b/pytorch3d/csrc/rasterize_points/rasterize_points.cu
@@ -97,8 +97,8 @@ __global__ void RasterizePointsNaiveCudaKernel(
     float* zbuf, // (N, H, W, K)
     float* pix_dists) { // (N, H, W, K)
   // Simple version: One thread per output pixel
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
   for (int i = tid; i < N * H * W; i += num_threads) {
     // Convert linear index to 3D index
     const int n = i / (H * W); // Batch index
@@ -237,8 +237,8 @@ __global__ void RasterizePointsFineCudaKernel(
     float* pix_dists) { // (N, H, W, K)
   // This can be more than H * W if H or W are not divisible by bin_size.
   const int num_pixels = N * BH * BW * bin_size * bin_size;
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
     // Convert linear index into bin and pixel indices. We make the within
@@ -376,8 +376,8 @@ __global__ void RasterizePointsBackwardCudaKernel(
     float* grad_points) { // (P, 3)
   // Parallelized over each of K points per pixel, for each pixel in images of
   // size H * W, for each image in the batch of size N.
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = tid; i < N * H * W * K; i += num_threads) {
     // const int n = i / (H * W * K); // batch index (not needed).
     const int yxk = i % (H * W * K);
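Every hunk above touches the same 1-D grid-stride indexing pattern, switching the thread bookkeeping back from `auto` to `int` (the built-in `blockIdx`/`blockDim`/`gridDim` fields are `unsigned int`, so `auto` deduces unsigned types there). The sketch below is a minimal, standalone CUDA illustration of that pattern; the kernel name, body, and launch shape are hypothetical and are not part of PyTorch3D or of this patch.

#include <cuda_runtime.h>

// Hypothetical kernel, for illustration only: the same grid-stride indexing
// style as the kernels in the hunks above (int tid / int num_threads / strided
// for-loop over a flattened range).
__global__ void GridStrideScaleKernel(
    const float* __restrict__ in,
    float* __restrict__ out,
    const float alpha,
    const int64_t num_elems) {
  // blockDim.x etc. are unsigned int; with `auto` these would deduce unsigned
  // types, while the reverted style keeps them as plain int.
  const int num_threads = gridDim.x * blockDim.x;
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  // Each thread strides over the flattened range. An int loop index assumes
  // the element count stays below INT_MAX.
  for (int i = tid; i < num_elems; i += num_threads) {
    out[i] = alpha * in[i];
  }
}

int main() {
  const int64_t n = 1 << 20;
  float* in = nullptr;
  float* out = nullptr;
  cudaMalloc((void**)&in, n * sizeof(float));
  cudaMalloc((void**)&out, n * sizeof(float));

  // Launch shape mirrors the 1-D kernels in the diff: a fixed thread count per
  // block, with the grid-stride loop covering any remainder.
  GridStrideScaleKernel<<<256, 128>>>(in, out, 2.0f, n);
  cudaDeviceSynchronize();

  cudaFree(in);
  cudaFree(out);
  return 0;
}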