From 62a2031dd4456b4b0cae2caa24e950c221eddc58 Mon Sep 17 00:00:00 2001
From: bottler
Date: Thu, 27 Mar 2025 05:28:03 -0700
Subject: [PATCH] Revert "Fix CUDA kernel index data type in
 vision/fair/pytorch3d/pytorch3d/csrc/compositing/alpha_composite.cu +10"

This reverts commit 3987612062f3db5dba609df3552768dcd97b410f.
---
 pytorch3d/csrc/compositing/alpha_composite.cu | 12 ++++++------
 .../csrc/compositing/norm_weighted_sum.cu     | 12 ++++++------
 pytorch3d/csrc/compositing/weighted_sum.cu    | 12 ++++++------
 .../csrc/gather_scatter/gather_scatter.cu     |  6 +++---
 .../interp_face_attrs/interp_face_attrs.cu    |  8 ++++----
 pytorch3d/csrc/point_mesh/point_mesh_cuda.cu  | 18 +++++++++---------
 pytorch3d/csrc/rasterize_coarse/bitmask.cuh   |  2 +-
 .../csrc/rasterize_coarse/rasterize_coarse.cu | 14 +++++++-------
 .../csrc/rasterize_meshes/rasterize_meshes.cu | 12 ++++++------
 .../csrc/rasterize_points/rasterize_points.cu | 12 ++++++------
 10 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/pytorch3d/csrc/compositing/alpha_composite.cu b/pytorch3d/csrc/compositing/alpha_composite.cu
index 2bfe79dc..b5d512e8 100644
--- a/pytorch3d/csrc/compositing/alpha_composite.cu
+++ b/pytorch3d/csrc/compositing/alpha_composite.cu
@@ -33,11 +33,11 @@ __global__ void alphaCompositeCudaForwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Iterate over each feature in each pixel
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
@@ -83,11 +83,11 @@ __global__ void alphaCompositeCudaBackwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
diff --git a/pytorch3d/csrc/compositing/norm_weighted_sum.cu b/pytorch3d/csrc/compositing/norm_weighted_sum.cu
index e21617d2..455bdb7f 100644
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.cu
@@ -33,11 +33,11 @@ __global__ void weightedSumNormCudaForwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
@@ -96,11 +96,11 @@ __global__ void weightedSumNormCudaBackwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * W * H;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
diff --git a/pytorch3d/csrc/compositing/weighted_sum.cu b/pytorch3d/csrc/compositing/weighted_sum.cu
index 2e0904e7..125688a1 100644
--- a/pytorch3d/csrc/compositing/weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/weighted_sum.cu
@@ -31,11 +31,11 @@ __global__ void weightedSumCudaForwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
@@ -78,11 +78,11 @@ __global__ void weightedSumCudaBackwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Iterate over each pixel to compute the contribution to the
   // gradient for the features and weights
diff --git a/pytorch3d/csrc/gather_scatter/gather_scatter.cu b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
index d4affd4b..1ec1a6f2 100644
--- a/pytorch3d/csrc/gather_scatter/gather_scatter.cu
+++ b/pytorch3d/csrc/gather_scatter/gather_scatter.cu
@@ -20,14 +20,14 @@ __global__ void GatherScatterCudaKernel(
     const size_t V,
     const size_t D,
     const size_t E) {
-  const auto tid = threadIdx.x;
+  const int tid = threadIdx.x;
 
   // Reverse the vertex order if backward.
   const int v0_idx = backward ? 1 : 0;
   const int v1_idx = backward ? 0 : 1;
 
   // Edges are split evenly across the blocks.
-  for (auto e = blockIdx.x; e < E; e += gridDim.x) {
+  for (int e = blockIdx.x; e < E; e += gridDim.x) {
     // Get indices of vertices which form the edge.
     const int64_t v0 = edges[2 * e + v0_idx];
     const int64_t v1 = edges[2 * e + v1_idx];
@@ -35,7 +35,7 @@ __global__ void GatherScatterCudaKernel(
     // Split vertex features evenly across threads.
     // This implementation will be quite wasteful when D<128 since there will be
     // a lot of threads doing nothing.
-    for (auto d = tid; d < D; d += blockDim.x) {
+    for (int d = tid; d < D; d += blockDim.x) {
       const float val = input[v1 * D + d];
      float* address = output + v0 * D + d;
       atomicAdd(address, val);
diff --git a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
index 8fe292ae..6bd2a80d 100644
--- a/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
+++ b/pytorch3d/csrc/interp_face_attrs/interp_face_attrs.cu
@@ -20,8 +20,8 @@ __global__ void InterpFaceAttrsForwardKernel(
     const size_t P,
     const size_t F,
     const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
   for (int pd = tid; pd < P * D; pd += num_threads) {
     const int p = pd / D;
     const int d = pd % D;
@@ -93,8 +93,8 @@ __global__ void InterpFaceAttrsBackwardKernel(
     const size_t P,
     const size_t F,
     const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
   for (int pd = tid; pd < P * D; pd += num_threads) {
     const int p = pd / D;
     const int d = pd % D;
diff --git a/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu b/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
index 606ec9e6..3788d405 100644
--- a/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
+++ b/pytorch3d/csrc/point_mesh/point_mesh_cuda.cu
@@ -110,7 +110,7 @@ __global__ void DistanceForwardKernel(
   __syncthreads();
 
   // Perform reduction in shared memory.
-  for (auto s = blockDim.x / 2; s > 32; s >>= 1) {
+  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
     if (tid < s) {
       if (min_dists[tid] > min_dists[tid + s]) {
         min_dists[tid] = min_dists[tid + s];
@@ -502,8 +502,8 @@ __global__ void PointFaceArrayForwardKernel(
   const float3* tris_f3 = (float3*)tris;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * T; t_i += num_threads) {
     const int t = t_i / P; // segment index.
@@ -576,8 +576,8 @@ __global__ void PointFaceArrayBackwardKernel(
   const float3* tris_f3 = (float3*)tris;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * T; t_i += num_threads) {
     const int t = t_i / P; // triangle index.
@@ -683,8 +683,8 @@ __global__ void PointEdgeArrayForwardKernel(
   float3* segms_f3 = (float3*)segms;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * S; t_i += num_threads) {
     const int s = t_i / P; // segment index.
@@ -752,8 +752,8 @@ __global__ void PointEdgeArrayBackwardKernel(
   float3* segms_f3 = (float3*)segms;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * S; t_i += num_threads) {
     const int s = t_i / P; // segment index.
diff --git a/pytorch3d/csrc/rasterize_coarse/bitmask.cuh b/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
index 729650ba..6ffcac87 100644
--- a/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
+++ b/pytorch3d/csrc/rasterize_coarse/bitmask.cuh
@@ -25,7 +25,7 @@ class BitMask {
 
   // Use all threads in the current block to clear all bits of this BitMask
   __device__ void block_clear() {
-    for (auto i = threadIdx.x; i < H * W * D; i += blockDim.x) {
+    for (int i = threadIdx.x; i < H * W * D; i += blockDim.x) {
       data[i] = 0;
     }
     __syncthreads();
diff --git a/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu b/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu
index f093ef05..aed57d21 100644
--- a/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu
+++ b/pytorch3d/csrc/rasterize_coarse/rasterize_coarse.cu
@@ -23,8 +23,8 @@ __global__ void TriangleBoundingBoxKernel(
     const float blur_radius,
     float* bboxes, // (4, F)
     bool* skip_face) { // (F,)
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = blockDim.x * gridDim.x;
   const float sqrt_radius = sqrt(blur_radius);
   for (int f = tid; f < F; f += num_threads) {
     const float v0x = face_verts[f * 9 + 0 * 3 + 0];
@@ -56,8 +56,8 @@ __global__ void PointBoundingBoxKernel(
     const int P,
     float* bboxes, // (4, P)
     bool* skip_points) {
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = blockDim.x * gridDim.x;
   for (int p = tid; p < P; p += num_threads) {
     const float x = points[p * 3 + 0];
     const float y = points[p * 3 + 1];
@@ -113,7 +113,7 @@ __global__ void RasterizeCoarseCudaKernel(
   const int chunks_per_batch = 1 + (E - 1) / chunk_size;
   const int num_chunks = N * chunks_per_batch;
 
-  for (auto chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
+  for (int chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
     const int batch_idx = chunk / chunks_per_batch; // batch index
     const int chunk_idx = chunk % chunks_per_batch;
     const int elem_chunk_start_idx = chunk_idx * chunk_size;
@@ -123,7 +123,7 @@ __global__ void RasterizeCoarseCudaKernel(
     const int64_t elem_stop_idx = elem_start_idx + elems_per_batch[batch_idx];
 
     // Have each thread handle a different face within the chunk
-    for (auto e = threadIdx.x; e < chunk_size; e += blockDim.x) {
+    for (int e = threadIdx.x; e < chunk_size; e += blockDim.x) {
       const int e_idx = elem_chunk_start_idx + e;
 
       // Check that we are still within the same element of the batch
@@ -170,7 +170,7 @@ __global__ void RasterizeCoarseCudaKernel(
     // Now we have processed every elem in the current chunk. We need to
     // count the number of elems in each bin so we can write the indices
     // out to global memory. We have each thread handle a different bin.
-    for (auto byx = threadIdx.x; byx < num_bins_y * num_bins_x;
+    for (int byx = threadIdx.x; byx < num_bins_y * num_bins_x;
          byx += blockDim.x) {
       const int by = byx / num_bins_x;
       const int bx = byx % num_bins_x;
diff --git a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
index 28c546c6..9dd3e266 100644
--- a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
+++ b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
@@ -260,8 +260,8 @@ __global__ void RasterizeMeshesNaiveCudaKernel(
     float* pix_dists,
     float* bary) {
   // Simple version: One thread per output pixel
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   for (int i = tid; i < N * H * W; i += num_threads) {
     // Convert linear index to 3D index
@@ -446,8 +446,8 @@ __global__ void RasterizeMeshesBackwardCudaKernel(
 
   // Parallelize over each pixel in images of
   // size H * W, for each image in the batch of size N.
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < N * H * W; t_i += num_threads) {
     // Convert linear index to 3D index
@@ -650,8 +650,8 @@ __global__ void RasterizeMeshesFineCudaKernel(
     ) {
   // This can be more than H * W if H or W are not divisible by bin_size.
   int num_pixels = N * BH * BW * bin_size * bin_size;
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
     // Convert linear index into bin and pixel indices. We make the within
diff --git a/pytorch3d/csrc/rasterize_points/rasterize_points.cu b/pytorch3d/csrc/rasterize_points/rasterize_points.cu
index 20bf0de7..5b18d833 100644
--- a/pytorch3d/csrc/rasterize_points/rasterize_points.cu
+++ b/pytorch3d/csrc/rasterize_points/rasterize_points.cu
@@ -97,8 +97,8 @@ __global__ void RasterizePointsNaiveCudaKernel(
     float* zbuf, // (N, H, W, K)
     float* pix_dists) { // (N, H, W, K)
   // Simple version: One thread per output pixel
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
   for (int i = tid; i < N * H * W; i += num_threads) {
     // Convert linear index to 3D index
     const int n = i / (H * W); // Batch index
@@ -237,8 +237,8 @@ __global__ void RasterizePointsFineCudaKernel(
     float* pix_dists) { // (N, H, W, K)
   // This can be more than H * W if H or W are not divisible by bin_size.
   const int num_pixels = N * BH * BW * bin_size * bin_size;
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
     // Convert linear index into bin and pixel indices. We make the within
@@ -376,8 +376,8 @@ __global__ void RasterizePointsBackwardCudaKernel(
     float* grad_points) { // (P, 3)
   // Parallelized over each of K points per pixel, for each pixel in images of
   // size H * W, for each image in the batch of size N.
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = tid; i < N * H * W * K; i += num_threads) {
     // const int n = i / (H * W * K); // batch index (not needed).
     const int yxk = i % (H * W * K);
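Every hunk above touches the same 1-D grid-stride indexing pattern, switching the thread bookkeeping back from `auto` to `int` (the built-in `blockIdx`/`blockDim`/`gridDim` fields are `unsigned int`, so `auto` deduces unsigned types there). The sketch below is a minimal, standalone CUDA illustration of that pattern; the kernel name, body, and launch shape are hypothetical and are not part of PyTorch3D or of this patch.

#include <cuda_runtime.h>

// Hypothetical kernel, for illustration only: the same grid-stride indexing
// style as the kernels in the hunks above (int tid / int num_threads / strided
// for-loop over a flattened range).
__global__ void GridStrideScaleKernel(
    const float* __restrict__ in,
    float* __restrict__ out,
    const float alpha,
    const int64_t num_elems) {
  // blockDim.x etc. are unsigned int; with `auto` these would deduce unsigned
  // types, while the reverted style keeps them as plain int.
  const int num_threads = gridDim.x * blockDim.x;
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  // Each thread strides over the flattened range. An int loop index assumes
  // the element count stays below INT_MAX.
  for (int i = tid; i < num_elems; i += num_threads) {
    out[i] = alpha * in[i];
  }
}

int main() {
  const int64_t n = 1 << 20;
  float* in = nullptr;
  float* out = nullptr;
  cudaMalloc((void**)&in, n * sizeof(float));
  cudaMalloc((void**)&out, n * sizeof(float));

  // Launch shape mirrors the 1-D kernels in the diff: a fixed thread count per
  // block, with the grid-stride loop covering any remainder.
  GridStrideScaleKernel<<<256, 128>>>(in, out, 2.0f, n);
  cudaDeviceSynchronize();

  cudaFree(in);
  cudaFree(out);
  return 0;
}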