Mirror of https://github.com/facebookresearch/pytorch3d.git, synced 2025-07-31 10:52:50 +08:00
Revert "Fix CUDA kernel index data type in vision/fair/pytorch3d/pytorch3d/csrc/compositing/alpha_composite.cu +10"
This reverts commit 3987612062f3db5dba609df3552768dcd97b410f.
parent 3987612062 · commit 62a2031dd4
@@ -33,11 +33,11 @@ __global__ void alphaCompositeCudaForwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Iterate over each feature in each pixel
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
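The compositing kernels map one block row per batch element (blockIdx.x) and grid-stride over the C * H * W values of that element using blockIdx.y and threadIdx.x; the revert only changes the declared type of these index variables from auto (which deduces unsigned int, since the blockIdx/blockDim fields are unsigned) back to plain int. Below is a minimal, self-contained sketch of that indexing scheme; the kernel name, buffers, and launch shape are illustrative, not taken from the file.

// Minimal sketch of the 2D-grid indexing used by the compositing kernels.
// blockIdx.x selects the batch element; blockIdx.y and threadIdx.x together
// form a grid-stride loop over the C * H * W values of that element.
#include <cuda_runtime.h>

__global__ void ScaleFeaturesKernel(
    const float* in, // (N, C, H, W), illustrative input
    float* out, // (N, C, H, W)
    const float scale,
    const int C,
    const int H,
    const int W) {
  const int batch = blockIdx.x; // one block row per batch element
  const int num_pixels = C * H * W;
  const int num_threads = gridDim.y * blockDim.x;
  const int tid = blockIdx.y * blockDim.x + threadIdx.x;

  for (int pid = tid; pid < num_pixels; pid += num_threads) {
    out[batch * num_pixels + pid] = scale * in[batch * num_pixels + pid];
  }
}

// Hypothetical launch: dim3 grid(N, 64); ScaleFeaturesKernel<<<grid, 256>>>(...);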
@@ -83,11 +83,11 @@ __global__ void alphaCompositeCudaBackwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
@@ -33,11 +33,11 @@ __global__ void weightedSumNormCudaForwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
@@ -96,11 +96,11 @@ __global__ void weightedSumNormCudaBackwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * W * H;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
@@ -31,11 +31,11 @@ __global__ void weightedSumCudaForwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
@@ -78,11 +78,11 @@ __global__ void weightedSumCudaBackwardKernel(
   const int64_t W = points_idx.size(3);
 
   // Get the batch and index
-  const auto batch = blockIdx.x;
+  const int batch = blockIdx.x;
 
   const int num_pixels = C * H * W;
-  const auto num_threads = gridDim.y * blockDim.x;
-  const auto tid = blockIdx.y * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.y * blockDim.x;
+  const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Iterate over each pixel to compute the contribution to the
   // gradient for the features and weights
@@ -20,14 +20,14 @@ __global__ void GatherScatterCudaKernel(
     const size_t V,
     const size_t D,
     const size_t E) {
-  const auto tid = threadIdx.x;
+  const int tid = threadIdx.x;
 
   // Reverse the vertex order if backward.
   const int v0_idx = backward ? 1 : 0;
   const int v1_idx = backward ? 0 : 1;
 
   // Edges are split evenly across the blocks.
-  for (auto e = blockIdx.x; e < E; e += gridDim.x) {
+  for (int e = blockIdx.x; e < E; e += gridDim.x) {
     // Get indices of vertices which form the edge.
     const int64_t v0 = edges[2 * e + v0_idx];
     const int64_t v1 = edges[2 * e + v1_idx];
@@ -35,7 +35,7 @@ __global__ void GatherScatterCudaKernel(
     // Split vertex features evenly across threads.
     // This implementation will be quite wasteful when D<128 since there will be
    // a lot of threads doing nothing.
-    for (auto d = tid; d < D; d += blockDim.x) {
+    for (int d = tid; d < D; d += blockDim.x) {
      const float val = input[v1 * D + d];
      float* address = output + v0 * D + d;
      atomicAdd(address, val);
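The two GatherScatterCudaKernel hunks keep the same work split, striding edges over blocks and feature channels over threads, with atomicAdd used to scatter because several edges can write into the same vertex row; only the index types change. A standalone sketch of that pattern follows, with illustrative buffer names rather than the kernel's real signature.

// Edge gather-scatter sketch: each block walks a strided subset of edges,
// each thread a strided subset of the D feature channels, and contributions
// are accumulated with atomicAdd since writes to a vertex row can collide.
#include <cuda_runtime.h>
#include <stdint.h>

__global__ void EdgeScatterSketch(
    const float* input, // (V, D) per-vertex features
    const int64_t* edges, // (E, 2) vertex index pairs
    float* output, // (V, D) accumulated features
    const size_t D,
    const size_t E) {
  const int tid = threadIdx.x;
  // Edges are split evenly across the blocks.
  for (int e = blockIdx.x; e < E; e += gridDim.x) {
    const int64_t v0 = edges[2 * e + 0];
    const int64_t v1 = edges[2 * e + 1];
    // Feature channels are split evenly across the threads of the block.
    for (int d = tid; d < D; d += blockDim.x) {
      atomicAdd(output + v0 * D + d, input[v1 * D + d]);
    }
  }
}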
@@ -20,8 +20,8 @@ __global__ void InterpFaceAttrsForwardKernel(
     const size_t P,
     const size_t F,
     const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
   for (int pd = tid; pd < P * D; pd += num_threads) {
     const int p = pd / D;
     const int d = pd % D;
@@ -93,8 +93,8 @@ __global__ void InterpFaceAttrsBackwardKernel(
     const size_t P,
     const size_t F,
     const size_t D) {
-  const auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int num_threads = blockDim.x * gridDim.x;
   for (int pd = tid; pd < P * D; pd += num_threads) {
     const int p = pd / D;
     const int d = pd % D;
@@ -110,7 +110,7 @@ __global__ void DistanceForwardKernel(
   __syncthreads();
 
   // Perform reduction in shared memory.
-  for (auto s = blockDim.x / 2; s > 32; s >>= 1) {
+  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
     if (tid < s) {
       if (min_dists[tid] > min_dists[tid + s]) {
         min_dists[tid] = min_dists[tid + s];
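The DistanceForwardKernel hunk touches the shared-memory min-reduction loop (which the real kernel stops at s > 32 and finishes with a warp-level tail). As a reference for the pattern only, here is a simplified, self-contained block-wide min reduction that runs the loop all the way down with __syncthreads(); names and the launch line are illustrative.

// Simplified block-wide min reduction in shared memory. Assumes blockDim.x is
// a power of two; shared memory of blockDim.x floats is supplied at launch.
#include <cuda_runtime.h>
#include <float.h>

__global__ void BlockMinSketch(const float* vals, const int N, float* block_min) {
  extern __shared__ float min_vals[]; // one float per thread
  const int tid = threadIdx.x;
  const int i = blockIdx.x * blockDim.x + tid;
  min_vals[tid] = (i < N) ? vals[i] : FLT_MAX;
  __syncthreads();

  // Perform reduction in shared memory.
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s && min_vals[tid] > min_vals[tid + s]) {
      min_vals[tid] = min_vals[tid + s];
    }
    __syncthreads();
  }
  if (tid == 0) {
    block_min[blockIdx.x] = min_vals[0];
  }
}

// Hypothetical launch: BlockMinSketch<<<num_blocks, 128, 128 * sizeof(float)>>>(d_vals, N, d_out);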
@@ -502,8 +502,8 @@ __global__ void PointFaceArrayForwardKernel(
   const float3* tris_f3 = (float3*)tris;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * T; t_i += num_threads) {
     const int t = t_i / P; // segment index.
@@ -576,8 +576,8 @@ __global__ void PointFaceArrayBackwardKernel(
   const float3* tris_f3 = (float3*)tris;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * T; t_i += num_threads) {
     const int t = t_i / P; // triangle index.
@@ -683,8 +683,8 @@ __global__ void PointEdgeArrayForwardKernel(
   float3* segms_f3 = (float3*)segms;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * S; t_i += num_threads) {
     const int s = t_i / P; // segment index.
@@ -752,8 +752,8 @@ __global__ void PointEdgeArrayBackwardKernel(
   float3* segms_f3 = (float3*)segms;
 
   // Parallelize over P * S computations
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < P * S; t_i += num_threads) {
     const int s = t_i / P; // segment index.
@@ -25,7 +25,7 @@ class BitMask {
 
   // Use all threads in the current block to clear all bits of this BitMask
   __device__ void block_clear() {
-    for (auto i = threadIdx.x; i < H * W * D; i += blockDim.x) {
+    for (int i = threadIdx.x; i < H * W * D; i += blockDim.x) {
       data[i] = 0;
     }
     __syncthreads();
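BitMask::block_clear has every thread of the block zero a strided slice of the mask and then synchronizes before any bit is read or set. A generic sketch of that cooperative-clear idiom is below; the free-function form and parameter names are illustrative, whereas the real code is a member of the BitMask class.

// Cooperative clear: threads zero interleaved words of a shared buffer, then
// the block synchronizes so no thread sees stale data afterwards.
#include <cuda_runtime.h>

__device__ void block_clear_sketch(unsigned int* data, const int num_words) {
  for (int i = threadIdx.x; i < num_words; i += blockDim.x) {
    data[i] = 0;
  }
  __syncthreads(); // every word is zero before any thread proceeds
}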
@@ -23,8 +23,8 @@ __global__ void TriangleBoundingBoxKernel(
     const float blur_radius,
     float* bboxes, // (4, F)
     bool* skip_face) { // (F,)
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = blockDim.x * gridDim.x;
   const float sqrt_radius = sqrt(blur_radius);
   for (int f = tid; f < F; f += num_threads) {
     const float v0x = face_verts[f * 9 + 0 * 3 + 0];
@@ -56,8 +56,8 @@ __global__ void PointBoundingBoxKernel(
     const int P,
     float* bboxes, // (4, P)
     bool* skip_points) {
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  const auto num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = blockDim.x * gridDim.x;
   for (int p = tid; p < P; p += num_threads) {
     const float x = points[p * 3 + 0];
     const float y = points[p * 3 + 1];
@@ -113,7 +113,7 @@ __global__ void RasterizeCoarseCudaKernel(
   const int chunks_per_batch = 1 + (E - 1) / chunk_size;
   const int num_chunks = N * chunks_per_batch;
 
-  for (auto chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
+  for (int chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
     const int batch_idx = chunk / chunks_per_batch; // batch index
     const int chunk_idx = chunk % chunks_per_batch;
     const int elem_chunk_start_idx = chunk_idx * chunk_size;
@@ -123,7 +123,7 @@ __global__ void RasterizeCoarseCudaKernel(
     const int64_t elem_stop_idx = elem_start_idx + elems_per_batch[batch_idx];
 
     // Have each thread handle a different face within the chunk
-    for (auto e = threadIdx.x; e < chunk_size; e += blockDim.x) {
+    for (int e = threadIdx.x; e < chunk_size; e += blockDim.x) {
       const int e_idx = elem_chunk_start_idx + e;
 
       // Check that we are still within the same element of the batch
@@ -170,7 +170,7 @@ __global__ void RasterizeCoarseCudaKernel(
     // Now we have processed every elem in the current chunk. We need to
     // count the number of elems in each bin so we can write the indices
     // out to global memory. We have each thread handle a different bin.
-    for (auto byx = threadIdx.x; byx < num_bins_y * num_bins_x;
+    for (int byx = threadIdx.x; byx < num_bins_y * num_bins_x;
          byx += blockDim.x) {
       const int by = byx / num_bins_x;
       const int bx = byx % num_bins_x;
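The three RasterizeCoarseCudaKernel hunks all sit inside the same two-level work split: chunks of elements are strided over blocks via blockIdx.x, and the elements of one chunk (and later the bins) are strided over the threads of the block via threadIdx.x. The sketch below shows that split on a flat array under simplified assumptions; the per-element work (counting positive entries) is a stand-in for the real bin-overlap tests, and all names are illustrative.

// Two-level split: outer loop strides chunks over blocks, inner loop strides
// the elements of one chunk over the threads of the block.
#include <cuda_runtime.h>

__global__ void ChunkedCountSketch(
    const float* elems,
    const int E, // total number of elements
    const int chunk_size,
    int* per_chunk_count) { // one counter per chunk, zero-initialized
  const int num_chunks = 1 + (E - 1) / chunk_size;
  for (int chunk = blockIdx.x; chunk < num_chunks; chunk += gridDim.x) {
    const int chunk_start = chunk * chunk_size;
    // Each thread handles a different element within the chunk.
    for (int e = threadIdx.x; e < chunk_size; e += blockDim.x) {
      const int e_idx = chunk_start + e;
      if (e_idx >= E) {
        break; // past the end of the last (partial) chunk
      }
      if (elems[e_idx] > 0.0f) {
        atomicAdd(&per_chunk_count[chunk], 1);
      }
    }
  }
}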
@@ -260,8 +260,8 @@ __global__ void RasterizeMeshesNaiveCudaKernel(
     float* pix_dists,
     float* bary) {
   // Simple version: One thread per output pixel
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   for (int i = tid; i < N * H * W; i += num_threads) {
     // Convert linear index to 3D index
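The naive mesh and point rasterization kernels use a 1D grid-stride loop over all N * H * W output pixels and decompose the linear index into (batch, row, column). Here is a self-contained sketch of that scheme; writing a constant stands in for the real per-pixel face and point tests, and the kernel name and parameters are illustrative.

// One thread per output pixel: 1D grid-stride loop over N * H * W, with the
// linear index decomposed into batch, row, and column.
#include <cuda_runtime.h>

__global__ void FillImageSketch(
    float* image, // (N, H, W)
    const int N,
    const int H,
    const int W,
    const float value) {
  const int num_threads = gridDim.x * blockDim.x;
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;

  for (int i = tid; i < N * H * W; i += num_threads) {
    // Convert the linear index into a 3D (batch, y, x) index.
    const int n = i / (H * W);
    const int yx = i % (H * W);
    const int y = yx / W;
    const int x = yx % W;
    image[(n * H + y) * W + x] = value; // same location as image[i]
  }
}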
@@ -446,8 +446,8 @@ __global__ void RasterizeMeshesBackwardCudaKernel(
 
   // Parallelize over each pixel in images of
   // size H * W, for each image in the batch of size N.
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int t_i = tid; t_i < N * H * W; t_i += num_threads) {
     // Convert linear index to 3D index
@@ -650,8 +650,8 @@ __global__ void RasterizeMeshesFineCudaKernel(
 ) {
   // This can be more than H * W if H or W are not divisible by bin_size.
   int num_pixels = N * BH * BW * bin_size * bin_size;
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
     // Convert linear index into bin and pixel indices. We make the within
@@ -97,8 +97,8 @@ __global__ void RasterizePointsNaiveCudaKernel(
     float* zbuf, // (N, H, W, K)
     float* pix_dists) { // (N, H, W, K)
   // Simple version: One thread per output pixel
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockDim.x * blockIdx.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
   for (int i = tid; i < N * H * W; i += num_threads) {
     // Convert linear index to 3D index
     const int n = i / (H * W); // Batch index
@@ -237,8 +237,8 @@ __global__ void RasterizePointsFineCudaKernel(
     float* pix_dists) { // (N, H, W, K)
   // This can be more than H * W if H or W are not divisible by bin_size.
   const int num_pixels = N * BH * BW * bin_size * bin_size;
-  const auto num_threads = gridDim.x * blockDim.x;
-  const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int num_threads = gridDim.x * blockDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
     // Convert linear index into bin and pixel indices. We make the within
@@ -376,8 +376,8 @@ __global__ void RasterizePointsBackwardCudaKernel(
     float* grad_points) { // (P, 3)
   // Parallelized over each of K points per pixel, for each pixel in images of
   // size H * W, for each image in the batch of size N.
-  auto num_threads = gridDim.x * blockDim.x;
-  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = gridDim.x * blockDim.x;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = tid; i < N * H * W * K; i += num_threads) {
     // const int n = i / (H * W * K); // batch index (not needed).
     const int yxk = i % (H * W * K);