From 3d769a66cb184d75126600abeb4ad953cd56cb8d Mon Sep 17 00:00:00 2001
From: Nikhila Ravi <nikhilar@fb.com>
Date: Tue, 15 Dec 2020 14:14:27 -0800
Subject: [PATCH] Non Square image rasterization for pointclouds

Summary:
Similar to non square image rasterization for meshes, apply the same updates to the pointcloud rasterizer.

Main API Change:
- PointsRasterizationSettings now accepts a tuple/list of (H, W) for the image size.

Reviewed By: jcjohnson

Differential Revision: D25465206

fbshipit-source-id: 7370d83c431af1b972158cecae19d82364623380
---
 pytorch3d/csrc/compositing/alpha_composite.cu |  16 +-
 pytorch3d/csrc/compositing/alpha_composite.h  |   8 +-
 .../csrc/compositing/norm_weighted_sum.cu     |  14 +-
 .../csrc/compositing/norm_weighted_sum.h      |   8 +-
 pytorch3d/csrc/compositing/weighted_sum.cu    |  16 +-
 pytorch3d/csrc/compositing/weighted_sum.h     |   6 +-
 .../csrc/rasterize_meshes/rasterize_meshes.cu |  17 +-
 .../rasterize_meshes/rasterize_meshes_cpu.cpp |  32 +-
 .../rasterize_points/rasterization_utils.cuh  |  12 +-
 .../rasterize_points/rasterization_utils.h    |  34 ++
 .../csrc/rasterize_points/rasterize_points.cu | 186 +++++----
 .../csrc/rasterize_points/rasterize_points.h  |  30 +-
 .../rasterize_points/rasterize_points_cpu.cpp |  78 ++--
 pytorch3d/renderer/mesh/__init__.py           |   1 +
 pytorch3d/renderer/mesh/rasterize_meshes.py   |  11 +-
 pytorch3d/renderer/points/compositor.py       |   2 +-
 pytorch3d/renderer/points/rasterize_points.py |  88 +++-
 pytorch3d/renderer/points/rasterizer.py       |   4 +-
 tests/bm_rasterize_points.py                  |  15 +
 .../data/test_pointcloud_rectangle_image.png  | Bin 0 -> 20251 bytes
 tests/test_rasterize_points.py                |   4 +-
 ....py => test_rasterize_rectangle_images.py} | 393 +++++++++++++++++-
 22 files changed, 712 insertions(+), 263 deletions(-)
 create mode 100644 pytorch3d/csrc/rasterize_points/rasterization_utils.h
 create mode 100644 tests/data/test_pointcloud_rectangle_image.png
 rename tests/{test_rasterize_rectangles.py => test_rasterize_rectangle_images.py} (51%)

diff --git a/pytorch3d/csrc/compositing/alpha_composite.cu b/pytorch3d/csrc/compositing/alpha_composite.cu
index 389b95f8..16fadb02 100644
--- a/pytorch3d/csrc/compositing/alpha_composite.cu
+++ b/pytorch3d/csrc/compositing/alpha_composite.cu
@@ -30,15 +30,15 @@ __global__ void alphaCompositeCudaForwardKernel(
   // Get the batch and index
   const int batch = blockIdx.x;
 
-  const int num_pixels = C * W * H;
+  const int num_pixels = C * H * W;
   const int num_threads = gridDim.y * blockDim.x;
   const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Iterate over each feature in each pixel
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
-    int ch = pid / (W * H);
-    int j = (pid % (W * H)) / H;
-    int i = (pid % (W * H)) % H;
+    int ch = pid / (H * W);
+    int j = (pid % (H * W)) / W;
+    int i = (pid % (H * W)) % W;
 
     // alphacomposite the different values
     float cum_alpha = 1.;
@@ -81,16 +81,16 @@ __global__ void alphaCompositeCudaBackwardKernel(
   // Get the batch and index
   const int batch = blockIdx.x;
 
-  const int num_pixels = C * W * H;
+  const int num_pixels = C * H * W;
   const int num_threads = gridDim.y * blockDim.x;
   const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
-    int ch = pid / (W * H);
-    int j = (pid % (W * H)) / H;
-    int i = (pid % (W * H)) % H;
+    int ch = pid / (H * W);
+    int j = (pid % (H * W)) / W;
+    int i = (pid % (H * W)) % W;
 
     // alphacomposite the different values
     float cum_alpha = 1.;
diff --git a/pytorch3d/csrc/compositing/alpha_composite.h b/pytorch3d/csrc/compositing/alpha_composite.h
index 735d87e1..c910c32d 100644
--- a/pytorch3d/csrc/compositing/alpha_composite.h
+++ b/pytorch3d/csrc/compositing/alpha_composite.h
@@ -11,13 +11,13 @@
 //    features: FloatTensor of shape (C, P) which gives the features
 //            of each point where C is the size of the feature and
 //            P the number of points.
-//    alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
+//    alphas: FloatTensor of shape (N, points_per_pixel, H, W) where
 //            points_per_pixel is the number of points in the z-buffer
-//            sorted in z-order, and W is the image size.
-//    points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
+//            sorted in z-order, and (H, W) is the image size.
+//    points_idx: IntTensor of shape (N, points_per_pixel, H, W) giving the
 //            indices of the nearest points at each pixel, sorted in z-order.
 // Returns:
-//    weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
+//    weighted_fs: FloatTensor of shape (N, C, H, W) giving the accumulated
 //            feature for each point. Concretely, it gives:
 //                 weighted_fs[b,c,i,j] = sum_k cum_alpha_k *
 //                   features[c,points_idx[b,k,i,j]]
diff --git a/pytorch3d/csrc/compositing/norm_weighted_sum.cu b/pytorch3d/csrc/compositing/norm_weighted_sum.cu
index a787e1fa..1885bec6 100644
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.cu
@@ -30,16 +30,16 @@ __global__ void weightedSumNormCudaForwardKernel(
   // Get the batch and index
   const int batch = blockIdx.x;
 
-  const int num_pixels = C * W * H;
+  const int num_pixels = C * H * W;
   const int num_threads = gridDim.y * blockDim.x;
   const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
-    int ch = pid / (W * H);
-    int j = (pid % (W * H)) / H;
-    int i = (pid % (W * H)) % H;
+    int ch = pid / (H * W);
+    int j = (pid % (H * W)) / W;
+    int i = (pid % (H * W)) % W;
 
     // Store the accumulated alpha value
     float cum_alpha = 0.;
@@ -101,9 +101,9 @@ __global__ void weightedSumNormCudaBackwardKernel(
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
-    int ch = pid / (W * H);
-    int j = (pid % (W * H)) / H;
-    int i = (pid % (W * H)) % H;
+    int ch = pid / (H * W);
+    int j = (pid % (H * W)) / W;
+    int i = (pid % (H * W)) % W;
 
     float sum_alpha = 0.;
     float sum_alphafs = 0.;
diff --git a/pytorch3d/csrc/compositing/norm_weighted_sum.h b/pytorch3d/csrc/compositing/norm_weighted_sum.h
index 34c271bc..c2878503 100644
--- a/pytorch3d/csrc/compositing/norm_weighted_sum.h
+++ b/pytorch3d/csrc/compositing/norm_weighted_sum.h
@@ -11,13 +11,13 @@
 //    features: FloatTensor of shape (C, P) which gives the features
 //            of each point where C is the size of the feature and
 //            P the number of points.
-//    alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
+//    alphas: FloatTensor of shape (N, points_per_pixel, H, W) where
 //            points_per_pixel is the number of points in the z-buffer
-//            sorted in z-order, and W is the image size.
-//    points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
+//            sorted in z-order, and (H, W) is the image size.
+//    points_idx: IntTensor of shape (N, points_per_pixel, H, W) giving the
 //            indices of the nearest points at each pixel, sorted in z-order.
 // Returns:
-//    weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
+//    weighted_fs: FloatTensor of shape (N, C, H, W) giving the accumulated
 //            feature in each point. Concretely, it gives:
 //                 weighted_fs[b,c,i,j] = sum_k alphas[b,k,i,j] *
 //                   features[c,points_idx[b,k,i,j]] / sum_k alphas[b,k,i,j]
diff --git a/pytorch3d/csrc/compositing/weighted_sum.cu b/pytorch3d/csrc/compositing/weighted_sum.cu
index 68ec351e..cee8e75a 100644
--- a/pytorch3d/csrc/compositing/weighted_sum.cu
+++ b/pytorch3d/csrc/compositing/weighted_sum.cu
@@ -28,16 +28,16 @@ __global__ void weightedSumCudaForwardKernel(
   // Get the batch and index
   const int batch = blockIdx.x;
 
-  const int num_pixels = C * W * H;
+  const int num_pixels = C * H * W;
   const int num_threads = gridDim.y * blockDim.x;
   const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Parallelize over each feature in each pixel in images of size H * W,
   // for each image in the batch of size batch_size
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
-    int ch = pid / (W * H);
-    int j = (pid % (W * H)) / H;
-    int i = (pid % (W * H)) % H;
+    int ch = pid / (H * W);
+    int j = (pid % (H * W)) / W;
+    int i = (pid % (H * W)) % W;
 
     // Iterate through the closest K points for this pixel
     for (int k = 0; k < points_idx.size(1); ++k) {
@@ -76,16 +76,16 @@ __global__ void weightedSumCudaBackwardKernel(
   // Get the batch and index
   const int batch = blockIdx.x;
 
-  const int num_pixels = C * W * H;
+  const int num_pixels = C * H * W;
   const int num_threads = gridDim.y * blockDim.x;
   const int tid = blockIdx.y * blockDim.x + threadIdx.x;
 
   // Iterate over each pixel to compute the contribution to the
   // gradient for the features and weights
   for (int pid = tid; pid < num_pixels; pid += num_threads) {
-    int ch = pid / (W * H);
-    int j = (pid % (W * H)) / H;
-    int i = (pid % (W * H)) % H;
+    int ch = pid / (H * W);
+    int j = (pid % (H * W)) / W;
+    int i = (pid % (H * W)) % W;
 
     // Iterate through the closest K points for this pixel
     for (int k = 0; k < points_idx.size(1); ++k) {
diff --git a/pytorch3d/csrc/compositing/weighted_sum.h b/pytorch3d/csrc/compositing/weighted_sum.h
index 4928a252..aa4154ed 100644
--- a/pytorch3d/csrc/compositing/weighted_sum.h
+++ b/pytorch3d/csrc/compositing/weighted_sum.h
@@ -11,13 +11,13 @@
 //    features: FloatTensor of shape (C, P) which gives the features
 //            of each point where C is the size of the feature and
 //            P the number of points.
-//    alphas: FloatTensor of shape (N, points_per_pixel, W, W) where
+//    alphas: FloatTensor of shape (N, points_per_pixel, H, W) where
 //            points_per_pixel is the number of points in the z-buffer
-//            sorted in z-order, and W is the image size.
+//            sorted in z-order, and (H, W) is the image size.
-//    points_idx: IntTensor of shape (N, points_per_pixel, W, W) giving the
+//    points_idx: IntTensor of shape (N, points_per_pixel, H, W) giving the
 //            indices of the nearest points at each pixel, sorted in z-order.
 // Returns:
-//    weighted_fs: FloatTensor of shape (N, C, W, W) giving the accumulated
+//    weighted_fs: FloatTensor of shape (N, C, H, W) giving the accumulated
 //            feature in each point. Concretely, it gives:
 //                 weighted_fs[b,c,i,j] = sum_k alphas[b,k,i,j] *
 //                   features[c,points_idx[b,k,i,j]]
diff --git a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
index af973f38..a92a64e5 100644
--- a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
+++ b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes.cu
@@ -452,7 +452,6 @@ __global__ void RasterizeMeshesBackwardCudaKernel(
       const bool inside = b_pp.x > 0.0f && b_pp.y > 0.0f && b_pp.z > 0.0f;
       const float sign = inside ? -1.0f : 1.0f;
 
-      // TODO(T52813608) Add support for non-square images.
       auto grad_dist_f = PointTriangleDistanceBackward(
           pxy, v0xy, v1xy, v2xy, sign * grad_dist_upstream);
       const float2 ddist_d_v0 = thrust::get<1>(grad_dist_f);
@@ -606,7 +605,7 @@ __global__ void RasterizeMeshesCoarseCudaKernel(
   const float half_pix_x = NDC_x_half_range / W;
   const float half_pix_y = NDC_y_half_range / H;
 
-  // This is a boolean array of shape (num_bins, num_bins, chunk_size)
+  // This is a boolean array of shape (num_bins_y, num_bins_x, chunk_size)
   // stored in shared memory that will track whether each point in the chunk
   // falls into each bin of the image.
   BitMask binmask((unsigned int*)sbuf, num_bins_y, num_bins_x, chunk_size);
@@ -755,7 +754,7 @@ at::Tensor RasterizeMeshesCoarseCuda(
   const int num_bins_y = 1 + (H - 1) / bin_size;
   const int num_bins_x = 1 + (W - 1) / bin_size;
 
-  if (num_bins_y >= kMaxFacesPerBin || num_bins_x >= kMaxFacesPerBin) {
+  if (num_bins_y >= kMaxItemsPerBin || num_bins_x >= kMaxItemsPerBin) {
     std::stringstream ss;
     ss << "In Coarse Rasterizer got num_bins_y: " << num_bins_y
        << ", num_bins_x: " << num_bins_x << ", "
@@ -800,7 +799,7 @@ at::Tensor RasterizeMeshesCoarseCuda(
 // ****************************************************************************
 __global__ void RasterizeMeshesFineCudaKernel(
     const float* face_verts, // (F, 3, 3)
-    const int32_t* bin_faces, // (N, B, B, T)
+    const int32_t* bin_faces, // (N, BH, BW, T)
     const float blur_radius,
     const int bin_size,
     const bool perspective_correct,
@@ -813,12 +812,12 @@ __global__ void RasterizeMeshesFineCudaKernel(
     const int H,
     const int W,
     const int K,
-    int64_t* face_idxs, // (N, S, S, K)
-    float* zbuf, // (N, S, S, K)
-    float* pix_dists, // (N, S, S, K)
-    float* bary // (N, S, S, K, 3)
+    int64_t* face_idxs, // (N, H, W, K)
+    float* zbuf, // (N, H, W, K)
+    float* pix_dists, // (N, H, W, K)
+    float* bary // (N, H, W, K, 3)
 ) {
-  // This can be more than S^2 if S % bin_size != 0
+  // This can be more than H * W if H or W are not divisible by bin_size.
   int num_pixels = N * BH * BW * bin_size * bin_size;
   int num_threads = gridDim.x * blockDim.x;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes_cpu.cpp b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes_cpu.cpp
index b8a73e20..3160e685 100644
--- a/pytorch3d/csrc/rasterize_meshes/rasterize_meshes_cpu.cpp
+++ b/pytorch3d/csrc/rasterize_meshes/rasterize_meshes_cpu.cpp
@@ -5,41 +5,11 @@
 #include <list>
 #include <queue>
 #include <tuple>
+#include "rasterize_points/rasterization_utils.h"
 #include "utils/geometry_utils.h"
 #include "utils/vec2.h"
 #include "utils/vec3.h"
 
-// The default value of the NDC range is [-1, 1], however in the case that
-// H != W, the NDC range is set such that the shorter side has range [-1, 1] and
-// the longer side is scaled by the ratio of H:W. S1 is the dimension for which
-// the NDC range is calculated and S2 is the other image dimension.
-// e.g. to get the NDC x range S1 = W and S2 = H
-float NonSquareNdcRange(int S1, int S2) {
-  float range = 2.0f;
-  if (S1 > S2) {
-    range = ((S1 / S2) * range);
-  }
-  return range;
-}
-
-// Given a pixel coordinate 0 <= i < S1, convert it to a normalized device
-// coordinates. We divide the NDC range into S1 evenly-sized
-// pixels, and assume that each pixel falls in the *center* of its range.
-// The default value of the NDC range is [-1, 1], however in the case that
-// H != W, the NDC range is set such that the shorter side has range [-1, 1] and
-// the longer side is scaled by the ratio of H:W. The dimension of i should be
-// S1 and the other image dimension is S2 For example, to get the x and y NDC
-// coordinates or a given pixel i:
-//     x = PixToNonSquareNdc(i, W, H)
-//     y = PixToNonSquareNdc(i, H, W)
-float PixToNonSquareNdc(int i, int S1, int S2) {
-  float range = NonSquareNdcRange(S1, S2);
-  // NDC: offset + (i * pixel_width + half_pixel_width)
-  // The NDC range is [-range/2, range/2].
-  const float offset = (range / 2.0f);
-  return -offset + (range * i + offset) / S1;
-}
-
 // Get (x, y, z) values for vertex from (3, 3) tensor face.
 template <typename Face>
 auto ExtractVerts(const Face& face, const int vertex_index) {
diff --git a/pytorch3d/csrc/rasterize_points/rasterization_utils.cuh b/pytorch3d/csrc/rasterize_points/rasterization_utils.cuh
index 8492bad1..18272ab7 100644
--- a/pytorch3d/csrc/rasterize_points/rasterization_utils.cuh
+++ b/pytorch3d/csrc/rasterize_points/rasterization_utils.cuh
@@ -2,16 +2,6 @@
 
 #pragma once
 
-// Given a pixel coordinate 0 <= i < S, convert it to a normalized device
-// coordinates in the range [-1, 1]. We divide the NDC range into S evenly-sized
-// pixels, and assume that each pixel falls in the *center* of its range.
-// TODO: delete this function after updating the pointcloud rasterizer to
-// support non square images.
-__device__ inline float PixToNdc(int i, int S) {
-  // NDC: x-offset + (i * pixel_width + half_pixel_width)
-  return -1.0 + (2 * i + 1.0) / S;
-}
-
 // The default value of the NDC range is [-1, 1], however in the case that
 // H != W, the NDC range is set such that the shorter side has range [-1, 1] and
 // the longer side is scaled by the ratio of H:W. S1 is the dimension for which
@@ -50,7 +40,7 @@ __device__ inline float PixToNonSquareNdc(int i, int S1, int S2) {
 // TODO: is 8 enough? Would increasing have performance considerations?
 const int32_t kMaxPointsPerPixel = 150;
 
-const int32_t kMaxFacesPerBin = 22;
+const int32_t kMaxItemsPerBin = 22;
 
 template <typename T>
 __device__ inline void BubbleSort(T* arr, int n) {
diff --git a/pytorch3d/csrc/rasterize_points/rasterization_utils.h b/pytorch3d/csrc/rasterize_points/rasterization_utils.h
new file mode 100644
index 00000000..06b6bc5c
--- /dev/null
+++ b/pytorch3d/csrc/rasterize_points/rasterization_utils.h
@@ -0,0 +1,34 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+#pragma once
+
+// The default value of the NDC range is [-1, 1], however in the case that
+// H != W, the NDC range is set such that the shorter side has range [-1, 1] and
+// the longer side is scaled by the ratio of the longer to the shorter side.
+// S1 is the dimension for which the NDC range is calculated and S2 is the
+// other image dimension, e.g. to get the NDC x range use S1 = W and S2 = H.
+inline float NonSquareNdcRange(int S1, int S2) {
+  // Shorter (or equal) side keeps the default NDC extent of 2, i.e. [-1, 1].
+  if (S1 <= S2) return 2.0f;
+  // Multiply by the float extent before dividing so that the aspect ratio
+  // S1 / S2 is not truncated by integer division (e.g. 3 / 2 == 1).
+  return (S1 * 2.0f) / S2;
+}
+
+// Given a pixel coordinate 0 <= i < S1, convert it to normalized device
+// coordinates (NDC). We divide the NDC range into S1 evenly-sized pixels, and
+// assume that each pixel falls in the *center* of its range.
+// The default value of the NDC range is [-1, 1], however in the case that
+// H != W, the NDC range is set such that the shorter side has range [-1, 1] and
+// the longer side is scaled by the ratio of the longer to the shorter side.
+// S1 is the dimension that i indexes and S2 is the other image dimension.
+// For example, to get the x and y NDC coordinates of a given pixel i:
+//     x = PixToNonSquareNdc(i, W, H)
+//     y = PixToNonSquareNdc(i, H, W)
+inline float PixToNonSquareNdc(int i, int S1, int S2) {
+  // The axis spans [-half_extent, half_extent] in NDC, split into S1 pixels.
+  const float extent = NonSquareNdcRange(S1, S2);
+  const float half_extent = extent / 2.0f;
+  // Centre of pixel i: left edge + (i * pixel_width + half_pixel_width).
+  return (extent * i + half_extent) / S1 - half_extent;
+}
diff --git a/pytorch3d/csrc/rasterize_points/rasterize_points.cu b/pytorch3d/csrc/rasterize_points/rasterize_points.cu
index d02a5680..8b5ea133 100644
--- a/pytorch3d/csrc/rasterize_points/rasterize_points.cu
+++ b/pytorch3d/csrc/rasterize_points/rasterize_points.cu
@@ -85,26 +85,28 @@ __global__ void RasterizePointsNaiveCudaKernel(
     const int64_t* num_points_per_cloud, // (N)
     const float* radius,
     const int N,
-    const int S,
+    const int H,
+    const int W,
     const int K,
-    int32_t* point_idxs, // (N, S, S, K)
-    float* zbuf, // (N, S, S, K)
-    float* pix_dists) { // (N, S, S, K)
+    int32_t* point_idxs, // (N, H, W, K)
+    float* zbuf, // (N, H, W, K)
+    float* pix_dists) { // (N, H, W, K)
   // Simple version: One thread per output pixel
   const int num_threads = gridDim.x * blockDim.x;
   const int tid = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int i = tid; i < N * S * S; i += num_threads) {
+  for (int i = tid; i < N * H * W; i += num_threads) {
     // Convert linear index to 3D index
-    const int n = i / (S * S); // Batch index
-    const int pix_idx = i % (S * S);
+    const int n = i / (H * W); // Batch index
+    const int pix_idx = i % (H * W);
 
     // Reverse ordering of the X and Y axis as the camera coordinates
     // assume that +Y is pointing up and +X is pointing left.
-    const int yi = S - 1 - pix_idx / S;
-    const int xi = S - 1 - pix_idx % S;
+    const int yi = H - 1 - pix_idx / W;
+    const int xi = W - 1 - pix_idx % W;
 
-    const float xf = PixToNdc(xi, S);
-    const float yf = PixToNdc(yi, S);
+    // Convert screen coordinates to NDC coordinates of the pixel.
+    const float xf = PixToNonSquareNdc(xi, W, H);
+    const float yf = PixToNonSquareNdc(yi, H, W);
 
     // For keeping track of the K closest points we want a data structure
     // that (1) gives O(1) access to the closest point for easy comparisons,
@@ -132,7 +134,7 @@ __global__ void RasterizePointsNaiveCudaKernel(
           points, p_idx, q_size, q_max_z, q_max_idx, q, radius, xf, yf, K);
     }
     BubbleSort(q, q_size);
-    int idx = n * S * S * K + pix_idx * K;
+    int idx = n * H * W * K + pix_idx * K;
     for (int k = 0; k < q_size; ++k) {
       point_idxs[idx + k] = q[k].idx;
       zbuf[idx + k] = q[k].z;
@@ -145,7 +147,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda(
     const at::Tensor& points, // (P. 3)
     const at::Tensor& cloud_to_packed_first_idx, // (N)
     const at::Tensor& num_points_per_cloud, // (N)
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const at::Tensor& radius,
     const int points_per_pixel) {
   // Check inputs are on the same device
@@ -169,7 +171,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda(
       "num_points_per_cloud must have same size first dimension as cloud_to_packed_first_idx");
 
   const int N = num_points_per_cloud.size(0); // batch size.
-  const int S = image_size;
+  const int H = std::get<0>(image_size);
+  const int W = std::get<1>(image_size);
   const int K = points_per_pixel;
 
   if (K > kMaxPointsPerPixel) {
@@ -180,9 +183,9 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda(
 
   auto int_opts = num_points_per_cloud.options().dtype(at::kInt);
   auto float_opts = points.options().dtype(at::kFloat);
-  at::Tensor point_idxs = at::full({N, S, S, K}, -1, int_opts);
-  at::Tensor zbuf = at::full({N, S, S, K}, -1, float_opts);
-  at::Tensor pix_dists = at::full({N, S, S, K}, -1, float_opts);
+  at::Tensor point_idxs = at::full({N, H, W, K}, -1, int_opts);
+  at::Tensor zbuf = at::full({N, H, W, K}, -1, float_opts);
+  at::Tensor pix_dists = at::full({N, H, W, K}, -1, float_opts);
 
   if (point_idxs.numel() == 0) {
     AT_CUDA_CHECK(cudaGetLastError());
@@ -197,7 +200,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsNaiveCuda(
       num_points_per_cloud.contiguous().data_ptr<int64_t>(),
       radius.contiguous().data_ptr<float>(),
       N,
-      S,
+      H,
+      W,
       K,
       point_idxs.contiguous().data_ptr<int32_t>(),
       zbuf.contiguous().data_ptr<float>(),
@@ -218,7 +222,8 @@ __global__ void RasterizePointsCoarseCudaKernel(
     const float* radius,
     const int N,
     const int P,
-    const int S,
+    const int H,
+    const int W,
     const int bin_size,
     const int chunk_size,
     const int max_points_per_bin,
@@ -226,13 +231,26 @@ __global__ void RasterizePointsCoarseCudaKernel(
     int* bin_points) {
   extern __shared__ char sbuf[];
   const int M = max_points_per_bin;
-  const int num_bins = 1 + (S - 1) / bin_size; // Integer divide round up
-  const float half_pix = 1.0f / S; // Size of half a pixel in NDC units
 
-  // This is a boolean array of shape (num_bins, num_bins, chunk_size)
+  // Integer divide round up
+  const int num_bins_x = 1 + (W - 1) / bin_size;
+  const int num_bins_y = 1 + (H - 1) / bin_size;
+
+  // NDC range depends on the ratio of W/H
+  // The shorter side from (H, W) is given an NDC range of 2.0 and
+  // the other side is scaled by the ratio of H:W.
+  const float NDC_x_half_range = NonSquareNdcRange(W, H) / 2.0f;
+  const float NDC_y_half_range = NonSquareNdcRange(H, W) / 2.0f;
+
+  // Size of half a pixel in NDC units is the NDC half range
+  // divided by the corresponding image dimension
+  const float half_pix_x = NDC_x_half_range / W;
+  const float half_pix_y = NDC_y_half_range / H;
+
+  // This is a boolean array of shape (num_bins_y, num_bins_x, chunk_size)
   // stored in shared memory that will track whether each point in the chunk
   // falls into each bin of the image.
-  BitMask binmask((unsigned int*)sbuf, num_bins, num_bins, chunk_size);
+  BitMask binmask((unsigned int*)sbuf, num_bins_y, num_bins_x, chunk_size);
 
   // Have each block handle a chunk of points and build a 3D bitmask in
   // shared memory to mark which points hit which bins.  In this first phase,
@@ -279,22 +297,24 @@ __global__ void RasterizePointsCoarseCudaKernel(
       // For example we could compute the exact bin where the point falls,
       // then check neighboring bins. This way we wouldn't have to check
       // all bins (however then we might have more warp divergence?)
-      for (int by = 0; by < num_bins; ++by) {
-        // Get y extent for the bin. PixToNdc gives us the location of
+      for (int by = 0; by < num_bins_y; ++by) {
+        // Get y extent for the bin. PixToNonSquareNdc gives us the location of
         // the center of each pixel, so we need to add/subtract a half
         // pixel to get the true extent of the bin.
-        const float by0 = PixToNdc(by * bin_size, S) - half_pix;
-        const float by1 = PixToNdc((by + 1) * bin_size - 1, S) + half_pix;
+        const float by0 = PixToNonSquareNdc(by * bin_size, H, W) - half_pix_y;
+        const float by1 =
+            PixToNonSquareNdc((by + 1) * bin_size - 1, H, W) + half_pix_y;
         const bool y_overlap = (py0 <= by1) && (by0 <= py1);
 
         if (!y_overlap) {
           continue;
         }
-        for (int bx = 0; bx < num_bins; ++bx) {
+        for (int bx = 0; bx < num_bins_x; ++bx) {
           // Get x extent for the bin; again we need to adjust the
-          // output of PixToNdc by half a pixel.
-          const float bx0 = PixToNdc(bx * bin_size, S) - half_pix;
-          const float bx1 = PixToNdc((bx + 1) * bin_size - 1, S) + half_pix;
+          // output of PixToNonSquareNdc by half a pixel.
+          const float bx0 = PixToNonSquareNdc(bx * bin_size, W, H) - half_pix_x;
+          const float bx1 =
+              PixToNonSquareNdc((bx + 1) * bin_size - 1, W, H) + half_pix_x;
           const bool x_overlap = (px0 <= bx1) && (bx0 <= px1);
 
           if (x_overlap) {
@@ -307,12 +327,13 @@ __global__ void RasterizePointsCoarseCudaKernel(
     // Now we have processed every point in the current chunk. We need to
     // count the number of points in each bin so we can write the indices
     // out to global memory. We have each thread handle a different bin.
-    for (int byx = threadIdx.x; byx < num_bins * num_bins; byx += blockDim.x) {
-      const int by = byx / num_bins;
-      const int bx = byx % num_bins;
+    for (int byx = threadIdx.x; byx < num_bins_y * num_bins_x;
+         byx += blockDim.x) {
+      const int by = byx / num_bins_x;
+      const int bx = byx % num_bins_x;
       const int count = binmask.count(by, bx);
       const int points_per_bin_idx =
-          batch_idx * num_bins * num_bins + by * num_bins + bx;
+          batch_idx * num_bins_y * num_bins_x + by * num_bins_x + bx;
 
       // This atomically increments the (global) number of points found
       // in the current bin, and gets the previous value of the counter;
@@ -322,8 +343,8 @@ __global__ void RasterizePointsCoarseCudaKernel(
 
       // Now loop over the binmask and write the active bits for this bin
       // out to bin_points.
-      int next_idx = batch_idx * num_bins * num_bins * M + by * num_bins * M +
-          bx * M + start;
+      int next_idx = batch_idx * num_bins_y * num_bins_x * M +
+          by * num_bins_x * M + bx * M + start;
       for (int p = 0; p < chunk_size; ++p) {
         if (binmask.get(by, bx, p)) {
           // TODO: Throw an error if next_idx >= M -- this means that
@@ -342,7 +363,7 @@ at::Tensor RasterizePointsCoarseCuda(
     const at::Tensor& points, // (P, 3)
     const at::Tensor& cloud_to_packed_first_idx, // (N)
     const at::Tensor& num_points_per_cloud, // (N)
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const at::Tensor& radius,
     const int bin_size,
     const int max_points_per_bin) {
@@ -363,20 +384,28 @@ at::Tensor RasterizePointsCoarseCuda(
   at::cuda::CUDAGuard device_guard(points.device());
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
+  const int H = std::get<0>(image_size);
+  const int W = std::get<1>(image_size);
+
   const int P = points.size(0);
   const int N = num_points_per_cloud.size(0);
-  const int num_bins = 1 + (image_size - 1) / bin_size; // divide round up
   const int M = max_points_per_bin;
 
-  if (num_bins >= 22) {
+  // Integer divide round up.
+  const int num_bins_y = 1 + (H - 1) / bin_size;
+  const int num_bins_x = 1 + (W - 1) / bin_size;
+
+  if (num_bins_y >= kMaxItemsPerBin || num_bins_x >= kMaxItemsPerBin) {
     // Make sure we do not use too much shared memory.
     std::stringstream ss;
-    ss << "Got " << num_bins << "; that's too many!";
+    ss << "In Coarse Rasterizer got num_bins_y: " << num_bins_y
+       << ", num_bins_x: " << num_bins_x << ", "
+       << "; that's too many!";
     AT_ERROR(ss.str());
   }
   auto opts = num_points_per_cloud.options().dtype(at::kInt);
-  at::Tensor points_per_bin = at::zeros({N, num_bins, num_bins}, opts);
-  at::Tensor bin_points = at::full({N, num_bins, num_bins, M}, -1, opts);
+  at::Tensor points_per_bin = at::zeros({N, num_bins_y, num_bins_x}, opts);
+  at::Tensor bin_points = at::full({N, num_bins_y, num_bins_x, M}, -1, opts);
 
   if (bin_points.numel() == 0) {
     AT_CUDA_CHECK(cudaGetLastError());
@@ -384,7 +413,7 @@ at::Tensor RasterizePointsCoarseCuda(
   }
 
   const int chunk_size = 512;
-  const size_t shared_size = num_bins * num_bins * chunk_size / 8;
+  const size_t shared_size = num_bins_y * num_bins_x * chunk_size / 8;
   const size_t blocks = 64;
   const size_t threads = 512;
 
@@ -395,7 +424,8 @@ at::Tensor RasterizePointsCoarseCuda(
       radius.contiguous().data_ptr<float>(),
       N,
       P,
-      image_size,
+      H,
+      W,
       bin_size,
       chunk_size,
       M,
@@ -412,19 +442,21 @@ at::Tensor RasterizePointsCoarseCuda(
 
 __global__ void RasterizePointsFineCudaKernel(
     const float* points, // (P, 3)
-    const int32_t* bin_points, // (N, B, B, T)
+    const int32_t* bin_points, // (N, BH, BW, T)
     const float* radius,
     const int bin_size,
     const int N,
-    const int B, // num_bins
+    const int BH, // num_bins y
+    const int BW, // num_bins x
     const int M,
-    const int S,
+    const int H,
+    const int W,
     const int K,
-    int32_t* point_idxs, // (N, S, S, K)
-    float* zbuf, // (N, S, S, K)
-    float* pix_dists) { // (N, S, S, K)
-  // This can be more than S^2 if S is not dividable by bin_size.
-  const int num_pixels = N * B * B * bin_size * bin_size;
+    int32_t* point_idxs, // (N, H, W, K)
+    float* zbuf, // (N, H, W, K)
+    float* pix_dists) { // (N, H, W, K)
+  // This can be more than H * W if H or W are not divisible by bin_size.
+  const int num_pixels = N * BH * BW * bin_size * bin_size;
   const int num_threads = gridDim.x * blockDim.x;
   const int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -434,21 +466,21 @@ __global__ void RasterizePointsFineCudaKernel(
     // into the same bin; this should give them coalesced memory reads when
     // they read from points and bin_points.
     int i = pid;
-    const int n = i / (B * B * bin_size * bin_size);
-    i %= B * B * bin_size * bin_size;
-    const int by = i / (B * bin_size * bin_size);
-    i %= B * bin_size * bin_size;
+    const int n = i / (BH * BW * bin_size * bin_size);
+    i %= BH * BW * bin_size * bin_size;
+    const int by = i / (BW * bin_size * bin_size);
+    i %= BW * bin_size * bin_size;
     const int bx = i / (bin_size * bin_size);
     i %= bin_size * bin_size;
 
     const int yi = i / bin_size + by * bin_size;
     const int xi = i % bin_size + bx * bin_size;
 
-    if (yi >= S || xi >= S)
+    if (yi >= H || xi >= W)
       continue;
 
-    const float xf = PixToNdc(xi, S);
-    const float yf = PixToNdc(yi, S);
+    const float xf = PixToNonSquareNdc(xi, W, H);
+    const float yf = PixToNonSquareNdc(yi, H, W);
 
     // This part looks like the naive rasterization kernel, except we use
     // bin_points to only look at a subset of points already known to fall
@@ -459,7 +491,7 @@ __global__ void RasterizePointsFineCudaKernel(
     float q_max_z = -1000;
     int q_max_idx = -1;
     for (int m = 0; m < M; ++m) {
-      const int p = bin_points[n * B * B * M + by * B * M + bx * M + m];
+      const int p = bin_points[n * BH * BW * M + by * BW * M + bx * M + m];
       if (p < 0) {
         // bin_points uses -1 as a sentinal value
         continue;
@@ -473,10 +505,10 @@ __global__ void RasterizePointsFineCudaKernel(
 
     // Reverse ordering of the X and Y axis as the camera coordinates
     // assume that +Y is pointing up and +X is pointing left.
-    const int yidx = S - 1 - yi;
-    const int xidx = S - 1 - xi;
+    const int yidx = H - 1 - yi;
+    const int xidx = W - 1 - xi;
 
-    const int pix_idx = n * S * S * K + yidx * S * K + xidx * K;
+    const int pix_idx = n * H * W * K + yidx * W * K + xidx * K;
     for (int k = 0; k < q_size; ++k) {
       point_idxs[pix_idx + k] = q[k].idx;
       zbuf[pix_idx + k] = q[k].z;
@@ -488,7 +520,7 @@ __global__ void RasterizePointsFineCudaKernel(
 std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsFineCuda(
     const at::Tensor& points, // (P, 3)
     const at::Tensor& bin_points,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const at::Tensor& radius,
     const int bin_size,
     const int points_per_pixel) {
@@ -503,18 +535,22 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsFineCuda(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   const int N = bin_points.size(0);
-  const int B = bin_points.size(1); // num_bins
+  const int BH = bin_points.size(1);
+  const int BW = bin_points.size(2);
   const int M = bin_points.size(3);
-  const int S = image_size;
   const int K = points_per_pixel;
+
+  const int H = std::get<0>(image_size);
+  const int W = std::get<1>(image_size);
+
   if (K > kMaxPointsPerPixel) {
     AT_ERROR("Must have num_closest <= 150");
   }
   auto int_opts = bin_points.options().dtype(at::kInt);
   auto float_opts = points.options().dtype(at::kFloat);
-  at::Tensor point_idxs = at::full({N, S, S, K}, -1, int_opts);
-  at::Tensor zbuf = at::full({N, S, S, K}, -1, float_opts);
-  at::Tensor pix_dists = at::full({N, S, S, K}, -1, float_opts);
+  at::Tensor point_idxs = at::full({N, H, W, K}, -1, int_opts);
+  at::Tensor zbuf = at::full({N, H, W, K}, -1, float_opts);
+  at::Tensor pix_dists = at::full({N, H, W, K}, -1, float_opts);
 
   if (point_idxs.numel() == 0) {
     AT_CUDA_CHECK(cudaGetLastError());
@@ -529,9 +565,11 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> RasterizePointsFineCuda(
       radius.contiguous().data_ptr<float>(),
       bin_size,
       N,
-      B,
+      BH,
+      BW,
       M,
-      S,
+      H,
+      W,
       K,
       point_idxs.contiguous().data_ptr<int32_t>(),
       zbuf.contiguous().data_ptr<float>(),
@@ -571,8 +609,8 @@ __global__ void RasterizePointsBackwardCudaKernel(
     const int yidx = H - 1 - yi;
     const int xidx = W - 1 - xi;
 
-    const float xf = PixToNdc(xidx, W);
-    const float yf = PixToNdc(yidx, H);
+    const float xf = PixToNonSquareNdc(xidx, W, H);
+    const float yf = PixToNonSquareNdc(yidx, H, W);
 
     const int p = idxs[i];
     if (p < 0)
diff --git a/pytorch3d/csrc/rasterize_points/rasterize_points.h b/pytorch3d/csrc/rasterize_points/rasterize_points.h
index f1ec1aaf..a13d9773 100644
--- a/pytorch3d/csrc/rasterize_points/rasterize_points.h
+++ b/pytorch3d/csrc/rasterize_points/rasterize_points.h
@@ -14,7 +14,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaiveCpu(
     const torch::Tensor& points,
     const torch::Tensor& cloud_to_packed_first_idx,
     const torch::Tensor& num_points_per_cloud,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int points_per_pixel);
 
@@ -24,7 +24,7 @@ RasterizePointsNaiveCuda(
     const torch::Tensor& points,
     const torch::Tensor& cloud_to_packed_first_idx,
     const torch::Tensor& num_points_per_cloud,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int points_per_pixel);
 #endif
@@ -43,7 +43,8 @@ RasterizePointsNaiveCuda(
 //                        for each pointcloud in the batch.
 //  radius: FloatTensor of shape (P) giving the radius (in NDC units) of
 //          each point in points.
-//  image_size: (S) Size of the image to return (in pixels)
+//  image_size: Tuple (H, W) giving the size in pixels of the output
+//              image to be rasterized.
 //  points_per_pixel: (K) The number closest of points to return for each pixel
 //
 // Returns:
@@ -62,7 +63,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaive(
     const torch::Tensor& points,
     const torch::Tensor& cloud_to_packed_first_idx,
     const torch::Tensor& num_points_per_cloud,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int points_per_pixel) {
   if (points.is_cuda() && cloud_to_packed_first_idx.is_cuda() &&
@@ -101,7 +102,7 @@ torch::Tensor RasterizePointsCoarseCpu(
     const torch::Tensor& points,
     const torch::Tensor& cloud_to_packed_first_idx,
     const torch::Tensor& num_points_per_cloud,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int bin_size,
     const int max_points_per_bin);
@@ -111,7 +112,7 @@ torch::Tensor RasterizePointsCoarseCuda(
     const torch::Tensor& points,
     const torch::Tensor& cloud_to_packed_first_idx,
     const torch::Tensor& num_points_per_cloud,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int bin_size,
     const int max_points_per_bin);
@@ -128,7 +129,8 @@ torch::Tensor RasterizePointsCoarseCuda(
 //                        for each pointcloud in the batch.
 //  radius: FloatTensor of shape (P) giving the radius (in NDC units) of
 //          each point in points.
-//  image_size: Size of the image to generate (in pixels)
+//  image_size: Tuple (H, W) giving the size in pixels of the output
+//              image to be rasterized.
 //  bin_size: Size of each bin within the image (in pixels)
 //
 // Returns:
@@ -140,7 +142,7 @@ torch::Tensor RasterizePointsCoarse(
     const torch::Tensor& points,
     const torch::Tensor& cloud_to_packed_first_idx,
     const torch::Tensor& num_points_per_cloud,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int bin_size,
     const int max_points_per_bin) {
@@ -182,7 +184,7 @@ torch::Tensor RasterizePointsCoarse(
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsFineCuda(
     const torch::Tensor& points,
     const torch::Tensor& bin_points,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int bin_size,
     const int points_per_pixel);
@@ -194,7 +196,8 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsFineCuda(
 //          are expected to be in NDC coordinates in the range [-1, 1].
 //  bin_points: int32 Tensor of shape (N, B, B, M) giving the indices of points
 //              that fall into each bin (output from coarse rasterization)
-//  image_size: Size of image to generate (in pixels)
+//  image_size: Tuple (H, W) giving the size in pixels of the output
+//              image to be rasterized.
 //  radius: FloatTensor of shape (P) giving the radius (in NDC units) of
 //          each point in points.
 //  bin_size: Size of each bin (in pixels)
@@ -214,7 +217,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsFineCuda(
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsFine(
     const torch::Tensor& points,
     const torch::Tensor& bin_points,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int bin_size,
     const int points_per_pixel) {
@@ -303,7 +306,8 @@ torch::Tensor RasterizePointsBackward(
 //                        for each pointcloud in the batch.
 //  radius: FloatTensor of shape (P) giving the radius (in NDC units) of
 //          each point in points.
-//  image_size:  (S) Size of the image to return (in pixels)
+//  image_size: Tuple (H, W) giving the size in pixels of the output
+//              image to be rasterized.
 //  points_per_pixel: (K) The number of points to return for each pixel
 //  bin_size: Bin size (in pixels) for coarse-to-fine rasterization. Setting
 //            bin_size=0 uses naive rasterization instead.
@@ -325,7 +329,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePoints(
     const torch::Tensor& points,
     const torch::Tensor& cloud_to_packed_first_idx,
     const torch::Tensor& num_points_per_cloud,
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int points_per_pixel,
     const int bin_size,
diff --git a/pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp b/pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp
index d7913f65..53cd6cba 100644
--- a/pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp
+++ b/pytorch3d/csrc/rasterize_points/rasterize_points_cpu.cpp
@@ -3,33 +3,27 @@
 #include <torch/extension.h>
 #include <queue>
 #include <tuple>
-
-// Given a pixel coordinate 0 <= i < S, convert it to a normalized device
-// coordinate in the range [-1, 1]. The NDC range is divided into S evenly-sized
-// pixels, and assume that each pixel falls in the *center* of its range.
-static float PixToNdc(const int i, const int S) {
-  // NDC x-offset + (i * pixel_width + half_pixel_width)
-  return -1 + (2 * i + 1.0f) / S;
-}
+#include "rasterization_utils.h"
 
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaiveCpu(
     const torch::Tensor& points, // (P, 3)
     const torch::Tensor& cloud_to_packed_first_idx, // (N)
     const torch::Tensor& num_points_per_cloud, // (N)
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int points_per_pixel) {
   const int32_t N = cloud_to_packed_first_idx.size(0); // batch_size.
 
-  const int S = image_size;
+  const int H = std::get<0>(image_size);
+  const int W = std::get<1>(image_size);
   const int K = points_per_pixel;
 
   // Initialize output tensors.
   auto int_opts = num_points_per_cloud.options().dtype(torch::kInt32);
   auto float_opts = points.options().dtype(torch::kFloat32);
-  torch::Tensor point_idxs = torch::full({N, S, S, K}, -1, int_opts);
-  torch::Tensor zbuf = torch::full({N, S, S, K}, -1, float_opts);
-  torch::Tensor pix_dists = torch::full({N, S, S, K}, -1, float_opts);
+  torch::Tensor point_idxs = torch::full({N, H, W, K}, -1, int_opts);
+  torch::Tensor zbuf = torch::full({N, H, W, K}, -1, float_opts);
+  torch::Tensor pix_dists = torch::full({N, H, W, K}, -1, float_opts);
 
   auto points_a = points.accessor<float, 2>();
   auto point_idxs_a = point_idxs.accessor<int32_t, 4>();
@@ -46,16 +40,16 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> RasterizePointsNaiveCpu(
     const int point_stop_idx =
         (point_start_idx + num_points_per_cloud[n].item().to<int32_t>());
 
-    for (int yi = 0; yi < S; ++yi) {
+    for (int yi = 0; yi < H; ++yi) {
       // Reverse the order of yi so that +Y is pointing upwards in the image.
-      const int yidx = S - 1 - yi;
-      const float yf = PixToNdc(yidx, S);
+      const int yidx = H - 1 - yi;
+      const float yf = PixToNonSquareNdc(yidx, H, W);
 
-      for (int xi = 0; xi < S; ++xi) {
+      for (int xi = 0; xi < W; ++xi) {
         // Reverse the order of xi so that +X is pointing to the left in the
         // image.
-        const int xidx = S - 1 - xi;
-        const float xf = PixToNdc(xidx, S);
+        const int xidx = W - 1 - xi;
+        const float xf = PixToNonSquareNdc(xidx, W, H);
 
         // Use a priority queue to hold (z, idx, r)
         std::priority_queue<std::tuple<float, int, float>> q;
@@ -99,25 +93,36 @@ torch::Tensor RasterizePointsCoarseCpu(
     const torch::Tensor& points, // (P, 3)
     const torch::Tensor& cloud_to_packed_first_idx, // (N)
     const torch::Tensor& num_points_per_cloud, // (N)
-    const int image_size,
+    const std::tuple<int, int> image_size,
     const torch::Tensor& radius,
     const int bin_size,
     const int max_points_per_bin) {
   const int32_t N = cloud_to_packed_first_idx.size(0); // batch_size.
-
-  const int B = 1 + (image_size - 1) / bin_size; // Integer division round up
   const int M = max_points_per_bin;
+
+  const float H = std::get<0>(image_size);
+  const float W = std::get<1>(image_size);
+
+  // Integer division round up.
+  const int BH = 1 + (H - 1) / bin_size;
+  const int BW = 1 + (W - 1) / bin_size;
+
   auto opts = num_points_per_cloud.options().dtype(torch::kInt32);
-  torch::Tensor points_per_bin = torch::zeros({N, B, B}, opts);
-  torch::Tensor bin_points = torch::full({N, B, B, M}, -1, opts);
+  torch::Tensor points_per_bin = torch::zeros({N, BH, BW}, opts);
+  torch::Tensor bin_points = torch::full({N, BH, BW, M}, -1, opts);
 
   auto points_a = points.accessor<float, 2>();
   auto points_per_bin_a = points_per_bin.accessor<int32_t, 3>();
   auto bin_points_a = bin_points.accessor<int32_t, 4>();
   auto radius_a = radius.accessor<float, 1>();
 
-  const float pixel_width = 2.0f / image_size;
-  const float bin_width = pixel_width * bin_size;
+  const float ndc_x_range = NonSquareNdcRange(W, H);
+  const float pixel_width_x = ndc_x_range / W;
+  const float bin_width_x = pixel_width_x * bin_size;
+
+  const float ndc_y_range = NonSquareNdcRange(H, W);
+  const float pixel_width_y = ndc_y_range / H;
+  const float bin_width_y = pixel_width_y * bin_size;
 
   for (int n = 0; n < N; ++n) {
     // Loop through each pointcloud in the batch.
@@ -129,15 +134,15 @@ torch::Tensor RasterizePointsCoarseCpu(
         (point_start_idx + num_points_per_cloud[n].item().to<int32_t>());
 
     float bin_y_min = -1.0f;
-    float bin_y_max = bin_y_min + bin_width;
+    float bin_y_max = bin_y_min + bin_width_y;
 
     // Iterate through the horizontal bins from top to bottom.
-    for (int by = 0; by < B; by++) {
+    for (int by = 0; by < BH; by++) {
       float bin_x_min = -1.0f;
-      float bin_x_max = bin_x_min + bin_width;
+      float bin_x_max = bin_x_min + bin_width_x;
 
       // Iterate through bins on this horizontal line, left to right.
-      for (int bx = 0; bx < B; bx++) {
+      for (int bx = 0; bx < BW; bx++) {
         int32_t points_hit = 0;
         for (int p = point_start_idx; p < point_stop_idx; ++p) {
           float px = points_a[p][0];
@@ -172,11 +177,11 @@ torch::Tensor RasterizePointsCoarseCpu(
 
         // Shift the bin to the right for the next loop iteration
         bin_x_min = bin_x_max;
-        bin_x_max = bin_x_min + bin_width;
+        bin_x_max = bin_x_min + bin_width_x;
       }
       // Shift the bin down for the next loop iteration
       bin_y_min = bin_y_max;
-      bin_y_max = bin_y_min + bin_width;
+      bin_y_max = bin_y_min + bin_width_y;
     }
   }
   return bin_points;
@@ -194,11 +199,6 @@ torch::Tensor RasterizePointsBackwardCpu(
   const int W = idxs.size(2);
   const int K = idxs.size(3);
 
-  // For now only support square images.
-  // TODO(jcjohns): Extend to non-square images.
-  if (H != W) {
-    AT_ERROR("RasterizePointsBackwardCpu only supports square images");
-  }
   torch::Tensor grad_points = torch::zeros({P, 3}, points.options());
 
   auto points_a = points.accessor<float, 2>();
@@ -212,7 +212,7 @@ torch::Tensor RasterizePointsBackwardCpu(
       // Reverse the order of yi so that +Y is pointing upwards in the image.
       const int yidx = H - 1 - y;
       // Y coordinate of the top of the pixel.
-      const float yf = PixToNdc(yidx, H);
+      const float yf = PixToNonSquareNdc(yidx, H, W);
 
       // Iterate through pixels on this horizontal line, left to right.
       for (int x = 0; x < W; ++x) { // Loop over pixels in the row
@@ -220,7 +220,7 @@ torch::Tensor RasterizePointsBackwardCpu(
         // Reverse the order of xi so that +X is pointing to the left in the
         // image.
         const int xidx = W - 1 - x;
-        const float xf = PixToNdc(xidx, W);
+        const float xf = PixToNonSquareNdc(xidx, W, H);
         for (int k = 0; k < K; ++k) { // Loop over points for the pixel
           const int p = idxs_a[n][y][x][k];
           if (p < 0) {
diff --git a/pytorch3d/renderer/mesh/__init__.py b/pytorch3d/renderer/mesh/__init__.py
index a0a01086..d8b6b13b 100644
--- a/pytorch3d/renderer/mesh/__init__.py
+++ b/pytorch3d/renderer/mesh/__init__.py
@@ -6,6 +6,7 @@ from .rasterizer import MeshRasterizer, RasterizationSettings
 from .renderer import MeshRenderer
 from .shader import TexturedSoftPhongShader  # DEPRECATED
 from .shader import (
+    BlendParams,
     HardFlatShader,
     HardGouraudShader,
     HardPhongShader,
diff --git a/pytorch3d/renderer/mesh/rasterize_meshes.py b/pytorch3d/renderer/mesh/rasterize_meshes.py
index c0702631..d4cb24ab 100644
--- a/pytorch3d/renderer/mesh/rasterize_meshes.py
+++ b/pytorch3d/renderer/mesh/rasterize_meshes.py
@@ -1,7 +1,7 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 
 
-from typing import Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -20,7 +20,7 @@ kMaxFacesPerBin = 22
 
 def rasterize_meshes(
     meshes,
-    image_size: Union[int, Tuple[int, int]] = 256,
+    image_size: Union[int, List[int], Tuple[int, int]] = 256,
     blur_radius: float = 0.0,
     faces_per_pixel: int = 8,
     bin_size: Optional[int] = None,
@@ -219,7 +219,7 @@ class _RasterizeFaceVerts(torch.autograd.Function):
         face_verts,
         mesh_to_face_first_idx,
         num_faces_per_mesh,
-        image_size: Tuple[int, int] = (256, 256),
+        image_size: Union[List[int], Tuple[int, int]] = (256, 256),
         blur_radius: float = 0.01,
         faces_per_pixel: int = 0,
         bin_size: int = 0,
@@ -287,11 +287,6 @@ class _RasterizeFaceVerts(torch.autograd.Function):
         return grads
 
 
-def pix_to_ndc(i, S):
-    # NDC x-offset + (i * pixel_width + half_pixel_width)
-    return -1 + (2 * i + 1.0) / S
-
-
 def non_square_ndc_range(S1, S2):
     """
     In the case of non square images, we scale the NDC range
diff --git a/pytorch3d/renderer/points/compositor.py b/pytorch3d/renderer/points/compositor.py
index 6f6c274c..650f0a3a 100644
--- a/pytorch3d/renderer/points/compositor.py
+++ b/pytorch3d/renderer/points/compositor.py
@@ -75,7 +75,7 @@ def _add_background_color_to_images(pix_idxs, images, background_color):
             pixels with accumulated features have unchanged values.
     """
     # Initialize background mask
-    background_mask = pix_idxs[:, 0] < 0  # (N, image_size, image_size)
+    background_mask = pix_idxs[:, 0] < 0  # (N, H, W)
 
     # Convert background_color to an appropriate tensor and check shape
     if not torch.is_tensor(background_color):
diff --git a/pytorch3d/renderer/points/rasterize_points.py b/pytorch3d/renderer/points/rasterize_points.py
index 2942f708..80a0e99b 100644
--- a/pytorch3d/renderer/points/rasterize_points.py
+++ b/pytorch3d/renderer/points/rasterize_points.py
@@ -6,7 +6,7 @@ import torch
 
 # pyre-fixme[21]: Could not find name `_C` in `pytorch3d`.
 from pytorch3d import _C
-from pytorch3d.renderer.mesh.rasterize_meshes import pix_to_ndc
+from pytorch3d.renderer.mesh.rasterize_meshes import pix_to_non_square_ndc
 
 
 # Maxinum number of faces per bins for
@@ -14,17 +14,30 @@ from pytorch3d.renderer.mesh.rasterize_meshes import pix_to_ndc
 kMaxPointsPerBin = 22
 
 
-# TODO(jcjohns): Support non-square images
 def rasterize_points(
     pointclouds,
-    image_size: int = 256,
+    image_size: Union[int, List[int], Tuple[int, int]] = 256,
     radius: Union[float, List, Tuple, torch.Tensor] = 0.01,
     points_per_pixel: int = 8,
     bin_size: Optional[int] = None,
     max_points_per_bin: Optional[int] = None,
 ):
     """
-    Pointcloud rasterization
+    Each pointcloud is rasterized onto a separate image of shape
+    (H, W) if `image_size` is a tuple or (image_size, image_size) if it
+    is an int.
+
+    If the desired image size is non square (i.e. a tuple of (H, W) where H != W)
+    the aspect ratio needs special consideration. There are two aspect ratios
+    to be aware of:
+        - the aspect ratio of each pixel
+        - the aspect ratio of the output image
+    The camera can be used to set the pixel aspect ratio. In the rasterizer,
+    we assume square pixels, but variable image aspect ratio (i.e. rectangular images).
+
+    In most cases you will want to set the camera aspect ratio to
+    1.0 (i.e. square pixels) and only vary the
+    `image_size` (i.e. the output image dimensions in pixels).
 
     Args:
         pointclouds: A Pointclouds object representing a batch of point clouds to be
@@ -34,7 +47,8 @@ def rasterize_points(
             be in normalized device coordinates (NDC): [-1, 1]^3 with the camera at
             (0, 0, 0); In the camera coordinate frame the x-axis goes from right-to-left,
             the y-axis goes from bottom-to-top, and the z-axis goes from back-to-front.
-        image_size: Integer giving the resolution of the rasterized image
+        image_size: Size in pixels of the output image to be rasterized.
+            Can optionally be a tuple of (H, W) in the case of non square images.
         radius (Optional): The radius (in NDC units) of the disk to
             be rasterized. This can either be a float in which case the same radius is used
             for each point, or a torch.Tensor of shape (N, P) giving a radius per point
@@ -71,6 +85,9 @@ def rasterize_points(
           then `dists[n, y, x, k]` is the squared distance between the pixel (y, x)
           and the point `(points[n, p, 0], points[n, p, 1])`. Pixels hit with fewer
           than points_per_pixel are padded with -1.
+
+        In the case that image_size is a tuple of (H, W) then the outputs
+        will be of shape `(N, H, W, ...)`.
     """
     points_packed = pointclouds.points_packed()
     cloud_to_packed_first_idx = pointclouds.cloud_to_packed_first_idx()
@@ -78,26 +95,46 @@ def rasterize_points(
 
     radius = _format_radius(radius, pointclouds)
 
+    # In the case that H != W use the max image size to set the bin_size
+    # to accommodate the num bins constraint in the coarse rasterizer.
+    # If the ratio of H:W is large this might cause issues as the smaller
+    # dimension will have fewer bins.
+    # TODO: consider a better way of setting the bin size.
+    if isinstance(image_size, (tuple, list)):
+        if len(image_size) != 2:
+            raise ValueError("Image size can only be a tuple/list of (H, W)")
+        # Normalize to a tuple: `%` formatting with a list operand raises a
+        # TypeError, which would mask the ValueErrors below for list inputs.
+        im_size = tuple(image_size)
+        if not all(i > 0 for i in im_size):
+            raise ValueError("Image sizes must be greater than 0; got %d, %d" % im_size)
+        if not all(type(i) == int for i in im_size):
+            raise ValueError("Image sizes must be integers; got %f, %f" % im_size)
+        max_image_size = max(im_size)
+    else:
+        im_size = (image_size, image_size)
+        max_image_size = image_size
+
     if bin_size is None:
         if not points_packed.is_cuda:
             # Binned CPU rasterization not fully implemented
             bin_size = 0
         else:
             # TODO: These heuristics are not well-thought out!
-            if image_size <= 64:
+            if max_image_size <= 64:
                 bin_size = 8
-            elif image_size <= 256:
+            elif max_image_size <= 256:
                 bin_size = 16
-            elif image_size <= 512:
+            elif max_image_size <= 512:
                 bin_size = 32
-            elif image_size <= 1024:
+            elif max_image_size <= 1024:
                 bin_size = 64
 
     if bin_size != 0:
         # There is a limit on the number of points per bin in the cuda kernel.
         # pyre-fixme[58]: `//` is not supported for operand types `int` and
         #  `Union[int, None, int]`.
-        points_per_bin = 1 + (image_size - 1) // bin_size
+        points_per_bin = 1 + (max_image_size - 1) // bin_size
         if points_per_bin >= kMaxPointsPerBin:
             raise ValueError(
                 "bin_size too small, number of points per bin must be less than %d; got %d"
@@ -114,7 +151,7 @@ def rasterize_points(
         points_packed,
         cloud_to_packed_first_idx,
         num_points_per_cloud,
-        image_size,
+        im_size,
         radius,
         points_per_pixel,
         bin_size,
@@ -173,7 +210,7 @@ class _RasterizePoints(torch.autograd.Function):
         points,  # (P, 3)
         cloud_to_packed_first_idx,
         num_points_per_cloud,
-        image_size: int = 256,
+        image_size: Union[List[int], Tuple[int, int]] = (256, 256),
         radius: Union[float, torch.Tensor] = 0.01,
         points_per_pixel: int = 8,
         bin_size: int = 0,
@@ -225,7 +262,7 @@ class _RasterizePoints(torch.autograd.Function):
 
 def rasterize_points_python(
     pointclouds,
-    image_size: int = 256,
+    image_size: Union[int, Tuple[int, int]] = 256,
     radius: Union[float, torch.Tensor] = 0.01,
     points_per_pixel: int = 8,
 ):
@@ -235,7 +272,12 @@ def rasterize_points_python(
     Inputs / Outputs: Same as above
     """
     N = len(pointclouds)
-    S, K = image_size, points_per_pixel
+    H, W = (
+        image_size
+        if isinstance(image_size, (tuple, list))
+        else (image_size, image_size)
+    )
+    K = points_per_pixel
     device = pointclouds.device
 
     points_packed = pointclouds.points_packed()
@@ -247,11 +289,11 @@ def rasterize_points_python(
 
     # Intialize output tensors.
     point_idxs = torch.full(
-        (N, S, S, K), fill_value=-1, dtype=torch.int32, device=device
+        (N, H, W, K), fill_value=-1, dtype=torch.int32, device=device
     )
-    zbuf = torch.full((N, S, S, K), fill_value=-1, dtype=torch.float32, device=device)
+    zbuf = torch.full((N, H, W, K), fill_value=-1, dtype=torch.float32, device=device)
     pix_dists = torch.full(
-        (N, S, S, K), fill_value=-1, dtype=torch.float32, device=device
+        (N, H, W, K), fill_value=-1, dtype=torch.float32, device=device
     )
 
     # NDC is from [-1, 1]. Get pixel size using specified image size.
@@ -263,18 +305,18 @@ def rasterize_points_python(
         point_stop_idx = point_start_idx + num_points_per_cloud[n]
 
         # Iterate through the horizontal lines of the image from top to bottom.
-        for yi in range(S):
+        for yi in range(H):
             # Y coordinate of one end of the image. Reverse the ordering
             # of yi so that +Y is pointing up in the image.
-            yfix = S - 1 - yi
-            yf = pix_to_ndc(yfix, S)
+            yfix = H - 1 - yi
+            yf = pix_to_non_square_ndc(yfix, H, W)
 
             # Iterate through pixels on this horizontal line, left to right.
-            for xi in range(S):
+            for xi in range(W):
                 # X coordinate of one end of the image. Reverse the ordering
                 # of xi so that +X is pointing to the left in the image.
-                xfix = S - 1 - xi
-                xf = pix_to_ndc(xfix, S)
+                xfix = W - 1 - xi
+                xf = pix_to_non_square_ndc(xfix, W, H)
 
                 top_k_points = []
                 # Check whether each point in the batch affects this pixel.
diff --git a/pytorch3d/renderer/points/rasterizer.py b/pytorch3d/renderer/points/rasterizer.py
index 85e93e42..e8794f4c 100644
--- a/pytorch3d/renderer/points/rasterizer.py
+++ b/pytorch3d/renderer/points/rasterizer.py
@@ -2,7 +2,7 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 
 
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -29,7 +29,7 @@ class PointsRasterizationSettings:
 
     def __init__(
         self,
-        image_size: int = 256,
+        image_size: Union[int, Tuple[int, int]] = 256,
         radius: Union[float, torch.Tensor] = 0.01,
         points_per_pixel: int = 8,
         bin_size: Optional[int] = None,
diff --git a/tests/bm_rasterize_points.py b/tests/bm_rasterize_points.py
index d00e45ac..70e4a778 100644
--- a/tests/bm_rasterize_points.py
+++ b/tests/bm_rasterize_points.py
@@ -74,6 +74,21 @@ def bm_python_vs_cpu_vs_cuda() -> None:
     kwargs_list += [
         {"N": 32, "P": 100000, "img_size": 128, "radius": 0.01, "pts_per_pxl": 50},
         {"N": 8, "P": 200000, "img_size": 512, "radius": 0.01, "pts_per_pxl": 50},
+        {"N": 8, "P": 200000, "img_size": 256, "radius": 0.01, "pts_per_pxl": 50},
+        {
+            "N": 8,
+            "P": 200000,
+            "img_size": (512, 256),
+            "radius": 0.01,
+            "pts_per_pxl": 50,
+        },
+        {
+            "N": 8,
+            "P": 200000,
+            "img_size": (256, 512),
+            "radius": 0.01,
+            "pts_per_pxl": 50,
+        },
     ]
     for k in kwargs_list:
         k["device"] = "cuda"
diff --git a/tests/data/test_pointcloud_rectangle_image.png b/tests/data/test_pointcloud_rectangle_image.png
new file mode 100644
index 0000000000000000000000000000000000000000..8bf30329477ed9d2a3945c63451fb05c32e33a93
GIT binary patch
literal 20251
zcmeHv_aoK)`~T}8BGgSv2$d0vlATp4d(W~9Wkd;atkdGAGLyZsLL4K>Rw^smJDmy{
z$H+Lw;T+%V<^BGA{)X=lpYua^xp|G}^Lbs@<9a+EmoNi8brvQrCImrPG&R(W5QGl?
zN`vgBf&awf!xj)k@3*Gf1ry(t`7!@zDUQP%OR2-DD|c@?b&lSy88uTC;XK2}A>hkD
z5!Yw;><cGj*pA(+hiu0g8KVw_ec|6=dc7$aF7%~Vl=0YJUgLjQ>Tw@l-Fj?PJ-?6=
z*_jgApi+F_V^O*D>pP#y)Uo;XmF9`z;hVXgUxn^icLt6PN=u>-@_(QITY>+tR$%Vb
zJ9}FMar5x08JqDMnO}>Jj)uRw$%F=8Mvyn55Jca;3=LVFJh;3?;f-wmb>~?`#2U5^
zzUP*dOU1Q(6G`S9LxXz}RXza(`OEk8^Yv(rPf3WELh6+)!y+@5rjrMm5_NK|#P@Gq
z#~^Z_he{+6<Q;onue_3y(%G~BJiqejLxuo?WsK571e7#{VrRUtL*^_<V?#88G$wjh
zS>3yLueY~1Ha515=I6>Ly&yvVt$I9~bFG3F`5UZ(Aaf4N>77JU_1Hpb>DlCtfq5Ux
zBC|Rhyoo3YOHsj!C3nc2LqGR~7;)Z?{IRVm<@$_BV=#^mY5aDOHMF_8nf3v$bLw7|
z6>DhI3*_L_&jHH8f27H=Nnv4(f%|cEWHK+VUf-;+|K4^cyn_-Of}DTe^eZ7LDg5*G
z@$vBwc6Yj%RP#-ko?J}S`J*#?@4NebDX!;g$GfG8ZZr$r6z-4<5dA|;=+mBbuk`ov
z@xfvhxSn5Q$JyWqT`F8Ek^~T=#_>CHW@cvaA_^Ydtf7}Oqp$~KkoRNm#*9%-ze=v`
z{JJBT+z~^IAR!Uxv%^1~Zr+^w^T%NiKMyOzqYEM!E^hAf(Rr6W8CK%NQh)s5dB5ce
zg~hq`EqFt0U*f|0`P6US-QCgA|0t2o@y9U;!hDnw!KL1CadE+7u`VvNZy9kTwT1M)
z(#ahrqW$Z)m3SiuiT!-uxg1C<or6dtxnPAFhah%WXb`*5y`fE|7N@wLf4P2E-Xf1q
zqhTF?ZT*E-PJPjX6urFO8Luj<SwrFT&jh2Rq%wMEq>?-8@myaH(;|%`=u#XQcg<F{
zjo5Ohc{$y%IQPR>(LHdg`&=)PBtVYmYyR?uoHj7y6|zKvleN3kbjs-x2lmpPCcne<
ziv^pkN=)S$iMZn2%ZCSc2KD`6jV&3_*J&h@+P+H2cSRNns$wT26^zd^H?96j?RBmk
zby52DHe{6r^FLQ~uN<B9tQvENZ=N(PLZ(?1Nai8~bQ#^Zq!?nmYzaE2$n~>j?c!pG
zo({VWknsp|CnyT`lJU!zFFdTx2~QM7B4dzMtP&CLH6V-chnOh3R#c##sTKA{`ZC#>
zTqs^MS9US;=lYg$eqToJB?L(^hEVaLN&W)&hQq2t+}4`eqK4SN7~~Ae^f&=u&AnjP
z?_#QQ^(be^SrwJV^({EgUP9K^)=G=E9LUd$>ah62)5S9+>?2kOR#k#+Yw7mTIt7BS
z)Z#_*`T&NefaUe}Yb+B7(;mOL+U*Anyl0p%y$Y5-bt+LOZQWxZa_7Pc3{t}Wc0ka3
z$K}8f`Fd%Bvog*E<5+$*jCzzcH0Yr-TgeX-*fFQ_VJmTA28ci2NcBi6e4)rpEKaM5
zN=1;v2hdQs5U5*ZX3Ri-E78k;rnlAel!^^@CI@>VIL9dZmU5hy=Bfu1K{pns%c$zo
zQQoO6HcIc1`~d4$W%ZKI&dp6{l8tQ1vQ@YTnUg{n=3zKwi>(0V-nSBg2Q1*tncuyC
ze=GHX)yadBi8_kBvX`fV8VGOs8k3Mu<4Y>krO4-3E-g?--DH%=5(k++*qu4Zbd{im
z7n(i)iW9+=tD=kbdXNZ<V<H--CFq(Ml-1qE@IaG>f2z<tOd@!*J(r13TIlQ4iePMO
ztg!y9w{pSBTd8zE`^|0Vy%Yi$DJolY<?QV2{_6{IbQz0GII39_H3@^9$G~AbdOW=|
z^x`q^Mce(qbssgt{^uBp#A#JoiKll?NkQ@oWJdy%hL*;gCtK?)=XDLYW@igdT}WUG
zRpjkF5vf>KQgR0JV3-<N2yxk?8d&VB_`{_lgrZA}ypu(jSFOtGn}Z~|F1X5cab<J$
zf*qMjRXJThuQySeKv}q7<R4JjUqCv_7N(YI(8Z1rZ*NUbI##iTZJDl0+-s{}&5`Kn
z?9_*kj*ePZH%S`@!L{%F-QRTMk1+^0BqqBl4FR@+zt#<YcwwZBCaTo|u5~=xcePtR
zP}!D1r)pE2w70dLrRjCQOO4OGg6G7)A{gJ54p~Z2!!wT9TQ+MO5fKq><C&L24;?y`
zn}>dcg@yZ>aL$B>S}Fo$b1_aaF|j6UE(RIpM3>AFKB=>*7Z@9Zch-X;b5VTiZIrVa
z2(uL-_APxSkwL3jHTM_O($muu6Qf5rB@=Y4W0E9t(^9%VT`#Q{LqstK>(rVt_nNU{
z$WAWO>7Dx*&(lL0Ll4jk$gs;3nx=~E%=9F>#%c!0`HJM}tXyS9ER9{!Q8)xfF9@a6
zJ~gw3Ev7S5R`)ujpFVvWZQROzP5oK^v~96fg`gnh#uRCE{S8FpVt%CN=_2)8V~gYf
z<pu%-BqWvrZ`yRB_;zWA*xBF;hKzof)|E^Hjb#?U7qnM{OMEa!>(oh_Y~lRc5{px;
zTdweQ+UV2eV(XxeqPc)A13i0iLg3%yOOWR)M^7`5nQ+x4@dom8e8ZbX;yejk5wv=>
z!Ly{Kgnx;$yu3`MY*A>Zk6u2X8o^5`q#&Z^$IvG|iazPpyk1CjXP8Mkrw-Mu_GlkI
zeE29AnuZl7Rf`afk3a9oi8pL7-;slYu)auu-8Q<l+9rnQQ4ZL6YP)^C$_gqwwjnsL
z*YEGo4;lhxZ{G?m1uCSPiG*q?dO?YH7JG;dt8Fr9Fh?3?paT5-c)fB|U0r=X4|4W|
zYS|u5M3Fc0`&j(>)t+wKYg85L#(3oqY@vwK`=u}3>vO~GVQPKtwn3YYah}svp|8x4
z2`>7alP-7dhHQlqyDfN?JN7fFiBO-QcAYy{?!WG?^w4}#sZ7(H4QFuj;9k0*zmNDJ
z>^*{3lG!N1zxLS<Rk+6gn1uVo8JwL7mF9@PIiWD9Z+JyrT@#?z%?D5_pX3B8$Ru~b
zxu~<5nG^YEJA(rUfhr!~PaT7DIO<+APqf`3o7XSLRaMFNbDFS_r6)Ow*?h~d6Kt+4
zVG!IoHS}$JICXTl%0}nUx#|Fx*soe;B~!6jyF=Z&zxYE*T3T9O-Wgjp!I`Re(|~-e
za&0n@-cW*X?LH$i^b+cwl|@0nOO8=~npXPSBy2_9YR_2BeeI26_g@S&7-O0)6j3Ef
zi^&CT`vQEpk|jBzZHuuGrWhngnsRqRB3}QBt^09HDui;G26-ojZg~__+ijWT@V<Tz
zmZm7f`Vjk#0A&t}c2h^YM1vJyAII}6g9KW;aH^wu<FBSrm7UGb(57EYOG|g;-mu|L
zs7HLi)5H?<;I*P!`kLTV#D!J0$c&g=YEsyr8T0|c-F~)<Gkj_UjpBV$2Hw|NoEmVc
zxR{yY^ceD6pc<}uoH1SWAXDoN0tX^psU7rUscANV-LI-miMc6ixCy%|VU07P;rDp`
zdhn%Ij(zYjf}0PAYwFpR4+{wkldc^N<cb#^kU)#8Oq`aZ<PpfDx?2T<L<k0{6tM$)
zU#lC_*lE!h^cSQEP5DdjLcO+jQ}VIH{M`Em$oN5`i^PL8!xru3&01_W-CpF*04%ke
zzj%E6Tg%EtCfFnKgG_AA5Md6I!}k^cLI5ezPJ4{HSCXb}X#(Zd_3F14YJyj@_^R|K
zy%bz4M*8|p2&<PQ>FVe-1QB>HD0z}c{CL*PS&^YH=*2zs@{GCM(^{EuYCCK`>56_m
z2Psgn5-W3>iOX5ah1x&NOsN+71}YInM~)sfZT6{Aif#Sc*;#0BBRU|fOY^ebj8PS8
zS&mVwL)ow!>y{-0;&ua)!@J5E=@{fq1B2?TVA3{}4AWIx(YOF*!gOMEaCPs@o8Tnf
zEOCnn$QHa5b5V}Cl-1Lin6{k${{HKwa<vlNP|{Iw^5uHz(5Fw2f8g(>(y1=slbPJu
z!XB&bWE!+gt+bA&wPv5DR1RcXdv?wHoKMwTH|cSEYlD|f?x1X~YomAU<9&2N5CCoX
zG#^rO7k#ioz}Ee_##e<goNpV<NVKHQfK+Kok9My&Sg*F^gdj(yTQbQ-W`)CUZ6Ah)
zhFV(w4KkIe-}o&UZ*_d8*_j<@RwzA(=lSpR6<^GRU}mC!lxa-TMaTmK?VA<XbieGG
z@T>wntZ16m9i>iT=^st{_98Y8TW3=UuqsK4+tl<@j7ke*!wzL}-;<BeWq-WaqxtDm
zW4emJvgf`+DrxW}f=oY!<K>g1;N8~YUw1!G8e`DipflK`{{zxQy@5fInV?I&pQY!U
zhGgdh2|-EJkLMTOXIY82`#5k=h`Nva1}3y}mRn9lYJ}|o7D)H~9cF`HFY*X<_4fAm
z^V=+fbhbGe50x*ikw@;1DvR7X=0{a_kRVV@W(m0=@Vs~Rn7dzJ4KkBizHOHJ$EPNe
zM}E0GGRQ;_!PhQ_AKVky?iluy<ZPQ;d-3>&XVsfXV$4mgoER=Cr)&RqsWY^A(HM%x
zg@68CAGj64t2FCb#fzOjcC~A$A-KW$-W@<yJsE|?)mr^~qv@eNql1ciyA_f=5410t
z>x91G7yk}Bl~;_;!&<J>dDlV`jYA;Ezc)BJeR&hxBJTI3;8*nCde{PK$x5;Kc1>Xe
zzoNI?J^lys=~VRwhTY!zFI3&Ud)LtVOU8mKzHV&<kdXDsgES9t%yQgow)99zKXhk9
zFS8E@d3p5}cL$qL6ikivD0CFfnlWV+6+9bGB53Jpqzd&tGy9p#JAh{kYF;yQnt4`z
z@98m8NPWqyR#!eqJo*B0C?8%;a;c~WY!wz3rXg^gu`y#4k<{DBiMOd<8*%OL7m97Q
z-?<CWo(o=d9#TUh_U`Qbc@m#)bFQ%Zl`Bq8POA?-z`hXo2mPgWt%`7VeIWWFo{qmF
z*Wl#aYp3OlM(5i`lLpvVm5_q=n}E8U%2!rapr`EVH%W?R9pQNsMEZWOGjRG6U(AOu
zsd}N_(0MALfvyekoXh>);Iy@zX@g!^-zZma<n`;<FJ7Kyn`|DxYn-o7kcPy40IlB}
zEiD>3Mpv(1HC)UOIbg|-)OGB%wY7cx`0?x4X9582GCUZgen<llRa*IaPWJnqM_M;T
z<JwAg)DYZhG$+2x9&x`m<L}eIVA-ElzNr1G`?a%Gqin-w2h=BB{K{GB!(}fKO`442
z5~F+`l_0BE!HIfyBl>Ta8RC!@#c^m>6WyIeF_J}S&luWtth+&Rb0vLe`wO?ni**6C
z*SiR3w8n@<mZLY-)3_$>!jFIzUDLuW+4)WKU_;OjWt~qY==&LFjt~1s=u+AznVvkH
zVX`cG**>Y@Gne44U0P3LaO#k4AgTWm!~TLyr&+r$CLC6bgJNFTZ<G(Mv>>5n(sR<&
zPb2sU()bc>5o-W!6|CfXhre8<%94mu<fSc&OJOlJ1|Kjf1`sdeN>=E8KGr7}2iW+P
zFc#OYL0KNyTKIf@>D5#5YZey1%dNs&#mF`Z9}HON^L0ZO$$g<sxm8XBN=UB%4jU!(
zVkRH;Z)hm%Q6@T?2ddE2BVC8lgQ69FQ|DM&wR&;o&ld)k$Gu|#6$f6yhtD<qeIoI-
zqeE9$_i|QuXE1f;fhzQiKLXdsaxFLRf4{4qZ_+mD*|T!f(ed*O+6hnjs*m~szuAq3
zRg{&*$NaNH)W7Vvy+7u?wKl(!E_z^Z%L5S%obfoV%PNE#C*S{lb?$Zt$-t~&qJUi-
z;-H^1#pu#Eh~_NU!Q#Zk#MahUi&KYS$uJv`8U#{(vTa`5;N#mJDTG;{n&wD8kBV!v
zOn_G)SS$!8rCEoU3NJX>;LX!rbA$TW5hZeE#Kn6Gsp9e4%H}m=yt4!bq=W;F$}jNr
zl8GkvZ)c;j!bt#tNAUN&y-lC*GhnA><yCxtvy*rD*?D>SBhRk@^&yCfI!a?C(YH%N
zY+u5f85H(giJL<oX}MKq6s;kk)7@|lQ!+X~=5DSlXnFH(q5vXFzR&u1IxCV_!RwZr
zo12%{3s`kKz$AA5u_N4dmJRlt#5V}6ob%wyY$3iN5s^vOt+RJ{KKBVw4H^k=@P`ic
zHF(Yql|B`(hDwmr>)dqaRrmDQuU{eS>Ym;vap1axRv$PnXIb4Q#FADZ&Y=)fGZDmK
zx4eg`U5qSptSolkqv}H};J~46V5&5mc>59rTu)ETNJt?oq^r)U*mV;|Rl>Bb4fI+3
z^XJ*aKEXZa=H@2jZK<Ew^;<aFd?f<+s4D-f-160-3$N4J?sblmg+c7lSP^b<kYuye
zZukZ*M*-dR@Xho%>yMrfnSMi11S9ZcE)@71^<pBaP;ql2HhtrhaQ4E^L~Cq3VBEgG
zzOnlR=%p8{iQ<9mC89P)<zL@47cAK7s-AnUa;`2L8kc0<Krx^#n9#21ViNgg`LH(|
zj)B+L&eOBUr22#`Y4Yub<hxJOkm+?Qa9pJAnHgu(^&NQf?{HwtQhy4oY=#XI>lUhs
zt8}U%FIrP1z4`kV>;tK|SM(|Q<4>Uo!>jsKSv64~K^;Gdg2wP{j3>aqc9fZD7f()4
zmt}4mX}8@!86O|tJF^PV@y+6}60~_E*9+N@eo2jYvq>_G3aQ^vK5WoKj)#|z&wP}T
zN?mU#{B-Hi0p-nB{Uo6mGe;hJvz2t^W$uz7KHZlO`zfY5NXpNt>_P0F0@+k@BmejY
zP!uN*IykB>PMq-Gj$uZduS|nmJ5&mJ$U5wu$NOI)M)@Wu4?@8obC0J~5Enl|B?FR;
z{OxfKye0a+&!%JAyDHhAJ%0|b>$kcYkX!m;psK+oDv^4z=?rjU{jG(Cg-z5G@WMRk
z3#S+CD_f)YuCjtOk(7{NVP(Zk|47obvCsUiQGQETX?}jbj!-$(-CB^7xj;kdAk&^l
z7w}i~5rJdT9~{b5EYjV&+$*6!OG8@HoT>z30WeAc=a_#{CRr(xAcfd*^TXS%90`fm
z%6Ze>&^*4`6JP)Ke;RuS2fW%Sx#V=%JIASo?d<?XUO=sn;MfWWDyOe*nHR%*p_m3j
znPoQS$zDxb8~^zVsPF}gdlC)_n$SdbPU+t30E#9kNlM5?imQpb58e#rUJgGt5p}#h
zya|8>@Fn#4+BA08Fa~?p%V;vz=S$t2r00L%mf~6nGRf&08cGm=Kr9+0wnjEk<~>xF
zqa#;Sk9@zsN0kkc9-N!WOBX-%bhgZqa1Z)e0k$S88G~5vg_F7OGf)=1yDZ4Qefywr
zLi;U-3>>ghlDLgFk~_bjN(>N51yAPMiuzq5g52_A{-N{qTSm#6)5_Bv7sx_(j0=L8
zf1AfURdxb$jJlhT#kWt|o#AArk8PbG_iMpD&=A<C0g3Cx1K|N3Jw3p%z*T&AUx(*b
z+gY9fB)Pmr*r^W?tvI`m52m<^$Cq0I%?ypgyid-ux)!eQT&mtwe8cu+sLHl|#WM8m
z4u;oL>G%_MsN;M)P<Uq_=KR)}m;JjL05xiepoG{J!oJ<H2f7A08&2khu^L&RKC%o+
zC!xKkQ}t-h5B>AxVy02P#?xcgr51BVGr$>?!UFy)E<%m2S?<}R@y91@F?eSyOzk(2
zY*AxP1)|Odj0Qrn&XB&%1GOU|TkS^oKEv1P7Z`EjR7e=%)JvINPzp9++kkDh63<)w
z1Vz>S>F3|xgVuR8_lN!Yy*Jq?g@)H-kUk=5giD^f(X2wPTMX0}L;}r`@XxPm<7yk4
zesOYh$EwR8Sl*98_%|A8aQnW&Y7ssH%7>!VwMfzKZlD&Jhzc^QLKmNWcR#NpC)1O)
z`O%xRlLR*z6^L6cOphzRF1H$K<v&yR;<Zu!i}uOSpFij1<kSHq8>t{dcVsMv2<xXo
zoWThV|2O-+#?j=oXueQN98grdr;rB#5HvZwkvw}lxr}Kt)OX0}nQ_TCiJ{}?jl{os
zqoc2{o@Ge+R!6r`wqw!1VC&s)iXKAX1eZ(-0$z{KE3XaRFg7+WwQvgvsI9re@z2c7
z0$PgikgYsow+|%-KHO#fb4PB%@Rd*C(MGQbIltxXYnT?NayXJePP(YTzDUO*n)i4&
z=hc;$qkR>r?Y2}SeZ(_q0Gaox0e)swi!q8?tMHO<68MJyy=>{|d{$PLk?^^K?%)?)
z1h1qZ{xpF7fm>@p47*e)sHmu%J9pyf<)&YM6~qv-#)JLA8%=~A>N@0OEhIbXWU8Jl
z*K;!05o!Kr7Zu!yUYAS7EK%XGZB~Hz%fpD>DEjcni~^VZxhU$r&a)OfMccls(}u|%
z?*N%g8#P{|$zDUI11YAu!N;S4W9u<E8Q%^(0dQ~{0t%^mlO|-f61}_|T8QYQ+XL%D
z=lOK3!}@Y3oXV@N?i@gVe&1ak=X5QuzlAqU*Co1(f|6!3#@vI6mITmG*1zA2R5n$O
zLuMFpq8OvofhMcljkNm%w#WGf%@Jq}?Kuma^L1zq*xO7L>~#?>id);k61y|6gNy4K
zRWl8u(z276%xRDmF=zzE<)DH?lb6Nw`ld-IU;jPh)Qt4pH^s%ut6i6XL~}{N?R|3b
zfvTzpL4*-k(R?e%=q&TNKVj>~@87Q;FT)S_&a~wGS=qc$^|*a9ru!ong_J9s0npe_
zdTvT{Z&yQ|_qJHh_~1}*uE%K`D*hDO;q23^;0qjy=XpK-&a99Ws9AB%+Qw$#ypdg(
zkR9>)y(1zAn4Y|R`O={eR~-+;t9Fi&P;BXyogR7<7U7rX&~Z6yzG!uTrW+PCub2B&
zJ8U~pZ!R8}P98K8)LFL9k7EhEwKP3_OFVwGwt*9|I|pFu{edG~f!dH_OzU=ZD=W9K
z#G=YKv2+J}c@g61ti*85eSCg`@IfX=$wal|nda1%eZW_b&cAz!v@$`+=|S(};5lx%
z0f1v}?v3+q4<Mk3ef3cRx(i)&J=Gw;7@TaOxG9n2nRtw|#Z49V?5iB5EVoN65*i3n
z7OH#)GlUV*>o4!=+=O^+e}TL<@CQ6+8xfW5Na4V#9seh?S$5+z(A$71p_0L;w-`xy
z6+FE3H3;zFLr6hZ|IU)IGRgf}!6FSambrMQ@8x-bw1G$eb?0>};_?iqY^lm8(se%p
z$3wVJ2#i!(OR%~Gdv}`ul!Qc~7$g0oJZm)AxVOVo^lkz>_4zu8TXUE*93p^W65UFc
z1n7{G8T3{ekVDjVu*|2EF+gd<&L(%jGN9CZ&cu&*-f$}i)##ASTtixN6vi(lDanWt
zH@lJl(9M|z>b_nw0#|<?tnw>}J`TCMNP$9eM=qtqX@-Rfv1CC{f;p6xJ!B)o_0t0q
zaauj&l5tu^#l;1)K15DT?<(S*<Q#+Z9#O>)f*!)wmaGs|AfT2+2IUL#uM6MX3y`Vq
z6zmosP{pJ%Agp=j4sBecL+tLMTYd)=9Cl&7gti6!8k&q(!8Kz)?!!Z$RQ;j;+n2Ig
z|E0Nk^16;vaP~q2Y@CL~j14xTp1pDu@Bv6xwKn+9sg*SdR-HJN+*qI>?F!NOyg>9)
z01~|5oz2(}v-4It;u^YDJtr2syA69eF{xPgK(Ba(VTFf30X+6RYEd|^Q+%3}86ela
zYlh=XzHrY+AM6x)?^jvD>6ON5A*-O6B~C54V33=rNPES>?xoA!-b%WbYZe3|iIqpr
zAm2o%3A1|e;uwECon4nACSN(`I&a|8g@q5q`@cI4Y3wRTZ-9DtD3*>tRWCACOXhS!
zTJ-1VdU;tY`Mgh7MHCJXG9z;5;gJ3fhJy2Az2&7<*vCP6+wkDkQe=SXc;E@V0P^Sx
z(|fC8_e$5g%mtsCNyC(<`^9_fb8~Y|X#N5$Xk)hXlRC00Mmf@Z7O^8jxkbu>h82=6
zg4PCTo<!t>ok0=r>g~uZVdTlh3j;h86`KL|7N<tsWU4ZY7gEpjM4WwkZ^;j2P5^c}
z!@+Vqtn`l{hDYZM)Iqd!tSmLr6q)6rOmvLIGb2N%;Cb(?xm1jpnp=}bT=9utZX{NZ
z7e2V^avtxTjUyFTK`%4A#51Owo|Kf7hJFUjk!wX}<-;bCo*5DSl1~I=ilosC>NeOR
zjr4FN8i!+NQw#C+WZ)zu(N6EP6tDI>_80ArnutRhFnM1cB%wWWnbdc^hbK&}=<4Wv
zl7h<ZQh}{2=H;M_PwzXrY)NtOuB|!@k0=MgP0k+#zBb=t;8v#6{Ph-j3L=-|I1r#5
z*>F&~B7?UtxyNeLbSb2ERD0%cEt0U~u4p-6i>CXjA589TR$eOw&;l&yq-T!N61EOX
zGw>{0!kTUqI;4{_#D^Fp`3W2h8G+;udKye*^GfFa!0E)BX@=I1pi>~wX5bOUo$VD8
z*1Cswn3n$@GEjOoUg`Y#WUU;?quC#BL0jfhYvXbEIp`hOx?E>lm#RxeZlLbI+QSIh
z>jErv$uVeQ;<V&bzm=Aja=X9(HKnrSx2uE4aA;(>9i55ma7qs%&uHZwm+gj5=<Dk3
z%2CNUo$2+y&$(7BpFVx+>w9s^c!LZtpdQIv_d8rn;Cc5VrLc|#Bv^x!6M7XQ=KGK#
zEA(E5i8@ZeHaV4hE2MIJ)Sl&G-T51;^7bW?Pxli*3IsdQ1Fv?R%Ja#S#2aBxKfj{F
z>6XpOifEq<zmnB`%Op3jia|TCcln>}t*C<QWisb`w3B;WwQ_o#0%{PPF7OpdGN47R
z0IS0Ow#jACv$Ea(bgT23GB$obY#RU#O~<!ye^&e`A=RVvFF6oX8UaR-;-GNNo=xxc
zXv+y{vKJxWPl>SSguBI^nepcy&_rTd9Ta)>1=x~3E8r$T6C%kcVq05SsCVRe_R={F
z&cS^Ix}e>cXxJ{R7s_y=fCB&;PRN|Gt=YZKes3d=E1#e7s<t@AEAOec{IV%@Kf!;o
zZeT6zmBGo!P4-Ha-Jz*7VGjh5L>-~sT-c@q68oV~xGTtzLr{Y)8zPbx-NVnA;a`{b
z>^ty3x*+n?0zL+GAQY=96bp%I$v5DUxnt!hDV32NVbf{}^y5mW3ceV~g#1*-2dZ}0
zuOA1jaPb(iw%n=wYQ9Myld4ecaJB>+az_t1=f)~Y8JXNQ`q!*!E`N$2j{s8<ha|G$
zps?%Z<-BrwWG<<!%=`QbiB7eREEr(Us49~jxKK3(oDIP{4B#ekbyOzR^5O5#JXn!C
ztZ3M69(sOdO9%m~2TUQ4>b&NBwpLeHCng-ExL87)0&B<`w<BHjk~9Qf#>bO55K6(1
zm9%w>bp%vvT|OgvL{1&*k=<S^@xjD5h}+)~;AKHk1W7S5AwjMQO?$xGf#6%e`6oHB
zmrX=ROMnd;$M*L2=7^*I<IiQm5A!Eb7Qx+ns@sU1p?iMi1cYF0>ji&A#*uv<o&~7A
zYZ!|Lx)!N-U(YZPH`-|-Z@?IU-j*A(J}wg^dVn=N&)bjk@ujDvq$DIHq@=tRLUMH*
z2OJ-`7FYd&wrjyM;YH`vz9$!hI)uIn(<3Pi=y|6do?n~JKOX(>3q;fZxDw{IS>eD7
zFm^O71}^%ev>1?q_l#x&6y`dakd01s4_FU^1trq}Qe<e;tgiuM6fcEnNdsSzOzdOF
zT`xV^%Lu1YGBJBx7;#hIy~!Tf0DdqhIG%jK)1KwMrxCkoTj%2KU9sp}rgprvtnA_x
zA2JkyzR}d3Dl71ur0V&ff+~jv5{l@AjMc6G@NYj|X^|WYRo*_>3~9Uytv;|B*uaR$
z$iq)B?+TFCyP0Y&OAFNJ!C*&hf9Z{8NV-|w0Ci&!00+B2ISi%*-0e>H%66Q+CKACT
zFR7p)dhg#s`KZCR<BbZ!d`ei8A_SU4HVSQ{4;o6NKfZsr5{)Y`D_rvZ{_Bq0LNeM}
z{151ei;L?Byaxwq_gj-Yr-&r1wzf9Fo=H!6o(K?xP|uT3Uhfeve!TS#kGzGA%}exL
z?B6P<L1aMQ2VDl5643s4Z9WDeOi+&?{y-}MIqnv!(yf@d4B|(&T(4vMlRfIMo;(P4
zm=YB+02AnrymP1=h0+D0_>Iy85zKp7tR2G6R&a%JVs2m}{exCU>8I|M+D)*=_UIaK
zMNbF&_?QVI(1Hs>x889E1YL+hP#T$=`pCUfd*7H9wzlS)9|<`-x1R|a3P5*}0;+hv
zG!%2TuxT;0$A`rC^z*Li1e&t;$sUi*WcmlDg{DU+g*G`6p_B_y9U|#~L^u-J%nq4z
zS1#hQV8>0R2U8oU#0CMf?31mEYaQfg_MD%upY<1|r_eM;fC!3=hK<hi+Oo5T_Lqua
zrb*aNrS<QmL0ddw%R`Dwl2{2Ww*ZoFA_^tiC?C}FrNW9~H&4?WH*WaU)B!Xey?sQa
zi%60g-$Ib>X7ruUgDvL+Mu$<gupa`DZMyJ0Wzuul<R5{n58&wIJAD<?Gz4UP;ey8c
z+5(uULeK!9dWJCdTJ1%?sf?nIvOBqRJ~g|BBe)SPK<~_3)Q7W{r$W)xM@(LWTkSvI
z1TZE*DUjoN=gxRW5Pi;Q4Y!gFVYV<hzy)vB^pTz!FPY@AS`SbXvFO^e$fbVU&D3z_
z1&|XbP5{mWClHYYdvfYfB(G8d8&Ws)<;*h(W+ATUAr~`M`AdQryS|78gEH}^myx~*
z$?lzTkgTe%7R7tZ@j$a#XQQ8PM*rWs)(`7j70kDn4pQagMKE^&7jft*$d;*k+##8G
z2o4U$3c`B*`V0`6x{cff5-8*6WMyy8>pG0l@|(xCItXA&hTUdAV(ZczgkWVxV%&Rx
z1Lw5g59im}rzQ~mW}cpshn}YT>``s=f7N{py1Ky;DShS5?8hT;B=I7cp*QX--M74U
zO<)C!zLL<zEC1^tDXf>`+kF1VF)^e}3M^5u`Xkv|Ib|YndBAeG(``=Wt#9%9`63lG
zcHzm6yT-GlD&SU=T7nI!Gxn|-vvjYT94ftm*7w<hLP*twfRzs2L=w@nD*JRk*6QYd
z+>CLH=qGRnoriGm3|P3A4vfQ-i^mS^g=7oF+!2BpVl|B(m8U3T151eiEKwLCvv^nG
zVcmc)bhX~?Alp4Ze9@Hc(GYmWJ9rh-b^n+ObfrcKUOyW#ame=X!wpfN{+vtSe3mxq
zrd3RX97d7YQ24_SKObLV!`YoV`JkSz&4>onZ`kXCSwm>|9!YaO2XNI}FO&1n6f}nb
zik@5q$g0%#z?;2fr0H+IexNAVbMe!sL6|9{AyqLq`I2Yy?p1vUgm$(<J^{3(A^zRl
z96v-9Kmtdx@V%799V!(3A`2i}K={)U0@gLxs>lPNmU(1&KApy^9U{IVs54EeKt6xs
z%s~DgI3t|Q%KsIPMvhn&naNI;VSau@BhoU0u(+RuV*j^UmDfP7ag=W`5!GzcH9U2w
ze`#W30%RyB>yj-m6UL})adR8L$?fghBt8CgZRLOK#h{`Trc4bsP)X8*RwvqN&@=xJ
z`p4wPAmFqo(kPE)UT8{rvgp_8Pn`;;s&l#j@vN$Kx|cjF_YJoj6VUmCGbkxBvAa1I
z8tGoJNcrvOQx2GnGjBU|(-ZKn>+U6e!5*?p6}hh7sAGA}dp23`WwVY@mcqhQFNM09
z)6(-d8zHgIuUVXWce69CRT>;o^O!Dk1Y#qcn)71@<z9dZWZO#z$KmH26eYYoQTY|^
z>1X=(AFQC^?{1jkA&G`IL8S(O3<o{22FN)psh;l9X9eo)@~2%g<B`NQv_YP^zxr{X
z#HmxK+{o(CSAumAaXX#mI`O3{C?hr&95yo!@@A2M2*_W@bl+?ZR-A^_rDnZh17-#g
z3uFawj}E@BW67n<@TlzFmB66Txrx|J#q_z#x_&)^%yFXcU6J6PXc_q=9LF%`8s7{;
zR2|(k&*i3E`Yf=nvG&nB4$~gs{F;Q4{Ej1~3RjnTTE)Bo(ck7WGc^+RJ9+5o)bDcI
z#&?&{-F~Xm&X%gDt|ZE4)INWkqSNdA8^UY1-MDwZqaX12WnNMTn4`@9Aije+!pQ6s
zw)*66+pxusAhzziqmWu^p<~JprpJB}Qx<LUIIS{v^7p{~NXcomWWDU=R9)Q2cez1M
z1M|b=ParCRW-J^ePOUyn?#T5zEX|x{B6`)5EXNa32^<~+D-T?9x4z2{z8{zdl5@<O
zi&8-rhES0_<rfg}y-&o{Ru$f2c~tHAu~A`g((ImFz5b8mOViyA`Eoz7_20uhWRlC$
zN|v;_z%y~8&LP)h*WI;@fA;Lz#EPG>3Lgihz3|I`G$e^g5R^`HO?MXZO<MFzUOpZ@
zJUrams(BTIx=^1B`pEGBq%AUY9k<>0U!XI|2|zz4DW)*AFPBbr)^dWr*4<c?if&gw
z$j};LhNv4u68rabyg7alzN|y@vQ5y>GWm*<hv?i{B@b-a_pS8Q)YQa8`B)C5cLh))
z_l_r2FBI*Z&Kq_+gkrBQS@NKV2a$Ao&(r6&no?Y;fHZH@9AFL2X$LUYUZE)w49YBv
z--IFSK~k+K)ljFw)Zs=cGdq}xjl_^gY{)QYhPaJntchsFkzjFT$O%n*Vr>qR09s5o
zv!IndT3m2GsMO*s)X4vMQxe@|#Ny<+&q7sDQabwdGxUjPI9InQq9HHaUV#fc--Nlz
zz<wP#eCHQ<Ep7naY0Z4Wwa~f^PB*jRiLH-<A{5FigbwW82&k_=v8Qk24D_Kz7hBFj
zbC?vKMjZPGDcOUDni;gT!0w2U@n(hO`P*o>3ZVqV4z$4Z5_VAU+}+*5M66?mtZsgS
z+F@Xx6F)XOzm}A1gKxi4XJgLSEVWlh#aQJ%w0brHOP{N%s!A>c6VFD`{_*P7&h=7G
z)=<#sgeaeZINhx<1s120UIlZEv0lSy<n=E3TDdoQs0pwoLie7OqySstI<^C^HX#7s
zzVhKVS8z_)ypV{luJ?WqrLeHS?266tAK;VR0V&+-W;RX)6D5?5UB&AO0V{(+H?IgR
z5Hpif1Jh;%T7E|GmRkg?EjU9{3x)*v>|!Y>;u;J_uFd*^#$|&<Q-5U6$N%p0pmdwl
zj<44Z!=3|53Ic#Z&%*xKR}rKG{B~)5_O&ASAGXZMI}Dotdw@gvzOQoOPQ^<ER_yuC
zQ?#T1SAo}rrcag(4k%ASKnB0=`2M%i$^<vVBNfVwK1nh#FnuAT5>Q}S-33<Opp9S9
z&E8kWDch^oRfSV^Pweg62RxJ2E%c}qx>j6U@>NXkD2_#-y6A3Jy2)7HL0oAvA{Sz|
zh(_xYvK!#o`wu1sFeHcpIRw<~!KVExZr~Fez*F6^G*KN+y-AiHbs)Xf&)@dwW!N*#
z5I$mVcl1QY=!-=b<lR5iCI)2mQu*);Ar-(xJ~h^_Ms;`01BjA>NEyck$R8lngHi-%
z5>mL&KLTuw9I1s$%mt(Q)PP+}R>4~l-bgT`X$UNXLE-g28C^C5{n>`2Qdb%ke1I6G
zLlY-Z(U0-<-}8VRDzG8U`)GSVSLm4Z1cC*cfHohRI0V3<EoIhx&^EZPUeNr@(bX}!
zQ>b+ba9vBtV+{dMa593=ZF+@nl~!6`)swN8F}L^Im@Fe=c?k0TPeQin7vMl?Gx@Y>
z__N-xPU#-c146Rt*9NEuJw%f7-v|3O{*}nl1=+=fv8&j+rtkX2Gn@f4Iq`3r=q83t
zC>{#UE~uQ4f|^fp!Qj6)vURY{iXhQ?=K=}8HA*Pf!a~sTdONjT6hwC6-{O<QHt#25
zv_JkT;qv`{e?QpnEG;Y&AiboenLT9;RA=gQk>;Qz`*0SXdqVwaA3TZNsfP$es9<JU
z8_-C7#xvd7!lVj((D`*pzy?aPe8JmyIo=0@JraZ(sxf5uJk;+F!Z~!YfF^9vVT||W
z*+N}G)rxELKxIL`LR(4U!Pj&d0ivKuf|aUEqHYH)15%o*(HDh~_Zd(S;GO_^2i}g}
z-ZWGa!C;}axRM23RDoBXhh<MXbu|x;Pdq%<o9G)#7=uwpaM*y!ZXMJ#P>{e+nZI^~
zn;Z3`!k4)FHE90!8S9UEwx4zxNO;Bt_N<EL*UyAXA`Wt{t+ODPz^;vKRFV<VHd`yJ
z@8vi3rMRR!1iKh~^V_JQd_i*28r!qx4Kdc39-M6)YR?O!3i`x<JVrVReeTf3FKknu
z@?hnHN}7$5yAcOL?R}v{pDd|@LEi0yNNj8{6^)xs0#1rW!BjC2APUgxuAZKr6gr@;
zr!zO9o7n*w(b3DxWR1Dy1NeIIgT|d+TUDTzgLZa!FBna5HtGT@_25rh#17U>r2oes
z;S_sST`(a;P|V!w?&VB_zgrYIhlx6@TQv;0mB(~Iq<Vln@b=2$H|O%<;sz>tb8|B>
z@rWh@^ez~R!S})Yr0_J|Y<iTsbs--BB_I#F+va`XJBh$Qm8$KCcA1BTP6!l;!iDpr
zX#m))je5v3Us@B?rhNr2K>J_{az`26G`;A~smX=<=5fR5)jVZVzLNji>yGzOU&7uM
zt^ZVkQ(gP_<G!+dRng$=4Rz^1i<bAneFsw;ifs=JR1|n3;^H_0vvJNAWfdqV$-oy-
zVpcfQ-7u#KE+<qP{+R;JI+6m-1aMSg&rA>SRao2vfnz(cRWQC5p16!?&I|6s8y$;5
z>oq86$?+T<U-0>lW=SqRB>`<KU^LM<SkE^|OTlFV2TnLWpsf2AoPN+lp62(vgxE80
zL4P-e25aM~<c`_OzkJx~%$yv-UcK)ICmpKCirK&n6Mq^55SfP+fNIqe7@J+%A9&3~
ztNV?57aKVJ(=ra5D4DqGt5~KoR87q&{slGAxHPEZn)KxzN&f_JUqzr4!SoFfRz9z~
za2I1u-X!C6n`)t&U^DGGdsft|k^GWTv>WI3{rLbNIPXgKhao7vyc3a+S|T3m<;8Y5
z2|ktc{hJ|;*%&2IbK}WU$-0|{Y-}8q!%R=W(HzIYycPK$H4Dj5N$&9Z{fSW|uKWJ)
zB%jFwY%&OZF3rv(@BBds_E+SkJVwp^G04Qq=Eck?y*$#^dWUiAH#<_+;~AH19F_{k
zFkL`FLa|G}+n>VNJ?cmU-C63OXoPPFE*Tis3y7k5(Q&lt0WHI-t>$&GK88{Tfq_Ip
z<aFowQtkNBMc=jIoBMg>>_)<figDL4QBsZ7mgYz22o1LSh%(fkc&!{TqW}{NHoau`
z>KP!b%w!i`^MG#w!=@8sq5q~RCW<MDxcL!AZrUEDykQ0U<EF~iRq4U6Z^cvv5l_Et
z4B|r%ai_{BS*iWdi8wG?YCHo_$A{Rq^rOz7RGey*v{Qi(Je{gX@aomgH*o<F;pli-
z>ptsIg;ijbtKL9C!}3U6deW~;f%C$vDtQ39yYVe}BF#TZ&?q|lkKDH`d3zcDCy>NL
zV8I$fjV`KuqIcrMYL~N}iA9prenh;_;*TOc@_IlCJe}|=Y01e?o5w*?N&jJb@}Pi%
zY{lFQK0%Cu_clx+XhXf`fo>1bI_a)MCyocTXv9}JL0`ecnoMN`_YWa4N<-iqWpF<-
zbpVcM#(wE)?X{8O26pq`{*;spQ=J_hSD|XS)s>aSv`=2Sas^D$)4|Y*$G2b2+PRXI
zX+{gI$1Z^kd93ovj+N-KD#(*lQ&TWP;rQl(lr&c#r^#*as3~X=|57D^1V?L(vO;Ra
z*Lm{yDdlyr?Z1-8jgSgKz<0Oq{bTKY=G?g(VA%4Re{m(H&PD<ys;-7M+1Cx@WoRwT
z5lH$#`R4UP0*TW)3Vd=7-)t7p+6%B1kON^34Dbin^Wt)v3=gjXhJDOi`gF+6Fi2iq
zp{xDQQd~U(&EQ|psl-Un*Gp66&Yrb*VRwMuY+3yfpITvmc=LGKu=$=z!p0ZbzCSno
zmZ@8d=3+?Co|DlUiljbcY&(AiE<;}mNGBL5sP2^@;6%vr`%i8$g^<@VxbodF)PFu5
zvt;3Nsp#2A|M{RKHl%m6O<ST)0ACF$c-6qN?r&G-C8JFUIg%K>NZGlK^@g1~wOOcy
zXT}P4zTK}s!HuBW3IJr7MgyP0po=s`D*y0Er3V3oXyraI83c3*{jBc}F128$qj|<m
zTcV5BSMs@m0H6r`poa$m!TDRU+76iqG37P3%32!-PR-z?7gT;AdB7f^`TBMe*BTtq
ztFu&K)F+@AnbF)=GKvp24}X|#J0*s%?V0yktlx5Xre?kJ%RkI@<On0Poi4J<nkPz*
zkA3ukALPv2;pp0?f9eHZVR0Rbm+>n8l~1pLU4NZAAq=(;9O<Zj4t5MJ?GDfdmL>jp
z%7->}yHu<`1uDCAVW5A7;cC7CMPada<y$B(R)ZjQDw=Yhu=x$Dus_4OBF^cs=Rrx$
z<41mgbyo&0L}v_SlH+b|-Qe3AI@hUqqk0`5z5zB%r~qCHr<3o(Y5Vx`q1s=ztaWP)
zFti2ZV4H=3O3x!;oJEZ>?I$Fl>U6TWH&E7~G#CUGzWb+!+|7_O<BynlN=}=0INcix
zK8|ceqKiro7L-;8BlmCuOd9~vfY<H+>!PdL5G~Q6R6gtmI=KMYZmtg&TbGxYgSY6j
z4BVsQHz@Sb#lcj7QwNz5FiXWtdhiKu#<?t>bF#-8U7>4fUi7IU%bg=?1M=PFvvDK`
z$s_%)J4vp^@&Owq;maiM&4QB}5*RJCbx=I?wBzg7*AZKyn648WW`%;%im(q;<->E?
zP82r&UVq?Qn|b}`{v><Ikk((wBQA}JGpaj=!F|msi9vL@c~nrl!v(K0mff)2CWh)y
z)9e9*5*Vh?CiWz7?p-U&{kUp_7gh>BboemoWom(Ko;Um7j8_?eTsGjcW{cnFeROHt
zC)6pLAd4UFvd-cwJm&~TD>qbV#r1+`w8v3}ht<}zRZyb#??cf^z88q%_TMJ(H@kj%
z<w4WsCIe_pXU$Y;BS-zF#)(dgQIbYtJ;YEuOk;v?)Ep*L6cdBqbV|prACVf-NjtKZ
zz6be6kAmH2Ejq~Xp;~Z4V%A|FB<kqEZeo4+GY49t4<9-Nbav%vAP67e69sP;)O2Qg
zn9elA)Nv7s>s#P@&o|Cfym1;MDSgmL0$NOu`psM|1im|Ams^JcfI*@s|2R!w-Wfr*
zSsr#X*cWu#5x0ZO#uzb3$f71jrYE!y8c|~+m~V~U|29B_!F}Y2yyrv<8sn7{>DXBj
z#^1WIIS=Ef8_Ws=$GHh|cYflezMGir({pXm<0vEMwJrgsg~3mgyV1?#z&dHpM?ovJ
zZ(rHD3%gc>S3x{$=`_eUHuSySTHe`FjzF`qviaxF9~0*&@9=@smER#LgVRpBa<u61
z{Of7kS3Y1R>wEn!SqP>K9K95d_BZ!po%j#DHW8JqMkgEW%IcF7K#;l&MJndeufTN>
zK+%`e1@DQ@Z7Td9@U2?|B7{d1=h$KM(_WSMP~G$^D#J>eU^S&m^`05b-)Zxzg}JH@
z&3jcT_M9-=SN-6^^L-`ovuQdk(je#NPB1`!g#y(pmyiDgE^>wBM=&G3JD?0GaH*{=
zypNTHRjL6s)MNx#Af6JQe|SnEK3jH)=g5)a@|%$YAgZ59)f0}@bPYGc4-RS~Fl7Q$
zY?)3lp8P!^`o9oSM&KY5Wd3hozv}4fUS*&p{h`%ZRgsZ#svHIO2tU|MD9D>hiKu1>
z9ytf@321!)4Q2E7xS!;WOiqewvpT1Qi5lArwbH5_JZASgLRNKY8~>mhe~J<ec<&BG
zi9{0Pk$ho6GVq;1JB2X61-o8N5o<QfEeL%nF6@pjDQkA~#_V9e-mN#$d^d026pITF
zr_TGp_;Re^UTA{;Jh`fk1Cm#fH*ZTAHy?op;TqUTrMUX}Km>y^96CbJt%iP=ej-UL
z=l7Ct+3*|QvZq^%4Lgto_25vN6)sy8nd$b{>rfOJHitWdgoyjGs|5E-B8d!BHvNC-
zh`tyi%qzCST-6D5>8}F2550ImfRz#xu$cv=hBw{`Y`nbt@z<UA*NU|0Z1YVDxSGyn
z|H`%J92m9_oT9e1P8x%da;<1UmU4y}ghFuk8Jx`ivTxNI1Kz_XYCUFG5<<SkJX-~?
z`mE~F!aw8!I*rRaYysf=_72K7F}nSbSr=rSgG{-Cdq^Q<{xemkKxNcw9cB&Hg^s2x
zA)61f&mksIVz1dp>F)J3MF&ysV$m%7EoKmmfB82!GVa${LlOD*q8V)Z)IYKfhB<QJ
zsQ@{BWfQ!9YjNf_GyXSznC#0oL$b|;^W^XThQe*s-2i34GtNOWUpd-=<`Tu|e;~eK
ze7oREExD%#$)67HYK1B{_7OC15Sw>ZF!xHRvEUM)6Mm<hzc|1F+G%X?Ajt63grw3I
zw;RZNRkR^Ne~mzb&Ip%fywrI7WkihRnco!6fdlTUhz!$^V3iJ>O1(cjujX2jRBvgX
zHmHp=sY&xyT=1*{j4yT6rE>I9&wkjO!w1m~8a9Z1r^7&IB;$2-I$;)L*j=m%n=?*D
z8A;Eo?XFAv$AY%z$+v?{5BA;PkIT2s)8Ur8{SrujR%#&wBF7DsvA=@R_6;TD;4Adr
zsd9He9{ivJ00nTZ`-;lO&qp&}<kDu|Skd4>b5&?eZZt<!SruQXQYX04A>uq}+?OA#
zApK|NDU0VMU~!4D?iq=LfW%+&e&7RWMc$q^rOJZb0o)2u#YHSzBqao8#fG1l@D-)M
zYLPk%cB9?JiCO)cKS*3Y$<RYFKJMWiYU4wg?c4Vm5WV0-TN@j*vvr@=B{|^ekttM=
z88<hU#^KLDXb#jug4W8@;pp9wJ9XsA>UMNH=Zja{;1_*4+z1Q|*{cNHFfc|>nWt&2
z0#`dCrwtUb(CyX-Z)f8}lFO7cIx$fjKMIWUx92^AWA%2hI{3(OlEskQJ5aLTzI_Ws
zymgHaWC|^XO#Q7M|HxSpts=9Um2c-yM7O|<5^(d)<I(Zcj6l1zO|3BR(3G6mJ&;Cl
zl))^6N*6H1CGDnt-5p4Ou2r$A!ozJ~1O)dk{F(!OWTIQ%V$uZoOsRQoL8QaU_)4T^
z{oUrH_vdyExpa>{Kxy4Ifx$s3X>bBp!|w8f`Q<+zfBzjAK0nfz!-g<^$Z3s48BT7c
zxY}|a$Tg%}$GIO0QlhkY!9HjcmQ5uY<$I0Zt~`C})Tirb<B)_N@R_~79M<1&x(0)q
zngAt`o8azluiJr{HyB3@xm^^CWJ@nAgKEc{bH_G#Idw50Pi>_jY5M1fN6Up^-(^^W
z`$IBh%h1a4gll)ec!|D#x(p_2+P1DWj9oZ{EKS?!1Y{%Prg1#_%pL;lz(OifP!+l-
zd0TYaS_2&*fHOJf-kw!Wkx*F}t%P570G$}&1@wS^k<GnsGAvENfMRE&)&64VirH{5
zxpu?e-hNlI1Jr&!AV3i&-{YYJ3$bdLfr0Q(B@NEV(C$%XfyD+eHTwgG@(n+q8_;cZ
z*~1Svg>w#`Oi=ccJFaBG#Ly|Ouiw7G7e0T!+?(o4ynazKQ8D%}<=gqi?=|;5P*wRl
zh~Y4#?7bi7y?Jroj(07d1xdz1PA-98_u=a2clA{;?8L>tFx)*~Nf2l8gVD|W%}#Wj
z3+Nz_%pnjMao@kiSH=*&j4yc?+!w(7dkZzy-Gyl)?;Tu#IK=mPTBzH0_p*&doYS)K
zh=s?fgt)j3AQhN!r36WLnUl7_bxi<ou8O?8{7H;~M@|q31Jz|^$L+CDonNY95A1;J
z+m}!u_yGnCG?doHXzg5VOvK|Sx4jf}Zx|rw5%d<leME1Og~b{gaQNr=fiT*YwF3Z-
zLiMPg$%SLDnOF}qgdT#7xS7rI=1A_4V-nHDYN8SAMeQau4kPca@=aDzm){(M9?1@K
z$Oru{1Y1yA>es0%Tf?0TP?4dr^7J%nTLtsWZ_haGX$f=h3$W>sL1Q(cx`3Hz$$mk4
z)`E3a>C>$3+0+x?wm@LrRp{^709kAzniIMR`(#!KBTKM^Fh%`gW8-~AlE(Ew*I2%M
z2^cIMSiOP8j@(s<L0Y(r?~NEV$p3fiS57DD%ytJ?Tmw3wq5ADx=xKp=RBXFIVB;q~
zvkwC^Iw<x+KxTmQ<|ebn<-N^!bZCCfKEM(wX+^`~Qb~37h<l|VIsRxJ3`@cIJ0-}Q
z9|)2n?uY&RwtR?-MTKZpw;rbZzHUxF*R)Z9{HG>`oQTtE?by1Q$uCWYX~e9h;%<*k
zBAC)eN!N(SRkuuv%(Ru5s+uKktLipL*12fFueWd$Li2kUXNW!LmzI_mpf6JLpc<1)
zUA~=&{D!d-kC(>0PA}-b^;*56(t?jps3C@yl5j}%KP*FI)bq3>`Qy+S5YwFkcV_e7
zWIZe@f83u)UdpJyf|S&u>Gs|8UbA28N}0VgF9udqPdRbXnZbh`3b6<1cyFy0`MC||
z9fbL6UOGS(zQ||KG-`vc1@43eD0`%eX2`HjKWf~a$^gp_%+%uL7lGwZVMh<ZFIRM2
zxEy^rd4q8ps9Y|zF4wW9gmvIo!8TyBc|!NG&ZEX`IzW{i02@(<FrYJ`SdoKFQxr=K
z?%qR)r85CWH*E|1`?5qVU|x8@l}s`xrPA^{kQ1D7;19T}f`Q+ZBWbs>9mMw^uM`r3
zhlOc(Y&tVx$T6$&AZnG0cHLXHhIrBVcH)j0s$$mdcvj7=Yz~NEK(>d53tT?^E~$7A
zfXVXcn*Rm4<YK1X>0VepSP%RNU^bUkeBX5Y1I6MUyjRT__+nem!y?(8nO@oKV*-LT
ziUWBSjGSjiV@Vqf!UNN`Ox|xgHE5#RC*i%I&3trjt~XsnU}6%M3e3wg82}iCe%{-u
z!u$+CDMUBMT@&He8-W!^z@^?w@z+F*lFsc^X8VA(e*ZO0#<(!<C`_UwC8p5b8E4pv
z$Ll8}&@%ce295lKF_Zkg{ypFzGE|M8RZGaPUHd4cD(l=&8894EJ`7bxLxA3UhYqnA
z4he)CCTvo`88~p?E54pl4%SRE(cbnH;&u+*ztb?#NB8JLWOEihO#+Bp2biIU-90i2
xM}FpL;T%NIz)k+|^M5Pw|Je#OLaJ=pVNewMW>?D;{u+Hc&C7af#TV@!{y%L{m{b4&

literal 0
HcmV?d00001

diff --git a/tests/test_rasterize_points.py b/tests/test_rasterize_points.py
index eef3b85e..dc59e9b2 100644
--- a/tests/test_rasterize_points.py
+++ b/tests/test_rasterize_points.py
@@ -404,7 +404,7 @@ class TestRasterizePoints(TestCaseMixin, unittest.TestCase):
         torch.manual_seed(231)
         N = 3
         max_P = 1000
-        image_size = 64
+        image_size = (64, 64)
         radius = 0.1
         bin_size = 16
         max_points_per_bin = 500
@@ -501,7 +501,7 @@ class TestRasterizePoints(TestCaseMixin, unittest.TestCase):
             device=device
         )
         # fmt: on
-        image_size = 16
+        image_size = (16, 16)
         radius = 0.2
         bin_size = 8
         max_points_per_bin = 5
diff --git a/tests/test_rasterize_rectangles.py b/tests/test_rasterize_rectangle_images.py
similarity index 51%
rename from tests/test_rasterize_rectangles.py
rename to tests/test_rasterize_rectangle_images.py
index 72bb6fc5..98424938 100644
--- a/tests/test_rasterize_rectangles.py
+++ b/tests/test_rasterize_rectangle_images.py
@@ -12,19 +12,33 @@ from pytorch3d.io import load_obj
 from pytorch3d.renderer.cameras import FoVPerspectiveCameras, look_at_view_transform
 from pytorch3d.renderer.lighting import PointLights
 from pytorch3d.renderer.materials import Materials
-from pytorch3d.renderer.mesh import TexturesUV
+from pytorch3d.renderer.mesh import (
+    BlendParams,
+    MeshRasterizer,
+    MeshRenderer,
+    RasterizationSettings,
+    SoftPhongShader,
+    TexturesUV,
+)
 from pytorch3d.renderer.mesh.rasterize_meshes import (
     rasterize_meshes,
     rasterize_meshes_python,
 )
-from pytorch3d.renderer.mesh.rasterizer import (
-    Fragments,
-    MeshRasterizer,
-    RasterizationSettings,
+from pytorch3d.renderer.mesh.rasterizer import Fragments
+from pytorch3d.renderer.points import (
+    AlphaCompositor,
+    PointsRasterizationSettings,
+    PointsRasterizer,
+    PointsRenderer,
 )
-from pytorch3d.renderer.mesh.renderer import MeshRenderer
-from pytorch3d.renderer.mesh.shader import BlendParams, SoftPhongShader
-from pytorch3d.structures import Meshes
+from pytorch3d.renderer.points.rasterize_points import (
+    rasterize_points,
+    rasterize_points_python,
+)
+from pytorch3d.renderer.points.rasterizer import PointFragments
+from pytorch3d.structures import Meshes, Pointclouds
+from pytorch3d.transforms.transform3d import Transform3d
+from pytorch3d.utils import torus
 
 
 DEBUG = False
@@ -44,9 +58,36 @@ verts0 = torch.tensor(
 )
 faces0 = torch.tensor([[1, 0, 2], [4, 3, 5]], dtype=torch.int64)
 
+# Points for a simple pointcloud. Get the vertices from a
+# torus and apply rotations such that the points are no longer
+# symmetrical in X/Y.
+torus_mesh = torus(r=0.25, R=1.0, sides=5, rings=2 * 5)
+t = (
+    Transform3d()
+    .rotate_axis_angle(angle=90, axis="Y")
+    .rotate_axis_angle(angle=45, axis="Z")
+    .scale(0.3)
+)
+torus_points = t.transform_points(torus_mesh.verts_padded()).squeeze()
 
-class TestRasterizeRectanglesErrors(TestCaseMixin, unittest.TestCase):
-    def test_image_size_arg(self):
+
+def _save_debug_image(idx, image_size, bin_size, blur):
+    """
+    Save a mask image from the rasterization output for debugging.
+    """
+    H, W = image_size
+    # Save out the last image for debugging
+    rgb = (idx[-1, ..., :3].cpu() > -1).squeeze()
+    suffix = "square" if H == W else "non_square"
+    filename = "%s_bin_size_%s_blur_%.3f_%dx%d.png"
+    filename = filename % (suffix, str(bin_size), blur, H, W)
+    if DEBUG:
+        filename = "DEBUG_%s" % filename
+        Image.fromarray((rgb.numpy() * 255).astype(np.uint8)).save(DATA_DIR / filename)
+
+
+class TestRasterizeRectangleImagesErrors(TestCaseMixin, unittest.TestCase):
+    def test_mesh_image_size_arg(self):
         meshes = Meshes(verts=[verts0], faces=[faces0])
 
         with self.assertRaises(ValueError) as cm:
@@ -76,8 +117,38 @@ class TestRasterizeRectanglesErrors(TestCaseMixin, unittest.TestCase):
             )
             self.assertTrue("sizes must be integers" in cm.msg)
 
+    def test_points_image_size_arg(self):
+        points = Pointclouds([verts0])
 
-class TestRasterizeRectangles(TestCaseMixin, unittest.TestCase):
+        with self.assertRaises(ValueError) as cm:
+            rasterize_points(
+                points,
+                (100, 200, 3),
+                0.0001,
+                points_per_pixel=1,
+            )
+            self.assertTrue("tuple/list of (H, W)" in cm.msg)
+
+        with self.assertRaises(ValueError) as cm:
+            rasterize_points(
+                points,
+                (0, 10),
+                0.0001,
+                points_per_pixel=1,
+            )
+            self.assertTrue("sizes must be positive" in cm.msg)
+
+        with self.assertRaises(ValueError) as cm:
+            rasterize_points(
+                points,
+                (100.5, 120.5),
+                0.0001,
+                points_per_pixel=1,
+            )
+            self.assertTrue("sizes must be integers" in cm.msg)
+
+
+class TestRasterizeRectangleImagesMeshes(TestCaseMixin, unittest.TestCase):
     @staticmethod
     def _clone_mesh(verts0, faces0, device, batch_size):
         """
@@ -164,7 +235,7 @@ class TestRasterizeRectangles(TestCaseMixin, unittest.TestCase):
             meshes_sq, image_size=(S, S), bin_size=0, blur=blur
         )
         # Save debug image
-        self._save_debug_image(square_fragments, (S, S), 0, blur)
+        _save_debug_image(square_fragments.pix_to_face, (S, S), 0, blur)
 
         # Extract the values in the square image which are non zero.
         square_mask = square_fragments.pix_to_face > -1
@@ -284,8 +355,8 @@ class TestRasterizeRectangles(TestCaseMixin, unittest.TestCase):
             )
 
             # Save out debug images if needed
-            self._save_debug_image(fragments_naive, image_size, 0, blur)
-            self._save_debug_image(fragments_binned, image_size, None, blur)
+            _save_debug_image(fragments_naive.pix_to_face, image_size, 0, blur)
+            _save_debug_image(fragments_binned.pix_to_face, image_size, None, blur)
 
             # Check naive and binned fragments give the same outputs
             self._check_fragments(fragments_naive, fragments_binned)
@@ -354,8 +425,8 @@ class TestRasterizeRectangles(TestCaseMixin, unittest.TestCase):
             )
 
             # Save debug images if DEBUG is set to true at the top of the file.
-            self._save_debug_image(fragments_naive, image_size, 0, blur)
-            self._save_debug_image(fragments_python, image_size, "python", blur)
+            _save_debug_image(fragments_naive.pix_to_face, image_size, 0, blur)
+            _save_debug_image(fragments_python.pix_to_face, image_size, "python", blur)
 
             # List of non square outputs to compare with the square output
             nonsq_fragment_gradtensor_list = [
@@ -437,3 +508,293 @@ class TestRasterizeRectangles(TestCaseMixin, unittest.TestCase):
             # NOTE some pixels can be flaky
             cond1 = torch.allclose(rgb, image_ref, atol=0.05)
             self.assertTrue(cond1)
+
+
+class TestRasterizeRectangleImagesPointclouds(TestCaseMixin, unittest.TestCase):
+    @staticmethod
+    def _clone_pointcloud(verts0, device, batch_size):
+        """
+        Detach and clone `verts0` into a fresh tensor with requires_grad=True,
+        wrap it in a Pointclouds moved to `device` and extended by `batch_size`,
+        and return (verts, pointclouds) so each test gets its own grad tensor.
+        """
+        verts = verts0.detach().clone()
+        verts.requires_grad = True
+        pointclouds = Pointclouds(points=[verts])
+        pointclouds = pointclouds.to(device).extend(batch_size)
+        return verts, pointclouds
+
+    def _rasterize(self, meshes, image_size, bin_size, blur):
+        """
+        Call rasterize_points with points_per_pixel=1 and return the result as
+        PointFragments. NOTE: `meshes` is the Pointclouds object in this class.
+        """
+        idxs, zbuf, dists = rasterize_points(
+            meshes,
+            image_size,
+            blur,
+            points_per_pixel=1,
+            bin_size=bin_size,
+        )
+        return PointFragments(
+            idx=idxs,
+            zbuf=zbuf,
+            dists=dists,
+        )
+
+    def _check_fragments(self, frag_1, frag_2):
+        """
+        Assert (via assertClose) that the idx, dists and zbuf tensors
+        of the PointFragments frag_1 and frag_2 match.
+        """
+        self.assertClose(frag_1.idx, frag_2.idx)
+        self.assertClose(frag_1.dists, frag_2.dists)
+        self.assertClose(frag_1.zbuf, frag_2.zbuf)
+
+    def _compare_square_with_nonsq(
+        self,
+        image_size,
+        blur,
+        device,
+        points,
+        nonsq_fragment_gradtensor_list,
+        batch_size=1,
+    ):
+        """
+        Rasterize a square (S, S) image with S = min(H, W) using bin_size=0.
+        Then compare this with the same square region in the non square image.
+        The input points are contained within the [-1, 1] range of the image
+        so all the relevant pixels will be within the square region.
+
+        `nonsq_fragment_gradtensor_list` is a list of (fragments, points grad
+        tensor, name) tuples from rasterizing non square images.
+        """
+        # Rasterize the square version of the image
+        H, W = image_size
+        S = min(H, W)
+        points_square, pointclouds_sq = self._clone_pointcloud(
+            points, device, batch_size
+        )
+        square_fragments = self._rasterize(
+            pointclouds_sq, image_size=(S, S), bin_size=0, blur=blur
+        )
+        # Save debug image
+        _save_debug_image(square_fragments.idx, (S, S), 0, blur)
+
+        # Extract the values at the square image pixels which have idx > -1.
+        square_mask = square_fragments.idx > -1
+        square_dists = square_fragments.dists[square_mask]
+        square_zbuf = square_fragments.zbuf[square_mask]
+
+        # Retain gradients on the output of fragments to check
+        # intermediate values with the non square outputs.
+        square_fragments.dists.retain_grad()
+        square_fragments.zbuf.retain_grad()
+
+        # Calculate gradient for the square image
+        torch.manual_seed(231)
+        grad_zbuf = torch.randn_like(square_zbuf)
+        grad_dist = torch.randn_like(square_dists)
+        loss0 = (grad_dist * square_dists).sum() + (grad_zbuf * square_zbuf).sum()
+        loss0.backward()
+
+        # Now compare against the non square outputs provided
+        # in the nonsq_fragment_gradtensor_list list
+        for fragments, grad_tensor, _name in nonsq_fragment_gradtensor_list:
+            # Check that the same number of pixels have idx > -1
+            # in both the square and non square images.
+            non_square_mask = fragments.idx > -1
+            self.assertEqual(non_square_mask.sum().item(), square_mask.sum().item())
+
+            # Check dists and zbuf match the square image
+            non_square_dists = fragments.dists[non_square_mask]
+            non_square_zbuf = fragments.zbuf[non_square_mask]
+            self.assertClose(square_dists, non_square_dists)
+            self.assertClose(square_zbuf, non_square_zbuf)
+
+            # Retain gradients to compare values with outputs from
+            # square image
+            fragments.dists.retain_grad()
+            fragments.zbuf.retain_grad()
+            loss1 = (grad_dist * non_square_dists).sum() + (
+                grad_zbuf * non_square_zbuf
+            ).sum()
+            loss1.sum().backward()
+
+            # Get the intermediate gradients at the pixels with idx > -1
+            # and compare with the values from the square image
+            non_square_grad_dists = fragments.dists.grad[non_square_mask]
+            non_square_grad_zbuf = fragments.zbuf.grad[non_square_mask]
+
+            self.assertClose(
+                non_square_grad_dists,
+                square_fragments.dists.grad[square_mask],
+            )
+            self.assertClose(
+                non_square_grad_zbuf,
+                square_fragments.zbuf.grad[square_mask],
+            )
+
+            # Finally check the gradients of the input points for
+            # the square and non square case
+            self.assertClose(points_square.grad, grad_tensor.grad, rtol=2e-4)
+
+    def test_gpu(self):
+        """
+        Test on cuda:0 that naive (bin_size=0) and coarse-to-fine
+        (bin_size=None) rasterization of non square images agree, and that
+        their dists and zbuf (and gradients) match a square image over the
+        square region which is present in both images.
+        """
+        # Test both cases: (W > H), (H > W)
+        image_sizes = [(64, 128), (128, 64), (128, 256), (256, 128)]
+
+        devices = ["cuda:0"]
+        blurs = [5e-2]
+        batch_sizes = [1, 4]
+        test_cases = product(image_sizes, blurs, devices, batch_sizes)
+
+        for image_size, blur, device, batch_size in test_cases:
+            # Initialize the points grad tensors and the pointclouds objects
+            verts_nonsq_naive, pointcloud_nonsq_naive = self._clone_pointcloud(
+                torus_points, device, batch_size
+            )
+            verts_nonsq_binned, pointcloud_nonsq_binned = self._clone_pointcloud(
+                torus_points, device, batch_size
+            )
+
+            # Get the outputs for both naive and coarse to fine rasterization
+            fragments_naive = self._rasterize(
+                pointcloud_nonsq_naive,
+                image_size,
+                blur=blur,
+                bin_size=0,
+            )
+            fragments_binned = self._rasterize(
+                pointcloud_nonsq_binned,
+                image_size,
+                blur=blur,
+                bin_size=None,
+            )
+
+            # Save out debug images if needed
+            _save_debug_image(fragments_naive.idx, image_size, 0, blur)
+            _save_debug_image(fragments_binned.idx, image_size, None, blur)
+
+            # Check naive and binned fragments give the same outputs
+            self._check_fragments(fragments_naive, fragments_binned)
+
+            # Here we want to compare the square image with the naive and the
+            # coarse to fine methods outputs
+            nonsq_fragment_gradtensor_list = [
+                (fragments_naive, verts_nonsq_naive, "naive"),
+                (fragments_binned, verts_nonsq_binned, "coarse-to-fine"),
+            ]
+
+            self._compare_square_with_nonsq(
+                image_size,
+                blur,
+                device,
+                torus_points,
+                nonsq_fragment_gradtensor_list,
+                batch_size,
+            )
+
+    def test_cpu(self):
+        """
+        Test that rasterizing non square images on the CPU gives the
+        same result as square images, i.e. the dists and zbuf match
+        (with the same number of covered pixels) over the square
+        region which is present in both images.
+
+        In this test we compare between the naive C++ implementation
+        and the naive python implementation as the Coarse/Fine
+        method is not fully implemented in C++.
+        """
+        # Test both when (W > H) and (H > W).
+        # Using smaller image sizes here as the Python rasterizer is really slow.
+        image_sizes = [(32, 64), (64, 32)]
+        devices = ["cpu"]
+        blurs = [5e-2]
+        batch_sizes = [1]
+        test_cases = product(image_sizes, blurs, devices, batch_sizes)
+
+        for image_size, blur, device, batch_size in test_cases:
+            # Initialize the points grad tensors and the pointclouds objects
+            verts_nonsq_naive, pointcloud_nonsq_naive = self._clone_pointcloud(
+                torus_points, device, batch_size
+            )
+            verts_nonsq_python, pointcloud_nonsq_python = self._clone_pointcloud(
+                torus_points, device, batch_size
+            )
+
+            # Compare Naive CPU with Python as Coarse/Fine rasterization
+            # is not implemented for CPU
+            fragments_naive = self._rasterize(
+                pointcloud_nonsq_naive, image_size, bin_size=0, blur=blur
+            )
+            idxs, zbuf, pix_dists = rasterize_points_python(
+                pointcloud_nonsq_python,
+                image_size,
+                blur,
+                points_per_pixel=1,
+            )
+            fragments_python = PointFragments(
+                idx=idxs,
+                zbuf=zbuf,
+                dists=pix_dists,
+            )
+
+            # Save debug images if DEBUG is set to true at the top of the file.
+            _save_debug_image(fragments_naive.idx, image_size, 0, blur)
+            _save_debug_image(fragments_python.idx, image_size, "python", blur)
+
+            # List of non square outputs to compare with the square output
+            nonsq_fragment_gradtensor_list = [
+                (fragments_naive, verts_nonsq_naive, "naive"),
+                (fragments_python, verts_nonsq_python, "python"),
+            ]
+            self._compare_square_with_nonsq(
+                image_size,
+                blur,
+                device,
+                torus_points,
+                nonsq_fragment_gradtensor_list,
+                batch_size,
+            )
+
+    def test_render_pointcloud(self):
+        """
+        Test a textured pointcloud is rendered correctly in a non square image.
+        """
+        device = torch.device("cuda:0")
+        pointclouds = Pointclouds(
+            points=[torus_points * 2.0],
+            features=torch.ones_like(torus_points[None, ...]),
+        ).to(device)
+        R, T = look_at_view_transform(2.7, 0.0, 0.0)
+        cameras = FoVPerspectiveCameras(device=device, R=R, T=T)
+        raster_settings = PointsRasterizationSettings(
+            image_size=(512, 1024), radius=5e-2, points_per_pixel=1
+        )
+        rasterizer = PointsRasterizer(cameras=cameras, raster_settings=raster_settings)
+        compositor = AlphaCompositor()
+        renderer = PointsRenderer(rasterizer=rasterizer, compositor=compositor)
+
+        # Load reference image
+        image_ref = load_rgb_image("test_pointcloud_rectangle_image.png", DATA_DIR)
+
+        for bin_size in [0, None]:
+            # Check both naive and coarse to fine match the reference image.
+            renderer.rasterizer.raster_settings.bin_size = bin_size
+            images = renderer(pointclouds)
+            rgb = images[0, ..., :3].squeeze().cpu()
+
+            if DEBUG:
+                Image.fromarray((rgb.numpy() * 255).astype(np.uint8)).save(
+                    DATA_DIR / "DEBUG_pointcloud_rectangle_image.png"
+                )
+
+            # NOTE some pixels can be flaky
+            cond1 = torch.allclose(rgb, image_ref, atol=0.05)
+            self.assertTrue(cond1)