# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import math
import warnings
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from pytorch3d.common.datatypes import Device
from pytorch3d.transforms import Rotate, Transform3d, Translate

from .utils import convert_to_tensors_and_broadcast, TensorProperties


# Default values for rotation and translation matrices.
_R = torch.eye(3)[None]  # (1, 3, 3)
_T = torch.zeros(1, 3)  # (1, 3)

# An input which is a float per batch element
_BatchFloatType = Union[float, Sequence[float], torch.Tensor]

# one or two floats per batch element
_FocalLengthType = Union[
    float, Sequence[Tuple[float]], Sequence[Tuple[float, float]], torch.Tensor
]

class CamerasBase(TensorProperties):
    """
    `CamerasBase` implements a base class for all cameras.

    For cameras, there are four different coordinate systems (or spaces)
    - World coordinate system: This is the system in which the object lives - the world.
    - Camera view coordinate system: This is the system that has its origin on
        the camera and the Z-axis perpendicular to the image plane.
        In PyTorch3D, we assume that +X points left, +Y points up and
        +Z points out from the image plane.
        The transformation from world --> view happens after applying a rotation (R)
        and translation (T).
    - NDC coordinate system: This is the normalized coordinate system that confines
        the rendered part of the object or scene in a volume, also known as the
        view volume. For square images, given the PyTorch3D convention, (+1, +1, znear)
        is the top left near corner, and (-1, -1, zfar) is the bottom right far
        corner of the volume.
        The transformation from view --> NDC happens after applying the camera
        projection matrix (P) if defined in NDC space.
        For non-square images, we scale the points such that the smallest side
        has range [-1, 1] and the largest side has range [-u, u], with u > 1.
    - Screen coordinate system: This is another representation of the view volume with
        the XY coordinates defined in image space instead of a normalized space.

    An illustration of the coordinate systems can be found in pytorch3d/docs/notes/cameras.md.

    CamerasBase defines methods that are common to all camera models:
        - `get_camera_center` that returns the optical center of the camera in
            world coordinates
        - `get_world_to_view_transform` which returns a 3D transform from
            world coordinates to the camera view coordinates (R, T)
        - `get_full_projection_transform` which composes the projection
            transform (P) with the world-to-view transform (R, T)
        - `transform_points` which takes a set of input points in world coordinates and
            projects to the space the camera is defined in (NDC or screen)
        - `get_ndc_camera_transform` which defines the transform from screen/NDC to
            PyTorch3D's NDC space
        - `transform_points_ndc` which takes a set of points in world coordinates and
            projects them to PyTorch3D's NDC space
        - `transform_points_screen` which takes a set of points in world coordinates and
            projects them to screen space

    For each new camera, one should implement the `get_projection_transform`
    routine that returns the mapping from camera view coordinates to camera
    coordinates (NDC or screen).

    Another useful function that is specific to each camera model is
    `unproject_points` which sends points from camera coordinates (NDC or screen)
    back to camera view or world coordinates depending on the `world_coordinates`
    boolean argument of the function.
    """

    # Used in __getitem__ to index the relevant fields
    # When creating a new camera, this should be set in the __init__
    _FIELDS: Tuple[str, ...] = ()

    # Names of fields which are a constant property of the whole batch, rather
    # than themselves a batch of data.
    # When joining objects into a batch, they will have to agree.
    _SHARED_FIELDS: Tuple[str, ...] = ()

    def get_projection_transform(self, **kwargs):
        """
        Calculate the projective transformation matrix.

        Args:
            **kwargs: parameters for the projection can be passed in as keyword
                arguments to override the default values set in `__init__`.

        Return:
            a `Transform3d` object which represents a batch of projection
            matrices of shape (N, 4, 4)
        """
        raise NotImplementedError()

    def unproject_points(self, xy_depth: torch.Tensor, **kwargs):
        """
        Transform input points from camera coordinates (NDC or screen)
        to the world / camera coordinates.

        Each of the input points `xy_depth` of shape (..., 3) is
        a concatenation of the x, y location and its depth.

        For instance, for an input 2D tensor of shape `(num_points, 3)`
        `xy_depth` takes the following form:
            `xy_depth[i] = [x[i], y[i], depth[i]]`,
        for each point at an index `i`.

        The following example demonstrates the relationship between
        `transform_points` and `unproject_points`:

        .. code-block:: python

            cameras = # camera object derived from CamerasBase
            xyz = # 3D points of shape (batch_size, num_points, 3)
            # transform xyz to the camera view coordinates
            xyz_cam = cameras.get_world_to_view_transform().transform_points(xyz)
            # extract the depth of each point as the 3rd coord of xyz_cam
            depth = xyz_cam[:, :, 2:]
            # project the points xyz to the camera
            xy = cameras.transform_points(xyz)[:, :, :2]
            # append depth to xy
            xy_depth = torch.cat((xy, depth), dim=2)
            # unproject to the world coordinates
            xyz_unproj_world = cameras.unproject_points(xy_depth, world_coordinates=True)
            print(torch.allclose(xyz, xyz_unproj_world))  # True
            # unproject to the camera coordinates
            xyz_unproj = cameras.unproject_points(xy_depth, world_coordinates=False)
            print(torch.allclose(xyz_cam, xyz_unproj))  # True

        Args:
            xy_depth: torch tensor of shape (..., 3).
            world_coordinates: If `True`, unprojects the points back to world
                coordinates using the camera extrinsics `R` and `T`.
                `False` ignores `R` and `T` and unprojects to
                the camera view coordinates.
            from_ndc: If `False` (default), assumes the xy part of the input is in
                NDC space if self.in_ndc(), otherwise in screen space. If
                `True`, assumes xy is in NDC space even if the camera
                is defined in screen space.

        Returns:
            new_points: unprojected points with the same shape as `xy_depth`.
        """
        raise NotImplementedError()

    def get_camera_center(self, **kwargs) -> torch.Tensor:
        """
        Return the 3D location of the camera optical center
        in the world coordinates.

        Args:
            **kwargs: parameters for the camera extrinsics can be passed in
                as keyword arguments to override the default values
                set in __init__.

        Setting R or T here will update the values set in init as these
        values may be needed later on in the rendering pipeline e.g. for
        lighting calculations.

        Returns:
            C: a batch of 3D locations of shape (N, 3) denoting
            the locations of the center of each camera in the batch.
        """
        w2v_trans = self.get_world_to_view_transform(**kwargs)
        P = w2v_trans.inverse().get_matrix()
        # the camera center is the translation component (the first 3 elements
        # of the last row) of the inverted world-to-view
        # transform (4x4 RT matrix)
        C = P[:, 3, :3]
        return C

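    # --- Editorial note (illustrative sketch, not part of the upstream source) ---
    # Because PyTorch3D uses row vectors (X_cam = X_world @ R + T), the camera
    # center can equivalently be computed in closed form as C = -T @ R^T.
    # A minimal check, assuming `cam` is any camera instance with valid R and T:
    #
    #     C_from_transform = cam.get_camera_center()          # (N, 3)
    #     C_closed_form = -torch.bmm(
    #         cam.T[:, None, :], cam.R.transpose(1, 2)
    #     )[:, 0]                                              # (N, 3)
    #     assert torch.allclose(C_from_transform, C_closed_form, atol=1e-5)
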
    def get_world_to_view_transform(self, **kwargs) -> Transform3d:
        """
        Return the world-to-view transform.

        Args:
            **kwargs: parameters for the camera extrinsics can be passed in
                as keyword arguments to override the default values
                set in __init__.

        Setting R and T here will update the values set in init as these
        values may be needed later on in the rendering pipeline e.g. for
        lighting calculations.

        Returns:
            A Transform3d object which represents a batch of transforms
            of shape (N, 4, 4)
        """
        R: torch.Tensor = kwargs.get("R", self.R)
        T: torch.Tensor = kwargs.get("T", self.T)
        self.R = R  # pyre-ignore[16]
        self.T = T  # pyre-ignore[16]
        world_to_view_transform = get_world_to_view_transform(R=R, T=T)
        return world_to_view_transform

    def get_full_projection_transform(self, **kwargs) -> Transform3d:
        """
        Return the full world-to-camera transform composing the
        world-to-view and view-to-camera transforms.
        If camera is defined in NDC space, the projected points are in NDC space.
        If camera is defined in screen space, the projected points are in screen space.

        Args:
            **kwargs: parameters for the projection transforms can be passed in
                as keyword arguments to override the default values
                set in __init__.

        Setting R and T here will update the values set in init as these
        values may be needed later on in the rendering pipeline e.g. for
        lighting calculations.

        Returns:
            a Transform3d object which represents a batch of transforms
            of shape (N, 4, 4)
        """
        self.R: torch.Tensor = kwargs.get("R", self.R)  # pyre-ignore[16]
        self.T: torch.Tensor = kwargs.get("T", self.T)  # pyre-ignore[16]
        world_to_view_transform = self.get_world_to_view_transform(R=self.R, T=self.T)
        view_to_proj_transform = self.get_projection_transform(**kwargs)
        return world_to_view_transform.compose(view_to_proj_transform)

    def transform_points(
        self, points, eps: Optional[float] = None, **kwargs
    ) -> torch.Tensor:
        """
        Transform input points from world to camera space.
        If camera is defined in NDC space, the projected points are in NDC space.
        If camera is defined in screen space, the projected points are in screen space.

        For `CamerasBase.transform_points`, setting `eps > 0`
        stabilizes gradients since it leads to avoiding division
        by excessively low numbers for points close to the camera plane.

        Args:
            points: torch tensor of shape (..., 3).
            eps: If eps!=None, the argument is used to clamp the
                divisor in the homogeneous normalization of the points
                transformed to the ndc space. Please see
                `transforms.Transform3d.transform_points` for details.

        Returns:
            new_points: transformed points with the same shape as the input.
        """
        world_to_proj_transform = self.get_full_projection_transform(**kwargs)
        return world_to_proj_transform.transform_points(points, eps=eps)

    def get_ndc_camera_transform(self, **kwargs) -> Transform3d:
        """
        Returns the transform from camera projection space (screen or NDC) to NDC space.
        For cameras that can be specified in screen space, this transform
        allows points to be converted from screen to NDC space.
        The default transform scales the points from [0, W]x[0, H]
        to [-1, 1]x[-u, u] or [-u, u]x[-1, 1] where u > 1 is the aspect ratio of the image.
        This function should be modified per camera definition if need be,
        e.g. for Perspective/Orthographic cameras we provide a custom implementation.
        This transform assumes PyTorch3D coordinate system conventions for
        both the NDC space and the input points.

        This transform interfaces with the PyTorch3D renderer which assumes
        input points to the renderer to be in NDC space.
        """
        if self.in_ndc():
            return Transform3d(device=self.device, dtype=torch.float32)
        else:
            # For custom cameras which can be defined in screen space,
            # users might have to implement the screen to NDC transform based
            # on the definition of the camera parameters.
            # See PerspectiveCameras/OrthographicCameras for an example.
            # We don't flip xy because we assume that world points are in
            # PyTorch3D coordinates, and thus conversion from screen to ndc
            # is a mere scaling from image to [-1, 1] scale.
            image_size = kwargs.get("image_size", self.get_image_size())
            return get_screen_to_ndc_transform(
                self, with_xyflip=False, image_size=image_size
            )

    def transform_points_ndc(
        self, points, eps: Optional[float] = None, **kwargs
    ) -> torch.Tensor:
        """
        Transforms points from PyTorch3D world/camera space to NDC space.
        Input points follow the PyTorch3D coordinate system conventions: +X left, +Y up.
        Output points are in NDC space: +X left, +Y up, origin at image center.

        Args:
            points: torch tensor of shape (..., 3).
            eps: If eps!=None, the argument is used to clamp the
                divisor in the homogeneous normalization of the points
                transformed to the ndc space. Please see
                `transforms.Transform3d.transform_points` for details.

                For `CamerasBase.transform_points`, setting `eps > 0`
                stabilizes gradients since it leads to avoiding division
                by excessively low numbers for points close to the
                camera plane.

        Returns:
            new_points: transformed points with the same shape as the input.
        """
        world_to_ndc_transform = self.get_full_projection_transform(**kwargs)
        if not self.in_ndc():
            to_ndc_transform = self.get_ndc_camera_transform(**kwargs)
            world_to_ndc_transform = world_to_ndc_transform.compose(to_ndc_transform)

        return world_to_ndc_transform.transform_points(points, eps=eps)

    def transform_points_screen(
        self, points, eps: Optional[float] = None, with_xyflip: bool = True, **kwargs
    ) -> torch.Tensor:
        """
        Transforms points from PyTorch3D world/camera space to screen space.
        Input points follow the PyTorch3D coordinate system conventions: +X left, +Y up.
        Output points are in screen space: +X right, +Y down, origin at top left corner.

        Args:
            points: torch tensor of shape (..., 3).
            eps: If eps!=None, the argument is used to clamp the
                divisor in the homogeneous normalization of the points
                transformed to the ndc space. Please see
                `transforms.Transform3d.transform_points` for details.

                For `CamerasBase.transform_points`, setting `eps > 0`
                stabilizes gradients since it leads to avoiding division
                by excessively low numbers for points close to the
                camera plane.
            with_xyflip: If True, flip x and y directions. In world/camera/ndc coords,
                +x points to the left and +y up. If with_xyflip is true, in screen
                coords +x points right, and +y down, following the usual RGB image
                convention. Warning: do not set to False unless you know what you're
                doing!

        Returns:
            new_points: transformed points with the same shape as the input.
        """
        points_ndc = self.transform_points_ndc(points, eps=eps, **kwargs)
        image_size = kwargs.get("image_size", self.get_image_size())
        return get_ndc_to_screen_transform(
            self, with_xyflip=with_xyflip, image_size=image_size
        ).transform_points(points_ndc, eps=eps)

    def clone(self):
        """
        Returns a copy of `self`.
        """
        cam_type = type(self)
        other = cam_type(device=self.device)
        return super().clone(other)

    def is_perspective(self):
        raise NotImplementedError()

    def in_ndc(self):
        """
        Specifies whether the camera is defined in NDC space
        or in screen (image) space
        """
        raise NotImplementedError()

    def get_znear(self):
        return self.znear if hasattr(self, "znear") else None

    def get_image_size(self):
        """
        Returns the image size, if provided, expected in the form of (height, width)
        The image size is used for conversion of projected points to screen coordinates.
        """
        return self.image_size if hasattr(self, "image_size") else None

    def __getitem__(
        self, index: Union[int, List[int], torch.BoolTensor, torch.LongTensor]
    ) -> "CamerasBase":
        """
        Override for the __getitem__ method in TensorProperties which needs to be
        refactored.

        Args:
            index: an integer index, list/tensor of integer indices, or tensor of boolean
                indicators used to filter all the fields in the cameras given by self._FIELDS.
        Returns:
            an instance of the current cameras class with only the values at the selected index.
        """

        kwargs = {}

        tensor_types = {
            # pyre-fixme[16]: Module `cuda` has no attribute `BoolTensor`.
            "bool": (torch.BoolTensor, torch.cuda.BoolTensor),
            # pyre-fixme[16]: Module `cuda` has no attribute `LongTensor`.
            "long": (torch.LongTensor, torch.cuda.LongTensor),
        }
        if not isinstance(
            index, (int, list, *tensor_types["bool"], *tensor_types["long"])
        ) or (
            isinstance(index, list)
            and not all(isinstance(i, int) and not isinstance(i, bool) for i in index)
        ):
            msg = (
                "Invalid index type, expected int, List[int] or Bool/LongTensor; got %r"
            )
            raise ValueError(msg % type(index))

        if isinstance(index, int):
            index = [index]

        if isinstance(index, tensor_types["bool"]):
            # pyre-fixme[16]: Item `List` of `Union[List[int], BoolTensor,
            #  LongTensor]` has no attribute `ndim`.
            # pyre-fixme[16]: Item `List` of `Union[List[int], BoolTensor,
            #  LongTensor]` has no attribute `shape`.
            if index.ndim != 1 or index.shape[0] != len(self):
                raise ValueError(
                    # pyre-fixme[16]: Item `List` of `Union[List[int], BoolTensor,
                    #  LongTensor]` has no attribute `shape`.
                    f"Boolean index of shape {index.shape} does not match cameras"
                )
        elif max(index) >= len(self):
            raise IndexError(f"Index {max(index)} is out of bounds for select cameras")

        for field in self._FIELDS:
            val = getattr(self, field, None)
            if val is None:
                continue

            # e.g. "in_ndc" is set as attribute "_in_ndc" on the class
            # but provided as "in_ndc" on initialization
            if field.startswith("_"):
                field = field[1:]

            if isinstance(val, (str, bool)):
                kwargs[field] = val
            elif isinstance(val, torch.Tensor):
                # In the init, all inputs will be converted to
                # tensors before setting as attributes
                kwargs[field] = val[index]
            else:
                raise ValueError(f"Field {field} type is not supported for indexing")

        kwargs["device"] = self.device
        return self.__class__(**kwargs)


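# NOTE (editorial sketch, not part of the upstream PyTorch3D API): cameras are
# batched TensorProperties, so `__getitem__` above returns a sub-batch. A minimal
# illustration, assuming `cams` is any CamerasBase subclass instance of length N >= 3:
#
#     first = cams[0]            # batch of 1 camera
#     pair = cams[[0, 2]]        # batch of 2 cameras
#     mask = torch.tensor([True, False, True] + [False] * (len(cams) - 3))
#     masked = cams[mask]        # boolean indexing, same semantics as above
#
# Each result is a new camera object of the same class, sharing the constant
# (_SHARED_FIELDS) values and slicing the per-camera tensors.

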
############################################################
#              Field of View Camera Classes                #
############################################################


def OpenGLPerspectiveCameras(
    znear: _BatchFloatType = 1.0,
    zfar: _BatchFloatType = 100.0,
    aspect_ratio: _BatchFloatType = 1.0,
    fov: _BatchFloatType = 60.0,
    degrees: bool = True,
    R: torch.Tensor = _R,
    T: torch.Tensor = _T,
    device: Device = "cpu",
) -> "FoVPerspectiveCameras":
    """
    OpenGLPerspectiveCameras has been DEPRECATED. Use FoVPerspectiveCameras instead.
    Preserving OpenGLPerspectiveCameras for backward compatibility.
    """

    warnings.warn(
        """OpenGLPerspectiveCameras is deprecated,
        Use FoVPerspectiveCameras instead.
        OpenGLPerspectiveCameras will be removed in future releases.""",
        PendingDeprecationWarning,
    )

    return FoVPerspectiveCameras(
        znear=znear,
        zfar=zfar,
        aspect_ratio=aspect_ratio,
        fov=fov,
        degrees=degrees,
        R=R,
        T=T,
        device=device,
    )


class FoVPerspectiveCameras(CamerasBase):
    """
    A class which stores a batch of parameters to generate a batch of
    projection matrices by specifying the field of view.
    The definitions of the parameters follow the OpenGL perspective camera.

    The extrinsics of the camera (R and T matrices) can also be set in the
    initializer or passed in to `get_full_projection_transform` to get
    the full transformation from world -> ndc.

    The `transform_points` method calculates the full world -> ndc transform
    and then applies it to the input points.

    The transforms can also be returned separately as Transform3d objects.

    * Setting the Aspect Ratio for Non Square Images *

    If the desired output image size is non square (i.e. a tuple of (H, W) where H != W)
    the aspect ratio needs special consideration: There are two aspect ratios
    to be aware of:
        - the aspect ratio of each pixel
        - the aspect ratio of the output image
    The `aspect_ratio` setting in the FoVPerspectiveCameras sets the
    pixel aspect ratio. When using this camera with the differentiable rasterizer
    be aware that in the rasterizer we assume square pixels, but allow
    variable image aspect ratio (i.e. rectangular images).

    In most cases you will want to set the camera `aspect_ratio=1.0`
    (i.e. square pixels) and only vary the output image dimensions in pixels
    for rasterization.
    """

    # For __getitem__
    _FIELDS = (
        "K",
        "znear",
        "zfar",
        "aspect_ratio",
        "fov",
        "R",
        "T",
        "degrees",
    )

    _SHARED_FIELDS = ("degrees",)

    def __init__(
        self,
        znear: _BatchFloatType = 1.0,
        zfar: _BatchFloatType = 100.0,
        aspect_ratio: _BatchFloatType = 1.0,
        fov: _BatchFloatType = 60.0,
        degrees: bool = True,
        R: torch.Tensor = _R,
        T: torch.Tensor = _T,
        K: Optional[torch.Tensor] = None,
        device: Device = "cpu",
    ) -> None:
        """

        Args:
            znear: near clipping plane of the view frustum.
            zfar: far clipping plane of the view frustum.
            aspect_ratio: aspect ratio of the image pixels.
                1.0 indicates square pixels.
            fov: field of view angle of the camera.
            degrees: bool, set to True if fov is specified in degrees.
            R: Rotation matrix of shape (N, 3, 3)
            T: Translation matrix of shape (N, 3)
            K: (optional) A calibration matrix of shape (N, 4, 4)
                If provided, don't need znear, zfar, fov, aspect_ratio, degrees
            device: Device (as str or torch.device)
        """
        # The initializer formats all inputs to torch tensors and broadcasts
        # all the inputs to have the same batch dimension where necessary.
        super().__init__(
            device=device,
            znear=znear,
            zfar=zfar,
            aspect_ratio=aspect_ratio,
            fov=fov,
            R=R,
            T=T,
            K=K,
        )

        # No need to convert to tensor or broadcast.
        self.degrees = degrees

    def compute_projection_matrix(
        self, znear, zfar, fov, aspect_ratio, degrees: bool
    ) -> torch.Tensor:
        """
        Compute the calibration matrix K of shape (N, 4, 4)

        Args:
            znear: near clipping plane of the view frustum.
            zfar: far clipping plane of the view frustum.
            fov: field of view angle of the camera.
            aspect_ratio: aspect ratio of the image pixels.
                1.0 indicates square pixels.
            degrees: bool, set to True if fov is specified in degrees.

        Returns:
            torch.FloatTensor of the calibration matrix with shape (N, 4, 4)
        """
        K = torch.zeros((self._N, 4, 4), device=self.device, dtype=torch.float32)
        ones = torch.ones((self._N), dtype=torch.float32, device=self.device)
        if degrees:
            fov = (np.pi / 180) * fov

        if not torch.is_tensor(fov):
            fov = torch.tensor(fov, device=self.device)
        tanHalfFov = torch.tan((fov / 2))
        max_y = tanHalfFov * znear
        min_y = -max_y
        max_x = max_y * aspect_ratio
        min_x = -max_x

        # NOTE: In OpenGL the projection matrix changes the handedness of the
        # coordinate frame. i.e the NDC space positive z direction is the
        # camera space negative z direction. This is because the sign of the z
        # in the projection matrix is set to -1.0.
        # In pytorch3d we maintain a right handed coordinate system throughout
        # so the z sign is 1.0.
        z_sign = 1.0

        # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
        K[:, 0, 0] = 2.0 * znear / (max_x - min_x)
        # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
        K[:, 1, 1] = 2.0 * znear / (max_y - min_y)
        K[:, 0, 2] = (max_x + min_x) / (max_x - min_x)
        K[:, 1, 2] = (max_y + min_y) / (max_y - min_y)
        K[:, 3, 2] = z_sign * ones

        # NOTE: This maps the z coordinate to the range [0, 1] where z = 0 if the point
        # is at the near clipping plane and z = 1 when the point is at the far
        # clipping plane.
        K[:, 2, 2] = z_sign * zfar / (zfar - znear)
        K[:, 2, 3] = -(zfar * znear) / (zfar - znear)

        return K

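    # Editorial worked example (illustrative only): with the defaults
    # znear=1.0, zfar=100.0, fov=60 degrees and aspect_ratio=1.0,
    # tan(fov/2) ~= 0.5774, so max_y = max_x ~= 0.5774 and the entries above become
    #     K[0, 0] = K[1, 1] = 2 * znear / (2 * max_y) ~= 1.7321  (= 1 / tan(fov/2))
    #     K[2, 2] = zfar / (zfar - znear)              ~= 1.0101
    #     K[2, 3] = -(zfar * znear) / (zfar - znear)   ~= -1.0101
    #     K[3, 2] = 1.0
    # which matches the s1/s2/f1/f2 layout documented in get_projection_transform below.
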
    def get_projection_transform(self, **kwargs) -> Transform3d:
        """
        Calculate the perspective projection matrix with a symmetric
        viewing frustum. Use column major order.
        The viewing frustum will be projected into ndc, s.t.
        (max_x, max_y) -> (+1, +1)
        (min_x, min_y) -> (-1, -1)

        Args:
            **kwargs: parameters for the projection can be passed in as keyword
                arguments to override the default values set in `__init__`.

        Return:
            a Transform3d object which represents a batch of projection
            matrices of shape (N, 4, 4)

        .. code-block:: python

            h1 = (max_y + min_y)/(max_y - min_y)
            w1 = (max_x + min_x)/(max_x - min_x)
            tanhalffov = tan((fov/2))
            s1 = 1/(tanhalffov * aspect_ratio)
            s2 = 1/tanhalffov

            # To map z to the range [0, 1] use:
            f1 =  far / (far - near)
            f2 = -(far * near) / (far - near)

            # Projection matrix
            K = [
                    [s1,   0,   w1,   0],
                    [0,   s2,   h1,   0],
                    [0,    0,   f1,  f2],
                    [0,    0,    1,   0],
            ]
        """
        K = kwargs.get("K", self.K)
        if K is not None:
            if K.shape != (self._N, 4, 4):
                msg = "Expected K to have shape of (%r, 4, 4)"
                raise ValueError(msg % (self._N))
        else:
            K = self.compute_projection_matrix(
                kwargs.get("znear", self.znear),
                kwargs.get("zfar", self.zfar),
                kwargs.get("fov", self.fov),
                kwargs.get("aspect_ratio", self.aspect_ratio),
                kwargs.get("degrees", self.degrees),
            )

        # Transpose the projection matrix as PyTorch3D transforms use row vectors.
        transform = Transform3d(
            matrix=K.transpose(1, 2).contiguous(), device=self.device
        )
        return transform

    def unproject_points(
        self,
        xy_depth: torch.Tensor,
        world_coordinates: bool = True,
        scaled_depth_input: bool = False,
        **kwargs,
    ) -> torch.Tensor:
        """
        FoV cameras further allow for passing depth in world units
        (`scaled_depth_input=False`) or in the [0, 1]-normalized units
        (`scaled_depth_input=True`)

        Args:
            scaled_depth_input: If `True`, assumes the input depth is in
                the [0, 1]-normalized units. If `False` the input depth is in
                the world units.
        """

        # obtain the relevant transformation to ndc
        if world_coordinates:
            to_ndc_transform = self.get_full_projection_transform()
        else:
            to_ndc_transform = self.get_projection_transform()

        if scaled_depth_input:
            # the input is scaled depth, so we don't have to do anything
            xy_sdepth = xy_depth
        else:
            # parse out important values from the projection matrix
            K_matrix = self.get_projection_transform(**kwargs.copy()).get_matrix()
            # parse out f1, f2 from K_matrix
            unsqueeze_shape = [1] * xy_depth.dim()
            unsqueeze_shape[0] = K_matrix.shape[0]
            f1 = K_matrix[:, 2, 2].reshape(unsqueeze_shape)
            f2 = K_matrix[:, 3, 2].reshape(unsqueeze_shape)
            # get the scaled depth
            sdepth = (f1 * xy_depth[..., 2:3] + f2) / xy_depth[..., 2:3]
            # concatenate xy + scaled depth
            xy_sdepth = torch.cat((xy_depth[..., 0:2], sdepth), dim=-1)

        # unproject with inverse of the projection
        unprojection_transform = to_ndc_transform.inverse()
        return unprojection_transform.transform_points(xy_sdepth)

    def is_perspective(self):
        return True

    def in_ndc(self):
        return True


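# Editorial usage sketch (added for illustration; not part of the upstream API).
# It mirrors the project/unproject round trip documented in CamerasBase above,
# using only public calls defined in this module and in torch.
def _example_fov_perspective_roundtrip() -> None:
    """
    Minimal sketch: project a small point cloud with a FoVPerspectiveCameras
    batch and check that unproject_points inverts the projection.
    """
    cameras = FoVPerspectiveCameras(znear=0.1, zfar=10.0, fov=60.0)
    xyz = torch.rand(1, 8, 3) + torch.tensor([0.0, 0.0, 2.0])  # keep points in front
    xy_ndc = cameras.transform_points(xyz)[..., :2]
    depth = cameras.get_world_to_view_transform().transform_points(xyz)[..., 2:]
    xy_depth = torch.cat((xy_ndc, depth), dim=-1)
    xyz_unprojected = cameras.unproject_points(xy_depth, world_coordinates=True)
    assert torch.allclose(xyz, xyz_unprojected, atol=1e-3)

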
def OpenGLOrthographicCameras(
    znear: _BatchFloatType = 1.0,
    zfar: _BatchFloatType = 100.0,
    top: _BatchFloatType = 1.0,
    bottom: _BatchFloatType = -1.0,
    left: _BatchFloatType = -1.0,
    right: _BatchFloatType = 1.0,
    scale_xyz=((1.0, 1.0, 1.0),),  # (1, 3)
    R: torch.Tensor = _R,
    T: torch.Tensor = _T,
    device: Device = "cpu",
) -> "FoVOrthographicCameras":
    """
    OpenGLOrthographicCameras has been DEPRECATED. Use FoVOrthographicCameras instead.
    Preserving OpenGLOrthographicCameras for backward compatibility.
    """

    warnings.warn(
        """OpenGLOrthographicCameras is deprecated,
        Use FoVOrthographicCameras instead.
        OpenGLOrthographicCameras will be removed in future releases.""",
        PendingDeprecationWarning,
    )

    return FoVOrthographicCameras(
        znear=znear,
        zfar=zfar,
        max_y=top,
        min_y=bottom,
        max_x=right,
        min_x=left,
        scale_xyz=scale_xyz,
        R=R,
        T=T,
        device=device,
    )


class FoVOrthographicCameras(CamerasBase):
    """
    A class which stores a batch of parameters to generate a batch of
    projection matrices by specifying the field of view.
    The definitions of the parameters follow the OpenGL orthographic camera.
    """

    # For __getitem__
    _FIELDS = (
        "K",
        "znear",
        "zfar",
        "R",
        "T",
        "max_y",
        "min_y",
        "max_x",
        "min_x",
        "scale_xyz",
    )

    def __init__(
        self,
        znear: _BatchFloatType = 1.0,
        zfar: _BatchFloatType = 100.0,
        max_y: _BatchFloatType = 1.0,
        min_y: _BatchFloatType = -1.0,
        max_x: _BatchFloatType = 1.0,
        min_x: _BatchFloatType = -1.0,
        scale_xyz=((1.0, 1.0, 1.0),),  # (1, 3)
        R: torch.Tensor = _R,
        T: torch.Tensor = _T,
        K: Optional[torch.Tensor] = None,
        device: Device = "cpu",
    ):
        """

        Args:
            znear: near clipping plane of the view frustum.
            zfar: far clipping plane of the view frustum.
            max_y: maximum y coordinate of the frustum.
            min_y: minimum y coordinate of the frustum.
            max_x: maximum x coordinate of the frustum.
            min_x: minimum x coordinate of the frustum.
            scale_xyz: scale factors for each axis of shape (N, 3).
            R: Rotation matrix of shape (N, 3, 3).
            T: Translation of shape (N, 3).
            K: (optional) A calibration matrix of shape (N, 4, 4)
                If provided, don't need znear, zfar, max_y, min_y, max_x, min_x, scale_xyz
            device: torch.device or string.

        Only need to set min_x, max_x, min_y, max_y for viewing frustums
        which are non symmetric about the origin.
        """
        # The initializer formats all inputs to torch tensors and broadcasts
        # all the inputs to have the same batch dimension where necessary.
        super().__init__(
            device=device,
            znear=znear,
            zfar=zfar,
            max_y=max_y,
            min_y=min_y,
            max_x=max_x,
            min_x=min_x,
            scale_xyz=scale_xyz,
            R=R,
            T=T,
            K=K,
        )

    def compute_projection_matrix(
        self, znear, zfar, max_x, min_x, max_y, min_y, scale_xyz
    ) -> torch.Tensor:
        """
        Compute the calibration matrix K of shape (N, 4, 4)

        Args:
            znear: near clipping plane of the view frustum.
            zfar: far clipping plane of the view frustum.
            max_x: maximum x coordinate of the frustum.
            min_x: minimum x coordinate of the frustum.
            max_y: maximum y coordinate of the frustum.
            min_y: minimum y coordinate of the frustum.
            scale_xyz: scale factors for each axis of shape (N, 3).
        """
        K = torch.zeros((self._N, 4, 4), dtype=torch.float32, device=self.device)
        ones = torch.ones((self._N), dtype=torch.float32, device=self.device)
        # NOTE: OpenGL flips handedness of coordinate system between camera
        # space and NDC space so z sign is -ve. In PyTorch3D we maintain a
        # right handed coordinate system throughout.
        z_sign = +1.0

        K[:, 0, 0] = (2.0 / (max_x - min_x)) * scale_xyz[:, 0]
        K[:, 1, 1] = (2.0 / (max_y - min_y)) * scale_xyz[:, 1]
        K[:, 0, 3] = -(max_x + min_x) / (max_x - min_x)
        K[:, 1, 3] = -(max_y + min_y) / (max_y - min_y)
        K[:, 3, 3] = ones

        # NOTE: This maps the z coordinate to the range [0, 1] and replaces the
        # OpenGL z normalization to [-1, 1]
        K[:, 2, 2] = z_sign * (1.0 / (zfar - znear)) * scale_xyz[:, 2]
        K[:, 2, 3] = -znear / (zfar - znear)

        return K

    def get_projection_transform(self, **kwargs) -> Transform3d:
        """
        Calculate the orthographic projection matrix.
        Use column major order.

        Args:
            **kwargs: parameters for the projection can be passed in to
                override the default values set in __init__.
        Return:
            a Transform3d object which represents a batch of projection
            matrices of shape (N, 4, 4)

        .. code-block:: python

            scale_x = 2 / (max_x - min_x)
            scale_y = 2 / (max_y - min_y)
            scale_z = 1 / (far - near)
            mid_x = (max_x + min_x) / (max_x - min_x)
            mid_y = (max_y + min_y) / (max_y - min_y)
            mid_z = near / (far - near)

            K = [
                    [scale_x,        0,         0,  -mid_x],
                    [0,        scale_y,         0,  -mid_y],
                    [0,              0,   scale_z,  -mid_z],
                    [0,              0,         0,       1],
            ]
        """
        K = kwargs.get("K", self.K)
        if K is not None:
            if K.shape != (self._N, 4, 4):
                msg = "Expected K to have shape of (%r, 4, 4)"
                raise ValueError(msg % (self._N))
        else:
            K = self.compute_projection_matrix(
                kwargs.get("znear", self.znear),
                kwargs.get("zfar", self.zfar),
                kwargs.get("max_x", self.max_x),
                kwargs.get("min_x", self.min_x),
                kwargs.get("max_y", self.max_y),
                kwargs.get("min_y", self.min_y),
                kwargs.get("scale_xyz", self.scale_xyz),
            )

        transform = Transform3d(
            matrix=K.transpose(1, 2).contiguous(), device=self.device
        )
        return transform

    def unproject_points(
        self,
        xy_depth: torch.Tensor,
        world_coordinates: bool = True,
        scaled_depth_input: bool = False,
        **kwargs,
    ) -> torch.Tensor:
        """
        FoV cameras further allow for passing depth in world units
        (`scaled_depth_input=False`) or in the [0, 1]-normalized units
        (`scaled_depth_input=True`)

        Args:
            scaled_depth_input: If `True`, assumes the input depth is in
                the [0, 1]-normalized units. If `False` the input depth is in
                the world units.
        """

        if world_coordinates:
            to_ndc_transform = self.get_full_projection_transform(**kwargs.copy())
        else:
            to_ndc_transform = self.get_projection_transform(**kwargs.copy())

        if scaled_depth_input:
            # the input depth is already scaled
            xy_sdepth = xy_depth
        else:
            # we have to obtain the scaled depth first
            K = self.get_projection_transform(**kwargs).get_matrix()
            unsqueeze_shape = [1] * K.dim()
            unsqueeze_shape[0] = K.shape[0]
            mid_z = K[:, 3, 2].reshape(unsqueeze_shape)
            scale_z = K[:, 2, 2].reshape(unsqueeze_shape)
            scaled_depth = scale_z * xy_depth[..., 2:3] + mid_z
            # cat xy and scaled depth
            xy_sdepth = torch.cat((xy_depth[..., :2], scaled_depth), dim=-1)
        # finally invert the transform
        unprojection_transform = to_ndc_transform.inverse()
        return unprojection_transform.transform_points(xy_sdepth)

    def is_perspective(self):
        return False

    def in_ndc(self):
        return True


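# Editorial note (illustration only): an orthographic FoV camera maps x and y
# affinely, independent of depth. A quick sketch, assuming the defaults above:
#
#     cameras = FoVOrthographicCameras(znear=0.1, zfar=10.0)
#     pts = torch.tensor([[[0.5, -0.25, 2.0], [0.5, -0.25, 9.0]]])
#     ndc = cameras.transform_points(pts)
#     # ndc[..., :2] is the same for both points (no perspective division),
#     # while ndc[..., 2] carries the [0, 1]-normalized depth computed above.

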
############################################################
#               MultiView Camera Classes                   #
############################################################
"""
Note that the MultiView Cameras accept parameters in NDC space.
"""


def SfMPerspectiveCameras(
    focal_length: _FocalLengthType = 1.0,
    principal_point=((0.0, 0.0),),
    R: torch.Tensor = _R,
    T: torch.Tensor = _T,
    device: Device = "cpu",
) -> "PerspectiveCameras":
    """
    SfMPerspectiveCameras has been DEPRECATED. Use PerspectiveCameras instead.
    Preserving SfMPerspectiveCameras for backward compatibility.
    """

    warnings.warn(
        """SfMPerspectiveCameras is deprecated,
        Use PerspectiveCameras instead.
        SfMPerspectiveCameras will be removed in future releases.""",
        PendingDeprecationWarning,
    )

    return PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        R=R,
        T=T,
        device=device,
    )


class PerspectiveCameras(CamerasBase):
    """
    A class which stores a batch of parameters to generate a batch of
    transformation matrices using the multi-view geometry convention for
    perspective camera.

    Parameters for this camera are specified in NDC if `in_ndc` is set to True.
    If parameters are specified in screen space, `in_ndc` must be set to False.
    """

    # For __getitem__
    _FIELDS = (
        "K",
        "R",
        "T",
        "focal_length",
        "principal_point",
        "_in_ndc",  # arg is in_ndc but attribute set as _in_ndc
        "image_size",
    )

    _SHARED_FIELDS = ("_in_ndc",)

    def __init__(
        self,
        focal_length: _FocalLengthType = 1.0,
        principal_point=((0.0, 0.0),),
        R: torch.Tensor = _R,
        T: torch.Tensor = _T,
        K: Optional[torch.Tensor] = None,
        device: Device = "cpu",
        in_ndc: bool = True,
        image_size: Optional[Union[List, Tuple, torch.Tensor]] = None,
    ) -> None:
        """

        Args:
            focal_length: Focal length of the camera in world units.
                A tensor of shape (N, 1) or (N, 2) for
                square and non-square pixels respectively.
            principal_point: xy coordinates of the principal point of the camera,
                in NDC units if `in_ndc` is True, otherwise in pixels (screen space).
                A tensor of shape (N, 2).
            in_ndc: True if camera parameters are specified in NDC.
                If camera parameters are in screen space, it must
                be set to False.
            R: Rotation matrix of shape (N, 3, 3)
            T: Translation matrix of shape (N, 3)
            K: (optional) A calibration matrix of shape (N, 4, 4)
                If provided, don't need focal_length, principal_point
            image_size: (height, width) of image size.
                A tensor of shape (N, 2) or a list/tuple. Required for screen cameras.
            device: torch.device or string
        """
        # The initializer formats all inputs to torch tensors and broadcasts
        # all the inputs to have the same batch dimension where necessary.
        kwargs = {"image_size": image_size} if image_size is not None else {}
        super().__init__(
            device=device,
            focal_length=focal_length,
            principal_point=principal_point,
            R=R,
            T=T,
            K=K,
            _in_ndc=in_ndc,
            **kwargs,  # pyre-ignore
        )
        if image_size is not None:
            if (self.image_size < 1).any():  # pyre-ignore
                raise ValueError("Image_size provided has invalid values")
        else:
            self.image_size = None

        # When focal length is provided as one value, expand to
        # create (N, 2) shape tensor
        if self.focal_length.ndim == 1:  # (N,)
            self.focal_length = self.focal_length[:, None]  # (N, 1)
        self.focal_length = self.focal_length.expand(-1, 2)  # (N, 2)

    def get_projection_transform(self, **kwargs) -> Transform3d:
        """
        Calculate the projection matrix using the
        multi-view geometry convention.

        Args:
            **kwargs: parameters for the projection can be passed in as keyword
                arguments to override the default values set in __init__.

        Returns:
            A `Transform3d` object with a batch of `N` projection transforms.

        .. code-block:: python

            fx = focal_length[:, 0]
            fy = focal_length[:, 1]
            px = principal_point[:, 0]
            py = principal_point[:, 1]

            K = [
                    [fx,   0,   px,   0],
                    [0,   fy,   py,   0],
                    [0,    0,    0,   1],
                    [0,    0,    1,   0],
            ]
        """
        K = kwargs.get("K", self.K)
        if K is not None:
            if K.shape != (self._N, 4, 4):
                msg = "Expected K to have shape of (%r, 4, 4)"
                raise ValueError(msg % (self._N))
        else:
            K = _get_sfm_calibration_matrix(
                self._N,
                self.device,
                kwargs.get("focal_length", self.focal_length),
                kwargs.get("principal_point", self.principal_point),
                orthographic=False,
            )

        transform = Transform3d(
            matrix=K.transpose(1, 2).contiguous(), device=self.device
        )
        return transform

    def unproject_points(
        self,
        xy_depth: torch.Tensor,
        world_coordinates: bool = True,
        from_ndc: bool = False,
        **kwargs,
    ) -> torch.Tensor:
        """
        Args:
            from_ndc: If `False` (default), assumes xy part of input is in
                NDC space if self.in_ndc(), otherwise in screen space. If
                `True`, assumes xy is in NDC space even if the camera
                is defined in screen space.
        """
        if world_coordinates:
            to_camera_transform = self.get_full_projection_transform(**kwargs)
        else:
            to_camera_transform = self.get_projection_transform(**kwargs)
        if from_ndc:
            to_camera_transform = to_camera_transform.compose(
                self.get_ndc_camera_transform()
            )

        unprojection_transform = to_camera_transform.inverse()
        xy_inv_depth = torch.cat(
            (xy_depth[..., :2], 1.0 / xy_depth[..., 2:3]), dim=-1  # type: ignore
        )
        return unprojection_transform.transform_points(xy_inv_depth)

    def get_principal_point(self, **kwargs) -> torch.Tensor:
        """
        Return the camera's principal point

        Args:
            **kwargs: parameters for the camera extrinsics can be passed in
                as keyword arguments to override the default values
                set in __init__.
        """
        proj_mat = self.get_projection_transform(**kwargs).get_matrix()
        return proj_mat[:, 2, :2]

    def get_ndc_camera_transform(self, **kwargs) -> Transform3d:
        """
        Returns the transform from camera projection space (screen or NDC) to NDC space.
        If the camera is defined already in NDC space, the transform is identity.
        For cameras defined in screen space, we adjust the principal point computation
        which is defined in the image space (commonly) and scale the points to NDC space.

        This transform leaves the depth unchanged.

        Important: This transform assumes PyTorch3D conventions for the input points,
        i.e. +X left, +Y up.
        """
        if self.in_ndc():
            ndc_transform = Transform3d(device=self.device, dtype=torch.float32)
        else:
            # when cameras are defined in screen/image space, the principal point is
            # provided in the (+X right, +Y down), aka image, coordinate system.
            # Since input points are defined in the PyTorch3D system (+X left, +Y up),
            # we need to adjust for the principal point transform.
            pr_point_fix = torch.zeros(
                (self._N, 4, 4), device=self.device, dtype=torch.float32
            )
            pr_point_fix[:, 0, 0] = 1.0
            pr_point_fix[:, 1, 1] = 1.0
            pr_point_fix[:, 2, 2] = 1.0
            pr_point_fix[:, 3, 3] = 1.0
            pr_point_fix[:, :2, 3] = -2.0 * self.get_principal_point(**kwargs)
            pr_point_fix_transform = Transform3d(
                matrix=pr_point_fix.transpose(1, 2).contiguous(), device=self.device
            )
            image_size = kwargs.get("image_size", self.get_image_size())
            screen_to_ndc_transform = get_screen_to_ndc_transform(
                self, with_xyflip=False, image_size=image_size
            )
            ndc_transform = pr_point_fix_transform.compose(screen_to_ndc_transform)

        return ndc_transform

    def is_perspective(self):
        return True

    def in_ndc(self):
        return self._in_ndc


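# Editorial usage sketch (not part of the upstream API): a screen-space
# perspective camera defined with pixel-unit intrinsics, projected to both the
# PyTorch3D NDC space and to screen (image) coordinates.
def _example_screen_space_perspective() -> None:
    image_size = ((128, 256),)  # (height, width)
    cameras = PerspectiveCameras(
        focal_length=((100.0, 100.0),),
        principal_point=((128.0, 64.0),),  # (px, py) in pixels
        in_ndc=False,
        image_size=image_size,
    )
    pts = torch.tensor([[[0.1, 0.2, 2.0]]])
    pts_ndc = cameras.transform_points_ndc(pts)        # PyTorch3D NDC (+X left, +Y up)
    pts_screen = cameras.transform_points_screen(pts)  # pixels (+X right, +Y down)

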
def SfMOrthographicCameras(
    focal_length: _FocalLengthType = 1.0,
    principal_point=((0.0, 0.0),),
    R: torch.Tensor = _R,
    T: torch.Tensor = _T,
    device: Device = "cpu",
) -> "OrthographicCameras":
    """
    SfMOrthographicCameras has been DEPRECATED. Use OrthographicCameras instead.
    Preserving SfMOrthographicCameras for backward compatibility.
    """

    warnings.warn(
        """SfMOrthographicCameras is deprecated,
        Use OrthographicCameras instead.
        SfMOrthographicCameras will be removed in future releases.""",
        PendingDeprecationWarning,
    )

    return OrthographicCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        R=R,
        T=T,
        device=device,
    )


class OrthographicCameras(CamerasBase):
    """
    A class which stores a batch of parameters to generate a batch of
    transformation matrices using the multi-view geometry convention for
    orthographic camera.

    Parameters for this camera are specified in NDC if `in_ndc` is set to True.
    If parameters are specified in screen space, `in_ndc` must be set to False.
    """

    # For __getitem__
    _FIELDS = (
        "K",
        "R",
        "T",
        "focal_length",
        "principal_point",
        "_in_ndc",
        "image_size",
    )

    _SHARED_FIELDS = ("_in_ndc",)

    def __init__(
        self,
        focal_length: _FocalLengthType = 1.0,
        principal_point=((0.0, 0.0),),
        R: torch.Tensor = _R,
        T: torch.Tensor = _T,
        K: Optional[torch.Tensor] = None,
        device: Device = "cpu",
        in_ndc: bool = True,
        image_size: Optional[Union[List, Tuple, torch.Tensor]] = None,
    ) -> None:
        """

        Args:
            focal_length: Focal length of the camera in world units.
                A tensor of shape (N, 1) or (N, 2) for
                square and non-square pixels respectively.
            principal_point: xy coordinates of the principal point of the camera,
                in NDC units if `in_ndc` is True, otherwise in pixels (screen space).
                A tensor of shape (N, 2).
            in_ndc: True if camera parameters are specified in NDC.
                If False, then camera parameters are in screen space.
            R: Rotation matrix of shape (N, 3, 3)
            T: Translation matrix of shape (N, 3)
            K: (optional) A calibration matrix of shape (N, 4, 4)
                If provided, don't need focal_length, principal_point, image_size
            image_size: (height, width) of image size.
                A tensor of shape (N, 2) or list/tuple. Required for screen cameras.
            device: torch.device or string
        """
        # The initializer formats all inputs to torch tensors and broadcasts
        # all the inputs to have the same batch dimension where necessary.
        kwargs = {"image_size": image_size} if image_size is not None else {}
        super().__init__(
            device=device,
            focal_length=focal_length,
            principal_point=principal_point,
            R=R,
            T=T,
            K=K,
            _in_ndc=in_ndc,
            **kwargs,  # pyre-ignore
        )
        if image_size is not None:
            if (self.image_size < 1).any():  # pyre-ignore
                raise ValueError("Image_size provided has invalid values")
        else:
            self.image_size = None

        # When focal length is provided as one value, expand to
        # create (N, 2) shape tensor
        if self.focal_length.ndim == 1:  # (N,)
            self.focal_length = self.focal_length[:, None]  # (N, 1)
        self.focal_length = self.focal_length.expand(-1, 2)  # (N, 2)

    def get_projection_transform(self, **kwargs) -> Transform3d:
        """
        Calculate the projection matrix using
        the multi-view geometry convention.

        Args:
            **kwargs: parameters for the projection can be passed in as keyword
                arguments to override the default values set in __init__.

        Returns:
            A `Transform3d` object with a batch of `N` projection transforms.

        .. code-block:: python

            fx = focal_length[:, 0]
            fy = focal_length[:, 1]
            px = principal_point[:, 0]
            py = principal_point[:, 1]

            K = [
                    [fx,   0,    0,  px],
                    [0,   fy,    0,  py],
                    [0,    0,    1,   0],
                    [0,    0,    0,   1],
            ]
        """
        K = kwargs.get("K", self.K)
        if K is not None:
            if K.shape != (self._N, 4, 4):
                msg = "Expected K to have shape of (%r, 4, 4)"
                raise ValueError(msg % (self._N))
        else:
            K = _get_sfm_calibration_matrix(
                self._N,
                self.device,
                kwargs.get("focal_length", self.focal_length),
                kwargs.get("principal_point", self.principal_point),
                orthographic=True,
            )

        transform = Transform3d(
            matrix=K.transpose(1, 2).contiguous(), device=self.device
        )
        return transform

    def unproject_points(
        self,
        xy_depth: torch.Tensor,
        world_coordinates: bool = True,
        from_ndc: bool = False,
        **kwargs,
    ) -> torch.Tensor:
        """
        Args:
            from_ndc: If `False` (default), assumes xy part of input is in
                NDC space if self.in_ndc(), otherwise in screen space. If
                `True`, assumes xy is in NDC space even if the camera
                is defined in screen space.
        """
        if world_coordinates:
            to_camera_transform = self.get_full_projection_transform(**kwargs)
        else:
            to_camera_transform = self.get_projection_transform(**kwargs)
        if from_ndc:
            to_camera_transform = to_camera_transform.compose(
                self.get_ndc_camera_transform()
            )

        unprojection_transform = to_camera_transform.inverse()
        return unprojection_transform.transform_points(xy_depth)

    def get_principal_point(self, **kwargs) -> torch.Tensor:
        """
        Return the camera's principal point

        Args:
            **kwargs: parameters for the camera extrinsics can be passed in
                as keyword arguments to override the default values
                set in __init__.
        """
        proj_mat = self.get_projection_transform(**kwargs).get_matrix()
        return proj_mat[:, 3, :2]

    def get_ndc_camera_transform(self, **kwargs) -> Transform3d:
        """
        Returns the transform from camera projection space (screen or NDC) to NDC space.
        If the camera is defined already in NDC space, the transform is identity.
        For cameras defined in screen space, we adjust the principal point computation
        which is defined in the image space (commonly) and scale the points to NDC space.

        Important: This transform assumes PyTorch3D conventions for the input points,
        i.e. +X left, +Y up.
        """
        if self.in_ndc():
            ndc_transform = Transform3d(device=self.device, dtype=torch.float32)
        else:
            # when cameras are defined in screen/image space, the principal point is
            # provided in the (+X right, +Y down), aka image, coordinate system.
            # Since input points are defined in the PyTorch3D system (+X left, +Y up),
            # we need to adjust for the principal point transform.
            pr_point_fix = torch.zeros(
                (self._N, 4, 4), device=self.device, dtype=torch.float32
            )
            pr_point_fix[:, 0, 0] = 1.0
            pr_point_fix[:, 1, 1] = 1.0
            pr_point_fix[:, 2, 2] = 1.0
            pr_point_fix[:, 3, 3] = 1.0
            pr_point_fix[:, :2, 3] = -2.0 * self.get_principal_point(**kwargs)
            pr_point_fix_transform = Transform3d(
                matrix=pr_point_fix.transpose(1, 2).contiguous(), device=self.device
            )
            image_size = kwargs.get("image_size", self.get_image_size())
            screen_to_ndc_transform = get_screen_to_ndc_transform(
                self, with_xyflip=False, image_size=image_size
            )
            ndc_transform = pr_point_fix_transform.compose(screen_to_ndc_transform)

        return ndc_transform

    def is_perspective(self):
        return False

    def in_ndc(self):
        return self._in_ndc


################################################
#       Helper functions for cameras           #
################################################


def _get_sfm_calibration_matrix(
    N: int,
    device: Device,
    focal_length,
    principal_point,
    orthographic: bool = False,
) -> torch.Tensor:
    """
    Returns a calibration matrix of a perspective/orthographic camera.

    Args:
        N: Number of cameras.
        focal_length: Focal length of the camera.
        principal_point: xy coordinates of the principal point of
            the camera in pixels.
        orthographic: Boolean specifying if the camera is orthographic or not

    The calibration matrix `K` is set up as follows:

    .. code-block:: python

        fx = focal_length[:, 0]
        fy = focal_length[:, 1]
        px = principal_point[:, 0]
        py = principal_point[:, 1]

        for orthographic==True:
            K = [
                    [fx,   0,    0,  px],
                    [0,   fy,    0,  py],
                    [0,    0,    1,   0],
                    [0,    0,    0,   1],
            ]
        else:
            K = [
                    [fx,   0,   px,   0],
                    [0,   fy,   py,   0],
                    [0,    0,    0,   1],
                    [0,    0,    1,   0],
            ]

    Returns:
        A calibration matrix `K` of the SfM-conventioned camera
        of shape (N, 4, 4).
    """

    if not torch.is_tensor(focal_length):
        focal_length = torch.tensor(focal_length, device=device)

    if focal_length.ndim in (0, 1) or focal_length.shape[1] == 1:
        fx = fy = focal_length
    else:
        fx, fy = focal_length.unbind(1)

    if not torch.is_tensor(principal_point):
        principal_point = torch.tensor(principal_point, device=device)

    px, py = principal_point.unbind(1)

    K = fx.new_zeros(N, 4, 4)
    K[:, 0, 0] = fx
    K[:, 1, 1] = fy
    if orthographic:
        K[:, 0, 3] = px
        K[:, 1, 3] = py
        K[:, 2, 2] = 1.0
        K[:, 3, 3] = 1.0
    else:
        K[:, 0, 2] = px
        K[:, 1, 2] = py
        K[:, 3, 2] = 1.0
        K[:, 2, 3] = 1.0

    return K


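# Editorial note (illustration only): with the perspective K above and PyTorch3D's
# row-vector convention, a view-space point (x, y, z) is mapped through
# Transform3d as [x, y, z, 1] @ K^T = [fx * x + px * z, fy * y + py * z, 1, z];
# after homogeneous division by the last coordinate this gives
# (fx * x / z + px, fy * y / z + py, 1 / z), i.e. perspective division on xy and an
# inverse depth in the third slot (which is why PerspectiveCameras.unproject_points
# above feeds 1 / depth back through the inverse transform).

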
################################################
# Helper functions for world to view transforms
################################################


def get_world_to_view_transform(
    R: torch.Tensor = _R, T: torch.Tensor = _T
) -> Transform3d:
    """
    This function returns a Transform3d representing the transformation
    matrix to go from world space to view space by applying a rotation and
    a translation.

    PyTorch3D uses the same convention as Hartley & Zisserman.
    I.e., for camera extrinsic parameters R (rotation) and T (translation),
    we map a 3D point `X_world` in world coordinates to
    a point `X_cam` in camera coordinates with:
    `X_cam = X_world R + T`

    Args:
        R: (N, 3, 3) matrix representing the rotation.
        T: (N, 3) matrix representing the translation.

    Returns:
        a Transform3d object which represents the composed RT transformation.

    """
    # TODO: also support the case where RT is specified as one matrix
    # of shape (N, 4, 4).

    if T.shape[0] != R.shape[0]:
        msg = "Expected R, T to have the same batch dimension; got %r, %r"
        raise ValueError(msg % (R.shape[0], T.shape[0]))
    if T.dim() != 2 or T.shape[1:] != (3,):
        msg = "Expected T to have shape (N, 3); got %r"
        raise ValueError(msg % repr(T.shape))
    if R.dim() != 3 or R.shape[1:] != (3, 3):
        msg = "Expected R to have shape (N, 3, 3); got %r"
        raise ValueError(msg % repr(R.shape))

    # Create a Transform3d object
    T_ = Translate(T, device=T.device)
    R_ = Rotate(R, device=R.device)
    return R_.compose(T_)


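# Editorial sketch (not part of the upstream API): the transform returned above
# applies the rotation first and then the translation, matching X_cam = X_world R + T.
def _example_world_to_view_convention() -> None:
    R, T = look_at_view_transform(dist=2.0, elev=30.0, azim=45.0)
    w2v = get_world_to_view_transform(R=R, T=T)
    X_world = torch.rand(1, 5, 3)
    X_cam = w2v.transform_points(X_world)
    assert torch.allclose(X_cam, torch.bmm(X_world, R) + T[:, None], atol=1e-5)

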
def camera_position_from_spherical_angles(
    distance: float,
    elevation: float,
    azimuth: float,
    degrees: bool = True,
    device: Device = "cpu",
) -> torch.Tensor:
    """
    Calculate the location of the camera based on the distance away from
    the target point, the elevation and azimuth angles.

    Args:
        distance: distance of the camera from the object.
        elevation, azimuth: angles.
            The inputs distance, elevation and azimuth can be one of the following
                - Python scalar
                - Torch scalar
                - Torch tensor of shape (N) or (1)
        degrees: bool, whether the angles are specified in degrees or radians.
        device: str or torch.device, device for new tensors to be placed on.

    The vectors are broadcast against each other so they all have shape (N, 1).

    Returns:
        camera_position: (N, 3) xyz location of the camera.
    """
    broadcasted_args = convert_to_tensors_and_broadcast(
        distance, elevation, azimuth, device=device
    )
    dist, elev, azim = broadcasted_args
    if degrees:
        elev = math.pi / 180.0 * elev
        azim = math.pi / 180.0 * azim
    x = dist * torch.cos(elev) * torch.sin(azim)
    y = dist * torch.sin(elev)
    z = dist * torch.cos(elev) * torch.cos(azim)
    camera_position = torch.stack([x, y, z], dim=1)
    if camera_position.dim() == 0:
        camera_position = camera_position.view(1, -1)  # add batch dim.
    return camera_position.view(-1, 3)


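# Editorial worked example (illustration only): with distance=2, elevation=30 and
# azimuth=0 degrees, the formulas above give
#     x = 2 * cos(30) * sin(0) = 0.0
#     y = 2 * sin(30)          = 1.0
#     z = 2 * cos(30) * cos(0) ~= 1.732
# i.e. the camera sits on the +Z side of the object, raised above the XZ plane.

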
def look_at_rotation(
|
|
camera_position, at=((0, 0, 0),), up=((0, 1, 0),), device: Device = "cpu"
|
|
) -> torch.Tensor:
|
|
"""
|
|
This function takes a vector 'camera_position' which specifies the location
|
|
of the camera in world coordinates and two vectors `at` and `up` which
|
|
indicate the position of the object and the up directions of the world
|
|
coordinate system respectively. The object is assumed to be centered at
|
|
the origin.
|
|
|
|
The output is a rotation matrix representing the transformation
|
|
from world coordinates -> view coordinates.
|
|
|
|
Args:
|
|
camera_position: position of the camera in world coordinates
|
|
at: position of the object in world coordinates
|
|
up: vector specifying the up direction in the world coordinate frame.
|
|
|
|
The inputs camera_position, at and up can each be a
|
|
- 3 element tuple/list
|
|
- torch tensor of shape (1, 3)
|
|
- torch tensor of shape (N, 3)
|
|
|
|
The vectors are broadcast against each other so they all have shape (N, 3).
|
|
|
|
Returns:
|
|
R: (N, 3, 3) batched rotation matrices
|
|
"""
|
|
# Format input and broadcast
|
|
broadcasted_args = convert_to_tensors_and_broadcast(
|
|
camera_position, at, up, device=device
|
|
)
|
|
camera_position, at, up = broadcasted_args
|
|
for t, n in zip([camera_position, at, up], ["camera_position", "at", "up"]):
|
|
if t.shape[-1] != 3:
|
|
msg = "Expected arg %s to have shape (N, 3); got %r"
|
|
raise ValueError(msg % (n, t.shape))
|
|
z_axis = F.normalize(at - camera_position, eps=1e-5)
|
|
x_axis = F.normalize(torch.cross(up, z_axis, dim=1), eps=1e-5)
|
|
y_axis = F.normalize(torch.cross(z_axis, x_axis, dim=1), eps=1e-5)
|
|
is_close = torch.isclose(x_axis, torch.tensor(0.0), atol=5e-3).all(
|
|
dim=1, keepdim=True
|
|
)
|
|
if is_close.any():
|
|
replacement = F.normalize(torch.cross(y_axis, z_axis, dim=1), eps=1e-5)
|
|
x_axis = torch.where(is_close, replacement, x_axis)
|
|
R = torch.cat((x_axis[:, None, :], y_axis[:, None, :], z_axis[:, None, :]), dim=1)
|
|
return R.transpose(1, 2)
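
# Example (illustrative sketch): the rotation for a camera placed at
# (0, 0, -3) looking at the origin with +Y as the world up direction.
#
#     R = look_at_rotation(torch.tensor([[0.0, 0.0, -3.0]]))
#     # R is a (1, 3, 3) batch of rotation matrices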


def look_at_view_transform(
    dist: _BatchFloatType = 1.0,
    elev: _BatchFloatType = 0.0,
    azim: _BatchFloatType = 0.0,
    degrees: bool = True,
    eye: Optional[Union[Sequence, torch.Tensor]] = None,
    at=((0, 0, 0),),  # (1, 3)
    up=((0, 1, 0),),  # (1, 3)
    device: Device = "cpu",
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    This function returns a rotation and translation matrix
    to apply the 'Look At' transformation from world -> view coordinates [0].

    Args:
        dist: distance of the camera from the object
        elev: angle in degrees or radians. This is the angle between the
            vector from the object to the camera, and the horizontal plane y = 0 (xz-plane).
        azim: angle in degrees or radians. The vector from the object to
            the camera is projected onto a horizontal plane y = 0.
            azim is the angle between the projected vector and a
            reference vector at (0, 0, 1) on the reference plane (the horizontal plane).
        dist, elev and azim can be of shape (1), (N).
        degrees: boolean flag to indicate if the elevation and azimuth
            angles are specified in degrees or radians.
        eye: the position of the camera(s) in world coordinates. If eye is not
            None, it will override the camera position derived from dist, elev, azim.
        up: the direction of the up axis in the world coordinate system.
        at: the position of the object(s) in world coordinates.
        eye, up and at can be of shape (1, 3) or (N, 3).

    Returns:
        2-element tuple containing

        - **R**: the rotation to apply to the points to align with the camera.
        - **T**: the translation to apply to the points to align with the camera.

    References:
    [0] https://www.scratchapixel.com
    """

    if eye is not None:
        broadcasted_args = convert_to_tensors_and_broadcast(eye, at, up, device=device)
        eye, at, up = broadcasted_args
        C = eye
    else:
        broadcasted_args = convert_to_tensors_and_broadcast(
            dist, elev, azim, at, up, device=device
        )
        dist, elev, azim, at, up = broadcasted_args
        C = (
            camera_position_from_spherical_angles(
                dist, elev, azim, degrees=degrees, device=device
            )
            + at
        )

    R = look_at_rotation(C, at, up, device=device)
    T = -torch.bmm(R.transpose(1, 2), C[:, :, None])[:, :, 0]
    return R, T
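
# Example (illustrative sketch): two views of the origin at distance 2.7 and
# elevation 10 degrees, with azimuths 0 and 90 degrees. FoVPerspectiveCameras
# is assumed to be the camera class defined earlier in this module.
#
#     R, T = look_at_view_transform(
#         dist=2.7, elev=10.0, azim=torch.tensor([0.0, 90.0])
#     )
#     cameras = FoVPerspectiveCameras(R=R, T=T)  # R: (2, 3, 3), T: (2, 3)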


def get_ndc_to_screen_transform(
    cameras,
    with_xyflip: bool = False,
    image_size: Optional[Union[List, Tuple, torch.Tensor]] = None,
) -> Transform3d:
    """
    PyTorch3D NDC to screen conversion.
    Conversion from PyTorch3D's NDC space (+X left, +Y up) to screen/image space
    (+X right, +Y down, origin top left).

    Args:
        cameras: cameras object; only its batch size and device are used to
            construct the transform.
        with_xyflip: flips x- and y-axis if set to True.
    Optional kwargs:
        image_size: ((height, width),) specifying the height, width
            of the image. Required for this conversion; a ValueError is raised
            if it is not provided.

    We represent the NDC to screen conversion as a Transform3d
    with projection matrix

    K = [
            [s,   0,    0,  cx],
            [0,   s,    0,  cy],
            [0,   0,    1,   0],
            [0,   0,    0,   1],
    ]

    """
    # We require the image size, which is necessary for the transform
    if image_size is None:
        msg = "For NDC to screen conversion, image_size=(height, width) needs to be specified."
        raise ValueError(msg)

    K = torch.zeros((cameras._N, 4, 4), device=cameras.device, dtype=torch.float32)
    if not torch.is_tensor(image_size):
        image_size = torch.tensor(image_size, device=cameras.device)
    # pyre-fixme[16]: Item `List` of `Union[List[typing.Any], Tensor, Tuple[Any,
    #  ...]]` has no attribute `view`.
    image_size = image_size.view(-1, 2)  # of shape (1 or B)x2
    height, width = image_size.unbind(1)

    # For non square images, we scale the points such that smallest side
    # has range [-1, 1] and the largest side has range [-u, u], with u > 1.
    # This convention is consistent with the PyTorch3D renderer.
    scale = (image_size.min(dim=1).values - 0.0) / 2.0

    K[:, 0, 0] = scale
    K[:, 1, 1] = scale
    K[:, 0, 3] = -1.0 * (width - 0.0) / 2.0
    K[:, 1, 3] = -1.0 * (height - 0.0) / 2.0
    K[:, 2, 2] = 1.0
    K[:, 3, 3] = 1.0

    # Transpose the projection matrix as PyTorch3D transforms use row vectors.
    transform = Transform3d(
        matrix=K.transpose(1, 2).contiguous(), device=cameras.device
    )

    if with_xyflip:
        # flip x, y axis
        xyflip = torch.eye(4, device=cameras.device, dtype=torch.float32)
        xyflip[0, 0] = -1.0
        xyflip[1, 1] = -1.0
        xyflip = xyflip.view(1, 4, 4).expand(cameras._N, -1, -1)
        xyflip_transform = Transform3d(
            matrix=xyflip.transpose(1, 2).contiguous(), device=cameras.device
        )
        transform = transform.compose(xyflip_transform)
    return transform
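
# Example (illustrative sketch): map PyTorch3D NDC points to pixel coordinates
# of a 240 x 320 image, flipping the axes so that +X points right and +Y points
# down. PerspectiveCameras is assumed to be the camera class defined earlier in
# this module.
#
#     cameras = PerspectiveCameras()  # batch size 1
#     ndc_to_screen = get_ndc_to_screen_transform(
#         cameras, with_xyflip=True, image_size=((240, 320),)
#     )
#     pts = ndc_to_screen.transform_points(torch.tensor([[[0.0, 0.0, 1.0]]]))
#     # the NDC origin maps to the image center, (160.0, 120.0)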


def get_screen_to_ndc_transform(
    cameras,
    with_xyflip: bool = False,
    image_size: Optional[Union[List, Tuple, torch.Tensor]] = None,
) -> Transform3d:
    """
    Screen to PyTorch3D NDC conversion.
    Conversion from screen/image space (+X right, +Y down, origin top left)
    to PyTorch3D's NDC space (+X left, +Y up).

    Args:
        cameras: cameras object; only its batch size and device are used to
            construct the transform.
        with_xyflip: flips x- and y-axis if set to True.
    Optional kwargs:
        image_size: ((height, width),) specifying the height, width
            of the image. Required for this conversion; a ValueError is raised
            if it is not provided.

    We represent the screen to NDC conversion as a Transform3d
    with projection matrix

    K = [
            [1/s,    0,    0,  cx/s],
            [  0,  1/s,    0,  cy/s],
            [  0,    0,    1,     0],
            [  0,    0,    0,     1],
    ]

    """
    transform = get_ndc_to_screen_transform(
        cameras,
        with_xyflip=with_xyflip,
        image_size=image_size,
    ).inverse()
    return transform
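
# Example (illustrative sketch): the inverse mapping, from pixel coordinates of
# a 240 x 320 image back to PyTorch3D NDC (PerspectiveCameras as above).
#
#     cameras = PerspectiveCameras()
#     screen_to_ndc = get_screen_to_ndc_transform(
#         cameras, with_xyflip=True, image_size=((240, 320),)
#     )
#     pts = screen_to_ndc.transform_points(torch.tensor([[[160.0, 120.0, 1.0]]]))
#     # the image center maps back to (0.0, 0.0) in NDC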


def try_get_projection_transform(
    cameras: CamerasBase, cameras_kwargs: Dict[str, Any]
) -> Optional[Transform3d]:
    """
    Try block to get projection transform from cameras and cameras_kwargs.

    Args:
        cameras: cameras instance, can be linear or nonlinear cameras
        cameras_kwargs: camera parameters to be passed to cameras

    Returns:
        The projection transform if the camera implements
        get_projection_transform; otherwise None.
    """

    transform = None
    try:
        transform = cameras.get_projection_transform(**cameras_kwargs)
    except NotImplementedError:
        pass
    return transform
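
# Example (illustrative sketch): linear camera models such as PerspectiveCameras
# implement get_projection_transform, so a Transform3d is returned; a camera
# whose get_projection_transform raises NotImplementedError yields None instead.
#
#     cameras = PerspectiveCameras(focal_length=2.0)
#     proj = try_get_projection_transform(cameras, {})
#     # proj is a Transform3d (or None for cameras without an implementation)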