mirror of
https://github.com/facebookresearch/pytorch3d.git
synced 2025-12-22 23:30:35 +08:00
Camera alignment
Summary: adds `corresponding_cameras_alignment` function that estimates a similarity transformation between two sets of cameras. The function is essential for computing camera errors in SfM pipelines. ``` Benchmark Avg Time(μs) Peak Time(μs) Iterations -------------------------------------------------------------------------------- CORRESPONDING_CAMERAS_ALIGNMENT_10_centers_False 32219 36211 16 CORRESPONDING_CAMERAS_ALIGNMENT_10_centers_True 32429 36063 16 CORRESPONDING_CAMERAS_ALIGNMENT_10_extrinsics_False 5548 8782 91 CORRESPONDING_CAMERAS_ALIGNMENT_10_extrinsics_True 6153 9752 82 CORRESPONDING_CAMERAS_ALIGNMENT_100_centers_False 33344 40398 16 CORRESPONDING_CAMERAS_ALIGNMENT_100_centers_True 34528 37095 15 CORRESPONDING_CAMERAS_ALIGNMENT_100_extrinsics_False 5576 7187 90 CORRESPONDING_CAMERAS_ALIGNMENT_100_extrinsics_True 6256 9166 80 CORRESPONDING_CAMERAS_ALIGNMENT_1000_centers_False 32020 37247 16 CORRESPONDING_CAMERAS_ALIGNMENT_1000_centers_True 32776 37644 16 CORRESPONDING_CAMERAS_ALIGNMENT_1000_extrinsics_False 5336 8795 94 CORRESPONDING_CAMERAS_ALIGNMENT_1000_extrinsics_True 6266 9929 80 -------------------------------------------------------------------------------- ``` Reviewed By: shapovalov Differential Revision: D22946415 fbshipit-source-id: 8caae7ee365b304d8aa1f8133cf0dd92c35bc0dd
This commit is contained in:
committed by
Facebook GitHub Bot
parent
14f015d8bf
commit
316b77782e
@@ -1,6 +1,6 @@
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
|
||||
|
||||
|
||||
from .cameras_alignment import corresponding_cameras_alignment
|
||||
from .cubify import cubify
|
||||
from .graph_conv import GraphConv
|
||||
from .interp_face_attrs import interpolate_face_attributes
|
||||
|
||||
215
pytorch3d/ops/cameras_alignment.py
Normal file
215
pytorch3d/ops/cameras_alignment.py
Normal file
@@ -0,0 +1,215 @@
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
from .. import ops
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pytorch3d.renderer.cameras import CamerasBase
|
||||
|
||||
|
||||
def corresponding_cameras_alignment(
    cameras_src: "CamerasBase",
    cameras_tgt: "CamerasBase",
    estimate_scale: bool = True,
    mode: str = "extrinsics",
    eps: float = 1e-9,
) -> "CamerasBase":
    """
    .. warning::
        The `corresponding_cameras_alignment` API is experimental
        and subject to change!

    Estimate a single similarity transform `(R_A, T_A, s_A)` that aligns the
    source cameras `cameras_src` with the target cameras `cameras_tgt`, and
    return a transformed copy of `cameras_src`.

    Given source cameras [(R_1, T_1), ..., (R_N, T_N)] and target cameras
    [(R_1', T_1'), ..., (R_N', T_N')], a 3D point X' is mapped by the
    similarity transform to X with:
    ```
    X = (X' R_A + T_A) / s_A
    ```
    and for every camera `i` we require, up to a global scale s':
    ```
    X R_i + T_i = s' (X' R_i' + T_i')
    ```
    Choosing s' := 1 / s_A (without loss of generality) and substituting
    for X gives:
    ```
    X' R_A R_i + T_A R_i + T_i s_A = X' R_i' + T_i'
       ^^^^^^^   ^^^^^^^^^^^^^^^^^
       ~= R_i'       ~= T_i'
    ```
    so the aligned source cameras have extrinsics:
    `cameras_src_align = (R_A R_i, T_A R_i + T_i s_A) ~= (R_i', T_i')`.

    Two estimation modes are supported:
        1) `mode=='centers'`: aligns the camera centers with Umeyama's
           algorithm (see `pytorch3d.ops.corresponding_points_alignment`)
           and transforms the camera extrinsics accordingly.
        2) `mode=='extrinsics'`: solves, in the least-squares sense, the
           system:
           ```
           for all i:
               [ R_A   0 ] x [ R_i         0 ] = [ R_i' 0 ]
               [ T_A^T 1 ]   [ (s_A T_i^T) 1 ]   [ T_i' 1 ]
           ```

    The estimated camera transform is a true similarity transform, i.e. it
    cannot be a reflection.

    Args:
        cameras_src: `N` cameras to be aligned.
        cameras_tgt: `N` target cameras.
        estimate_scale: If `False` the alignment transform is rigid and
            `s_A` is set to `1`; if `True` a full similarity transform
            (with scale) is estimated.
        mode: Alignment algorithm; one of `'centers'` or `'extrinsics'`.
            See the description above for details.
        eps: A scalar for clamping to avoid dividing by zero. Active when
            `estimate_scale==True`.

    Returns:
        cameras_src_aligned: `cameras_src` after applying the alignment transform.
    """
    if cameras_src.R.shape[0] != cameras_tgt.R.shape[0]:
        raise ValueError(
            "cameras_src and cameras_tgt have to contain the same number of cameras!"
        )

    if mode == "centers":
        align_fun = _align_camera_centers
    elif mode == "extrinsics":
        align_fun = _align_camera_extrinsics
    else:
        raise ValueError("mode has to be one of (centers, extrinsics)")

    align_t_R, align_t_T, align_t_s = align_fun(
        cameras_src, cameras_tgt, estimate_scale=estimate_scale, eps=eps
    )

    n_cameras = cameras_src.R.shape[0]

    # Build the aligned camera set: R_i <- R_A R_i ; T_i <- T_A R_i + s_A T_i.
    cameras_src_aligned = cameras_src.clone()
    cameras_src_aligned.R = torch.bmm(
        align_t_R.expand_as(cameras_src.R), cameras_src.R
    )
    align_t_T_rotated = torch.bmm(
        align_t_T[:, None].repeat(n_cameras, 1, 1), cameras_src.R
    )[:, 0]
    cameras_src_aligned.T = align_t_T_rotated + cameras_src.T * align_t_s

    return cameras_src_aligned
|
||||
|
||||
|
||||
def _align_camera_centers(
    cameras_src: "CamerasBase",
    cameras_tgt: "CamerasBase",
    estimate_scale: bool = True,
    eps: float = 1e-9,
):
    """
    Use Umeyama's algorithm to align the optical centers of two camera sets
    and convert the resulting point alignment into the camera similarity
    transform `(R_A, T_A, s_A)`.

    Args:
        cameras_src: `N` cameras to be aligned.
        cameras_tgt: `N` target cameras.
        estimate_scale: Whether to estimate the scale component.
        eps: Clamping constant forwarded to the point alignment.

    Returns:
        Tuple of the alignment rotation, translation and scale.
    """
    src_centers = cameras_src.get_camera_center()
    tgt_centers = cameras_tgt.get_camera_center()
    similarity = ops.corresponding_points_alignment(
        src_centers[None],
        tgt_centers[None],
        estimate_scale=estimate_scale,
        allow_reflection=False,
        eps=eps,
    )
    # The camera transform is the inverse of the transform estimated
    # between the two sets of camera centers.
    align_t_R = similarity.R.permute(0, 2, 1)
    align_t_T = -torch.bmm(similarity.T[:, None], align_t_R)[:, 0]
    align_t_s = similarity.s[0]

    return align_t_R, align_t_T, align_t_s
|
||||
|
||||
|
||||
def _align_camera_extrinsics(
|
||||
cameras_src: "CamerasBase",
|
||||
cameras_tgt: "CamerasBase",
|
||||
estimate_scale: bool = True,
|
||||
eps: float = 1e-9,
|
||||
):
|
||||
"""
|
||||
Get the global rotation R_A with svd of cov(RR^T):
|
||||
```
|
||||
R_A R_i = R_i' for all i
|
||||
R_A [R_1 R_2 ... R_N] = [R_1' R_2' ... R_N']
|
||||
U, _, V = svd([R_1 R_2 ... R_N]^T [R_1' R_2' ... R_N'])
|
||||
R_A = (U V^T)^T
|
||||
```
|
||||
"""
|
||||
RRcov = torch.bmm(cameras_src.R, cameras_tgt.R.transpose(2, 1)).mean(0)
|
||||
U, _, V = torch.svd(RRcov)
|
||||
align_t_R = V @ U.t()
|
||||
|
||||
"""
|
||||
The translation + scale `T_A` and `s_A` is computed by finding
|
||||
a translation and scaling that aligns two tensors `A, B`
|
||||
defined as follows:
|
||||
```
|
||||
T_A R_i + s_A T_i = T_i' ; for all i // · R_i^T
|
||||
s_A T_i R_i^T + T_A = T_i' R_i^T ; for all i
|
||||
^^^^^^^^^ ^^^^^^^^^^
|
||||
A_i B_i
|
||||
|
||||
A_i := T_i R_i^T
|
||||
A = [A_1 A_2 ... A_N]
|
||||
B_i := T_i' R_i^T
|
||||
B = [B_1 B_2 ... B_N]
|
||||
```
|
||||
The scale s_A can be retrieved by matching the correlations of
|
||||
the points sets A and B:
|
||||
```
|
||||
s_A = (A-mean(A))*(B-mean(B)).sum() / ((A-mean(A))**2).sum()
|
||||
```
|
||||
The translation `T_A` is then defined as:
|
||||
```
|
||||
T_A = mean(B) - mean(A) * s_A
|
||||
```
|
||||
"""
|
||||
A = torch.bmm(cameras_src.R, cameras_src.T[:, :, None])[:, :, 0]
|
||||
B = torch.bmm(cameras_src.R, cameras_tgt.T[:, :, None])[:, :, 0]
|
||||
Amu = A.mean(0, keepdim=True)
|
||||
Bmu = B.mean(0, keepdim=True)
|
||||
if estimate_scale and A.shape[0] > 1:
|
||||
# get the scaling component by matching covariances
|
||||
# of centered A and centered B
|
||||
Ac = A - Amu
|
||||
Bc = B - Bmu
|
||||
align_t_s = (Ac * Bc).mean() / (Ac ** 2).mean().clamp(eps)
|
||||
else:
|
||||
# set the scale to identity
|
||||
align_t_s = 1.0
|
||||
# get the translation as the difference between the means of A and B
|
||||
align_t_T = Bmu - align_t_s * Amu
|
||||
|
||||
return align_t_R, align_t_T, align_t_s
|
||||
@@ -13,8 +13,8 @@ from .utils import TensorProperties, convert_to_tensors_and_broadcast
|
||||
|
||||
|
||||
# Default values for rotation and translation matrices.
|
||||
r = np.expand_dims(np.eye(3), axis=0) # (1, 3, 3)
|
||||
t = np.expand_dims(np.zeros(3), axis=0) # (1, 3)
|
||||
_R = torch.eye(3)[None] # (1, 3, 3)
|
||||
_T = torch.zeros(1, 3) # (1, 3)
|
||||
|
||||
|
||||
class CamerasBase(TensorProperties):
|
||||
@@ -280,8 +280,8 @@ def OpenGLPerspectiveCameras(
|
||||
aspect_ratio=1.0,
|
||||
fov=60.0,
|
||||
degrees: bool = True,
|
||||
R=r,
|
||||
T=t,
|
||||
R=_R,
|
||||
T=_T,
|
||||
device="cpu",
|
||||
):
|
||||
"""
|
||||
@@ -331,8 +331,8 @@ class FoVPerspectiveCameras(CamerasBase):
|
||||
aspect_ratio=1.0,
|
||||
fov=60.0,
|
||||
degrees: bool = True,
|
||||
R=r,
|
||||
T=t,
|
||||
R=_R,
|
||||
T=_T,
|
||||
device="cpu",
|
||||
):
|
||||
"""
|
||||
@@ -436,7 +436,7 @@ class FoVPerspectiveCameras(CamerasBase):
|
||||
P[:, 2, 2] = z_sign * zfar / (zfar - znear)
|
||||
P[:, 2, 3] = -(zfar * znear) / (zfar - znear)
|
||||
|
||||
# Transpose the projection matrix as PyTorch3d transforms use row vectors.
|
||||
# Transpose the projection matrix as PyTorch3D transforms use row vectors.
|
||||
transform = Transform3d(device=self.device)
|
||||
transform._matrix = P.transpose(1, 2).contiguous()
|
||||
return transform
|
||||
@@ -494,8 +494,8 @@ def OpenGLOrthographicCameras(
|
||||
left=-1.0,
|
||||
right=1.0,
|
||||
scale_xyz=((1.0, 1.0, 1.0),), # (1, 3)
|
||||
R=r,
|
||||
T=t,
|
||||
R=_R,
|
||||
T=_T,
|
||||
device="cpu",
|
||||
):
|
||||
"""
|
||||
@@ -540,8 +540,8 @@ class FoVOrthographicCameras(CamerasBase):
|
||||
max_x=1.0,
|
||||
min_x=-1.0,
|
||||
scale_xyz=((1.0, 1.0, 1.0),), # (1, 3)
|
||||
R=r,
|
||||
T=t,
|
||||
R=_R,
|
||||
T=_T,
|
||||
device="cpu",
|
||||
):
|
||||
"""
|
||||
@@ -688,7 +688,7 @@ we assume the parameters are in screen space.
|
||||
|
||||
|
||||
def SfMPerspectiveCameras(
|
||||
focal_length=1.0, principal_point=((0.0, 0.0),), R=r, T=t, device="cpu"
|
||||
focal_length=1.0, principal_point=((0.0, 0.0),), R=_R, T=_T, device="cpu"
|
||||
):
|
||||
"""
|
||||
SfMPerspectiveCameras has been DEPRECATED. Use PerspectiveCameras instead.
|
||||
@@ -747,8 +747,8 @@ class PerspectiveCameras(CamerasBase):
|
||||
self,
|
||||
focal_length=1.0,
|
||||
principal_point=((0.0, 0.0),),
|
||||
R=r,
|
||||
T=t,
|
||||
R=_R,
|
||||
T=_T,
|
||||
device="cpu",
|
||||
image_size=((-1, -1),),
|
||||
):
|
||||
@@ -848,7 +848,7 @@ class PerspectiveCameras(CamerasBase):
|
||||
|
||||
|
||||
def SfMOrthographicCameras(
|
||||
focal_length=1.0, principal_point=((0.0, 0.0),), R=r, T=t, device="cpu"
|
||||
focal_length=1.0, principal_point=((0.0, 0.0),), R=_R, T=_T, device="cpu"
|
||||
):
|
||||
"""
|
||||
SfMOrthographicCameras has been DEPRECATED. Use OrthographicCameras instead.
|
||||
@@ -906,8 +906,8 @@ class OrthographicCameras(CamerasBase):
|
||||
self,
|
||||
focal_length=1.0,
|
||||
principal_point=((0.0, 0.0),),
|
||||
R=r,
|
||||
T=t,
|
||||
R=_R,
|
||||
T=_T,
|
||||
device="cpu",
|
||||
image_size=((-1, -1),),
|
||||
):
|
||||
@@ -1109,7 +1109,7 @@ def _get_sfm_calibration_matrix(
|
||||
################################################
|
||||
|
||||
|
||||
def get_world_to_view_transform(R=r, T=t) -> Transform3d:
|
||||
def get_world_to_view_transform(R=_R, T=_T) -> Transform3d:
|
||||
"""
|
||||
This function returns a Transform3d representing the transformation
|
||||
matrix to go from world space to view space by applying a rotation and
|
||||
|
||||
Reference in New Issue
Block a user