# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import copy
import warnings
from collections import OrderedDict
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Sequence, Tuple, TYPE_CHECKING, Union

import numpy as np
import torch
import torch.nn.functional as F
from pytorch3d.implicitron.dataset.frame_data import FrameData
from pytorch3d.implicitron.dataset.utils import is_train_frame
from pytorch3d.implicitron.models.base_model import ImplicitronRender
from pytorch3d.implicitron.tools import vis_utils
from pytorch3d.implicitron.tools.image_utils import mask_background
from pytorch3d.implicitron.tools.metric_utils import calc_psnr, eval_depth, iou, rgb_l1
from pytorch3d.implicitron.tools.point_cloud_utils import get_rgbd_point_cloud
from pytorch3d.implicitron.tools.vis_utils import make_depth_image
from pytorch3d.renderer.cameras import PerspectiveCameras
from pytorch3d.vis.plotly_vis import plot_scene
from tabulate import tabulate


if TYPE_CHECKING:
from visdom import Visdom
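
# Source-view counts at which per-count metrics are reported separately
# in the multisequence setting (see `summarize_nvs_eval_results`).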
EVAL_N_SRC_VIEWS = [1, 3, 5, 7, 9]


@dataclass
class _Visualizer:
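    """
    A helper for visualizing the results of `eval_batch` in Visdom.

    Stores the rendered and ground-truth images/depths of the evaluation
    target view and provides methods for plotting the RGB metrics, the
    depth metrics, and the corresponding 3D point clouds.
    """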
image_render: torch.Tensor
image_rgb_masked: torch.Tensor
depth_render: torch.Tensor
depth_map: Optional[torch.Tensor]
depth_mask: Optional[torch.Tensor]
visdom_env: str = "eval_debug"
_viz: Optional["Visdom"] = field(init=False)

    def __post_init__(self):
self._viz = vis_utils.get_visdom_connection()

    def show_rgb(
self, loss_value: float, metric_name: str, loss_mask_now: torch.Tensor
):
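        """
        Show the render, the masked ground-truth image, and the loss mask
        side by side in Visdom, titled with `metric_name` and `loss_value`.
        """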
if self._viz is None:
return
self._viz.images(
torch.cat(
(
self.image_render,
self.image_rgb_masked,
loss_mask_now.repeat(1, 3, 1, 1),
),
dim=3,
),
env=self.visdom_env,
win=metric_name,
opts={"title": f"{metric_name}_{loss_value:1.2f}"},
)

    def show_depth(
self, depth_loss: float, name_postfix: str, loss_mask_now: torch.Tensor
):
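        """
        Show the rendered (and, if available, ground-truth) depth maps and
        the depth-loss masks in Visdom, together with a 3D plot of the
        corresponding point clouds.
        """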
if self._viz is None:
return
viz = self._viz
viz.images(
torch.cat(
(make_depth_image(self.depth_render, loss_mask_now),)
+ (
(make_depth_image(self.depth_map, loss_mask_now),)
if self.depth_map is not None
else ()
),
dim=3,
),
env=self.visdom_env,
win="depth_abs" + name_postfix,
opts={"title": f"depth_abs_{name_postfix}_{depth_loss:1.2f}"},
)
viz.images(
loss_mask_now,
env=self.visdom_env,
win="depth_abs" + name_postfix + "_mask",
opts={"title": f"depth_abs_{name_postfix}_{depth_loss:1.2f}_mask"},
)
if self.depth_mask is not None:
viz.images(
self.depth_mask,
env=self.visdom_env,
win="depth_abs" + name_postfix + "_maskd",
opts={"title": f"depth_abs_{name_postfix}_{depth_loss:1.2f}_maskd"},
)
# show the 3D plot
# pyre-fixme[9]: viewpoint_trivial has type `PerspectiveCameras`; used as
# `TensorProperties`.
viewpoint_trivial: PerspectiveCameras = PerspectiveCameras().to(
loss_mask_now.device
)
_pcls = {
"pred_depth": get_rgbd_point_cloud(
viewpoint_trivial,
self.image_render,
self.depth_render,
# mask_crop,
torch.ones_like(self.depth_render),
# loss_mask_now,
)
}
if self.depth_map is not None:
_pcls["gt_depth"] = get_rgbd_point_cloud(
viewpoint_trivial,
self.image_rgb_masked,
self.depth_map,
# mask_crop,
torch.ones_like(self.depth_map),
# loss_mask_now,
)
_pcls = {pn: p for pn, p in _pcls.items() if int(p.num_points_per_cloud()) > 0}
plotlyplot = plot_scene(
{f"pcl{name_postfix}": _pcls}, # pyre-ignore
camera_scale=1.0,
pointcloud_max_points=10000,
pointcloud_marker_size=1,
)
viz.plotlyplot(
plotlyplot,
env=self.visdom_env,
win=f"pcl{name_postfix}",
)


def eval_batch(
frame_data: FrameData,
implicitron_render: ImplicitronRender,
bg_color: Union[torch.Tensor, Sequence, str, float] = "black",
mask_thr: float = 0.5,
lpips_model=None,
visualize: bool = False,
visualize_visdom_env: str = "eval_debug",
break_after_visualising: bool = True,
) -> Dict[str, Any]:
"""
    Produce performance metrics for a single batch of new-view synthesis
    predictions.

    Given a set of known views (for which frame_data.frame_type.endswith('known')
    is True), a new-view synthesis method (NVS) is tasked to generate new views
    of the scene from the viewpoints of the target views (for which
    frame_data.frame_type.endswith('known') is False). The resulting
    synthesized new views, stored in `implicitron_render`, are compared to the
    target ground truth in `frame_data` in terms of geometry and appearance,
    resulting in a dictionary of metrics returned by the `eval_batch` function.

Args:
frame_data: A FrameData object containing the input to the new view
synthesis method.
implicitron_render: The data describing the synthesized new views.
        bg_color: The background color of the generated new views and the
            ground truth.
        mask_thr: The threshold for binarizing `frame_data.fg_probability`
            into the ground-truth foreground mask.
        lpips_model: A pre-trained model for evaluating the LPIPS metric.
        visualize: If True, visualizes the results to Visdom.
        visualize_visdom_env: The Visdom environment that receives the
            visualizations.
        break_after_visualising: If True, enters a breakpoint after
            visualizing each metric.

    Returns:
        results: A dictionary holding evaluation metrics.

    Raises:
ValueError if frame_data does not have frame_type, camera, or image_rgb
ValueError if the batch has a mix of training and test samples
ValueError if the batch frames are not [unseen, known, known, ...]
ValueError if one of the required fields in implicitron_render is missing
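
    Example:
        A minimal sketch of an evaluation loop; here `dataloader` and `model`
        are hypothetical stand-ins for an Implicitron evaluation dataloader
        and a model producing `ImplicitronRender` objects::

            per_batch_results = []
            for frame_data in dataloader:
                render = model(frame_data)
                per_batch_results.append(eval_batch(frame_data, render))
            print(average_per_batch_results(per_batch_results))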
"""
frame_type = frame_data.frame_type
if frame_type is None:
raise ValueError("Frame type has not been set.")
# we check that all those fields are not None but Pyre can't infer that properly
# TODO: assign to local variables and simplify the code.
if frame_data.image_rgb is None:
raise ValueError("Image is not in the evaluation batch.")
if frame_data.camera is None:
raise ValueError("Camera is not in the evaluation batch.")
# eval all results in the resolution of the frame_data image
image_resol = tuple(frame_data.image_rgb.shape[2:])
# Post-process the render:
# 1) check implicitron_render for Nones,
    # 2) obtain copies to make sure we don't edit the original data,
# 3) take only the 1st (target) image
# 4) resize to match ground-truth resolution
cloned_render: Dict[str, torch.Tensor] = {}
for k in ["mask_render", "image_render", "depth_render"]:
field = getattr(implicitron_render, k)
if field is None:
raise ValueError(f"A required predicted field {k} is missing")
imode = "bilinear" if k == "image_render" else "nearest"
cloned_render[k] = (
F.interpolate(field[:1], size=image_resol, mode=imode).detach().clone()
)
frame_data = copy.deepcopy(frame_data)
# mask the ground truth depth in case frame_data contains the depth mask
if frame_data.depth_map is not None and frame_data.depth_mask is not None:
frame_data.depth_map *= frame_data.depth_mask
if not isinstance(frame_type, list): # not batch FrameData
frame_type = [frame_type]
is_train = is_train_frame(frame_type)
if len(is_train) > 1 and (is_train[1] != is_train[1:]).any():
raise ValueError(
"All (conditioning) frames in the eval batch have to be either train/test."
)
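    # keep only the first (target) frame in each of the ground-truth fields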
for k in [
"depth_map",
"image_rgb",
"fg_probability",
"mask_crop",
]:
if not hasattr(frame_data, k) or getattr(frame_data, k) is None:
continue
setattr(frame_data, k, getattr(frame_data, k)[:1])
if frame_data.depth_map is None or frame_data.depth_map.sum() <= 0:
warnings.warn("Empty or missing depth map in evaluation!")
if frame_data.mask_crop is None:
warnings.warn("mask_crop is None, assuming the whole image is valid.")
if frame_data.fg_probability is None:
warnings.warn("fg_probability is None, assuming the whole image is fg.")
# threshold the masks to make ground truth binary masks
mask_fg = (
frame_data.fg_probability >= mask_thr
if frame_data.fg_probability is not None
# pyre-ignore [16]
else torch.ones_like(frame_data.image_rgb[:, :1, ...]).bool()
)
mask_crop = (
frame_data.mask_crop
if frame_data.mask_crop is not None
else torch.ones_like(mask_fg)
)
# unmasked g.t. image
image_rgb = frame_data.image_rgb
# fg-masked g.t. image
image_rgb_masked = mask_background(
# pyre-fixme[6]: Expected `Tensor` for 1st param but got
# `Optional[torch.Tensor]`.
frame_data.image_rgb,
mask_fg,
bg_color=bg_color,
)
# clamp predicted images
image_render = cloned_render["image_render"].clamp(0.0, 1.0)
if visualize:
visualizer = _Visualizer(
image_render=image_render,
image_rgb_masked=image_rgb_masked,
depth_render=cloned_render["depth_render"],
depth_map=frame_data.depth_map,
depth_mask=(
frame_data.depth_mask[:1] if frame_data.depth_mask is not None else None
),
visdom_env=visualize_visdom_env,
)
results: Dict[str, Any] = {}
results["iou"] = iou(
cloned_render["mask_render"],
mask_fg,
mask=mask_crop,
)
for loss_fg_mask, name_postfix in zip((mask_crop, mask_fg), ("_masked", "_fg")):
loss_mask_now = mask_crop * loss_fg_mask
for rgb_metric_name, rgb_metric_fun in zip(
("psnr", "rgb_l1"), (calc_psnr, rgb_l1)
):
metric_name = rgb_metric_name + name_postfix
results[metric_name] = rgb_metric_fun(
image_render,
image_rgb_masked,
mask=loss_mask_now,
)
if visualize:
visualizer.show_rgb(
results[metric_name].item(), metric_name, loss_mask_now
)
if name_postfix == "_fg" and frame_data.depth_map is not None:
# only record depth metrics for the foreground
_, abs_ = eval_depth(
cloned_render["depth_render"],
# pyre-fixme[6]: For 2nd param expected `Tensor` but got
# `Optional[Tensor]`.
frame_data.depth_map,
get_best_scale=True,
mask=loss_mask_now,
crop=5,
)
results["depth_abs" + name_postfix] = abs_.mean()
if visualize:
visualizer.show_depth(abs_.mean().item(), name_postfix, loss_mask_now)
if break_after_visualising:
breakpoint() # noqa: B601
# add the rgb metrics between the render and the unmasked image
for rgb_metric_name, rgb_metric_fun in zip(
("psnr_full_image", "rgb_l1_full_image"), (calc_psnr, rgb_l1)
):
results[rgb_metric_name] = rgb_metric_fun(
image_render,
# pyre-fixme[6]: For 2nd argument expected `Tensor` but got
# `Optional[Tensor]`.
image_rgb,
mask=mask_crop,
)
if lpips_model is not None:
for gt_image_type in ("_full_image", "_masked"):
im1, im2 = [
2.0 * im.clamp(0.0, 1.0) - 1.0 # pyre-ignore[16]
for im in (
image_rgb_masked if gt_image_type == "_masked" else image_rgb,
cloned_render["image_render"],
)
]
results["lpips" + gt_image_type] = lpips_model.forward(im1, im2).item()
# convert all metrics to floats
results = {k: float(v) for k, v in results.items()}
results["meta"] = {
# store the size of the batch (corresponds to n_src_views+1)
"batch_size": len(frame_type),
# store the type of the target frame
# pyre-fixme[16]: `None` has no attribute `__getitem__`.
"frame_type": str(frame_data.frame_type[0]),
}
return results


def average_per_batch_results(
results_per_batch: List[Dict[str, Any]],
idx: Optional[torch.Tensor] = None,
) -> dict:
"""
Average a list of per-batch metrics `results_per_batch`.
Optionally, if `idx` is given, only a subset of the per-batch
metrics, indexed by `idx`, is averaged.
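
    Example (with hypothetical per-batch metric dicts)::

        results = [
            {"psnr_fg": 20.0, "meta": {}},
            {"psnr_fg": 30.0, "meta": {}},
        ]
        average_per_batch_results(results)  # == {"psnr_fg": 25.0}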
"""
result_keys = list(results_per_batch[0].keys())
result_keys.remove("meta")
if idx is not None:
results_per_batch = [results_per_batch[i] for i in idx]
if len(results_per_batch) == 0:
return {k: float("NaN") for k in result_keys}
return {
k: float(np.array([r[k] for r in results_per_batch]).mean())
for k in result_keys
}


def _reduce_camera_iou_overlap(ious: torch.Tensor, topk: int = 2) -> torch.Tensor:
"""
    Calculate the final camera difficulty by computing the average of the
    IoUs of the `topk` most similar cameras.
Returns:
single-element Tensor
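
    Example::

        # the top-2 IoUs of [0.9, 0.5, 0.7] are 0.9 and 0.7
        _reduce_camera_iou_overlap(torch.tensor([0.9, 0.5, 0.7]))  # tensor(0.8000)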
"""
return ious.topk(k=min(topk, len(ious) - 1)).values.mean()


def _get_camera_difficulty_bin_edges(camera_difficulty_bin_breaks: Tuple[float, float]):
"""
Get the edges of camera difficulty bins.
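
    Example::

        # with hypothetical bin breaks (0.97, 0.98):
        edges, names = _get_camera_difficulty_bin_edges((0.97, 0.98))
        # edges ~ [-1e-5, 0.97, 0.98, 1 + 1e-5]; names == ["hard", "medium", "easy"]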
"""
_eps = 1e-5
lower, upper = camera_difficulty_bin_breaks
diff_bin_edges = torch.tensor([0.0 - _eps, lower, upper, 1.0 + _eps]).float()
diff_bin_names = ["hard", "medium", "easy"]
return diff_bin_edges, diff_bin_names


def summarize_nvs_eval_results(
per_batch_eval_results: List[Dict[str, Any]],
is_multisequence: bool,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""
Compile the per-batch evaluation results `per_batch_eval_results` into
    a set of aggregate metrics. The produced metrics depend on `is_multisequence`.

    Args:
        per_batch_eval_results: Metrics of each per-batch evaluation.
        is_multisequence: Whether to evaluate as a multisequence task.

Returns:
nvs_results_flat: A flattened dict of all aggregate metrics.
aux_out: A dictionary holding a set of auxiliary results.
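
    Example::

        # `per_batch_eval_results` is a list of `eval_batch` outputs;
        # the flat keys look like "psnr_fg|subset=test|diff=all"
        nvs_results_flat, aux_out = summarize_nvs_eval_results(
            per_batch_eval_results, is_multisequence=False
        )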
"""
n_batches = len(per_batch_eval_results)
    eval_sets: List[Optional[str]] = [None]
if is_multisequence:
eval_sets = ["train", "test"]
batch_sizes = torch.tensor(
[r["meta"]["batch_size"] for r in per_batch_eval_results]
).long()
is_train = is_train_frame([r["meta"]["frame_type"] for r in per_batch_eval_results])
    # initialize the list of per-subset results
    results = []
    # add per-set averages
for SET in eval_sets:
if SET is None:
ok_set = torch.ones(n_batches, dtype=torch.bool)
set_name = "test"
else:
ok_set = is_train == int(SET == "train")
set_name = SET
# average over all results
bin_results = average_per_batch_results(
per_batch_eval_results, idx=torch.where(ok_set)[0]
)
results.append(
{
"subset": set_name,
"subsubset": "diff=all",
"metrics": bin_results,
}
)
if is_multisequence:
# split based on n_src_views
n_src_views = batch_sizes - 1
for n_src in EVAL_N_SRC_VIEWS:
ok_src = ok_set & (n_src_views == n_src)
n_src_results = average_per_batch_results(
per_batch_eval_results,
idx=torch.where(ok_src)[0],
)
results.append(
{
"subset": set_name,
"subsubset": f"n_src={int(n_src)}",
"metrics": n_src_results,
}
)
aux_out = {"results": results}
return flatten_nvs_results(results), aux_out


def _get_flat_nvs_metric_key(result, metric_name) -> str:
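    """Compose the flat metric key, e.g. "psnr|subset=test|diff=all"."""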
metric_key_postfix = f"|subset={result['subset']}|{result['subsubset']}"
metric_key = f"{metric_name}{metric_key_postfix}"
return metric_key


def flatten_nvs_results(results) -> Dict[str, Any]:
"""
    Takes an input `results` list of dicts of the form::

        [
            {
                'subset': 'train'/'test'/...,
                'subsubset': 'diff=all'/'n_src=1'/...,
                'metrics': nvs_eval_metrics,
            },
            ...
        ]

    and converts it to a flat dict with one key per metric, of the form
    '<metric_name>|subset=<subset>|<subsubset>'::

        {
            'psnr|subset=test|diff=all': psnr_value,
            ...
        }
"""
results_flat = {}
for result in results:
for metric_name, metric_val in result["metrics"].items():
metric_key = _get_flat_nvs_metric_key(result, metric_name)
assert metric_key not in results_flat
results_flat[metric_key] = metric_val
return results_flat


def pretty_print_nvs_metrics(results) -> None:
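    """
    Print the metrics in `results` as one table per subset, with a row per
    metric and a column per subsubset (e.g. camera difficulty or number of
    source views).
    """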
subsets, subsubsets = [
_ordered_set([r[k] for r in results]) for k in ("subset", "subsubset")
]
metrics = _ordered_set([metric for r in results for metric in r["metrics"]])
for subset in subsets:
tab = {}
        header = ["metric", *subsubsets]
        for metric in metrics:
            tab[metric] = []
for subsubset in subsubsets:
metric_vals = [
r["metrics"][metric]
for r in results
if r["subsubset"] == subsubset and r["subset"] == subset
]
if len(metric_vals) > 0:
tab[metric].extend(metric_vals)
if any(len(v) > 0 for v in tab.values()):
print(f"===== NVS results; subset={subset} =====")
print(
tabulate(
[[metric, *v] for metric, v in tab.items()],
headers=header,
)
)


def _ordered_set(list_):
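    """Deduplicate `list_`, keeping the order of first occurrence."""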
return list(OrderedDict((i, 0) for i in list_).keys())


def aggregate_nvs_results(task_results):
"""
    Aggregate NVS results.

    For the single-scene setting, this averages over all categories and scenes;
    for the multi-scene setting, the average is over all per-category results.
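
    Example::

        # `task_results` is a list of per-task lists of result dicts, as
        # produced by `summarize_nvs_eval_results` (the "results" of `aux_out`)
        average_results = aggregate_nvs_results(task_results)
        pretty_print_nvs_metrics(average_results)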
"""
task_results_cat = [r_ for r in task_results for r_ in r]
subsets, subsubsets = [
_ordered_set([r[k] for r in task_results_cat]) for k in ("subset", "subsubset")
]
metrics = _ordered_set(
[metric for r in task_results_cat for metric in r["metrics"]]
)
average_results = []
for subset in subsets:
for subsubset in subsubsets:
metrics_lists = [
r["metrics"]
for r in task_results_cat
if r["subsubset"] == subsubset and r["subset"] == subset
]
avg_metrics = {}
for metric in metrics:
avg_metrics[metric] = float(
np.nanmean(
np.array([metric_list[metric] for metric_list in metrics_lists])
)
)
average_results.append(
{
"subset": subset,
"subsubset": subsubset,
"metrics": avg_metrics,
}
)
return average_results