This commit is contained in:
hyz317
2025-05-07 16:51:22 +08:00
commit 87c3ed5e40
54 changed files with 8014 additions and 0 deletions

View File

@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

View File

@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

View File

@@ -0,0 +1,483 @@
# -*- coding: utf-8 -*-
from omegaconf import DictConfig
from typing import List, Tuple, Dict, Optional, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_only
from einops import rearrange
from diffusers.schedulers import (
DDPMScheduler,
DDIMScheduler,
KarrasVeScheduler,
DPMSolverMultistepScheduler
)
from ...utils import instantiate_from_config
# from ..tsal.tsal_base import ShapeAsLatentPLModule
from ..tsal.tsal_base import AlignedShapeAsLatentPLModule
from .inference_utils import ddim_sample
# Scheduler classes named by the type hint of the denoising (sampling) scheduler.
SchedulerType = Union[DDIMScheduler, KarrasVeScheduler, DPMSolverMultistepScheduler]
def disabled_train(self, mode=True):
    """Stand-in for ``nn.Module.train`` that never changes the train/eval mode.

    It is assigned over the ``train`` attribute of frozen models so later mode
    switches leave them untouched.

    Args:
        self: first positional argument; handed back unchanged.
        mode: accepted for signature compatibility and ignored.

    Returns:
        The ``self`` argument, unchanged.
    """
    return self
class ASLDiffuser(pl.LightningModule):
    """Conditional latent diffusion over the latents of an aligned shape-as-latent model.

    The first-stage model is built lazily (on first use) from ``first_stage_config``.
    It encodes/decodes the shape latents and also supplies the image / text / surface
    embedders used as conditions. Only the denoiser ``self.model`` is handed to the
    optimizer.
    """

    first_stage_model: Optional[AlignedShapeAsLatentPLModule]
    model: nn.Module

    def __init__(self, *,
                 first_stage_config,
                 denoiser_cfg,
                 scheduler_cfg,
                 optimizer_cfg,
                 loss_cfg,
                 first_stage_key: str = "surface",
                 cond_stage_key: str = "image",
                 cond_stage_trainable: bool = True,
                 scale_by_std: bool = False,
                 z_scale_factor: float = 1.0,
                 ckpt_path: Optional[str] = None,
                 ignore_keys: Union[Tuple[str], List[str]] = ()):
        """Build the denoiser and schedulers; the first stage is created on first use.

        Args:
            first_stage_config: config passed to ``instantiate_from_config`` to build
                the first-stage model when it is first needed.
            denoiser_cfg: config of the denoising network.
            scheduler_cfg: must expose ``noise`` and ``denoise`` scheduler configs;
                ``sample`` also reads ``num_inference_steps`` and ``guidance_scale``.
            optimizer_cfg: ``None`` for the built-in AdamW, otherwise a config with
                ``optimizer`` and ``scheduler`` entries.
            loss_cfg: must expose ``loss_type`` ("l1", "mse" or "l2").
            first_stage_key: batch key of the data encoded into latents.
            cond_stage_key: batch key of the condition; also selects the embedder.
            cond_stage_trainable: stored on the instance; not otherwise used here.
            scale_by_std: if True, ``z_scale_factor`` is kept as a buffer and is
                re-estimated from the first training batch.
            z_scale_factor: multiplier applied to first-stage latents.
            ckpt_path: optional checkpoint to restore from.
            ignore_keys: state-dict key prefixes to drop when restoring.
        """
        super().__init__()
        self.first_stage_key = first_stage_key
        self.cond_stage_key = cond_stage_key
        self.cond_stage_trainable = cond_stage_trainable

        # 1. first stage: only the config is kept, the model is instantiated lazily.
        # Note: the condition model is contained in the first stage model.
        self.first_stage_config = first_stage_config
        self.first_stage_model = None

        # 2. conditional stage: a dispatch table keyed by condition type.
        self.cond_stage_model = {
            "image": self.encode_image,
            "image_unconditional_embedding": self.empty_img_cond,
            "text": self.encode_text,
            "text_unconditional_embedding": self.empty_text_cond,
            "surface": self.encode_surface,
            "surface_unconditional_embedding": self.empty_surface_cond,
        }

        # 3. diffusion model
        self.model = instantiate_from_config(
            denoiser_cfg, device=None, dtype=None
        )
        self.optimizer_cfg = optimizer_cfg

        # 4. scheduling strategy
        self.scheduler_cfg = scheduler_cfg
        self.noise_scheduler: DDPMScheduler = instantiate_from_config(scheduler_cfg.noise)
        self.denoise_scheduler: SchedulerType = instantiate_from_config(scheduler_cfg.denoise)

        # 5. loss configuration
        self.loss_cfg = loss_cfg
        self.scale_by_std = scale_by_std
        if scale_by_std:
            self.register_buffer("z_scale_factor", torch.tensor(z_scale_factor))
        else:
            self.z_scale_factor = z_scale_factor

        self.ckpt_path = ckpt_path
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def instantiate_first_stage(self, config):
        """Build the first-stage model from ``config``, freeze it and move it to ``self.device``."""
        model = instantiate_from_config(config)
        self.first_stage_model = model.eval()
        # The plain function is stored on the instance, so it is called unbound:
        # whatever is passed as ``mode`` lands in its ``self`` slot and is returned.
        self.first_stage_model.train = disabled_train
        for param in self.first_stage_model.parameters():
            param.requires_grad = False
        self.first_stage_model = self.first_stage_model.to(self.device)

    def _ensure_first_stage(self):
        """Instantiate the first-stage model if it has not been built yet."""
        if self.first_stage_model is None:
            self.instantiate_first_stage(self.first_stage_config)

    def init_from_ckpt(self, path, ignore_keys=()):
        """Load ``state_dict`` from the checkpoint at ``path`` non-strictly.

        Args:
            path: checkpoint file containing a ``"state_dict"`` entry.
            ignore_keys: key prefixes; every matching entry is dropped before loading.
        """
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        state_dict = torch.load(path, map_location="cpu")["state_dict"]
        ignored_prefixes = tuple(ignore_keys)
        for key in list(state_dict.keys()):
            # A key is removed once even if several prefixes match it; deleting it
            # once per matching prefix raised KeyError on the second match.
            if key.startswith(ignored_prefixes):
                print("Deleting key {} from state_dict.".format(key))
                del state_dict[key]

        missing, unexpected = self.load_state_dict(state_dict, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
            print(f"Unexpected Keys: {unexpected}")

    @property
    def zero_rank(self):
        """True when no trainer is attached or the trainer's local rank is 0."""
        if self._trainer:
            return self.trainer.local_rank == 0
        return True

    def configure_optimizers(self) -> Tuple[List, List]:
        """Create the optimizer (and optional per-step LR schedule) for the denoiser.

        ``self.learning_rate`` is not set in this class and must be assigned before
        this method runs.

        Returns:
            ``(optimizers, schedulers)`` lists in the Lightning format.
        """
        lr = self.learning_rate
        trainable_parameters = list(self.model.parameters())

        if self.optimizer_cfg is None:
            optimizers = [torch.optim.AdamW(trainable_parameters, lr=lr, betas=(0.9, 0.99), weight_decay=1e-3)]
            schedulers = []
        else:
            optimizer = instantiate_from_config(self.optimizer_cfg.optimizer, params=trainable_parameters)
            scheduler_func = instantiate_from_config(
                self.optimizer_cfg.scheduler,
                max_decay_steps=self.trainer.max_steps,
                lr_max=lr
            )
            scheduler = {
                "scheduler": lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler_func.schedule),
                "interval": "step",
                "frequency": 1
            }
            optimizers = [optimizer]
            schedulers = [scheduler]

        return optimizers, schedulers

    @torch.no_grad()
    def encode_text(self, text):
        """Embed token batches shaped ``[b, t, l]`` and average the ``t`` embeddings per sample.

        Returns:
            L2-normalized embeddings shaped ``[b, d]``.
        """
        b = text.shape[0]
        text_tokens = rearrange(text, "b t l -> (b t) l")
        text_embed = self.first_stage_model.model.encode_text_embed(text_tokens)
        text_embed = rearrange(text_embed, "(b t) d -> b t d", b=b)
        text_embed = text_embed.mean(dim=1)
        text_embed = text_embed / text_embed.norm(dim=-1, keepdim=True)
        return text_embed

    @torch.no_grad()
    def encode_image(self, img):
        """Embed images with the first-stage model's image encoder."""
        return self.first_stage_model.model.encode_image_embed(img)

    @torch.no_grad()
    def encode_surface(self, surface):
        """Embed surfaces with the first-stage model's shape encoder (embedding only)."""
        return self.first_stage_model.model.encode_shape_embed(surface, return_latents=False)

    @torch.no_grad()
    def empty_text_cond(self, cond):
        """Return the all-zero unconditional embedding matching ``cond``."""
        return torch.zeros_like(cond, device=cond.device)

    @torch.no_grad()
    def empty_img_cond(self, cond):
        """Return the all-zero unconditional embedding matching ``cond``."""
        return torch.zeros_like(cond, device=cond.device)

    @torch.no_grad()
    def empty_surface_cond(self, cond):
        """Return the all-zero unconditional embedding matching ``cond``."""
        return torch.zeros_like(cond, device=cond.device)

    @torch.no_grad()
    def encode_first_stage(self, surface: torch.FloatTensor, sample_posterior=True):
        """Encode ``surface`` with the first stage and multiply by ``z_scale_factor``."""
        z_q = self.first_stage_model.encode(surface, sample_posterior)
        z_q = self.z_scale_factor * z_q
        return z_q

    @torch.no_grad()
    def decode_first_stage(self, z_q: torch.FloatTensor, **kwargs):
        """Undo the latent scaling and decode with the first stage; ``kwargs`` are forwarded."""
        z_q = 1. / self.z_scale_factor * z_q
        latents = self.first_stage_model.decode(z_q, **kwargs)
        return latents

    @rank_zero_only
    @torch.no_grad()
    def on_train_batch_start(self, batch, batch_idx):
        """On the very first training batch, set ``z_scale_factor`` to ``1 / std`` of the latents.

        Only runs when ``scale_by_std`` is enabled and no checkpoint was given.
        """
        # only for very first batch
        if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 \
                and batch_idx == 0 and self.ckpt_path is None:
            # This hook runs before the first forward pass, so the lazily created
            # first stage may not exist yet.
            self._ensure_first_stage()
            # set rescale weight to 1./std of encodings
            print("### USING STD-RESCALING ###")
            z_q = self.encode_first_stage(batch[self.first_stage_key])
            z = z_q.detach()
            del self.z_scale_factor
            self.register_buffer("z_scale_factor", 1. / z.flatten().std())
            print(f"setting self.z_scale_factor to {self.z_scale_factor}")
            print("### USING STD-RESCALING ###")

    def compute_loss(self, model_outputs, split):
        """Compute the regression loss between the prediction and its target.

        Args:
            model_outputs (dict): reads ``"pred"`` plus ``"noise"`` (epsilon
                prediction) or ``"x_0"`` (sample prediction).
            split (str): prefix of the logged keys, e.g. ``"train"`` or ``"val"``.

        Returns:
            ``(total_loss, loss_dict)`` where ``loss_dict`` holds detached values.

        Raises:
            NotImplementedError: for an unsupported prediction or loss type.
        """
        pred = model_outputs["pred"]

        # Read through ``config`` like the other scheduler settings used in ``forward``.
        prediction_type = self.noise_scheduler.config.prediction_type
        if prediction_type == "epsilon":
            target = model_outputs["noise"]
        elif prediction_type == "sample":
            target = model_outputs["x_0"]
        else:
            raise NotImplementedError(f"Prediction Type: {prediction_type} not yet supported.")

        if self.loss_cfg.loss_type == "l1":
            simple = F.l1_loss(pred, target, reduction="mean")
        elif self.loss_cfg.loss_type in ["mse", "l2"]:
            simple = F.mse_loss(pred, target, reduction="mean")
        else:
            raise NotImplementedError(f"Loss Type: {self.loss_cfg.loss_type} not yet supported.")

        total_loss = simple
        loss_dict = {
            f"{split}/total_loss": total_loss.clone().detach(),
            f"{split}/simple": simple.detach(),
        }
        return total_loss, loss_dict

    def forward(self, batch):
        """Run one noising + denoising pass on ``batch``.

        Args:
            batch: dict holding ``first_stage_key`` and ``cond_stage_key`` entries.

        Returns:
            dict with ``"x_0"`` (clean latents), ``"noise"`` and ``"pred"``.
        """
        self._ensure_first_stage()

        latents = self.encode_first_stage(batch[self.first_stage_key])
        conditions = self.cond_stage_model[self.cond_stage_key](batch[self.cond_stage_key]).unsqueeze(1)

        # Zero out the condition of a sample with probability 0.1.
        mask = torch.rand((len(conditions), 1, 1), device=conditions.device, dtype=conditions.dtype) >= 0.1
        conditions = conditions * mask.to(conditions)

        # Sample noise that we'll add to the latents
        # [batch_size, n_token, latent_dim]
        noise = torch.randn_like(latents)
        bs = latents.shape[0]

        # Sample a random timestep for each sample in the batch
        timesteps = torch.randint(
            0,
            self.noise_scheduler.config.num_train_timesteps,
            (bs,),
            device=latents.device,
        )
        timesteps = timesteps.long()

        # Add noise to the latents according to the noise magnitude at each timestep
        noisy_z = self.noise_scheduler.add_noise(latents, noise, timesteps)

        # diffusion model forward
        noise_pred = self.model(noisy_z, timesteps, conditions)

        diffusion_outputs = {
            # The "sample" target is the clean latent; the noisy input was stored
            # here before, which would train the network to copy its input.
            "x_0": latents,
            "noise": noise,
            "pred": noise_pred
        }
        return diffusion_outputs

    def training_step(self, batch: Dict[str, Union[torch.FloatTensor, List[str]]],
                      batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
        """Compute and log the training loss.

        Args:
            batch (dict): the batch sample, and it contains:
                - surface (torch.FloatTensor):
                - image (torch.FloatTensor): if provide, [bs, 3, h, w], item range [0, 1]
                - depth (torch.FloatTensor): if provide, [bs, 1, h, w], item range [-1, 1]
                - normal (torch.FloatTensor): if provide, [bs, 3, h, w], item range [-1, 1]
                - text (list of str):
            batch_idx (int):
            optimizer_idx (int):

        Returns:
            loss (torch.FloatTensor):
        """
        diffusion_outputs = self(batch)
        loss, loss_dict = self.compute_loss(diffusion_outputs, "train")
        self.log_dict(loss_dict, prog_bar=True, logger=True, sync_dist=False, rank_zero_only=True)
        return loss

    def validation_step(self, batch: Dict[str, torch.FloatTensor],
                        batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
        """Compute and log the validation loss.

        Args:
            batch (dict): the batch sample, and it contains:
                - surface_pc (torch.FloatTensor): [n_pts, 4]
                - surface_feats (torch.FloatTensor): [n_pts, c]
                - text (list of str):
            batch_idx (int):
            optimizer_idx (int):

        Returns:
            loss (torch.FloatTensor):
        """
        diffusion_outputs = self(batch)
        loss, loss_dict = self.compute_loss(diffusion_outputs, "val")
        self.log_dict(loss_dict, prog_bar=True, logger=True, sync_dist=False, rank_zero_only=True)
        return loss

    def _sampling_loop(self, cond, steps, guidance_scale, do_classifier_free_guidance, eta):
        """Create a fresh ``ddim_sample`` generator with this module's denoiser and scheduler."""
        return ddim_sample(
            self.denoise_scheduler,
            self.model,
            shape=self.first_stage_model.latent_shape,
            cond=cond,
            steps=steps,
            guidance_scale=guidance_scale,
            do_classifier_free_guidance=do_classifier_free_guidance,
            device=self.device,
            eta=eta,
            disable_prog=not self.zero_rank
        )

    @torch.no_grad()
    def sample(self,
               batch: Dict[str, Union[torch.FloatTensor, List[str]]],
               sample_times: int = 1,
               steps: Optional[int] = None,
               guidance_scale: Optional[float] = None,
               eta: float = 0.0,
               return_intermediates: bool = False, **kwargs):
        """Generate decoded samples conditioned on ``batch[cond_stage_key]``.

        Args:
            batch: dict holding the condition under ``cond_stage_key``.
            sample_times: number of independent runs, or (with
                ``return_intermediates``) roughly how many intermediate results of a
                single run to decode.
            steps: sampling steps; defaults to ``scheduler_cfg.num_inference_steps``.
            guidance_scale: defaults to ``scheduler_cfg.guidance_scale``;
                classifier-free guidance is used when it is ``> 0``.
            eta: forwarded to ``ddim_sample``.
            return_intermediates: decode intermediate latents of one run instead of
                the final latents of ``sample_times`` runs.
            **kwargs: forwarded to ``decode_first_stage``.

        Returns:
            list of decoded outputs.
        """
        self._ensure_first_stage()

        if steps is None:
            steps = self.scheduler_cfg.num_inference_steps
        if guidance_scale is None:
            guidance_scale = self.scheduler_cfg.guidance_scale
        do_classifier_free_guidance = guidance_scale > 0

        # conditional encode
        xc = batch[self.cond_stage_key]
        cond = self.cond_stage_model[self.cond_stage_key](xc).unsqueeze(1)

        if do_classifier_free_guidance:
            # Note: There are two kinds of uncond for text.
            # 1: using "" as uncond text; (in SAL diffusion)
            # 2: zeros_like(cond) as uncond text; (in MDM)
            # The second kind is used here; the unconditional half goes first.
            un_cond = self.cond_stage_model[f"{self.cond_stage_key}_unconditional_embedding"](cond)
            cond = torch.cat([un_cond, cond], dim=0)

        outputs = []
        latents = None

        if not return_intermediates:
            for _ in range(sample_times):
                sample_loop = self._sampling_loop(cond, steps, guidance_scale, do_classifier_free_guidance, eta)
                # Exhaust the generator; only the final latents are decoded.
                for latents, _t in sample_loop:
                    pass
                outputs.append(self.decode_first_stage(latents, **kwargs))
        else:
            sample_loop = self._sampling_loop(cond, steps, guidance_scale, do_classifier_free_guidance, eta)
            # Clamp to 1 so ``sample_times > steps`` (or <= 0) cannot produce a zero modulus.
            iter_size = max(1, steps // max(1, sample_times))
            for i, (latents, _t) in enumerate(sample_loop):
                if i % iter_size == 0 or i == steps - 1:
                    outputs.append(self.decode_first_stage(latents, **kwargs))

        return outputs

View File

@@ -0,0 +1,104 @@
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
from typing import Optional
from diffusers.models.embeddings import Timesteps
import math
from ..modules.transformer_blocks import MLP
from ..modules.diffusion_transformer import UNetDiffusionTransformer
class ConditionalASLUDTDenoiser(nn.Module):
    """Transformer denoiser conditioned on a timestep token and context tokens.

    The timestep embedding and the projected context are prepended to the projected
    input tokens, the sequence is run through a ``UNetDiffusionTransformer``, and the
    trailing (input) positions are projected to ``output_channels``.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 input_channels: int,
                 output_channels: int,
                 n_ctx: int,
                 width: int,
                 layers: int,
                 heads: int,
                 context_dim: int,
                 context_ln: bool = True,
                 skip_ln: bool = False,
                 init_scale: float = 0.25,
                 flip_sin_to_cos: bool = False,
                 use_checkpoint: bool = False):
        """Build the backbone and the input / output / time / context projections.

        Args:
            device: device passed to every created layer.
            dtype: dtype passed to every created layer.
            input_channels: channel size of the tokens to denoise.
            output_channels: channel size of the prediction.
            n_ctx: forwarded to the backbone.
            width: hidden width of the transformer.
            layers: forwarded to the backbone.
            heads: forwarded to the backbone.
            context_dim: channel size of the condition tokens.
            context_ln: if True, layer-normalize the context before projecting it.
            skip_ln: forwarded to the backbone.
            init_scale: base init scale; multiplied by ``sqrt(1 / width)`` before use.
            flip_sin_to_cos: forwarded to the sinusoidal ``Timesteps`` embedding.
            use_checkpoint: forwarded to the backbone.
        """
        super().__init__()
        self.use_checkpoint = use_checkpoint

        init_scale = init_scale * math.sqrt(1.0 / width)

        self.backbone = UNetDiffusionTransformer(
            device=device,
            dtype=dtype,
            n_ctx=n_ctx,
            width=width,
            layers=layers,
            heads=heads,
            skip_ln=skip_ln,
            init_scale=init_scale,
            use_checkpoint=use_checkpoint
        )
        self.ln_post = nn.LayerNorm(width, device=device, dtype=dtype)
        self.input_proj = nn.Linear(input_channels, width, device=device, dtype=dtype)
        self.output_proj = nn.Linear(width, output_channels, device=device, dtype=dtype)

        # timestep embedding
        self.time_embed = Timesteps(width, flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=0)
        self.time_proj = MLP(
            device=device, dtype=dtype, width=width, init_scale=init_scale
        )

        # The context projector is built exactly once; an unconditional copy was
        # previously created and then immediately overwritten by this branch.
        if context_ln:
            self.context_embed = nn.Sequential(
                nn.LayerNorm(context_dim, device=device, dtype=dtype),
                nn.Linear(context_dim, width, device=device, dtype=dtype),
            )
        else:
            self.context_embed = nn.Linear(context_dim, width, device=device, dtype=dtype)

    def forward(self,
                model_input: torch.FloatTensor,
                timestep: torch.LongTensor,
                context: torch.FloatTensor):
        r"""Predict the denoising target for ``model_input``.

        Args:
            model_input (torch.FloatTensor): [bs, n_data, c]
            timestep (torch.LongTensor): [bs,]
            context (torch.FloatTensor): [bs, context_tokens, c]

        Returns:
            sample (torch.FloatTensor): [bs, n_data, c]
        """
        _, n_data, _ = model_input.shape

        # 1. time: a single token per sample
        t_emb = self.time_proj(self.time_embed(timestep)).unsqueeze(dim=1)

        # 2. conditions projector
        context = self.context_embed(context)

        # 3. denoiser
        x = self.input_proj(model_input)
        x = torch.cat([t_emb, context, x], dim=1)
        x = self.backbone(x)
        x = self.ln_post(x)

        # Keep only the trailing input positions. An explicit start index is used
        # because ``x[:, -0:]`` would return every token when ``n_data`` is 0.
        x = x[:, x.shape[1] - n_data:]
        sample = self.output_proj(x)
        return sample

View File

@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
class BaseDenoiser(nn.Module):
    """Interface for denoisers: subclasses implement ``forward(x, t, context)``."""

    def __init__(self):
        nn.Module.__init__(self)

    def forward(self, x, t, context):
        """Not implemented in the base class.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

View File

@@ -0,0 +1,393 @@
# -*- coding: utf-8 -*-
from omegaconf import DictConfig
from typing import List, Tuple, Dict, Optional, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_only
from diffusers.schedulers import (
DDPMScheduler,
DDIMScheduler,
KarrasVeScheduler,
DPMSolverMultistepScheduler
)
from ...utils import instantiate_from_config
from ..tsal.tsal_base import AlignedShapeAsLatentPLModule
from .inference_utils import ddim_sample
# Scheduler classes named by the type hint of the denoising (sampling) scheduler.
SchedulerType = Union[DDIMScheduler, KarrasVeScheduler, DPMSolverMultistepScheduler]
def disabled_train(self, mode=True):
    """Stand-in for ``nn.Module.train`` that never changes the train/eval mode.

    It is assigned over the ``train`` attribute of frozen models so later mode
    switches leave them untouched.

    Args:
        self: first positional argument; handed back unchanged.
        mode: accepted for signature compatibility and ignored.

    Returns:
        The ``self`` argument, unchanged.
    """
    return self
class ClipASLDiffuser(pl.LightningModule):
    """Latent diffusion over shape latents, conditioned through a separate frozen encoder.

    Both the first-stage model and the condition encoder are instantiated in
    ``__init__`` and frozen; only the denoiser ``self.model`` is handed to the optimizer.
    """

    first_stage_model: Optional[AlignedShapeAsLatentPLModule]
    cond_stage_model: Optional[Union[nn.Module, pl.LightningModule]]
    model: nn.Module

    def __init__(self, *,
                 first_stage_config,
                 cond_stage_config,
                 denoiser_cfg,
                 scheduler_cfg,
                 optimizer_cfg,
                 loss_cfg,
                 first_stage_key: str = "surface",
                 cond_stage_key: str = "image",
                 scale_by_std: bool = False,
                 z_scale_factor: float = 1.0,
                 ckpt_path: Optional[str] = None,
                 ignore_keys: Union[Tuple[str], List[str]] = ()):
        """Build the frozen stages, the denoiser and the schedulers.

        Args:
            first_stage_config: config of the first-stage (latent) model.
            cond_stage_config: config of the condition encoder.
            denoiser_cfg: config of the denoising network.
            scheduler_cfg: must expose ``noise`` and ``denoise`` scheduler configs;
                ``sample`` also reads ``num_inference_steps`` and ``guidance_scale``.
            optimizer_cfg: ``None`` for the built-in AdamW, otherwise a config with
                ``optimizer`` and ``scheduler`` entries.
            loss_cfg: must expose ``loss_type`` ("l1", "mse" or "l2").
            first_stage_key: batch key of the data encoded into latents.
            cond_stage_key: batch key of the condition.
            scale_by_std: if True, ``z_scale_factor`` is kept as a buffer and is
                re-estimated from the first training batch.
            z_scale_factor: multiplier applied to first-stage latents.
            ckpt_path: optional checkpoint to restore from.
            ignore_keys: state-dict key prefixes to drop when restoring.
        """
        super().__init__()
        self.first_stage_key = first_stage_key
        self.cond_stage_key = cond_stage_key

        # 1. initialize first stage (eagerly, frozen)
        self.instantiate_first_stage(first_stage_config)

        # 2. initialize conditional stage (frozen)
        self.instantiate_cond_stage(cond_stage_config)

        # 3. diffusion model
        self.model = instantiate_from_config(
            denoiser_cfg, device=None, dtype=None
        )
        self.optimizer_cfg = optimizer_cfg

        # 4. scheduling strategy
        self.scheduler_cfg = scheduler_cfg
        self.noise_scheduler: DDPMScheduler = instantiate_from_config(scheduler_cfg.noise)
        self.denoise_scheduler: SchedulerType = instantiate_from_config(scheduler_cfg.denoise)

        # 5. loss configuration
        self.loss_cfg = loss_cfg
        self.scale_by_std = scale_by_std
        if scale_by_std:
            self.register_buffer("z_scale_factor", torch.tensor(z_scale_factor))
        else:
            self.z_scale_factor = z_scale_factor

        self.ckpt_path = ckpt_path
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def instantiate_non_trainable_model(self, config):
        """Instantiate ``config`` as a frozen model: eval mode, ``train`` disabled, no gradients."""
        model = instantiate_from_config(config)
        model = model.eval()
        # The plain function is stored on the instance, so it is called unbound:
        # whatever is passed as ``mode`` lands in its ``self`` slot and is returned.
        model.train = disabled_train
        for param in model.parameters():
            param.requires_grad = False
        return model

    def instantiate_first_stage(self, first_stage_config):
        """Create the frozen first-stage model and call its ``set_shape_model_only()``."""
        self.first_stage_model = self.instantiate_non_trainable_model(first_stage_config)
        self.first_stage_model.set_shape_model_only()

    def instantiate_cond_stage(self, cond_stage_config):
        """Create the frozen condition encoder."""
        self.cond_stage_model = self.instantiate_non_trainable_model(cond_stage_config)

    def init_from_ckpt(self, path, ignore_keys=()):
        """Load ``state_dict`` from the checkpoint at ``path`` non-strictly.

        Args:
            path: checkpoint file containing a ``"state_dict"`` entry.
            ignore_keys: key prefixes; every matching entry is dropped before loading.
        """
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        state_dict = torch.load(path, map_location="cpu")["state_dict"]
        ignored_prefixes = tuple(ignore_keys)
        for key in list(state_dict.keys()):
            # A key is removed once even if several prefixes match it; deleting it
            # once per matching prefix raised KeyError on the second match.
            if key.startswith(ignored_prefixes):
                print("Deleting key {} from state_dict.".format(key))
                del state_dict[key]

        missing, unexpected = self.load_state_dict(state_dict, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
            print(f"Unexpected Keys: {unexpected}")

    @property
    def zero_rank(self):
        """True when no trainer is attached or the trainer's local rank is 0."""
        if self._trainer:
            return self.trainer.local_rank == 0
        return True

    def configure_optimizers(self) -> Tuple[List, List]:
        """Create the optimizer (and optional per-step LR schedule) for the denoiser.

        ``self.learning_rate`` is not set in this class and must be assigned before
        this method runs.

        Returns:
            ``(optimizers, schedulers)`` lists in the Lightning format.
        """
        lr = self.learning_rate
        trainable_parameters = list(self.model.parameters())

        if self.optimizer_cfg is None:
            optimizers = [torch.optim.AdamW(trainable_parameters, lr=lr, betas=(0.9, 0.99), weight_decay=1e-3)]
            schedulers = []
        else:
            optimizer = instantiate_from_config(self.optimizer_cfg.optimizer, params=trainable_parameters)
            scheduler_func = instantiate_from_config(
                self.optimizer_cfg.scheduler,
                max_decay_steps=self.trainer.max_steps,
                lr_max=lr
            )
            scheduler = {
                "scheduler": lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler_func.schedule),
                "interval": "step",
                "frequency": 1
            }
            optimizers = [optimizer]
            schedulers = [scheduler]

        return optimizers, schedulers

    @torch.no_grad()
    def encode_first_stage(self, surface: torch.FloatTensor, sample_posterior=True):
        """Encode ``surface`` with the first stage and multiply by ``z_scale_factor``."""
        z_q = self.first_stage_model.encode(surface, sample_posterior)
        z_q = self.z_scale_factor * z_q
        return z_q

    @torch.no_grad()
    def decode_first_stage(self, z_q: torch.FloatTensor, **kwargs):
        """Undo the latent scaling and decode with the first stage; ``kwargs`` are forwarded."""
        z_q = 1. / self.z_scale_factor * z_q
        latents = self.first_stage_model.decode(z_q, **kwargs)
        return latents

    @rank_zero_only
    @torch.no_grad()
    def on_train_batch_start(self, batch, batch_idx):
        """On the very first training batch, set ``z_scale_factor`` to ``1 / std`` of the latents.

        Only runs when ``scale_by_std`` is enabled and no checkpoint was given.
        """
        # only for very first batch
        if self.scale_by_std and self.current_epoch == 0 and self.global_step == 0 \
                and batch_idx == 0 and self.ckpt_path is None:
            # set rescale weight to 1./std of encodings
            print("### USING STD-RESCALING ###")
            z_q = self.encode_first_stage(batch[self.first_stage_key])
            z = z_q.detach()
            del self.z_scale_factor
            self.register_buffer("z_scale_factor", 1. / z.flatten().std())
            print(f"setting self.z_scale_factor to {self.z_scale_factor}")
            print("### USING STD-RESCALING ###")

    def compute_loss(self, model_outputs, split):
        """Compute the regression loss between the prediction and its target.

        Args:
            model_outputs (dict): reads ``"pred"`` plus ``"noise"`` (epsilon
                prediction) or ``"x_0"`` (sample prediction).
            split (str): prefix of the logged keys, e.g. ``"train"`` or ``"val"``.

        Returns:
            ``(total_loss, loss_dict)`` where ``loss_dict`` holds detached values.

        Raises:
            NotImplementedError: for an unsupported prediction or loss type.
        """
        pred = model_outputs["pred"]

        # Read through ``config`` like the other scheduler settings used in ``forward``.
        prediction_type = self.noise_scheduler.config.prediction_type
        if prediction_type == "epsilon":
            target = model_outputs["noise"]
        elif prediction_type == "sample":
            target = model_outputs["x_0"]
        else:
            raise NotImplementedError(f"Prediction Type: {prediction_type} not yet supported.")

        if self.loss_cfg.loss_type == "l1":
            simple = F.l1_loss(pred, target, reduction="mean")
        elif self.loss_cfg.loss_type in ["mse", "l2"]:
            simple = F.mse_loss(pred, target, reduction="mean")
        else:
            raise NotImplementedError(f"Loss Type: {self.loss_cfg.loss_type} not yet supported.")

        total_loss = simple
        loss_dict = {
            f"{split}/total_loss": total_loss.clone().detach(),
            f"{split}/simple": simple.detach(),
        }
        return total_loss, loss_dict

    def forward(self, batch):
        """Run one noising + denoising pass on ``batch``.

        Args:
            batch: dict holding ``first_stage_key`` and ``cond_stage_key`` entries.

        Returns:
            dict with ``"x_0"`` (clean latents), ``"noise"`` and ``"pred"``.
        """
        latents = self.encode_first_stage(batch[self.first_stage_key])
        conditions = self.cond_stage_model.encode(batch[self.cond_stage_key])

        # Sample noise that we'll add to the latents
        # [batch_size, n_token, latent_dim]
        noise = torch.randn_like(latents)
        bs = latents.shape[0]

        # Sample a random timestep for each sample in the batch
        timesteps = torch.randint(
            0,
            self.noise_scheduler.config.num_train_timesteps,
            (bs,),
            device=latents.device,
        )
        timesteps = timesteps.long()

        # Add noise to the latents according to the noise magnitude at each timestep
        noisy_z = self.noise_scheduler.add_noise(latents, noise, timesteps)

        # diffusion model forward
        noise_pred = self.model(noisy_z, timesteps, conditions)

        diffusion_outputs = {
            # The "sample" target is the clean latent; the noisy input was stored
            # here before, which would train the network to copy its input.
            "x_0": latents,
            "noise": noise,
            "pred": noise_pred
        }
        return diffusion_outputs

    def training_step(self, batch: Dict[str, Union[torch.FloatTensor, List[str]]],
                      batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
        """Compute and log the training loss.

        Args:
            batch (dict): the batch sample, and it contains:
                - surface (torch.FloatTensor):
                - image (torch.FloatTensor): if provide, [bs, 3, h, w], item range [0, 1]
                - depth (torch.FloatTensor): if provide, [bs, 1, h, w], item range [-1, 1]
                - normal (torch.FloatTensor): if provide, [bs, 3, h, w], item range [-1, 1]
                - text (list of str):
            batch_idx (int):
            optimizer_idx (int):

        Returns:
            loss (torch.FloatTensor):
        """
        diffusion_outputs = self(batch)
        loss, loss_dict = self.compute_loss(diffusion_outputs, "train")
        self.log_dict(loss_dict, prog_bar=True, logger=True, sync_dist=False, rank_zero_only=True)
        return loss

    def validation_step(self, batch: Dict[str, torch.FloatTensor],
                        batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
        """Compute and log the validation loss.

        Args:
            batch (dict): the batch sample, and it contains:
                - surface_pc (torch.FloatTensor): [n_pts, 4]
                - surface_feats (torch.FloatTensor): [n_pts, c]
                - text (list of str):
            batch_idx (int):
            optimizer_idx (int):

        Returns:
            loss (torch.FloatTensor):
        """
        diffusion_outputs = self(batch)
        loss, loss_dict = self.compute_loss(diffusion_outputs, "val")
        self.log_dict(loss_dict, prog_bar=True, logger=True, sync_dist=False, rank_zero_only=True)
        return loss

    def _sampling_loop(self, cond, steps, guidance_scale, do_classifier_free_guidance, eta):
        """Create a fresh ``ddim_sample`` generator with this module's denoiser and scheduler."""
        return ddim_sample(
            self.denoise_scheduler,
            self.model,
            shape=self.first_stage_model.latent_shape,
            cond=cond,
            steps=steps,
            guidance_scale=guidance_scale,
            do_classifier_free_guidance=do_classifier_free_guidance,
            device=self.device,
            eta=eta,
            disable_prog=not self.zero_rank
        )

    @torch.no_grad()
    def sample(self,
               batch: Dict[str, Union[torch.FloatTensor, List[str]]],
               sample_times: int = 1,
               steps: Optional[int] = None,
               guidance_scale: Optional[float] = None,
               eta: float = 0.0,
               return_intermediates: bool = False, **kwargs):
        """Generate decoded samples conditioned on ``batch[cond_stage_key]``.

        Args:
            batch: dict holding the condition under ``cond_stage_key``.
            sample_times: number of independent runs, or (with
                ``return_intermediates``) roughly how many intermediate results of a
                single run to decode.
            steps: sampling steps; defaults to ``scheduler_cfg.num_inference_steps``.
            guidance_scale: defaults to ``scheduler_cfg.guidance_scale``;
                classifier-free guidance is used when it is ``> 0``.
            eta: forwarded to ``ddim_sample``.
            return_intermediates: decode intermediate latents of one run instead of
                the final latents of ``sample_times`` runs.
            **kwargs: forwarded to ``decode_first_stage``.

        Returns:
            list of decoded outputs.
        """
        if steps is None:
            steps = self.scheduler_cfg.num_inference_steps
        if guidance_scale is None:
            guidance_scale = self.scheduler_cfg.guidance_scale
        do_classifier_free_guidance = guidance_scale > 0

        # conditional encode
        xc = batch[self.cond_stage_key]
        cond = self.cond_stage_model(xc)

        if do_classifier_free_guidance:
            # The unconditional half goes first, matching the chunk order in ddim_sample.
            un_cond = self.cond_stage_model.unconditional_embedding(batch_size=len(xc))
            cond = torch.cat([un_cond, cond], dim=0)

        outputs = []
        latents = None

        if not return_intermediates:
            for _ in range(sample_times):
                sample_loop = self._sampling_loop(cond, steps, guidance_scale, do_classifier_free_guidance, eta)
                # Exhaust the generator; only the final latents are decoded.
                for latents, _t in sample_loop:
                    pass
                outputs.append(self.decode_first_stage(latents, **kwargs))
        else:
            sample_loop = self._sampling_loop(cond, steps, guidance_scale, do_classifier_free_guidance, eta)
            # Clamp to 1 so ``sample_times > steps`` (or <= 0) cannot produce a zero modulus.
            iter_size = max(1, steps // max(1, sample_times))
            for i, (latents, _t) in enumerate(sample_loop):
                if i % iter_size == 0 or i == steps - 1:
                    outputs.append(self.decode_first_stage(latents, **kwargs))

        return outputs

View File

@@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-
import torch
from tqdm import tqdm
from typing import Tuple, List, Union, Optional
from diffusers.schedulers import DDIMScheduler
# Only ddim_sample is exported; the karra_sample stub defined below is not listed.
__all__ = ["ddim_sample"]
def ddim_sample(ddim_scheduler: DDIMScheduler,
                diffusion_model: torch.nn.Module,
                shape: Union[List[int], Tuple[int]],
                cond: torch.FloatTensor,
                steps: int,
                eta: float = 0.0,
                guidance_scale: float = 3.0,
                do_classifier_free_guidance: bool = True,
                generator: Optional[torch.Generator] = None,
                device: torch.device = "cuda:0",
                disable_prog: bool = True):
    """Run the reverse diffusion loop, yielding the latents after every step.

    This is a generator: nothing (including the ``steps`` check) runs until it is
    iterated.

    Args:
        ddim_scheduler: scheduler providing ``init_noise_sigma``, ``set_timesteps``,
            ``timesteps`` and ``step``.
        diffusion_model: denoiser called as ``forward(latents, timesteps, cond)``.
        shape: per-sample latent shape (without the batch dimension).
        cond: condition tensor. With classifier-free guidance it must hold the
            unconditional half followed by the conditional half along dim 0.
        steps: number of sampling steps, must be ``> 0``.
        eta: DDIM eta; passed to ``step`` only if the scheduler accepts it.
        guidance_scale: classifier-free guidance weight.
        do_classifier_free_guidance: whether ``cond`` carries both halves.
        generator: optional RNG for the initial noise; also passed to ``step`` if
            the scheduler accepts it.
        device: device for the timesteps. The initial latents are created on
            ``cond.device``, so the two are expected to match.
        disable_prog: hide the progress bar.

    Yields:
        ``(latents, t)`` after each scheduler step.
    """
    import inspect

    assert steps > 0, f"{steps} must > 0."

    # init latents
    bsz = cond.shape[0]
    if do_classifier_free_guidance:
        bsz = bsz // 2

    latents = torch.randn(
        (bsz, *shape),
        generator=generator,
        device=cond.device,
        dtype=cond.dtype,
    )
    # scale the initial noise by the standard deviation required by the scheduler
    latents = latents * ddim_scheduler.init_noise_sigma

    # set timesteps
    ddim_scheduler.set_timesteps(steps)
    timesteps = ddim_scheduler.timesteps.to(device)

    # Not all schedulers share the same ``step`` signature (eta is DDIM-specific),
    # so only pass the extras this scheduler's ``step`` actually accepts.
    step_params = inspect.signature(ddim_scheduler.step).parameters
    accepts_any_kwarg = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in step_params.values()
    )
    extra_step_kwargs = {
        name: value
        for name, value in (("eta", eta), ("generator", generator))
        if accepts_any_kwarg or name in step_params
    }

    # reverse
    for i, t in enumerate(tqdm(timesteps, disable=disable_prog, desc="DDIM Sampling:", leave=False)):
        # expand the latents if we are doing classifier free guidance
        latent_model_input = (
            torch.cat([latents] * 2)
            if do_classifier_free_guidance
            else latents
        )

        # predict the noise residual; one timestep value broadcast over the batch
        timestep_tensor = t.reshape(1).to(device=device, dtype=torch.long)
        timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0])
        noise_pred = diffusion_model.forward(latent_model_input, timestep_tensor, cond)

        # perform guidance
        if do_classifier_free_guidance:
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_text - noise_pred_uncond
            )

        # compute the previous noisy sample x_t -> x_t-1
        latents = ddim_scheduler.step(
            noise_pred, t, latents, **extra_step_kwargs
        ).prev_sample

        yield latents, t
def karra_sample():
    """Placeholder with no implementation; does nothing and returns ``None``."""
    return None

View File

@@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
from .clip import CLIPEncoder

View File

@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
import torch
import numpy as np
from PIL import Image
from dataclasses import dataclass
from torchvision.transforms import Normalize
from transformers import CLIPModel, CLIPTokenizer
from transformers.utils import ModelOutput
from typing import Iterable, Optional, Union, List
# Image input kinds named in the encoder's type hints: NumPy array, torch tensor or PIL image.
ImageType = Union[np.ndarray, torch.Tensor, Image.Image]
@dataclass
class CLIPEmbedOutput(ModelOutput):
    """Bundle of the CLIP outputs produced by ``CLIPEncoder``; every field defaults to ``None``."""

    # hidden states of the vision / text tower
    last_hidden_state: Optional[torch.FloatTensor] = None
    # pooled output of the tower
    pooler_output: Optional[torch.FloatTensor] = None
    # pooled output after the CLIP projection layer
    embeds: Optional[torch.FloatTensor] = None
class CLIPEncoder(torch.nn.Module):
    """Frozen CLIP wrapper exposing image and text embeddings."""

    def __init__(self, model_path="openai/clip-vit-base-patch32"):
        """Load the CLIP model and tokenizer from ``model_path`` and freeze the model."""
        super().__init__()

        # Load the CLIP model and tokenizer
        self.model: CLIPModel = CLIPModel.from_pretrained(model_path)
        self.tokenizer = CLIPTokenizer.from_pretrained(model_path)

        # NOTE(review): these look like the ImageNet statistics rather than CLIP's
        # own normalization constants -- confirm this is intended before changing.
        self.image_preprocess = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # ``eval()`` switches every submodule to inference mode; assigning
        # ``training = False`` only changed the flag on the top-level module.
        self.model.eval()
        for p in self.model.parameters():
            p.requires_grad = False

    @torch.no_grad()
    def encode_image(self, images: Iterable[Optional[ImageType]]):
        """Normalize ``images`` and run them through the CLIP vision tower.

        Args:
            images: passed directly to the ``Normalize`` transform.

        Returns:
            CLIPEmbedOutput with the hidden states, pooled output and projected embedding.
        """
        pixel_values = self.image_preprocess(images)

        vision_outputs = self.model.vision_model(pixel_values=pixel_values)
        pooler_output = vision_outputs[1]  # pooled_output
        image_features = self.model.visual_projection(pooler_output)

        visual_embeds = CLIPEmbedOutput(
            last_hidden_state=vision_outputs.last_hidden_state,
            pooler_output=pooler_output,
            embeds=image_features
        )
        return visual_embeds

    @torch.no_grad()
    def encode_text(self, texts: List[str]):
        """Tokenize ``texts`` and run them through the CLIP text tower.

        Args:
            texts: list of strings.

        Returns:
            CLIPEmbedOutput with the hidden states, pooled output and projected embedding.
        """
        text_inputs = self.tokenizer(texts, padding=True, return_tensors="pt")

        # The tokenizer returns a mapping; the text model needs the id tensor itself
        # (the whole mapping was passed as ``input_ids`` before), on the model's device.
        model_device = next(self.model.parameters()).device
        input_ids = text_inputs["input_ids"].to(model_device)
        attention_mask = text_inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(model_device)

        text_outputs = self.model.text_model(input_ids=input_ids, attention_mask=attention_mask)
        pooler_output = text_outputs[1]  # pooled_output
        text_features = self.model.text_projection(pooler_output)

        text_embeds = CLIPEmbedOutput(
            last_hidden_state=text_outputs.last_hidden_state,
            pooler_output=pooler_output,
            embeds=text_features
        )
        return text_embeds

    def forward(self,
                images: Iterable[Optional[ImageType]],
                texts: List[str]):
        """Encode both modalities.

        Returns:
            ``(visual_embeds, text_embeds)``, each a CLIPEmbedOutput.
        """
        visual_embeds = self.encode_image(images)
        text_embeds = self.encode_text(texts)
        return visual_embeds, text_embeds

View File

@@ -0,0 +1,562 @@
# -*- coding: utf-8 -*-
import os
import torch
import torch.nn as nn
from torchvision import transforms
from transformers import CLIPModel, CLIPTokenizer
from collections import OrderedDict
from ...data.transforms import RandomResize
class AbstractEncoder(nn.Module):
    """Base class for condition encoders; subclasses implement ``encode``."""

    # declared for subclasses; no value is assigned here
    embedding_dim: int

    def __init__(self):
        nn.Module.__init__(self)

    def encode(self, *args, **kwargs):
        """Not implemented in the base class.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError
class ClassEmbedder(nn.Module):
    """Looks up a learned embedding for the class ids stored in a batch dict."""

    def __init__(self, embed_dim, n_classes=1000, key="class"):
        """
        Args:
            embed_dim: size of each embedding vector.
            n_classes: number of rows in the embedding table.
            key: default batch key under which the class ids are stored.
        """
        super().__init__()
        self.key = key
        self.embedding = nn.Embedding(n_classes, embed_dim)

    def forward(self, batch, key=None):
        """Embed ``batch[key]`` (falling back to the default key).

        A length-1 axis is inserted after the first dimension of the ids before the
        lookup (this is for use in crossattn).
        """
        lookup_key = self.key if key is None else key
        class_ids = batch[lookup_key][:, None]
        return self.embedding(class_ids)
class FrozenCLIPTextEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face)"""

    def __init__(
        self,
        version="openai/clip-vit-large-patch14",
        tokenizer_version=None,
        device="cuda",
        max_length=77,
        zero_embedding_radio: float = 0.1,
    ):
        """Load the tokenizer and the frozen CLIP text transformer.

        Args:
            version: Hugging Face model id or path of the CLIP model.
            tokenizer_version: tokenizer id or path; falls back to ``version``.
            device: device the transformer and the tokens are moved to on first use.
            max_length: tokenizer padding / truncation length.
            zero_embedding_radio: probability with which ``encode`` replaces a
                prompt by the empty string.
        """
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer_version or version)

        self.device = device
        self.max_length = max_length
        self.zero_embedding_radio = zero_embedding_radio

        # The transformer lives inside a plain OrderedDict, so it is not registered
        # as a submodule (it is absent from ``state_dict()`` and untouched by
        # ``.to()``); ``move()`` places it on ``self.device`` instead.
        self.clip_dict = OrderedDict()
        self.clip_name = os.path.split(version)[-1]

        transformer = CLIPModel.from_pretrained(version).text_model
        for param in transformer.parameters():
            param.requires_grad = False
        self.clip_dict[self.clip_name] = transformer

        self._move_flag = False

    @property
    def clip(self):
        """The frozen CLIP text transformer."""
        return self.clip_dict[self.clip_name]

    def move(self):
        """Move the transformer to ``self.device`` the first time it is called."""
        if self._move_flag:
            return

        self.clip_dict[self.clip_name] = self.clip_dict[self.clip_name].to(self.device)
        self._move_flag = True

    def unconditional_embedding(self, batch_size):
        """Return the embedding of ``batch_size`` empty prompts."""
        empty_text = [""] * batch_size
        empty_z = self.forward(empty_text)
        return empty_z

    def forward(self, text):
        """Tokenize ``text`` (padded / truncated to ``max_length``) and return the last hidden states."""
        self.move()

        batch_encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=True,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )

        tokens = batch_encoding["input_ids"].to(self.device)
        outputs = self.clip(input_ids=tokens)

        z = outputs.last_hidden_state
        return z

    def encode(self, text):
        """Embed ``text``, replacing each prompt by ``""`` with probability ``zero_embedding_radio``.

        The caller's sequence is left untouched: dropped prompts are substituted in
        a new list rather than written back into ``text`` (which also failed for tuples).
        """
        batch_size = len(text)
        batch_mask = torch.rand((batch_size,))
        drop_flags = (batch_mask < self.zero_embedding_radio).tolist()
        prompts = ["" if drop else prompt for prompt, drop in zip(text, drop_flags)]

        return self(prompts)
class FrozenAlignedCLIPTextEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face).

    Args:
        version: model identifier passed to ``CLIPModel.from_pretrained``.
        tokenizer_version: identifier passed to ``CLIPTokenizer.from_pretrained``;
            falls back to ``version`` when ``None``.
        device: device the text model and the token ids are moved to.
        max_length: length every prompt is padded / truncated to.
        zero_embedding_radio: per-sample probability with which :meth:`encode`
            replaces a prompt by the empty string.
    """

    def __init__(
        self,
        version="openai/clip-vit-large-patch14",
        tokenizer_version=None,
        device="cuda",
        max_length=77,
        zero_embedding_radio: float = 0.1,
    ):
        super().__init__()
        self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer_version or version)

        self.device = device
        self.max_length = max_length
        self.zero_embedding_radio = zero_embedding_radio

        # The text model lives in a plain dict rather than as an attribute
        # (presumably to keep it out of this module's registered sub-modules
        # -- confirm).
        self.clip_dict = OrderedDict()
        self.clip_name = os.path.split(version)[-1]

        transformer = CLIPModel.from_pretrained(version).text_model
        for param in transformer.parameters():
            param.requires_grad = False
        self.clip_dict[self.clip_name] = transformer

        # Set once the text model has been moved to ``self.device``.
        self._move_flag = False

    @property
    def clip(self):
        """The frozen CLIP text model."""
        return self.clip_dict[self.clip_name]

    def move(self):
        """Move the text model to ``self.device`` (only on the first call)."""
        if self._move_flag:
            return

        self.clip_dict[self.clip_name] = self.clip_dict[self.clip_name].to(self.device)
        self._move_flag = True

    def unconditional_embedding(self, batch_size):
        """Return the embedding of ``batch_size`` empty prompts."""
        empty_text = [""] * batch_size
        empty_z = self.forward(empty_text)
        return empty_z

    def forward(self, text):
        """Tokenize ``text`` and return the text model's ``last_hidden_state``."""
        self.move()

        batch_encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=True,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )

        tokens = batch_encoding["input_ids"].to(self.device)
        outputs = self.clip(input_ids=tokens)

        z = outputs.last_hidden_state
        return z

    def encode(self, text):
        """Embed ``text``, blanking each prompt with probability ``zero_embedding_radio``.

        A new list is built for the (possibly blanked) prompts, so the
        sequence passed in by the caller is never modified and may be any
        indexable sequence, not only a mutable list.
        """
        batch_mask = torch.rand((len(text),))
        prompts = [
            "" if batch_mask[i] < self.zero_embedding_radio else prompt
            for i, prompt in enumerate(text)
        ]
        return self(prompts)
class FrozenCLIPImageEmbedder(AbstractEncoder):
    """Uses the CLIP vision encoder for images (from Hugging Face).

    Args:
        version: model identifier passed to ``CLIPModel.from_pretrained``.
        device: device the CLIP model and the input images are moved to.
        zero_embedding_radio: per-sample probability with which :meth:`encode`
            replaces the image embedding by zeros.
        normalize_embedding: divide each embedding by its L2 norm.
        num_projection_vector: when ``> 0``, a trainable linear layer maps
            each embedding to this many vectors.
        linear_mapping_bias: whether that linear layer has a bias.
        reverse_visual_projection: use element ``[1]`` of the vision model
            output instead of ``get_image_features`` (i.e. skip CLIP's
            ``visual_projection``).
    """

    def __init__(
        self,
        version="openai/clip-vit-large-patch14",
        device="cuda",
        zero_embedding_radio=0.1,
        normalize_embedding=True,
        num_projection_vector=0,
        linear_mapping_bias=True,
        reverse_visual_projection=False,
    ):
        super().__init__()

        self.device = device

        self.clip_dict = OrderedDict()
        self.clip_name = os.path.split(version)[-1]

        clip_model = CLIPModel.from_pretrained(version)
        # Only the vision tower is used; drop the text branch.
        clip_model.text_model = None
        clip_model.text_projection = None
        clip_model = clip_model.eval()
        # Freeze the CLIP weights. The loop has to run over ``clip_model``:
        # at this point ``self`` owns no parameters at all (the model is only
        # stored in a plain dict), so iterating ``self.parameters()`` froze
        # nothing.
        for param in clip_model.parameters():
            param.requires_grad = False
        self.clip_dict[self.clip_name] = clip_model

        self.transform = transforms.Compose(
            [
                transforms.Resize(224, transforms.InterpolationMode.BICUBIC, antialias=True),
                transforms.CenterCrop(224),  # crop a (224, 224) square
                transforms.Normalize(
                    mean=[0.48145466, 0.4578275, 0.40821073],
                    std=[0.26862954, 0.26130258, 0.27577711],
                ),
            ]
        )
        self.zero_embedding_radio = zero_embedding_radio

        self.num_projection_vector = num_projection_vector
        self.reverse_visual_projection = reverse_visual_projection
        self.normalize_embedding = normalize_embedding

        embedding_dim = (
            clip_model.visual_projection.in_features
            if reverse_visual_projection
            else clip_model.visual_projection.out_features
        )
        self.embedding_dim = embedding_dim
        if self.num_projection_vector > 0:
            # Created after the freeze above, so this layer stays trainable.
            self.projection = nn.Linear(
                embedding_dim,
                clip_model.visual_projection.out_features * num_projection_vector,
                bias=linear_mapping_bias,
            )
            nn.init.normal_(self.projection.weight, std=embedding_dim ** -0.5)

        # Set once the CLIP model has been moved to ``self.device``.
        self._move_flag = False

    @property
    def clip(self):
        """The frozen CLIP model (vision branch only)."""
        return self.clip_dict[self.clip_name]

    def unconditional_embedding(self, batch_size):
        """Return the embedding used for the "no condition" case.

        This is an all-zero ``(batch_size, 1, embedding_dim)`` tensor, passed
        through ``self.projection`` when ``num_projection_vector > 0``.
        """
        zero = torch.zeros(
            batch_size,
            1,
            self.embedding_dim,
            device=self.device,
            dtype=self.clip.visual_projection.weight.dtype,
        )
        if self.num_projection_vector > 0:
            zero = self.projection(zero).view(batch_size, self.num_projection_vector, -1)
        return zero

    def forward(self, image, value_range=(-1, 1), zero_embedding_radio=0):
        """Embed ``image`` with CLIP.

        Args:
            image: image batch; rescaled to ``[0, 1]`` using ``value_range``
                before CLIP's resize / crop / normalize transform is applied.
            value_range: ``(low, high)`` range of the incoming pixel values,
                or ``None`` to skip the rescaling.
            zero_embedding_radio: per-sample probability of zeroing the
                embedding (condition dropout); ``0`` disables it.

        Note:
            Unlike :meth:`encode`, this method does not call :meth:`move`.
        """
        if value_range is not None:
            low, high = value_range
            image = (image - low) / (high - low)

        image = image.to(self.device, dtype=self.clip.visual_projection.weight.dtype)

        if self.reverse_visual_projection:
            z = self.clip.vision_model(self.transform(image))[1]
        else:
            z = self.clip.get_image_features(self.transform(image))

        if self.normalize_embedding:
            z = z / z.norm(dim=-1, keepdim=True)
        if z.ndim == 2:
            z = z.unsqueeze(dim=-2)

        if zero_embedding_radio > 0:
            # ``keep`` is True with probability 1 - zero_embedding_radio, so a
            # sample is zeroed with probability zero_embedding_radio. The
            # comparison used to be ``<``, which kept only that fraction and
            # dropped everything else -- the opposite of the other image
            # encoders in this file.
            keep = torch.rand((len(image), 1, 1), device=z.device, dtype=z.dtype) >= zero_embedding_radio
            z = z * keep.to(z)

        if self.num_projection_vector > 0:
            z = self.projection(z).view(len(image), self.num_projection_vector, -1)

        return z

    def move(self):
        """Move the CLIP model to ``self.device`` (only on the first call)."""
        if self._move_flag:
            return

        self.clip_dict[self.clip_name] = self.clip_dict[self.clip_name].to(self.device)
        self._move_flag = True

    def encode(self, image):
        """Embed ``image`` with condition dropout at ``self.zero_embedding_radio``."""
        self.move()
        return self(image, zero_embedding_radio=self.zero_embedding_radio)
class FrozenCLIPImageGridEmbedder(AbstractEncoder):
    """CLIP vision encoder that returns the per-token ``last_hidden_state``.

    Args:
        version: model identifier passed to ``CLIPModel.from_pretrained``.
        device: device the CLIP model and the input images are moved to.
        zero_embedding_radio: per-sample probability with which :meth:`encode`
            replaces the image embedding by zeros.
    """

    def __init__(
        self,
        version="openai/clip-vit-large-patch14",
        device="cuda",
        zero_embedding_radio=0.1,
    ):
        super().__init__()

        self.device = device

        self.clip_dict = OrderedDict()
        self.clip_name = os.path.split(version)[-1]

        clip_model: CLIPModel = CLIPModel.from_pretrained(version)
        # Only the vision tower is used; drop the text branch.
        clip_model.text_model = None
        clip_model.text_projection = None
        clip_model = clip_model.eval()
        # Freeze the CLIP weights. ``self`` has no registered parameters here
        # (the model only lives in a plain dict), so the loop must iterate
        # over ``clip_model`` -- iterating ``self.parameters()`` was a no-op.
        for param in clip_model.parameters():
            param.requires_grad = False
        self.clip_dict[self.clip_name] = clip_model

        self.transform = transforms.Compose(
            [
                transforms.Resize(224, transforms.InterpolationMode.BILINEAR, antialias=True),
                transforms.CenterCrop(224),  # crop a (224, 224) square
                transforms.Normalize(
                    mean=[0.48145466, 0.4578275, 0.40821073],
                    std=[0.26862954, 0.26130258, 0.27577711],
                ),
            ]
        )
        self.zero_embedding_radio = zero_embedding_radio
        self.embedding_dim = clip_model.vision_embed_dim

        # Set once the CLIP model has been moved to ``self.device``.
        self._move_flag = False

    @property
    def clip(self):
        """The frozen CLIP model (vision branch only)."""
        return self.clip_dict[self.clip_name]

    def move(self):
        """Move the CLIP model to ``self.device`` (only on the first call)."""
        if self._move_flag:
            return

        self.clip_dict[self.clip_name] = self.clip_dict[self.clip_name].to(self.device)
        self._move_flag = True

    def unconditional_embedding(self, batch_size):
        """Return an all-zero ``(batch_size, num_positions, embedding_dim)`` tensor."""
        zero = torch.zeros(
            batch_size,
            self.clip.vision_model.embeddings.num_positions,
            self.embedding_dim,
            device=self.device,
            dtype=self.clip.visual_projection.weight.dtype,
        )
        return zero

    def forward(self, image, value_range=(-1, 1), zero_embedding_radio=0):
        """Embed ``image`` and return the vision model's ``last_hidden_state``.

        Args:
            image: image batch; rescaled to ``[0, 1]`` using ``value_range``
                before the resize / crop / normalize transform is applied.
            value_range: ``(low, high)`` range of the incoming pixel values,
                or ``None`` to skip the rescaling.
            zero_embedding_radio: per-sample probability of zeroing the
                embedding (condition dropout); ``0`` disables it.
        """
        self.move()

        if value_range is not None:
            low, high = value_range
            image = (image - low) / (high - low)

        image = image.to(self.device, dtype=self.clip.visual_projection.weight.dtype)

        z = self.clip.vision_model(self.transform(image)).last_hidden_state

        if zero_embedding_radio > 0:
            # mask is True (keep) with probability 1 - zero_embedding_radio.
            mask = torch.rand((len(image), 1, 1), device=z.device, dtype=z.dtype) >= zero_embedding_radio
            z = z * mask.to(z)

        return z

    def encode(self, image):
        """Embed ``image`` with condition dropout at ``self.zero_embedding_radio``."""
        return self(image, zero_embedding_radio=self.zero_embedding_radio)
class MoECLIPImageEncoder(nn.Module):
    """Image encoder that concatenates the features of one or more CLIP models.

    Args:
        versions: a CLIP model name or a sequence of names, each passed to
            ``clip.load``.
        hidden_state_dim: width of every projected output vector.
        num_projection_vector: number of output vectors per image; ``0``
            disables the projection (``nn.Identity``).
        zero_embedding_radio: per-sample probability with which :meth:`encode`
            replaces the image embedding by zeros.
        device: device the CLIP models are moved to by :meth:`move`.
        precision: one of ``"fp16"``, ``"fp32"``, ``"bf16"``.
        normalize: L2-normalize each CLIP feature before concatenation.
        clip_max: when ``> 0`` (and ``normalize`` is set), clamp features to
            ``[-clip_max, clip_max]``.
        transform_type: ``"base"`` or ``"crop_blur_resize"`` (adds random
            crop / blur / resize augmentations).
        argument_p: probability of each random augmentation.

    Raises:
        ValueError: if ``transform_type`` is not one of the two known values.
    """

    def __init__(
        self,
        versions,
        hidden_state_dim,
        num_projection_vector=8,
        zero_embedding_radio=0.1,
        device="cuda",
        precision="fp16",
        normalize=False,
        clip_max=0,
        transform_type="base",
        argument_p=0.2,
    ):
        super().__init__()

        self.device = torch.device(device)
        self.hidden_state_dim = hidden_state_dim
        self.zero_embedding_radio = zero_embedding_radio
        self.num_projection_vector = num_projection_vector
        self.dtype = dict(fp16=torch.float16, fp32=torch.float32, bf16=torch.bfloat16)[precision]
        self.normalize = normalize
        self.clip_max = clip_max

        if transform_type == "base":
            self.transform = transforms.Compose(
                [
                    transforms.Resize(224, transforms.InterpolationMode.BICUBIC, antialias=True),
                    transforms.CenterCrop(224),  # crop a (224, 224) square
                    transforms.Normalize(
                        mean=[0.48145466, 0.4578275, 0.40821073],
                        std=[0.26862954, 0.26130258, 0.27577711],
                    ),
                ]
            )
        elif transform_type == "crop_blur_resize":
            self.transform = transforms.Compose(
                [
                    transforms.Resize(224, transforms.InterpolationMode.BICUBIC, antialias=True),
                    transforms.CenterCrop(224),  # crop a (224, 224) square
                    transforms.RandomApply(
                        transforms=[
                            transforms.RandomResizedCrop(
                                size=224,
                                scale=(0.8, 1.0),
                                ratio=(0.99, 1.01),
                                interpolation=transforms.InterpolationMode.BICUBIC,
                            ),
                        ],
                        p=argument_p,
                    ),
                    transforms.RandomApply(
                        transforms=[
                            transforms.GaussianBlur(kernel_size=9, sigma=(0.1, 5)),
                        ],
                        p=argument_p,
                    ),
                    transforms.RandomApply(
                        transforms=[
                            RandomResize(size=224, resize_radio=(0.2, 1)),
                        ],
                        p=argument_p,
                    ),
                    transforms.Normalize(
                        mean=[0.48145466, 0.4578275, 0.40821073],
                        std=[0.26862954, 0.26130258, 0.27577711],
                    ),
                ]
            )
        else:
            raise ValueError(f"invalid {transform_type=}")

        if isinstance(versions, str):
            versions = (versions,)

        # If the clips were registered as sub-modules of this class:
        # 1. their (useless) weights would be stored in every checkpoint;
        # 2. PyTorch Lightning would call .to(), which would also cast the
        #    layer_norm weights to fp16.
        clips = OrderedDict()

        for v in versions:
            # Because the clips are not sub-modules, passing device="cuda"
            # directly would wrongly place all CLIP weights on cuda:0.
            # NOTE(review): ``clip`` is not among the imports visible at the
            # top of this file -- confirm it is imported, otherwise this line
            # raises NameError.
            clips[v], _ = clip.load(name=v, device="cpu", jit=False, download_root=None)
            # Only the image branch is used; drop the text transformer.
            delattr(clips[v], "transformer")
            clips[v].eval()
            clips[v].requires_grad_(False)

        self.clips_hidden_dim = sum(clips[v].ln_final.weight.size(0) for v in clips)

        if self.num_projection_vector == 0:
            self.projection = nn.Identity()
        else:
            self.projection = nn.Linear(self.clips_hidden_dim, hidden_state_dim * self.num_projection_vector, bias=True)
            self.projection.to(dtype=self.dtype)
            nn.init.normal_(self.projection.weight, std=self.clips_hidden_dim ** -0.5)

        self.clips = clips

        # Set once the CLIP models have been moved / converted by ``move``.
        self._move_flag = False

    def move(self):
        """Move every CLIP model to ``self.device`` and cast its weights (first call only)."""
        if self._move_flag:
            return

        def convert_weights(model: nn.Module):
            """Convert applicable model parameters to ``self.dtype``."""

            def _convert_weights_to_fp16(l):
                # Despite the name, the target dtype is ``self.dtype``.
                if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
                    l.weight.data = l.weight.data.type(self.dtype)
                    if l.bias is not None:
                        l.bias.data = l.bias.data.type(self.dtype)

                if isinstance(l, nn.MultiheadAttention):
                    for attr in [
                        *[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]],
                        "in_proj_bias",
                        "bias_k",
                        "bias_v",
                    ]:
                        tensor = getattr(l, attr)
                        if tensor is not None:
                            tensor.data = tensor.data.type(self.dtype)

                for name in ["text_projection", "proj"]:
                    if hasattr(l, name):
                        attr = getattr(l, name)
                        if attr is not None:
                            attr.data = attr.data.type(self.dtype)

            model.apply(_convert_weights_to_fp16)

        for k in self.clips:
            self.clips[k].to(self.device)
            convert_weights(self.clips[k])  # fp32 -> self.dtype
        self._move_flag = True

    def unconditional_embedding(self, batch_size=None):
        """Return the embedding used for the "no condition" case.

        An all-zero ``(batch_size, clips_hidden_dim)`` tensor, passed through
        ``self.projection`` when ``num_projection_vector > 0``.
        """
        zero = torch.zeros(
            batch_size,
            self.clips_hidden_dim,
            device=self.device,
            dtype=self.dtype,
        )
        if self.num_projection_vector > 0:
            zero = self.projection(zero).view(batch_size, self.num_projection_vector, -1)
        return zero

    def convert_embedding(self, z):
        """Project a pre-computed CLIP feature ``z`` (no-op when projection is disabled)."""
        if self.num_projection_vector > 0:
            z = self.projection(z.type(self.projection.weight.dtype)).view(len(z), self.num_projection_vector, -1)
        return z

    def forward(self, image, value_range=(-1, 1), zero_embedding_radio=0):
        """Embed ``image`` with every CLIP model and concatenate the features.

        Args:
            image: image batch; rescaled to ``[0, 1]`` using ``value_range``
                before ``self.transform`` is applied.
            value_range: ``(low, high)`` range of the incoming pixel values,
                or ``None`` to skip the rescaling.
            zero_embedding_radio: per-sample probability of zeroing the
                embedding (condition dropout); ``0`` disables it.
        """
        if value_range is not None:
            low, high = value_range
            image = (image - low) / (high - low)

        image = self.transform(image)

        with torch.no_grad():
            embs = []
            for v in self.clips:
                x = self.clips[v].encode_image(image)
                if self.normalize:
                    x = x / x.norm(p=2, dim=-1, keepdim=True) * (x.size(-1) ** 0.5)
                    # clip_max only works with normalization
                    if self.clip_max > 0:
                        x = x.clamp(-self.clip_max, self.clip_max)
                embs.append(x)

            z = torch.cat(embs, dim=-1)
            if self.normalize:
                z /= z.size(-1) ** 0.5

        if zero_embedding_radio > 0:
            # One keep/drop decision per sample, broadcast over the remaining
            # axes of ``z``. The previous code added a (B, 1, 1) mask to ``z``:
            # the addition never zeroed anything, and against a 2-D ``z`` it
            # broadcast to (B, B, D).
            mask_shape = (len(image),) + (1,) * (z.ndim - 1)
            mask = torch.rand(mask_shape, device=z.device, dtype=z.dtype) >= zero_embedding_radio
            z = z * mask.to(z)

        if self.num_projection_vector > 0:
            z = self.projection(z).view(len(image), self.num_projection_vector, -1)
        return z

    def encode(self, image):
        """Embed ``image`` with condition dropout at ``self.zero_embedding_radio``."""
        self.move()
        return self(image, zero_embedding_radio=self.zero_embedding_radio)

View File

@@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
from .checkpoint import checkpoint

View File

@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
"""
Adapted from: https://github.com/openai/guided-diffusion/blob/22e0df8183507e13a7813f8d38d51b072ca1e67c/guided_diffusion/nn.py#L124
"""
import torch
from typing import Callable, Iterable, Sequence, Union
def checkpoint(
    func: Callable[..., Union[torch.Tensor, Sequence[torch.Tensor]]],
    inputs: Sequence[torch.Tensor],
    params: Iterable[torch.Tensor],
    flag: bool,
    use_deepspeed: bool = False
):
    """
    Run `func(*inputs)`, optionally with gradient checkpointing: intermediate
    activations are not cached and are recomputed in the backward pass,
    trading extra compute for reduced memory.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing and call `func`
                 directly.
    :param use_deepspeed: if True (and `flag` is set), delegate to
                          deepspeed's checkpointing instead.
    """
    if not flag:
        return func(*inputs)

    if use_deepspeed:
        # Imported lazily so deepspeed is only needed when actually requested.
        import deepspeed
        return deepspeed.checkpointing.checkpoint(func, *inputs)

    # Inputs first, then parameters; the count tells CheckpointFunction where
    # to split them apart again.
    flat_args = (*inputs, *params)
    return CheckpointFunction.apply(func, len(inputs), *flat_args)
class CheckpointFunction(torch.autograd.Function):
    """Autograd function that recomputes ``run_function`` during backward.

    ``apply`` is called as ``apply(run_function, length, *args)`` where the
    first ``length`` entries of ``args`` are the inputs handed to
    ``run_function`` and the remaining entries are extra parameters that
    should receive gradients.
    """

    @staticmethod
    @torch.cuda.amp.custom_fwd
    def forward(ctx, run_function, length, *args):
        """Run ``run_function`` on the inputs without recording a graph."""
        ctx.run_function = run_function
        # Split the flat argument list back into inputs and parameters.
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])

        with torch.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    @torch.cuda.amp.custom_bwd
    def backward(ctx, *output_grads):
        """Re-run the forward pass with grad enabled and back-propagate through it.

        Returns one gradient slot per argument of ``forward``: ``None`` for
        ``run_function`` and ``length``, then the gradients of the inputs
        followed by those of the parameters.
        """
        # Detach so the recomputed graph starts at these tensors.
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = torch.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        # Drop the references held on ctx and the recomputed outputs.
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        return (None, None) + input_grads

View File

@@ -0,0 +1,218 @@
# -*- coding: utf-8 -*-
import math
import torch
import torch.nn as nn
from typing import Optional
from .checkpoint import checkpoint
from .transformer_blocks import (
init_linear,
MLP,
MultiheadCrossAttention,
MultiheadAttention,
ResidualAttentionBlock
)
class AdaLayerNorm(nn.Module):
    """Layer norm whose scale and shift are predicted from a conditioning tensor.

    Args:
        device: device for the created parameters.
        dtype: dtype for the created parameters.
        width: feature size of both the input and the conditioning tensor.
    """

    def __init__(self,
                 device: torch.device,
                 dtype: torch.dtype,
                 width: int):
        super().__init__()

        # NOTE(review): ``silu`` is created but never applied in ``forward``.
        self.silu = nn.SiLU(inplace=True)
        self.linear = nn.Linear(width, width * 2, device=device, dtype=dtype)
        self.layernorm = nn.LayerNorm(width, elementwise_affine=False, device=device, dtype=dtype)

    def forward(self, x, timestep):
        """Normalize ``x`` and modulate it with ``timestep``.

        ``timestep`` is mapped to ``2 * width`` features, which are split
        along dim 2 into a scale and a shift (so it must have at least three
        dimensions).
        """
        scale, shift = torch.chunk(self.linear(timestep), 2, dim=2)
        normed = self.layernorm(x)
        return normed * (1 + scale) + shift
class DitBlock(nn.Module):
    """Transformer block with timestep-conditioned layer norms.

    Applies self-attention, optional cross-attention to a context sequence,
    and an MLP, each as a residual branch behind an :class:`AdaLayerNorm`.

    Args:
        device: device for the created parameters.
        dtype: dtype for the created parameters.
        n_ctx: context length forwarded to the self-attention layer.
        width: feature size of the token stream.
        heads: number of attention heads.
        context_dim: feature size of the cross-attention context; ``None``
            builds the block without a cross-attention branch.
        qkv_bias: whether the q/k/v projections use a bias.
        init_scale: standard deviation used to initialize the linear layers.
        use_checkpoint: recompute the block in the backward pass to save
            memory.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        context_dim: Optional[int],
        qkv_bias: bool = False,
        init_scale: float = 1.0,
        use_checkpoint: bool = False
    ):
        super().__init__()

        self.use_checkpoint = use_checkpoint

        self.attn = MultiheadAttention(
            device=device,
            dtype=dtype,
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias
        )
        self.ln_1 = AdaLayerNorm(device, dtype, width)

        if context_dim is not None:
            self.ln_2 = AdaLayerNorm(device, dtype, width)
            self.cross_attn = MultiheadCrossAttention(
                device=device,
                dtype=dtype,
                width=width,
                heads=heads,
                data_width=context_dim,
                init_scale=init_scale,
                qkv_bias=qkv_bias
            )

        self.mlp = MLP(device=device, dtype=dtype, width=width, init_scale=init_scale)
        self.ln_3 = AdaLayerNorm(device, dtype, width)

    def forward(self, x: torch.Tensor, t: torch.Tensor, context: Optional[torch.Tensor] = None):
        """Run the block, with gradient checkpointing when ``use_checkpoint`` is set."""
        # Only tensors may be handed to the checkpoint wrapper: its backward
        # pass calls ``.detach()`` on every input, which fails on ``None``.
        # ``_forward`` already defaults ``context`` to ``None``.
        inputs = (x, t) if context is None else (x, t, context)
        return checkpoint(self._forward, inputs, self.parameters(), self.use_checkpoint)

    def _forward(self, x: torch.Tensor, t: torch.Tensor, context: Optional[torch.Tensor] = None):
        """Self-attention, optional cross-attention, then MLP (all residual)."""
        x = x + self.attn(self.ln_1(x, t))
        if context is not None:
            x = x + self.cross_attn(self.ln_2(x, t), context)
        x = x + self.mlp(self.ln_3(x, t))
        return x
class DiT(nn.Module):
    """A stack of ``layers`` :class:`DitBlock` modules applied in sequence.

    Args:
        device: device for the created parameters.
        dtype: dtype for the created parameters.
        n_ctx: context length forwarded to every block.
        width: feature size of the token stream.
        layers: number of blocks.
        heads: number of attention heads per block.
        context_dim: feature size of the cross-attention context.
        init_scale: standard deviation used to initialize the linear layers.
        qkv_bias: whether the q/k/v projections use a bias.
        use_checkpoint: enable gradient checkpointing inside every block.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_ctx: int,
        width: int,
        layers: int,
        heads: int,
        context_dim: int,
        init_scale: float = 0.25,
        qkv_bias: bool = False,
        use_checkpoint: bool = False
    ):
        super().__init__()
        self.n_ctx = n_ctx
        self.width = width
        self.layers = layers

        # Every block is built from the same keyword arguments.
        block_kwargs = dict(
            device=device,
            dtype=dtype,
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            context_dim=context_dim,
            qkv_bias=qkv_bias,
            init_scale=init_scale,
            use_checkpoint=use_checkpoint,
        )
        self.resblocks = nn.ModuleList(
            [DitBlock(**block_kwargs) for _ in range(layers)]
        )

    def forward(self, x: torch.Tensor, t: torch.Tensor, context: Optional[torch.Tensor] = None):
        """Pass ``x`` through every block, each receiving the same ``t`` and ``context``."""
        hidden = x
        for blk in self.resblocks:
            hidden = blk(hidden, t, context)
        return hidden
class UNetDiffusionTransformer(nn.Module):
    """U-Net shaped transformer with skip connections between encoder and decoder.

    ``layers`` encoder blocks are followed by one middle block and ``layers``
    decoder blocks. Each decoder stage concatenates the matching encoder
    output (last encoder output first) with the current features, maps the
    result back to ``width`` with a linear layer, optionally layer-normalizes
    it, and applies a residual attention block.

    Args:
        device: device for the created parameters.
        dtype: dtype for the created parameters.
        n_ctx: context length forwarded to every block.
        width: feature size of the token stream.
        layers: number of encoder blocks (and of decoder blocks).
        heads: number of attention heads per block.
        init_scale: standard deviation used to initialize the linear layers.
        qkv_bias: whether the q/k/v projections use a bias.
        skip_ln: add a ``LayerNorm`` after each skip-fusion linear layer.
        use_checkpoint: enable gradient checkpointing inside every block.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_ctx: int,
        width: int,
        layers: int,
        heads: int,
        init_scale: float = 0.25,
        qkv_bias: bool = False,
        skip_ln: bool = False,
        use_checkpoint: bool = False
    ):
        super().__init__()

        self.n_ctx = n_ctx
        self.width = width
        self.layers = layers

        def make_block():
            # All attention blocks of the network share one configuration.
            return ResidualAttentionBlock(
                device=device,
                dtype=dtype,
                n_ctx=n_ctx,
                width=width,
                heads=heads,
                init_scale=init_scale,
                qkv_bias=qkv_bias,
                use_checkpoint=use_checkpoint
            )

        self.encoder = nn.ModuleList([make_block() for _ in range(layers)])

        self.middle_block = make_block()

        self.decoder = nn.ModuleList()
        for _ in range(layers):
            stage_block = make_block()
            # Fuses [skip, x] (2 * width features) back to width features.
            fuse = nn.Linear(width * 2, width, device=device, dtype=dtype)
            init_linear(fuse, init_scale)
            norm = nn.LayerNorm(width, device=device, dtype=dtype) if skip_ln else None
            self.decoder.append(nn.ModuleList([stage_block, fuse, norm]))

    def forward(self, x: torch.Tensor):
        """Run encoder, middle block and decoder; returns the final features."""
        skips = []
        for enc_block in self.encoder:
            x = enc_block(x)
            skips.append(x)

        x = self.middle_block(x)

        for resblock, linear, layer_norm in self.decoder:
            # Skips are consumed in reverse order: last encoder output first.
            x = linear(torch.cat([skips.pop(), x], dim=-1))
            if layer_norm is not None:
                x = layer_norm(x)
            x = resblock(x)

        return x

View File

@@ -0,0 +1,100 @@
import torch
import numpy as np
from typing import Union, List
class AbstractDistribution(object):
    """Interface for distributions offering ``sample()`` and ``mode()``."""

    def sample(self):
        """Draw a sample. Must be provided by subclasses."""
        raise NotImplementedError

    def mode(self):
        """Return the mode. Must be provided by subclasses."""
        raise NotImplementedError
class DiracDistribution(AbstractDistribution):
    """Distribution concentrated on a single fixed value."""

    def __init__(self, value):
        # The one value this distribution can produce.
        self.value = value

    def sample(self):
        """Return the stored value (there is no randomness)."""
        fixed = self.value
        return fixed

    def mode(self):
        """Return the stored value."""
        fixed = self.value
        return fixed
class DiagonalGaussianDistribution(object):
    """Gaussian with diagonal covariance, parameterized by mean and log-variance.

    Args:
        parameters: either a ``[mean, logvar]`` list, or a single tensor that
            is split into two equal chunks (mean first, then log-variance)
            along ``feat_dim``.
        deterministic: when True the standard deviation and variance are set
            to zero, so ``sample()`` returns the mean.
        feat_dim: axis along which a tensor ``parameters`` is split.
    """

    def __init__(self, parameters: Union[torch.Tensor, List[torch.Tensor]], deterministic=False, feat_dim=1):
        self.feat_dim = feat_dim
        self.parameters = parameters

        if isinstance(parameters, list):
            self.mean = parameters[0]
            self.logvar = parameters[1]
        else:
            self.mean, self.logvar = torch.chunk(parameters, 2, dim=feat_dim)

        # Clamp so that exp() below stays finite.
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean)

    def sample(self):
        """Draw ``mean + std * eps`` with ``eps`` standard normal."""
        x = self.mean + self.std * torch.randn_like(self.mean)
        return x

    def kl(self, other=None, dims=(1, 2, 3)):
        """KL divergence to ``other`` (or to a standard normal when ``None``).

        The element-wise terms are averaged over ``dims``. For a
        deterministic distribution a one-element zero tensor is returned.
        """
        if self.deterministic:
            # Same device and dtype as the mean, so the placeholder can be
            # combined with other loss terms without a device mismatch
            # (``torch.Tensor([0.])`` was always a CPU float32 tensor).
            return self.mean.new_zeros(1)
        else:
            if other is None:
                return 0.5 * torch.mean(torch.pow(self.mean, 2)
                                        + self.var - 1.0 - self.logvar,
                                        dim=dims)
            else:
                return 0.5 * torch.mean(
                    torch.pow(self.mean - other.mean, 2) / other.var
                    + self.var / other.var - 1.0 - self.logvar + other.logvar,
                    dim=dims)

    def nll(self, sample, dims=(1, 2, 3)):
        """Negative log-likelihood of ``sample``, summed over ``dims``.

        For a deterministic distribution a one-element zero tensor is
        returned.
        """
        if self.deterministic:
            # See ``kl``: keep the placeholder on the mean's device / dtype.
            return self.mean.new_zeros(1)
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims)

    def mode(self):
        """Return the mean, which is the mode of a Gaussian."""
        return self.mean
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
    Element-wise KL divergence KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2))).
    Arguments may be tensors or Python scalars (at least one must be a
    tensor); shapes are broadcast against each other.
    """
    # The first tensor argument supplies device and dtype for scalar
    # log-variances.
    reference = next(
        (arg for arg in (mean1, logvar1, mean2, logvar2) if isinstance(arg, torch.Tensor)),
        None,
    )
    assert reference is not None, "at least one argument must be a Tensor"

    def _as_tensor(value):
        # torch.exp() needs a Tensor; broadcasting alone would not convert
        # a Python scalar.
        if isinstance(value, torch.Tensor):
            return value
        return torch.tensor(value).to(reference)

    logvar1 = _as_tensor(logvar1)
    logvar2 = _as_tensor(logvar2)

    variance_ratio = torch.exp(logvar1 - logvar2)
    scaled_sq_diff = ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + variance_ratio
        + scaled_sq_diff
    )

View File

@@ -0,0 +1,213 @@
# -*- coding: utf-8 -*-
import numpy as np
import torch
import torch.nn as nn
import math
# Embedder type names reported in the error message of ``get_embedder``.
# NOTE(review): ``get_embedder`` only implements "identity" and "fourier";
# "hashgrid" and "sphere_harmonic" raise NotImplementedError and
# "triplane_fourier" falls through to the ValueError branch.
VALID_EMBED_TYPES = ["identity", "fourier", "hashgrid", "sphere_harmonic", "triplane_fourier"]
class FourierEmbedder(nn.Module):
    """The sin/cosine positional embedding.

    Given an input tensor `x` of shape [..., c_dim], every coordinate
    `x[..., i]` is multiplied by each of the `num_freqs` frequencies `f_j`.
    The output is the concatenation along the last axis of:

        [
            x,                   # only present if include_input is True
            sin(f_j * x[..., i]) for every i, then every j,
            cos(f_j * x[..., i]) for every i, then every j,
        ]

    If `num_freqs` is 0 the input is returned unchanged.

    Args:
        num_freqs (int): the number of frequencies, default is 6;
        logspace (bool): if True the frequencies are [2^0, 2^1, ..., 2^(num_freqs - 1)],
            otherwise they are linearly spaced between 1.0 and 2^(num_freqs - 1);
        input_dim (int): the input dimension, default is 3;
        include_input (bool): include the input tensor or not, default is True;
        include_pi (bool): multiply every frequency by pi, default is True.

    Attributes:
        frequencies (torch.Tensor): the frequencies described above, stored as
            a non-persistent buffer;
        out_dim (int): the embedding size: input_dim * (num_freqs * 2 + 1) if
            include_input is True (or num_freqs is 0), otherwise
            input_dim * num_freqs * 2.
    """

    def __init__(self,
                 num_freqs: int = 6,
                 logspace: bool = True,
                 input_dim: int = 3,
                 include_input: bool = True,
                 include_pi: bool = True) -> None:
        """Build the frequency table and compute the output size."""
        super().__init__()

        if logspace:
            exponents = torch.arange(num_freqs, dtype=torch.float32)
            frequencies = 2.0 ** exponents
        else:
            highest = 2.0 ** (num_freqs - 1)
            frequencies = torch.linspace(1.0, highest, num_freqs, dtype=torch.float32)

        if include_pi:
            frequencies *= torch.pi

        self.register_buffer("frequencies", frequencies, persistent=False)
        self.include_input = include_input
        self.num_freqs = num_freqs

        self.out_dim = self.get_dims(input_dim)

    def get_dims(self, input_dim):
        """Return the size of the last axis produced for `input_dim` input features."""
        # With no frequencies the input is passed through, so it always counts.
        temp = 1 if self.include_input or self.num_freqs == 0 else 0
        return input_dim * (self.num_freqs * 2 + temp)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed `x`.

        Args:
            x: tensor of shape [..., dim]

        Returns:
            embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
                where temp is 1 if include_input is True and 0 otherwise.
        """
        if not self.num_freqs > 0:
            return x

        # [..., dim, 1] * [num_freqs] -> [..., dim, num_freqs] -> [..., dim * num_freqs]
        phases = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1)
        pieces = (phases.sin(), phases.cos())
        if self.include_input:
            pieces = (x,) + pieces
        return torch.cat(pieces, dim=-1)
class LearnedFourierEmbedder(nn.Module):
    """Sinusoidal embedding with learned frequencies.

    Follows @crowsonkb's lead with learned sinusoidal positional embeddings:
    https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/models/danbooru_128.py#L8

    Args:
        in_channels: number of input channels ``c``.
        dim: requested embedding size (must be even); each channel gets
            ``(dim // 2) // in_channels`` learned frequencies.
    """

    def __init__(self, in_channels, dim):
        super().__init__()
        assert (dim % 2) == 0
        freqs_per_channel = (dim // 2) // in_channels
        # One set of frequencies, shared by every input channel.
        self.weights = nn.Parameter(torch.randn(freqs_per_channel))

    def forward(self, x):
        """
        Args:
            x (torch.FloatTensor): [..., c]

        Returns:
            x (torch.FloatTensor): [..., d], the concatenation of the input,
                the sines and the cosines.
        """
        leading_shape = x.shape[:-1]
        # [b, t, c, 1] * [1, d] = [b, t, c, d] -> [b, t, c * d]
        angles = x[..., None] * self.weights[None] * 2 * np.pi
        angles = angles.view(*leading_shape, -1)
        return torch.cat((x, angles.sin(), angles.cos()), dim=-1)
class TriplaneLearnedFourierEmbedder(nn.Module):
    """Sum of three independently parameterized learned Fourier embeddings.

    All three sub-embedders (named after the yz, xz and xy planes) receive
    the same input; their outputs are added element-wise.

    Args:
        in_channels: number of input channels.
        dim: embedding size requested from each sub-embedder.
    """

    def __init__(self, in_channels, dim):
        super().__init__()

        # Created in this order so parameter initialization draws from the
        # random generator in a fixed sequence.
        self.yz_plane_embedder = LearnedFourierEmbedder(in_channels, dim)
        self.xz_plane_embedder = LearnedFourierEmbedder(in_channels, dim)
        self.xy_plane_embedder = LearnedFourierEmbedder(in_channels, dim)

        self.out_dim = in_channels + dim

    def forward(self, x):
        """Embed ``x`` with each plane embedder and return the sum."""
        per_plane = (
            self.yz_plane_embedder(x),
            self.xz_plane_embedder(x),
            self.xy_plane_embedder(x),
        )
        return per_plane[0] + per_plane[1] + per_plane[2]
def sequential_pos_embed(num_len, embed_dim):
    """Build fixed sinusoidal embeddings for positions ``0 .. num_len - 1``.

    Args:
        num_len: number of positions M.
        embed_dim: embedding size D (must be even).

    Returns:
        A float32 tensor of shape (M, D): sines in the first D/2 columns,
        cosines in the last D/2.
    """
    assert embed_dim % 2 == 0

    positions = torch.arange(num_len, dtype=torch.float32).reshape(-1)  # (M,)

    exponents = torch.arange(embed_dim // 2, dtype=torch.float32)
    exponents /= embed_dim / 2.
    inv_freq = 1. / 10000 ** exponents  # (D/2,)

    # Outer product of positions and inverse frequencies: (M, D/2)
    angles = torch.einsum("m,d->md", positions, inv_freq)

    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)  # (M, D)
def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.
    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings: cosines in the
             first half, sines in the second, plus one zero column when
             `dim` is odd.
    """
    half = dim // 2
    # Geometric progression from 1 down towards 1 / max_period.
    exponents = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
    freqs = torch.exp(exponents).to(device=timesteps.device)

    angles = timesteps[:, None] * freqs[None]
    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)

    if dim % 2:
        # Pad with a zero column so the result has exactly `dim` features.
        pad = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, pad], dim=-1)
    return embedding
def get_embedder(embed_type="fourier", num_freqs=-1, input_dim=3, degree=4,
                 num_levels=16, level_dim=2, per_level_scale=2, base_resolution=16,
                 log2_hashmap_size=19, desired_resolution=None):
    """Create a positional embedder and report its output size.

    Args:
        embed_type: "identity" or "fourier"; "hashgrid" and "sphere_harmonic"
            are recognized but not implemented.
        num_freqs: number of Fourier frequencies; -1 turns the "fourier" type
            into an identity mapping.
        input_dim: size of the last axis of the embedder's input.

    The remaining parameters are accepted but not used by the implemented
    embedder types.

    Returns:
        An ``(embedder, out_dim)`` tuple.

    Raises:
        NotImplementedError: for "hashgrid" and "sphere_harmonic".
        ValueError: for any other unknown ``embed_type``.
    """
    wants_identity = embed_type == "identity" or (embed_type == "fourier" and num_freqs == -1)
    if wants_identity:
        return nn.Identity(), input_dim

    if embed_type == "fourier":
        fourier = FourierEmbedder(num_freqs=num_freqs, input_dim=input_dim,
                                  logspace=True, include_input=True)
        return fourier, fourier.out_dim

    if embed_type == "hashgrid":
        raise NotImplementedError

    if embed_type == "sphere_harmonic":
        raise NotImplementedError

    raise ValueError(f"{embed_type} is not valid. Currently only supprts {VALID_EMBED_TYPES}")

View File

@@ -0,0 +1,286 @@
# -*- coding: utf-8 -*-
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
from .checkpoint import checkpoint
def init_linear(l, stddev):
    """Initialize a linear layer in place.

    The weight is drawn from a normal distribution with standard deviation
    ``stddev``; the bias, when the layer has one, is set to zero.
    """
    nn.init.normal_(l.weight, std=stddev)
    bias = l.bias
    if bias is None:
        return
    nn.init.constant_(bias, 0.0)
class MultiheadAttention(nn.Module):
    """Multi-head self-attention with a fused q/k/v projection.

    Args:
        device: device for the created parameters.
        dtype: dtype for the created parameters.
        n_ctx: context length (stored and forwarded to the attention core).
        width: feature size of the input and of the output.
        heads: number of attention heads.
        init_scale: standard deviation used to initialize both projections.
        qkv_bias: whether the fused q/k/v projection uses a bias.
        flash: forwarded to the attention core.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        init_scale: float,
        qkv_bias: bool,
        flash: bool = False
    ):
        super().__init__()
        self.n_ctx = n_ctx
        self.width = width
        self.heads = heads

        factory = dict(device=device, dtype=dtype)
        self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias, **factory)
        self.c_proj = nn.Linear(width, width, **factory)
        self.attention = QKVMultiheadAttention(heads=heads, n_ctx=n_ctx, flash=flash, **factory)

        for layer in (self.c_qkv, self.c_proj):
            init_linear(layer, init_scale)

    def forward(self, x):
        """Project to q/k/v, attend, and project back to ``width`` features."""
        qkv = self.c_qkv(x)
        # The attention core is always run under gradient checkpointing.
        attended = checkpoint(self.attention, (qkv,), (), True)
        return self.c_proj(attended)
class QKVMultiheadAttention(nn.Module):
    """Parameter-free multi-head self-attention over a fused q/k/v tensor.

    Args:
        device: stored only; the module owns no parameters.
        dtype: stored only.
        heads: number of attention heads.
        n_ctx: stored only; the sequence length is taken from the input.
        flash: use ``F.scaled_dot_product_attention`` instead of the explicit
            einsum / softmax implementation.
    """

    def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int, n_ctx: int, flash: bool = False):
        super().__init__()
        self.device = device
        self.dtype = dtype
        self.heads = heads
        self.n_ctx = n_ctx
        self.flash = flash

    def forward(self, qkv):
        """Attend over the sequence axis.

        Args:
            qkv: tensor of shape (batch, n_ctx, 3 * width); for every head
                the last axis holds that head's q, k and v channels
                back to back.

        Returns:
            Tensor of shape (batch, n_ctx, width).
        """
        bs, n_ctx, width = qkv.shape
        attn_ch = width // self.heads // 3
        qkv = qkv.view(bs, n_ctx, self.heads, -1)
        q, k, v = torch.split(qkv, attn_ch, dim=-1)  # each (bs, n_ctx, heads, attn_ch)

        if self.flash:
            # scaled_dot_product_attention attends over the second-to-last
            # axis and treats all leading axes as batch, so the heads must be
            # moved in front of the sequence axis. Passing the
            # (bs, n_ctx, heads, ch) tensors directly attended across heads
            # and returned an un-flattened 4-D tensor. The default scale,
            # 1 / sqrt(attn_ch), matches the explicit path below.
            out = F.scaled_dot_product_attention(
                q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
            )
            out = out.transpose(1, 2).reshape(bs, n_ctx, -1)
        else:
            scale = 1 / math.sqrt(math.sqrt(attn_ch))
            weight = torch.einsum(
                "bthc,bshc->bhts", q * scale, k * scale
            )  # More stable with f16 than dividing afterwards
            wdtype = weight.dtype
            # Softmax in float32, then back to the working dtype.
            weight = torch.softmax(weight.float(), dim=-1).type(wdtype)
            out = torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)

        return out
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention and MLP, both residual.

    Args:
        device: device for the created parameters.
        dtype: dtype for the created parameters.
        n_ctx: context length forwarded to the attention layer.
        width: feature size of the token stream.
        heads: number of attention heads.
        init_scale: standard deviation used to initialize the linear layers.
        qkv_bias: whether the q/k/v projection uses a bias.
        flash: forwarded to the attention layer.
        use_checkpoint: recompute the block in the backward pass to save
            memory.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        init_scale: float = 1.0,
        qkv_bias: bool = True,
        flash: bool = False,
        use_checkpoint: bool = False
    ):
        super().__init__()

        self.use_checkpoint = use_checkpoint

        factory = dict(device=device, dtype=dtype)
        self.attn = MultiheadAttention(
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            **factory
        )
        self.ln_1 = nn.LayerNorm(width, **factory)
        self.mlp = MLP(width=width, init_scale=init_scale, **factory)
        self.ln_2 = nn.LayerNorm(width, **factory)

    def forward(self, x: torch.Tensor):
        """Run the block, with gradient checkpointing when ``use_checkpoint`` is set."""
        return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint)

    def _forward(self, x: torch.Tensor):
        """The actual computation: attention branch, then MLP branch."""
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))
class MultiheadCrossAttention(nn.Module):
    """Multi-head cross-attention from a query stream to a data stream.

    Args:
        device: device for the created parameters.
        dtype: dtype for the created parameters.
        width: feature size of the query stream and of the output.
        heads: number of attention heads.
        init_scale: standard deviation used to initialize all projections.
        qkv_bias: whether the q and k/v projections use a bias.
        flash: forwarded to the attention core.
        n_data: stored and forwarded to the attention core.
        data_width: feature size of the data stream; defaults to ``width``.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        width: int,
        heads: int,
        init_scale: float,
        qkv_bias: bool = True,
        flash: bool = False,
        n_data: Optional[int] = None,
        data_width: Optional[int] = None,
    ):
        super().__init__()
        self.n_data = n_data
        self.width = width
        self.heads = heads
        if data_width is None:
            data_width = width
        self.data_width = data_width

        factory = dict(device=device, dtype=dtype)
        self.c_q = nn.Linear(width, width, bias=qkv_bias, **factory)
        self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias, **factory)
        self.c_proj = nn.Linear(width, width, **factory)
        self.attention = QKVMultiheadCrossAttention(
            heads=heads, n_data=n_data, flash=flash, **factory
        )

        for layer in (self.c_q, self.c_kv, self.c_proj):
            init_linear(layer, init_scale)

    def forward(self, x, data):
        """Attend from ``x`` (queries) to ``data`` (keys / values)."""
        queries = self.c_q(x)
        keys_values = self.c_kv(data)
        # The attention core is always run under gradient checkpointing.
        attended = checkpoint(self.attention, (queries, keys_values), (), True)
        return self.c_proj(attended)
class QKVMultiheadCrossAttention(nn.Module):
    """Parameter-free multi-head cross-attention over q and fused k/v tensors.

    Args:
        device: stored only; the module owns no parameters.
        dtype: stored only.
        heads: number of attention heads.
        flash: use ``F.scaled_dot_product_attention`` instead of the explicit
            einsum / softmax implementation.
        n_data: stored only; the data length is taken from the input.
    """

    def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int,
                 flash: bool = False, n_data: Optional[int] = None):

        super().__init__()
        self.device = device
        self.dtype = dtype
        self.heads = heads
        self.n_data = n_data
        self.flash = flash

    def forward(self, q, kv):
        """Attend from the query tokens to the data tokens.

        Args:
            q: tensor of shape (batch, n_ctx, width).
            kv: tensor of shape (batch, n_data, 2 * width); for every head
                the last axis holds that head's k and v channels back to back.

        Returns:
            Tensor of shape (batch, n_ctx, width).
        """
        _, n_ctx, _ = q.shape
        bs, n_data, width = kv.shape
        attn_ch = width // self.heads // 2
        q = q.view(bs, n_ctx, self.heads, -1)
        kv = kv.view(bs, n_data, self.heads, -1)
        k, v = torch.split(kv, attn_ch, dim=-1)  # each (bs, n_data, heads, attn_ch)

        if self.flash:
            # scaled_dot_product_attention attends over the second-to-last
            # axis and treats all leading axes as batch, so the heads must be
            # moved in front of the sequence axis. Passing the
            # (bs, tokens, heads, ch) tensors directly attended across heads
            # and failed outright whenever n_ctx != n_data. The default scale,
            # 1 / sqrt(attn_ch), matches the explicit path below.
            out = F.scaled_dot_product_attention(
                q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
            )
            out = out.transpose(1, 2).reshape(bs, n_ctx, -1)
        else:
            scale = 1 / math.sqrt(math.sqrt(attn_ch))
            weight = torch.einsum(
                "bthc,bshc->bhts", q * scale, k * scale
            )  # More stable with f16 than dividing afterwards
            wdtype = weight.dtype
            # Softmax in float32, then back to the working dtype.
            weight = torch.softmax(weight.float(), dim=-1).type(wdtype)
            out = torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)

        return out
class ResidualCrossAttentionBlock(nn.Module):
    """Pre-norm block: cross-attention to a data stream, then an MLP (both residual).

    Args:
        device: device for the created parameters.
        dtype: dtype for the created parameters.
        n_data: forwarded to the cross-attention layer.
        width: feature size of the query stream.
        heads: number of attention heads.
        data_width: feature size of the data stream; defaults to ``width``.
        init_scale: standard deviation used to initialize the linear layers.
        qkv_bias: whether the q and k/v projections use a bias.
        flash: forwarded to the cross-attention layer.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_data: Optional[int] = None,
        width: int,
        heads: int,
        data_width: Optional[int] = None,
        init_scale: float = 0.25,
        qkv_bias: bool = True,
        flash: bool = False
    ):
        super().__init__()

        data_width = width if data_width is None else data_width
        factory = dict(device=device, dtype=dtype)

        self.attn = MultiheadCrossAttention(
            n_data=n_data,
            width=width,
            heads=heads,
            data_width=data_width,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            **factory
        )
        # ln_1 normalizes the queries, ln_2 the data stream, ln_3 the MLP input.
        self.ln_1 = nn.LayerNorm(width, **factory)
        self.ln_2 = nn.LayerNorm(data_width, **factory)
        self.mlp = MLP(width=width, init_scale=init_scale, **factory)
        self.ln_3 = nn.LayerNorm(width, **factory)

    def forward(self, x: torch.Tensor, data: torch.Tensor):
        """Update ``x`` by attending to ``data`` and applying the MLP."""
        attended = x + self.attn(self.ln_1(x), self.ln_2(data))
        return attended + self.mlp(self.ln_3(attended))
class MLP(nn.Module):
    """Two-layer feed-forward block: ``width -> 4 * width -> width`` with GELU."""

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 width: int,
                 init_scale: float):
        super().__init__()
        hidden = width * 4
        self.width = width
        # Both layers are created before either is re-initialised, matching
        # the original order of random-number consumption.
        self.c_fc = nn.Linear(width, hidden, device=device, dtype=dtype)
        self.c_proj = nn.Linear(hidden, width, device=device, dtype=dtype)
        self.gelu = nn.GELU()
        for layer in (self.c_fc, self.c_proj):
            init_linear(layer, init_scale)

    def forward(self, x):
        """Apply expand -> GELU -> project."""
        hidden = self.c_fc(x)
        activated = self.gelu(hidden)
        return self.c_proj(activated)
class Transformer(nn.Module):
    """Sequential stack of ``layers`` identical ``ResidualAttentionBlock``s."""

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_ctx: int,
        width: int,
        layers: int,
        heads: int,
        init_scale: float = 0.25,
        qkv_bias: bool = True,
        flash: bool = False,
        use_checkpoint: bool = False
    ):
        super().__init__()
        self.n_ctx = n_ctx
        self.width = width
        self.layers = layers

        # Every block receives exactly the same configuration.
        block_kwargs = dict(
            device=device,
            dtype=dtype,
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_checkpoint=use_checkpoint,
        )
        self.resblocks = nn.ModuleList(
            [ResidualAttentionBlock(**block_kwargs) for _ in range(layers)]
        )

    def forward(self, x: torch.Tensor):
        """Feed ``x`` through every residual block in order."""
        hidden = x
        for resblock in self.resblocks:
            hidden = resblock(hidden)
        return hidden

View File

@@ -0,0 +1,308 @@
# -*- coding: utf-8 -*-
import math
import torch
import torch.nn as nn
from typing import Optional
import warnings
from .checkpoint import checkpoint
def _trunc_normal_(tensor, mean, std, a, b):
# Cut & paste from PyTorch official master until it's in a few official releases - RW
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
def norm_cdf(x):
# Computes standard normal cumulative distribution function
return (1. + math.erf(x / math.sqrt(2.))) / 2.
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
"The distribution of values may be incorrect.",
stacklevel=2)
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
tensor.uniform_(2 * l - 1, 2 * u - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
tensor.erfinv_()
# Transform to proper mean, std
tensor.mul_(std * math.sqrt(2.))
tensor.add_(mean)
# Clamp to ensure it's in the proper range
tensor.clamp_(min=a, max=b)
return tensor
def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    # type: (Tensor | nn.Parameter, float, float, float, float) -> Tensor
    r"""Fill ``tensor`` in place from a truncated normal distribution.

    Values follow :math:`\mathcal{N}(\text{mean}, \text{std}^2)` restricted to
    :math:`[a, b]`. The sampling method works best when
    :math:`a \leq \text{mean} \leq b`.

    NOTE: the bounds ``[a, b]`` apply to the distribution *after* ``mean`` and
    ``std`` have been applied, so they should be chosen in the same range as
    those arguments.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value

    Returns:
        The filled input tensor.

    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    # The fill runs with gradient tracking disabled.
    with torch.no_grad():
        filled = _trunc_normal_(tensor, mean, std, a, b)
    return filled
def init_weights(m):
    """Initialise a single module in place; intended for ``nn.Module.apply``.

    * ``nn.Linear``: weight drawn from a truncated normal with ``std=0.02``,
      bias (if present) set to zero.
    * ``nn.LayerNorm``: bias set to zero and weight to one, when present.
    * Any other module type is left untouched.

    Args:
        m: the module to initialise.
    """
    if isinstance(m, nn.Linear):
        trunc_normal_(m.weight, std=.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.LayerNorm):
        # A LayerNorm built with elementwise_affine=False (or bias=False) has
        # None for these parameters; skip them instead of crashing.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
        if m.weight is not None:
            nn.init.constant_(m.weight, 1.0)
class MultiheadAttention(nn.Module):
    """Self-attention layer with a fused q/k/v projection and an output projection."""

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        qkv_bias: bool
    ):
        super().__init__()
        factory = dict(device=device, dtype=dtype)

        self.n_ctx = n_ctx
        self.width = width
        self.heads = heads
        # A single linear layer emits queries, keys and values (3 * width).
        self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias, **factory)
        self.c_proj = nn.Linear(width, width, **factory)
        self.attention = QKVMultiheadAttention(heads=heads, n_ctx=n_ctx, **factory)

    def forward(self, x):
        """Project ``x`` to q/k/v, attend, and project the result back."""
        packed = self.c_qkv(x)
        # The attention core always goes through the project's `checkpoint`
        # helper with its flag set to True (presumably activation
        # checkpointing -- confirm in `.checkpoint`).
        attended = checkpoint(self.attention, (packed,), (), True)
        return self.c_proj(attended)
class QKVMultiheadAttention(nn.Module):
    """Parameter-free multi-head self-attention over a packed q/k/v tensor."""

    def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int, n_ctx: int):
        super().__init__()
        self.device, self.dtype = device, dtype
        self.heads, self.n_ctx = heads, n_ctx

    def forward(self, qkv):
        """Attend within the sequence.

        Args:
            qkv: ``[bs, n_ctx, 3 * heads * attn_ch]``; per head the channels are
                laid out as query, key, value.

        Returns:
            ``[bs, n_ctx, heads * attn_ch]`` attention output.
        """
        batch, seq_len, packed_width = qkv.shape
        head_dim = packed_width // self.heads // 3
        inv_sqrt_dim = 1 / math.sqrt(head_dim)

        per_head = qkv.view(batch, seq_len, self.heads, -1)
        query, key, value = torch.split(per_head, head_dim, dim=-1)

        logits = torch.einsum("bthc,bshc->bhts", query, key) * inv_sqrt_dim
        # Softmax is evaluated in float32 and cast back to the logits' dtype.
        probs = torch.softmax(logits.float(), dim=-1).type(logits.dtype)
        mixed = torch.einsum("bhts,bshc->bthc", probs, value)
        return mixed.reshape(batch, seq_len, -1)
class ResidualAttentionBlock(nn.Module):
    """Pre-norm residual block: self-attention followed by an MLP."""

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        qkv_bias: bool = True,
        use_checkpoint: bool = False
    ):
        super().__init__()
        factory = dict(device=device, dtype=dtype)

        self.use_checkpoint = use_checkpoint
        self.attn = MultiheadAttention(
            n_ctx=n_ctx, width=width, heads=heads, qkv_bias=qkv_bias, **factory
        )
        self.ln_1 = nn.LayerNorm(width, **factory)
        self.mlp = MLP(width=width, **factory)
        self.ln_2 = nn.LayerNorm(width, **factory)

    def _forward(self, x: torch.Tensor):
        """The actual computation; ``forward`` wraps it in ``checkpoint``."""
        attended = self.attn(self.ln_1(x))
        x = x + attended
        expanded = self.mlp(self.ln_2(x))
        return x + expanded

    def forward(self, x: torch.Tensor):
        """Run the block through the project's ``checkpoint`` helper.

        ``use_checkpoint`` is forwarded as the helper's flag argument.
        """
        params = self.parameters()
        return checkpoint(self._forward, (x,), params, self.use_checkpoint)
class MultiheadCrossAttention(nn.Module):
    """Cross-attention layer: queries from ``x``, keys/values from ``data``."""

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        width: int,
        heads: int,
        qkv_bias: bool = True,
        n_data: Optional[int] = None,
        data_width: Optional[int] = None,
    ):
        super().__init__()
        factory = dict(device=device, dtype=dtype)

        self.n_data = n_data
        self.width = width
        self.heads = heads
        # The context width falls back to the query width when unspecified.
        if data_width is None:
            data_width = width
        self.data_width = data_width

        self.c_q = nn.Linear(width, width, bias=qkv_bias, **factory)
        # Keys and values are produced together, hence 2 * width outputs.
        self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias, **factory)
        self.c_proj = nn.Linear(width, width, **factory)
        self.attention = QKVMultiheadCrossAttention(heads=heads, n_data=n_data, **factory)

    def forward(self, x, data):
        """Attend from ``x`` to ``data`` and return the projected result."""
        queries = self.c_q(x)
        keys_values = self.c_kv(data)
        # The attention core always goes through the project's `checkpoint`
        # helper with its flag set to True (presumably activation
        # checkpointing -- confirm in `.checkpoint`).
        attended = checkpoint(self.attention, (queries, keys_values), (), True)
        return self.c_proj(attended)
class QKVMultiheadCrossAttention(nn.Module):
    """Parameter-free multi-head cross-attention over pre-projected tensors."""

    def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int, n_data: Optional[int] = None):
        super().__init__()
        self.device, self.dtype = device, dtype
        self.heads, self.n_data = heads, n_data

    def forward(self, q, kv):
        """Attend from ``q`` to the packed keys/values in ``kv``.

        Args:
            q: ``[bs, n_ctx, heads * attn_ch]`` query tensor.
            kv: ``[bs, n_data, 2 * heads * attn_ch]``; per head the first
                ``attn_ch`` channels are the key, the rest the value.

        Returns:
            ``[bs, n_ctx, heads * attn_ch]`` attention output.
        """
        query_len = q.shape[1]
        batch, context_len, packed_width = kv.shape
        head_dim = packed_width // self.heads // 2
        inv_sqrt_dim = 1 / math.sqrt(head_dim)

        query = q.view(batch, query_len, self.heads, -1)
        per_head = kv.view(batch, context_len, self.heads, -1)
        key, value = torch.split(per_head, head_dim, dim=-1)

        logits = torch.einsum("bthc,bshc->bhts", query, key) * inv_sqrt_dim
        # Softmax is evaluated in float32 and cast back to the logits' dtype.
        probs = torch.softmax(logits.float(), dim=-1).type(logits.dtype)
        mixed = torch.einsum("bhts,bshc->bthc", probs, value)
        return mixed.reshape(batch, query_len, -1)
class ResidualCrossAttentionBlock(nn.Module):
    """Pre-norm residual block: cross-attention followed by an MLP.

    ``x`` and ``data`` are layer-normalised separately before attention; the
    attention and MLP outputs are each added back onto ``x``.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_data: Optional[int] = None,
        width: int,
        heads: int,
        data_width: Optional[int] = None,
        qkv_bias: bool = True
    ):
        super().__init__()
        factory = dict(device=device, dtype=dtype)
        # The context width falls back to the query width when unspecified.
        data_width = width if data_width is None else data_width

        self.attn = MultiheadCrossAttention(
            n_data=n_data,
            width=width,
            heads=heads,
            data_width=data_width,
            qkv_bias=qkv_bias,
            **factory,
        )
        self.ln_1 = nn.LayerNorm(width, **factory)
        self.ln_2 = nn.LayerNorm(data_width, **factory)
        self.mlp = MLP(width=width, **factory)
        self.ln_3 = nn.LayerNorm(width, **factory)

    def forward(self, x: torch.Tensor, data: torch.Tensor):
        """Return ``x`` updated by cross-attending to ``data`` and by the MLP."""
        attended = self.attn(self.ln_1(x), self.ln_2(data))
        x = x + attended
        expanded = self.mlp(self.ln_3(x))
        return x + expanded
class MLP(nn.Module):
    """Two-layer feed-forward block: ``width -> 4 * width -> width`` with GELU."""

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 width: int):
        super().__init__()
        hidden = width * 4
        factory = dict(device=device, dtype=dtype)
        self.width = width
        self.c_fc = nn.Linear(width, hidden, **factory)
        self.c_proj = nn.Linear(hidden, width, **factory)
        self.gelu = nn.GELU()

    def forward(self, x):
        """Apply expand -> GELU -> project."""
        expanded = self.c_fc(x)
        activated = self.gelu(expanded)
        return self.c_proj(activated)
class Transformer(nn.Module):
    """Sequential stack of ``layers`` identical ``ResidualAttentionBlock``s.

    After the blocks are built, ``init_weights`` is applied to every submodule.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_ctx: int,
        width: int,
        layers: int,
        heads: int,
        qkv_bias: bool = True,
        use_checkpoint: bool = False
    ):
        super().__init__()
        self.n_ctx = n_ctx
        self.width = width
        self.layers = layers

        # Every block receives exactly the same configuration.
        block_kwargs = dict(
            device=device,
            dtype=dtype,
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            qkv_bias=qkv_bias,
            use_checkpoint=use_checkpoint,
        )
        self.resblocks = nn.ModuleList(
            [ResidualAttentionBlock(**block_kwargs) for _ in range(layers)]
        )
        # Re-initialise all submodules once the whole stack exists.
        self.apply(init_weights)

    def forward(self, x: torch.Tensor):
        """Feed ``x`` through every residual block in order."""
        hidden = x
        for resblock in self.resblocks:
            hidden = resblock(hidden)
        return hidden

View File

@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

View File

@@ -0,0 +1,373 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple, Dict, Optional
from omegaconf import DictConfig
import torch
import torch.nn.functional as F
from torch.optim import lr_scheduler
import pytorch_lightning as pl
from typing import Union
from functools import partial
from ...utils import instantiate_from_config
from .inference_utils import extract_geometry
from .tsal_base import (
AlignedShapeAsLatentModule,
ShapeAsLatentModule,
Latent2MeshOutput,
AlignedMeshOutput
)
class AlignedShapeAsLatentPLModule(pl.LightningModule):
    """Lightning wrapper around an aligned shape-as-latent model.

    The wrapped ``self.model`` is built from ``aligned_module_cfg`` and receives
    a shape model built from ``shape_module_cfg``. The class provides the
    training/validation steps, optimizer setup, checkpoint restoring and
    helpers that turn latents into meshes.
    """

    def __init__(self, *,
                 shape_module_cfg,
                 aligned_module_cfg,
                 loss_cfg,
                 optimizer_cfg: Optional[DictConfig] = None,
                 ckpt_path: Optional[str] = None,
                 ignore_keys: Union[Tuple[str], List[str]] = ()):
        """
        Args:
            shape_module_cfg: config used to instantiate the shape model.
            aligned_module_cfg: config used to instantiate the aligned model;
                the shape model is passed to it as ``shape_model``.
            loss_cfg: config used to instantiate the loss.
            optimizer_cfg: optional optimizer/scheduler config; when None a
                default AdamW without scheduler is used.
            ckpt_path: optional checkpoint to restore weights from.
            ignore_keys: state-dict key prefixes to drop when restoring.
        """
        super().__init__()

        shape_model: ShapeAsLatentModule = instantiate_from_config(
            shape_module_cfg, device=None, dtype=None
        )
        self.model: AlignedShapeAsLatentModule = instantiate_from_config(
            aligned_module_cfg, shape_model=shape_model
        )

        self.loss = instantiate_from_config(loss_cfg)
        self.optimizer_cfg = optimizer_cfg

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

        self.save_hyperparameters()

    def set_shape_model_only(self):
        """Forward to the wrapped model's ``set_shape_model_only``."""
        self.model.set_shape_model_only()

    @property
    def latent_shape(self):
        """Latent shape reported by the wrapped shape model."""
        return self.model.shape_model.latent_shape

    @property
    def zero_rank(self):
        """True when no trainer is attached or this is local rank 0."""
        if self._trainer:
            zero_rank = self.trainer.local_rank == 0
        else:
            zero_rank = True

        return zero_rank

    def init_from_ckpt(self, path, ignore_keys=()):
        """Load weights (non-strict) from the ``"state_dict"`` entry of a checkpoint.

        Args:
            path: checkpoint file readable by ``torch.load``.
            ignore_keys: iterable of key prefixes; every state-dict entry whose
                name starts with one of them is dropped before loading.
        """
        # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
        state_dict = torch.load(path, map_location="cpu")["state_dict"]

        # Each key is tested once against all prefixes. The previous nested
        # loop deleted inside the inner loop, so a key matching two prefixes
        # raised KeyError on the second `del`.
        ignore_prefixes = tuple(ignore_keys)
        for k in list(state_dict.keys()):
            if k.startswith(ignore_prefixes):
                print("Deleting key {} from state_dict.".format(k))
                del state_dict[k]

        missing, unexpected = self.load_state_dict(state_dict, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
            print(f"Unexpected Keys: {unexpected}")

    def configure_optimizers(self) -> Tuple[List, List]:
        """Build the optimizer list and the (possibly empty) scheduler list.

        Only the parameters of ``self.model`` are optimised.
        """
        # `learning_rate` is not assigned anywhere in this class; it has to be
        # set on the instance before this hook runs.
        lr = self.learning_rate

        trainable_parameters = list(self.model.parameters())

        if self.optimizer_cfg is None:
            optimizers = [torch.optim.AdamW(trainable_parameters, lr=lr, betas=(0.9, 0.99), weight_decay=1e-3)]
            schedulers = []
        else:
            optimizer = instantiate_from_config(self.optimizer_cfg.optimizer, params=trainable_parameters)
            scheduler_func = instantiate_from_config(
                self.optimizer_cfg.scheduler,
                max_decay_steps=self.trainer.max_steps,
                lr_max=lr
            )
            # The scheduler is stepped once per optimisation step.
            scheduler = {
                "scheduler": lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler_func.schedule),
                "interval": "step",
                "frequency": 1
            }
            optimizers = [optimizer]
            schedulers = [scheduler]

        return optimizers, schedulers

    def forward(self,
                surface: torch.FloatTensor,
                image: torch.FloatTensor,
                text: torch.FloatTensor,
                volume_queries: torch.FloatTensor):
        """Embed the inputs, reconstruct the shape and query occupancy logits.

        Args:
            surface (torch.FloatTensor): surface point samples.
            image (torch.FloatTensor): image input of the aligned model.
            text (torch.FloatTensor): text input of the aligned model.
            volume_queries (torch.FloatTensor): points at which the decoded
                shape is queried.

        Returns:
            embed_outputs: the embedding outputs of the aligned model.
            logits: output of ``query_geometry`` at ``volume_queries``.
            posterior: the posterior returned by ``encode_kl_embed``.
        """
        embed_outputs, shape_z = self.model(surface, image, text)

        shape_zq, posterior = self.model.shape_model.encode_kl_embed(shape_z)
        latents = self.model.shape_model.decode(shape_zq)
        logits = self.model.shape_model.query_geometry(volume_queries, latents)

        return embed_outputs, logits, posterior

    def encode(self, surface: torch.FloatTensor, sample_posterior=True):
        """Encode a surface to the shape model's ``shape_zq`` latent.

        Channels 0:3 of ``surface`` are used as point positions and channels
        3:6 as per-point features.
        """
        pc = surface[..., 0:3]
        feats = surface[..., 3:6]

        shape_embed, shape_zq, posterior = self.model.shape_model.encode(
            pc=pc, feats=feats, sample_posterior=sample_posterior
        )

        return shape_zq

    def encode_latents(self, surface: torch.FloatTensor):
        """Return the shape embedding and shape latents concatenated on dim 1."""
        pc = surface[..., 0:3]
        feats = surface[..., 3:6]

        shape_embed, shape_latents = self.model.shape_model.encode_latents(
            pc=pc, feats=feats
        )
        shape_embed = shape_embed.unsqueeze(1)
        # NOTE(review): the latent count is hard-coded to 256 here.
        assert shape_embed.shape[1] == 1 and shape_latents.shape[1] == 256
        cat_latents = torch.cat([shape_embed, shape_latents], dim=1)

        return cat_latents

    def to_shape_latents(self, latents):
        """Pass latents through ``encode_kl_embed`` (no sampling) and ``decode``."""
        shape_zq, posterior = self.model.shape_model.encode_kl_embed(latents, sample_posterior=False)
        return self.model.shape_model.decode(shape_zq)

    def decode(self,
               z_q,
               bounds: Union[Tuple[float], List[float], float] = 1.1,
               octree_depth: int = 7,
               num_chunks: int = 10000) -> List[Latent2MeshOutput]:
        """Decode ``z_q`` with the shape model and extract one mesh per sample."""
        latents = self.model.shape_model.decode(z_q)  # latents: [bs, num_latents, dim]
        outputs = self.latent2mesh(latents, bounds=bounds, octree_depth=octree_depth, num_chunks=num_chunks)

        return outputs

    def _shared_step(self, batch: Dict[str, torch.FloatTensor], split: str) -> torch.FloatTensor:
        """Compute and log the loss for one batch; shared by train and val."""
        surface = batch["surface"]
        image = batch["image"]
        text = batch["text"]

        # The last channel of `geo_points` is the label, the first three the
        # query coordinates.
        volume_queries = batch["geo_points"][..., 0:3]
        shape_labels = batch["geo_points"][..., -1]

        embed_outputs, shape_logits, posteriors = self(surface, image, text, volume_queries)

        aeloss, log_dict_ae = self.loss(
            **embed_outputs,
            posteriors=posteriors,
            shape_logits=shape_logits,
            shape_labels=shape_labels,
            split=split
        )

        self.log_dict(log_dict_ae, prog_bar=True, logger=True, batch_size=shape_logits.shape[0],
                      sync_dist=False, rank_zero_only=True)

        return aeloss

    def training_step(self, batch: Dict[str, torch.FloatTensor],
                      batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
        """
        Args:
            batch (dict): the batch sample, and it contains:
                - surface (torch.FloatTensor): [bs, n_surface, (3 + input_dim)]
                - image (torch.FloatTensor): [bs, 3, 224, 224]
                - text (torch.FloatTensor): [bs, num_templates, 77]
                - geo_points (torch.FloatTensor): [bs, n_pts, (3 + 1)]
            batch_idx (int): unused.
            optimizer_idx (int): unused.

        Returns:
            loss (torch.FloatTensor): the loss logged under the ``train`` split.
        """
        return self._shared_step(batch, split="train")

    def validation_step(self, batch: Dict[str, torch.FloatTensor], batch_idx: int) -> torch.FloatTensor:
        """Same as ``training_step`` but logged under the ``val`` split."""
        return self._shared_step(batch, split="val")

    def visual_alignment(self,
                         surface: torch.FloatTensor,
                         image: torch.FloatTensor,
                         text: torch.FloatTensor,
                         description: Optional[List[str]] = None,
                         bounds: Union[Tuple[float], List[float]] = (-1.25, -1.25, -1.25, 1.25, 1.25, 1.25),
                         octree_depth: int = 7,
                         num_chunks: int = 10000) -> List[AlignedMeshOutput]:
        """Reconstruct meshes and report shape/text and shape/image similarity.

        Args:
            surface: surface point samples.
            image: image input of the aligned model.
            text: text input of the aligned model.
            description: optional per-sample strings stored on the outputs.
            bounds: bounding box forwarded to ``extract_geometry``.
            octree_depth: grid resolution parameter forwarded to ``extract_geometry``.
            num_chunks: number of query points evaluated per chunk.

        Returns:
            mesh_outputs (List[AlignedMeshOutput]): the mesh outputs list; an
            entry is None when no surface could be extracted for that sample.
        """
        outputs = []

        device = surface.device
        bs = surface.shape[0]

        embed_outputs, shape_z = self.model(surface, image, text)

        # calculate the similarity
        image_embed = embed_outputs["image_embed"]
        text_embed = embed_outputs["text_embed"]
        shape_embed = embed_outputs["shape_embed"]

        # normalized features
        shape_embed = F.normalize(shape_embed, dim=-1, p=2)
        text_embed = F.normalize(text_embed, dim=-1, p=2)
        image_embed = F.normalize(image_embed, dim=-1, p=2)

        # B x B
        shape_text_similarity = (100.0 * shape_embed @ text_embed.T).softmax(dim=-1)

        # B x B
        shape_image_similarity = (100.0 * shape_embed @ image_embed.T).softmax(dim=-1)

        # shape reconstruction
        shape_zq, posterior = self.model.shape_model.encode_kl_embed(shape_z)
        latents = self.model.shape_model.decode(shape_zq)
        geometric_func = partial(self.model.shape_model.query_geometry, latents=latents)

        # 2. decode geometry
        mesh_v_f, has_surface = extract_geometry(
            geometric_func=geometric_func,
            device=device,
            batch_size=bs,
            bounds=bounds,
            octree_depth=octree_depth,
            num_chunks=num_chunks,
            disable=not self.zero_rank
        )

        # 3. collect the per-sample outputs
        for i, ((mesh_v, mesh_f), is_surface) in enumerate(zip(mesh_v_f, has_surface)):
            if not is_surface:
                outputs.append(None)
                continue

            out = AlignedMeshOutput()
            out.mesh_v = mesh_v
            out.mesh_f = mesh_f
            out.surface = surface[i].cpu().numpy()
            out.image = image[i].cpu().numpy()
            if description is not None:
                out.text = description[i]
            # Diagonal entries: similarity of each shape with its own text/image.
            out.shape_text_similarity = shape_text_similarity[i, i]
            out.shape_image_similarity = shape_image_similarity[i, i]

            outputs.append(out)

        return outputs

    def latent2mesh(self,
                    latents: torch.FloatTensor,
                    bounds: Union[Tuple[float], List[float], float] = 1.1,
                    octree_depth: int = 7,
                    num_chunks: int = 10000) -> List[Latent2MeshOutput]:
        """Extract one mesh per latent set.

        Args:
            latents: [bs, num_latents, dim]
            bounds: bounding box (or half-extent) forwarded to ``extract_geometry``.
            octree_depth: grid resolution parameter forwarded to ``extract_geometry``.
            num_chunks: number of query points evaluated per chunk.

        Returns:
            mesh_outputs (List[MeshOutput]): the mesh outputs list; an entry is
            None when no surface could be extracted for that sample.
        """
        outputs = []

        geometric_func = partial(self.model.shape_model.query_geometry, latents=latents)

        # 2. decode geometry
        device = latents.device
        mesh_v_f, has_surface = extract_geometry(
            geometric_func=geometric_func,
            device=device,
            batch_size=len(latents),
            bounds=bounds,
            octree_depth=octree_depth,
            num_chunks=num_chunks,
            disable=not self.zero_rank
        )

        # 3. collect the per-sample outputs
        for i, ((mesh_v, mesh_f), is_surface) in enumerate(zip(mesh_v_f, has_surface)):
            if not is_surface:
                outputs.append(None)
                continue

            out = Latent2MeshOutput()
            out.mesh_v = mesh_v
            out.mesh_f = mesh_f

            outputs.append(out)

        return outputs

View File

@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
import torch
from torch import nn
from einops import rearrange
from transformers import CLIPModel
from .tsal_base import AlignedShapeAsLatentModule
class CLIPAlignedShapeAsLatentModule(AlignedShapeAsLatentModule):
    """Shape model plus a linear projection into a CLIP-sized embedding space.

    The constructor does not create a CLIP backbone: ``clip_model`` is only
    ever set to None by :meth:`set_shape_model_only`. The image/text paths
    therefore need a ``clip_model`` to be attached from outside; without one
    they raise an explicit ``AttributeError``.
    """

    def __init__(self, *,
                 shape_model,
                 projection_dim=768):
        """
        Args:
            shape_model: the shape encoder; must expose ``width`` and
                ``encode_latents``.
            projection_dim (int): size of the projected shape embedding.
        """
        super().__init__()

        self.shape_model = shape_model
        self.shape_projection = nn.Parameter(torch.empty(self.shape_model.width, projection_dim))
        nn.init.normal_(self.shape_projection, std=projection_dim ** -0.5)

    def set_shape_model_only(self):
        """Drop the CLIP backbone so only the shape branch remains usable."""
        self.clip_model = None

    def _require_clip_model(self):
        """Return the attached CLIP model or fail with a clear message."""
        clip_model = getattr(self, "clip_model", None)
        if clip_model is None:
            # AttributeError is the exception type the bare attribute access
            # produced before, so existing handlers keep working.
            raise AttributeError(
                "clip_model is not available on this module; image/text "
                "embeddings cannot be computed in shape-only mode."
            )
        return clip_model

    def encode_shape_embed(self, surface, return_latents: bool = False):
        """
        Args:
            surface (torch.FloatTensor): [bs, n, 3 + c]
            return_latents (bool): also return the shape latents.

        Returns:
            x (torch.FloatTensor): [bs, projection_dim]
            shape_latents (torch.FloatTensor): [bs, m, d]; only when
                ``return_latents`` is True.
        """
        # First three channels are positions, everything after is features.
        pc = surface[..., 0:3]
        feats = surface[..., 3:]

        shape_embed, shape_latents = self.shape_model.encode_latents(pc, feats)
        x = shape_embed @ self.shape_projection

        if return_latents:
            return x, shape_latents
        else:
            return x

    def encode_image_embed(self, image):
        """
        Args:
            image (torch.FloatTensor): [bs, 3, h, w]

        Returns:
            x (torch.FloatTensor): [bs, projection_dim]

        Raises:
            AttributeError: if no CLIP model is attached.
        """
        x = self._require_clip_model().get_image_features(image)

        return x

    def encode_text_embed(self, text):
        """Return the CLIP text features for ``text``.

        Raises:
            AttributeError: if no CLIP model is attached.
        """
        x = self._require_clip_model().get_text_features(text)
        return x

    def forward(self, surface, image, text):
        """
        Args:
            surface (torch.FloatTensor):
            image (torch.FloatTensor): [bs, 3, 224, 224]
            text (torch.LongTensor): [bs, num_templates, 77]

        Returns:
            embed_outputs (dict): the embedding outputs, and it contains:
                - image_embed (torch.FloatTensor):
                - text_embed (torch.FloatTensor):
                - shape_embed (torch.FloatTensor):
                - logit_scale (float):
            shape_latents: the latents returned by the shape model.

        Raises:
            AttributeError: if no CLIP model is attached.
        """
        # text embedding: encode all templates of all samples in one batch,
        # average over the templates of each sample, then L2-normalise.
        b = text.shape[0]
        text_tokens = rearrange(text, "b t l -> (b t) l")
        text_embed = self.encode_text_embed(text_tokens)
        text_embed = rearrange(text_embed, "(b t) d -> b t d", b=b)
        text_embed = text_embed.mean(dim=1)
        text_embed = text_embed / text_embed.norm(dim=-1, keepdim=True)

        # image embedding
        image_embed = self.encode_image_embed(image)

        # shape embedding
        shape_embed, shape_latents = self.encode_shape_embed(surface, return_latents=True)

        embed_outputs = {
            "image_embed": image_embed,
            "text_embed": text_embed,
            "shape_embed": shape_embed,
            "logit_scale": self._require_clip_model().logit_scale.exp()
        }

        return embed_outputs, shape_latents

View File

@@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-
import torch
from tqdm import tqdm
from einops import repeat
import numpy as np
from typing import Callable, Tuple, List, Union, Optional
from skimage import measure
from ...graphics.primitives import generate_dense_grid_points
@torch.no_grad()
def extract_geometry(geometric_func: Callable,
                     device: torch.device,
                     batch_size: int = 1,
                     bounds: Union[Tuple[float], List[float], float] = (-1.25, -1.25, -1.25, 1.25, 1.25, 1.25),
                     octree_depth: int = 7,
                     num_chunks: int = 10000,
                     disable: bool = True):
    """Evaluate an implicit function on a dense grid and run marching cubes.

    Args:
        geometric_func: callable taking a ``[batch_size, p, 3]`` query tensor
            and returning logits that can be concatenated along dim 1.
        device: device the query points are moved to before evaluation.
        batch_size: number of shapes evaluated simultaneously.
        bounds: either ``(xmin, ymin, zmin, xmax, ymax, zmax)`` or a single
            scalar ``r`` meaning the cube ``[-r, r]^3``.
        octree_depth: forwarded to ``generate_dense_grid_points``.
        num_chunks: number of grid points evaluated per call of
            ``geometric_func``.
        disable: disables the progress bar when True.

    Returns:
        mesh_v_f: list of ``(vertices, faces)`` per sample, ``(None, None)``
            when marching cubes failed for that sample.
        has_surface: boolean array of shape ``(batch_size,)``.
    """
    # Accept any real scalar (int, float, numpy scalar), not only `float`;
    # previously e.g. `bounds=1` fell through and failed on the slicing below.
    if isinstance(bounds, (int, float, np.number)):
        bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]

    bbox_min = np.array(bounds[0:3])
    bbox_max = np.array(bounds[3:6])
    bbox_size = bbox_max - bbox_min

    # The third return value is not needed here.
    xyz_samples, grid_size, _ = generate_dense_grid_points(
        bbox_min=bbox_min,
        bbox_max=bbox_max,
        octree_depth=octree_depth,
        indexing="ij"
    )
    xyz_samples = torch.FloatTensor(xyz_samples)

    # Evaluate the grid chunk by chunk; results are collected on the CPU.
    batch_logits = []
    for start in tqdm(range(0, xyz_samples.shape[0], num_chunks),
                      desc="Implicit Function:", disable=disable, leave=False):
        queries = xyz_samples[start: start + num_chunks, :].to(device)
        batch_queries = repeat(queries, "p c -> b p c", b=batch_size)

        logits = geometric_func(batch_queries)
        batch_logits.append(logits.cpu())

    grid_logits = torch.cat(batch_logits, dim=1).view((batch_size, grid_size[0], grid_size[1], grid_size[2])).numpy()

    mesh_v_f = []
    has_surface = np.zeros((batch_size,), dtype=np.bool_)
    for i in range(batch_size):
        try:
            # Surface = zero level set of the logits.
            vertices, faces, normals, _ = measure.marching_cubes(grid_logits[i], 0, method="lewiner")
            # Map grid coordinates back into the bounding box.
            vertices = vertices / grid_size * bbox_size + bbox_min
            mesh_v_f.append((vertices.astype(np.float32), np.ascontiguousarray(faces)))
            has_surface[i] = True
        except (ValueError, RuntimeError):
            # Marching cubes could not produce a surface for this sample.
            mesh_v_f.append((None, None))
            has_surface[i] = False

    return mesh_v_f, has_surface

View File

@@ -0,0 +1,303 @@
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Dict
from ..modules.distributions import DiagonalGaussianDistribution
from ...utils.eval import compute_psnr
from ...utils import misc
class KLNearFar(nn.Module):
    """Occupancy loss on volume ("far") and near-surface points plus a KL term."""

    def __init__(self,
                 near_weight: float = 0.1,
                 kl_weight: float = 1.0,
                 num_near_samples: Optional[int] = None):
        super().__init__()

        self.near_weight, self.kl_weight = near_weight, kl_weight
        self.num_near_samples = num_near_samples
        self.geo_criterion = nn.BCEWithLogitsLoss()

    def forward(self,
                posteriors: Optional[DiagonalGaussianDistribution],
                logits: torch.FloatTensor,
                labels: torch.FloatTensor,
                split: Optional[str] = "train", **kwargs) -> Tuple[torch.FloatTensor, Dict[str, float]]:
        """Compute the weighted loss and a dictionary of detached metrics.

        Args:
            posteriors (DiagonalGaussianDistribution or torch.distributions.Normal):
                latent posterior; None disables the KL term.
            logits (torch.FloatTensor): [B, 2*N], logits[:, 0:N] is the volume points; logits[:, N:2N] is the near points;
            labels (torch.FloatTensor): [B, 2*N], labels[:, 0:N] is the volume points; labels[:, N:2N] is the near points;
            split (str): prefix of the log keys.
            **kwargs: ignored.

        Returns:
            loss (torch.Tensor): (,)
            log (dict): detached metrics keyed by ``"<split>/<name>"``.
        """
        total_points = logits.shape[1]
        # Without an explicit near count the points are split in half.
        num_vol = (
            total_points // 2
            if self.num_near_samples is None
            else total_points - self.num_near_samples
        )

        far_logits, close_logits = logits[:, :num_vol], logits[:, num_vol:]
        far_labels, close_labels = labels[:, :num_vol], labels[:, num_vol:]

        # occupancy loss, evaluated in float32
        vol_bce = self.geo_criterion(far_logits.float(), far_labels.float())
        near_bce = self.geo_criterion(close_logits.float(), close_labels.float())

        if posteriors is not None:
            kl_loss = torch.mean(posteriors.kl(dims=(1, 2)))
        else:
            kl_loss = torch.tensor(0.0, dtype=far_logits.dtype, device=far_logits.device)

        loss = vol_bce + near_bce * self.near_weight + kl_loss * self.kl_weight

        with torch.no_grad():
            # A logit >= 0 counts as a positive prediction.
            hits = ((logits >= 0) == labels).float()
            accuracy = hits.mean()
            pos_ratio = torch.mean(labels)

        log = {
            "{}/total_loss".format(split): loss.clone().detach(),
            "{}/near".format(split): near_bce.detach(),
            "{}/far".format(split): vol_bce.detach(),
            "{}/kl".format(split): kl_loss.detach(),
            "{}/accuracy".format(split): accuracy,
            "{}/pos_ratio".format(split): pos_ratio
        }

        if posteriors is not None:
            posterior_std = posteriors.std
            log[f"{split}/mean"] = posteriors.mean.mean().detach()
            log[f"{split}/std_mean"] = posterior_std.mean().detach()
            log[f"{split}/std_max"] = posterior_std.max().detach()

        return loss, log
class KLNearFarColor(nn.Module):
    """Occupancy (far + near) loss, surface-color loss and a KL term."""

    def __init__(self,
                 near_weight: float = 0.1,
                 kl_weight: float = 1.0,
                 color_weight: float = 1.0,
                 color_criterion: str = "mse",
                 num_near_samples: Optional[int] = None):
        super().__init__()

        self.color_weight = color_weight
        self.near_weight = near_weight
        self.kl_weight = kl_weight
        self.num_near_samples = num_near_samples

        # Only "mse" and "l1" are accepted for the color term.
        if color_criterion not in ("mse", "l1"):
            raise ValueError(f"{color_criterion} must be [`mse`, `l1`].")
        self.color_criterion = nn.MSELoss() if color_criterion == "mse" else nn.L1Loss()

        self.geo_criterion = nn.BCEWithLogitsLoss()

    def forward(self,
                posteriors: Optional[DiagonalGaussianDistribution],
                logits: torch.FloatTensor,
                labels: torch.FloatTensor,
                pred_colors: torch.FloatTensor,
                gt_colors: torch.FloatTensor,
                split: Optional[str] = "train", **kwargs) -> Tuple[torch.FloatTensor, Dict[str, float]]:
        """Compute the weighted loss and a dictionary of detached metrics.

        Args:
            posteriors (DiagonalGaussianDistribution or torch.distributions.Normal):
                latent posterior; None disables the KL term.
            logits (torch.FloatTensor): [B, 2*N], logits[:, 0:N] is the volume points; logits[:, N:2N] is the near points;
            labels (torch.FloatTensor): [B, 2*N], labels[:, 0:N] is the volume points; labels[:, N:2N] is the near points;
            pred_colors (torch.FloatTensor): [B, M, 3]
            gt_colors (torch.FloatTensor): [B, M, 3]
            split (str): prefix of the log keys.
            **kwargs: ignored.

        Returns:
            loss (torch.Tensor): (,)
            log (dict): detached metrics keyed by ``"<split>/<name>"``.
        """
        total_points = logits.shape[1]
        # Without an explicit near count the points are split in half.
        num_vol = (
            total_points // 2
            if self.num_near_samples is None
            else total_points - self.num_near_samples
        )

        far_logits, close_logits = logits[:, :num_vol], logits[:, num_vol:]
        far_labels, close_labels = labels[:, :num_vol], labels[:, num_vol:]

        # occupancy loss, evaluated in float32
        vol_bce = self.geo_criterion(far_logits.float(), far_labels.float())
        near_bce = self.geo_criterion(close_logits.float(), close_labels.float())

        # surface color loss
        color = self.color_criterion(pred_colors, gt_colors)

        if posteriors is not None:
            kl_loss = torch.mean(posteriors.kl(dims=(1, 2)))
        else:
            kl_loss = torch.tensor(0.0, dtype=pred_colors.dtype, device=pred_colors.device)

        loss = vol_bce + near_bce * self.near_weight + color * self.color_weight + kl_loss * self.kl_weight

        with torch.no_grad():
            # A logit >= 0 counts as a positive prediction.
            hits = ((logits >= 0) == labels).float()
            accuracy = hits.mean()
            psnr = compute_psnr(pred_colors, gt_colors)

        log = {
            "{}/total_loss".format(split): loss.clone().detach(),
            "{}/near".format(split): near_bce.detach(),
            "{}/far".format(split): vol_bce.detach(),
            "{}/color".format(split): color.detach(),
            "{}/kl".format(split): kl_loss.detach(),
            "{}/psnr".format(split): psnr.detach(),
            "{}/accuracy".format(split): accuracy
        }

        return loss, log
class ContrastKLNearFar(nn.Module):
    """Contrastive shape/text/image alignment loss plus occupancy and KL terms."""

    def __init__(self,
                 contrast_weight: float = 1.0,
                 near_weight: float = 0.1,
                 kl_weight: float = 1.0,
                 num_near_samples: Optional[int] = None):
        super().__init__()

        # Contrastive targets are cached and rebuilt when the batch size changes.
        self.labels = None
        self.last_local_batch_size = None

        self.contrast_weight = contrast_weight
        self.near_weight = near_weight
        self.kl_weight = kl_weight
        self.num_near_samples = num_near_samples
        self.geo_criterion = nn.BCEWithLogitsLoss()

    def forward(self,
                shape_embed: torch.FloatTensor,
                text_embed: torch.FloatTensor,
                image_embed: torch.FloatTensor,
                logit_scale: torch.FloatTensor,
                posteriors: Optional[DiagonalGaussianDistribution],
                shape_logits: torch.FloatTensor,
                shape_labels: torch.FloatTensor,
                split: Optional[str] = "train", **kwargs):
        """Compute the weighted loss and a dictionary of detached metrics.

        Args:
            shape_embed: per-sample shape embeddings.
            text_embed: per-sample text embeddings.
            image_embed: per-sample image embeddings.
            logit_scale: multiplier applied to the cosine similarities.
            posteriors: latent posterior; None disables the KL term.
            shape_logits: occupancy logits, volume points first, near points last.
            shape_labels: occupancy labels in the same layout as ``shape_logits``.
            split: prefix of the log keys.
            **kwargs: ignored.

        Returns:
            loss: scalar loss tensor.
            log: detached metrics keyed by ``"<split>/<name>"``.
        """
        local_batch_size = shape_embed.size(0)

        if local_batch_size != self.last_local_batch_size:
            # Target index of sample i is its position in the gathered batch:
            # offset by this rank's share plus the local index.
            local_indices = torch.arange(local_batch_size, device=shape_embed.device)
            self.labels = local_batch_size * misc.get_rank() + local_indices.long()
            self.last_local_batch_size = local_batch_size

        targets = self.labels

        # normalized features
        shape_embed = F.normalize(shape_embed, dim=-1, p=2)
        text_embed = F.normalize(text_embed, dim=-1, p=2)
        image_embed = F.normalize(image_embed, dim=-1, p=2)

        # gather features from all GPUs
        shape_embed_all, text_embed_all, image_embed_all = misc.all_gather_batch(
            [shape_embed, text_embed, image_embed]
        )

        # cosine similarity as logits
        logits_per_shape_text = logit_scale * shape_embed @ text_embed_all.t()
        logits_per_text_shape = logit_scale * text_embed @ shape_embed_all.t()
        logits_per_shape_image = logit_scale * shape_embed @ image_embed_all.t()
        logits_per_image_shape = logit_scale * image_embed @ shape_embed_all.t()

        # Symmetric cross-entropy for each of the two modality pairs.
        text_pair_loss = (F.cross_entropy(logits_per_shape_text, targets) +
                          F.cross_entropy(logits_per_text_shape, targets)) / 2
        image_pair_loss = (F.cross_entropy(logits_per_shape_image, targets) +
                           F.cross_entropy(logits_per_image_shape, targets)) / 2
        contrast_loss = text_pair_loss + image_pair_loss

        # shape reconstruction
        total_points = shape_logits.shape[1]
        # Without an explicit near count the points are split in half.
        num_vol = (
            total_points // 2
            if self.num_near_samples is None
            else total_points - self.num_near_samples
        )

        far_logits, close_logits = shape_logits[:, :num_vol], shape_logits[:, num_vol:]
        far_labels, close_labels = shape_labels[:, :num_vol], shape_labels[:, num_vol:]

        # occupancy loss, evaluated in float32
        vol_bce = self.geo_criterion(far_logits.float(), far_labels.float())
        near_bce = self.geo_criterion(close_logits.float(), close_labels.float())

        if posteriors is not None:
            kl_loss = torch.mean(posteriors.kl(dims=(1, 2)))
        else:
            kl_loss = torch.tensor(0.0, dtype=far_logits.dtype, device=far_logits.device)

        loss = vol_bce + near_bce * self.near_weight + kl_loss * self.kl_weight + contrast_loss * self.contrast_weight

        # compute accuracy
        with torch.no_grad():
            text_hits = torch.argmax(logits_per_shape_text, dim=-1).eq(targets).sum()
            shape_text_acc = 100 * text_hits / local_batch_size

            image_hits = torch.argmax(logits_per_shape_image, dim=-1).eq(targets).sum()
            shape_image_acc = 100 * image_hits / local_batch_size

            # A logit >= 0 counts as a positive occupancy prediction.
            accuracy = ((shape_logits >= 0) == shape_labels).float().mean()

            log = {
                "{}/contrast".format(split): contrast_loss.clone().detach(),
                "{}/near".format(split): near_bce.detach(),
                "{}/far".format(split): vol_bce.detach(),
                "{}/kl".format(split): kl_loss.detach(),
                "{}/shape_text_acc".format(split): shape_text_acc,
                "{}/shape_image_acc".format(split): shape_image_acc,
                "{}/total_loss".format(split): loss.clone().detach(),
                "{}/accuracy".format(split): accuracy,
            }

            if posteriors is not None:
                posterior_std = posteriors.std
                log[f"{split}/mean"] = posteriors.mean.mean().detach()
                log[f"{split}/std_mean"] = posterior_std.mean().detach()
                log[f"{split}/std_max"] = posterior_std.max().detach()

        return loss, log

View File

@@ -0,0 +1,423 @@
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
from typing import Optional
from einops import repeat
import math
from ..modules import checkpoint
from ..modules.embedder import FourierEmbedder
from ..modules.distributions import DiagonalGaussianDistribution
from ..modules.transformer_blocks import (
ResidualCrossAttentionBlock,
Transformer
)
from .tsal_base import ShapeAsLatentModule
class CrossAttentionEncoder(nn.Module):
    """Point-cloud encoder built from a learned query set.

    ``num_latents`` learned query tokens cross-attend to the embedded input
    points (Fourier embedding of the coordinates, optionally concatenated with
    per-point features) and are then passed through a self-attention
    ``Transformer``.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 fourier_embedder: FourierEmbedder,
                 point_feats: int,
                 width: int,
                 heads: int,
                 layers: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 flash: bool = False,
                 use_ln_post: bool = False,
                 use_checkpoint: bool = False):
        """
        Args:
            device: device forwarded to every sub-module.
            dtype: dtype forwarded to every sub-module.
            num_latents: number of learned query tokens.
            fourier_embedder: embedder applied to the point coordinates; its
                ``out_dim`` sizes the input projection.
            point_feats: number of per-point feature channels appended to the
                embedded coordinates.
            width: channel width of the queries and attention blocks.
            heads: number of attention heads.
            layers: number of layers of the self-attention transformer.
            init_scale: forwarded to the attention blocks.
            qkv_bias: forwarded to the attention blocks.
            flash: forwarded to the attention blocks.
            use_ln_post: when True, a final ``LayerNorm`` is applied to the latents.
            use_checkpoint: flag handed to the ``checkpoint`` helper in ``forward``.
        """
        super().__init__()

        self.use_checkpoint = use_checkpoint
        self.num_latents = num_latents

        # Keyword groups shared by the sub-modules constructed below.
        factory = dict(device=device, dtype=dtype)
        attn_opts = dict(width=width, heads=heads, init_scale=init_scale,
                         qkv_bias=qkv_bias, flash=flash)

        # Learned queries: one row per latent token, scaled down by 0.02.
        initial_query = torch.randn((num_latents, width), device=device, dtype=dtype)
        self.query = nn.Parameter(initial_query * 0.02)

        self.fourier_embedder = fourier_embedder
        in_features = self.fourier_embedder.out_dim + point_feats
        self.input_proj = nn.Linear(in_features, width, **factory)

        self.cross_attn = ResidualCrossAttentionBlock(**factory, **attn_opts)

        # The inner transformer is always built with use_checkpoint=False;
        # the `checkpoint` helper is applied once around `_forward` instead.
        self.self_attn = Transformer(
            n_ctx=num_latents,
            layers=layers,
            use_checkpoint=False,
            **factory,
            **attn_opts
        )

        self.ln_post = nn.LayerNorm(width, dtype=dtype, device=device) if use_ln_post else None

    def _forward(self, pc, feats):
        """Run the encoder without the checkpoint wrapper.

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]

        Returns:
            tuple: ``(latents, pc)`` - the encoded latent tokens (presumably
            [B, num_latents, width] - TODO confirm against the attention
            blocks) and the input point cloud, returned unchanged.
        """
        batch_size = pc.shape[0]

        tokens = self.fourier_embedder(pc)
        if feats is not None:
            tokens = torch.cat([tokens, feats], dim=-1)
        tokens = self.input_proj(tokens)

        # Give every sample in the batch its own copy of the learned queries.
        queries = repeat(self.query, "m c -> b m c", b=batch_size)
        latents = self.self_attn(self.cross_attn(queries, tokens))

        if self.ln_post is not None:
            latents = self.ln_post(latents)

        return latents, pc

    def forward(self, pc: torch.FloatTensor, feats: Optional[torch.FloatTensor] = None):
        """Encode a point cloud through the ``checkpoint`` helper.

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]

        Returns:
            tuple: ``(latents, pc)`` exactly as produced by ``_forward``.
        """
        inputs = (pc, feats)
        return checkpoint(self._forward, inputs, self.parameters(), self.use_checkpoint)
class CrossAttentionDecoder(nn.Module):
    """Decode per-query outputs by cross-attending query points to latents.

    Query coordinates are Fourier-embedded, projected to ``width`` channels,
    cross-attended against the latent tokens, layer-normalised and finally
    projected to ``out_channels``.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 out_channels: int,
                 fourier_embedder: FourierEmbedder,
                 width: int,
                 heads: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 flash: bool = False,
                 use_checkpoint: bool = False):
        """
        Args:
            device: device forwarded to every sub-module.
            dtype: dtype forwarded to every sub-module.
            num_latents: passed to the cross-attention block as ``n_data``.
            out_channels: size of the last dimension of the output.
            fourier_embedder: embedder applied to the query coordinates; its
                ``out_dim`` sizes the query projection.
            width: channel width of the attention block.
            heads: number of attention heads.
            init_scale: forwarded to the attention block.
            qkv_bias: forwarded to the attention block.
            flash: forwarded to the attention block.
            use_checkpoint: flag handed to the ``checkpoint`` helper in ``forward``.
        """
        super().__init__()

        self.use_checkpoint = use_checkpoint
        self.fourier_embedder = fourier_embedder

        factory = dict(device=device, dtype=dtype)

        self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width, **factory)
        self.cross_attn_decoder = ResidualCrossAttentionBlock(
            n_data=num_latents,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            **factory
        )
        self.ln_post = nn.LayerNorm(width, **factory)
        self.output_proj = nn.Linear(width, out_channels, **factory)

    def _forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
        """Run the decoder without the checkpoint wrapper.

        Args:
            queries (torch.FloatTensor): query coordinates.
            latents (torch.FloatTensor): latent tokens attended to by the queries.

        Returns:
            torch.FloatTensor: output of ``output_proj``; the last dimension
            has ``out_channels`` entries.
        """
        embedded = self.fourier_embedder(queries)
        attended = self.cross_attn_decoder(self.query_proj(embedded), latents)
        return self.output_proj(self.ln_post(attended))

    def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
        """Decode ``queries`` against ``latents`` through the ``checkpoint`` helper."""
        inputs = (queries, latents)
        return checkpoint(self._forward, inputs, self.parameters(), self.use_checkpoint)
class ShapeAsLatentPerceiver(ShapeAsLatentModule):
    """Perceiver-style shape auto-encoder.

    Pipeline: ``CrossAttentionEncoder`` -> optional KL bottleneck
    (``pre_kl`` / ``post_kl``, only created when ``embed_dim > 0``) ->
    ``Transformer`` -> ``CrossAttentionDecoder`` producing one logit per
    query point.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 point_feats: int = 0,
                 embed_dim: int = 0,
                 num_freqs: int = 8,
                 include_pi: bool = True,
                 width: int,
                 heads: int,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 flash: bool = False,
                 use_ln_post: bool = False,
                 use_checkpoint: bool = False):
        """
        Args:
            device: device forwarded to every sub-module.
            dtype: dtype forwarded to every sub-module.
            num_latents: number of latent tokens.
            point_feats: number of per-point feature channels fed to the encoder.
            embed_dim: channel size of the KL bottleneck; ``0`` disables it.
            num_freqs: forwarded to ``FourierEmbedder``.
            include_pi: forwarded to ``FourierEmbedder``.
            width: channel width of all attention blocks.
            heads: number of attention heads.
            num_encoder_layers: self-attention layers inside the encoder.
            num_decoder_layers: layers of the latent transformer used by ``decode``.
            init_scale: base init scale; multiplied by ``sqrt(1 / width)``
                before being handed to the sub-modules.
            qkv_bias: forwarded to the attention blocks.
            flash: forwarded to the attention blocks.
            use_ln_post: forwarded to the encoder.
            use_checkpoint: forwarded to the encoder, transformer and decoder.
        """
        super().__init__()

        self.use_checkpoint = use_checkpoint

        self.num_latents = num_latents
        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)

        init_scale = init_scale * math.sqrt(1.0 / width)
        self.encoder = CrossAttentionEncoder(
            device=device,
            dtype=dtype,
            fourier_embedder=self.fourier_embedder,
            num_latents=num_latents,
            point_feats=point_feats,
            width=width,
            heads=heads,
            layers=num_encoder_layers,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_ln_post=use_ln_post,
            use_checkpoint=use_checkpoint
        )

        self.embed_dim = embed_dim
        if embed_dim > 0:
            # VAE bottleneck: pre_kl emits 2 * embed_dim channels that
            # parameterise the posterior; post_kl maps samples back to `width`.
            self.pre_kl = nn.Linear(width, embed_dim * 2, device=device, dtype=dtype)
            self.post_kl = nn.Linear(embed_dim, width, device=device, dtype=dtype)
            self.latent_shape = (num_latents, embed_dim)
        else:
            # No bottleneck: latents keep the encoder width and neither
            # pre_kl nor post_kl exists on this module.
            self.latent_shape = (num_latents, width)

        self.transformer = Transformer(
            device=device,
            dtype=dtype,
            n_ctx=num_latents,
            width=width,
            layers=num_decoder_layers,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_checkpoint=use_checkpoint
        )

        # geometry decoder
        self.geo_decoder = CrossAttentionDecoder(
            device=device,
            dtype=dtype,
            fourier_embedder=self.fourier_embedder,
            out_channels=1,
            num_latents=num_latents,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_checkpoint=use_checkpoint
        )

    def encode(self,
               pc: torch.FloatTensor,
               feats: Optional[torch.FloatTensor] = None,
               sample_posterior: bool = True):
        """Encode a point cloud into latents.

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]
            sample_posterior (bool): when the KL bottleneck is enabled, draw a
                sample from the posterior (True) or take its mode (False).

        Returns:
            latents (torch.FloatTensor): encoder output, or the posterior
                sample/mode when ``embed_dim > 0``.
            center_pos (torch.FloatTensor or None): second value returned by
                the encoder (the encoder hands back its input ``pc``).
            posterior (DiagonalGaussianDistribution or None): ``None`` when
                ``embed_dim == 0``.
        """
        latents, center_pos = self.encoder(pc, feats)

        posterior = None
        if self.embed_dim > 0:
            moments = self.pre_kl(latents)
            posterior = DiagonalGaussianDistribution(moments, feat_dim=-1)
            latents = posterior.sample() if sample_posterior else posterior.mode()

        return latents, center_pos, posterior

    def decode(self, latents: torch.FloatTensor):
        """Map (bottleneck) latents to the tokens consumed by ``query_geometry``.

        Args:
            latents (torch.FloatTensor): output of ``encode``.

        Returns:
            torch.FloatTensor: output of the latent transformer.
        """
        # post_kl is only created when the KL bottleneck is enabled; without
        # this guard the default configuration (embed_dim == 0) would fail
        # with AttributeError on the missing sub-module.
        if self.embed_dim > 0:
            latents = self.post_kl(latents)
        return self.transformer(latents)

    def query_geometry(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
        """Evaluate the geometry decoder at ``queries``.

        Args:
            queries (torch.FloatTensor): query coordinates.
            latents (torch.FloatTensor): output of ``decode``.

        Returns:
            torch.FloatTensor: decoder output with its last (size-1) axis squeezed.
        """
        return self.geo_decoder(queries, latents).squeeze(-1)

    def forward(self,
                pc: torch.FloatTensor,
                feats: Optional[torch.FloatTensor],
                volume_queries: torch.FloatTensor,
                sample_posterior: bool = True):
        """Encode, decode and query the geometry in one pass.

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]
            volume_queries (torch.FloatTensor): [B, P, 3]
            sample_posterior (bool): forwarded to ``encode``.

        Returns:
            logits (torch.FloatTensor): [B, P]
            center_pos (torch.FloatTensor): second value returned by ``encode``.
            posterior (DiagonalGaussianDistribution or None).
        """
        latents, center_pos, posterior = self.encode(pc, feats, sample_posterior=sample_posterior)
        latents = self.decode(latents)
        logits = self.query_geometry(volume_queries, latents)

        return logits, center_pos, posterior
class AlignedShapeLatentPerceiver(ShapeAsLatentPerceiver):
    """Perceiver variant that reserves one extra latent token as a shape embedding.

    The parent is constructed with ``1 + num_latents`` tokens; the first
    encoder token is split off as ``shape_embed`` and the remaining tokens go
    through the (optional) KL bottleneck.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 point_feats: int = 0,
                 embed_dim: int = 0,
                 num_freqs: int = 8,
                 include_pi: bool = True,
                 width: int,
                 heads: int,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 flash: bool = False,
                 use_ln_post: bool = False,
                 use_checkpoint: bool = False):
        """Build the parent perceiver with one additional latent token.

        All arguments are forwarded unchanged to ``ShapeAsLatentPerceiver``
        except ``num_latents``, which is passed as ``1 + num_latents``.
        """
        # NOTE(review): the parent therefore derives `num_latents` and
        # `latent_shape` from the enlarged count - confirm callers expect that.
        super().__init__(
            device=device, dtype=dtype,
            num_latents=1 + num_latents,
            point_feats=point_feats, embed_dim=embed_dim,
            num_freqs=num_freqs, include_pi=include_pi,
            width=width, heads=heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            init_scale=init_scale, qkv_bias=qkv_bias, flash=flash,
            use_ln_post=use_ln_post, use_checkpoint=use_checkpoint
        )

        self.width = width

    def encode(self,
               pc: torch.FloatTensor,
               feats: Optional[torch.FloatTensor] = None,
               sample_posterior: bool = True):
        """Encode a point cloud into a shape embedding plus bottleneck latents.

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, c]
            sample_posterior (bool): forwarded to ``encode_kl_embed``.

        Returns:
            shape_embed (torch.FloatTensor): first encoder token.
            kl_embed (torch.FloatTensor): bottleneck latents of the other tokens.
            posterior (DiagonalGaussianDistribution or None):
        """
        shape_embed, latents = self.encode_latents(pc, feats)
        kl_embed, posterior = self.encode_kl_embed(latents, sample_posterior)
        return shape_embed, kl_embed, posterior

    def encode_latents(self,
                       pc: torch.FloatTensor,
                       feats: Optional[torch.FloatTensor] = None):
        """Run the encoder and split its output into ``(shape_embed, latents)``.

        ``shape_embed`` is token 0 along dim 1 and ``latents`` are the
        remaining tokens; the encoder's second return value is discarded.
        """
        tokens, _ = self.encoder(pc, feats)
        return tokens[:, 0], tokens[:, 1:]

    def encode_kl_embed(self, latents: torch.FloatTensor, sample_posterior: bool = True):
        """Apply the KL bottleneck to ``latents`` when it is enabled.

        Args:
            latents (torch.FloatTensor): tokens returned by ``encode_latents``.
            sample_posterior (bool): sample from the posterior (True) or use
                its mode (False).

        Returns:
            tuple: ``(kl_embed, posterior)``; when ``embed_dim`` is not
            positive this is ``(latents, None)``.
        """
        if not self.embed_dim > 0:
            # Bottleneck disabled: pass the latents through untouched.
            return latents, None

        posterior = DiagonalGaussianDistribution(self.pre_kl(latents), feat_dim=-1)
        kl_embed = posterior.sample() if sample_posterior else posterior.mode()
        return kl_embed, posterior

    def forward(self,
                pc: torch.FloatTensor,
                feats: torch.FloatTensor,
                volume_queries: torch.FloatTensor,
                sample_posterior: bool = True):
        """Encode, decode and query the geometry in one pass.

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]
            volume_queries (torch.FloatTensor): [B, P, 3]
            sample_posterior (bool): forwarded to ``encode``.

        Returns:
            shape_embed (torch.FloatTensor): first encoder token
                (presumably [B, width] - TODO confirm).
            logits (torch.FloatTensor): one value per query point.
            posterior (DiagonalGaussianDistribution or None).
        """
        shape_embed, kl_embed, posterior = self.encode(pc, feats, sample_posterior=sample_posterior)
        logits = self.query_geometry(volume_queries, self.decode(kl_embed))
        return shape_embed, logits, posterior

View File

@@ -0,0 +1,290 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple, Dict, Optional
from omegaconf import DictConfig
import torch
from torch.optim import lr_scheduler
import pytorch_lightning as pl
from typing import Union
from functools import partial
from ...utils import instantiate_from_config
from .inference_utils import extract_geometry
from .tsal_base import (
ShapeAsLatentModule,
Latent2MeshOutput,
Point2MeshOutput
)
class ShapeAsLatentPLModule(pl.LightningModule):
    """Lightning wrapper around a shape-as-latent module.

    The wrapped network is built from ``module_cfg`` and stored as
    ``self.sal``; the criterion is built from ``loss_cfg`` and stored as
    ``self.loss``. Besides the training/validation steps, the class offers
    mesh extraction from point clouds (``point2mesh``) and from latents
    (``latent2mesh`` / ``decode``).
    """

    def __init__(self, *,
                 module_cfg,
                 loss_cfg,
                 optimizer_cfg: Optional[DictConfig] = None,
                 ckpt_path: Optional[str] = None,
                 ignore_keys: Union[Tuple[str], List[str]] = ()):
        """
        Args:
            module_cfg: config instantiated into ``self.sal``.
            loss_cfg: config instantiated into ``self.loss``.
            optimizer_cfg: optional config with ``optimizer`` and ``scheduler``
                entries; when ``None`` a fixed AdamW without scheduler is used.
            ckpt_path: optional checkpoint to restore weights from.
            ignore_keys: state-dict key prefixes to drop when restoring.
        """
        super().__init__()

        self.sal: ShapeAsLatentModule = instantiate_from_config(module_cfg, device=None, dtype=None)
        self.loss = instantiate_from_config(loss_cfg)
        self.optimizer_cfg = optimizer_cfg

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

        self.save_hyperparameters()

    @property
    def latent_shape(self):
        """Latent shape reported by the wrapped module."""
        return self.sal.latent_shape

    @property
    def zero_rank(self):
        """True when no trainer is attached or the trainer's local rank is 0."""
        if self._trainer:
            return self.trainer.local_rank == 0
        return True

    def init_from_ckpt(self, path, ignore_keys=()):
        """Restore weights from a checkpoint, dropping keys with ignored prefixes.

        Args:
            path: checkpoint file containing a ``"state_dict"`` entry.
            ignore_keys: iterable of key prefixes; every state-dict key that
                starts with one of them is removed before loading.
        """
        # NOTE(review): torch.load unpickles the file - only load checkpoints
        # from trusted sources.
        state_dict = torch.load(path, map_location="cpu")["state_dict"]

        # Test all prefixes at once so a key matching several prefixes is
        # deleted exactly once (deleting it per prefix raised KeyError).
        ignored_prefixes = tuple(ignore_keys)
        for k in list(state_dict.keys()):
            if k.startswith(ignored_prefixes):
                print("Deleting key {} from state_dict.".format(k))
                del state_dict[k]

        missing, unexpected = self.load_state_dict(state_dict, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        # Report each list on its own so unexpected keys are visible even
        # when nothing is missing.
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    def configure_optimizers(self) -> Tuple[List, List]:
        """Build the optimizer (and optional LR scheduler) for ``self.sal``.

        Returns:
            ``(optimizers, schedulers)`` lists in the format Lightning expects.
        """
        # `learning_rate` is not assigned anywhere in this class; presumably
        # the training script sets it on the module - TODO confirm.
        lr = self.learning_rate

        if self.optimizer_cfg is None:
            optimizer = torch.optim.AdamW(self.sal.parameters(), lr=lr, betas=(0.9, 0.99), weight_decay=1e-3)
            return [optimizer], []

        optimizer = instantiate_from_config(self.optimizer_cfg.optimizer, params=self.sal.parameters())
        scheduler_func = instantiate_from_config(
            self.optimizer_cfg.scheduler,
            max_decay_steps=self.trainer.max_steps,
            lr_max=lr
        )
        scheduler = {
            "scheduler": lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler_func.schedule),
            "interval": "step",
            "frequency": 1
        }
        return [optimizer], [scheduler]

    def forward(self,
                pc: torch.FloatTensor,
                feats: torch.FloatTensor,
                volume_queries: torch.FloatTensor):
        """Run the wrapped module and return ``(posterior, logits)``.

        The module's second return value (``center_pos``) is discarded.
        """
        logits, _, posterior = self.sal(pc, feats, volume_queries)
        return posterior, logits

    def encode(self, surface: torch.FloatTensor, sample_posterior=True):
        """Encode a surface tensor into latents.

        Args:
            surface: tensor whose last axis holds the coordinates in channels
                0..2 and the point features in channels 3..5.
            sample_posterior: forwarded to the wrapped module's ``encode``.

        Returns:
            The latents returned by ``self.sal.encode``.
        """
        pc = surface[..., 0:3]
        feats = surface[..., 3:6]

        latents, _, _ = self.sal.encode(
            pc=pc, feats=feats, sample_posterior=sample_posterior
        )
        return latents

    def decode(self,
               z_q,
               bounds: Union[Tuple[float], List[float], float] = 1.1,
               octree_depth: int = 7,
               num_chunks: int = 10000) -> List[Latent2MeshOutput]:
        """Decode bottleneck latents ``z_q`` and extract one mesh per sample."""
        latents = self.sal.decode(z_q)  # latents: [bs, num_latents, dim]
        return self.latent2mesh(latents, bounds=bounds, octree_depth=octree_depth, num_chunks=num_chunks)

    def _shared_step(self, batch: Dict[str, torch.FloatTensor], split: str) -> torch.FloatTensor:
        """Compute and log the loss for one batch; shared by train and val steps."""
        pc = batch["surface"][..., 0:3]
        feats = batch["surface"][..., 3:]

        volume_queries = batch["geo_points"][..., 0:3]
        volume_labels = batch["geo_points"][..., -1]

        posterior, logits = self(
            pc=pc, feats=feats, volume_queries=volume_queries
        )
        aeloss, log_dict_ae = self.loss(posterior, logits, volume_labels, split=split)

        self.log_dict(log_dict_ae, prog_bar=True, logger=True, batch_size=logits.shape[0],
                      sync_dist=False, rank_zero_only=True)

        return aeloss

    def training_step(self, batch: Dict[str, torch.FloatTensor],
                      batch_idx: int, optimizer_idx: int = 0) -> torch.FloatTensor:
        """
        Args:
            batch (dict): the batch sample, and it contains:
                - surface (torch.FloatTensor): [bs, n_surface, (3 + input_dim)]
                - geo_points (torch.FloatTensor): [bs, n_pts, (3 + 1)]
            batch_idx (int): unused.
            optimizer_idx (int): unused.

        Returns:
            loss (torch.FloatTensor):
        """
        return self._shared_step(batch, split="train")

    def validation_step(self, batch: Dict[str, torch.FloatTensor], batch_idx: int) -> torch.FloatTensor:
        """Same computation as ``training_step``, logged under the ``val`` split."""
        return self._shared_step(batch, split="val")

    def point2mesh(self,
                   pc: torch.FloatTensor,
                   feats: torch.FloatTensor,
                   bounds: Union[Tuple[float], List[float]] = (-1.25, -1.25, -1.25, 1.25, 1.25, 1.25),
                   octree_depth: int = 7,
                   num_chunks: int = 10000) -> List[Point2MeshOutput]:
        """Encode a point cloud and extract one mesh per batch element.

        Args:
            pc: point coordinates.
            feats: per-point features.
            bounds: forwarded to ``extract_geometry``.
            octree_depth: forwarded to ``extract_geometry``.
            num_chunks: forwarded to ``extract_geometry``.

        Returns:
            mesh_outputs (List[Point2MeshOutput]): one entry per sample; an
            entry is ``None`` when no surface was found for that sample.
        """
        outputs = []

        device = pc.device
        bs = pc.shape[0]

        # 1. point encoder + latents transformer
        latents, center_pos, _ = self.sal.encode(pc, feats)
        latents = self.sal.decode(latents)  # latents: [bs, num_latents, dim]

        geometric_func = partial(self.sal.query_geometry, latents=latents)

        # 2. decode geometry
        mesh_v_f, has_surface = extract_geometry(
            geometric_func=geometric_func,
            device=device,
            batch_size=bs,
            bounds=bounds,
            octree_depth=octree_depth,
            num_chunks=num_chunks,
            disable=not self.zero_rank
        )

        # 3. package the per-sample results
        for i, ((mesh_v, mesh_f), is_surface) in enumerate(zip(mesh_v_f, has_surface)):
            if not is_surface:
                outputs.append(None)
                continue

            out = Point2MeshOutput()
            out.mesh_v = mesh_v
            out.mesh_f = mesh_f
            out.pc = torch.cat([pc[i], feats[i]], dim=-1).cpu().numpy()

            if center_pos is not None:
                out.center = center_pos[i].cpu().numpy()

            outputs.append(out)

        return outputs

    def latent2mesh(self,
                    latents: torch.FloatTensor,
                    bounds: Union[Tuple[float], List[float], float] = 1.1,
                    octree_depth: int = 7,
                    num_chunks: int = 10000) -> List[Latent2MeshOutput]:
        """Extract one mesh per batch element from decoded latents.

        Args:
            latents: [bs, num_latents, dim]
            bounds: forwarded to ``extract_geometry``.
            octree_depth: forwarded to ``extract_geometry``.
            num_chunks: forwarded to ``extract_geometry``.

        Returns:
            mesh_outputs (List[Latent2MeshOutput]): one entry per sample; an
            entry is ``None`` when no surface was found for that sample.
        """
        outputs = []

        geometric_func = partial(self.sal.query_geometry, latents=latents)

        # 1. decode geometry
        device = latents.device
        mesh_v_f, has_surface = extract_geometry(
            geometric_func=geometric_func,
            device=device,
            batch_size=len(latents),
            bounds=bounds,
            octree_depth=octree_depth,
            num_chunks=num_chunks,
            disable=not self.zero_rank
        )

        # 2. package the per-sample results
        for (mesh_v, mesh_f), is_surface in zip(mesh_v_f, has_surface):
            if not is_surface:
                outputs.append(None)
                continue

            out = Latent2MeshOutput()
            out.mesh_v = mesh_v
            out.mesh_f = mesh_f

            outputs.append(out)

        return outputs

View File

@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
import torch.nn as nn
from typing import Tuple, List, Optional
import pytorch_lightning as pl
class Point2MeshOutput(object):
    """Plain result holder with ``mesh_v``, ``mesh_f``, ``center`` and ``pc`` fields.

    Every field starts as ``None`` and is meant to be filled in by the caller.
    """

    def __init__(self):
        # Chained assignment binds the targets left to right, so the
        # attributes are created in the order listed.
        self.mesh_v = self.mesh_f = self.center = self.pc = None
class Latent2MeshOutput(object):
    """Plain result holder with ``mesh_v`` and ``mesh_f`` fields, both initially ``None``."""

    def __init__(self):
        # Targets are bound left to right: mesh_v first, then mesh_f.
        self.mesh_v = self.mesh_f = None
class AlignedMeshOutput(object):
    """Plain result holder for a mesh plus its associated surface, image and text.

    All fields start as ``None``; the two ``*_similarity`` fields are
    annotated as optional floats and ``text`` as an optional string.
    """

    def __init__(self):
        # Untyped fields, created left to right.
        self.mesh_v = self.mesh_f = self.surface = self.image = None

        # Typed fields.
        self.text: Optional[str] = None
        self.shape_text_similarity: Optional[float] = None
        self.shape_image_similarity: Optional[float] = None
class ShapeAsLatentPLModule(pl.LightningModule):
    """Interface declaration for shape-as-latent Lightning modules.

    Every method below raises ``NotImplementedError``; concrete modules are
    expected to provide the implementations.
    """

    # Declared for type checkers only; no value is assigned here.
    latent_shape: Tuple[int]

    def encode(self, surface, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def decode(self, z_q, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def latent2mesh(self, latents, *args, **kwargs) -> List[Latent2MeshOutput]:
        """Not implemented at this level."""
        raise NotImplementedError()

    def point2mesh(self, *args, **kwargs) -> List[Point2MeshOutput]:
        """Not implemented at this level."""
        raise NotImplementedError()
class ShapeAsLatentModule(nn.Module):
    """Interface declaration for shape-as-latent networks.

    ``encode``, ``decode`` and ``query_geometry`` all raise
    ``NotImplementedError`` here; subclasses supply the implementations.
    """

    # Declared for type checkers only; no value is assigned here.
    latent_shape: Tuple[int, int]

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted and ignored.
        super().__init__()

    def encode(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def decode(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def query_geometry(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()
class AlignedShapeAsLatentPLModule(pl.LightningModule):
    """Interface declaration for aligned shape-as-latent Lightning modules.

    Every method below raises ``NotImplementedError``; concrete modules are
    expected to provide the implementations.
    """

    # Declared for type checkers only; no value is assigned here.
    latent_shape: Tuple[int]

    def set_shape_model_only(self):
        """Not implemented at this level."""
        raise NotImplementedError()

    def encode(self, surface, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def decode(self, z_q, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def latent2mesh(self, latents, *args, **kwargs) -> List[Latent2MeshOutput]:
        """Not implemented at this level."""
        raise NotImplementedError()

    def point2mesh(self, *args, **kwargs) -> List[Point2MeshOutput]:
        """Not implemented at this level."""
        raise NotImplementedError()
class AlignedShapeAsLatentModule(nn.Module):
    """Interface declaration for aligned shape-as-latent networks.

    Declares a ``shape_model`` attribute and the image / text / shape
    embedding entry points; every method raises ``NotImplementedError`` here.
    """

    # Declared for type checkers only; no values are assigned here.
    shape_model: ShapeAsLatentModule
    latent_shape: Tuple[int, int]

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted and ignored.
        super().__init__()

    def set_shape_model_only(self):
        """Not implemented at this level."""
        raise NotImplementedError()

    def encode_image_embed(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def encode_text_embed(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def encode_shape_embed(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()
class TexturedShapeAsLatentModule(nn.Module):
    """Interface declaration for shape-as-latent networks with a colour query.

    ``encode``, ``decode``, ``query_geometry`` and ``query_color`` all raise
    ``NotImplementedError`` here; subclasses supply the implementations.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted and ignored.
        super().__init__()

    def encode(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def decode(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def query_geometry(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()

    def query_color(self, *args, **kwargs):
        """Not implemented at this level."""
        raise NotImplementedError()