mirror of
https://github.com/PrimitiveAnything/PrimitiveAnything.git
synced 2026-03-01 18:05:58 +08:00
init
This commit is contained in:
3
primitive_anything/michelangelo/models/modules/__init__.py
Executable file
3
primitive_anything/michelangelo/models/modules/__init__.py
Executable file
@@ -0,0 +1,3 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from .checkpoint import checkpoint
|
||||
69
primitive_anything/michelangelo/models/modules/checkpoint.py
Executable file
69
primitive_anything/michelangelo/models/modules/checkpoint.py
Executable file
@@ -0,0 +1,69 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Adapted from: https://github.com/openai/guided-diffusion/blob/22e0df8183507e13a7813f8d38d51b072ca1e67c/guided_diffusion/nn.py#L124
|
||||
"""
|
||||
|
||||
import torch
|
||||
from typing import Callable, Iterable, Sequence, Union
|
||||
|
||||
|
||||
def checkpoint(
    func: Callable[..., Union[torch.Tensor, Sequence[torch.Tensor]]],
    inputs: Sequence[torch.Tensor],
    params: Iterable[torch.Tensor],
    flag: bool,
    use_deepspeed: bool = False
):
    """
    Call `func(*inputs)`, optionally trading compute for memory by not
    caching intermediate activations (they are recomputed in backward).

    :param func: the function to evaluate.
    :param inputs: the positional arguments passed to `func`.
    :param params: parameters `func` depends on without receiving them as
                   arguments; they are appended to the autograd inputs.
    :param flag: when False, `func` is simply called and no checkpointing
                 takes place.
    :param use_deepspeed: when True (and `flag` is True), delegate to
                          `deepspeed.checkpointing.checkpoint`; `params` is
                          not forwarded on that path.
    :return: whatever `func` returns.
    """
    # Fast path: checkpointing disabled.
    if not flag:
        return func(*inputs)

    if use_deepspeed:
        # Imported lazily so deepspeed is only required when requested.
        import deepspeed
        return deepspeed.checkpointing.checkpoint(func, *inputs)

    flat_args = (*inputs, *params)
    return CheckpointFunction.apply(func, len(inputs), *flat_args)
|
||||
|
||||
|
||||
class CheckpointFunction(torch.autograd.Function):
    """
    Autograd function that runs `run_function` without recording a graph in
    forward and re-runs it with gradients enabled in backward.

    `apply` is called as `apply(run_function, length, *args)`, where the
    first `length` entries of `args` are the inputs passed to `run_function`
    and the remaining entries are extra parameters to differentiate.
    """

    @staticmethod
    @torch.cuda.amp.custom_fwd
    def forward(ctx, run_function, length, *args):
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])

        # No graph is recorded here; backward recomputes the activations.
        with torch.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    @torch.cuda.amp.custom_bwd
    def backward(ctx, *output_grads):
        # Inputs may legitimately be non-tensors (e.g. an optional context
        # passed as None) or non-differentiable tensors (integer dtypes);
        # only floating-point/complex tensors can be marked as requiring grad.
        detached = [
            x.detach().requires_grad_(x.is_floating_point() or x.is_complex())
            if torch.is_tensor(x) else x
            for x in ctx.input_tensors
        ]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) if torch.is_tensor(x) else x for x in detached]
            output_tensors = ctx.run_function(*shallow_copies)

        # torch.autograd.grad rejects inputs that do not require grad, so
        # differentiate only the eligible ones and leave None elsewhere.
        candidates = detached + ctx.input_params
        wanted = [
            i for i, x in enumerate(candidates)
            if torch.is_tensor(x) and x.requires_grad
        ]
        input_grads = [None] * len(candidates)
        if wanted:
            grads = torch.autograd.grad(
                output_tensors,
                [candidates[i] for i in wanted],
                output_grads,
                allow_unused=True,
            )
            for i, grad in zip(wanted, grads):
                input_grads[i] = grad

        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        # The two leading Nones correspond to `run_function` and `length`.
        return (None, None) + tuple(input_grads)
|
||||
218
primitive_anything/michelangelo/models/modules/diffusion_transformer.py
Executable file
218
primitive_anything/michelangelo/models/modules/diffusion_transformer.py
Executable file
@@ -0,0 +1,218 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Optional
|
||||
|
||||
from .checkpoint import checkpoint
|
||||
from .transformer_blocks import (
|
||||
init_linear,
|
||||
MLP,
|
||||
MultiheadCrossAttention,
|
||||
MultiheadAttention,
|
||||
ResidualAttentionBlock
|
||||
)
|
||||
|
||||
|
||||
class AdaLayerNorm(nn.Module):
    """Layer norm whose scale and shift are predicted from a conditioning input.

    A linear layer maps the conditioning tensor to `2 * width` channels,
    which are split along dim 2 into `scale` and `shift`; the output is
    `layernorm(x) * (1 + scale) + shift`. The layer norm itself has no
    learnable affine parameters.
    """

    def __init__(self,
                 device: torch.device,
                 dtype: torch.dtype,
                 width: int):
        super().__init__()

        factory_kwargs = dict(device=device, dtype=dtype)

        # NOTE: this activation is registered but forward() never applies it.
        self.silu = nn.SiLU(inplace=True)
        self.linear = nn.Linear(width, width * 2, **factory_kwargs)
        self.layernorm = nn.LayerNorm(width, elementwise_affine=False, **factory_kwargs)

    def forward(self, x, timestep):
        # `timestep` must be 3-D after the projection because the split is on dim 2.
        modulation = self.linear(timestep)
        scale, shift = modulation.chunk(2, dim=2)
        normed = self.layernorm(x)
        return normed * (1 + scale) + shift
|
||||
|
||||
|
||||
class DitBlock(nn.Module):
    """Transformer block with timestep-conditioned layer norms.

    Applies, each with a residual connection: self-attention, an optional
    cross-attention over `context`, and an MLP. Every sub-layer is preceded
    by an `AdaLayerNorm` conditioned on `t`.

    Args:
        device, dtype: forwarded to every sub-module.
        n_ctx: forwarded to the self-attention module.
        width: channel width of the block.
        heads: number of attention heads.
        context_dim: channel width of the cross-attention context; when
            None, no cross-attention layer (and no `ln_2`) is created.
        qkv_bias: whether the q/k/v projections have a bias.
        init_scale: std passed to the sub-modules' weight initialisation.
        use_checkpoint: run the block under gradient checkpointing.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        context_dim: Optional[int],
        qkv_bias: bool = False,
        init_scale: float = 1.0,
        use_checkpoint: bool = False
    ):
        super().__init__()

        self.use_checkpoint = use_checkpoint

        self.attn = MultiheadAttention(
            device=device,
            dtype=dtype,
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias
        )
        self.ln_1 = AdaLayerNorm(device, dtype, width)

        if context_dim is not None:
            self.ln_2 = AdaLayerNorm(device, dtype, width)
            self.cross_attn = MultiheadCrossAttention(
                device=device,
                dtype=dtype,
                width=width,
                heads=heads,
                data_width=context_dim,
                init_scale=init_scale,
                qkv_bias=qkv_bias
            )

        self.mlp = MLP(device=device, dtype=dtype, width=width, init_scale=init_scale)
        self.ln_3 = AdaLayerNorm(device, dtype, width)

    def forward(self, x: torch.Tensor, t: torch.Tensor, context: Optional[torch.Tensor] = None):
        # Only tensors are handed to the checkpoint helper: its backward pass
        # detaches every input, which fails on a None placeholder.
        inputs = (x, t) if context is None else (x, t, context)
        return checkpoint(self._forward, inputs, self.parameters(), self.use_checkpoint)

    def _forward(self, x: torch.Tensor, t: torch.Tensor, context: Optional[torch.Tensor] = None):
        x = x + self.attn(self.ln_1(x, t))
        # NOTE: passing a context requires the block to have been built with
        # a non-None `context_dim`, otherwise `cross_attn` does not exist.
        if context is not None:
            x = x + self.cross_attn(self.ln_2(x, t), context)
        x = x + self.mlp(self.ln_3(x, t))
        return x
|
||||
|
||||
|
||||
class DiT(nn.Module):
    """A stack of `layers` identical `DitBlock`s applied in sequence.

    All constructor arguments other than `layers` are forwarded unchanged
    to every block.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_ctx: int,
        width: int,
        layers: int,
        heads: int,
        context_dim: int,
        init_scale: float = 0.25,
        qkv_bias: bool = False,
        use_checkpoint: bool = False
    ):
        super().__init__()
        self.n_ctx = n_ctx
        self.width = width
        self.layers = layers

        block_kwargs = dict(
            device=device,
            dtype=dtype,
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            context_dim=context_dim,
            qkv_bias=qkv_bias,
            init_scale=init_scale,
            use_checkpoint=use_checkpoint,
        )
        self.resblocks = nn.ModuleList()
        for _ in range(layers):
            self.resblocks.append(DitBlock(**block_kwargs))

    def forward(self, x: torch.Tensor, t: torch.Tensor, context: Optional[torch.Tensor] = None):
        # Each block receives the same conditioning `t` and `context`.
        hidden = x
        for block in self.resblocks:
            hidden = block(hidden, t, context)
        return hidden
|
||||
|
||||
|
||||
class UNetDiffusionTransformer(nn.Module):
    """U-Net shaped stack of residual attention blocks with skip connections.

    `layers` encoder blocks are followed by one middle block and `layers`
    decoder stages. Each decoder stage concatenates the most recent unused
    encoder output with the current activations on the last dim, projects
    the result back to `width` with a linear layer, optionally applies a
    layer norm (`skip_ln=True`), and runs a residual attention block.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_ctx: int,
        width: int,
        layers: int,
        heads: int,
        init_scale: float = 0.25,
        qkv_bias: bool = False,
        skip_ln: bool = False,
        use_checkpoint: bool = False
    ):
        super().__init__()

        self.n_ctx = n_ctx
        self.width = width
        self.layers = layers

        def make_resblock():
            # Every attention block in the network shares this configuration.
            return ResidualAttentionBlock(
                device=device,
                dtype=dtype,
                n_ctx=n_ctx,
                width=width,
                heads=heads,
                init_scale=init_scale,
                qkv_bias=qkv_bias,
                use_checkpoint=use_checkpoint
            )

        self.encoder = nn.ModuleList([make_resblock() for _ in range(layers)])
        self.middle_block = make_resblock()

        self.decoder = nn.ModuleList()
        for _ in range(layers):
            resblock = make_resblock()

            skip_proj = nn.Linear(width * 2, width, device=device, dtype=dtype)
            init_linear(skip_proj, init_scale)

            # The slot is kept (as None) even when the norm is disabled so
            # that every stage unpacks into three entries.
            skip_norm = nn.LayerNorm(width, device=device, dtype=dtype) if skip_ln else None

            self.decoder.append(nn.ModuleList([resblock, skip_proj, skip_norm]))

    def forward(self, x: torch.Tensor):
        skips = []
        for block in self.encoder:
            x = block(x)
            skips.append(x)

        x = self.middle_block(x)

        for resblock, linear, layer_norm in self.decoder:
            # Skips are consumed last-in, first-out.
            x = linear(torch.cat([skips.pop(), x], dim=-1))
            if layer_norm is not None:
                x = layer_norm(x)
            x = resblock(x)

        return x
|
||||
100
primitive_anything/michelangelo/models/modules/distributions.py
Executable file
100
primitive_anything/michelangelo/models/modules/distributions.py
Executable file
@@ -0,0 +1,100 @@
|
||||
import torch
|
||||
import numpy as np
|
||||
from typing import Union, List
|
||||
|
||||
|
||||
class AbstractDistribution:
    """Interface for distributions: subclasses provide `sample` and `mode`."""

    def sample(self):
        """Draw a sample; not implemented on the base class."""
        raise NotImplementedError

    def mode(self):
        """Return the mode; not implemented on the base class."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class DiracDistribution(AbstractDistribution):
    """Degenerate distribution that always yields the stored `value`."""

    def __init__(self, value):
        self.value = value

    def sample(self):
        """Return the stored value (there is no randomness)."""
        return self.value

    def mode(self):
        """Return the stored value."""
        return self.value
|
||||
|
||||
|
||||
class DiagonalGaussianDistribution(object):
    """Gaussian with diagonal covariance, parameterised by mean and log-variance.

    Args:
        parameters: either a single tensor holding mean and log-variance
            concatenated along `feat_dim` (it is split into two equal
            chunks), or a list/tuple `[mean, logvar]`.
        deterministic: when True, std and var are set to zero, so `sample`
            returns the mean, and `kl`/`nll` return a single zero.
        feat_dim: dimension along which a combined `parameters` tensor is
            split.

    The log-variance is clamped to [-30, 20] before std and var are derived.
    """

    def __init__(self, parameters: Union[torch.Tensor, List[torch.Tensor], tuple], deterministic=False, feat_dim=1):
        self.feat_dim = feat_dim
        self.parameters = parameters

        # Accept tuples as well as lists for the pre-split form.
        if isinstance(parameters, (list, tuple)):
            self.mean, self.logvar = parameters[0], parameters[1]
        else:
            self.mean, self.logvar = torch.chunk(parameters, 2, dim=feat_dim)

        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean)

    def _zero(self):
        # One-element zero on the same device/dtype as the mean, so it can be
        # combined with other loss terms without a device or dtype mismatch.
        return torch.zeros(1, device=self.mean.device, dtype=self.mean.dtype)

    def sample(self):
        """Return `mean + std * eps` with `eps` drawn from a standard normal."""
        x = self.mean + self.std * torch.randn_like(self.mean)
        return x

    def kl(self, other=None, dims=(1, 2, 3)):
        """KL divergence to `other` (or to a standard normal when None).

        The per-element terms are averaged (not summed) over `dims`.
        Returns a one-element zero tensor when the distribution is
        deterministic.
        """
        if self.deterministic:
            return self._zero()

        if other is None:
            return 0.5 * torch.mean(
                torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
                dim=dims)

        return 0.5 * torch.mean(
            torch.pow(self.mean - other.mean, 2) / other.var
            + self.var / other.var - 1.0 - self.logvar + other.logvar,
            dim=dims)

    def nll(self, sample, dims=(1, 2, 3)):
        """Negative log-likelihood of `sample`, summed over `dims`.

        Returns a one-element zero tensor when the distribution is
        deterministic.
        """
        if self.deterministic:
            return self._zero()
        # Plain Python float so the constant does not influence the tensor dtype.
        logtwopi = float(np.log(2.0 * np.pi))
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims)

    def mode(self):
        """Return the mean, which is the mode of a Gaussian."""
        return self.mean
|
||||
|
||||
|
||||
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
    Element-wise KL divergence KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2))).
    Arguments may be tensors or Python scalars as long as at least one is a
    tensor; the result follows normal broadcasting rules.
    """
    # The first tensor argument supplies dtype/device for scalar log-variances.
    reference = next(
        (arg for arg in (mean1, logvar1, mean2, logvar2) if isinstance(arg, torch.Tensor)),
        None,
    )
    assert reference is not None, "at least one argument must be a Tensor"

    def as_tensor(value):
        # torch.exp() needs a Tensor, so scalar log-variances are promoted.
        if isinstance(value, torch.Tensor):
            return value
        return torch.tensor(value).to(reference)

    logvar1 = as_tensor(logvar1)
    logvar2 = as_tensor(logvar2)

    mean_delta_sq = (mean1 - mean2) ** 2
    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + torch.exp(logvar1 - logvar2)
        + mean_delta_sq * torch.exp(-logvar2)
    )
|
||||
213
primitive_anything/michelangelo/models/modules/embedder.py
Executable file
213
primitive_anything/michelangelo/models/modules/embedder.py
Executable file
@@ -0,0 +1,213 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import math
|
||||
|
||||
VALID_EMBED_TYPES = ["identity", "fourier", "hashgrid", "sphere_harmonic", "triplane_fourier"]
|
||||
|
||||
|
||||
class FourierEmbedder(nn.Module):
    """Sin/cos positional embedding applied to the last dimension of the input.

    For an input `x` of shape [..., dim], every channel `x[..., i]` is
    multiplied by each frequency `f_j`, and the products are flattened
    channel-major into a tensor `e` of shape [..., dim * num_freqs]. The
    output concatenates, along the last dimension:

        [x (only if include_input), sin(e), cos(e)]

    Frequencies:
        * logspace=True:  f_j = 2^j for j = 0 .. num_freqs - 1;
        * logspace=False: num_freqs values linearly spaced between 1.0 and
          2^(num_freqs - 1);
        * when include_pi is True, every frequency is additionally
          multiplied by pi.

    Args:
        num_freqs (int): the number of frequencies, default is 6;
        logspace (bool): selects the frequency spacing described above;
        input_dim (int): the input dimension, default is 3; only used to
            compute `out_dim`;
        include_input (bool): prepend the raw input to the embedding,
            default is True;
        include_pi (bool): scale the frequencies by pi, default is True.

    Attributes:
        frequencies (torch.Tensor): float32 buffer of the frequencies
            (registered as non-persistent);
        out_dim (int): embedding size for `input_dim` channels, see `get_dims`.
    """

    def __init__(self,
                 num_freqs: int = 6,
                 logspace: bool = True,
                 input_dim: int = 3,
                 include_input: bool = True,
                 include_pi: bool = True) -> None:
        """Build the frequency table and record the output size."""
        super().__init__()

        if logspace:
            exponents = torch.arange(num_freqs, dtype=torch.float32)
            frequencies = 2.0 ** exponents
        else:
            top = 2.0 ** (num_freqs - 1)
            frequencies = torch.linspace(1.0, top, num_freqs, dtype=torch.float32)

        if include_pi:
            frequencies = frequencies * torch.pi

        self.register_buffer("frequencies", frequencies, persistent=False)
        self.include_input = include_input
        self.num_freqs = num_freqs

        self.out_dim = self.get_dims(input_dim)

    def get_dims(self, input_dim):
        """Return the embedding size produced for `input_dim` input channels."""
        # With zero frequencies forward() returns the input unchanged, so the
        # input counts towards the size even if include_input is False.
        keeps_input = self.include_input or self.num_freqs == 0
        per_channel = self.num_freqs * 2 + (1 if keeps_input else 0)
        return input_dim * per_channel

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """ Forward process.

        Args:
            x: tensor of shape [..., dim]

        Returns:
            embedding: tensor of shape [..., dim * (num_freqs * 2 + temp)],
                where temp is 1 if the input is included and 0 otherwise;
                `x` itself when num_freqs is not positive.
        """
        if self.num_freqs <= 0:
            return x

        # [..., dim, 1] * [num_freqs] -> [..., dim, num_freqs] -> [..., dim * num_freqs]
        scaled = x[..., None].contiguous() * self.frequencies
        embed = scaled.view(*x.shape[:-1], -1)

        pieces = [embed.sin(), embed.cos()]
        if self.include_input:
            pieces.insert(0, x)
        return torch.cat(tuple(pieces), dim=-1)
|
||||
|
||||
|
||||
class LearnedFourierEmbedder(nn.Module):
    """Sinusoidal embedding with learned frequencies.

    Follows @crowsonkb's learned sinusoidal positional embedding:
    https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/models/danbooru_128.py#L8

    A single learned vector of `(dim // 2) // in_channels` frequencies is
    shared by all input channels.
    """

    def __init__(self, in_channels, dim):
        super().__init__()
        assert (dim % 2) == 0
        freqs_per_channel = (dim // 2) // in_channels
        self.weights = nn.Parameter(torch.randn(freqs_per_channel))

    def forward(self, x):
        """

        Args:
            x (torch.FloatTensor): [..., c]

        Returns:
            x (torch.FloatTensor): [..., c + 2 * c * d], where d is the
                number of learned frequencies: the input followed by the
                sines and cosines of the scaled input.
        """
        leading_shape = x.shape[:-1]

        # [..., c, 1] * [1, d] = [..., c, d] -> [..., c * d]
        phases = (x[..., None] * self.weights[None] * 2 * np.pi).view(*leading_shape, -1)
        return torch.cat((x, phases.sin(), phases.cos()), dim=-1)
|
||||
|
||||
|
||||
class TriplaneLearnedFourierEmbedder(nn.Module):
    """Sum of three independently parameterised `LearnedFourierEmbedder`s.

    All three sub-embedders receive the same input; their outputs are added
    element-wise.
    """

    def __init__(self, in_channels, dim):
        super().__init__()

        self.yz_plane_embedder = LearnedFourierEmbedder(in_channels, dim)
        self.xz_plane_embedder = LearnedFourierEmbedder(in_channels, dim)
        self.xy_plane_embedder = LearnedFourierEmbedder(in_channels, dim)

        # NOTE(review): equals the sub-embedders' actual output width only
        # when dim // 2 is divisible by in_channels — confirm with callers.
        self.out_dim = in_channels + dim

    def forward(self, x):
        return (
            self.yz_plane_embedder(x)
            + self.xz_plane_embedder(x)
            + self.xy_plane_embedder(x)
        )
|
||||
|
||||
|
||||
def sequential_pos_embed(num_len, embed_dim):
    """Build a fixed sinusoidal position table of shape (num_len, embed_dim).

    Row m holds sin(m * w_k) for k = 0 .. embed_dim/2 - 1 followed by
    cos(m * w_k), with w_k = 1 / 10000^(k / (embed_dim / 2)).
    `embed_dim` must be even.
    """
    assert embed_dim % 2 == 0

    half_dim = embed_dim // 2
    positions = torch.arange(num_len, dtype=torch.float32)  # (M,)

    exponent = torch.arange(half_dim, dtype=torch.float32) / (embed_dim / 2.)
    inv_freq = 1. / 10000 ** exponent  # (D/2,)

    angles = torch.einsum("m,d->md", positions, inv_freq)  # (M, D/2), outer product

    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)  # (M, D)
|
||||
|
||||
|
||||
def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.
    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings: the cosines come
             first, then the sines, then one zero column when dim is odd.
    """
    half = dim // 2

    # Geometric frequency ladder from 1 down towards 1 / max_period.
    exponents = torch.arange(start=0, end=half, dtype=torch.float32)
    freqs = torch.exp(-math.log(max_period) * exponents / half)
    freqs = freqs.to(device=timesteps.device)

    column = timesteps[:, None].to(timesteps.dtype)
    phases = column * freqs[None]

    parts = [torch.cos(phases), torch.sin(phases)]
    embedding = torch.cat(parts, dim=-1)
    if dim % 2:
        # Pad so the result has exactly `dim` columns.
        pad = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, pad], dim=-1)
    return embedding
|
||||
|
||||
|
||||
def get_embedder(embed_type="fourier", num_freqs=-1, input_dim=3, degree=4,
                 num_levels=16, level_dim=2, per_level_scale=2, base_resolution=16,
                 log2_hashmap_size=19, desired_resolution=None):
    """Create an input embedder and report its output dimension.

    Args:
        embed_type: "identity" or "fourier" are implemented; "hashgrid" and
            "sphere_harmonic" raise NotImplementedError.
        num_freqs: number of Fourier frequencies; -1 with "fourier" selects
            the identity embedding.
        input_dim: number of input channels.
        degree, num_levels, level_dim, per_level_scale, base_resolution,
            log2_hashmap_size, desired_resolution: accepted for interface
            compatibility; not used by the implemented embedders.

    Returns:
        (embedder, out_dim): the embedding module and its output size.

    Raises:
        NotImplementedError: for "hashgrid" and "sphere_harmonic".
        ValueError: for any other unhandled `embed_type`.
    """
    if embed_type == "identity" or (embed_type == "fourier" and num_freqs == -1):
        return nn.Identity(), input_dim

    if embed_type == "fourier":
        embedder_obj = FourierEmbedder(num_freqs=num_freqs, input_dim=input_dim,
                                       logspace=True, include_input=True)
        return embedder_obj, embedder_obj.out_dim

    if embed_type in ("hashgrid", "sphere_harmonic"):
        raise NotImplementedError

    # NOTE(review): "triplane_fourier" is listed in VALID_EMBED_TYPES but has
    # no branch here, so it also ends up in this error.
    raise ValueError(f"{embed_type} is not valid. Currently only supports {VALID_EMBED_TYPES}")
|
||||
286
primitive_anything/michelangelo/models/modules/transformer_blocks.py
Executable file
286
primitive_anything/michelangelo/models/modules/transformer_blocks.py
Executable file
@@ -0,0 +1,286 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from typing import Optional
|
||||
|
||||
from .checkpoint import checkpoint
|
||||
|
||||
|
||||
def init_linear(l, stddev):
    """Initialise a linear layer in place: N(0, stddev^2) weights, zero bias.

    The bias is left untouched when the layer has none.
    """
    nn.init.normal_(l.weight, std=stddev)
    if l.bias is None:
        return
    nn.init.zeros_(l.bias)
|
||||
|
||||
|
||||
class MultiheadAttention(nn.Module):
    """Multi-head self-attention: fused q/k/v projection, attention, output projection.

    Both linear layers are re-initialised with `init_linear(..., init_scale)`.
    The attention core always runs through the gradient-checkpoint helper.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        init_scale: float,
        qkv_bias: bool,
        flash: bool = False
    ):
        super().__init__()
        self.n_ctx = n_ctx
        self.width = width
        self.heads = heads

        factory_kwargs = dict(device=device, dtype=dtype)
        self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias, **factory_kwargs)
        self.c_proj = nn.Linear(width, width, **factory_kwargs)
        self.attention = QKVMultiheadAttention(heads=heads, n_ctx=n_ctx, flash=flash, **factory_kwargs)

        for layer in (self.c_qkv, self.c_proj):
            init_linear(layer, init_scale)

    def forward(self, x):
        qkv = self.c_qkv(x)
        # The attention core has no parameters of its own, hence the empty tuple.
        attended = checkpoint(self.attention, (qkv,), (), True)
        return self.c_proj(attended)
|
||||
|
||||
|
||||
class QKVMultiheadAttention(nn.Module):
    """Parameter-free multi-head self-attention over a fused q/k/v tensor.

    `forward` takes a tensor of shape (batch, n_ctx, heads * 3 * head_dim)
    and returns (batch, n_ctx, heads * head_dim). With `flash=True` the
    computation is delegated to `F.scaled_dot_product_attention`; otherwise
    it is done explicitly with einsum and a float32 softmax.
    """

    def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int, n_ctx: int, flash: bool = False):
        super().__init__()
        self.device = device
        self.dtype = dtype
        self.heads = heads
        self.n_ctx = n_ctx
        self.flash = flash

    def forward(self, qkv):
        bs, n_ctx, width = qkv.shape
        attn_ch = width // self.heads // 3
        qkv = qkv.view(bs, n_ctx, self.heads, -1)
        q, k, v = torch.split(qkv, attn_ch, dim=-1)

        if self.flash:
            # scaled_dot_product_attention attends over the second-to-last
            # dimension, so heads must be moved next to the batch dimension:
            # (bs, n_ctx, heads, ch) -> (bs, heads, n_ctx, ch). Its default
            # scaling of 1/sqrt(ch) matches the explicit path below.
            q, k, v = (t.transpose(1, 2) for t in (q, k, v))
            out = F.scaled_dot_product_attention(q, k, v)
            out = out.transpose(1, 2).reshape(bs, n_ctx, -1)
        else:
            # Scaling q and k by ch^-1/4 each is equivalent to scaling the
            # logits by 1/sqrt(ch).
            scale = 1 / math.sqrt(math.sqrt(attn_ch))
            weight = torch.einsum(
                "bthc,bshc->bhts", q * scale, k * scale
            )  # More stable with f16 than dividing afterwards
            wdtype = weight.dtype
            # Softmax in float32, then cast back to the working dtype.
            weight = torch.softmax(weight.float(), dim=-1).type(wdtype)
            out = torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)

        return out
|
||||
|
||||
|
||||
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention then MLP, each with a residual.

    When `use_checkpoint` is True the whole block is evaluated under
    gradient checkpointing.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        init_scale: float = 1.0,
        qkv_bias: bool = True,
        flash: bool = False,
        use_checkpoint: bool = False
    ):
        super().__init__()

        self.use_checkpoint = use_checkpoint

        factory_kwargs = dict(device=device, dtype=dtype)
        self.attn = MultiheadAttention(
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            **factory_kwargs
        )
        self.ln_1 = nn.LayerNorm(width, **factory_kwargs)
        self.mlp = MLP(width=width, init_scale=init_scale, **factory_kwargs)
        self.ln_2 = nn.LayerNorm(width, **factory_kwargs)

    def _forward(self, x: torch.Tensor):
        # Un-checkpointed computation of the block.
        attended = x + self.attn(self.ln_1(x))
        return attended + self.mlp(self.ln_2(attended))

    def forward(self, x: torch.Tensor):
        return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint)
|
||||
|
||||
|
||||
class MultiheadCrossAttention(nn.Module):
    """Multi-head cross-attention from queries `x` to keys/values from `data`.

    `x` is projected to queries of width `width`, `data` (of width
    `data_width`, defaulting to `width`) to fused keys/values of width
    `2 * width`. All three linear layers are re-initialised with
    `init_linear(..., init_scale)`. The attention core always runs through
    the gradient-checkpoint helper.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        width: int,
        heads: int,
        init_scale: float,
        qkv_bias: bool = True,
        flash: bool = False,
        n_data: Optional[int] = None,
        data_width: Optional[int] = None,
    ):
        super().__init__()
        self.n_data = n_data
        self.width = width
        self.heads = heads
        self.data_width = data_width if data_width is not None else width

        factory_kwargs = dict(device=device, dtype=dtype)
        self.c_q = nn.Linear(width, width, bias=qkv_bias, **factory_kwargs)
        self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias, **factory_kwargs)
        self.c_proj = nn.Linear(width, width, **factory_kwargs)
        self.attention = QKVMultiheadCrossAttention(
            heads=heads, n_data=n_data, flash=flash, **factory_kwargs
        )

        for layer in (self.c_q, self.c_kv, self.c_proj):
            init_linear(layer, init_scale)

    def forward(self, x, data):
        queries = self.c_q(x)
        keys_values = self.c_kv(data)
        # The attention core has no parameters of its own, hence the empty tuple.
        attended = checkpoint(self.attention, (queries, keys_values), (), True)
        return self.c_proj(attended)
|
||||
|
||||
|
||||
class QKVMultiheadCrossAttention(nn.Module):
    """Parameter-free multi-head cross-attention.

    `forward(q, kv)` takes queries of shape (batch, n_ctx, heads * head_dim)
    and fused keys/values of shape (batch, n_data, heads * 2 * head_dim),
    and returns (batch, n_ctx, heads * head_dim). With `flash=True` the
    computation is delegated to `F.scaled_dot_product_attention`; otherwise
    it is done explicitly with einsum and a float32 softmax.
    """

    def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int,
                 flash: bool = False, n_data: Optional[int] = None):
        super().__init__()
        self.device = device
        self.dtype = dtype
        self.heads = heads
        self.n_data = n_data
        self.flash = flash

    def forward(self, q, kv):
        _, n_ctx, _ = q.shape
        bs, n_data, width = kv.shape
        attn_ch = width // self.heads // 2
        q = q.view(bs, n_ctx, self.heads, -1)
        kv = kv.view(bs, n_data, self.heads, -1)
        k, v = torch.split(kv, attn_ch, dim=-1)

        if self.flash:
            # scaled_dot_product_attention attends over the second-to-last
            # dimension, so heads must be moved next to the batch dimension:
            # (bs, seq, heads, ch) -> (bs, heads, seq, ch). Its default
            # scaling of 1/sqrt(ch) matches the explicit path below.
            q, k, v = (t.transpose(1, 2) for t in (q, k, v))
            out = F.scaled_dot_product_attention(q, k, v)
            out = out.transpose(1, 2).reshape(bs, n_ctx, -1)
        else:
            # Scaling q and k by ch^-1/4 each is equivalent to scaling the
            # logits by 1/sqrt(ch).
            scale = 1 / math.sqrt(math.sqrt(attn_ch))
            weight = torch.einsum(
                "bthc,bshc->bhts", q * scale, k * scale
            )  # More stable with f16 than dividing afterwards
            wdtype = weight.dtype
            # Softmax in float32, then cast back to the working dtype.
            weight = torch.softmax(weight.float(), dim=-1).type(wdtype)
            out = torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)

        return out
|
||||
|
||||
|
||||
class ResidualCrossAttentionBlock(nn.Module):
    """Pre-norm block: cross-attention from `x` to `data`, then an MLP.

    Both sub-layers use residual connections. `x` and `data` are normalised
    by separate layer norms before the attention; `data_width` defaults to
    `width`.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_data: Optional[int] = None,
        width: int,
        heads: int,
        data_width: Optional[int] = None,
        init_scale: float = 0.25,
        qkv_bias: bool = True,
        flash: bool = False
    ):
        super().__init__()

        data_width = width if data_width is None else data_width
        factory_kwargs = dict(device=device, dtype=dtype)

        self.attn = MultiheadCrossAttention(
            n_data=n_data,
            width=width,
            heads=heads,
            data_width=data_width,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            **factory_kwargs
        )
        self.ln_1 = nn.LayerNorm(width, **factory_kwargs)
        self.ln_2 = nn.LayerNorm(data_width, **factory_kwargs)
        self.mlp = MLP(width=width, init_scale=init_scale, **factory_kwargs)
        self.ln_3 = nn.LayerNorm(width, **factory_kwargs)

    def forward(self, x: torch.Tensor, data: torch.Tensor):
        queries = self.ln_1(x)
        memory = self.ln_2(data)
        x = x + self.attn(queries, memory)
        return x + self.mlp(self.ln_3(x))
|
||||
|
||||
|
||||
class MLP(nn.Module):
    """Two-layer feed-forward network: width -> 4 * width -> width with GELU.

    Both linear layers are re-initialised with `init_linear(..., init_scale)`.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 width: int,
                 init_scale: float):
        super().__init__()
        self.width = width

        hidden_width = width * 4
        self.c_fc = nn.Linear(width, hidden_width, device=device, dtype=dtype)
        self.c_proj = nn.Linear(hidden_width, width, device=device, dtype=dtype)
        self.gelu = nn.GELU()

        for layer in (self.c_fc, self.c_proj):
            init_linear(layer, init_scale)

    def forward(self, x):
        hidden = self.c_fc(x)
        activated = self.gelu(hidden)
        return self.c_proj(activated)
|
||||
|
||||
|
||||
class Transformer(nn.Module):
    """A stack of `layers` identical `ResidualAttentionBlock`s applied in sequence.

    All constructor arguments other than `layers` are forwarded unchanged
    to every block.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_ctx: int,
        width: int,
        layers: int,
        heads: int,
        init_scale: float = 0.25,
        qkv_bias: bool = True,
        flash: bool = False,
        use_checkpoint: bool = False
    ):
        super().__init__()
        self.n_ctx = n_ctx
        self.width = width
        self.layers = layers

        block_kwargs = dict(
            device=device,
            dtype=dtype,
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_checkpoint=use_checkpoint,
        )
        self.resblocks = nn.ModuleList()
        for _ in range(layers):
            self.resblocks.append(ResidualAttentionBlock(**block_kwargs))

    def forward(self, x: torch.Tensor):
        hidden = x
        for block in self.resblocks:
            hidden = block(hidden)
        return hidden
|
||||
308
primitive_anything/michelangelo/models/modules/transformer_vit.py
Executable file
308
primitive_anything/michelangelo/models/modules/transformer_vit.py
Executable file
@@ -0,0 +1,308 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Optional
|
||||
import warnings
|
||||
|
||||
from .checkpoint import checkpoint
|
||||
|
||||
|
||||
def _trunc_normal_(tensor, mean, std, a, b):
    # Fill `tensor` in place by inverse-CDF sampling of a normal restricted
    # to [a, b]; callers are expected to disable autograd around this call.
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    sqrt_two = math.sqrt(2.)

    def norm_cdf(x):
        # Standard normal cumulative distribution function.
        return (1. + math.erf(x / sqrt_two)) / 2.

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
                      "The distribution of values may be incorrect.",
                      stacklevel=2)

    # CDF values of the lower and upper truncation bounds.
    lower_cdf = norm_cdf((a - mean) / std)
    upper_cdf = norm_cdf((b - mean) / std)

    # 1. uniform on [2l-1, 2u-1]; 2. inverse error function gives a
    # truncated standard normal (up to a sqrt(2) factor); 3. rescale and
    # shift to the requested std/mean; 4. clamp to guard the bounds.
    tensor.uniform_(2 * lower_cdf - 1, 2 * upper_cdf - 1)
    tensor.erfinv_()
    tensor.mul_(std * sqrt_two).add_(mean)
    tensor.clamp_(min=a, max=b)
    return tensor


def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    """Fill `tensor` in place with samples from a truncated normal distribution.

    Values follow a normal distribution with the given `mean` and `std`,
    restricted to the interval [a, b]. The bounds are applied to the
    already shifted and scaled distribution, so `a` and `b` should be
    expressed in the same units as `mean` and `std`. A warning is emitted
    when `mean` lies more than two standard deviations outside [a, b].
    The fill runs with gradient tracking disabled.

    Args:
        tensor: an n-dimensional `torch.Tensor` (or `nn.Parameter`)
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value

    Returns:
        The same `tensor` object, after filling.

    Examples:
        >>> w = torch.empty(3, 5)
        >>> trunc_normal_(w)
    """
    with torch.no_grad():
        return _trunc_normal_(tensor, mean, std, a, b)
|
||||
|
||||
|
||||
def init_weights(m):
    """Initialise one module in place; intended for ``nn.Module.apply``.

    * ``nn.Linear``: the weight is filled through ``trunc_normal_`` with
      ``std=0.02`` and the bias, when the layer has one, is set to 0.
    * ``nn.LayerNorm``: the bias is set to 0 and the weight to 1. Either
      may be ``None`` (e.g. ``elementwise_affine=False``), in which case it
      is skipped instead of raising.
    * Any other module type is left untouched.

    Args:
        m: the module to initialise.
    """
    if isinstance(m, nn.Linear):
        trunc_normal_(m.weight, std=.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.LayerNorm):
        # A LayerNorm without affine parameters stores None in place of the
        # parameter; passing None to nn.init.constant_ would raise.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
        if m.weight is not None:
            nn.init.constant_(m.weight, 1.0)
|
||||
|
||||
|
||||
class MultiheadAttention(nn.Module):
    """Self-attention layer: fused QKV projection, attention, output projection.

    Args:
        device: device passed to every sub-module on construction.
        dtype: dtype passed to every sub-module on construction.
        n_ctx: stored on the module and forwarded to the attention core.
        width: feature size of the input and of the output.
        heads: number of attention heads.
        qkv_bias: whether the fused QKV projection has a bias term.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        qkv_bias: bool
    ):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}

        self.n_ctx, self.width, self.heads = n_ctx, width, heads

        # One linear layer emits queries, keys and values together.
        self.c_qkv = nn.Linear(width, 3 * width, bias=qkv_bias, **factory_kwargs)
        self.c_proj = nn.Linear(width, width, **factory_kwargs)
        self.attention = QKVMultiheadAttention(heads=heads, n_ctx=n_ctx, **factory_kwargs)

    def forward(self, x):
        """Project ``x`` to packed QKV, attend, and project back to ``width``."""
        packed_qkv = self.c_qkv(x)
        # The attention core is always routed through ``checkpoint`` (flag is
        # hard-coded to True); the empty tuple is the ``params`` argument.
        attended = checkpoint(self.attention, (packed_qkv,), (), True)
        return self.c_proj(attended)
|
||||
|
||||
|
||||
class QKVMultiheadAttention(nn.Module):
    """Scaled dot-product multi-head attention over a packed QKV tensor.

    The module creates no parameters of its own; ``device``, ``dtype`` and
    ``n_ctx`` are only stored as attributes.
    """

    def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int, n_ctx: int):
        super().__init__()
        self.device, self.dtype = device, dtype
        self.heads, self.n_ctx = heads, n_ctx

    def forward(self, qkv):
        """Attend over ``qkv`` of shape ``(bs, n_ctx, heads * 3 * attn_ch)``.

        Returns a tensor of shape ``(bs, n_ctx, heads * attn_ch)``.
        """
        batch, seq_len, packed = qkv.shape
        head_dim = packed // self.heads // 3

        # Per head, the last axis holds [q | k | v], each ``head_dim`` wide.
        per_head = qkv.view(batch, seq_len, self.heads, -1)
        q, k, v = per_head.split(head_dim, dim=-1)

        scale = 1 / math.sqrt(head_dim)
        scores = torch.einsum("bthc,bshc->bhts", q, k) * scale

        # Softmax is evaluated on a ``.float()`` copy and cast back to the
        # dtype of the scores afterwards.
        probs = torch.softmax(scores.float(), dim=-1).type(scores.dtype)

        mixed = torch.einsum("bhts,bshc->bthc", probs, v)
        return mixed.reshape(batch, seq_len, -1)
|
||||
|
||||
|
||||
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention then MLP, each with a residual.

    Args:
        device: device passed to every sub-module on construction.
        dtype: dtype passed to every sub-module on construction.
        n_ctx: forwarded to the attention layer.
        width: feature size of the block input and output.
        heads: number of attention heads.
        qkv_bias: whether the QKV projection of the attention layer has a bias.
        use_checkpoint: passed as the ``flag`` argument of ``checkpoint`` in
            ``forward``.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        n_ctx: int,
        width: int,
        heads: int,
        qkv_bias: bool = True,
        use_checkpoint: bool = False
    ):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}

        self.use_checkpoint = use_checkpoint

        self.attn = MultiheadAttention(
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            qkv_bias=qkv_bias,
            **factory_kwargs
        )
        self.ln_1 = nn.LayerNorm(width, **factory_kwargs)
        self.mlp = MLP(width=width, **factory_kwargs)
        self.ln_2 = nn.LayerNorm(width, **factory_kwargs)

    def _forward(self, x: torch.Tensor):
        """Apply both residual sub-layers to ``x``."""
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

    def forward(self, x: torch.Tensor):
        """Run ``_forward`` through ``checkpoint``, gated by ``use_checkpoint``."""
        inputs = (x,)
        return checkpoint(self._forward, inputs, self.parameters(), self.use_checkpoint)
|
||||
|
||||
|
||||
class MultiheadCrossAttention(nn.Module):
    """Cross-attention layer: queries come from ``x``, keys/values from ``data``.

    Args:
        device: device passed to every sub-module on construction.
        dtype: dtype passed to every sub-module on construction.
        width: feature size of the queries and of the output.
        heads: number of attention heads.
        qkv_bias: whether the query and key/value projections have a bias.
        n_data: stored on the module and forwarded to the attention core.
        data_width: feature size of ``data``; falls back to ``width`` when
            ``None``.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        width: int,
        heads: int,
        qkv_bias: bool = True,
        n_data: Optional[int] = None,
        data_width: Optional[int] = None,
    ):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}

        self.n_data, self.width, self.heads = n_data, width, heads
        if data_width is None:
            self.data_width = width
        else:
            self.data_width = data_width

        self.c_q = nn.Linear(width, width, bias=qkv_bias, **factory_kwargs)
        # Keys and values are produced together by one projection of ``data``.
        self.c_kv = nn.Linear(self.data_width, 2 * width, bias=qkv_bias, **factory_kwargs)
        self.c_proj = nn.Linear(width, width, **factory_kwargs)
        self.attention = QKVMultiheadCrossAttention(
            heads=heads, n_data=n_data, **factory_kwargs
        )

    def forward(self, x, data):
        """Attend from ``x`` to ``data`` and project the result to ``width``."""
        queries = self.c_q(x)
        packed_kv = self.c_kv(data)
        # The attention core is always routed through ``checkpoint`` (flag is
        # hard-coded to True); the empty tuple is the ``params`` argument.
        attended = checkpoint(self.attention, (queries, packed_kv), (), True)
        return self.c_proj(attended)
|
||||
|
||||
|
||||
class QKVMultiheadCrossAttention(nn.Module):
    """Scaled dot-product multi-head attention with separate Q and packed KV.

    The module creates no parameters of its own; ``device``, ``dtype`` and
    ``n_data`` are only stored as attributes.
    """

    def __init__(self, *, device: torch.device, dtype: torch.dtype, heads: int, n_data: Optional[int] = None):
        super().__init__()
        self.device, self.dtype = device, dtype
        self.heads, self.n_data = heads, n_data

    def forward(self, q, kv):
        """Attend from ``q`` to ``kv``.

        ``q`` is ``(bs, n_ctx, heads * attn_ch)`` and ``kv`` is
        ``(bs, n_data, heads * 2 * attn_ch)``; the result is
        ``(bs, n_ctx, heads * attn_ch)``.
        """
        _, n_queries, _ = q.shape
        batch, n_keys, packed = kv.shape
        head_dim = packed // self.heads // 2

        q_heads = q.view(batch, n_queries, self.heads, -1)
        # Per head, the last axis of ``kv`` holds [k | v], each ``head_dim`` wide.
        kv_heads = kv.view(batch, n_keys, self.heads, -1)
        k, v = kv_heads.split(head_dim, dim=-1)

        scale = 1 / math.sqrt(head_dim)
        scores = torch.einsum("bthc,bshc->bhts", q_heads, k) * scale

        # Softmax is evaluated on a ``.float()`` copy and cast back to the
        # dtype of the scores afterwards.
        probs = torch.softmax(scores.float(), dim=-1).type(scores.dtype)

        mixed = torch.einsum("bhts,bshc->bthc", probs, v)
        return mixed.reshape(batch, n_queries, -1)
|
||||
|
||||
|
||||
class ResidualCrossAttentionBlock(nn.Module):
    """Pre-norm block: cross-attention to ``data`` then MLP, each with a residual.

    Args:
        device: device passed to every sub-module on construction.
        dtype: dtype passed to every sub-module on construction.
        n_data: forwarded to the cross-attention layer.
        width: feature size of ``x`` and of the block output.
        heads: number of attention heads.
        data_width: feature size of ``data``; falls back to ``width`` when
            ``None``.
        qkv_bias: whether the attention projections have a bias.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_data: Optional[int] = None,
        width: int,
        heads: int,
        data_width: Optional[int] = None,
        qkv_bias: bool = True
    ):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}

        data_width = width if data_width is None else data_width

        self.attn = MultiheadCrossAttention(
            n_data=n_data,
            width=width,
            heads=heads,
            data_width=data_width,
            qkv_bias=qkv_bias,
            **factory_kwargs
        )
        # ln_1 normalises the queries, ln_2 the attended data, ln_3 the MLP input.
        self.ln_1 = nn.LayerNorm(width, **factory_kwargs)
        self.ln_2 = nn.LayerNorm(data_width, **factory_kwargs)
        self.mlp = MLP(width=width, **factory_kwargs)
        self.ln_3 = nn.LayerNorm(width, **factory_kwargs)

    def forward(self, x: torch.Tensor, data: torch.Tensor):
        """Update ``x`` by attending to ``data`` and then applying the MLP."""
        attn_out = self.attn(self.ln_1(x), self.ln_2(data))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_3(x))
        return x + mlp_out
|
||||
|
||||
|
||||
class MLP(nn.Module):
    """Two-layer feed-forward network: ``width -> 4 * width -> width`` with GELU.

    Args:
        device: device passed to both linear layers on construction.
        dtype: dtype passed to both linear layers on construction.
        width: feature size of the input and of the output.
    """

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 width: int):
        super().__init__()
        hidden_width = width * 4
        factory_kwargs = {"device": device, "dtype": dtype}

        self.width = width
        self.c_fc = nn.Linear(width, hidden_width, **factory_kwargs)
        self.c_proj = nn.Linear(hidden_width, width, **factory_kwargs)
        self.gelu = nn.GELU()

    def forward(self, x):
        """Expand, apply GELU, and project back to ``width``."""
        hidden = self.c_fc(x)
        activated = self.gelu(hidden)
        return self.c_proj(activated)
|
||||
|
||||
|
||||
class Transformer(nn.Module):
    """Stack of ``layers`` residual self-attention blocks applied in sequence.

    After the blocks are built, ``init_weights`` is applied to every
    sub-module through ``self.apply``.

    Args:
        device: device passed to every block on construction.
        dtype: dtype passed to every block on construction.
        n_ctx: forwarded to every block.
        width: feature size of the input and of the output.
        layers: number of blocks in the stack.
        heads: number of attention heads per block.
        qkv_bias: whether the QKV projections have a bias.
        use_checkpoint: forwarded to every block.
    """

    def __init__(
        self,
        *,
        device: Optional[torch.device],
        dtype: Optional[torch.dtype],
        n_ctx: int,
        width: int,
        layers: int,
        heads: int,
        qkv_bias: bool = True,
        use_checkpoint: bool = False
    ):
        super().__init__()
        self.n_ctx, self.width, self.layers = n_ctx, width, layers

        # Every block is built from the same configuration.
        block_kwargs = {
            "device": device,
            "dtype": dtype,
            "n_ctx": n_ctx,
            "width": width,
            "heads": heads,
            "qkv_bias": qkv_bias,
            "use_checkpoint": use_checkpoint,
        }
        blocks = nn.ModuleList()
        for _ in range(layers):
            blocks.append(ResidualAttentionBlock(**block_kwargs))
        self.resblocks = blocks

        self.apply(init_weights)

    def forward(self, x: torch.Tensor):
        """Feed ``x`` through every block in order and return the result."""
        hidden = x
        for layer in self.resblocks:
            hidden = layer(hidden)
        return hidden
|
||||
Reference in New Issue
Block a user