backup wip

Author: Alexander Soare
Date: 2024-04-16 12:51:32 +01:00
parent 5608e659e6
commit 03b08eb74e
8 changed files with 240 additions and 356 deletions


@@ -1,8 +1,10 @@
"""
TODO(alexander-soare):
- Remove reliance on Robomimic for SpatialSoftmax.
- Remove reliance on diffusers for DDPMScheduler.
- Remove reliance on diffusers for DDPMScheduler and LR scheduler.
- Move EMA out of policy.
- Consolidate _DiffusionUnetImagePolicy into DiffusionPolicy.
- One more pass on comments and documentation.
"""
import copy
@@ -10,10 +12,10 @@ import logging
import math
import time
from collections import deque
from itertools import chain
from typing import Callable
import einops
import hydra
import torch
import torch.nn.functional as F # noqa: N812
import torchvision
@@ -23,12 +25,12 @@ from robomimic.models.base_nets import SpatialSoftmax
from torch import Tensor, nn
from torch.nn.modules.batchnorm import _BatchNorm
from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig
from lerobot.common.policies.utils import (
get_device_from_parameters,
get_dtype_from_parameters,
populate_queues,
)
from lerobot.common.utils import get_safe_torch_device
logger = logging.getLogger(__name__)
@@ -41,69 +43,29 @@ class DiffusionPolicy(nn.Module):
name = "diffusion"
def __init__(
self,
cfg,
cfg_device,
cfg_noise_scheduler,
cfg_optimizer,
cfg_ema,
shape_meta: dict,
horizon,
n_action_steps,
n_obs_steps,
num_inference_steps=None,
diffusion_step_embed_dim=256,
down_dims=(256, 512, 1024),
kernel_size=5,
n_groups=8,
film_scale_modulation=True,
**_,
):
def __init__(self, cfg: DiffusionConfig, lr_scheduler_num_training_steps: int):
super().__init__()
self.cfg = cfg
self.n_obs_steps = n_obs_steps
self.n_action_steps = n_action_steps
# Queues are populated during rollout of the policy. They contain the n latest observations and actions.
self._queues = None
noise_scheduler = hydra.utils.instantiate(cfg_noise_scheduler)
self.diffusion = _DiffusionUnetImagePolicy(
cfg,
shape_meta=shape_meta,
noise_scheduler=noise_scheduler,
horizon=horizon,
n_action_steps=n_action_steps,
n_obs_steps=n_obs_steps,
num_inference_steps=num_inference_steps,
diffusion_step_embed_dim=diffusion_step_embed_dim,
down_dims=down_dims,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
)
self.device = get_safe_torch_device(cfg_device)
self.diffusion.to(self.device)
self.diffusion = _DiffusionUnetImagePolicy(cfg)
# TODO(alexander-soare): This should probably be managed outside of the policy class.
self.ema_diffusion = None
self.ema = None
if self.cfg.use_ema:
self.ema_diffusion = copy.deepcopy(self.diffusion)
self.ema = hydra.utils.instantiate(
cfg_ema,
model=self.ema_diffusion,
)
self.ema = _EMA(cfg, model=self.ema_diffusion)
self.optimizer = hydra.utils.instantiate(
cfg_optimizer,
params=self.diffusion.parameters(),
# TODO(alexander-soare): Move optimizer out of policy.
self.optimizer = torch.optim.Adam(
self.diffusion.parameters(), cfg.lr, cfg.adam_betas, cfg.adam_eps, cfg.adam_weight_decay
)
# TODO(rcadene): modify lr scheduler so that it doesnt depend on epochs but steps
# TODO(alexander-soare): Move LR scheduler out of policy.
# TODO(rcadene): modify lr scheduler so that it doesn't depend on epochs but steps
self.global_step = 0
# configure lr scheduler
@@ -111,7 +73,7 @@ class DiffusionPolicy(nn.Module):
cfg.lr_scheduler,
optimizer=self.optimizer,
num_warmup_steps=cfg.lr_warmup_steps,
num_training_steps=cfg.offline_steps,
num_training_steps=lr_scheduler_num_training_steps,
# pytorch assumes stepping LRScheduler every epoch
# however huggingface diffusers steps it every batch
last_epoch=self.global_step - 1,
@@ -122,9 +84,9 @@ class DiffusionPolicy(nn.Module):
Clear observation and action queues. Should be called on `env.reset()`
"""
self._queues = {
"observation.image": deque(maxlen=self.n_obs_steps),
"observation.state": deque(maxlen=self.n_obs_steps),
"action": deque(maxlen=self.n_action_steps),
"observation.image": deque(maxlen=self.cfg.n_obs_steps),
"observation.state": deque(maxlen=self.cfg.n_obs_steps),
"action": deque(maxlen=self.cfg.n_action_steps),
}
@torch.no_grad
@@ -138,11 +100,13 @@ class DiffusionPolicy(nn.Module):
- The diffusion model generates `horizon` steps worth of actions.
- `n_action_steps` worth of actions are actually kept for execution, starting from the current step.
Schematically this looks like:
----------------------------------------------------------------------------------------------
(legend: o = n_obs_steps, h = horizon, a = n_action_steps)
|timestep | n-o+1 | n-o+2 | ..... | n | ..... | n+a-1 | n+a | ..... |n-o+1+h|
|observation is used | YES | YES | ..... | NO | NO | NO | NO | NO | NO |
|observation is used | YES | YES | YES | NO | NO | NO | NO | NO | NO |
|action is generated | YES | YES | YES | YES | YES | YES | YES | YES | YES |
|action is used | NO | NO | NO | YES | YES | YES | NO | NO | NO |
----------------------------------------------------------------------------------------------
Note that this means we require: `n_action_steps <= horizon - n_obs_steps + 1`. Also, note that
"horizon" may not be the best name to describe what the variable actually means, because this period is
actually measured from the first observation, which (if `n_obs_steps` > 1) happened in the past.
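As a concrete illustration of the slicing described above (a standalone sketch with made-up numbers, not part of this file): with n_obs_steps=2, horizon=16 and n_action_steps=8, the policy conditions on the 2 latest observations, generates 16 actions starting from the oldest of those observations, and executes the 8 actions beginning at the current step.
# Standalone sketch of the action-selection arithmetic in the schematic above (illustrative values).
n_obs_steps, horizon, n_action_steps = 2, 16, 8
assert n_action_steps <= horizon - n_obs_steps + 1
generated = list(range(horizon))   # stand-in for the `horizon` generated actions
start = n_obs_steps - 1            # index of the current step within the generated horizon
end = start + n_action_steps
executed = generated[start:end]    # the `n_action_steps` actions that are actually run
print(executed)                    # [1, 2, 3, 4, 5, 6, 7, 8]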
@@ -213,57 +177,41 @@ class DiffusionPolicy(nn.Module):
class _DiffusionUnetImagePolicy(nn.Module):
def __init__(
self,
cfg,
shape_meta: dict,
noise_scheduler: DDPMScheduler,
horizon,
n_action_steps,
n_obs_steps,
num_inference_steps=None,
diffusion_step_embed_dim=256,
down_dims=(256, 512, 1024),
kernel_size=5,
n_groups=8,
film_scale_modulation=True,
):
def __init__(self, cfg: DiffusionConfig):
super().__init__()
action_shape = shape_meta["action"]["shape"]
assert len(action_shape) == 1
action_dim = action_shape[0]
self.rgb_encoder = _RgbEncoder(input_shape=shape_meta.obs.image.shape, **cfg.rgb_encoder)
self.cfg = cfg
self.rgb_encoder = _RgbEncoder(cfg)
self.unet = _ConditionalUnet1D(
input_dim=action_dim,
global_cond_dim=(action_dim + self.rgb_encoder.feature_dim) * n_obs_steps,
diffusion_step_embed_dim=diffusion_step_embed_dim,
down_dims=down_dims,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
cfg, global_cond_dim=(cfg.action_dim + self.rgb_encoder.feature_dim) * cfg.n_obs_steps
)
self.noise_scheduler = noise_scheduler
self.horizon = horizon
self.action_dim = action_dim
self.n_action_steps = n_action_steps
self.n_obs_steps = n_obs_steps
self.noise_scheduler = DDPMScheduler(
num_train_timesteps=cfg.num_train_timesteps,
beta_start=cfg.beta_start,
beta_end=cfg.beta_end,
beta_schedule=cfg.beta_schedule,
variance_type="fixed_small",
clip_sample=cfg.clip_sample,
clip_sample_range=cfg.clip_sample_range,
prediction_type=cfg.prediction_type,
)
if num_inference_steps is None:
num_inference_steps = noise_scheduler.config.num_train_timesteps
self.num_inference_steps = num_inference_steps
if cfg.num_inference_steps is None:
self.num_inference_steps = self.noise_scheduler.config.num_train_timesteps
else:
self.num_inference_steps = cfg.num_inference_steps
# ========= inference ============
def conditional_sample(self, batch_size, global_cond=None, generator=None):
def conditional_sample(
self, batch_size: int, global_cond: Tensor | None = None, generator: torch.Generator | None = None
) -> Tensor:
device = get_device_from_parameters(self)
dtype = get_dtype_from_parameters(self)
# Sample prior.
sample = torch.randn(
size=(batch_size, self.horizon, self.action_dim),
size=(batch_size, self.cfg.horizon, self.cfg.action_dim),
dtype=dtype,
device=device,
generator=generator,
@@ -283,7 +231,7 @@ class _DiffusionUnetImagePolicy(nn.Module):
return sample
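The reverse-diffusion loop itself falls outside this hunk. For orientation, a standard loop with the diffusers DDPMScheduler looks roughly like the following (a hedged sketch of the usual pattern, not the file's exact code):
# Hedged sketch of a typical diffusers denoising loop (the real body is outside this hunk).
self.noise_scheduler.set_timesteps(self.num_inference_steps)
for t in self.noise_scheduler.timesteps:
    # Predict the noise (or the clean sample, per cfg.prediction_type) for the whole batch.
    timesteps = torch.full(sample.shape[:1], t, dtype=torch.long, device=device)
    model_output = self.unet(sample, timesteps, global_cond=global_cond)
    # One reverse-diffusion step back towards the data distribution.
    sample = self.noise_scheduler.step(model_output, t, sample, generator=generator).prev_sample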
def generate_actions(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
def generate_actions(self, batch: dict[str, Tensor]) -> Tensor:
"""
This function expects `batch` to have (at least):
{
@@ -293,8 +241,7 @@ class _DiffusionUnetImagePolicy(nn.Module):
"""
assert set(batch).issuperset({"observation.state", "observation.image"})
batch_size, n_obs_steps = batch["observation.state"].shape[:2]
assert n_obs_steps == self.n_obs_steps
assert self.n_obs_steps == n_obs_steps
assert n_obs_steps == self.cfg.n_obs_steps
# Extract image feature (first combine batch and sequence dims).
img_features = self.rgb_encoder(einops.rearrange(batch["observation.image"], "b n ... -> (b n) ..."))
@@ -307,13 +254,13 @@ class _DiffusionUnetImagePolicy(nn.Module):
sample = self.conditional_sample(batch_size, global_cond=global_cond)
# `horizon` steps worth of actions (from the first observation).
action = sample[..., : self.action_dim]
actions = sample[..., : self.cfg.action_dim]
# Extract `n_action_steps` steps worth of actions (from the current observation).
start = n_obs_steps - 1
end = start + self.n_action_steps
action = action[:, start:end]
end = start + self.cfg.n_action_steps
actions = actions[:, start:end]
return action
return actions
def compute_loss(self, batch: dict[str, Tensor]) -> Tensor:
"""
@@ -329,9 +276,8 @@ class _DiffusionUnetImagePolicy(nn.Module):
assert set(batch).issuperset({"observation.state", "observation.image", "action", "action_is_pad"})
batch_size, n_obs_steps = batch["observation.state"].shape[:2]
horizon = batch["action"].shape[1]
assert horizon == self.horizon
assert n_obs_steps == self.n_obs_steps
assert self.n_obs_steps == n_obs_steps
assert horizon == self.cfg.horizon
assert n_obs_steps == self.cfg.n_obs_steps
# Extract image feature (first combine batch and sequence dims).
img_features = self.rgb_encoder(einops.rearrange(batch["observation.image"], "b n ... -> (b n) ..."))
@@ -359,14 +305,13 @@ class _DiffusionUnetImagePolicy(nn.Module):
pred = self.unet(noisy_trajectory, timesteps, global_cond=global_cond)
# Compute the loss.
# The targe is either the original trajectory, or the noise.
pred_type = self.noise_scheduler.config.prediction_type
if pred_type == "epsilon":
# The target is either the original trajectory, or the noise.
if self.cfg.prediction_type == "epsilon":
target = eps
elif pred_type == "sample":
elif self.cfg.prediction_type == "sample":
target = batch["action"]
else:
raise ValueError(f"Unsupported prediction type {pred_type}")
raise ValueError(f"Unsupported prediction type {self.cfg.prediction_type}")
loss = F.mse_loss(pred, target, reduction="none")
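For reference, the forward-noising step that produces `noisy_trajectory`, `eps` and `timesteps` sits outside this hunk; with the diffusers scheduler it typically looks like the sketch below (hedged, not the file's exact lines). The per-element loss above is presumably masked with `batch["action_is_pad"]` before reduction, which is why that key is asserted at the top of this method.
# Hedged sketch of the standard forward-noising step for training (outside this hunk).
trajectory = batch["action"]                                    # (B, horizon, action_dim)
eps = torch.randn(trajectory.shape, device=trajectory.device)   # the noise to be predicted
timesteps = torch.randint(
    low=0,
    high=self.noise_scheduler.config.num_train_timesteps,
    size=(trajectory.shape[0],),
    device=trajectory.device,
).long()
noisy_trajectory = self.noise_scheduler.add_noise(trajectory, eps, timesteps)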
@@ -384,64 +329,35 @@ class _RgbEncoder(nn.Module):
Includes the ability to normalize and crop the image first.
"""
def __init__(
self,
input_shape: tuple[int, int, int],
norm_mean_std: tuple[float, float] = [1.0, 1.0],
crop_shape: tuple[int, int] | None = None,
random_crop: bool = False,
backbone_name: str = "resnet18",
pretrained_backbone: bool = False,
use_group_norm: bool = False,
num_keypoints: int = 32,
):
"""
Args:
input_shape: channel-first input shape (C, H, W)
norm_mean_std: mean and standard deviation used for image normalization. Images are normalized as
(image - mean) / std.
crop_shape: (H, W) shape to crop to (must fit within the input shape). If not provided, no
cropping is done.
random_crop: Whether the crop should be random at training time (it's always a center crop in
eval mode).
backbone_name: The name of one of the available resnet models from torchvision (eg resnet18).
pretrained_backbone: whether to use timm pretrained weights.
use_group_norm: Whether to replace batch normalization with group normalization in the backbone.
The group sizes are set to be about 16 (to be precise, feature_dim // 16).
num_keypoints: Number of keypoints for SpatialSoftmax (default value of 32 matches PushT Image).
"""
def __init__(self, cfg: DiffusionConfig):
super().__init__()
if input_shape[0] != 3:
raise ValueError("Only RGB images are handled")
if not backbone_name.startswith("resnet"):
raise ValueError(
"Only resnet is supported for now (because of the assumption that 'layer4' is the output layer)"
)
# Set up optional preprocessing.
if norm_mean_std == [1.0, 1.0]:
if all(v == 1.0 for v in chain(cfg.image_normalization_mean, cfg.image_normalization_std)):
self.normalizer = nn.Identity()
else:
self.normalizer = torchvision.transforms.Normalize(mean=norm_mean_std[0], std=norm_mean_std[1])
if crop_shape is not None:
self.normalizer = torchvision.transforms.Normalize(
mean=cfg.image_normalization_mean, std=cfg.image_normalization_std
)
if cfg.crop_shape is not None:
self.do_crop = True
# Always use center crop for eval
self.center_crop = torchvision.transforms.CenterCrop(crop_shape)
if random_crop:
self.maybe_random_crop = torchvision.transforms.RandomCrop(crop_shape)
self.center_crop = torchvision.transforms.CenterCrop(cfg.crop_shape)
if cfg.crop_is_random:
self.maybe_random_crop = torchvision.transforms.RandomCrop(cfg.crop_shape)
else:
self.maybe_random_crop = self.center_crop
else:
self.do_crop = False
# Set up backbone.
backbone_model = getattr(torchvision.models, backbone_name)(pretrained=pretrained_backbone)
backbone_model = getattr(torchvision.models, cfg.vision_backbone)(
pretrained=cfg.use_pretrained_backbone
)
# Note: This assumes that the layer4 feature map is children()[-3]
# TODO(alexander-soare): Use a safer alternative.
self.backbone = nn.Sequential(*(list(backbone_model.children())[:-2]))
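Concretely, children()[:-2] on a torchvision resnet drops the trailing avgpool and fc modules, so the backbone ends at layer4's spatial feature map (an illustrative check, not part of the diff):
# Illustrative: the two modules removed from a torchvision resnet18 by children()[:-2].
m = torchvision.models.resnet18(pretrained=False)
print([name for name, _ in list(m.named_children())[-2:]])  # ['avgpool', 'fc']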
if use_group_norm:
if pretrained_backbone:
if cfg.use_group_norm:
if cfg.use_pretrained_backbone:
raise ValueError(
"You can't replace BatchNorm in a pretrained model without ruining the weights!"
)
@@ -454,10 +370,10 @@ class _RgbEncoder(nn.Module):
# Set up pooling and final layers.
# Use a dry run to get the feature map shape.
with torch.inference_mode():
feat_map_shape = tuple(self.backbone(torch.zeros(size=(1, *input_shape))).shape[1:])
self.pool = SpatialSoftmax(feat_map_shape, num_kp=num_keypoints)
self.feature_dim = num_keypoints * 2
self.out = nn.Linear(num_keypoints * 2, self.feature_dim)
feat_map_shape = tuple(self.backbone(torch.zeros(size=(1, 3, *cfg.image_size))).shape[1:])
self.pool = SpatialSoftmax(feat_map_shape, num_kp=cfg.spatial_softmax_num_keypoints)
self.feature_dim = cfg.spatial_softmax_num_keypoints * 2
self.out = nn.Linear(cfg.spatial_softmax_num_keypoints * 2, self.feature_dim)
self.relu = nn.ReLU()
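For intuition, SpatialSoftmax reduces the backbone's (B, C, H, W) feature map to num_kp expected (x, y) image coordinates, which is why feature_dim is 2 * spatial_softmax_num_keypoints regardless of input resolution. A conceptual sketch (not robomimic's exact implementation):
# Conceptual sketch of SpatialSoftmax: per keypoint channel, softmax over locations,
# then take the expected (x, y) coordinate under that spatial distribution.
feat = torch.randn(1, 32, 3, 3)                       # (B, num_kp, H, W), e.g. after a 1x1 projection
attention = feat.flatten(2).softmax(dim=-1)           # (B, num_kp, H*W)
xs = torch.linspace(-1, 1, 3).repeat(3)               # x coordinate of each flattened cell
ys = torch.linspace(-1, 1, 3).repeat_interleave(3)    # y coordinate of each flattened cell
keypoints = torch.stack([attention @ xs, attention @ ys], dim=-1)  # (B, num_kp, 2)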
def forward(self, x: Tensor) -> Tensor:
@@ -516,16 +432,18 @@ def _replace_submodules(
class _SinusoidalPosEmb(nn.Module):
def __init__(self, dim):
"""1D sinusoidal positional embeddings as in Attention is All You Need."""
def __init__(self, dim: int):
super().__init__()
self.dim = dim
def forward(self, x):
def forward(self, x: Tensor) -> Tensor:
device = x.device
half_dim = self.dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
emb = x[:, None] * emb[None, :]
emb = x.unsqueeze(-1) * emb.unsqueeze(0)
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
return emb
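A quick, illustrative shape check of this embedding for a batch of diffusion timesteps:
# Illustrative: a batch of 4 scalar timesteps maps to a (4, dim) embedding.
pos_emb = _SinusoidalPosEmb(dim=128)
timesteps = torch.tensor([0, 10, 100, 999])
print(pos_emb(timesteps).shape)  # torch.Size([4, 128])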
@@ -549,92 +467,46 @@ class _Conv1dBlock(nn.Module):
class _ConditionalUnet1D(nn.Module):
"""A 1D convolutional UNet with FiLM modulation for conditioning.
Two types of conditioning can be applied:
- Global: Conditioning information that is aggregated over the whole observation window. This is
incorporated via the FiLM technique in the residual convolution blocks of the Unet's encoder/decoder.
- Local: Conditioning information for each timestep in the observation window. This is incorporated
by encoding the information via 1D convolutions and adding the resulting embeddings to the inputs and
outputs of the Unet's encoder/decoder.
Note: this removes local conditioning as compared to the original diffusion policy code.
"""
def __init__(
self,
input_dim: int,
local_cond_dim: int | None = None,
global_cond_dim: int | None = None,
diffusion_step_embed_dim: int = 256,
down_dims: int | None = None,
kernel_size: int = 3,
n_groups: int = 8,
film_scale_modulation: bool = False,
):
def __init__(self, cfg: DiffusionConfig, global_cond_dim: int):
super().__init__()
if down_dims is None:
down_dims = [256, 512, 1024]
self.cfg = cfg
# Encoder for the diffusion timestep.
self.diffusion_step_encoder = nn.Sequential(
_SinusoidalPosEmb(diffusion_step_embed_dim),
nn.Linear(diffusion_step_embed_dim, diffusion_step_embed_dim * 4),
_SinusoidalPosEmb(cfg.diffusion_step_embed_dim),
nn.Linear(cfg.diffusion_step_embed_dim, cfg.diffusion_step_embed_dim * 4),
nn.Mish(),
nn.Linear(diffusion_step_embed_dim * 4, diffusion_step_embed_dim),
nn.Linear(cfg.diffusion_step_embed_dim * 4, cfg.diffusion_step_embed_dim),
)
# The FiLM conditioning dimension.
cond_dim = diffusion_step_embed_dim
if global_cond_dim is not None:
cond_dim += global_cond_dim
self.local_cond_down_encoder = None
self.local_cond_up_encoder = None
if local_cond_dim is not None:
# Encoder for the local conditioning. The output gets added to the Unet encoder input.
self.local_cond_down_encoder = _ConditionalResidualBlock1D(
local_cond_dim,
down_dims[0],
cond_dim=cond_dim,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
)
# Encoder for the local conditioning. The output gets added to the Unet encoder output.
self.local_cond_up_encoder = _ConditionalResidualBlock1D(
local_cond_dim,
down_dims[0],
cond_dim=cond_dim,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
)
cond_dim = cfg.diffusion_step_embed_dim + global_cond_dim
# In channels / out channels for each downsampling block in the Unet's encoder. For the decoder, we
# just reverse these.
in_out = [(input_dim, down_dims[0])] + list(zip(down_dims[:-1], down_dims[1:], strict=True))
in_out = [(cfg.action_dim, cfg.down_dims[0])] + list(
zip(cfg.down_dims[:-1], cfg.down_dims[1:], strict=True)
)
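For example, with action_dim=2 (as in PushT) and the default down_dims=(256, 512, 1024), these pairs come out as follows (illustrative values only):
# Illustrative encoder (in, out) channel pairs; the decoder mirrors them in reverse.
action_dim, down_dims = 2, (256, 512, 1024)
in_out = [(action_dim, down_dims[0])] + list(zip(down_dims[:-1], down_dims[1:], strict=True))
print(in_out)  # [(2, 256), (256, 512), (512, 1024)]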
# Unet encoder.
common_res_block_kwargs = {
"cond_dim": cond_dim,
"kernel_size": cfg.kernel_size,
"n_groups": cfg.n_groups,
"use_film_scale_modulation": cfg.use_film_scale_modulation,
}
self.down_modules = nn.ModuleList([])
for ind, (dim_in, dim_out) in enumerate(in_out):
is_last = ind >= (len(in_out) - 1)
self.down_modules.append(
nn.ModuleList(
[
_ConditionalResidualBlock1D(
dim_in,
dim_out,
cond_dim=cond_dim,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
),
_ConditionalResidualBlock1D(
dim_out,
dim_out,
cond_dim=cond_dim,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
),
_ConditionalResidualBlock1D(dim_in, dim_out, **common_res_block_kwargs),
_ConditionalResidualBlock1D(dim_out, dim_out, **common_res_block_kwargs),
# Downsample as long as it is not the last block.
nn.Conv1d(dim_out, dim_out, 3, 2, 1) if not is_last else nn.Identity(),
]
@@ -644,22 +516,8 @@ class _ConditionalUnet1D(nn.Module):
# Processing in the middle of the auto-encoder.
self.mid_modules = nn.ModuleList(
[
_ConditionalResidualBlock1D(
down_dims[-1],
down_dims[-1],
cond_dim=cond_dim,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
),
_ConditionalResidualBlock1D(
down_dims[-1],
down_dims[-1],
cond_dim=cond_dim,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
),
_ConditionalResidualBlock1D(cfg.down_dims[-1], cfg.down_dims[-1], **common_res_block_kwargs),
_ConditionalResidualBlock1D(cfg.down_dims[-1], cfg.down_dims[-1], **common_res_block_kwargs),
]
)
@@ -670,22 +528,9 @@ class _ConditionalUnet1D(nn.Module):
self.up_modules.append(
nn.ModuleList(
[
_ConditionalResidualBlock1D(
dim_in * 2, # x2 as it takes the encoder's skip connection as well
dim_out,
cond_dim=cond_dim,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
),
_ConditionalResidualBlock1D(
dim_out,
dim_out,
cond_dim=cond_dim,
kernel_size=kernel_size,
n_groups=n_groups,
film_scale_modulation=film_scale_modulation,
),
# dim_in * 2, because it takes the encoder's skip connection as well
_ConditionalResidualBlock1D(dim_in * 2, dim_out, **common_res_block_kwargs),
_ConditionalResidualBlock1D(dim_out, dim_out, **common_res_block_kwargs),
# Upsample as long as it is not the last block.
nn.ConvTranspose1d(dim_out, dim_out, 4, 2, 1) if not is_last else nn.Identity(),
]
@@ -693,29 +538,22 @@ class _ConditionalUnet1D(nn.Module):
)
self.final_conv = nn.Sequential(
_Conv1dBlock(down_dims[0], down_dims[0], kernel_size=kernel_size),
nn.Conv1d(down_dims[0], input_dim, 1),
_Conv1dBlock(cfg.down_dims[0], cfg.down_dims[0], kernel_size=cfg.kernel_size),
nn.Conv1d(cfg.down_dims[0], cfg.action_dim, 1),
)
def forward(self, x: Tensor, timestep: Tensor | int, local_cond=None, global_cond=None) -> Tensor:
def forward(self, x: Tensor, timestep: Tensor | int, global_cond=None) -> Tensor:
"""
Args:
x: (B, T, input_dim) tensor for input to the Unet.
timestep: (B,) tensor of (timestep_we_are_denoising_from - 1).
local_cond: (B, T, local_cond_dim)
global_cond: (B, global_cond_dim)
output: (B, T, input_dim)
Returns:
(B, T, input_dim)
(B, T, input_dim) diffusion model prediction.
"""
# For 1D convolutions we'll need feature dimension first.
x = einops.rearrange(x, "b t d -> b d t")
if local_cond is not None:
if self.local_cond_down_encoder is None or self.local_cond_up_encoder is None:
raise ValueError(
"`local_cond` was provided but the relevant encoders weren't built at initialization."
)
local_cond = einops.rearrange(local_cond, "b t d -> b d t")
timesteps_embed = self.diffusion_step_encoder(timestep)
@@ -725,11 +563,10 @@ class _ConditionalUnet1D(nn.Module):
else:
global_feature = timesteps_embed
# Run encoder, keeping track of skip features to pass to the decoder.
encoder_skip_features: list[Tensor] = []
for idx, (resnet, resnet2, downsample) in enumerate(self.down_modules):
for resnet, resnet2, downsample in self.down_modules:
x = resnet(x, global_feature)
if idx == 0 and local_cond is not None:
x = x + self.local_cond_down_encoder(local_cond, global_feature)
x = resnet2(x, global_feature)
encoder_skip_features.append(x)
x = downsample(x)
@@ -737,14 +574,10 @@ class _ConditionalUnet1D(nn.Module):
for mid_module in self.mid_modules:
x = mid_module(x, global_feature)
for idx, (resnet, resnet2, upsample) in enumerate(self.up_modules):
# Run decoder, using the skip features from the encoder.
for resnet, resnet2, upsample in self.up_modules:
x = torch.cat((x, encoder_skip_features.pop()), dim=1)
x = resnet(x, global_feature)
# Note: The condition in the original implementation is:
# if idx == len(self.up_modules) and local_cond is not None:
# But as they mention in their comments, this is incorrect. We use the correct condition here.
if idx == (len(self.up_modules) - 1) and local_cond is not None:
x = x + self.local_cond_up_encoder(local_cond, global_feature)
x = resnet2(x, global_feature)
x = upsample(x)
@@ -766,17 +599,17 @@ class _ConditionalResidualBlock1D(nn.Module):
n_groups: int = 8,
# Set to True to do scale modulation with FiLM as well as bias modulation (defaults to False meaning
# FiLM just modulates bias).
film_scale_modulation: bool = False,
use_film_scale_modulation: bool = False,
):
super().__init__()
self.film_scale_modulation = film_scale_modulation
self.use_film_scale_modulation = use_film_scale_modulation
self.out_channels = out_channels
self.conv1 = _Conv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups)
# FiLM modulation (https://arxiv.org/abs/1709.07871) outputs per-channel bias and (maybe) scale.
cond_channels = out_channels * 2 if film_scale_modulation else out_channels
cond_channels = out_channels * 2 if use_film_scale_modulation else out_channels
self.cond_encoder = nn.Sequential(nn.Mish(), nn.Linear(cond_dim, cond_channels))
self.conv2 = _Conv1dBlock(out_channels, out_channels, kernel_size, n_groups=n_groups)
@@ -798,7 +631,7 @@ class _ConditionalResidualBlock1D(nn.Module):
# Get condition embedding. Unsqueeze for broadcasting to `out`, resulting in (B, out_channels, 1).
cond_embed = self.cond_encoder(cond).unsqueeze(-1)
if self.film_scale_modulation:
if self.use_film_scale_modulation:
# Treat the embedding as a list of scales and biases.
scale = cond_embed[:, : self.out_channels]
bias = cond_embed[:, self.out_channels :]
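The application of these FiLM parameters falls outside this hunk; per the FiLM paper referenced above, it amounts to a per-channel scale-and-shift of the first conv block's output (hedged sketch):
# Hedged sketch of how the FiLM parameters modulate `out` (the continuation is outside this hunk).
out = scale * out + bias
# With use_film_scale_modulation=False, cond_embed is a per-channel bias only: out = out + cond_embed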
@@ -817,9 +650,7 @@ class _EMA:
Exponential Moving Average of model weights.
"""
def __init__(
self, model, update_after_step=0, inv_gamma=1.0, power=2 / 3, min_value=0.0, max_value=0.9999
):
def __init__(self, cfg: DiffusionConfig, model: nn.Module):
"""
@crowsonkb's notes on EMA Warmup:
If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
@@ -829,18 +660,18 @@ class _EMA:
Args:
inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
power (float): Exponential factor of EMA warmup. Default: 2/3.
min_value (float): The minimum EMA decay rate. Default: 0.
min_alpha (float): The minimum EMA decay rate. Default: 0.
"""
self.averaged_model = model
self.averaged_model.eval()
self.averaged_model.requires_grad_(False)
self.update_after_step = update_after_step
self.inv_gamma = inv_gamma
self.power = power
self.min_value = min_value
self.max_value = max_value
self.update_after_step = cfg.ema_update_after_step
self.inv_gamma = cfg.ema_inv_gamma
self.power = cfg.ema_power
self.min_alpha = cfg.ema_min_alpha
self.max_alpha = cfg.ema_max_alpha
self.alpha = 0.0
self.optimization_step = 0
@@ -855,7 +686,7 @@ class _EMA:
if step <= 0:
return 0.0
return max(self.min_value, min(value, self.max_value))
return max(self.min_alpha, min(value, self.max_alpha))
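The decay value clamped here follows @crowsonkb's warmup schedule noted in the docstring, i.e. roughly value = 1 - (1 + step / inv_gamma) ** -power with step counted from update_after_step (a hedged reconstruction; the exact lines fall outside this hunk). Some illustrative values for the defaults inv_gamma=1.0, power=2/3:
# Illustrative: the warmup decay for inv_gamma=1.0, power=2/3 at a few optimization steps.
for step in (1, 100, 10_000):
    print(step, 1 - (1 + step / 1.0) ** (-2 / 3))  # ~0.37, ~0.95, ~0.998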
@torch.no_grad()
def step(self, new_model):