# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ncsnpp.py and ncsnpp_utils are taken from
# https://github.com/sp-uhh/sgmse/
import functools
import numpy as np
import torch
import torch.nn as nn
from espnet2.enh.layers.ncsnpp_utils import layers, layerspp, normalization
ResnetBlockDDPM = layerspp.ResnetBlockDDPMpp
ResnetBlockBigGAN = layerspp.ResnetBlockBigGANpp
Combine = layerspp.Combine
conv3x3 = layerspp.conv3x3
conv1x1 = layerspp.conv1x1
get_act = layers.get_act
get_normalization = normalization.get_normalization
default_initializer = layers.default_init
class NCSNpp(nn.Module):
"""NCSN++ model, adapted from https://github.com/yang-song/score_sde and
https://github.com/sp-uhh/sgmse repository
"""
def __init__(
self,
scale_by_sigma=True,
nonlinearity="swish",
nf=128,
ch_mult=(1, 1, 2, 2, 2, 2, 2),
num_res_blocks=2,
attn_resolutions=(16,),
resamp_with_conv=True,
conditional=True,
fir=True,
        fir_kernel=(1, 3, 3, 1),
skip_rescale=True,
resblock_type="biggan",
progressive="output_skip",
progressive_input="input_skip",
progressive_combine="sum",
init_scale=0.0,
fourier_scale=16,
image_size=256,
embedding_type="fourier",
dropout=0.0,
centered=True,
**unused_kwargs,
):
super().__init__()
        self.act = act = get_act(nonlinearity)
        self.nf = nf
        self.num_res_blocks = num_res_blocks
        self.attn_resolutions = attn_resolutions
        self.num_resolutions = num_resolutions = len(ch_mult)
        self.all_resolutions = all_resolutions = [
            image_size // (2**i) for i in range(num_resolutions)
        ]
        self.conditional = conditional  # noise-conditional
        self.centered = centered
        self.scale_by_sigma = scale_by_sigma
        self.skip_rescale = skip_rescale
        self.resblock_type = resblock_type = resblock_type.lower()
        self.progressive = progressive = progressive.lower()
        self.progressive_input = progressive_input = progressive_input.lower()
        self.embedding_type = embedding_type = embedding_type.lower()
assert progressive in ["none", "output_skip", "residual"]
assert progressive_input in ["none", "input_skip", "residual"]
assert embedding_type in ["fourier", "positional"]
combine_method = progressive_combine.lower()
combiner = functools.partial(Combine, method=combine_method)
num_channels = 4 # x.real, x.imag, y.real, y.imag
self.output_layer = nn.Conv2d(num_channels, 2, 1)
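        # Final 1x1 conv maps the 4 real channels back to 2 (real, imag),
        # which forward() reinterprets as a single complex channel.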
modules = []
# timestep/noise_level embedding
if embedding_type == "fourier":
# Gaussian Fourier features embeddings.
modules.append(
layerspp.GaussianFourierProjection(
embedding_size=nf, scale=fourier_scale
)
)
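            # The projection maps each log-noise-level t to
            # [sin(2*pi*W*t), cos(2*pi*W*t)] with fixed random frequencies W,
            # hence the 2 * nf embedding width below.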
embed_dim = 2 * nf
elif embedding_type == "positional":
embed_dim = nf
else:
raise ValueError(f"embedding type {embedding_type} unknown.")
if conditional:
modules.append(nn.Linear(embed_dim, nf * 4))
modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
nn.init.zeros_(modules[-1].bias)
modules.append(nn.Linear(nf * 4, nf * 4))
modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
nn.init.zeros_(modules[-1].bias)
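            # Together, the two linear layers above form the conditioning MLP:
            # the raw embedding is projected to width nf * 4, matching the
            # temb_dim of the ResNet blocks below.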
AttnBlock = functools.partial(
layerspp.AttnBlockpp, init_scale=init_scale, skip_rescale=skip_rescale
)
Upsample = functools.partial(
layerspp.Upsample,
with_conv=resamp_with_conv,
fir=fir,
fir_kernel=fir_kernel,
)
if progressive == "output_skip":
self.pyramid_upsample = layerspp.Upsample(
fir=fir, fir_kernel=fir_kernel, with_conv=False
)
elif progressive == "residual":
pyramid_upsample = functools.partial(
layerspp.Upsample, fir=fir, fir_kernel=fir_kernel, with_conv=True
)
Downsample = functools.partial(
layerspp.Downsample,
with_conv=resamp_with_conv,
fir=fir,
fir_kernel=fir_kernel,
)
if progressive_input == "input_skip":
self.pyramid_downsample = layerspp.Downsample(
fir=fir, fir_kernel=fir_kernel, with_conv=False
)
elif progressive_input == "residual":
pyramid_downsample = functools.partial(
layerspp.Downsample, fir=fir, fir_kernel=fir_kernel, with_conv=True
)
if resblock_type == "ddpm":
ResnetBlock = functools.partial(
ResnetBlockDDPM,
act=act,
dropout=dropout,
init_scale=init_scale,
skip_rescale=skip_rescale,
temb_dim=nf * 4,
)
elif resblock_type == "biggan":
ResnetBlock = functools.partial(
ResnetBlockBigGAN,
act=act,
dropout=dropout,
fir=fir,
fir_kernel=fir_kernel,
init_scale=init_scale,
skip_rescale=skip_rescale,
temb_dim=nf * 4,
)
else:
raise ValueError(f"resblock type {resblock_type} unrecognized.")
# Downsampling block
channels = num_channels
if progressive_input != "none":
input_pyramid_ch = channels
modules.append(conv3x3(channels, nf))
hs_c = [nf]
in_ch = nf
for i_level in range(num_resolutions):
# Residual blocks for this resolution
for i_block in range(num_res_blocks):
out_ch = nf * ch_mult[i_level]
modules.append(ResnetBlock(in_ch=in_ch, out_ch=out_ch))
in_ch = out_ch
if all_resolutions[i_level] in attn_resolutions:
modules.append(AttnBlock(channels=in_ch))
hs_c.append(in_ch)
if i_level != num_resolutions - 1:
if resblock_type == "ddpm":
modules.append(Downsample(in_ch=in_ch))
else:
modules.append(ResnetBlock(down=True, in_ch=in_ch))
if progressive_input == "input_skip":
modules.append(combiner(dim1=input_pyramid_ch, dim2=in_ch))
if combine_method == "cat":
in_ch *= 2
elif progressive_input == "residual":
modules.append(
pyramid_downsample(in_ch=input_pyramid_ch, out_ch=in_ch)
)
input_pyramid_ch = in_ch
hs_c.append(in_ch)
in_ch = hs_c[-1]
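        # Bottleneck at the lowest resolution: ResNet -> attention -> ResNet.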
modules.append(ResnetBlock(in_ch=in_ch))
modules.append(AttnBlock(channels=in_ch))
modules.append(ResnetBlock(in_ch=in_ch))
pyramid_ch = 0
# Upsampling block
for i_level in reversed(range(num_resolutions)):
for i_block in range(num_res_blocks + 1):
# +1 blocks in upsampling because of skip connection from
# combiner (after downsampling)
out_ch = nf * ch_mult[i_level]
modules.append(ResnetBlock(in_ch=in_ch + hs_c.pop(), out_ch=out_ch))
in_ch = out_ch
if all_resolutions[i_level] in attn_resolutions:
modules.append(AttnBlock(channels=in_ch))
if progressive != "none":
if i_level == num_resolutions - 1:
if progressive == "output_skip":
modules.append(
nn.GroupNorm(
num_groups=min(in_ch // 4, 32),
num_channels=in_ch,
eps=1e-6,
)
)
modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
pyramid_ch = channels
elif progressive == "residual":
modules.append(
nn.GroupNorm(
num_groups=min(in_ch // 4, 32),
num_channels=in_ch,
eps=1e-6,
)
)
modules.append(conv3x3(in_ch, in_ch, bias=True))
pyramid_ch = in_ch
else:
raise ValueError(f"{progressive} is not a valid name.")
else:
if progressive == "output_skip":
modules.append(
nn.GroupNorm(
num_groups=min(in_ch // 4, 32),
num_channels=in_ch,
eps=1e-6,
)
)
modules.append(
conv3x3(in_ch, channels, bias=True, init_scale=init_scale)
)
pyramid_ch = channels
elif progressive == "residual":
modules.append(pyramid_upsample(in_ch=pyramid_ch, out_ch=in_ch))
pyramid_ch = in_ch
else:
raise ValueError(f"{progressive} is not a valid name")
if i_level != 0:
if resblock_type == "ddpm":
modules.append(Upsample(in_ch=in_ch))
else:
modules.append(ResnetBlock(in_ch=in_ch, up=True))
assert not hs_c
if progressive != "output_skip":
modules.append(
nn.GroupNorm(
num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6
)
)
modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
self.all_modules = nn.ModuleList(modules)
    def pad_spec(self, Y):
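        """Zero-pad dim 3 (time) up to a multiple of 64.

        With the default 7-level configuration, the U-Net halves the time
        axis 6 times, so T must be divisible by 2**6 == 64.
        """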
T = Y.size(3)
if T % 64 != 0:
num_pad = 64 - T % 64
else:
num_pad = 0
pad2d = torch.nn.ZeroPad2d((0, num_pad, 0, 0))
return pad2d(Y)
    def forward(self, x, time_cond):
# timestep/noise_level embedding; only for continuous training
ori_T = x.shape[-1]
x = self.pad_spec(x)
modules = self.all_modules
m_idx = 0
# Convert real and imaginary parts of (x,y) into four channel dimensions
x = torch.cat(
(
x[:, [0], :, :].real,
x[:, [0], :, :].imag,
x[:, [1], :, :].real,
x[:, [1], :, :].imag,
),
dim=1,
)
if self.embedding_type == "fourier":
# Gaussian Fourier features embeddings.
used_sigmas = time_cond
temb = modules[m_idx](torch.log(used_sigmas))
m_idx += 1
else:
raise ValueError(f"embedding type {self.embedding_type} unknown.")
if self.conditional:
temb = modules[m_idx](temb)
m_idx += 1
temb = modules[m_idx](self.act(temb))
m_idx += 1
else:
temb = None
if not self.centered:
# If input data is in [0, 1]
x = 2 * x - 1.0
# Downsampling block
input_pyramid = None
if self.progressive_input != "none":
input_pyramid = x
# Input layer: Conv2d: 4ch -> 128ch
hs = [modules[m_idx](x)]
m_idx += 1
# Down path in U-Net
for i_level in range(self.num_resolutions):
# Residual blocks for this resolution
for i_block in range(self.num_res_blocks):
h = modules[m_idx](hs[-1], temb)
m_idx += 1
# Attention layer (optional)
if (
h.shape[-2] in self.attn_resolutions
): # edit: check H dim (-2) not W dim (-1)
h = modules[m_idx](h)
m_idx += 1
hs.append(h)
# Downsampling
if i_level != self.num_resolutions - 1:
if self.resblock_type == "ddpm":
h = modules[m_idx](hs[-1])
m_idx += 1
else:
h = modules[m_idx](hs[-1], temb)
m_idx += 1
if self.progressive_input == "input_skip": # Combine h with x
input_pyramid = self.pyramid_downsample(input_pyramid)
h = modules[m_idx](input_pyramid, h)
m_idx += 1
elif self.progressive_input == "residual":
input_pyramid = modules[m_idx](input_pyramid)
m_idx += 1
if self.skip_rescale:
input_pyramid = (input_pyramid + h) / np.sqrt(2.0)
else:
input_pyramid = input_pyramid + h
h = input_pyramid
hs.append(h)
        h = hs[-1]  # effectively h = h, since h was just appended to hs
h = modules[m_idx](h, temb) # ResNet block
m_idx += 1
h = modules[m_idx](h) # Attention block
m_idx += 1
h = modules[m_idx](h, temb) # ResNet block
m_idx += 1
pyramid = None
# Upsampling block
for i_level in reversed(range(self.num_resolutions)):
for i_block in range(self.num_res_blocks + 1):
h = modules[m_idx](torch.cat([h, hs.pop()], dim=1), temb)
m_idx += 1
            # edit: check H dim (-2), not W dim (-1), as in the down path
if h.shape[-2] in self.attn_resolutions:
h = modules[m_idx](h)
m_idx += 1
if self.progressive != "none":
if i_level == self.num_resolutions - 1:
if self.progressive == "output_skip":
pyramid = self.act(modules[m_idx](h)) # GroupNorm
m_idx += 1
pyramid = modules[m_idx](pyramid) # Conv2D: 256 -> 4
m_idx += 1
elif self.progressive == "residual":
pyramid = self.act(modules[m_idx](h))
m_idx += 1
pyramid = modules[m_idx](pyramid)
m_idx += 1
else:
raise ValueError(f"{self.progressive} is not a valid name.")
else:
if self.progressive == "output_skip":
pyramid = self.pyramid_upsample(pyramid) # Upsample
pyramid_h = self.act(modules[m_idx](h)) # GroupNorm
m_idx += 1
pyramid_h = modules[m_idx](pyramid_h)
m_idx += 1
pyramid = pyramid + pyramid_h
elif self.progressive == "residual":
pyramid = modules[m_idx](pyramid)
m_idx += 1
if self.skip_rescale:
pyramid = (pyramid + h) / np.sqrt(2.0)
else:
pyramid = pyramid + h
h = pyramid
else:
raise ValueError(f"{self.progressive} is not a valid name")
# Upsampling Layer
if i_level != 0:
if self.resblock_type == "ddpm":
h = modules[m_idx](h)
m_idx += 1
else:
                    h = modules[m_idx](h, temb)  # Upsampling
m_idx += 1
assert not hs
if self.progressive == "output_skip":
h = pyramid
else:
h = self.act(modules[m_idx](h))
m_idx += 1
h = modules[m_idx](h)
m_idx += 1
assert m_idx == len(modules), "Implementation error"
if self.scale_by_sigma:
used_sigmas = used_sigmas.reshape((x.shape[0], *([1] * len(x.shape[1:]))))
h = h / used_sigmas
# Convert back to complex number
h = self.output_layer(h)
h = torch.permute(h, (0, 2, 3, 1)).contiguous()
h = torch.view_as_complex(h)[:, None, :, :]
h = h[:, :, :, :ori_T]
return h
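

if __name__ == "__main__":
    # Minimal smoke test: a sketch, not part of the original source; the
    # batch size, spectrogram shape, and sigma value are illustrative
    # assumptions. F must equal image_size so the attention bookkeeping in
    # forward() lines up; T is padded internally to a multiple of 64.
    model = NCSNpp(image_size=256)
    spec = torch.randn(1, 2, 256, 300, dtype=torch.complex64)  # (B, 2, F, T)
    sigmas = torch.full((1,), 0.5)  # positive noise levels; log is taken inside
    with torch.no_grad():
        score = model(spec, sigmas)
    print(score.shape, score.dtype)  # torch.Size([1, 1, 256, 300]) torch.complex64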