Spaces:
Paused
Paused
File size: 5,285 Bytes
d3c3894 5b1ae50 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import torch
import torch.nn as nn
import torch.nn.functional as F
from videoretalking.models.base_blocks import ResBlock, StyleConv, ToRGB
class ENet(nn.Module):
    """Enhancement network stacked on top of a frozen low-resolution network.

    A style code is extracted from the reference frames by a convolutional
    encoder. The wrapped low-resolution network (``lnet``) produces an image
    (and optionally a feature map) from the audio and the 96x96 input frames.
    That output is then passed through two stages of style-modulated
    convolutions with a ``ToRGB`` skip path to produce the final image.

    Args:
        num_style_feat: Size of the style vector fed to every ``StyleConv``
            and ``ToRGB`` layer.
        lnet: The low-resolution network. It is called as
            ``lnet(audio, frames)`` and its parameters are frozen here.
            Required, despite the ``None`` default kept for signature
            compatibility.
        concat: When true, ``lnet`` is expected to return
            ``(image, features)`` and both are concatenated along the channel
            axis before the style convolutions (3 + 32 input channels).

    Raises:
        ValueError: If ``lnet`` is ``None``.
    """

    def __init__(
        self,
        num_style_feat=512,
        lnet=None,
        concat=False
    ):
        super().__init__()

        # The original dereferenced ``lnet`` unconditionally; fail with a
        # clear message instead of an AttributeError on NoneType.
        if lnet is None:
            raise ValueError(
                "ENet requires a low-resolution network: pass it as `lnet`.")

        self.low_res = lnet
        # The low-resolution network is used as a fixed feature extractor.
        for param in self.low_res.parameters():
            param.requires_grad = False

        channel_multiplier, narrow = 2, 1
        # Channel width per resolution key.
        channels = {
            '4': int(512 * narrow),
            '8': int(512 * narrow),
            '16': int(512 * narrow),
            '32': int(512 * narrow),
            '64': int(256 * channel_multiplier * narrow),
            '128': int(128 * channel_multiplier * narrow),
            '256': int(64 * channel_multiplier * narrow),
            '512': int(32 * channel_multiplier * narrow),
            '1024': int(16 * channel_multiplier * narrow)
        }

        self.log_size = 8
        first_out_size = 128
        # 1x1 convolution lifting RGB to the encoder width.
        # NOTE(review): the original comment read "256 -> 128"; a 1x1 conv
        # does not change spatial size, so that likely refers to the channel
        # table key rather than the resolution -- confirm.
        self.conv_body_first = nn.Conv2d(3, channels[f'{first_out_size}'], 1)

        # Downsampling encoder: six ResBlock(mode='down') stages.
        in_channels = channels[f'{first_out_size}']
        self.conv_body_down = nn.ModuleList()
        for i in range(8, 2, -1):
            out_channels = channels[f'{2**(i - 1)}']
            self.conv_body_down.append(
                ResBlock(in_channels, out_channels, mode='down'))
            in_channels = out_channels

        self.num_style_feat = num_style_feat
        linear_out_channel = num_style_feat
        # The linear layer expects a flattened 4x4 map with channels['4']
        # channels coming out of ``final_conv``.
        self.final_linear = nn.Linear(channels['4'] * 4 * 4, linear_out_channel)
        self.final_conv = nn.Conv2d(in_channels, channels['4'], 3, 1, 1)

        self.style_convs = nn.ModuleList()
        self.to_rgbs = nn.ModuleList()
        # Unused in this class; kept so the module tree is unchanged.
        self.noises = nn.Module()

        self.concat = concat
        if concat:
            in_channels = 3 + 32  # channels['64']
        else:
            in_channels = 3

        # Two decoder stages (channel table keys '128' and '256'), each made
        # of an upsampling StyleConv, a plain StyleConv and a ToRGB layer.
        for i in range(7, 9):
            out_channels = channels[f'{2**i}']
            self.style_convs.append(
                StyleConv(
                    in_channels,
                    out_channels,
                    kernel_size=3,
                    num_style_feat=num_style_feat,
                    demodulate=True,
                    sample_mode='upsample'))
            self.style_convs.append(
                StyleConv(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    num_style_feat=num_style_feat,
                    demodulate=True,
                    sample_mode=None))
            self.to_rgbs.append(
                ToRGB(out_channels, num_style_feat, upsample=True))
            in_channels = out_channels

    @staticmethod
    def _frames_to_batch(tensor, dim):
        """Concatenate the slices of ``tensor`` along ``dim`` on the batch axis."""
        return torch.cat(torch.unbind(tensor, dim=dim), dim=0)

    def _encode_style(self, ref):
        """Encode reference frames into a ``(N, 1, num_style_feat)`` style code."""
        ref = F.interpolate(ref, size=(256, 256), mode='bilinear')
        feat = F.leaky_relu_(self.conv_body_first(ref), negative_slope=0.2)
        # __init__ builds exactly ``log_size - 2`` down blocks.
        for down_block in self.conv_body_down:
            feat = down_block(feat)
        feat = F.leaky_relu_(self.final_conv(feat), negative_slope=0.2)

        style_code = self.final_linear(feat.reshape(feat.size(0), -1))
        return style_code.reshape(style_code.size(0), -1, self.num_style_feat)

    def forward(self, audio_sequences, face_sequences, gt_sequences):
        """Run the low-resolution network and enhance its output.

        Args:
            audio_sequences: Audio input forwarded to the low-resolution
                network. When ``face_sequences`` has more than four
                dimensions, axis 1 is treated as the frame axis.
            face_sequences: Tensor whose channel axis (dim 1) splits into two
                3-channel groups: the input frames and the reference frames.
                With more than four dimensions, axis 2 is the frame axis.
            gt_sequences: Frames concatenated with the input frames (channel
                axis) to form the low-resolution network input; same layout
                as ``face_sequences``.

        Returns:
            ``(outputs, low_res_img)``. For sequence input both are stacked
            back along axis 2 and ``low_res_img`` is resized to the spatial
            size of ``outputs``; otherwise ``low_res_img`` is returned at the
            low-resolution network's own output size.
        """
        batch_size = audio_sequences.size(0)
        is_sequence = face_sequences.dim() > 4
        inp, ref = torch.split(face_sequences, 3, dim=1)

        if is_sequence:
            # Fold the frame axis into the batch axis so that every frame is
            # processed as an independent sample.
            audio_sequences = self._frames_to_batch(audio_sequences, dim=1)
            inp = self._frames_to_batch(inp, dim=2)
            ref = self._frames_to_batch(ref, dim=2)
            gt_sequences = self._frames_to_batch(gt_sequences, dim=2)

        # Global style from the reference frames.
        style_code = self._encode_style(ref)

        lnet_input = torch.cat([inp, gt_sequences], dim=1)
        lnet_input = F.interpolate(lnet_input, size=(96, 96), mode='bilinear')

        # ``Tensor.detach()`` is not in-place: its result must be kept,
        # otherwise gradients still flow back through the low-res network.
        if self.concat:
            low_res_img, low_res_feat = self.low_res(audio_sequences, lnet_input)
            low_res_img = low_res_img.detach()
            low_res_feat = low_res_feat.detach()
            out = torch.cat([low_res_img, low_res_feat], dim=1)
        else:
            low_res_img = self.low_res(audio_sequences, lnet_input).detach()
            out = low_res_img

        # Reflect-pad by 2 pixels per side before the style convolutions.
        out = F.pad(out, (2, 2, 2, 2), "reflect", 0)
        skip = out
        conv_pairs = zip(
            self.style_convs[::2], self.style_convs[1::2], self.to_rgbs)
        for conv1, conv2, to_rgb in conv_pairs:
            out = conv1(out, style_code)  # 96, 192, 384
            out = conv2(out, style_code)
            skip = to_rgb(out, style_code, skip)

        # Remove the padding. Presumably 8 = 2 px of padding scaled by the
        # two 2x upsampling stages -- TODO confirm against StyleConv/ToRGB.
        outputs = skip[:, :, 8:-8, 8:-8]

        if is_sequence:
            # Unfold the batch axis back into (batch, channels, frames, H, W).
            outputs = torch.stack(
                torch.split(outputs, batch_size, dim=0), dim=2)
            low_res_img = F.interpolate(low_res_img, outputs.size()[3:])
            low_res_img = torch.stack(
                torch.split(low_res_img, batch_size, dim=0), dim=2)

        return outputs, low_res_img