"""Feature loss based on 19 layer VGG network. |
|
|
|
|
|
The network layers in the feature loss is weighted as described in |
|
'Stereo Magnification: Learning View Synthesis using Multiplane Images', |
|
Tinghui Zhou, Richard Tucker, Flynn, Graham Fyffe, Noah Snavely, SIGGRAPH 2018. |
|
""" |
|
|
|

from typing import Dict, Optional, Sequence, Tuple

import numpy as np
import scipy.io as sio
import tensorflow.compat.v1 as tf


def _build_net(
    layer_type: str,
    input_tensor: tf.Tensor,
    weight_bias: Optional[Tuple[tf.Tensor, tf.Tensor]] = None,
    name: Optional[str] = None,
) -> tf.Tensor:
    """Builds a layer of the VGG network.

    Args:
        layer_type: A string, the type of this layer: "conv" or "pool".
        input_tensor: A tensor, the input to the layer.
        weight_bias: A tuple of weight and bias tensors; required when
            layer_type is "conv".
        name: A string, the name of this layer.

    Returns:
        The output tensor of the layer.

    Raises:
        ValueError: If layer_type is neither "conv" nor "pool".
    """
    if layer_type == "conv":
        return tf.nn.relu(
            tf.nn.conv2d(
                input_tensor,
                weight_bias[0],
                strides=[1, 1, 1, 1],
                padding="SAME",
                name=name,
            )
            + weight_bias[1]
        )
    elif layer_type == "pool":
        return tf.nn.avg_pool(
            input_tensor, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME"
        )
    else:
        raise ValueError("Unsupported layer %s" % layer_type)


def _get_weight_and_bias(
    vgg_layers: np.ndarray, index: int
) -> Tuple[tf.Tensor, tf.Tensor]:
    """Gets the weight and bias of a specific layer from the VGG pretrained model.

    Args:
        vgg_layers: An array, the layers of the VGG pretrained model as loaded
            from the MatConvNet .mat file.
        index: An integer, the index of the layer.

    Returns:
        weights: A tensor, the filter weights of the layer.
        bias: A tensor, the biases of the layer, reshaped to a vector.
    """
    # scipy.io.loadmat returns each MatConvNet layer as a deeply nested struct
    # array; [0][0][2][0][0] holds the convolution kernel and [0][0][2][0][1]
    # holds the bias.
    weights = vgg_layers[index][0][0][2][0][0]
    weights = tf.constant(weights)
    bias = vgg_layers[index][0][0][2][0][1]
    bias = tf.constant(np.reshape(bias, (bias.size,)))

    return weights, bias


def _build_vgg19(image: tf.Tensor, model_filepath: str) -> Dict[str, tf.Tensor]:
    """Builds the VGG network given the model weights.

    The weights are loaded from disk only the first time this function is
    invoked; subsequent calls reuse them.

    Args:
        image: A tensor, the input image.
        model_filepath: A string, the path to the VGG pretrained model.

    Returns:
        net: A dict mapping a layer name to a tensor.
    """
    with tf.variable_scope("vgg", reuse=True):
        net = {}
        # Cache the parsed .mat file on the function object so that building
        # the graph for several images loads the weights only once.
        if not hasattr(_build_vgg19, "vgg_rawnet"):
            with tf.io.gfile.GFile(model_filepath, "rb") as f:
                _build_vgg19.vgg_rawnet = sio.loadmat(f)
        vgg_layers = _build_vgg19.vgg_rawnet["layers"][0]
        imagenet_mean = tf.constant([123.6800, 116.7790, 103.9390], shape=[1, 1, 1, 3])
        net["input"] = image - imagenet_mean
        net["conv1_1"] = _build_net(
            "conv",
            net["input"],
            _get_weight_and_bias(vgg_layers, 0),
            name="vgg_conv1_1",
        )
        net["conv1_2"] = _build_net(
            "conv",
            net["conv1_1"],
            _get_weight_and_bias(vgg_layers, 2),
            name="vgg_conv1_2",
        )
        net["pool1"] = _build_net("pool", net["conv1_2"])
        net["conv2_1"] = _build_net(
            "conv",
            net["pool1"],
            _get_weight_and_bias(vgg_layers, 5),
            name="vgg_conv2_1",
        )
        net["conv2_2"] = _build_net(
            "conv",
            net["conv2_1"],
            _get_weight_and_bias(vgg_layers, 7),
            name="vgg_conv2_2",
        )
        net["pool2"] = _build_net("pool", net["conv2_2"])
        net["conv3_1"] = _build_net(
            "conv",
            net["pool2"],
            _get_weight_and_bias(vgg_layers, 10),
            name="vgg_conv3_1",
        )
        net["conv3_2"] = _build_net(
            "conv",
            net["conv3_1"],
            _get_weight_and_bias(vgg_layers, 12),
            name="vgg_conv3_2",
        )
        net["conv3_3"] = _build_net(
            "conv",
            net["conv3_2"],
            _get_weight_and_bias(vgg_layers, 14),
            name="vgg_conv3_3",
        )
        net["conv3_4"] = _build_net(
            "conv",
            net["conv3_3"],
            _get_weight_and_bias(vgg_layers, 16),
            name="vgg_conv3_4",
        )
        net["pool3"] = _build_net("pool", net["conv3_4"])
        net["conv4_1"] = _build_net(
            "conv",
            net["pool3"],
            _get_weight_and_bias(vgg_layers, 19),
            name="vgg_conv4_1",
        )
        net["conv4_2"] = _build_net(
            "conv",
            net["conv4_1"],
            _get_weight_and_bias(vgg_layers, 21),
            name="vgg_conv4_2",
        )
        net["conv4_3"] = _build_net(
            "conv",
            net["conv4_2"],
            _get_weight_and_bias(vgg_layers, 23),
            name="vgg_conv4_3",
        )
        net["conv4_4"] = _build_net(
            "conv",
            net["conv4_3"],
            _get_weight_and_bias(vgg_layers, 25),
            name="vgg_conv4_4",
        )
        net["pool4"] = _build_net("pool", net["conv4_4"])
        net["conv5_1"] = _build_net(
            "conv",
            net["pool4"],
            _get_weight_and_bias(vgg_layers, 28),
            name="vgg_conv5_1",
        )
        net["conv5_2"] = _build_net(
            "conv",
            net["conv5_1"],
            _get_weight_and_bias(vgg_layers, 30),
            name="vgg_conv5_2",
        )

    return net
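
# Note: each call to _build_vgg19 adds a fresh copy of the VGG graph for its
# input, but all copies reuse the raw weights parsed once from the .mat file.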


def _compute_error(
    fake: tf.Tensor, real: tf.Tensor, mask: Optional[tf.Tensor] = None
) -> tf.Tensor:
    """Computes the mean absolute (L1) error, optionally reweighted by the mask."""
    if mask is None:
        return tf.reduce_mean(tf.abs(fake - real))
    else:
        # Resize the mask to the spatial resolution of the feature maps.
        size = (tf.shape(fake)[1], tf.shape(fake)[2])
        resized_mask = tf.image.resize(
            mask, size, method=tf.image.ResizeMethod.BILINEAR
        )
        return tf.reduce_mean(tf.abs(fake - real) * resized_mask)


def vgg_loss(
    image: tf.Tensor,
    reference: tf.Tensor,
    vgg_model_file: str,
    weights: Optional[Sequence[float]] = None,
    mask: Optional[tf.Tensor] = None,
) -> tf.Tensor:
    """Computes the VGG loss for an image pair.

    The VGG loss is the average feature vector difference between the two
    images.

    The input images must be in the [0, 1] range in (B, H, W, 3) RGB format,
    and the recommendation seems to be to have them in gamma space.

    The pretrained weights are publicly available at
    http://www.vlfeat.org/matconvnet/models/imagenet-vgg-verydeep-19.mat

    Args:
        image: A tensor, typically the prediction from a network.
        reference: A tensor, the image to compare against, i.e. the golden
            image.
        vgg_model_file: A string, the filename of the VGG 19 network weights in
            MATLAB format.
        weights: A list of floats, optional weights for the layers. The defaults
            are from Qifeng Chen and Vladlen Koltun, "Photographic image
            synthesis with cascaded refinement networks," ICCV 2017.
        mask: An optional image-shaped, single-channel tensor; the mask values
            are per-pixel weights applied to the losses. The mask is resized to
            the same spatial resolution as the feature maps before being
            applied. Even where the mask value is zero, pixels near the boundary
            of the mask can still influence the loss if they fall into the
            receptive field of the VGG convolutional layers.

    Returns:
        vgg_loss: The linear combination of losses from five VGG layers.
    """
    if not weights:
        weights = [1.0 / 2.6, 1.0 / 4.8, 1.0 / 3.7, 1.0 / 5.6, 10.0 / 1.5]

    # The VGG network expects pixel intensities in [0, 255].
    vgg_ref = _build_vgg19(reference * 255.0, vgg_model_file)
    vgg_img = _build_vgg19(image * 255.0, vgg_model_file)
    p1 = _compute_error(vgg_ref["conv1_2"], vgg_img["conv1_2"], mask) * weights[0]
    p2 = _compute_error(vgg_ref["conv2_2"], vgg_img["conv2_2"], mask) * weights[1]
    p3 = _compute_error(vgg_ref["conv3_2"], vgg_img["conv3_2"], mask) * weights[2]
    p4 = _compute_error(vgg_ref["conv4_2"], vgg_img["conv4_2"], mask) * weights[3]
    p5 = _compute_error(vgg_ref["conv5_2"], vgg_img["conv5_2"], mask) * weights[4]

    final_loss = p1 + p2 + p3 + p4 + p5
    # Rescale, since the VGG features were computed from [0, 255] inputs.
    final_loss /= 255.0

    return final_loss
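
# A minimal usage sketch (illustration only; `prediction` and `golden` stand
# for float32 tensors of shape (B, H, W, 3) in [0, 1], and the weights path
# below is hypothetical):
#
#   loss = vgg_loss(
#       prediction, golden, vgg_model_file="/tmp/imagenet-vgg-verydeep-19.mat")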


def _compute_gram_matrix(
    input_features: tf.Tensor, mask: Optional[tf.Tensor]
) -> tf.Tensor:
    """Computes the Gram matrix of `input_features`.

    The Gram matrix is described in
    https://en.wikipedia.org/wiki/Gramian_matrix.

    Args:
        input_features: A tf.Tensor of shape (B, H, W, C) representing a feature
            map obtained by a convolutional layer of a VGG network.
        mask: An optional tf.Tensor of shape (B, H, W, 1) representing the
            per-pixel weights applied to `input_features`. The mask is resized
            to the same spatial resolution as `input_features`. Even where the
            mask value is zero, pixels near the boundary of the mask can still
            influence the loss if they fall into the receptive field of the VGG
            convolutional layers.

    Returns:
        A tf.Tensor of shape (B, C, C) representing the Gram matrix of the
        masked `input_features`.
    """
    _, h, w, c = (
        tf.shape(input_features)[0],
        tf.shape(input_features)[1],
        tf.shape(input_features)[2],
        tf.shape(input_features)[3],
    )

    if mask is None:
        reshaped_features = tf.reshape(input_features, (-1, h * w, c))
    else:
        # Resize the mask to the spatial resolution of the features before
        # weighting them.
        resized_mask = tf.image.resize(
            mask, (h, w), method=tf.image.ResizeMethod.BILINEAR
        )
        reshaped_features = tf.reshape(input_features * resized_mask, (-1, h * w, c))
    # (B, C, H*W) x (B, H*W, C) -> (B, C, C), normalized by the pixel count.
    return tf.matmul(reshaped_features, reshaped_features, transpose_a=True) / tf.cast(
        tf.multiply(h, w), tf.float32
    )
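
# Shape example (illustration only): for a 256x256 input, the conv2_2 feature
# map has shape (B, 128, 128, 128); it is reshaped to (B, 16384, 128), and the
# matmul with transpose_a=True contracts the 16384 axis, producing a
# (B, 128, 128) Gram matrix normalized by 16384.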


def style_loss(
    image: tf.Tensor,
    reference: tf.Tensor,
    vgg_model_file: str,
    weights: Optional[Sequence[float]] = None,
    mask: Optional[tf.Tensor] = None,
) -> tf.Tensor:
    """Computes the style loss as used in `A Neural Algorithm of Artistic Style`.

    Based on the work in https://github.com/cysmith/neural-style-tf. The weights
    are first initialized to the inverse of the number of elements in each VGG
    layer considered. After 1.5M iterations, they are rescaled to normalize the
    contribution of the style loss to be equal to the other losses (L1/VGG).
    This is based on the works on image inpainting
    (https://arxiv.org/abs/1804.07723) and frame prediction
    (https://arxiv.org/abs/1811.00684).

    The style loss is the average Gram matrix difference between `image` and
    `reference`. The Gram matrix is the inner product of a feature map of shape
    (B, H*W, C) with itself, which results in a symmetric Gram matrix of shape
    (B, C, C).

    The input images must be in the [0, 1] range in (B, H, W, 3) RGB format,
    and the recommendation seems to be to have them in gamma space.

    The pretrained weights are publicly available at
    http://www.vlfeat.org/matconvnet/models/imagenet-vgg-verydeep-19.mat

    Args:
        image: A tensor, typically the prediction from a network.
        reference: A tensor, the image to compare against, i.e. the golden
            image.
        vgg_model_file: A string, the filename of the VGG 19 network weights in
            MATLAB format.
        weights: A list of floats, optional weights for the layers. The defaults
            are from Qifeng Chen and Vladlen Koltun, "Photographic image
            synthesis with cascaded refinement networks," ICCV 2017.
        mask: An optional image-shaped, single-channel tensor; the mask values
            are per-pixel weights applied to the losses. The mask is resized to
            the same spatial resolution as the feature maps before being
            applied. Even where the mask value is zero, pixels near the boundary
            of the mask can still influence the loss if they fall into the
            receptive field of the VGG convolutional layers.

    Returns:
        Style loss, a linear combination of Gram matrix L2 differences from
        five VGG layer features.
    """
    if not weights:
        weights = [1.0 / 2.6, 1.0 / 4.8, 1.0 / 3.7, 1.0 / 5.6, 10.0 / 1.5]

    # The VGG network expects pixel intensities in [0, 255].
    vgg_ref = _build_vgg19(reference * 255.0, vgg_model_file)
    vgg_img = _build_vgg19(image * 255.0, vgg_model_file)

    p1 = (
        tf.reduce_mean(
            tf.squared_difference(
                _compute_gram_matrix(vgg_ref["conv1_2"] / 255.0, mask),
                _compute_gram_matrix(vgg_img["conv1_2"] / 255.0, mask),
            )
        )
        * weights[0]
    )
    p2 = (
        tf.reduce_mean(
            tf.squared_difference(
                _compute_gram_matrix(vgg_ref["conv2_2"] / 255.0, mask),
                _compute_gram_matrix(vgg_img["conv2_2"] / 255.0, mask),
            )
        )
        * weights[1]
    )
    p3 = (
        tf.reduce_mean(
            tf.squared_difference(
                _compute_gram_matrix(vgg_ref["conv3_2"] / 255.0, mask),
                _compute_gram_matrix(vgg_img["conv3_2"] / 255.0, mask),
            )
        )
        * weights[2]
    )
    p4 = (
        tf.reduce_mean(
            tf.squared_difference(
                _compute_gram_matrix(vgg_ref["conv4_2"] / 255.0, mask),
                _compute_gram_matrix(vgg_img["conv4_2"] / 255.0, mask),
            )
        )
        * weights[3]
    )
    p5 = (
        tf.reduce_mean(
            tf.squared_difference(
                _compute_gram_matrix(vgg_ref["conv5_2"] / 255.0, mask),
                _compute_gram_matrix(vgg_img["conv5_2"] / 255.0, mask),
            )
        )
        * weights[4]
    )

    final_loss = p1 + p2 + p3 + p4 + p5

    return final_loss
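
# A minimal usage sketch (illustration only; same assumptions as for vgg_loss
# above, with a hypothetical weights path):
#
#   total_loss = vgg_loss(
#       prediction, golden, "/tmp/imagenet-vgg-verydeep-19.mat"
#   ) + style_loss(prediction, golden, "/tmp/imagenet-vgg-verydeep-19.mat")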
|
|