File size: 3,556 Bytes
3be620b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
import tensorflow as tf
import tensorflow_gan as tfgan
import tensorflow_hub as hub
from sklearn.metrics.pairwise import polynomial_kernel
from tqdm.auto import tqdm

# I3D (Kinetics-400) network wrapped as a Keras layer. It is instantiated once
# when this module is imported and used by calculate_videos_metrics() to embed
# real and generated videos for the FVD and KVD metrics.
# NOTE(review): constructing the layer from a TF Hub URL presumably fetches or
# reads a cached copy of the model at import time -- confirm for offline use.
i3d = hub.KerasLayer("https://tfhub.dev/deepmind/i3d-kinetics-400/1")


def resize_videos(videos, target_resolution):
    """Resize every frame of a batch of videos for the I3D model.

    Videos with fewer than 9 frames are padded at the *start* of the time axis
    with all-zero frames so that the result always has at least 9 frames.

    Args:
        videos: [batch_size, num_frames, height, width, depth] videos of any
            numeric dtype; they are cast to float32 before resizing. Values are
            expected to be in [-1, 1]. All dimensions except the batch size
            must be statically known.
        target_resolution: (height, width) of the output frames. It is passed
            unchanged as the ``size`` argument of ``tf.image.resize``.
    Returns:
        videos: <float32>[batch_size, max(num_frames, 9), target_height,
            target_width, depth]
    """
    min_frames = 9
    # tf.image.resize (default bilinear method) always produces float32, so
    # cast first: this keeps the padding and the resize dtype-consistent for
    # integer or float64 inputs instead of failing on a dtype mismatch.
    videos = tf.cast(videos, tf.float32)
    _, num_frames, height, width, channels = videos.shape
    if num_frames < min_frames:
        # Zero-pad only the front of the time axis (axis 1).
        videos = tf.pad(
            videos,
            [[0, 0], [min_frames - num_frames, 0], [0, 0], [0, 0], [0, 0]],
        )
        num_frames = min_frames
    # Fold time into the batch axis so all frames are resized in one call,
    # then restore the [batch, time, ...] layout.
    frames = tf.reshape(videos, [-1, height, width, channels])
    resized = tf.image.resize(frames, target_resolution)
    return tf.reshape(
        resized, [-1, num_frames, *list(target_resolution), channels]
    )


def polynomial_mmd(X, Y):
    """Estimate the squared MMD between two sample sets with a polynomial kernel.

    The kernel is scikit-learn's ``polynomial_kernel`` with its default
    parameters. Within-set terms exclude the diagonal (each sample paired with
    itself), so both sets need at least two samples for a finite result.

    Args:
        X: array-like of shape [m, features], first sample set.
        Y: array-like of shape [n, features], second sample set.
    Returns:
        The scalar estimate ``mean(K_XX off-diag) + mean(K_YY off-diag)
        - 2 * mean(K_XY)``.
    """
    num_x, num_y = X.shape[0], Y.shape[0]

    def off_diagonal_mean(gram, count):
        # Average over all ordered pairs of distinct samples.
        return (gram.sum() - np.diagonal(gram).sum()) / (count * (count - 1))

    within_x = off_diagonal_mean(polynomial_kernel(X), num_x)
    within_y = off_diagonal_mean(polynomial_kernel(Y), num_y)
    across = polynomial_kernel(X, Y).sum() / (num_x * num_y)
    return within_x + within_y - 2 * across


def calculate_ssim_videos(fake, real):
    """Return the mean SSIM between generated and real videos.

    Args:
        fake: generated videos indexed as [batch, frame, height, width, depth].
            Values are assumed to be in [-1, 1]; anything outside that range is
            clipped.
        real: ground-truth videos with the same layout and value range.
    Returns:
        The mean over videos of each video's mean per-frame SSIM, computed on
        8-bit pixel values (max value 255).
    """

    def to_uint8(videos):
        # Map [-1, 1] -> [0, 255]. The scale must be applied after the shift:
        # (x * 0.5 + 1) * 255 would yield [127.5, 382.5] and overflow uint8.
        pixels = (tf.cast(videos, tf.float32) + 1.0) * 127.5
        return tf.cast(tf.clip_by_value(pixels, 0.0, 255.0), tf.uint8)

    fake = to_uint8(fake)
    real = to_uint8(real)
    ssims = [
        tf.image.ssim(fake[i], real[i], 255).numpy().mean()
        for i in range(fake.shape[0])
    ]

    return np.array(ssims).mean()


def calculate_psnr_videos(fake, real):
    """Return the mean PSNR between generated and real videos.

    Args:
        fake: generated videos indexed as [batch, frame, height, width, depth].
            Values are assumed to be in [-1, 1]; anything outside that range is
            clipped.
        real: ground-truth videos with the same layout and value range.
    Returns:
        The mean over videos of each video's mean per-frame PSNR, computed on
        8-bit pixel values (max value 255).
    """

    def to_uint8(videos):
        # Map [-1, 1] -> [0, 255]. The scale must be applied after the shift:
        # (x * 0.5 + 1) * 255 would yield [127.5, 382.5] and overflow uint8.
        pixels = (tf.cast(videos, tf.float32) + 1.0) * 127.5
        return tf.cast(tf.clip_by_value(pixels, 0.0, 255.0), tf.uint8)

    fake = to_uint8(fake)
    real = to_uint8(real)
    psnrs = [
        tf.image.psnr(fake[i], real[i], 255).numpy().mean()
        for i in range(fake.shape[0])
    ]

    return np.array(psnrs).mean()


def calculate_videos_metrics(dataset, model, total_length):
    """Evaluate a video model on a dataset with FVD, KVD, SSIM and PSNR.

    For every batch the model output and the ground truth (``sample["y"]``)
    are compared with the first frame dropped. I3D activations of both are
    accumulated over the whole dataset for FVD/KVD, while PSNR and SSIM are
    computed per batch and then averaged over batches.

    Args:
        dataset: iterable of batches; each batch is passed to ``model`` as-is
            and must provide the ground-truth videos under the key ``"y"``.
        model: callable invoked as ``model(sample, training=False)`` that
            returns videos indexed as [batch, frame, height, width, depth].
        total_length: number of batches, used only for the progress bar.
    Returns:
        dict with keys ``"fvd"``, ``"kvd"``, ``"ssim"`` and ``"psnr"``.
    Raises:
        ValueError: if ``dataset`` yields no batches.
    """
    fake_embeddings = []
    real_embeddings = []

    psnrs = []
    ssims = []

    for sample in tqdm(dataset, total=total_length):
        generated = model(sample, training=False)
        generated, real = generated[:, 1:], sample["y"][:, 1:]  # ignore first frame

        # I3D embeddings of the resized videos feed the distribution metrics.
        real_embeddings.append(i3d(resize_videos(real, (224, 224))))
        fake_embeddings.append(i3d(resize_videos(generated, (224, 224))))

        # Pixel-level metrics are computed at the original resolution.
        psnrs.append(calculate_psnr_videos(generated, real))
        ssims.append(calculate_ssim_videos(generated, real))

    if not fake_embeddings:
        raise ValueError("dataset is empty: no videos to compute metrics on")

    # Concatenate once and share the result between FVD and KVD.
    fake_concat = tf.concat(fake_embeddings, axis=0)
    real_concat = tf.concat(real_embeddings, axis=0)

    fvd = tfgan.eval.frechet_classifier_distance_from_activations(
        fake_concat, real_concat
    )
    # The scikit-learn kernel used for KVD works on NumPy arrays.
    kvd = polynomial_mmd(fake_concat.numpy(), real_concat.numpy())
    psnr = np.array(psnrs).mean()
    ssim = np.array(ssims).mean()
    return {"fvd": fvd, "kvd": kvd, "ssim": ssim, "psnr": psnr}