File size: 3,556 Bytes
3be620b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import numpy as np
import tensorflow as tf
import tensorflow_gan as tfgan
import tensorflow_hub as hub
from sklearn.metrics.pairwise import polynomial_kernel
from tqdm.auto import tqdm
i3d = hub.KerasLayer("https://tfhub.dev/deepmind/i3d-kinetics-400/1")
def resize_videos(videos, target_resolution):
"""Runs some preprocessing on the videos for I3D model.
Args:
videos: <T>[batch_size, num_frames, height, width, depth] The videos to be
preprocessed. We don't care about the specific dtype of the videos, it can
be anything that tf.image.resize_bilinear accepts. Values are expected to
be in [-1, 1].
target_resolution: (width, height): target video resolution
Returns:
videos: <float32>[batch_size, num_frames, height, width, depth]
"""
min_frames = 9
B, T, H, W, C = videos.shape
videos = tf.transpose(videos, (1, 0, 2, 3, 4))
if T < min_frames:
videos = tf.concat([tf.zeros((min_frames - T, B, H, W, C)), videos], axis=0)
scaled_videos = tf.map_fn(lambda x: tf.image.resize(x, target_resolution), videos)
scaled_videos = tf.transpose(scaled_videos, (1, 0, 2, 3, 4))
return scaled_videos
def polynomial_mmd(X, Y):
m = X.shape[0]
n = Y.shape[0]
# compute kernels
K_XX = polynomial_kernel(X)
K_YY = polynomial_kernel(Y)
K_XY = polynomial_kernel(X, Y)
# compute mmd distance
K_XX_sum = (K_XX.sum() - np.diagonal(K_XX).sum()) / (m * (m - 1))
K_YY_sum = (K_YY.sum() - np.diagonal(K_YY).sum()) / (n * (n - 1))
K_XY_sum = K_XY.sum() / (m * n)
mmd = K_XX_sum + K_YY_sum - 2 * K_XY_sum
return mmd
def calculate_ssim_videos(fake, real):
fake = tf.cast(((fake * 0.5) + 1) * 255, tf.uint8)
real = tf.cast(((real * 0.5) + 1) * 255, tf.uint8)
ssims = []
for i in range(fake.shape[0]):
ssims.append(tf.image.ssim(fake[i], real[i], 255).numpy().mean())
return np.array(ssims).mean()
def calculate_psnr_videos(fake, real):
fake = tf.cast(((fake * 0.5) + 1) * 255, tf.uint8)
real = tf.cast(((real * 0.5) + 1) * 255, tf.uint8)
psnrs = []
for i in range(fake.shape[0]):
psnrs.append(tf.image.psnr(fake[i], real[i], 255).numpy().mean())
return np.array(psnrs).mean()
def calculate_videos_metrics(dataset, model, total_length):
fake_embeddings = []
real_embeddings = []
psnrs = []
ssims = []
for sample in tqdm(dataset, total=total_length):
generated = model(sample, training=False)
generated, real = generated[:, 1:], sample["y"][:, 1:] # ignore first frame
real_resized = resize_videos(real, (224, 224))
generated_resized = resize_videos(generated, (224, 224))
real_activations = i3d(real_resized)
generated_activations = i3d(generated_resized)
fake_embeddings.append(generated_activations)
real_embeddings.append(real_activations)
psnrs.append(calculate_psnr_videos(generated, real))
ssims.append(calculate_ssim_videos(generated, real))
# fake_concat, real_concat = tf.concat(fake_embeddings, axis=0), tf.concat(real_embeddings, axis=0)
fvd = tfgan.eval.frechet_classifier_distance_from_activations(
tf.concat(fake_embeddings, axis=0), tf.concat(real_embeddings, axis=0)
)
kvd = polynomial_mmd(
tf.concat(fake_embeddings, axis=0), tf.concat(real_embeddings, axis=0)
)
psnr = np.array(psnrs).mean()
ssim = np.array(ssims).mean()
return {"fvd": fvd, "kvd": kvd, "ssim": ssim, "psnr": psnr}
|