import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository
# Access token read from the environment; get_df() hands it to Repository as
# use_auth_token. os.environ.get() yields None when the variable is unset.
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
# NOTE(review): the five string literals below read like the items of a list
# whose opening line and closing bracket are not visible in this view. As they
# stand they are bare expression statements with no effect. Presumably they
# belong to the column list behind COLUMN_NAMES (read by get_df()) -- confirm
# against the complete file.
"Visual Quality",
"Temporal Consistency",
"Dynamic Degree",
"Text-to-Video Alignment",
"Factual Consistency"
# One 'markdown' entry followed by five 'number' entries; presumably the
# per-column display types of the leaderboard table -- confirm at the use site.
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number',]
# Local directory that get_df() passes to Repository as local_dir.
SUBMISSION_NAME = "VideoScore-Leaderboard"
# CSV file inside that directory; get_df() loads it with pd.read_csv.
CSV_DIR = "./VideoScore-Leaderboard/leaderboard_res.csv"
# Markdown introduction text (heading plus welcome paragraph); presumably
# rendered at the top of the leaderboard UI -- confirm at the use site.
# NOTE(review): the closing triple quote of this literal is not visible in this
# view; locate it before editing the text.
# NOTE(review): the text announces 4 dimensions, while get_df() averages five
# score columns (it also includes "Factual Consistency") -- confirm which is
# intended.
LEADERBORAD_INTRODUCTION = """# VideoScore Leaderboard
🏆 Welcome to the **VideoScore Leaderboard**! The leaderboard covers many popular text-to-video generative models and evaluates them on 4 dimensions:
"Visual Quality", "Temporal Consistency", "Dynamic Degree", "Text-to-Video Alignment".
To demonstrate the performance of our VideoScore,
we use VideoScore to choose the best from videos with same prompt but different seeds.
Then we use some feature-based metrics mentioned in both VideoScore paper
and EvalCrafter paper,
see more info about these metrics in the second sheet "About" above.
Here is the detailed information for the used metrics.
VideoScore and EvalCrafter both
conduct studies about the correlation between these feature-based metrics (like CLIP-Score and SSIM) and the human scoring on generated videos.
Some of these metrics show a relatively good correlation, but others correlate poorly with human scores.
Below are the metrics for each dimension. Unless noted otherwise, the raw score of each metric lies in [0, 1] with larger being better, and is then scaled to [0, 100].
(1) Visual Quality = average(VQA_A, VQA_T)
VQA_A and VQA_T are both from EvalCrafter metrics suite.
(2) Temporal Consistency = average(CLIP_Temp, Face_Consistency_Score, Warping_Error)
CLIP_Temp, Face_Consistency_Score, Warping_Error are all from EvalCrafter metrics suite.
Warping_Error is reported as "100*(1 - raw_result)" so that a larger score indicates better performance.
(3) Dynamic Degree = average(SSIM_dyn, MSE_dyn)
SSIM_dyn and MSE_dyn are both from VideoScore.
SSIM_dyn is reported as "100*(1-raw_result)" so that a larger score indicates better performance.
MSE_dyn is reported as "100*(1-raw_result/255^2)", since pixel values range from 0 to 255 and the theoretical maximum of MSE is therefore 255*255.
(4) Text-to-Video Alignment = average(CLIP-Score, BLIP-BLEU)
CLIP-Score and BLIP-BLEU are both from EvalCrafter metrics suite.
# Label text accompanying the citation snippet; presumably attached to a
# copyable text box in the UI -- confirm at the use site.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite the t2v models and the used metrics"
def _as_markdown_link(cell):
    """Return *cell* re-emitted as a ``[label](url)`` Markdown link.

    The text is split at the ``](`` that separates label from target, so a
    label or URL that itself contains parentheses is kept intact. Values that
    are not strings, or that do not have the ``[label](url)`` shape, are
    returned unchanged instead of raising.
    """
    if not isinstance(cell, str):
        # e.g. NaN produced by pandas for an empty cell.
        return cell
    text = cell.strip()
    label, separator, target = text.partition("](")
    if not separator or not label.startswith("[") or not target.endswith(")"):
        return cell
    return f"[{label[1:]}]({target[:-1]})"


def get_df():
    """Load the leaderboard CSV and return it ranked by average score.

    Instantiates ``Repository`` with ``clone_from=SUBMISSION_URL`` and
    ``local_dir=SUBMISSION_NAME``, reads ``CSV_DIR``, normalises the ``Model``
    column to Markdown links, adds an ``Avg`` column holding the row-wise mean
    of the five score columns rounded to two decimals, and sorts the rows by
    ``Avg`` in descending order.

    Returns:
        pandas.DataFrame: the sorted table restricted to ``COLUMN_NAMES``.
    """
    score_columns = [
        "Visual Quality",
        "Temporal Consistency",
        "Dynamic Degree",
        "Text-to-Video Alignment",
        "Factual Consistency",
    ]

    # Called only for its side effect; the returned handle was never used.
    # NOTE(review): an already existing local clone is presumably not updated
    # by the constructor alone -- confirm whether a git_pull() is needed for
    # refresh_data() to see new results.
    Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)

    df = pd.read_csv(CSV_DIR)
    df["Model"] = df["Model"].apply(_as_markdown_link)
    df["Avg"] = df[score_columns].mean(axis=1).round(2)
    df = df.sort_values(by=["Avg"], ascending=False)
    return df[COLUMN_NAMES]
def refresh_data():
    """Rebuild the leaderboard table by delegating to ``get_df``.

    Returns:
        Whatever ``get_df()`` returns.
    """
    refreshed = get_df()
    return refreshed