import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

MODEL_INFO = [
    "Model", "Avg",
    "Visual Quality", "Temporal Consistency", "Dynamic Degree",
    "Text-to-Video Alignment", "Factual Consistency",
]

# One datatype per column in MODEL_INFO; the "Model" column renders as a markdown link.
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number']

SUBMISSION_NAME = "VideoScore-Leaderboard"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/hexuan21/", SUBMISSION_NAME)
CSV_DIR = "./VideoScore-Leaderboard/leaderboard_res.csv"

COLUMN_NAMES = MODEL_INFO

LEADERBORAD_INTRODUCTION = """# VideoScore Leaderboard

🏆 Welcome to the **VideoScore Leaderboard**! The leaderboard covers many popular text-to-video generative models and evaluates them on four dimensions:
"Visual Quality", "Temporal Consistency", "Dynamic Degree", and "Text-to-Video Alignment".

To demonstrate the performance of VideoScore, we use it to choose the best among videos generated from the same prompt with different seeds. We then score the chosen videos with the feature-based metrics mentioned in both the VideoScore and EvalCrafter papers; see the second tab, "About", above for more information about these metrics.
"""

TABLE_INTRODUCTION = """ """

LEADERBORAD_INFO = """
Here is the detailed information for the metrics used.
Both VideoScore and EvalCrafter study the correlation between these feature-based metrics (such as CLIP-Score and SSIM) and human scores on generated videos. Some of these metrics correlate relatively well with human scores, while others correlate poorly.

Below are the metrics for each dimension. Unless noted otherwise, each metric's raw score lies in [0, 1] with larger being better, and is then scaled to [0, 100].

(1) Visual Quality = average(VQA_A, VQA_T)
VQA_A and VQA_T are both from the EvalCrafter metrics suite.

(2) Temporal Consistency = average(CLIP_Temp, Face_Consistency_Score, Warping_Error)
CLIP_Temp, Face_Consistency_Score, and Warping_Error are all from the EvalCrafter metrics suite. Warping_Error is reported as "100*(1 - raw_result)" so that a larger score indicates better performance.

(3) Dynamic Degree = average(SSIM_dyn, MSE_dyn)
SSIM_dyn and MSE_dyn are both from VideoScore. SSIM_dyn is reported as "100*(1 - raw_result)" so that a larger score indicates better performance. MSE_dyn is reported as "100*(1 - raw_result/255^2)", since pixel values range from 0 to 255 and the theoretical maximum of MSE is 255*255.

(4) Text-to-Video Alignment = average(CLIP-Score, BLIP-BLEU)
CLIP-Score and BLIP-BLEU are both from the EvalCrafter metrics suite.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite the t2v models and the metrics used"
CITATION_BUTTON_TEXT = r""" """


def get_df():
    # Clone (or pull) the leaderboard results repo so the CSV is up to date.
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    # The "Model" column stores markdown links like "[name](url)"; rebuild them to normalize formatting.
    df['Model'] = df['Model'].apply(lambda x: f"[{x.split(']')[0][1:]}]({x.split('(')[1][:-1]})")
    # Average of the five dimension scores, rounded to two decimals.
    df['Avg'] = df[["Visual Quality", "Temporal Consistency", "Dynamic Degree",
                    "Text-to-Video Alignment", "Factual Consistency"]].mean(axis=1).round(2)
    df = df.sort_values(by=['Avg'], ascending=False)
    return df[COLUMN_NAMES]


def refresh_data():
    return get_df()
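

# Illustrative sketch only (not used by the app): the [0, 100] rescaling described in
# LEADERBORAD_INFO above, assuming the raw ranges documented there (most metrics lie in
# [0, 1]; MSE_dyn is bounded by 255*255). The helper name is hypothetical, and the
# published CSV already contains the scaled scores.
def _scale_metric(name, raw_result):
    """Map a raw metric value onto the [0, 100] larger-is-better scale."""
    if name in ("Warping_Error", "SSIM_dyn"):
        return 100 * (1 - raw_result)              # lower raw value maps to a higher score
    if name == "MSE_dyn":
        return 100 * (1 - raw_result / 255 ** 2)   # MSE of 8-bit pixels is at most 255*255
    return 100 * raw_result                        # default: raw score in [0, 1], larger is better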
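

# A minimal sketch, assuming a simple two-tab layout, of how the constants and helpers
# above could be wired into a Gradio Blocks demo. The actual app file may differ; the
# tab names, refresh button, and citation accordion here are assumptions for illustration.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.Markdown(LEADERBORAD_INTRODUCTION)
        with gr.Tab("Leaderboard"):
            gr.Markdown(TABLE_INTRODUCTION)
            leaderboard_table = gr.Dataframe(
                value=get_df(),
                headers=COLUMN_NAMES,
                datatype=DATA_TITILE_TYPE,
                interactive=False,
            )
            refresh_button = gr.Button("Refresh")
            refresh_button.click(fn=refresh_data, outputs=leaderboard_table)
        with gr.Tab("About"):
            gr.Markdown(LEADERBORAD_INFO)
        with gr.Accordion(CITATION_BUTTON_LABEL, open=False):
            gr.Textbox(value=CITATION_BUTTON_TEXT, lines=6, show_copy_button=True)
    demo.launch()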