File size: 9,504 Bytes
364b314 3f98a3d 364b314 3f98a3d 364b314 fea5cd1 c016841 fea5cd1 364b314 f4dc807 364b314 f4dc807 364b314 f4dc807 364b314 fea5cd1 364b314 fea5cd1 364b314 3f98a3d 364b314 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 |
import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository
# Token used to authenticate when cloning/pulling the (possibly private)
# leaderboard dataset repo; None if the env var is unset.
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

# Leaderboard column headers, in display order.
MODEL_INFO = [
    "Model",
    "Avg",
    "Visual Quality",
    "Temporal Consistency",
    "Dynamic Degree",
    "Text-to-Video Alignment",
    "Factual Consistency"
]

# Gradio `datatype` for each column above: the "Model" column renders as a
# markdown link, every score column is numeric.
# Fix: the original list had only six entries for the seven columns in
# MODEL_INFO, leaving the trailing "Factual Consistency" column untyped.
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number']

# HF dataset repo holding the leaderboard results CSV.
SUBMISSION_NAME = "VideoScore-Leaderboard"
# Plain concatenation instead of os.path.join: this is a URL, not a
# filesystem path (same resulting string, clearer intent).
SUBMISSION_URL = "https://huggingface.co/datasets/hexuan21/" + SUBMISSION_NAME
# Location of the results CSV inside the locally cloned repo.
CSV_DIR = "./VideoScore-Leaderboard/leaderboard_res.csv"

# Columns rendered in the table (identical to the headers).
COLUMN_NAMES = MODEL_INFO
# Markdown banner rendered at the top of the leaderboard Space: title,
# project links, and a short description of the evaluation setup.
# NOTE(review): "LEADERBORAD" is a typo but the name is kept — it is the
# public identifier other parts of the app import/reference.
LEADERBORAD_INTRODUCTION = """# VideoScore Leaderboard
🏆 Welcome to the **VideoScore Leaderboard**! <br>
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<a href='https://arxiv.org/abs/2406.15252'>📃Paper</a>
<a href='https://tiger-ai-lab.github.io/VideoScore/'>🌐Website</a>
<a href='https://github.com/TIGER-AI-Lab/VideoScore'>💻Github</a>
<a href='https://huggingface.co/datasets/TIGER-Lab/VideoFeedback'>🛢️VideoFeedback (Dataset)</a>
<a href='https://huggingface.co/TIGER-Lab/VideoScore'>🤗VideoScore (Model)</a>
<a href='https://huggingface.co/spaces/TIGER-Lab/VideoScore'>🤗Demo</a>
<a href='https://api.wandb.ai/links/xuanhe/ptohlfcx'>📉Wandb</a>
</div>
The leaderboard covers many popular text-to-video generative models and evaluates them on 5 dimensions: <br>
"Visual Quality", "Temporal Consistency", "Dynamic Degree", "Text-to-Video Alignment", "Factual Consistency"
We sample 200 prompts from <a href="https://arxiv.org/abs/2403.06098">VidProM</a> to generate 200 videos using various T2V models (for those closed-source model, we generate 100).
<a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FVideoScore-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
"""
# Text shown directly above the results table — currently an empty placeholder.
TABLE_INTRODUCTION = """
"""
# Extended "about" section for the leaderboard — currently an empty placeholder.
LEADERBORAD_INFO = """
"""
# Label for the Gradio "copy citation" accordion/button.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite the t2v models and related papers"
# BibTeX for VideoScore itself plus every T2V model on the leaderboard.
# Raw string so the \url{...} macros survive without escaping.
# Fix: the @misc{morphstudio} entry erroneously carried the title {Kling}
# (copy-paste from the @misc{kling} entry below); its URL is morphstudio.com,
# so the title is corrected to {Morph Studio}.
CITATION_BUTTON_TEXT = r"""
@article{he2024videoscore,
title = {VideoScore: Building Automatic Metrics to Simulate Fine-grained Human Feedback for Video Generation},
author = {He, Xuan and Jiang, Dongfu and Zhang, Ge and Ku, Max and Soni, Achint and Siu, Sherman and Chen, Haonan and Chandra, Abhranil and Jiang, Ziyan and Arulraj, Aaran and Wang, Kai and Do, Quy Duc and Ni, Yuansheng and Lyu, Bohan and Narsupalli, Yaswanth and Fan, Rongqi and Lyu, Zhiheng and Lin, Yuchen and Chen, Wenhu},
journal = {ArXiv},
year = {2024},
volume={abs/2406.15252},
url = {https://arxiv.org/abs/2406.15252},
}
@misc{pika,
title = {Pika {L}ab},
howpublished = {\url{https://www.pika.art/}},
}
@article{text2video-zero,
title={Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators},
author={Khachatryan, Levon and Movsisyan, Andranik and Tadevosyan, Vahram and Henschel, Roberto and Wang, Zhangyang and Navasardyan, Shant and Shi, Humphrey},
journal={arXiv preprint arXiv:2303.13439},
year={2023}
}
@misc{chen2024videocrafter2,
title={VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models},
author={Haoxin Chen and Yong Zhang and Xiaodong Cun and Menghan Xia and Xintao Wang and Chao Weng and Ying Shan},
year={2024},
eprint={2401.09047},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{Wang2023ModelScopeTT,
title={ModelScope Text-to-Video Technical Report},
author={Jiuniu Wang and Hangjie Yuan and Dayou Chen and Yingya Zhang and Xiang Wang and Shiwei Zhang},
journal={ArXiv},
year={2023},
volume={abs/2308.06571},
url={https://api.semanticscholar.org/CorpusID:260887737}
}
@article{wang2023lavie,
title={LAVIE: High-Quality Video Generation with Cascaded Latent Diffusion Models},
author={Wang, Yaohui and Chen, Xinyuan and Ma, Xin and Zhou, Shangchen and Huang, Ziqi and Wang, Yi and Yang, Ceyuan and He, Yinan and Yu, Jiashuo and Yang, Peiqing and others},
journal={arXiv preprint arXiv:2309.15103},
year={2023}
}
@article{guo2023animatediff,
title={AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning},
author={Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Liang, Zhengyang and Wang, Yaohui and Qiao, Yu and Agrawala, Maneesh and Lin, Dahua and Dai, Bo},
journal={International Conference on Learning Representations},
year={2024}
}
@article{guo2023sparsectrl,
title={SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models},
author={Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Agrawala, Maneesh and Lin, Dahua and Dai, Bo},
journal={arXiv preprint arXiv:2311.16933},
year={2023}
}
@article{he2022lvdm,
title={Latent Video Diffusion Models for High-Fidelity Long Video Generation},
author={Yingqing He and Tianyu Yang and Yong Zhang and Ying Shan and Qifeng Chen},
year={2022},
eprint={2211.13221},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@software{Mullan_Hotshot-XL_2023,
author = {Mullan, John and Crawbuck, Duncan and Sastry, Aakash},
license = {Apache-2.0},
month = oct,
title = {{Hotshot-XL}},
url = {https://github.com/hotshotco/hotshot-xl},
version = {1.0.0},
year = {2023}
}
@misc{zeroscope,
title = {ZeroScope v2},
author = {Spencer Sterling},
url = {https://huggingface.co/cerspense/zeroscope_v2_576w},
year={2024},
}
@article{yuan2024magictime,
title={MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators},
author={Yuan, Shenghai and Huang, Jinfa and Shi, Yujun and Xu, Yongqi and Zhu, Ruijie and Lin, Bin and Cheng, Xinhua and Yuan, Li and Luo, Jiebo},
journal={arXiv preprint arXiv:2404.05014},
year={2024}
}
@misc{chen2023videocrafter1,
title={VideoCrafter1: Open Diffusion Models for High-Quality Video Generation},
author={Haoxin Chen and Menghan Xia and Yingqing He and Yong Zhang and Xiaodong Cun and Shaoshu Yang and Jinbo Xing and Yaofang Liu and Qifeng Chen and Xintao Wang and Chao Weng and Ying Shan},
year={2023},
eprint={2310.19512},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{xing2023dynamicrafter,
title={DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors},
author={Jinbo Xing and Menghan Xia and Yong Zhang and Haoxin Chen and Xintao Wang and Tien-Tsin Wong and Ying Shan},
year={2023},
eprint={2310.12190},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{ma2024latte,
title={Latte: Latent Diffusion Transformer for Video Generation},
author={Ma, Xin and Wang, Yaohui and Jia, Gengyun and Chen, Xinyuan and Liu, Ziwei and Li, Yuan-Fang and Chen, Cunjian and Qiao, Yu},
journal={arXiv preprint arXiv:2401.03048},
year={2024}
}
@software{opensora,
author = {Zangwei Zheng and Xiangyu Peng and Tianji Yang and Chenhui Shen and Shenggui Li and Hongxin Liu and Yukun Zhou and Tianyi Li and Yang You},
title = {Open-Sora: Democratizing Efficient Video Production for All},
month = {March},
year = {2024},
url = {https://github.com/hpcaitech/Open-Sora}
}
@software{pku_yuan_lab_and_tuzhan_ai_etc_2024_10948109,
author = {PKU-Yuan Lab and Tuzhan AI etc.},
title = {Open-Sora-Plan},
month = apr,
year = 2024,
publisher = {GitHub},
doi = {10.5281/zenodo.10948109},
url = {https://doi.org/10.5281/zenodo.10948109}
}
@article{jin2023unified,
title={Unified Language-Vision Pretraining in LLM with Dynamic Discrete Visual Tokenization},
author={Jin, Yang and Xu, Kun and Xu, Kun and Chen, Liwei and Liao, Chao and Tan, Jianchao and Mu, Yadong and others},
journal={arXiv preprint arXiv:2309.04669},
year={2023}
}
@article{jin2024video,
title={Video-LaVIT: Unified Video-Language Pre-training with Decoupled Visual-Motional Tokenization},
author={Jin, Yang and Sun, Zhicheng and Xu, Kun and Chen, Liwei and Jiang, Hao and Huang, Quzhe and Song, Chengru and Liu, Yuliang and Zhang, Di and Song, Yang and others},
journal={arXiv preprint arXiv:2402.03161},
year={2024}
}
@misc{gen2,
title = {Gen-2},
howpublished = {\url{https://runwayml.com/research/gen-2?utm_source=creatorstoolbox.io/}},
}
@misc{morphstudio,
title = {Morph Studio},
howpublished = {\url{https://www.morphstudio.com/}},
}
@misc{kling,
title = {Kling},
howpublished = {\url{https://kling.kuaishou.com/}},
}
@article{wang2024vidprom,
title={Vidprom: A million-scale real prompt-gallery dataset for text-to-video diffusion models},
author={Wang, Wenhao and Yang, Yi},
journal={arXiv preprint arXiv:2403.06098},
year={2024}
}
"""
def get_df():
    """Pull the latest leaderboard CSV from the HF dataset repo and return it
    as a DataFrame sorted by average score (descending), restricted to the
    display columns."""
    # Clone (or reuse) the submission repo locally and sync to the latest commit.
    submission_repo = Repository(
        local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN
    )
    submission_repo.git_pull()

    leaderboard = pd.read_csv(CSV_DIR)

    def _rebuild_md_link(cell):
        # Each cell looks like "[name](url)"; re-emit it as a markdown link.
        name = cell.split(']')[0][1:]
        url = cell.split('(')[1][:-1]
        return f"[{name}]({url})"

    leaderboard['Model'] = leaderboard['Model'].apply(_rebuild_md_link)

    score_columns = [
        "Visual Quality",
        "Temporal Consistency",
        "Dynamic Degree",
        "Text-to-Video Alignment",
        "Factual Consistency",
    ]
    # Row-wise mean over the five dimensions, rounded for display.
    leaderboard['Avg'] = leaderboard[score_columns].mean(axis=1).round(2)

    leaderboard = leaderboard.sort_values(by=['Avg'], ascending=False)
    return leaderboard[COLUMN_NAMES]
def refresh_data():
    """Gradio refresh callback: re-fetch and return the up-to-date leaderboard."""
    latest = get_df()
    return latest