import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

MODEL_INFO = [
    "Model",
    "Avg",
    "Visual Quality",
    "Temporal Consistency",
    "Dynamic Degree",
    "Text-to-Video Alignment",
    "Factual Consistency"
    ]

# One datatype per column in MODEL_INFO: "Model" renders as a markdown link, the rest are numeric scores.
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number']

SUBMISSION_NAME = "VideoScore-Leaderboard"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/hexuan21/", SUBMISSION_NAME)
CSV_DIR = "./VideoScore-Leaderboard/leaderboard_res.csv"
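# get_df() below expects the CSV to hold a "Model" column formatted as a markdown link
# "[name](url)" plus the five per-dimension score columns from MODEL_INFO; the "Avg"
# column is computed at load time rather than stored in the file.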

COLUMN_NAMES = MODEL_INFO

LEADERBORAD_INTRODUCTION = """# VideoScore Leaderboard

    🏆 Welcome to the **VideoScore Leaderboard**! <br>

    <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
    <a href='https://arxiv.org/abs/2406.15252'>📃Paper</a>
    <a href='https://tiger-ai-lab.github.io/VideoScore/'>🌐Website</a>
    <a href='https://github.com/TIGER-AI-Lab/VideoScore'>💻Github</a>
    <a href='https://huggingface.co/datasets/TIGER-Lab/VideoFeedback'>🛢️VideoFeedback (Dataset)</a>
    <a href='https://huggingface.co/TIGER-Lab/VideoScore'>🤗VideoScore (Model)</a>
    <a href='https://huggingface.co/spaces/TIGER-Lab/VideoScore'>🤗Demo</a>
    <a href='https://api.wandb.ai/links/xuanhe/ptohlfcx'>📉Wandb</a>
    </div>
    
    The leaderboard covers many popular text-to-video generative models and evaluates them on 5 dimensions: <br> 
    
    "Visual Quality", "Temporal Consistency", "Dynamic Degree", "Text-to-Video Alignment", "Factual Consistency"
    
    We sample 200 prompts from <a href="https://arxiv.org/abs/2403.06098">VidProM</a> and generate 200 videos with each T2V model (for closed-source models, we generate 100).

    <a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FVideoScore-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
    """

TABLE_INTRODUCTION = """
    """

LEADERBORAD_INFO = """
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite VideoScore, the evaluated T2V models, and related papers"
CITATION_BUTTON_TEXT = r"""
@article{he2024videoscore,
  title = {VideoScore: Building Automatic Metrics to Simulate Fine-grained Human Feedback for Video Generation},
  author = {He, Xuan and Jiang, Dongfu and Zhang, Ge and Ku, Max and Soni, Achint and Siu, Sherman and Chen, Haonan and Chandra, Abhranil and Jiang, Ziyan and Arulraj, Aaran and Wang, Kai and Do, Quy Duc and Ni, Yuansheng and Lyu, Bohan and Narsupalli, Yaswanth and Fan, Rongqi and Lyu, Zhiheng and Lin, Yuchen and Chen, Wenhu},
  journal = {ArXiv},
  year = {2024},
  volume={abs/2406.15252},
  url = {https://arxiv.org/abs/2406.15252},
}

@misc{pika,
  title = {Pika {L}ab},
  howpublished = {\url{https://www.pika.art/}},
}

@article{text2video-zero,
    title={Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators},
    author={Khachatryan, Levon and Movsisyan, Andranik and Tadevosyan, Vahram and Henschel, Roberto and Wang, Zhangyang and Navasardyan, Shant and Shi, Humphrey},
    journal={arXiv preprint arXiv:2303.13439},
    year={2023}
}

@misc{chen2024videocrafter2,
      title={VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models}, 
      author={Haoxin Chen and Yong Zhang and Xiaodong Cun and Menghan Xia and Xintao Wang and Chao Weng and Ying Shan},
      year={2024},
      eprint={2401.09047},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@article{Wang2023ModelScopeTT,
  title={ModelScope Text-to-Video Technical Report},
  author={Jiuniu Wang and Hangjie Yuan and Dayou Chen and Yingya Zhang and Xiang Wang and Shiwei Zhang},
  journal={ArXiv},
  year={2023},
  volume={abs/2308.06571},
  url={https://api.semanticscholar.org/CorpusID:260887737}
}

@article{wang2023lavie,
  title={LAVIE: High-Quality Video Generation with Cascaded Latent Diffusion Models},
  author={Wang, Yaohui and Chen, Xinyuan and Ma, Xin and Zhou, Shangchen and Huang, Ziqi and Wang, Yi and Yang, Ceyuan and He, Yinan and Yu, Jiashuo and Yang, Peiqing and others},
  journal={arXiv preprint arXiv:2309.15103},
  year={2023}
}

@article{guo2023animatediff,
  title={AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning},
  author={Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Liang, Zhengyang and Wang, Yaohui and Qiao, Yu and Agrawala, Maneesh and Lin, Dahua and Dai, Bo},
  journal={International Conference on Learning Representations},
  year={2024}
}

@article{guo2023sparsectrl,
  title={SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models},
  author={Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Agrawala, Maneesh and Lin, Dahua and Dai, Bo},
  journal={arXiv preprint arXiv:2311.16933},
  year={2023}
}

@article{he2022lvdm,
      title={Latent Video Diffusion Models for High-Fidelity Long Video Generation}, 
      author={Yingqing He and Tianyu Yang and Yong Zhang and Ying Shan and Qifeng Chen},
      year={2022},
      eprint={2211.13221},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@software{Mullan_Hotshot-XL_2023,
  author = {Mullan, John and Crawbuck, Duncan and Sastry, Aakash},
  license = {Apache-2.0},
  month = oct,
  title = {{Hotshot-XL}},
  url = {https://github.com/hotshotco/hotshot-xl},
  version = {1.0.0},
  year = {2023}
}

@misc{zeroscope,
  title = {ZeroScope v2},
  author = {Spencer Sterling},
  url = {https://huggingface.co/cerspense/zeroscope_v2_576w},
  year={2024},
}

@article{yuan2024magictime,
  title={MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators},
  author={Yuan, Shenghai and Huang, Jinfa and Shi, Yujun and Xu, Yongqi and Zhu, Ruijie and Lin, Bin and Cheng, Xinhua and Yuan, Li and Luo, Jiebo},
  journal={arXiv preprint arXiv:2404.05014},
  year={2024}
}

@misc{chen2023videocrafter1,
      title={VideoCrafter1: Open Diffusion Models for High-Quality Video Generation}, 
      author={Haoxin Chen and Menghan Xia and Yingqing He and Yong Zhang and Xiaodong Cun and Shaoshu Yang and Jinbo Xing and Yaofang Liu and Qifeng Chen and Xintao Wang and Chao Weng and Ying Shan},
      year={2023},
      eprint={2310.19512},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@article{xing2023dynamicrafter,
      title={DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors}, 
      author={Jinbo Xing and Menghan Xia and Yong Zhang and Haoxin Chen and Xintao Wang and Tien-Tsin Wong and Ying Shan},
      year={2023},
      eprint={2310.12190},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@article{ma2024latte,
  title={Latte: Latent Diffusion Transformer for Video Generation},
  author={Ma, Xin and Wang, Yaohui and Jia, Gengyun and Chen, Xinyuan and Liu, Ziwei and Li, Yuan-Fang and Chen, Cunjian and Qiao, Yu},
  journal={arXiv preprint arXiv:2401.03048},
  year={2024}
}


@software{opensora,
  author = {Zangwei Zheng and Xiangyu Peng and Tianji Yang and Chenhui Shen and Shenggui Li and Hongxin Liu and Yukun Zhou and Tianyi Li and Yang You},
  title = {Open-Sora: Democratizing Efficient Video Production for All},
  month = {March},
  year = {2024},
  url = {https://github.com/hpcaitech/Open-Sora}
}

@software{pku_yuan_lab_and_tuzhan_ai_etc_2024_10948109,
  author       = {PKU-Yuan Lab and Tuzhan AI etc.},
  title        = {Open-Sora-Plan},
  month        = apr,
  year         = 2024,
  publisher    = {GitHub},
  doi          = {10.5281/zenodo.10948109},
  url          = {https://doi.org/10.5281/zenodo.10948109}
}

@article{jin2023unified,
  title={Unified Language-Vision Pretraining in LLM with Dynamic Discrete Visual Tokenization},
  author={Jin, Yang and Xu, Kun and Xu, Kun and Chen, Liwei and Liao, Chao and Tan, Jianchao and Mu, Yadong and others},
  journal={arXiv preprint arXiv:2309.04669},
  year={2023}
}

@article{jin2024video,
  title={Video-LaVIT: Unified Video-Language Pre-training with Decoupled Visual-Motional Tokenization},
  author={Jin, Yang and Sun, Zhicheng and Xu, Kun and Chen, Liwei and Jiang, Hao and Huang, Quzhe and Song, Chengru and Liu, Yuliang and Zhang, Di and Song, Yang and others},
  journal={arXiv preprint arXiv:2402.03161},
  year={2024}
}



@misc{gen2,
  title = {Gen-2},
  howpublished = {\url{https://runwayml.com/research/gen-2?utm_source=creatorstoolbox.io/}},
}

@misc{morphstudio,
  title = {Morph Studio},
  howpublished = {\url{https://www.morphstudio.com/}},
}

@misc{kling,
  title = {Kling},
  howpublished = {\url{https://kling.kuaishou.com/}},
}


@article{wang2024vidprom,
  title={VidProM: A million-scale real prompt-gallery dataset for text-to-video diffusion models},
  author={Wang, Wenhao and Yang, Yi},
  journal={arXiv preprint arXiv:2403.06098},
  year={2024}
}

"""

def get_df():
    # Clone (or pull) the results dataset repo so the local CSV is up to date.
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    # Each "Model" cell is a markdown link "[name](url)"; re-parse and rebuild it so the
    # leaderboard always renders a clean link.
    df['Model'] = df['Model'].apply(lambda x: f"[{x.split(']')[0][1:]}]({x.split('(')[1][:-1]})")
    # "Avg" is the mean of the five evaluation dimensions, rounded to two decimals.
    df['Avg'] = df[["Visual Quality",
                    "Temporal Consistency",
                    "Dynamic Degree",
                    "Text-to-Video Alignment",
                    "Factual Consistency"]].mean(axis=1).round(2)
    # Rank models by average score, best first, and keep only the leaderboard columns.
    df = df.sort_values(by=['Avg'], ascending=False)
    return df[COLUMN_NAMES]


def refresh_data():
    return get_df()
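

# A minimal usage sketch: one way get_df()/refresh_data() and the constants above could be
# wired into a Gradio UI. The component names and layout here are illustrative assumptions;
# the Space's real app may differ.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.Markdown(LEADERBORAD_INTRODUCTION)
        data_component = gr.Dataframe(
            value=get_df(),               # initial leaderboard table
            headers=COLUMN_NAMES,
            datatype=DATA_TITILE_TYPE,
            interactive=False,
        )
        refresh_button = gr.Button("Refresh")
        # Re-pull the CSV from the submission repo and redraw the table on click.
        refresh_button.click(fn=refresh_data, outputs=data_component)
    demo.launch()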