Spaces:
Running
Running
suhyun.kang
committed on
Commit
·
a19f11e
1
Parent(s):
0ac094d
[#1] Add leaderboard based on Elo rating
Browse files
Changes:
- Added leaderboards for Summarization and Translation categories.
- Implemented Elo rating computation for each leaderboard category.
- Ref: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=QLGc6DwxyvQc
Screenshot: https://screen.yanolja.in/j7inrSXCtFtnJije.png
- app.py +4 -0
- leaderboard.py +78 -0
app.py
CHANGED
@@ -13,6 +13,8 @@ import firebase_admin
|
|
13 |
from firebase_admin import firestore
|
14 |
import gradio as gr
|
15 |
|
|
|
|
|
16 |
db_app = firebase_admin.initialize_app()
|
17 |
db = firestore.client()
|
18 |
|
@@ -214,6 +216,8 @@ with gr.Blocks() as app:
|
|
214 |
submit.click(user, prompt, states + model_names,
|
215 |
queue=False).then(bot, states, states + responses)
|
216 |
|
|
|
|
|
217 |
if __name__ == "__main__":
|
218 |
# We need to enable queue to use generators.
|
219 |
app.queue()
|
|
|
13 |
from firebase_admin import firestore
|
14 |
import gradio as gr
|
15 |
|
16 |
+
from leaderboard import build_leaderboard
|
17 |
+
|
18 |
db_app = firebase_admin.initialize_app()
|
19 |
db = firestore.client()
|
20 |
|
|
|
216 |
submit.click(user, prompt, states + model_names,
|
217 |
queue=False).then(bot, states, states + responses)
|
218 |
|
219 |
+
build_leaderboard(db)
|
220 |
+
|
221 |
if __name__ == "__main__":
|
222 |
# We need to enable queue to use generators.
|
223 |
app.queue()
|
leaderboard.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
It provides a leaderboard component.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from collections import defaultdict
|
6 |
+
import enum
|
7 |
+
import math
|
8 |
+
|
9 |
+
import gradio as gr
|
10 |
+
import pandas as pd
|
11 |
+
|
12 |
+
|
13 |
+
class LeaderboardTab(enum.Enum):
  """Leaderboard categories.

  Each member's value is the human-readable label of the corresponding tab.
  """

  SUMMARIZATION = "Summarization"
  TRANSLATION = "Translation"
|
16 |
+
|
17 |
+
|
18 |
+
# Ref: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=QLGc6DwxyvQc pylint: disable=line-too-long
|
19 |
+
def compute_elo(battles, k=4, scale=400, base=10, initial_rating=1000):
  """Computes Elo ratings by replaying the battles in row order.

  Args:
    battles: A pandas DataFrame with "model_a", "model_b" and "winner"
      columns. A winner of "tie" scores half a point for each side,
      "model_a" scores a full point for model A, and any other value is
      counted as a win for model B.
    k: The K-factor, i.e. the maximum rating change per battle.
    scale: The rating difference that corresponds to a `base`-fold
      difference in expected odds.
    base: The base of the exponent in the expected-score formula.
    initial_rating: The rating assigned to a model on its first battle.

  Returns:
    A defaultdict mapping each model name to its rating. Looking up an
    unseen model yields (and stores) `initial_rating`.
  """
  rating = defaultdict(lambda: initial_rating)

  # A DataFrame built from an empty list has no columns at all, so the
  # column selection below would raise KeyError. No battles, no ratings.
  if battles.empty:
    return rating

  columns = ["model_a", "model_b", "winner"]
  for model_a, model_b, winner in battles[columns].itertuples(index=False):
    rating_a = rating[model_a]
    rating_b = rating[model_b]

    expected_score_a = 1 / (1 + base**((rating_b - rating_a) / scale))
    expected_score_b = 1 / (1 + base**((rating_a - rating_b) / scale))

    scored_point_a = 0.5 if winner == "tie" else int(winner == "model_a")

    # Both updates use the ratings read before this battle, so the points
    # gained by one side equal the points lost by the other.
    rating[model_a] += k * (scored_point_a - expected_score_a)
    rating[model_b] += k * (1 - scored_point_a - expected_score_b)

  return rating
|
36 |
+
|
37 |
+
|
38 |
+
def get_docs(tab, db):
  """Streams the battle documents that back the given leaderboard tab.

  Args:
    tab: An object whose `label` attribute equals one of the
      LeaderboardTab values.
    db: A client exposing `collection(name).order_by(field).stream()`
      (presumably a Firestore client; confirm against the caller).

  Returns:
    The result of `stream()` on the tab's collection, ordered by
    "timestamp".

  Raises:
    ValueError: If `tab.label` does not match any LeaderboardTab value.
  """
  collection_names = {
      LeaderboardTab.SUMMARIZATION.value: "arena-summarizations",
      LeaderboardTab.TRANSLATION.value: "arena-translations",
  }

  collection_name = collection_names.get(tab.label)
  if collection_name is None:
    # Fail loudly here instead of returning None and letting the caller
    # crash later while iterating over it.
    raise ValueError(f"Unsupported leaderboard tab: {tab.label!r}")

  return db.collection(collection_name).order_by("timestamp").stream()
|
44 |
+
|
45 |
+
|
46 |
+
# TODO(#8): Update the value periodically.
|
47 |
+
def load_elo_ratings(tab, db):
  """Builds the leaderboard rows for the given tab.

  Args:
    tab: The tab whose battles should be loaded; forwarded to get_docs.
    db: The database client; forwarded to get_docs.

  Returns:
    A list of [rank, model, rating] rows sorted by rating in descending
    order. Ranks start at 1 and ratings are rounded half up to integers.
    The list is empty when there are no battles.
  """
  columns = ["model_a", "model_b", "winner"]

  rows = []
  for doc in get_docs(tab, db):
    data = doc.to_dict()
    rows.append([data[column] for column in columns])

  # Passing the columns explicitly keeps them present even when there are
  # no documents; otherwise the empty frame has no columns and the Elo
  # computation fails on the column lookup.
  battles = pd.DataFrame(rows, columns=columns)
  ratings = compute_elo(battles)

  sorted_ratings = sorted(ratings.items(), key=lambda x: x[1], reverse=True)
  return [[rank, model, math.floor(rating + 0.5)]
          for rank, (model, rating) in enumerate(sorted_ratings, start=1)]
|
65 |
+
|
66 |
+
|
67 |
+
def build_leaderboard(db):
  """Renders one leaderboard tab per LeaderboardTab member.

  Must be called inside an active Gradio Blocks context. Each tab holds a
  table of rank, model name and Elo rating, computed once at build time.

  Args:
    db: The database client used to load the battles of each category.
  """
  with gr.Tabs():
    # Members are visited in definition order: Summarization, Translation.
    # TODO(#9): Add language filter options (Translation tab).
    for category in LeaderboardTab:
      with gr.Tab(category.value) as category_tab:
        elo_rows = load_elo_ratings(category_tab, db)
        gr.Dataframe(headers=["Rank", "Model", "Elo rating"],
                     datatype=["number", "str", "number"],
                     value=elo_rows)
|