Kang Suhyun committed on
Commit
0e0491e
2 Parent(s): abbd100 cc5a628

Merge pull request #10 from Y-IAB/1-elo

Browse files

[#1] Add leaderboard based on Elo rating

Files changed (2) hide show
  1. app.py +4 -0
  2. leaderboard.py +78 -0
app.py CHANGED
@@ -11,6 +11,8 @@ from firebase_admin import firestore
11
  import gradio as gr
12
  from litellm import completion
13
 
 
 
14
  # TODO(#21): Fix auto-reload issue related to the initialization of Firebase.
15
  db_app = firebase_admin.initialize_app()
16
  db = firestore.client()
@@ -194,6 +196,8 @@ with gr.Blocks() as app:
194
  option_b.click(vote, [option_b] + common_inputs)
195
  tie.click(vote, [tie] + common_inputs)
196
 
 
 
197
  if __name__ == "__main__":
198
  # We need to enable queue to use generators.
199
  app.queue()
 
11
  import gradio as gr
12
  from litellm import completion
13
 
14
+ from leaderboard import build_leaderboard
15
+
16
  # TODO(#21): Fix auto-reload issue related to the initialization of Firebase.
17
  db_app = firebase_admin.initialize_app()
18
  db = firestore.client()
 
196
  option_b.click(vote, [option_b] + common_inputs)
197
  tie.click(vote, [tie] + common_inputs)
198
 
199
+ build_leaderboard(db)
200
+
201
  if __name__ == "__main__":
202
  # We need to enable queue to use generators.
203
  app.queue()
leaderboard.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ It provides a leaderboard component.
3
+ """
4
+
5
+ from collections import defaultdict
6
+ import enum
7
+ import math
8
+
9
+ import gradio as gr
10
+ import pandas as pd
11
+
12
+
13
@enum.unique
class LeaderboardTab(enum.Enum):
  """Leaderboard categories; each value is the label shown on its tab."""

  SUMMARIZATION = "Summarization"
  TRANSLATION = "Translation"
16
+
17
+
18
+ # Ref: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=QLGc6DwxyvQc pylint: disable=line-too-long
19
def compute_elo(battles, k=4, scale=400, base=10, initial_rating=1000):
  """Computes Elo ratings by replaying the battles in row order.

  Args:
    battles: A pandas DataFrame with "model_a", "model_b" and "winner"
      columns. A "winner" of "tie" scores half a point for each side,
      "model_a" scores a win for model A, and any other value scores a win
      for model B. An empty DataFrame (even one without these columns) is
      accepted and yields no ratings.
    k: Multiplier applied to the difference between the scored and the
      expected points, i.e. the largest possible rating change per battle.
    scale: Divisor of the rating difference in the expected-score exponent.
    base: Base of the exponent in the expected-score formula.
    initial_rating: Rating assigned to a model the first time it is seen.

  Returns:
    A defaultdict mapping each model name to its rating after all battles.
    Looking up an unseen model yields (and stores) `initial_rating`.
  """
  rating = defaultdict(lambda: initial_rating)

  # A DataFrame built from zero records has no columns at all, so the column
  # selection below would raise KeyError. No battles simply means no ratings.
  if battles.empty:
    return rating

  columns = ["model_a", "model_b", "winner"]
  for model_a, model_b, winner in battles[columns].itertuples(index=False):
    rating_a = rating[model_a]
    rating_b = rating[model_b]

    expected_score_a = 1 / (1 + base**((rating_b - rating_a) / scale))
    expected_score_b = 1 / (1 + base**((rating_a - rating_b) / scale))

    # Points scored by model A; model B scores the complement.
    scored_point_a = 0.5 if winner == "tie" else int(winner == "model_a")

    rating[model_a] += k * (scored_point_a - expected_score_a)
    rating[model_b] += k * (1 - scored_point_a - expected_score_b)

  return rating
36
+
37
+
38
def get_docs(tab, db):
  """Streams the documents of the collection that backs the given tab.

  Args:
    tab: Object whose `label` is compared against the LeaderboardTab values.
    db: Client exposing `collection(name)`; the collection is ordered by
      "timestamp" and then streamed.

  Returns:
    The result of `stream()` for the matching collection, or None when the
    label matches no known tab.
  """
  collection_by_tab = (
      (LeaderboardTab.SUMMARIZATION, "arena-summarizations"),
      (LeaderboardTab.TRANSLATION, "arena-translations"),
  )

  for tab_kind, collection_name in collection_by_tab:
    if tab.label == tab_kind.value:
      query = db.collection(collection_name).order_by("timestamp")
      return query.stream()

  return None
44
+
45
+
46
+ # TODO(#8): Update the value periodically.
47
def load_elo_ratings(tab, db):
  """Loads the battles recorded for a tab and ranks models by Elo rating.

  Args:
    tab: Leaderboard tab, forwarded to `get_docs` to select the documents.
    db: Database client, forwarded to `get_docs`.

  Returns:
    A list of `[rank, model, rating]` rows sorted by descending rating. Rank
    starts at 1 and the rating is rounded to the nearest integer with halves
    rounded up. The list is empty when there are no battles.

  Raises:
    KeyError: If a document lacks "model_a", "model_b" or "winner".
  """
  columns = ["model_a", "model_b", "winner"]

  battles = []
  for doc in get_docs(tab, db):
    data = doc.to_dict()
    battles.append({column: data[column] for column in columns})

  # Naming the columns explicitly keeps them present even when there are no
  # battles yet; otherwise the empty DataFrame has no columns and the Elo
  # computation fails with KeyError when it selects them.
  battles = pd.DataFrame(battles, columns=columns)
  ratings = compute_elo(battles)

  sorted_ratings = sorted(ratings.items(), key=lambda x: x[1], reverse=True)
  return [[i + 1, model, math.floor(rating + 0.5)]
          for i, (model, rating) in enumerate(sorted_ratings)]
65
+
66
+
67
def build_leaderboard(db):
  """Builds one tab per leaderboard category, each holding a ratings table.

  Args:
    db: Database client forwarded to `load_elo_ratings` for every tab.
  """
  column_names = ("Rank", "Model", "Elo rating")
  column_types = ("number", "str", "number")

  with gr.Tabs():
    # TODO(#9): Add language filter options to the translation tab.
    for category in LeaderboardTab:
      with gr.Tab(category.value) as category_tab:
        rows = load_elo_ratings(category_tab, db)
        gr.Dataframe(headers=list(column_names),
                     datatype=list(column_types),
                     value=rows)