arena / leaderboard.py
Kang Suhyun
[#37] Store ELO ratings in DB after calculation (#112)
5352a13 unverified
raw
history blame
6.79 kB
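"""
Leaderboard component: computes Elo ratings from recorded battles and renders
them in per-category Gradio tabs with optional language filters.
"""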
from collections import defaultdict
import enum
import math
from typing import Dict, List, Tuple
import gradio as gr
import lingua
import db
from db import get_battles
# Capitalized names of every language lingua reports, used as the choices of
# the language dropdowns. Sorted so the dropdown order is alphabetical and
# stable across runs even if lingua returns the languages as an unordered
# collection.
SUPPORTED_LANGUAGES = sorted(
    language.name.capitalize() for language in lingua.Language.all())

# Dropdown value meaning "do not filter by language"; it is translated to None
# before battles are queried.
ANY_LANGUAGE = "Any"
class LeaderboardTab(enum.Enum):
    """Leaderboard categories; each member's value is the label of its tab."""

    SUMMARIZATION = "Summarization"
    TRANSLATION = "Translation"
# Ref: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=QLGc6DwxyvQc pylint: disable=line-too-long
def compute_elo(battles: List[db.Battle],
                k=4,
                scale=400,
                base=10,
                initial_rating=1000) -> Dict[str, int]:
    """Replay the battles in order and return every model's Elo rating.

    Args:
        battles: Battles to replay. Each must expose ``model_a``, ``model_b``
            and ``winner``. A winner of "tie" scores half a point for both
            sides, "model_a" scores a full point for model A, and any other
            value is counted as a win for model B.
        k: Factor applied to (actual score - expected score) on each update.
        scale: Rating difference divisor used in the expected-score formula.
        base: Exponent base used in the expected-score formula.
        initial_rating: Rating given to a model the first time it appears.

    Returns:
        A dict mapping each model seen in the battles to its final rating,
        rounded half up to an integer. Keys appear in first-seen order.
    """

    def expected_score(own, opponent):
        # Probability-like score `own` is expected to obtain against
        # `opponent` under the logistic Elo model.
        return 1 / (1 + base**((opponent - own) / scale))

    elo = defaultdict(lambda: initial_rating)

    for battle in battles:
        first, second, outcome = battle.model_a, battle.model_b, battle.winner

        # Snapshot both ratings before either one is updated so that the two
        # expectations are computed from the same pre-battle state.
        first_elo = elo[first]
        second_elo = elo[second]

        if outcome == "tie":
            first_points = 0.5
        else:
            first_points = int(outcome == "model_a")

        elo[first] += k * (first_points - expected_score(first_elo, second_elo))
        elo[second] += k * (1 - first_points -
                            expected_score(second_elo, first_elo))

    # floor(x + 0.5) rounds halves up, unlike round() which rounds to even.
    return {name: math.floor(value + 0.5) for name, value in elo.items()}
def load_elo_ratings(tab, source_lang: str, target_lang: str | None):
    """Compute, store and rank the Elo ratings for one leaderboard view.

    Args:
        tab: Compared against LeaderboardTab.SUMMARIZATION; anything else
            selects the translation category.
        source_lang: Source language filter, or ANY_LANGUAGE for no filter.
        target_lang: Target language filter; ANY_LANGUAGE or None for no
            filter.

    Returns:
        A list of ``[rank, model, rating]`` rows ordered by descending rating,
        where models with equal ratings share a rank and the following rank is
        skipped accordingly (1, 1, 3, ...). Returns None when there are no
        matching battles.
    """
    if tab == LeaderboardTab.SUMMARIZATION:
        category = db.Category.SUMMARIZATION
    else:
        category = db.Category.TRANSLATION

    # TODO(#37): Call db.get_ratings and return the ratings if exists.

    # ANY_LANGUAGE is a UI-only value; the battle query takes None instead.
    source_filter = None if source_lang == ANY_LANGUAGE else source_lang
    target_filter = None if target_lang == ANY_LANGUAGE else target_lang
    battles = get_battles(category, source_filter, target_filter)
    if not battles:
        return None

    elo_by_model = compute_elo(battles)

    # NOTE(review): the raw language arguments (possibly ANY_LANGUAGE) are
    # stored here, unlike the None-mapped values used for the query above --
    # confirm this is what db.set_ratings expects.
    ratings_to_store = [
        db.Rating(model, rating) for model, rating in elo_by_model.items()
    ]
    db.set_ratings(category, ratings_to_store, source_lang, target_lang)

    best_first = sorted(elo_by_model.items(),
                        key=lambda entry: entry[1],
                        reverse=True)

    rows = []
    current_rank = 0
    previous_rating = None
    for position, (model, rating) in enumerate(best_first, start=1):
        # A new rating value starts a new rank equal to its 1-based position;
        # equal ratings keep the rank of the first model that had it.
        if rating != previous_rating:
            current_rank = position
            previous_rating = rating
        rows.append([current_rank, model, rating])

    return rows
# Refresh period of the unfiltered leaderboards, in seconds (10 minutes).
LEADERBOARD_UPDATE_INTERVAL = 10 * 60

# Derived from the interval so the notice cannot drift from the real period.
LEADERBOARD_INFO = ("The leaderboard is updated every "
                    f"{LEADERBOARD_UPDATE_INTERVAL // 60} minutes.")
def update_filtered_leaderboard(tab: str, source_lang: str,
                                target_lang: str | None):
    """Return a Gradio update carrying freshly loaded ratings.

    All three arguments are forwarded unchanged to load_elo_ratings, which
    compares ``tab`` against LeaderboardTab members. The resulting rows (or
    None when there are no battles) become the new component value.
    """
    return gr.update(value=load_elo_ratings(tab, source_lang, target_lang))
def build_leaderboard():
    """Build the leaderboard UI: one tab per category with language filters.

    Each tab contains two tables with identical columns:

    * the "original" table, loaded without any language filter and created
      with ``every=LEADERBOARD_UPDATE_INTERVAL``;
    * the "filtered" table, created hidden and reloaded whenever a language
      dropdown of that tab changes.

    After each dropdown change exactly one of the two tables is made visible:
    the filtered one if any dropdown of the tab holds a concrete language,
    otherwise the original one.
    """
    with gr.Tabs():

        # Returns (original leaderboard, filtered leaderboard).
        def toggle_leaderboard(
                *languages: str) -> Tuple[gr.Dataframe, gr.Dataframe]:
            # Every dropdown of the tab is inspected, not only the one that
            # fired the event: with two dropdowns, resetting one of them to
            # ANY_LANGUAGE must keep the filtered table visible while the
            # other still selects a language.
            filter_chosen = any(
                language != ANY_LANGUAGE for language in languages)
            return gr.Dataframe(visible=not filter_chosen), gr.Dataframe(
                visible=filter_chosen)

        with gr.Tab(LeaderboardTab.SUMMARIZATION.value):
            summary_language = gr.Dropdown(
                choices=SUPPORTED_LANGUAGES + [ANY_LANGUAGE],
                value=ANY_LANGUAGE,
                label="Summary language",
                interactive=True)

            filtered_summarization = gr.Dataframe(
                headers=["Rank", "Model", "Elo rating"],
                datatype=["number", "str", "number"],
                value=lambda: load_elo_ratings(LeaderboardTab.SUMMARIZATION,
                                               ANY_LANGUAGE, None),
                elem_classes="leaderboard",
                visible=False)

            original_summarization = gr.Dataframe(
                headers=["Rank", "Model", "Elo rating"],
                datatype=["number", "str", "number"],
                value=lambda: load_elo_ratings(LeaderboardTab.SUMMARIZATION,
                                               ANY_LANGUAGE, None),
                every=LEADERBOARD_UPDATE_INTERVAL,
                elem_classes="leaderboard")
            gr.Markdown(LEADERBOARD_INFO)

            # Summarization has no target language, hence the constant None.
            # The table is reloaded first, then the visibility is switched.
            summary_language.change(
                fn=update_filtered_leaderboard,
                inputs=[
                    gr.State(LeaderboardTab.SUMMARIZATION), summary_language,
                    gr.State(None)
                ],
                outputs=filtered_summarization).then(
                    fn=toggle_leaderboard,
                    inputs=summary_language,
                    outputs=[original_summarization, filtered_summarization])

        with gr.Tab(LeaderboardTab.TRANSLATION.value):
            with gr.Row():
                source_language = gr.Dropdown(
                    choices=SUPPORTED_LANGUAGES + [ANY_LANGUAGE],
                    label="Source language",
                    value=ANY_LANGUAGE,
                    interactive=True)
                target_language = gr.Dropdown(
                    choices=SUPPORTED_LANGUAGES + [ANY_LANGUAGE],
                    label="Target language",
                    value=ANY_LANGUAGE,
                    interactive=True)

            filtered_translation = gr.Dataframe(
                headers=["Rank", "Model", "Elo rating"],
                datatype=["number", "str", "number"],
                value=lambda: load_elo_ratings(LeaderboardTab.TRANSLATION,
                                               ANY_LANGUAGE, ANY_LANGUAGE),
                elem_classes="leaderboard",
                visible=False)

            original_translation = gr.Dataframe(
                headers=["Rank", "Model", "Elo rating"],
                datatype=["number", "str", "number"],
                value=lambda: load_elo_ratings(LeaderboardTab.TRANSLATION,
                                               ANY_LANGUAGE, ANY_LANGUAGE),
                every=LEADERBOARD_UPDATE_INTERVAL,
                elem_classes="leaderboard")
            gr.Markdown(LEADERBOARD_INFO)

            # Both dropdowns trigger the same reload, and the visibility
            # toggle always receives both current selections.
            source_language.change(
                fn=update_filtered_leaderboard,
                inputs=[
                    gr.State(LeaderboardTab.TRANSLATION), source_language,
                    target_language
                ],
                outputs=filtered_translation).then(
                    fn=toggle_leaderboard,
                    inputs=[source_language, target_language],
                    outputs=[original_translation, filtered_translation])
            target_language.change(
                fn=update_filtered_leaderboard,
                inputs=[
                    gr.State(LeaderboardTab.TRANSLATION), source_language,
                    target_language
                ],
                outputs=filtered_translation).then(
                    fn=toggle_leaderboard,
                    inputs=[source_language, target_language],
                    outputs=[original_translation, filtered_translation])