arena / leaderboard.py
Kang Suhyun
[#78] Handle identical ELO ratings (#108)
8f146f1 unverified
raw
history blame
8.53 kB
"""
Leaderboard component: loads battle results from Firestore, computes Elo
ratings from them, and builds the Gradio leaderboard UI.
"""
from collections import defaultdict
import enum
import math
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from google.cloud.firestore_v1 import base_query
import gradio as gr
import lingua
import pandas as pd
from credentials import get_credentials_json
# Initialize the Firebase app from the project credentials and create the
# module-wide Firestore client used by get_docs.
# NOTE(review): the gr.NO_RELOAD guard presumably keeps Gradio's reload mode
# from running firebase_admin.initialize_app a second time - confirm against
# the Gradio reload-mode documentation.
if gr.NO_RELOAD:
    firebase_admin.initialize_app(credentials.Certificate(get_credentials_json()))
    db = firestore.client()
# Capitalized name of every language known to lingua; used as the choices of
# the translation language dropdowns.
SUPPORTED_TRANSLATION_LANGUAGES = [
    lang.name.capitalize() for lang in lingua.Language.all()
]

# Dropdown choice meaning "do not filter on this language".
ANY_LANGUAGE = "Any"
class LeaderboardTab(enum.Enum):
    """Leaderboard tabs; each member's value is the label shown on its tab."""

    # Ratings computed from summarization battles.
    SUMMARIZATION = "Summarization"
    # Ratings computed from translation battles.
    TRANSLATION = "Translation"
# Ref: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=QLGc6DwxyvQc pylint: disable=line-too-long
def compute_elo(battles, k=4, scale=400, base=10, initial_rating=1000):
    """Compute Elo ratings by replaying the battles in row order.

    Args:
        battles: DataFrame with "model_a", "model_b" and "winner" columns.
            A winner of "tie" gives each side half a point, "model_a" gives
            the point to model_a, and any other value gives it to model_b.
        k: Maximum rating change per battle.
        scale: Rating difference divisor used in the expected-score exponent.
        base: Base of the exponent in the expected-score formula.
        initial_rating: Rating of a model before its first battle.

    Returns:
        A defaultdict mapping each model name to its final rating.
    """

    def expected_score(own, opponent):
        # Expected score of a player rated `own` against one rated `opponent`.
        return 1 / (1 + base**((opponent - own) / scale))

    rating = defaultdict(lambda: initial_rating)
    outcomes = battles[["model_a", "model_b", "winner"]]
    for first, second, result in outcomes.itertuples(index=False):
        # Snapshot both ratings so the expectations use pre-battle values.
        first_before = rating[first]
        second_before = rating[second]

        if result == "tie":
            first_points = 0.5
        else:
            first_points = int(result == "model_a")

        rating[first] += k * (first_points -
                              expected_score(first_before, second_before))
        rating[second] += k * (1 - first_points -
                               expected_score(second_before, first_before))
    return rating
def get_docs(tab: str,
             summary_lang: str = None,
             source_lang: str = None,
             target_lang: str = None):
    """Stream the battle documents for a leaderboard tab, ordered by timestamp.

    Args:
        tab: The LeaderboardTab member selecting which collection to read.
        summary_lang: Summarization only. When set, keeps battles where both
            models' response language equals this value (lower-cased).
        source_lang: Translation only. When set and not ANY_LANGUAGE, keeps
            battles whose source language equals this value (lower-cased).
        target_lang: Translation only. When set and not ANY_LANGUAGE, keeps
            battles whose target language equals this value (lower-cased).

    Returns:
        The result of calling stream() on the filtered Firestore query.

    Raises:
        ValueError: If tab is not a supported LeaderboardTab member.
    """
    if tab == LeaderboardTab.SUMMARIZATION:
        query = db.collection("arena-summarizations").order_by("timestamp")
        if summary_lang:
            language = summary_lang.lower()
            # Both responses must be written in the requested language.
            for field in ("model_a_response_language",
                          "model_b_response_language"):
                query = query.where(
                    filter=base_query.FieldFilter(field, "==", language))
        return query.stream()

    if tab == LeaderboardTab.TRANSLATION:
        query = db.collection("arena-translations").order_by("timestamp")
        requested = (("source_language", source_lang),
                     ("target_language", target_lang))
        for field, language in requested:
            # An empty value or ANY_LANGUAGE means "do not filter".
            if language and language != ANY_LANGUAGE:
                query = query.where(filter=base_query.FieldFilter(
                    field, "==", language.lower()))
        return query.stream()

    # Fail loudly instead of returning None, which would only surface later
    # as a TypeError when the caller tries to iterate the result.
    raise ValueError(f"Unsupported leaderboard tab: {tab!r}")
def load_elo_ratings(tab,
                     summary_lang: str = None,
                     source_lang: str = None,
                     target_lang: str = None):
    """Build the ranked leaderboard rows for a tab and optional filters.

    Args:
        tab: The LeaderboardTab member to load battles for.
        summary_lang: Language filter forwarded to get_docs.
        source_lang: Language filter forwarded to get_docs.
        target_lang: Language filter forwarded to get_docs.

    Returns:
        A list of [rank, model, rating] rows sorted by rating, highest first,
        where rating is rounded half up to an integer. Models with the same
        rounded rating share a rank, and the next distinct rating takes its
        1-based position as rank (e.g. 1, 2, 2, 4). Returns None when no
        battle matches.
    """
    documents = get_docs(tab, summary_lang, source_lang, target_lang)
    records = [{
        "model_a": entry["model_a"],
        "model_b": entry["model_b"],
        "winner": entry["winner"]
    } for entry in (document.to_dict() for document in documents)]
    if len(records) == 0:
        return None

    elo = compute_elo(pd.DataFrame(records))
    ordered = sorted(elo.items(), key=lambda pair: pair[1], reverse=True)

    rows = []
    current_rank = 0
    previous_score = None
    for position, (model, score) in enumerate(ordered, start=1):
        # Round half up so that displayed ties are ranked as ties.
        rounded = math.floor(score + 0.5)
        if rounded != previous_score:
            current_rank = position
        rows.append([current_rank, model, rounded])
        previous_score = rounded
    return rows
# Refresh period, in seconds, of the unfiltered leaderboard tables.
LEADERBOARD_UPDATE_INTERVAL = 10 * 60

# Note shown under each leaderboard table.
LEADERBOARD_INFO = "The leaderboard is updated every 10 minutes."

# Initial selections of the filter dropdowns.
DEFAULT_FILTER_OPTIONS = dict(
    summary_language=lingua.Language.ENGLISH.name.capitalize(),
    source_language=ANY_LANGUAGE,
    target_language=lingua.Language.ENGLISH.name.capitalize(),
)
def update_filtered_leaderboard(tab, summary_lang: str, source_lang: str,
                                target_lang: str):
    """Recompute the filtered leaderboard after a filter dropdown changes.

    Args:
        tab: The LeaderboardTab member the filter belongs to.
        summary_lang: Selected summary language, forwarded to load_elo_ratings.
        source_lang: Selected source language, forwarded to load_elo_ratings.
        target_lang: Selected target language, forwarded to load_elo_ratings.

    Returns:
        A gr.update carrying the freshly computed rows as the new value.
    """
    return gr.update(
        value=load_elo_ratings(tab, summary_lang, source_lang, target_lang))
def build_leaderboard():
    """Build the leaderboard UI with one tab per LeaderboardTab member.

    Each tab contains a collapsible "Filter" accordion, holding language
    dropdowns and a table that is recomputed whenever a dropdown changes,
    followed by an unfiltered table that is refreshed every
    LEADERBOARD_UPDATE_INTERVAL seconds and the LEADERBOARD_INFO note.
    """
    with gr.Tabs():
        with gr.Tab(LeaderboardTab.SUMMARIZATION.value):
            with gr.Accordion("Filter", open=False) as summarization_filter:
                with gr.Row():
                    languages = [
                        language.name.capitalize()
                        for language in lingua.Language.all()
                    ]
                    summary_language = gr.Dropdown(
                        choices=languages,
                        value=DEFAULT_FILTER_OPTIONS["summary_language"],
                        label="Summary language",
                        interactive=True)

                with gr.Row():
                    filtered_summarization = gr.DataFrame(
                        headers=["Rank", "Model", "Elo rating"],
                        datatype=["number", "str", "number"],
                        value=lambda: load_elo_ratings(
                            LeaderboardTab.SUMMARIZATION,
                            summary_lang=DEFAULT_FILTER_OPTIONS[
                                "summary_language"]),
                        elem_classes="leaderboard")

                # Inputs follow update_filtered_leaderboard's parameter order:
                # tab, summary_lang, source_lang, target_lang. The empty
                # states fill the translation-only parameters.
                summary_language.change(
                    fn=update_filtered_leaderboard,
                    inputs=[
                        gr.State(LeaderboardTab.SUMMARIZATION),
                        summary_language,
                        gr.State(),
                        gr.State()
                    ],
                    outputs=filtered_summarization)

            gr.Dataframe(
                headers=["Rank", "Model", "Elo rating"],
                datatype=["number", "str", "number"],
                value=lambda: load_elo_ratings(LeaderboardTab.SUMMARIZATION),
                every=LEADERBOARD_UPDATE_INTERVAL,
                elem_classes="leaderboard")
            gr.Markdown(LEADERBOARD_INFO)

        with gr.Tab(LeaderboardTab.TRANSLATION.value):
            with gr.Accordion("Filter", open=False) as translation_filter:
                with gr.Row():
                    source_language = gr.Dropdown(
                        choices=SUPPORTED_TRANSLATION_LANGUAGES +
                        [ANY_LANGUAGE],
                        label="Source language",
                        value=DEFAULT_FILTER_OPTIONS["source_language"],
                        interactive=True)
                    target_language = gr.Dropdown(
                        choices=SUPPORTED_TRANSLATION_LANGUAGES +
                        [ANY_LANGUAGE],
                        label="Target language",
                        value=DEFAULT_FILTER_OPTIONS["target_language"],
                        interactive=True)

                with gr.Row():
                    # Keyword arguments are required here: load_elo_ratings
                    # takes summary_lang as its second positional parameter,
                    # so passing the two defaults positionally would shift
                    # them onto summary_lang and source_lang and leave the
                    # target language unfiltered.
                    filtered_translation = gr.DataFrame(
                        headers=["Rank", "Model", "Elo rating"],
                        datatype=["number", "str", "number"],
                        value=lambda: load_elo_ratings(
                            LeaderboardTab.TRANSLATION,
                            source_lang=DEFAULT_FILTER_OPTIONS[
                                "source_language"],
                            target_lang=DEFAULT_FILTER_OPTIONS[
                                "target_language"]),
                        elem_classes="leaderboard")

                # Either dropdown recomputes the table from both selections;
                # the empty state fills the summarization-only parameter.
                source_language.change(
                    fn=update_filtered_leaderboard,
                    inputs=[
                        gr.State(LeaderboardTab.TRANSLATION),
                        gr.State(), source_language, target_language
                    ],
                    outputs=filtered_translation)
                target_language.change(
                    fn=update_filtered_leaderboard,
                    inputs=[
                        gr.State(LeaderboardTab.TRANSLATION),
                        gr.State(), source_language, target_language
                    ],
                    outputs=filtered_translation)

            # Changing a filter option would otherwise leave the accordion
            # closed, so reopen it explicitly on every filter change.
            summary_language.change(fn=lambda: gr.Accordion(open=True),
                                    outputs=summarization_filter)
            source_language.change(fn=lambda: gr.Accordion(open=True),
                                   outputs=translation_filter)
            target_language.change(fn=lambda: gr.Accordion(open=True),
                                   outputs=translation_filter)

            gr.Dataframe(
                headers=["Rank", "Model", "Elo rating"],
                datatype=["number", "str", "number"],
                value=lambda: load_elo_ratings(LeaderboardTab.TRANSLATION),
                every=LEADERBOARD_UPDATE_INTERVAL,
                elem_classes="leaderboard")
            gr.Markdown(LEADERBOARD_INFO)