# NOTE: scrape residue from the hosting page (Spaces status "Runtime error",
# file size 4,101 bytes, revision f1d79c0) — not part of the program.
import numpy as np
import pandas as pd
import streamlit as st
def create_synthetic_data(n_tasks=100, n_models=4, n_ratings=3, seed=None):
    """Create a synthetic dataframe with human ratings of model performance on a set of tasks.

    Parameters
    ----------
    n_tasks : int
        The number of tasks.
    n_models : int
        The number of models.
    n_ratings : int
        The number of human ratings per (task, model) pair.
    seed : int, optional
        If given, seeds NumPy's global RNG so the data is reproducible.

    Returns
    -------
    pandas.DataFrame
        One row per (task, model) pair with a 'score' column holding the mean
        rating of that model on that task, shifted so the worst model on each
        task scores 0.
    """
    if seed is not None:
        np.random.seed(seed)
    df = pd.DataFrame({
        'task': np.repeat(range(n_tasks), n_models * n_ratings),
        'model': np.tile(np.repeat(range(n_models), n_ratings), n_tasks),
        # Draw an independent rating for every (task, model, rating) triple.
        # The original tiled one task's ratings across all tasks, so every
        # task had identical data.
        'rating': np.random.randint(0, 5, n_tasks * n_models * n_ratings),
    })
    # mean rating of each model on each task
    df['score'] = df.groupby(['task', 'model'])['rating'].transform('mean')
    # per-task baseline: the weakest model's mean rating
    df['baseline'] = df.groupby('task')['score'].transform('min')
    # score relative to the baseline (>= 0, and 0 for the worst model)
    df['score'] = df['score'] - df['baseline']
    # collapse the n_ratings duplicate rows per (task, model) pair
    df = df.drop(['rating', 'baseline'], axis=1).drop_duplicates()
    return df
def calculate_elo_rating(df, k=32, initial_rating=0):
    """Calculate an Elo rating for each model from per-task scores.

    Each task is treated as a round-robin: every pair of models present on the
    task plays one "game", with the higher 'score' winning (equal scores are a
    draw, worth 0.5). Ratings are updated with the standard Elo rule
    ``R += k * (outcome - expected)`` where
    ``expected = 1 / (1 + 10 ** ((R_opponent - R_self) / 400))``.

    The original implementation applied per-column transforms that did not
    compare models against each other and so was not an Elo computation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'task', 'model' and 'score' columns (one row per
        (task, model) pair, as produced by ``create_synthetic_data``).
    k : int
        The Elo k-factor (maximum rating change per game).
    initial_rating : int
        The rating every model starts from.

    Returns
    -------
    pandas.DataFrame
        Columns 'task', 'model', 'new_rating': each model's rating snapshot
        after every task has been processed (tasks in sorted order).
    """
    df = df.copy()
    models = sorted(df['model'].unique())
    ratings = {m: float(initial_rating) for m in models}
    records = []
    # groupby iterates tasks in sorted order, giving a deterministic schedule
    for task, group in df.groupby('task'):
        scores = dict(zip(group['model'], group['score']))
        present = [m for m in models if m in scores]
        for i, a in enumerate(present):
            for b in present[i + 1:]:
                # outcome from model a's perspective: win=1, draw=0.5, loss=0
                if scores[a] > scores[b]:
                    outcome = 1.0
                elif scores[a] < scores[b]:
                    outcome = 0.0
                else:
                    outcome = 0.5
                expected_a = 1.0 / (1.0 + 10.0 ** ((ratings[b] - ratings[a]) / 400.0))
                # zero-sum update: b's expected score is 1 - expected_a
                ratings[a] += k * (outcome - expected_a)
                ratings[b] += k * ((1.0 - outcome) - (1.0 - expected_a))
        # snapshot ratings after this task, mirroring the original's
        # one-row-per-(task, model) output shape
        for m in present:
            records.append({'task': task, 'model': m, 'new_rating': ratings[m]})
    return pd.DataFrame(records)
def display_leaderboard(elo, n_models=4):
    """Display each model's mean Elo rating as a ranked leaderboard.

    Parameters
    ----------
    elo : pandas.DataFrame
        DataFrame with 'model' and 'new_rating' columns (as returned by
        ``calculate_elo_rating``).
    n_models : int
        Unused; kept for backward compatibility. The number of leaderboard
        rows is now derived from the data itself.

    Returns
    -------
    pandas.DataFrame
        The leaderboard that was displayed ('model', 'new_rating', 'rank').
    """
    # average Elo rating per model
    board = elo.groupby('model')['new_rating'].mean().reset_index()
    # best model first
    board = board.sort_values('new_rating', ascending=False).reset_index(drop=True)
    # Rank from the actual row count; the original assigned
    # range(1, n_models + 1) and raised ValueError whenever the data held a
    # different number of models than the n_models argument.
    board['rank'] = range(1, len(board) + 1)
    st.write(board)
    return board