Commit
•
e16fd64
1
Parent(s):
c69d931
Initial commit (#1)
Browse files- Initial commit (d4c52034a11be912014a2ecc30b9c8a9d4ca6ba3)
Co-authored-by: Koshti <Koshti10@users.noreply.huggingface.co>
- app.py +128 -0
- requirements.txt +75 -0
- src/assets/text_content.py +18 -0
- src/utils.py +217 -0
- versions/v0.7.csv +14 -0
- versions/v0.8.csv +14 -0
- versions/v0.9.csv +14 -0
- versions/v1.0.csv +14 -0
app.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr

from src.assets.text_content import TITLE, INTRODUCTION_TEXT
from src.utils import get_data, compare_plots, filter_search

############################ For Leaderboards #############################
# Directory holding one CSV per benchmark version (v0.7.csv ... v1.0.csv).
DATA_PATH = 'versions'
latest_flag = True  # Set flag to include latest data in Details and Versions Tab
latest_df, latest_vname, previous_df, previous_vname = get_data(DATA_PATH, latest_flag)

# NOTE(review): a "global" statement at module level is a no-op — prev_df is
# already a module-level name. Kept as-is to stay byte-identical.
global prev_df
# Default the "Versions and Details" table to the first (most recent) previous version.
prev_df = previous_df[0]
|
13 |
+
def select_prev_df(name):
    """Look up the leaderboard dataframe for a previously released version.

    Args:
        name: Version name as shown in the dropdown (an entry of previous_vname).

    Returns:
        The dataframe from previous_df at the position matching ``name``.
    """
    version_index = previous_vname.index(name)
    return previous_df[version_index]
|
17 |
+
|
18 |
+
############################ For Plots ####################################
# plot_df backs the hidden gr.DataFrame feeding the Plot tab; MODEL_COLS is
# the list of model names offered in the checkbox group.
# NOTE(review): "global" at module level is a no-op — both names are already
# module-level. Kept as-is.
global plot_df, MODEL_COLS
plot_df = latest_df[0]
MODEL_COLS = list(plot_df['Model'].unique())
|
22 |
+
|
23 |
+
|
24 |
+
############# MAIN APPLICATION ######################
# Three-tab Gradio app: main leaderboard with search, a comparison plot,
# and a browser for previous benchmark versions.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🥇 Clem Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                search_bar = gr.Textbox(
                    placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
                    show_label=False,
                    elem_id="search-bar",
                )

            # Visible table showing the latest leaderboard version.
            leaderboard_table = gr.components.Dataframe(
                value=latest_df[0],
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

            # Add a dummy leaderboard to handle search queries from the latest_df and not update latest_df
            dummy_leaderboard_table = gr.components.Dataframe(
                value=latest_df[0],
                elem_id="leaderboard-table",
                interactive=False,
                visible=False,
            )

            # Search filters against the hidden (full) copy, writing the
            # result into the visible table.
            search_bar.submit(
                filter_search,
                [dummy_leaderboard_table, search_bar],
                leaderboard_table,
                queue=True
            )
        with gr.TabItem("📈 Plot", id=3):
            with gr.Row():
                model_cols = gr.CheckboxGroup(
                    MODEL_COLS,
                    label="Select Models 🤖",
                    value=[],
                    elem_id="column-select",
                    interactive=True,
                )

            with gr.Row():
                # Hidden dataframe carrying the plot's source data to the callback.
                plot_grdf = gr.DataFrame(
                    value=plot_df,
                    visible=False
                )
            with gr.Row():
                # Output block for the plot
                plot_output = gr.Plot()

            # Re-draw the scatter plot whenever the model selection changes.
            model_cols.change(
                compare_plots,
                [plot_grdf, model_cols],
                plot_output,
                queue=True
            )

        with gr.TabItem("🔄 Versions and Details", elem_id="details", id=2):
            with gr.Row():
                ver_selection = gr.Dropdown(
                    previous_vname, label="Select Version 🕹️", value=previous_vname[0]
                )
            with gr.Row():
                search_bar_prev = gr.Textbox(
                    placeholder=" 🔍 Search for models - separate multiple queries with `;` and press ENTER...",
                    show_label=False,
                    elem_id="search-bar-2",
                )

            # Visible table for the selected previous version.
            prev_table = gr.components.Dataframe(
                value=prev_df,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

            # Hidden full copy used as the search source (mirrors the
            # dummy table pattern of the main leaderboard tab).
            dummy_prev_table = gr.components.Dataframe(
                value=prev_df,
                elem_id="leaderboard-table",
                interactive=False,
                visible=False,
            )

            search_bar_prev.submit(
                filter_search,
                [dummy_prev_table, search_bar_prev],
                prev_table,
                queue=True
            )

            # Swap the displayed dataframe when a different version is chosen.
            # NOTE(review): dummy_prev_table is not updated here, so after a
            # version switch the search still filters the old version's data —
            # confirm whether that is intended.
            ver_selection.change(
                select_prev_df,
                [ver_selection],
                prev_table,
                queue=True
            )

demo.load()
demo.queue()
demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accelerate==0.23.0
|
2 |
+
aiofiles==23.1.0
|
3 |
+
aiohttp==3.8.4
|
4 |
+
aiosignal==1.3.1
|
5 |
+
altair==4.2.2
|
6 |
+
anyio==3.6.2
|
7 |
+
APScheduler==3.10.1
|
8 |
+
async-timeout==4.0.2
|
9 |
+
attrs==23.1.0
|
10 |
+
certifi==2022.12.7
|
11 |
+
charset-normalizer==3.1.0
|
12 |
+
click==8.1.3
|
13 |
+
contourpy==1.0.7
|
14 |
+
cycler==0.11.0
|
15 |
+
datasets==2.12.0
|
16 |
+
entrypoints==0.4
|
17 |
+
fastapi==0.95.1
|
18 |
+
ffmpy==0.3.0
|
19 |
+
filelock==3.11.0
|
20 |
+
fonttools==4.39.3
|
21 |
+
frozenlist==1.3.3
|
22 |
+
fsspec==2023.5.0
|
23 |
+
gradio==3.43.2
|
24 |
+
gradio-client==0.5.0
|
25 |
+
h11==0.14.0
|
26 |
+
httpcore==0.17.0
|
27 |
+
httpx==0.24.0
|
28 |
+
huggingface-hub==0.16.4
|
29 |
+
idna==3.4
|
30 |
+
Jinja2==3.1.2
|
31 |
+
jsonschema==4.17.3
|
32 |
+
kiwisolver==1.4.4
|
33 |
+
linkify-it-py==2.0.0
|
34 |
+
markdown-it-py==2.2.0
|
35 |
+
MarkupSafe==2.1.2
|
36 |
+
matplotlib==3.7.1
|
37 |
+
mdit-py-plugins==0.3.3
|
38 |
+
mdurl==0.1.2
|
39 |
+
multidict==6.0.4
|
40 |
+
numpy==1.24.2
|
41 |
+
orjson==3.8.10
|
42 |
+
packaging==23.1
|
43 |
+
pandas==2.0.0
|
44 |
+
Pillow==9.5.0
|
45 |
+
plotly==5.14.1
|
46 |
+
pyarrow==11.0.0
|
47 |
+
pydantic==1.10.7
|
48 |
+
pydub==0.25.1
|
49 |
+
pyparsing==3.0.9
|
50 |
+
pyrsistent==0.19.3
|
51 |
+
python-dateutil==2.8.2
|
52 |
+
python-multipart==0.0.6
|
53 |
+
pytz==2023.3
|
54 |
+
pytz-deprecation-shim==0.1.0.post0
|
55 |
+
PyYAML==6.0
|
56 |
+
requests==2.28.2
|
57 |
+
semantic-version==2.10.0
|
58 |
+
six==1.16.0
|
59 |
+
sniffio==1.3.0
|
60 |
+
starlette==0.26.1
|
61 |
+
toolz==0.12.0
|
62 |
+
tqdm==4.65.0
|
63 |
+
transformers@git+https://github.com/clefourrier/transformers.git
|
64 |
+
tokenizers==0.14
|
65 |
+
#tokenizers==0.14.1 wait for tokenizers patch in dependencies with hf_hub
|
66 |
+
#transformers==4.34
|
67 |
+
typing_extensions==4.5.0
|
68 |
+
tzdata==2023.3
|
69 |
+
tzlocal==4.3
|
70 |
+
uc-micro-py==1.0.1
|
71 |
+
urllib3==1.26.15
|
72 |
+
uvicorn==0.21.1
|
73 |
+
websockets==11.0.1
|
74 |
+
yarl==1.8.2
|
75 |
+
hf_transfer==0.1.3
|
src/assets/text_content.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Page title rendered via gr.HTML at the top of the app.
TITLE = """<h1 align="center" id="space-title"> 🏆 CLEM Leaderboard</h1>"""

# Markdown blurb shown directly below the title.
INTRODUCTION_TEXT = """
🔝 The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimized Large Language Models, “clems”) as described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://arxiv.org/abs/2305.13455).
"""

# Map from long model-name fragments (note the trailing "-") to short plot
# labels; consumed by src.utils.label_map when annotating the scatter plot.
SHORT_NAMES = {
    "t0.0": "",
    "claude-v1.3-": "cl",
    "gpt-3.5-turbo-": "3.5",
    "gpt-4-": "4",
    "text-davinci-003-": "3",
    "luminous-supreme-": "lm",
    "koala-13b-": "ko",
    "falcon-40b-": "flc",
    "oasst-12b-": "ost",
    "vicuna-13b-": "vcn"
}
|
src/utils.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
from src.assets.text_content import SHORT_NAMES
|
7 |
+
|
8 |
+
def update_cols(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Change three header rows to a single header row.

    The raw CSV has 4 leading summary columns followed by per-game column
    triples (Played / Quality Score / Quality Score[std]); pandas suffixes
    the duplicated game names ("taboo", "taboo.1", "taboo.2"), so every
    third raw column carries the game name.
    Remove this function if the dataframe has only one header row.

    Args:
        df: Raw dataframe containing 3 separate header rows

    Returns:
        df: Updated dataframe which has only 1 header row instead of 3
    '''
    default_cols = list(df.columns)

    # First 4 columns are fixed; per-game columns are appended in triples.
    update = ['Model', 'Clemscore', 'All(Played)', 'All(Quality Score)']
    game_metrics = default_cols[4:]

    # Every third column (step 3) holds the game name for its triple.
    for game in game_metrics[::3]:
        base = str(game).capitalize()
        update.append(base + "(Played)")
        update.append(base + "(Quality Score)")
        update.append(base + "(Quality Score[std])")

    # Map old column names to the new single-row header. zip truncates
    # instead of raising IndexError if the column count is malformed.
    map_cols = {old: str(new) for old, new in zip(default_cols, update)}

    df = df.rename(columns=map_cols)
    # Drop the two residual header rows that were read in as data.
    df = df.iloc[2:]

    return df
|
41 |
+
|
42 |
+
def process_df(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Process dataframe - Remove repetition in model names, convert datatypes to sort by "float" instead of "str".

    Args:
        df: Unprocessed Dataframe (after using update_cols)

    Returns:
        df: Processed Dataframe (modified in place and returned)
    '''

    # Change column type to float from str; the first (model) column stays str.
    list_column_names = list(df.columns)
    model_col_name = list_column_names[0]
    for col in list_column_names:
        if col != model_col_name:
            df[col] = df[col].astype(float)

    # Remove repetition in model names, if any ("a--a" -> "a"); strip "-t0.0".
    models_list = []
    for model_name in df[model_col_name]:
        splits = [split.replace('-t0.0', '') for split in model_name.split('--')]  # Comment to not remove -t0.0
        if len(splits) == 1:
            # Single-model entry (no "--" pair): keep as-is. The previous
            # implementation assumed a pair and raised IndexError here.
            models_list.append(splits[0])
        elif splits[0] == splits[1]:
            models_list.append(splits[0])
        else:
            models_list.append(splits[0] + "--" + splits[1])
    df[model_col_name] = models_list

    return df
|
71 |
+
|
72 |
+
def get_data(path: str, flag: bool):
    '''
    Get a list of all version names and respective Dataframes.

    Args:
        path: Path to the directory containing CSVs of different versions -> v0.9.csv, v1.0.csv, ....
        flag: Set this flag to include the latest version in Details and Versions tab

    Returns:
        latest_df: singular list containing dataframe of the latest version of the leaderboard with only 4 columns
        latest_vname: list of the name of latest version
        previous_df: list of dataframes for previous versions (can skip latest version if required)
        previous_vname: list of the names for the previous versions (INCLUDED IN Details and Versions Tab)
        Returns None if the directory is empty.
    '''
    # Check if Directory is empty
    list_versions = os.listdir(path)
    if not list_versions:
        print("Directory is empty")

    else:
        files = [file for file in list_versions if file.endswith('.csv')]
        # Reverse-lexicographic sort puts the newest version first.
        # NOTE(review): this assumes single-digit minor versions (v0.7..v1.0);
        # a "v10" would sort before "v9" — confirm before versions grow.
        files.sort(reverse=True)
        file_names = [os.path.splitext(file)[0] for file in files]

        DFS = []
        for file in files:
            df = pd.read_csv(os.path.join(path, file))
            df = update_cols(df) # Remove if by default there is only one header row
            df = process_df(df) # Process Dataframe
            df = df.sort_values(by=list(df.columns)[1], ascending=False) # Sort by clemscore
            DFS.append(df)

        # Only keep relevant columns (Model + 3 summary scores) for the main leaderboard
        latest_df_dummy = DFS[0]
        all_columns = list(latest_df_dummy.columns)
        keep_columns = all_columns[0:4]
        latest_df_dummy = latest_df_dummy.drop(columns=[c for c in all_columns if c not in keep_columns])

        latest_df = [latest_df_dummy]
        latest_vname = [file_names[0]]
        previous_df = []
        previous_vname = []
        # Copy every version (latest included) into the "previous" lists ...
        for df, name in zip(DFS, file_names):
            previous_df.append(df)
            previous_vname.append(name)

        # ... then drop the latest again when the caller doesn't want it.
        if not flag:
            previous_df.pop(0)
            previous_vname.pop(0)

        return latest_df, latest_vname, previous_df, previous_vname

    # Empty directory: fall through to an explicit None (callers must check).
    return None
|
124 |
+
|
125 |
+
|
126 |
+
# ['Model', 'Clemscore', 'All(Played)', 'All(Quality Score)']
|
127 |
+
def compare_plots(df: pd.DataFrame, LIST: list):
    '''
    Quality Score v/s % Played plot by selecting models.

    Args:
        df: Dataframe with columns [Model, Clemscore, All(Played), All(Quality Score)]
        LIST: The list of models to show in the plot, updated from frontend

    Returns:
        fig: The matplotlib figure with one annotated scatter point per model
    '''
    short_names = label_map(LIST)

    # Restrict the dataframe to the selected models (column 0 = model name).
    list_columns = list(df.columns)
    df = df[df[list_columns[0]].isin(LIST)]

    # Column 2 = % Played across all selected models; used to scale colors.
    X = df[list_columns[2]]
    fig, ax = plt.subplots()
    for model in LIST:
        short = short_names[model][0]
        same_flag = short_names[model][1]
        model_df = df[df[list_columns[0]] == model]
        x = model_df[list_columns[2]]  # % Played (single-row Series)
        y = model_df[list_columns[3]]  # Quality Score (single-row Series)
        # NOTE(review): divides by max(X) — raises/ZeroDivision-like behavior
        # if every selected model has 0% played, and raises ValueError when
        # LIST selects no rows. Confirm inputs guarantee a positive max.
        color = plt.cm.rainbow(x / max(X)) # Use a colormap for different colors
        plt.scatter(x, y, color=color)
        # Self-play pairs (same_flag) get the label below the point; mixed
        # pairs get it offset to the right.
        # NOTE(review): (x, y) passed to annotate are pandas Series, not
        # scalars — appears to rely on each model_df having exactly one row.
        if same_flag:
            plt.annotate(f'{short}', (x, y), textcoords="offset points", xytext=(0, -15), ha='center', rotation=0)
        else:
            plt.annotate(f'{short}', (x, y), textcoords="offset points", xytext=(20, -3), ha='center', rotation=0)
    ax.grid(which='both', color='grey', linewidth=1, linestyle='-', alpha=0.2)
    ax.set_xticks(np.arange(0,110,10))
    plt.xlim(-10, 110)
    plt.ylim(-10, 110)
    plt.xlabel('% Played')
    plt.ylabel('Quality Score')
    plt.title('Overview of benchmark results')
    plt.show()

    return fig
|
164 |
+
|
165 |
+
|
166 |
+
def label_map(model_list: list) -> dict:
    '''
    Generate a map from long names to short names, to plot them in frontend graph.
    Define the short names in src/assets/text_content.py.

    Args:
        model_list: A list of long model names ("modelA--modelB" or "model")

    Returns:
        short_name: A map from long name to [short label, same-model flag]
                    (flag 1 = self-play pair / single model, 0 = mixed pair)
    '''
    short_name = {}
    for model_name in model_list:
        splits = model_name.split('--')
        # Fall back to the raw segment when no short alias exists, instead
        # of raising KeyError and breaking the Plot tab for new models.
        if len(splits) != 1:
            first = SHORT_NAMES.get(splits[0] + '-', splits[0])
            second = SHORT_NAMES.get(splits[1] + '-', splits[1])
            # Define the short name and indicate there are two different models
            short_name[model_name] = [first + '--' + second, 0]
        else:
            # Define the short name and indicate both models are same
            short_name[model_name] = [SHORT_NAMES.get(splits[0] + '-', splits[0]), 1]

    return short_name
|
189 |
+
|
190 |
+
def filter_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
    '''
    Filter the dataframe based on the search query.

    Matching is a case-insensitive substring test against the first (Model)
    column; multiple queries separated by ";" are OR-combined.

    Args:
        df: Unfiltered dataframe
        query: a string of queries separated by ";"

    Return:
        filtered_df: Dataframe containing searched queries in the 'Model' column
    '''
    # Empty query matches everything — return early instead of scanning the
    # whole table first (the previous version did the work, then discarded it).
    if query == "":
        return df

    model_col = list(df.columns)[0]
    queries = [q.lower() for q in query.split(';')]

    # Keep a model if ANY query is a substring of its lowercased name.
    # A set avoids the duplicate appends of the old nested loop.
    filtered_models = {
        name for name in df[model_col]
        if any(q in name.lower() for q in queries)
    }

    return df[df[model_col].isin(filtered_models)]
|
217 |
+
|
versions/v0.7.csv
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,-,all,all,imagegame,imagegame,imagegame,privateshared,privateshared,privateshared,referencegame,referencegame,referencegame,taboo,taboo,taboo,wordle,wordle,wordle,wordle_withclue,wordle_withclue,wordle_withclue,wordle_withcritic,wordle_withcritic,wordle_withcritic
|
2 |
+
,clemscore,Average % Played,Average Quality Score,% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std)
|
3 |
+
model,,,,,,,,,,,,,,,,,,,,,,,,
|
4 |
+
claude-v1.3-t0.0--claude-v1.3-t0.0,37.07,74.76,49.58,0.0,,,100.0,84.87,18.87,100.0,82.5,38.48,76.92,68.75,38.71,100.0,0.0,0.0,100.0,30.56,40.13,46.43,30.77,48.04
|
5 |
+
falcon-40b-t0.0--falcon-40b-t0.0,0.71,0.95,75.0,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,3.33,50.0,,3.33,100.0,
|
6 |
+
gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0,37.02,85.86,43.12,97.5,60.28,25.95,64.0,72.83,13.07,100.0,55.0,50.38,69.49,71.95,44.79,100.0,0.0,0.0,93.33,28.57,46.0,76.67,13.19,30.16
|
7 |
+
gpt-3.5-turbo-t0.0--gpt-4-t0.0,42.39,86.75,48.87,97.5,64.95,25.45,,,,100.0,57.5,50.06,69.49,62.6,45.15,,,,,,,80.0,10.42,17.42
|
8 |
+
gpt-4-t0.0--gpt-3.5-turbo-t0.0,55.62,82.78,67.19,65.0,81.0,21.54,,,,100.0,47.5,50.57,66.1,93.59,23.45,,,,,,,100.0,46.67,42.92
|
9 |
+
gpt-4-t0.0--gpt-4-t0.0,7.77,96.06,61.93,77.5,89.06,22.28,100.0,90.79,8.2,100.0,75.0,43.85,94.92,76.19,37.45,100.0,3.67,8.4,100.0,49.67,42.09,100.0,49.11,38.46
|
10 |
+
koala-13b-t0.0--koala-13b-t0.0,1.48,14.76,10.0,0.0,,,0.0,,,0.0,,,0.0,,,86.67,0.0,0.0,16.67,20.0,44.72,0.0,,
|
11 |
+
luminous-supreme-t0.0--luminous-supreme-t0.0,0.0,16.24,0.0,0.0,,,0.0,,,0.0,,,0.0,,,100.0,0.0,0.0,3.33,0.0,,10.34,0.0,0.0
|
12 |
+
oasst-12b-t0.0--oasst-12b-t0.0,1.74,20.85,8.33,0.0,,,0.0,,,15.0,33.33,51.64,0.0,,,100.0,0.0,0.0,16.67,0.0,0.0,14.29,0.0,0.0
|
13 |
+
text-davinci-003-t0.0--text-davinci-003-t0.0,15.78,44.5,35.46,57.5,38.7,27.78,16.0,14.1,25.21,82.5,36.36,48.85,28.81,76.47,43.72,66.67,1.25,5.59,36.67,31.36,38.99,23.33,50.0,50.0
|
14 |
+
vicuna-13b-t0.0--vicuna-13b-t0.0,4.24,13.58,31.25,0.0,,,0.0,,,0.0,,,5.08,100.0,0.0,56.67,0.0,0.0,13.33,25.0,50.0,20.0,0.0,0.0
|
versions/v0.8.csv
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,-,all,all,imagegame,imagegame,imagegame,privateshared,privateshared,privateshared,referencegame,referencegame,referencegame,taboo,taboo,taboo,wordle,wordle,wordle,wordle_withclue,wordle_withclue,wordle_withclue,wordle_withcritic,wordle_withcritic,wordle_withcritic
|
2 |
+
,clemscore,Average % Played,Average Quality Score,% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std)
|
3 |
+
model,,,,,,,,,,,,,,,,,,,,,,,,
|
4 |
+
claude-v1.3-t0.0--claude-v1.3-t0.0,37.07,74.76,49.58,0.0,,,100.0,84.87,18.87,100.0,82.5,38.48,76.92,68.75,38.71,100.0,0.0,0.0,100.0,30.56,40.13,46.43,30.77,48.04
|
5 |
+
falcon-40b-t0.0--falcon-40b-t0.0,0.71,0.95,75.0,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,3.33,50.0,,3.33,100.0,
|
6 |
+
gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0,37.02,85.86,43.12,97.5,60.28,25.95,64.0,72.83,13.07,100.0,55.0,50.38,69.49,71.95,44.79,100.0,0.0,0.0,93.33,28.57,46.0,76.67,13.19,30.16
|
7 |
+
gpt-3.5-turbo-t0.0--gpt-4-t0.0,42.39,86.75,48.87,97.5,64.95,25.45,,,,100.0,57.5,50.06,69.49,62.6,45.15,,,,,,,80.0,10.42,17.42
|
8 |
+
gpt-4-t0.0--gpt-3.5-turbo-t0.0,55.62,82.78,67.19,65.0,81.0,21.54,,,,100.0,47.5,50.57,66.1,93.59,23.45,,,,,,,100.0,46.67,42.92
|
9 |
+
gpt-4-t0.0--gpt-4-t0.0,8.88,96.06,61.93,77.5,89.06,22.28,100.0,90.79,8.2,100.0,75.0,43.85,94.92,76.19,37.45,100.0,3.67,8.4,100.0,49.67,42.09,100.0,49.11,38.46
|
10 |
+
koala-13b-t0.0--koala-13b-t0.0,1.48,14.76,10.0,0.0,,,0.0,,,0.0,,,0.0,,,86.67,0.0,0.0,16.67,20.0,44.72,0.0,,
|
11 |
+
luminous-supreme-t0.0--luminous-supreme-t0.0,0.0,16.24,0.0,0.0,,,0.0,,,0.0,,,0.0,,,100.0,0.0,0.0,3.33,0.0,,10.34,0.0,0.0
|
12 |
+
oasst-12b-t0.0--oasst-12b-t0.0,1.74,20.85,8.33,0.0,,,0.0,,,15.0,33.33,51.64,0.0,,,100.0,0.0,0.0,16.67,0.0,0.0,14.29,0.0,0.0
|
13 |
+
text-davinci-003-t0.0--text-davinci-003-t0.0,15.78,44.5,35.46,57.5,38.7,27.78,16.0,14.1,25.21,82.5,36.36,48.85,28.81,76.47,43.72,66.67,1.25,5.59,36.67,31.36,38.99,23.33,50.0,50.0
|
14 |
+
vicuna-13b-t0.0--vicuna-13b-t0.0,4.24,13.58,31.25,0.0,,,0.0,,,0.0,,,5.08,100.0,0.0,56.67,0.0,0.0,13.33,25.0,50.0,20.0,0.0,0.0
|
versions/v0.9.csv
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,-,all,all,imagegame,imagegame,imagegame,privateshared,privateshared,privateshared,referencegame,referencegame,referencegame,taboo,taboo,taboo,wordle,wordle,wordle,wordle_withclue,wordle_withclue,wordle_withclue,wordle_withcritic,wordle_withcritic,wordle_withcritic
|
2 |
+
,clemscore,Average % Played,Average Quality Score,% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std)
|
3 |
+
model,,,,,,,,,,,,,,,,,,,,,,,,
|
4 |
+
claude-v1.3-t0.0--claude-v1.3-t0.0,37.07,74.76,49.58,0.0,,,100.0,84.87,18.87,100.0,82.5,38.48,76.92,68.75,38.71,100.0,0.0,0.0,100.0,30.56,40.13,46.43,30.77,48.04
|
5 |
+
falcon-40b-t0.0--falcon-40b-t0.0,0.71,0.95,75.0,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,3.33,50.0,,3.33,100.0,
|
6 |
+
gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0,37.02,85.86,43.12,97.5,60.28,25.95,64.0,72.83,13.07,100.0,55.0,50.38,69.49,71.95,44.79,100.0,0.0,0.0,93.33,28.57,46.0,76.67,13.19,30.16
|
7 |
+
gpt-3.5-turbo-t0.0--gpt-4-t0.0,42.39,86.75,48.87,97.5,64.95,25.45,,,,100.0,57.5,50.06,69.49,62.6,45.15,,,,,,,80.0,10.42,17.42
|
8 |
+
gpt-4-t0.0--gpt-3.5-turbo-t0.0,55.62,82.78,67.19,65.0,81.0,21.54,,,,100.0,47.5,50.57,66.1,93.59,23.45,,,,,,,100.0,46.67,42.92
|
9 |
+
gpt-4-t0.0--gpt-4-t0.0,9.99,96.06,61.93,77.5,89.06,22.28,100.0,90.79,8.2,100.0,75.0,43.85,94.92,76.19,37.45,100.0,3.67,8.4,100.0,49.67,42.09,100.0,49.11,38.46
|
10 |
+
koala-13b-t0.0--koala-13b-t0.0,1.48,14.76,10.0,0.0,,,0.0,,,0.0,,,0.0,,,86.67,0.0,0.0,16.67,20.0,44.72,0.0,,
|
11 |
+
luminous-supreme-t0.0--luminous-supreme-t0.0,0.0,16.24,0.0,0.0,,,0.0,,,0.0,,,0.0,,,100.0,0.0,0.0,3.33,0.0,,10.34,0.0,0.0
|
12 |
+
oasst-12b-t0.0--oasst-12b-t0.0,1.74,20.85,8.33,0.0,,,0.0,,,15.0,33.33,51.64,0.0,,,100.0,0.0,0.0,16.67,0.0,0.0,14.29,0.0,0.0
|
13 |
+
text-davinci-003-t0.0--text-davinci-003-t0.0,15.78,44.5,35.46,57.5,38.7,27.78,16.0,14.1,25.21,82.5,36.36,48.85,28.81,76.47,43.72,66.67,1.25,5.59,36.67,31.36,38.99,23.33,50.0,50.0
|
14 |
+
vicuna-13b-t0.0--vicuna-13b-t0.0,4.24,13.58,31.25,0.0,,,0.0,,,0.0,,,5.08,100.0,0.0,56.67,0.0,0.0,13.33,25.0,50.0,20.0,0.0,0.0
|
versions/v1.0.csv
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,-,all,all,imagegame,imagegame,imagegame,privateshared,privateshared,privateshared,referencegame,referencegame,referencegame,taboo,taboo,taboo,wordle,wordle,wordle,wordle_withclue,wordle_withclue,wordle_withclue,wordle_withcritic,wordle_withcritic,wordle_withcritic
|
2 |
+
,clemscore,Average % Played,Average Quality Score,% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std),% Played,Quality Score,Quality Score (std)
|
3 |
+
model,,,,,,,,,,,,,,,,,,,,,,,,
|
4 |
+
claude-v1.3-t0.0--claude-v1.3-t0.0,37.07,74.76,49.58,0.0,,,100.0,84.87,18.87,100.0,82.5,38.48,76.92,68.75,38.71,100.0,0.0,0.0,100.0,30.56,40.13,46.43,30.77,48.04
|
5 |
+
falcon-40b-t0.0--falcon-40b-t0.0,0.71,0.95,75.0,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,3.33,50.0,,3.33,100.0,
|
6 |
+
gpt-3.5-turbo-t0.0--gpt-3.5-turbo-t0.0,37.02,85.86,43.12,97.5,60.28,25.95,64.0,72.83,13.07,100.0,55.0,50.38,69.49,71.95,44.79,100.0,0.0,0.0,93.33,28.57,46.0,76.67,13.19,30.16
|
7 |
+
gpt-3.5-turbo-t0.0--gpt-4-t0.0,42.39,86.75,48.87,97.5,64.95,25.45,,,,100.0,57.5,50.06,69.49,62.6,45.15,,,,,,,80.0,10.42,17.42
|
8 |
+
gpt-4-t0.0--gpt-3.5-turbo-t0.0,55.62,82.78,67.19,65.0,81.0,21.54,,,,100.0,47.5,50.57,66.1,93.59,23.45,,,,,,,100.0,46.67,42.92
|
9 |
+
gpt-4-t0.0--gpt-4-t0.0,59.49,96.06,61.93,77.5,89.06,22.28,100.0,90.79,8.2,100.0,75.0,43.85,94.92,76.19,37.45,100.0,3.67,8.4,100.0,49.67,42.09,100.0,49.11,38.46
|
10 |
+
koala-13b-t0.0--koala-13b-t0.0,1.48,14.76,10.0,0.0,,,0.0,,,0.0,,,0.0,,,86.67,0.0,0.0,16.67,20.0,44.72,0.0,,
|
11 |
+
luminous-supreme-t0.0--luminous-supreme-t0.0,0.0,16.24,0.0,0.0,,,0.0,,,0.0,,,0.0,,,100.0,0.0,0.0,3.33,0.0,,10.34,0.0,0.0
|
12 |
+
oasst-12b-t0.0--oasst-12b-t0.0,1.74,20.85,8.33,0.0,,,0.0,,,15.0,33.33,51.64,0.0,,,100.0,0.0,0.0,16.67,0.0,0.0,14.29,0.0,0.0
|
13 |
+
text-davinci-003-t0.0--text-davinci-003-t0.0,15.78,44.5,35.46,57.5,38.7,27.78,16.0,14.1,25.21,82.5,36.36,48.85,28.81,76.47,43.72,66.67,1.25,5.59,36.67,31.36,38.99,23.33,50.0,50.0
|
14 |
+
vicuna-13b-t0.0--vicuna-13b-t0.0,4.24,13.58,31.25,0.0,,,0.0,,,0.0,,,5.08,100.0,0.0,56.67,0.0,0.0,13.33,25.0,50.0,20.0,0.0,0.0
|