Hynek Kydlicek committed
Commit • acc5a5f
1 Parent(s): 2b8570b
new models

Browse files
- app.py +58 -19
- leaderboard/klokan.csv +11 -4
- leaderboard/table.csv +11 -4
- leaderboard/tsp.csv +11 -4
app.py
CHANGED
@@ -12,6 +12,23 @@ import plotly.graph_objects as go
 import pandas as pd
 
 
+
+MODEL_NAME_COST = {
+    "anthropic/claude-2.1": 8,
+    "anthropic/claude-3-haiku": 0.25,
+    "anthropic/claude-3-opus": 15,
+    "anthropic/claude-3-sonnet": 3,
+    "cohere/command-r": 0.5,
+    "google/gemini-pro": 0.12,
+    "google/gemma-7b-it": 0.1,
+    "mistralai/mistral-large": 8,
+    "mistralai/mistral-medium": 2.7,
+    "mistralai/mixtral-8x7b-instruct": 0.7,
+    "openai/gpt-3.5-turbo": 0.5,
+    "openai/gpt-4-1106-preview": 10,
+}
+
+
 def make_default_md():
 
     leaderboard_md = f"""
@@ -34,7 +51,7 @@ def make_arena_leaderboard_md(arena_df):
     total_models = len(arena_df)
 
     leaderboard_md = f"""
-Total #models: **{total_models}**. Last updated:
+Total #models: **{total_models}**. Last updated: Mar 17, 2024.
 """
     return leaderboard_md
 
@@ -59,16 +76,30 @@ def plot_spider(df, title):
         categories[0],
     ] # Ensure the graph is circular by appending the start to the end
     colors = [
-
-
-
-
-
-
-
-
-
-
+        '#1f77b4', # muted blue
+        '#ff7f0e', # safety orange
+        '#2ca02c', # cooked asparagus green
+        '#d62728', # brick red
+        '#9467bd', # muted purple
+        '#8c564b', # chestnut brown
+        '#e377c2', # raspberry yogurt pink
+        '#7f7f7f', # middle gray
+        '#bcbd22', # curry yellow-green
+        '#17becf', # blue-teal
+        '#f7b6d2', # pastel pink
+        '#bcbd22', # faded green
+        '#dbdb8d', # light olive
+        '#17becf', # soft blue
+        '#9edae5', # light blue
+        '#c5b0d5', # soft purple
+        '#c49c94', # dusty rose
+        '#f7b6d2', # pastel pink
+        '#bcbd22', # faded green
+        '#dbdb8d', # light olive
+        '#17becf', # soft blue
+        '#9edae5', # light blue
+        '#c5b0d5', # soft purple
+        '#c49c94', # dusty rose
     ]
 
     # Setting for 1000x1000
@@ -95,7 +126,7 @@ def plot_spider(df, title):
 
     fig_1000.update_layout(
         width=600,
-        height=
+        height=950,
         polar=dict(
             angularaxis=dict(
                 gridwidth=2, # Increase line width for better visibility
@@ -143,14 +174,23 @@ def get_full_table(model_table_df):
     model_table_df.sort_values(by="average", ascending=False, inplace=True)
     model_table_df.insert(0, "rank", np.arange(1, len(model_table_df) + 1))
 
+    # Add cost
+    model_table_df["completion_price"] = model_table_df["model_name"].apply(
+        lambda x: f"{MODEL_NAME_COST.get(x, "N/A")}$"
+    )
+
     # Add link
     model_table_df["model_name"] = model_table_df["model_name"].apply(
         lambda x: openrouter_hyperlink(x)
     )
 
+    # Ensure the dataframe is in the correct order before renaming
+    model_table_df = model_table_df[["rank", "model_name", "completion_price", "klokan", "culture", "analytical", "critical", "verbal", "average"]]
+
     model_table_df.rename(
         columns={
             "model_name": "🤖 Model",
+            "completion_price": "💰 Cost (1M-Tokens)",
             "klokan": "🧮 Klokan-QA",
             "culture": "๐ TSP-Culture",
             "analytical": "๐ TSP-Analytical",
@@ -161,6 +201,7 @@ def get_full_table(model_table_df):
         inplace=True,
    )
 
+
    return model_table_df
 
 
@@ -195,17 +236,15 @@ def build_leaderboard_tab(leaderboard_table_file, klokan_table_file, tsp_table_f
         elem_id="arena_leaderboard_dataframe",
         height=700,
         column_widths=[
-
+            70,
             200,
+            110,
+            120,
+            120,
+            120,
             120,
             100,
             100,
-            150,
-            150,
-            100,
-            150,
-            150,
-            150,
         ],
         wrap=True,
     )
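For context, here is a minimal sketch (not part of the commit) of how the new MODEL_NAME_COST lookup behaves when the completion_price column is built. The two-row DataFrame is a hypothetical stand-in for the real leaderboard table; the fallback shows what models missing from the dict get:

# Illustrative only: mirrors the cost-column pattern added above on a toy table.
import pandas as pd

MODEL_NAME_COST = {
    "openai/gpt-4-1106-preview": 10,  # $ per 1M completion tokens
    "openai/gpt-3.5-turbo": 0.5,
}

df = pd.DataFrame({"model_name": ["openai/gpt-4-1106-preview", "some/unlisted-model"]})

# Same lookup-with-fallback as the commit; note that the committed form
# f"{MODEL_NAME_COST.get(x, "N/A")}$" nests double quotes inside the f-string,
# which requires Python 3.12+ (PEP 701). Single outer quotes avoid that here.
df["completion_price"] = df["model_name"].apply(
    lambda x: f'{MODEL_NAME_COST.get(x, "N/A")}$'
)

print(df["completion_price"].tolist())  # ['10$', 'N/A$']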
leaderboard/klokan.csv
CHANGED
@@ -1,6 +1,13 @@
 ,Elementary 2-3,Elementary 4-5,Elementary 6-7,Elementary 8-9,High School 1-2,High School 3-4
 anthropic/claude-2.1,43.96551724137931,50.35971223021583,39.87730061349693,39.75155279503105,33.33333333333333,14.772727272727273
-
-
-
-
+anthropic/claude-3-haiku,37.5,38.405797101449274,27.160493827160494,31.25,32.70440251572327,25.609756097560975
+anthropic/claude-3-opus,78.57142857142857,65.94202898550725,61.111111111111114,50.0,45.28301886792453,48.78048780487805
+anthropic/claude-3-sonnet,59.82142857142857,60.86956521739131,38.2716049382716,45.0,33.9622641509434,25.609756097560975
+cohere/command-r,24.107142857142858,28.26086956521739,29.01234567901235,21.875,24.528301886792452,18.29268292682927
+google/gemini-pro,33.92857142857143,28.985507246376812,26.543209876543212,28.125,30.18867924528302,23.170731707317074
+google/gemma-7b-it,16.964285714285715,15.942028985507244,16.666666666666664,12.5,20.125786163522015,21.951219512195124
+mistralai/mistral-large,51.78571428571429,53.62318840579711,41.358024691358025,37.5,35.22012578616352,23.170731707317074
+mistralai/mistral-medium,39.285714285714285,34.78260869565217,28.39506172839506,24.375,27.67295597484277,21.951219512195124
+mistralai/mixtral-8x7b-instruct,30.357142857142854,34.05797101449276,28.39506172839506,22.5,22.0125786163522,23.170731707317074
+openai/gpt-3.5-turbo,40.17857142857143,39.85507246376812,33.95061728395062,31.874999999999996,26.41509433962264,19.51219512195122
+openai/gpt-4-1106-preview,71.42857142857143,69.56521739130434,59.876543209876544,52.5,50.314465408805034,42.68292682926829
leaderboard/table.csv
CHANGED
@@ -1,6 +1,13 @@
 model_name,analytical,critical,culture,verbal,klokan
 anthropic/claude-2.1,0.3804034582132565,0.6449912126537786,0.7981770833333334,0.6336336336336337,0.3823884197828709
-
-
-
-
+anthropic/claude-3-haiku,0.3323727185398655,0.6045694200351494,0.81640625,0.6246246246246246,0.32226322263222634
+anthropic/claude-3-opus,0.47262247838616717,0.7644991212653779,0.9244791666666666,0.8018018018018018,0.5781057810578106
+anthropic/claude-3-sonnet,0.37848222862632086,0.6889279437609842,0.8346354166666666,0.6126126126126126,0.44280442804428044
+cohere/command-r,0.27857829010566765,0.5342706502636204,0.7044270833333334,0.4444444444444444,0.24846248462484624
+google/gemini-pro,0.28914505283381364,0.6098418277680141,0.8072916666666666,0.6096096096096096,0.2865928659286593
+google/gemma-7b-it,0.2219020172910663,0.27943760984182775,0.22916666666666666,0.22822822822822822,0.16974169741697417
+mistralai/mistral-large,0.3852065321805956,0.6678383128295254,0.859375,0.6276276276276276,0.4108241082410824
+mistralai/mistral-medium,0.3121998078770413,0.5957820738137083,0.7734375,0.5045045045045045,0.2939729397293973
+mistralai/mixtral-8x7b-instruct,0.2526416906820365,0.5114235500878734,0.7122395833333334,0.43543543543543545,0.26691266912669126
+openai/gpt-3.5-turbo,0.3045148895292987,0.4991212653778559,0.7213541666666666,0.44744744744744747,0.3247232472324723
+openai/gpt-4-1106-preview,0.515850144092219,0.7065026362038664,0.90234375,0.7267267267267268,0.5805658056580566
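The app sorts this table by an "average" column that is not part of this diff. The sketch below is only an assumption about how such an average could be derived from the five category columns above; app.py's real computation may differ:

import pandas as pd

# Assumed layout from the diff above: model_name plus five category scores in [0, 1].
df = pd.read_csv("leaderboard/table.csv")

# Hypothetical averaging: unweighted mean over the five categories.
score_cols = ["analytical", "critical", "culture", "verbal", "klokan"]
df["average"] = df[score_cols].mean(axis=1)

print(df.sort_values("average", ascending=False)[["model_name", "average"]].head(3))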
leaderboard/tsp.csv
CHANGED
@@ -1,6 +1,13 @@
 ,Analytical,Critical,Cultural,Verbal
 anthropic/claude-2.1,38.04034582132565,64.49912126537785,79.81770833333334,63.36336336336337
-
-
-
-
+anthropic/claude-3-haiku,33.23727185398655,60.45694200351493,81.640625,62.46246246246246
+anthropic/claude-3-opus,47.262247838616716,76.44991212653778,92.44791666666666,80.18018018018019
+anthropic/claude-3-sonnet,37.848222862632085,68.89279437609842,83.46354166666666,61.261261261261254
+cohere/command-r,27.857829010566764,53.427065026362044,70.44270833333334,44.44444444444444
+google/gemini-pro,28.914505283381363,60.98418277680141,80.72916666666666,60.96096096096096
+google/gemma-7b-it,22.19020172910663,27.943760984182774,22.916666666666664,22.822822822822822
+mistralai/mistral-large,38.52065321805956,66.78383128295255,85.9375,62.76276276276276
+mistralai/mistral-medium,31.21998078770413,59.57820738137083,77.34375,50.45045045045045
+mistralai/mixtral-8x7b-instruct,25.26416906820365,51.14235500878734,71.22395833333334,43.54354354354354
+openai/gpt-3.5-turbo,30.45148895292987,49.91212653778559,72.13541666666666,44.74474474474475
+openai/gpt-4-1106-preview,51.5850144092219,70.65026362038664,90.234375,72.67267267267268
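As a rough illustration of how a per-category file like this could be turned into the spider chart that plot_spider draws, here is a sketch under stated assumptions: the file path and column layout come from the diff above, only the 600x950 width/height mirrors the committed update_layout call, and everything else is a guess rather than the app's actual code:

import pandas as pd
import plotly.graph_objects as go

# Assumed layout: first column holds model names, remaining columns are TSP categories in percent.
df = pd.read_csv("leaderboard/tsp.csv", index_col=0)

categories = list(df.columns)
fig = go.Figure()
for model_name, row in df.iterrows():
    values = row.tolist()
    # Repeat the first point so each polygon closes, as app.py does with categories[0].
    fig.add_trace(go.Scatterpolar(
        r=values + values[:1],
        theta=categories + categories[:1],
        name=model_name,
    ))

# 600x950 mirrors the committed layout values; the radial range is an assumption.
fig.update_layout(width=600, height=950, polar=dict(radialaxis=dict(range=[0, 100])))
fig.show()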