luisrguerra
committed on
Commit
•
512c89a
1
Parent(s):
3b7ce3b
Update index.html
Browse files- index.html +328 -65
index.html
CHANGED
@@ -7,74 +7,337 @@
|
|
7 |
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
8 |
|
9 |
</head>
|
10 |
-
<body>
|
11 |
-
<canvas id="radarChart" height="750"></canvas>
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
<script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
let data = {
|
15 |
-
labels: ['MMLU', 'MT-bench','Arena Elo'],
|
16 |
-
datasets:
|
17 |
-
{
|
18 |
-
label: 'GPT-4-Turbo',
|
19 |
-
data: [null, 93.2,124.9],
|
20 |
-
borderWidth: 2
|
21 |
-
},
|
22 |
-
{
|
23 |
-
label: 'GPT-4-0314',
|
24 |
-
data: [86.4, 86.4,119.0],
|
25 |
-
borderWidth: 2
|
26 |
-
},
|
27 |
-
{
|
28 |
-
label: 'GPT-3.5-Turbo-0314',
|
29 |
-
data: [70.0, 79.4,112.3],
|
30 |
-
borderWidth: 2
|
31 |
-
},
|
32 |
-
{
|
33 |
-
label: 'Mistral Medium',
|
34 |
-
data: [75.3, 86.1,115.0],
|
35 |
-
borderWidth: 2
|
36 |
-
},
|
37 |
-
{
|
38 |
-
label: 'Mixtral 8x7B Instruct v0.1',
|
39 |
-
data: [70.6, 83.0,112.3],
|
40 |
-
borderWidth: 2
|
41 |
-
},
|
42 |
-
{
|
43 |
-
label: 'Claude 2.0',
|
44 |
-
data: [78.5, 80.6,113.1],
|
45 |
-
borderWidth: 2
|
46 |
-
},
|
47 |
-
{
|
48 |
-
label: 'Claude 1.0',
|
49 |
-
data: [77.0, 79.0,114.9],
|
50 |
-
borderWidth: 2
|
51 |
-
},
|
52 |
-
{
|
53 |
-
label: 'Claude Instant 1',
|
54 |
-
data: [73.4, 78.5,110.9],
|
55 |
-
borderWidth: 2
|
56 |
-
},
|
57 |
-
{
|
58 |
-
label: 'Gemini Pro',
|
59 |
-
data: [71.8, null,111.4],
|
60 |
-
borderWidth: 2
|
61 |
-
},
|
62 |
-
{
|
63 |
-
label: 'Yi 34B Chat',
|
64 |
-
data: [73.5, null,111.1],
|
65 |
-
borderWidth: 2
|
66 |
-
},
|
67 |
-
{
|
68 |
-
label: 'Falcon 180B Chat',
|
69 |
-
data: [68.0, null,103.1],
|
70 |
-
borderWidth: 2
|
71 |
-
},
|
72 |
-
{
|
73 |
-
label: 'LLama 2 70B Chat',
|
74 |
-
data: [63.0, 68.6,107.9],
|
75 |
-
borderWidth: 2
|
76 |
-
},
|
77 |
-
]
|
78 |
};
|
79 |
|
80 |
let options = {
|
|
|
7 |
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
|
8 |
|
9 |
</head>
|
|
|
|
|
10 |
|
11 |
+
<style>
|
12 |
+
body{
|
13 |
+
font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
|
14 |
+
color:hsl(0, 0%, 25%);
|
15 |
+
}
|
16 |
+
table{
|
17 |
+
width: 100%;
|
18 |
+
}
|
19 |
+
table, th, td {
|
20 |
+
border: 1px solid;
|
21 |
+
border-color: hsl(0, 0%, 60%);
|
22 |
+
border-collapse: collapse;
|
23 |
+
}
|
24 |
+
th, td {
|
25 |
+
padding: 6px;
|
26 |
+
text-align: left;
|
27 |
+
}
|
28 |
+
</style>
|
29 |
+
|
30 |
+
<body>
|
31 |
+
<div><canvas id="radarChart" height="750"></canvas></div>
|
32 |
+
<p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
|
33 |
+
<p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
|
34 |
+
<div id="tableBenchMark"></div>
|
35 |
+
|
36 |
<script>
|
37 |
+
const benchmarkData = [
|
38 |
+
{
|
39 |
+
name: 'gpt-4-1106-preview',
|
40 |
+
mmlu: null,
|
41 |
+
mtbench: 9.32,
|
42 |
+
arenaelo:1249,
|
43 |
+
gsm8k: null,
|
44 |
+
winogrande: null,
|
45 |
+
truthfulqa: null,
|
46 |
+
hellaswag:null,
|
47 |
+
arc:null,
|
48 |
+
parameters: 'Probably smaller than GPT-4',
|
49 |
+
organization: 'OpenAI',
|
50 |
+
license: 'Proprietary',
|
51 |
+
},
|
52 |
+
{
|
53 |
+
name: 'gpt-4-0613',
|
54 |
+
mmlu: null,
|
55 |
+
mtbench: 9.18,
|
56 |
+
arenaelo:1160,
|
57 |
+
gsm8k: 96.8,
|
58 |
+
winogrande: null,
|
59 |
+
truthfulqa: null,
|
60 |
+
hellaswag:null,
|
61 |
+
arc:null,
|
62 |
+
parameters: '1T (questionable)',
|
63 |
+
organization: 'OpenAI',
|
64 |
+
license: 'Proprietary',
|
65 |
+
},
|
66 |
+
{
|
67 |
+
name: 'gpt-4-0314',
|
68 |
+
mmlu: 86.4,
|
69 |
+
mtbench: 8.96,
|
70 |
+
arenaelo:1190,
|
71 |
+
gsm8k: 92,
|
72 |
+
winogrande: 87.5,
|
73 |
+
truthfulqa: 59,
|
74 |
+
hellaswag:95.4,
|
75 |
+
arc:96.3,
|
76 |
+
parameters: '1T (questionable)',
|
77 |
+
organization: 'OpenAI',
|
78 |
+
license: 'Proprietary',
|
79 |
+
},
|
80 |
+
{
|
81 |
+
name: 'gpt-3.5-turbo-0613',
|
82 |
+
mmlu: null,
|
83 |
+
mtbench: 8.39,
|
84 |
+
arenaelo:1116,
|
85 |
+
gsm8k: null,
|
86 |
+
winogrande: null,
|
87 |
+
truthfulqa: null,
|
88 |
+
hellaswag:null,
|
89 |
+
arc:null,
|
90 |
+
parameters: '20B - 175B (not confirmed)',
|
91 |
+
organization: 'OpenAI',
|
92 |
+
license: 'Proprietary',
|
93 |
+
},
|
94 |
+
{
|
95 |
+
name: 'gpt-3.5-turbo-0301',
|
96 |
+
mmlu: 70,
|
97 |
+
mtbench: 7.94,
|
98 |
+
arenaelo:1104,
|
99 |
+
gsm8k: 57.1,
|
100 |
+
winogrande: 81.6,
|
101 |
+
truthfulqa: 47,
|
102 |
+
hellaswag:85.5,
|
103 |
+
arc:85.2,
|
104 |
+
parameters: '20B - 175B (not confirmed)',
|
105 |
+
organization: 'OpenAI',
|
106 |
+
license: 'Proprietary',
|
107 |
+
},
|
108 |
+
{
|
109 |
+
name: 'Claude 2.1',
|
110 |
+
mmlu: null,
|
111 |
+
mtbench: 8.18,
|
112 |
+
arenaelo:1119,
|
113 |
+
gsm8k: 88,
|
114 |
+
winogrande: null,
|
115 |
+
truthfulqa: null,
|
116 |
+
hellaswag:null,
|
117 |
+
arc:null,
|
118 |
+
parameters: '137B',
|
119 |
+
organization: 'Anthropic',
|
120 |
+
license: 'Proprietary',
|
121 |
+
},
|
122 |
+
{
|
123 |
+
name: 'Claude 2.0',
|
124 |
+
mmlu: 78.5,
|
125 |
+
mtbench: 8.06,
|
126 |
+
arenaelo:1131,
|
127 |
+
gsm8k: 71.2,
|
128 |
+
winogrande: null,
|
129 |
+
truthfulqa: 69,
|
130 |
+
hellaswag:null,
|
131 |
+
arc:91,
|
132 |
+
parameters: '137B',
|
133 |
+
organization: 'Anthropic',
|
134 |
+
license: 'Proprietary',
|
135 |
+
},
|
136 |
+
{
|
137 |
+
name: 'Claude 1.0',
|
138 |
+
mmlu: 77,
|
139 |
+
mtbench: 7.9,
|
140 |
+
arenaelo:1149,
|
141 |
+
gsm8k: null,
|
142 |
+
winogrande: null,
|
143 |
+
truthfulqa: null,
|
144 |
+
hellaswag:null,
|
145 |
+
arc:null,
|
146 |
+
parameters: null,
|
147 |
+
organization: 'Anthropic',
|
148 |
+
license: 'Proprietary',
|
149 |
+
},
|
150 |
+
{
|
151 |
+
name: 'Claude Instant 1',
|
152 |
+
mmlu: 73.4,
|
153 |
+
mtbench: 7.85,
|
154 |
+
arenaelo:1109,
|
155 |
+
gsm8k: 86.7,
|
156 |
+
winogrande: null,
|
157 |
+
truthfulqa: null,
|
158 |
+
hellaswag:null,
|
159 |
+
arc:null,
|
160 |
+
parameters: null,
|
161 |
+
organization: 'Anthropic',
|
162 |
+
license: 'Proprietary',
|
163 |
+
},
|
164 |
+
{
|
165 |
+
name: 'Gemini Ultra',
|
166 |
+
mmlu: 83.7,
|
167 |
+
mtbench: null,
|
168 |
+
arenaelo:null,
|
169 |
+
gsm8k: 94.4,
|
170 |
+
winogrande: null,
|
171 |
+
truthfulqa: null,
|
172 |
+
hellaswag:87.8,
|
173 |
+
arc:null,
|
174 |
+
parameters: null,
|
175 |
+
organization: 'Google',
|
176 |
+
license: 'Proprietary',
|
177 |
+
},
|
178 |
+
{
|
179 |
+
name: 'Gemini Pro',
|
180 |
+
mmlu: 71.8,
|
181 |
+
mtbench: null,
|
182 |
+
arenaelo:1114,
|
183 |
+
gsm8k: 86.5,
|
184 |
+
winogrande: null,
|
185 |
+
truthfulqa: null,
|
186 |
+
hellaswag:84.7,
|
187 |
+
arc:null,
|
188 |
+
parameters: null,
|
189 |
+
organization: 'Google',
|
190 |
+
license: 'Proprietary',
|
191 |
+
},
|
192 |
+
{
|
193 |
+
name: 'Mistral Medium',
|
194 |
+
mmlu: 75.3,
|
195 |
+
mtbench: 8.61,
|
196 |
+
arenaelo:1150,
|
197 |
+
gsm8k: null,
|
198 |
+
winogrande: null,
|
199 |
+
truthfulqa: null,
|
200 |
+
hellaswag:null,
|
201 |
+
arc:null,
|
202 |
+
parameters: null,
|
203 |
+
organization: 'Mistral',
|
204 |
+
license: 'Proprietary',
|
205 |
+
},
|
206 |
+
{
|
207 |
+
name: 'Mixtral 8x7B Instruct',
|
208 |
+
mmlu: 70.6,
|
209 |
+
mtbench: 8.3,
|
210 |
+
arenaelo:1123,
|
211 |
+
gsm8k: 58.4,
|
212 |
+
winogrande: 81.2,
|
213 |
+
truthfulqa: 46.7,
|
214 |
+
hellaswag:86.7,
|
215 |
+
arc:70.14,
|
216 |
+
parameters: '45B (MOE)',
|
217 |
+
organization: 'Mistral',
|
218 |
+
license: 'Apache 2.0',
|
219 |
+
},
|
220 |
+
{
|
221 |
+
name: 'Grok 1',
|
222 |
+
mmlu: 73,
|
223 |
+
mtbench: null,
|
224 |
+
arenaelo:null,
|
225 |
+
gsm8k: 72.9,
|
226 |
+
winogrande: null,
|
227 |
+
truthfulqa: null,
|
228 |
+
hellaswag:null,
|
229 |
+
arc:null,
|
230 |
+
parameters: "33B",
|
231 |
+
organization: 'xAI',
|
232 |
+
license: 'Proprietary',
|
233 |
+
},
|
234 |
+
{
|
235 |
+
name: 'Yi 34B',
|
236 |
+
mmlu: 73.5,
|
237 |
+
mtbench: null,
|
238 |
+
arenaelo:1111,
|
239 |
+
gsm8k: 50.64,
|
240 |
+
winogrande: 83.03,
|
241 |
+
truthfulqa: 56.23,
|
242 |
+
hellaswag:85.69,
|
243 |
+
arc:64.59,
|
244 |
+
parameters: '34B',
|
245 |
+
organization: '01 AI',
|
246 |
+
license: 'Yi License',
|
247 |
+
},
|
248 |
+
]
|
249 |
+
|
250 |
+
/**
 * Renders the benchmark records as an HTML table and writes it into the
 * element whose id is "tableBenchMark".
 *
 * One row is produced per record, with the columns in the fixed order
 * listed in `columns` below.
 *
 * @param {Array<Object>} data - Benchmark records; each record is read by the
 *   keys listed in `columns` (name, mmlu, mtbench, ...).
 * @returns {void}
 */
function setBenchmarkTable(data) {
  // [record key, column heading] pairs, in display order.
  const columns = [
    ['name', 'Name'],
    ['mmlu', 'MMLU'],
    ['mtbench', 'MT-Bench'],
    ['arenaelo', 'Arena Elo'],
    ['gsm8k', 'GSM-8k'],
    ['winogrande', 'Winogrande'],
    ['truthfulqa', 'TruthfulQA'],
    ['hellaswag', 'HellaSwag'],
    ['arc', 'ARC'],
    ['parameters', 'Parameters'],
    ['organization', 'Organization'],
    ['license', 'License'],
  ];

  // The markup is assigned through innerHTML, so cell text must be escaped
  // to keep characters such as "<" or "&" from being parsed as HTML.
  const escapeHtml = (value) => String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');

  // Missing values (null/undefined) are shown as a dash instead of the
  // literal text "null"/"undefined" that string concatenation would produce.
  const formatCell = (value) => (value == null ? '-' : escapeHtml(value));

  const headerRow = `<tr>${columns.map(([, title]) => `<th>${title}</th>`).join('')}</tr>`;
  const bodyRows = data
    .map((item) => {
      const cells = columns.map(([key]) => `<td>${formatCell(item[key])}</td>`).join('');
      return `<tr>${cells}</tr>`;
    })
    .join('');

  document.getElementById('tableBenchMark').innerHTML =
    `<table border="1">${headerRow}${bodyRows}</table>`;
}
|
287 |
+
|
288 |
+
setBenchmarkTable(benchmarkData);
|
289 |
+
|
290 |
+
/**
 * Finds the largest value recorded for one benchmark across all records.
 *
 * Only finite numbers are considered; null, undefined, strings and NaN are
 * skipped explicitly rather than through comparison coercion.
 *
 * @param {string} benchmarkName - Key of the benchmark on each record (e.g. "mmlu").
 * @param {Array<Object>} data - Benchmark records.
 * @returns {number} The largest positive value found, or 0 when no record
 *   has a positive numeric value for the benchmark.
 */
function getBenchmarkMaxValue(benchmarkName, data) {
  let maxValue = 0;
  for (const item of data) {
    const value = item[benchmarkName];
    if (typeof value === 'number' && Number.isFinite(value) && value > maxValue) {
      maxValue = value;
    }
  }
  return maxValue;
}
|
300 |
+
|
301 |
+
/**
 * Builds the radar-chart datasets from the benchmark records.
 *
 * Each benchmark is rescaled so that the best score among all records maps
 * to 100, which puts benchmarks with different ranges on one shared axis.
 *
 * Missing scores stay `null` instead of being multiplied (null * n === 0),
 * so a model without a score is not reported as scoring zero.
 *
 * @param {Array<Object>} data - Benchmark records with a `name` and one
 *   property per benchmark key listed in `benchmarkKeys`.
 * @returns {Array<{label: string, data: Array<number|null>, borderWidth: number}>}
 *   One dataset per record; `data` follows the order of `benchmarkKeys`.
 */
function getDataSetRadar(data) {
  // Order must match the chart's axis labels.
  const benchmarkKeys = [
    'mmlu',
    'mtbench',
    'arenaelo',
    'gsm8k',
    'winogrande',
    'truthfulqa',
    'hellaswag',
    'arc',
  ];

  // A benchmark with no positive score has no meaningful scale; use null
  // rather than dividing by zero.
  const multipliers = benchmarkKeys.map((key) => {
    const maxValue = getBenchmarkMaxValue(key, data);
    return maxValue > 0 ? 100 / maxValue : null;
  });

  return data.map((item) => ({
    label: item.name,
    data: benchmarkKeys.map((key, index) => {
      const value = item[key];
      const multiplier = multipliers[index];
      return value == null || multiplier === null ? null : value * multiplier;
    }),
    borderWidth: 2,
  }));
}
|
337 |
+
// Normalized datasets, computed once and reused by the chart configuration.
const dataSetRadar = getDataSetRadar(benchmarkData);

// Chart data: one axis label per benchmark, in the same order as the values
// that getDataSetRadar places in each dataset's `data` array.
let data = {
  labels: ['MMLU', 'MT-bench', 'Arena Elo', 'GSM-8k', 'Winogrande', 'TruthfulQA', 'HellaSwag', 'ARC'],
  datasets: dataSetRadar
};
|
342 |
|
343 |
let options = {
|