luisrguerra committed on
Commit
512c89a
1 Parent(s): 3b7ce3b

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +328 -65
index.html CHANGED
@@ -7,74 +7,337 @@
7
  <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
8
 
9
  </head>
10
- <body>
11
- <canvas id="radarChart" height="750"></canvas>
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  <script>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  let data = {
15
- labels: ['MMLU', 'MT-bench','Arena Elo'],
16
- datasets: [
17
- {
18
- label: 'GPT-4-Turbo',
19
- data: [null, 93.2,124.9],
20
- borderWidth: 2
21
- },
22
- {
23
- label: 'GPT-4-0314',
24
- data: [86.4, 86.4,119.0],
25
- borderWidth: 2
26
- },
27
- {
28
- label: 'GPT-3.5-Turbo-0314',
29
- data: [70.0, 79.4,112.3],
30
- borderWidth: 2
31
- },
32
- {
33
- label: 'Mistral Medium',
34
- data: [75.3, 86.1,115.0],
35
- borderWidth: 2
36
- },
37
- {
38
- label: 'Mixtral 8x7B Instruct v0.1',
39
- data: [70.6, 83.0,112.3],
40
- borderWidth: 2
41
- },
42
- {
43
- label: 'Claude 2.0',
44
- data: [78.5, 80.6,113.1],
45
- borderWidth: 2
46
- },
47
- {
48
- label: 'Claude 1.0',
49
- data: [77.0, 79.0,114.9],
50
- borderWidth: 2
51
- },
52
- {
53
- label: 'Claude Instant 1',
54
- data: [73.4, 78.5,110.9],
55
- borderWidth: 2
56
- },
57
- {
58
- label: 'Gemini Pro',
59
- data: [71.8, null,111.4],
60
- borderWidth: 2
61
- },
62
- {
63
- label: 'Yi 34B Chat',
64
- data: [73.5, null,111.1],
65
- borderWidth: 2
66
- },
67
- {
68
- label: 'Falcon 180B Chat',
69
- data: [68.0, null,103.1],
70
- borderWidth: 2
71
- },
72
- {
73
- label: 'LLama 2 70B Chat',
74
- data: [63.0, 68.6,107.9],
75
- borderWidth: 2
76
- },
77
- ]
78
  };
79
 
80
  let options = {
 
7
  <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
8
 
9
  </head>
 
 
10
 
11
+ <style>
12
+ body{
13
+ font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
14
+ color:hsl(0, 0%, 25%);
15
+ }
16
+ table{
17
+ width: 100%;
18
+ }
19
+ table, th, td {
20
+ border: 1px solid;
21
+ border-color: hsl(0, 0%, 60%);
22
+ border-collapse: collapse;
23
+ }
24
+ th, td {
25
+ padding: 6px;
26
+ text-align: left;
27
+ }
28
+ </style>
29
+
30
+ <body>
31
+ <div><canvas id="radarChart" height="750"></canvas></div>
32
+ <p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
33
+ <p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
34
+ <div id="tableBenchMark"></div>
35
+
36
  <script>
37
+ const benchmarkData = [
38
+ {
39
+ name: 'gpt-4-1106-preview',
40
+ mmlu: null,
41
+ mtbench: 9.32,
42
+ arenaelo:1249,
43
+ gsm8k: null,
44
+ winogrande: null,
45
+ truthfulqa: null,
46
+ hellaswag:null,
47
+ arc:null,
48
+ parameters: 'Probably smaller than GPT-4',
49
+ organization: 'OpenAI',
50
+ license: 'Proprietary',
51
+ },
52
+ {
53
+ name: 'gpt-4-0613',
54
+ mmlu: null,
55
+ mtbench: 9.18,
56
+ arenaelo:1160,
57
+ gsm8k: 96.8,
58
+ winogrande: null,
59
+ truthfulqa: null,
60
+ hellaswag:null,
61
+ arc:null,
62
+ parameters: '1T (questionable)',
63
+ organization: 'OpenAI',
64
+ license: 'Proprietary',
65
+ },
66
+ {
67
+ name: 'gpt-4-0314',
68
+ mmlu: 86.4,
69
+ mtbench: 8.96,
70
+ arenaelo:1190,
71
+ gsm8k: 92,
72
+ winogrande: 87.5,
73
+ truthfulqa: 59,
74
+ hellaswag:95.4,
75
+ arc:96.3,
76
+ parameters: '1T (questionable)',
77
+ organization: 'OpenAI',
78
+ license: 'Proprietary',
79
+ },
80
+ {
81
+ name: 'gpt-3.5-turbo-0613',
82
+ mmlu: null,
83
+ mtbench: 8.39,
84
+ arenaelo:1116,
85
+ gsm8k: null,
86
+ winogrande: null,
87
+ truthfulqa: null,
88
+ hellaswag:null,
89
+ arc:null,
90
+ parameters: '20B - 175B (not confirmed)',
91
+ organization: 'OpenAI',
92
+ license: 'Proprietary',
93
+ },
94
+ {
95
+ name: 'gpt-3.5-turbo-0301',
96
+ mmlu: 70,
97
+ mtbench: 7.94,
98
+ arenaelo:1104,
99
+ gsm8k: 57.1,
100
+ winogrande: 81.6,
101
+ truthfulqa: 47,
102
+ hellaswag:85.5,
103
+ arc:85.2,
104
+ parameters: '20B - 175B (not confirmed)',
105
+ organization: 'OpenAI',
106
+ license: 'Proprietary',
107
+ },
108
+ {
109
+ name: 'Claude 2.1',
110
+ mmlu: null,
111
+ mtbench: 8.18,
112
+ arenaelo:1119,
113
+ gsm8k: 88,
114
+ winogrande: null,
115
+ truthfulqa: null,
116
+ hellaswag:null,
117
+ arc:null,
118
+ parameters: '137B',
119
+ organization: 'Anthropic',
120
+ license: 'Proprietary',
121
+ },
122
+ {
123
+ name: 'Claude 2.0',
124
+ mmlu: 78.5,
125
+ mtbench: 8.06,
126
+ arenaelo:1131,
127
+ gsm8k: 71.2,
128
+ winogrande: null,
129
+ truthfulqa: 69,
130
+ hellaswag:null,
131
+ arc:91,
132
+ parameters: '137B',
133
+ organization: 'Anthropic',
134
+ license: 'Proprietary',
135
+ },
136
+ {
137
+ name: 'Claude 1.0',
138
+ mmlu: 77,
139
+ mtbench: 7.9,
140
+ arenaelo:1149,
141
+ gsm8k: null,
142
+ winogrande: null,
143
+ truthfulqa: null,
144
+ hellaswag:null,
145
+ arc:null,
146
+ parameters: null,
147
+ organization: 'Anthropic',
148
+ license: 'Proprietary',
149
+ },
150
+ {
151
+ name: 'Claude Instant 1',
152
+ mmlu: 73.4,
153
+ mtbench: 7.85,
154
+ arenaelo:1109,
155
+ gsm8k: 86.7,
156
+ winogrande: null,
157
+ truthfulqa: null,
158
+ hellaswag:null,
159
+ arc:null,
160
+ parameters: null,
161
+ organization: 'Anthropic',
162
+ license: 'Proprietary',
163
+ },
164
+ {
165
+ name: 'Gemini Ultra',
166
+ mmlu: 83.7,
167
+ mtbench: null,
168
+ arenaelo:null,
169
+ gsm8k: 94.4,
170
+ winogrande: null,
171
+ truthfulqa: null,
172
+ hellaswag:87.8,
173
+ arc:null,
174
+ parameters: null,
175
+ organization: 'Google',
176
+ license: 'Proprietary',
177
+ },
178
+ {
179
+ name: 'Gemini Pro',
180
+ mmlu: 71.8,
181
+ mtbench: null,
182
+ arenaelo:1114,
183
+ gsm8k: 86.5,
184
+ winogrande: null,
185
+ truthfulqa: null,
186
+ hellaswag:84.7,
187
+ arc:null,
188
+ parameters: null,
189
+ organization: 'Google',
190
+ license: 'Proprietary',
191
+ },
192
+ {
193
+ name: 'Mistral Medium',
194
+ mmlu: 75.3,
195
+ mtbench: 8.61,
196
+ arenaelo:1150,
197
+ gsm8k: null,
198
+ winogrande: null,
199
+ truthfulqa: null,
200
+ hellaswag:null,
201
+ arc:null,
202
+ parameters: null,
203
+ organization: 'Mistral',
204
+ license: 'Proprietary',
205
+ },
206
+ {
207
+ name: 'Mixtral 8x7B Instruct',
208
+ mmlu: 70.6,
209
+ mtbench: 8.3,
210
+ arenaelo:1123,
211
+ gsm8k: 58.4,
212
+ winogrande: 81.2,
213
+ truthfulqa: 46.7,
214
+ hellaswag:86.7,
215
+ arc:70.14,
216
+ parameters: '45B (MOE)',
217
+ organization: 'Mistral',
218
+ license: 'Apache 2.0',
219
+ },
220
+ {
221
+ name: 'Grok 1',
222
+ mmlu: 73,
223
+ mtbench: null,
224
+ arenaelo:null,
225
+ gsm8k: 72.9,
226
+ winogrande: null,
227
+ truthfulqa: null,
228
+ hellaswag:null,
229
+ arc:null,
230
+ parameters: "33B",
231
+ organization: 'xAI',
232
+ license: 'Proprietary',
233
+ },
234
+ {
235
+ name: 'Yi 34B',
236
+ mmlu: 73.5,
237
+ mtbench: null,
238
+ arenaelo:1111,
239
+ gsm8k: 50.64,
240
+ winogrande: 83.03,
241
+ truthfulqa: 56.23,
242
+ hellaswag:85.69,
243
+ arc:64.59,
244
+ parameters: '34B',
245
+ organization: '01 AI',
246
+ license: 'Yi License',
247
+ },
248
+ ]
249
+
250
/**
 * Render the benchmark records as an HTML table inside the element whose
 * id is "tableBenchMark".
 *
 * Missing values (null / undefined) are shown as "-" instead of the literal
 * text "null", and every value is HTML-escaped before being inserted, so a
 * model name containing "<" or "&" cannot break the markup.
 *
 * @param {Array<Object>} data - Benchmark records; each record is read for the
 *   keys listed in `columns` below.
 * @returns {void}
 */
function setBenchmarkTable(data) {
  // [record key, column header] pairs, in display order.
  const columns = [
    ['name', 'Name'],
    ['mmlu', 'MMLU'],
    ['mtbench', 'MT-Bench'],
    ['arenaelo', 'Arena Elo'],
    ['gsm8k', 'GSM-8k'],
    ['winogrande', 'Winogrande'],
    ['truthfulqa', 'TruthfulQA'],
    ['hellaswag', 'HellaSwag'],
    ['arc', 'ARC'],
    ['parameters', 'Parameters'],
    ['organization', 'Organization'],
    ['license', 'License'],
  ];

  const htmlEntities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  const escapeHtml = (value) =>
    String(value).replace(/[&<>"']/g, (character) => htmlEntities[character]);

  // `== null` intentionally matches both null and undefined.
  const formatCell = (value) => (value == null ? '-' : escapeHtml(value));

  const headerRow =
    '<tr>' + columns.map(([, title]) => '<th>' + title + '</th>').join('') + '</tr>';

  const bodyRows = data
    .map(
      (item) =>
        '<tr>' +
        columns.map(([key]) => '<td>' + formatCell(item[key]) + '</td>').join('') +
        '</tr>'
    )
    .join('');

  const tableHTML = '<table border="1">' + headerRow + bodyRows + '</table>';
  document.getElementById('tableBenchMark').innerHTML = tableHTML;
}
287
+
288
+ setBenchmarkTable(benchmarkData);
289
+
290
/**
 * Find the largest score recorded for one benchmark across all records.
 *
 * Only finite numbers are considered: null/undefined (benchmark not
 * reported), NaN, Infinity and non-number values such as strings are skipped,
 * so the result is always a plain number.
 *
 * @param {string} benchmarkName - Record key to inspect (e.g. "mmlu").
 * @param {Array<Object>} data - Benchmark records.
 * @returns {number} The highest positive value found, or 0 when no record
 *   has a positive finite value for that key.
 */
function getBenchmarkMaxValue(benchmarkName, data) {
  let maxValue = 0;
  for (const item of data) {
    const value = item[benchmarkName];
    if (typeof value === 'number' && Number.isFinite(value) && value > maxValue) {
      maxValue = value;
    }
  }
  return maxValue;
}
300
+
301
/**
 * Build the Chart.js radar datasets from the benchmark records.
 *
 * Each benchmark is rescaled so that the best score among all records
 * becomes 100. A score a record does not report (null / undefined) stays
 * null rather than being multiplied into 0, so a missing result is not drawn
 * as a real score of zero. If no record has a positive value for a benchmark,
 * every point on that axis is null.
 *
 * @param {Array<Object>} data - Benchmark records; `name` and the eight
 *   benchmark keys listed below are read from each one.
 * @returns {Array<{label: string, data: Array<number|null>, borderWidth: number}>}
 *   One dataset per record, with points in the order of `benchmarkKeys`.
 */
function getDataSetRadar(data) {
  // Order must match the `labels` array passed to the chart.
  const benchmarkKeys = [
    'mmlu',
    'mtbench',
    'arenaelo',
    'gsm8k',
    'winogrande',
    'truthfulqa',
    'hellaswag',
    'arc',
  ];

  // One scale factor per benchmark; null when there is nothing to scale against.
  const multipliers = benchmarkKeys.map((key) => {
    const maxValue = getBenchmarkMaxValue(key, data);
    return maxValue > 0 ? 100 / maxValue : null;
  });

  return data.map((item) => ({
    label: item.name,
    data: benchmarkKeys.map((key, index) => {
      const value = item[key];
      const multiplier = multipliers[index];
      // `== null` intentionally matches both null and undefined.
      if (value == null || multiplier === null) {
        return null;
      }
      return value * multiplier;
    }),
    borderWidth: 2,
  }));
}
337
// Build the radar datasets once and reuse them below; previously the result
// was stored here but getDataSetRadar() was called a second time for `data`.
const dataSetRadar = getDataSetRadar(benchmarkData);

// Chart data: `labels` name the radar axes, in the same order as the points
// inside each dataset's `data` array.
let data = {
  labels: ['MMLU', 'MT-bench','Arena Elo','GSM-8k','Winogrande','TruthfulQA','HellaSwag','ARC'],
  datasets: dataSetRadar
};
342
 
343
  let options = {