luisrguerra committed on
Commit
04313bf
1 Parent(s): 38e21f8

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +51 -11
index.html CHANGED
@@ -38,6 +38,7 @@
38
  <div><canvas id="winograndeChart" height="150"></canvas></div>
39
  <div><canvas id="arcChart" height="150"></canvas></div>
40
  <div><canvas id="mtbenchChart" height="150"></canvas></div>
 
41
  <p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
42
  <p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
43
  <p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p>
@@ -68,13 +69,12 @@
68
  </ul>
69
  <h4>Models with the best cost benefit:</h4>
70
  <ul>
71
- <li>Gemini Pro 1.0</li>
72
  <li>Gemini Pro 1.5</li>
 
73
  <li>gpt-3.5-turbo-0613</li>
74
- <li>gpt-3.5-turbo-1106</li>
75
  <li>Claude 3 Haiku</li>
76
- <li>Claude Instant 1-1.2</li>
77
  <li>Mixtral 8x7B Instruct</li>
 
78
  </ul>
79
  <h4>Models with fewer hallucinations:</h4>
80
  <ul>
@@ -101,6 +101,7 @@
101
  <li>Mistral 7B</li>
102
  <li>Yi 34B</li>
103
  <li>Grok 1</li>
 
104
  <li>Llama 2 7-70B</li>
105
  <li>Gemma 2-7B</li>
106
  </ul>
@@ -123,7 +124,7 @@
123
  <li>gpt-4-0314 - OpenAI</li>
124
  <li>gpt-3.5-turbo-1106 - OpenAI</li>
125
  <li>gpt-4-0314 - OpenAI</li>
126
- <li>Gemini Pro 1.0 - Openrouter with compatibility with OpenAI api, Google api service.</li>
127
  <li>Claude 3 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
128
  <li>Claude 2-2.1 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
129
  <li>Claude Instant 1-1.2 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
@@ -135,24 +136,22 @@
135
  <ul>
136
  <li>Claude 3 Opus</li>
137
  </ul>
138
- <h4>Models with the same level of GPT-4:</h4>
139
  <ul>
140
  <li>Gemini Ultra</li>
141
  <li>Gemini Pro 1.5</li>
142
  <li>Gemini Pro (Bard/Online)</li>
143
- <li>Claude 3 Opus</li>
144
  <li>Claude 3 Sonnet</li>
145
  </ul>
146
  <h4>Models with the same level or better than GPT-3.5 but lower than GPT-4:</h4>
147
  <ul>
148
- <li>Gemini Pro 1.0 without web access</li>
149
- <li>Claude 3 Sonnet</li>
150
  <li>Claude 3 Haiku</li>
151
  <li>Claude 2-2.1</li>
152
  <li>Claude 1</li>
153
  <li>Claude Instant 1-1.2</li>
154
- <li>Mistral Medium</li>
155
  <li>Mistral Large</li>
 
 
156
  </ul>
157
  <h4>Versions of models already surpassed by fine-tune, new versions or new architectures:</h4>
158
  <ul>
@@ -174,6 +173,12 @@
174
  <li>StableLM Tuned Alpha</li>
175
  <li>Stable Beluga 2</li>
176
  </ul>
 
 
 
 
 
 
177
 
178
 
179
  <script>
@@ -189,6 +194,7 @@
189
  hellaswag:null,
190
  arc:null,
191
  nothallucination: null,
 
192
  parameters: 'Probably smaller than GPT-4',
193
  organization: 'OpenAI',
194
  license: 'Proprietary',
@@ -204,6 +210,7 @@
204
  hellaswag:92.7,
205
  arc:94.2,
206
  nothallucination: 97.0,
 
207
  parameters: 'Probably smaller than GPT-4',
208
  organization: 'OpenAI',
209
  license: 'Proprietary',
@@ -219,6 +226,7 @@
219
  hellaswag:91.9,
220
  arc:94.6,
221
  nothallucination: 97.0,
 
222
  parameters: '1T (questionable)',
223
  organization: 'OpenAI',
224
  license: 'Proprietary',
@@ -234,6 +242,7 @@
234
  hellaswag:95.4,
235
  arc:96.3,
236
  nothallucination: 97.0,
 
237
  parameters: '1T (questionable)',
238
  organization: 'OpenAI',
239
  license: 'Proprietary',
@@ -249,6 +258,7 @@
249
  hellaswag:79.4,
250
  arc:81.7,
251
  nothallucination: 96.5,
 
252
  parameters: '20B - 175B (not confirmed)',
253
  organization: 'OpenAI',
254
  license: 'Proprietary',
@@ -264,6 +274,7 @@
264
  hellaswag:85.5,
265
  arc:85.2,
266
  nothallucination: 96.5,
 
267
  parameters: '20B - 175B (not confirmed)',
268
  organization: 'OpenAI',
269
  license: 'Proprietary',
@@ -279,6 +290,7 @@
279
  hellaswag:60.8,
280
  arc:79.1,
281
  nothallucination: 96.5,
 
282
  parameters: '20B - 175B (not confirmed)',
283
  organization: 'OpenAI',
284
  license: 'Proprietary',
@@ -294,6 +306,7 @@
294
  hellaswag:95.4,
295
  arc:96.4,
296
  nothallucination: 92.6,
 
297
  parameters: null,
298
  organization: 'Anthropic',
299
  license: 'Proprietary',
@@ -309,6 +322,7 @@
309
  hellaswag:null,
310
  arc:89.0,
311
  nothallucination: 94,
 
312
  parameters: null,
313
  organization: 'Anthropic',
314
  license: 'Proprietary',
@@ -324,6 +338,7 @@
324
  hellaswag:null,
325
  arc:85.9,
326
  nothallucination: 92.4,
 
327
  parameters: null,
328
  organization: 'Anthropic',
329
  license: 'Proprietary',
@@ -339,6 +354,7 @@
339
  hellaswag:null,
340
  arc:null,
341
  nothallucination: 91.5,
 
342
  parameters: '137B',
343
  organization: 'Anthropic',
344
  license: 'Proprietary',
@@ -354,6 +370,7 @@
354
  hellaswag:null,
355
  arc:91,
356
  nothallucination: 91.5,
 
357
  parameters: '137B',
358
  organization: 'Anthropic',
359
  license: 'Proprietary',
@@ -369,6 +386,7 @@
369
  hellaswag:null,
370
  arc:null,
371
  nothallucination: null,
 
372
  parameters: null,
373
  organization: 'Anthropic',
374
  license: 'Proprietary',
@@ -384,6 +402,7 @@
384
  hellaswag:null,
385
  arc:null,
386
  nothallucination: null,
 
387
  parameters: null,
388
  organization: 'Anthropic',
389
  license: 'Proprietary',
@@ -399,6 +418,7 @@
399
  hellaswag:92.5,
400
  arc:null,
401
  nothallucination: null,
 
402
  parameters: null,
403
  organization: 'Google',
404
  license: 'Proprietary',
@@ -414,6 +434,7 @@
414
  hellaswag:87.8,
415
  arc:null,
416
  nothallucination: null,
 
417
  parameters: null,
418
  organization: 'Google',
419
  license: 'Proprietary',
@@ -429,6 +450,7 @@
429
  hellaswag:null,
430
  arc:null,
431
  nothallucination: null,
 
432
  parameters: null,
433
  organization: 'Google',
434
  license: 'Proprietary',
@@ -437,13 +459,14 @@
437
  name: 'Gemini Pro',
438
  mmlu: 71.8,
439
  mtbench: null,
440
- arenaelo:1111,
441
  gsm8k: 77.9,
442
  winogrande: null,
443
  truthfulqa: null,
444
  hellaswag:84.7,
445
  arc:null,
446
  nothallucination: 95.2,
 
447
  parameters: null,
448
  organization: 'Google',
449
  license: 'Proprietary',
@@ -459,6 +482,7 @@
459
  hellaswag:89.2,
460
  arc:94.2,
461
  nothallucination: null,
 
462
  parameters: null,
463
  organization: 'Mistral',
464
  license: 'Proprietary',
@@ -474,6 +498,7 @@
474
  hellaswag:null,
475
  arc:null,
476
  nothallucination: null,
 
477
  parameters: null,
478
  organization: 'Mistral',
479
  license: 'Proprietary',
@@ -489,6 +514,7 @@
489
  hellaswag:86.7,
490
  arc:70.14,
491
  nothallucination: 90.7,
 
492
  parameters: '45B (MOE)',
493
  organization: 'Mistral',
494
  license: 'Apache 2.0',
@@ -504,6 +530,7 @@
504
  hellaswag:null,
505
  arc:null,
506
  nothallucination: null,
 
507
  parameters: "33B",
508
  organization: 'xAI',
509
  license: 'Proprietary',
@@ -519,6 +546,7 @@
519
  hellaswag:89.0,
520
  arc:68.9,
521
  nothallucination: null,
 
522
  parameters: null,
523
  organization: 'Databricks',
524
  license: 'Databricks Open Model',
@@ -534,6 +562,7 @@
534
  hellaswag:85.69,
535
  arc:64.59,
536
  nothallucination: null,
 
537
  parameters: '34B',
538
  organization: '01 AI',
539
  license: 'Yi License',
@@ -549,6 +578,7 @@
549
  hellaswag:null,
550
  arc:null,
551
  nothallucination: null,
 
552
  parameters: '70B',
553
  organization: 'Perplexity AI',
554
  license: 'Proprietary',
@@ -564,6 +594,7 @@
564
  hellaswag:null,
565
  arc:null,
566
  nothallucination: 94.9,
 
567
  parameters: '70B',
568
  organization: 'Perplexity AI',
569
  license: 'Proprietary',
@@ -582,6 +613,7 @@
582
  '<th>TruthfulQA</th>' +
583
  '<th>HellaSwag</th>' +
584
  '<th>ARC</th>' +
 
585
  '<th>Not hallucination</th>' +
586
  '<th>Parameters</th>' +
587
  '<th>Organization</th>' +
@@ -599,6 +631,7 @@
599
  '<td>' + item.truthfulqa + '</td>' +
600
  '<td>' + item.hellaswag + '</td>' +
601
  '<td>' + item.arc + '</td>' +
 
602
  '<td>' + item.nothallucination + '%'+ '</td>' +
603
  '<td>' + item.parameters + '</td>' +
604
  '<td>' + item.organization + '</td>' +
@@ -640,6 +673,10 @@
640
  const hellaswagMultiplier = 100/hellaswagMaxValue;
641
  const arcMaxValue = getBenchmarkMaxValue("arc",data);
642
  const arcMultiplier = 100/arcMaxValue;
 
 
 
 
643
  let dataset = [];
644
  for (let i = 0; i < data.length; i++) {
645
  dataset.push({
@@ -653,6 +690,8 @@
653
  (data[i].truthfulqa*truthfulqaMultiplier),
654
  (data[i].hellaswag*hellaswagMultiplier),
655
  (data[i].arc*arcMultiplier),
 
 
656
  ],
657
  borderWidth: 2
658
  })
@@ -661,7 +700,7 @@
661
  }
662
  const dataSetRadar = getDataSetRadar(benchmarkData);
663
  let data = {
664
- labels: ['MMLU', 'MT-bench','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC'],
665
  datasets: getDataSetRadar(benchmarkData)
666
  };
667
 
@@ -721,6 +760,7 @@
721
  updateChart('winograndeChart','winogrande');
722
  updateChart('arcChart','arc');
723
  updateChart('mtbenchChart','mtbench');
 
724
 
725
  </script>
726
  </body>
 
38
  <div><canvas id="winograndeChart" height="150"></canvas></div>
39
  <div><canvas id="arcChart" height="150"></canvas></div>
40
  <div><canvas id="mtbenchChart" height="150"></canvas></div>
41
+ <div><canvas id="alpacaevalChart" height="150"></canvas></div>
42
  <p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
43
  <p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
44
  <p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p>
 
69
  </ul>
70
  <h4>Models with the best cost benefit:</h4>
71
  <ul>
 
72
  <li>Gemini Pro 1.5</li>
73
+ <li>gpt-3.5-turbo-0125</li>
74
  <li>gpt-3.5-turbo-0613</li>
 
75
  <li>Claude 3 Haiku</li>
 
76
  <li>Mixtral 8x7B Instruct</li>
77
+ <li>OpenChat</li>
78
  </ul>
79
  <h4>Models with fewer hallucinations:</h4>
80
  <ul>
 
101
  <li>Mistral 7B</li>
102
  <li>Yi 34B</li>
103
  <li>Grok 1</li>
104
+ <li>DBRX Instruct</li>
105
  <li>Llama 2 7-70B</li>
106
  <li>Gemma 2-7B</li>
107
  </ul>
 
124
  <li>gpt-4-0314 - OpenAI</li>
125
  <li>gpt-3.5-turbo-1106 - OpenAI</li>
126
  <li>gpt-4-0314 - OpenAI</li>
127
+ <li>Gemini Pro 1.0-1.5 - Openrouter with compatibility with OpenAI api, Google api service.</li>
128
  <li>Claude 3 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
129
  <li>Claude 2-2.1 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
130
  <li>Claude Instant 1-1.2 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
 
136
  <ul>
137
  <li>Claude 3 Opus</li>
138
  </ul>
139
+ <h4>Models at the same level as GPT-4 but below GPT-4 Turbo:</h4>
140
  <ul>
141
  <li>Gemini Ultra</li>
142
  <li>Gemini Pro 1.5</li>
143
  <li>Gemini Pro (Bard/Online)</li>
 
144
  <li>Claude 3 Sonnet</li>
145
  </ul>
146
  <h4>Models with the same level or better than GPT-3.5 but lower than GPT-4:</h4>
147
  <ul>
 
 
148
  <li>Claude 3 Haiku</li>
149
  <li>Claude 2-2.1</li>
150
  <li>Claude 1</li>
151
  <li>Claude Instant 1-1.2</li>
 
152
  <li>Mistral Large</li>
153
+ <li>Mistral Medium</li>
154
+ <li>Gemini Pro 1.0 without web access</li>
155
  </ul>
156
  <h4>Versions of models already surpassed by fine-tune, new versions or new architectures:</h4>
157
  <ul>
 
173
  <li>StableLM Tuned Alpha</li>
174
  <li>Stable Beluga 2</li>
175
  </ul>
176
+ <h4>Best OpenAI Models:</h4>
177
+ <ul>
178
+ <li>gpt-4-1106-preview (turbo)</li>
179
+ <li>gpt-3.5-turbo-0613</li>
180
+ <li>gpt-3.5-turbo-0125</li>
181
+ </ul>
182
 
183
 
184
  <script>
 
194
  hellaswag:null,
195
  arc:null,
196
  nothallucination: null,
197
+ alpacaeval: null,
198
  parameters: 'Probably smaller than GPT-4',
199
  organization: 'OpenAI',
200
  license: 'Proprietary',
 
210
  hellaswag:92.7,
211
  arc:94.2,
212
  nothallucination: 97.0,
213
+ alpacaeval: 50,
214
  parameters: 'Probably smaller than GPT-4',
215
  organization: 'OpenAI',
216
  license: 'Proprietary',
 
226
  hellaswag:91.9,
227
  arc:94.6,
228
  nothallucination: 97.0,
229
+ alpacaeval: 30.2,
230
  parameters: '1T (questionable)',
231
  organization: 'OpenAI',
232
  license: 'Proprietary',
 
242
  hellaswag:95.4,
243
  arc:96.3,
244
  nothallucination: 97.0,
245
+ alpacaeval: 35.3,
246
  parameters: '1T (questionable)',
247
  organization: 'OpenAI',
248
  license: 'Proprietary',
 
258
  hellaswag:79.4,
259
  arc:81.7,
260
  nothallucination: 96.5,
261
+ alpacaeval: 22.7,
262
  parameters: '20B - 175B (not confirmed)',
263
  organization: 'OpenAI',
264
  license: 'Proprietary',
 
274
  hellaswag:85.5,
275
  arc:85.2,
276
  nothallucination: 96.5,
277
+ alpacaeval: 18.1,
278
  parameters: '20B - 175B (not confirmed)',
279
  organization: 'OpenAI',
280
  license: 'Proprietary',
 
290
  hellaswag:60.8,
291
  arc:79.1,
292
  nothallucination: 96.5,
293
+ alpacaeval: 19.3,
294
  parameters: '20B - 175B (not confirmed)',
295
  organization: 'OpenAI',
296
  license: 'Proprietary',
 
306
  hellaswag:95.4,
307
  arc:96.4,
308
  nothallucination: 92.6,
309
+ alpacaeval: 40.4,
310
  parameters: null,
311
  organization: 'Anthropic',
312
  license: 'Proprietary',
 
322
  hellaswag:null,
323
  arc:89.0,
324
  nothallucination: 94,
325
+ alpacaeval: 34.9,
326
  parameters: null,
327
  organization: 'Anthropic',
328
  license: 'Proprietary',
 
338
  hellaswag:null,
339
  arc:85.9,
340
  nothallucination: 92.4,
341
+ alpacaeval: null,
342
  parameters: null,
343
  organization: 'Anthropic',
344
  license: 'Proprietary',
 
354
  hellaswag:null,
355
  arc:null,
356
  nothallucination: 91.5,
357
+ alpacaeval: 25.3,
358
  parameters: '137B',
359
  organization: 'Anthropic',
360
  license: 'Proprietary',
 
370
  hellaswag:null,
371
  arc:91,
372
  nothallucination: 91.5,
373
+ alpacaeval: 28.2,
374
  parameters: '137B',
375
  organization: 'Anthropic',
376
  license: 'Proprietary',
 
386
  hellaswag:null,
387
  arc:null,
388
  nothallucination: null,
389
+ alpacaeval: 27.3,
390
  parameters: null,
391
  organization: 'Anthropic',
392
  license: 'Proprietary',
 
402
  hellaswag:null,
403
  arc:null,
404
  nothallucination: null,
405
+ alpacaeval: null,
406
  parameters: null,
407
  organization: 'Anthropic',
408
  license: 'Proprietary',
 
418
  hellaswag:92.5,
419
  arc:null,
420
  nothallucination: null,
421
+ alpacaeval: null,
422
  parameters: null,
423
  organization: 'Google',
424
  license: 'Proprietary',
 
434
  hellaswag:87.8,
435
  arc:null,
436
  nothallucination: null,
437
+ alpacaeval: null,
438
  parameters: null,
439
  organization: 'Google',
440
  license: 'Proprietary',
 
450
  hellaswag:null,
451
  arc:null,
452
  nothallucination: null,
453
+ alpacaeval: null,
454
  parameters: null,
455
  organization: 'Google',
456
  license: 'Proprietary',
 
459
  name: 'Gemini Pro',
460
  mmlu: 71.8,
461
  mtbench: null,
462
+ arenaelo:1127,
463
  gsm8k: 77.9,
464
  winogrande: null,
465
  truthfulqa: null,
466
  hellaswag:84.7,
467
  arc:null,
468
  nothallucination: 95.2,
469
+ alpacaeval: 24.4,
470
  parameters: null,
471
  organization: 'Google',
472
  license: 'Proprietary',
 
482
  hellaswag:89.2,
483
  arc:94.2,
484
  nothallucination: null,
485
+ alpacaeval: 32.7,
486
  parameters: null,
487
  organization: 'Mistral',
488
  license: 'Proprietary',
 
498
  hellaswag:null,
499
  arc:null,
500
  nothallucination: null,
501
+ alpacaeval: 28.6,
502
  parameters: null,
503
  organization: 'Mistral',
504
  license: 'Proprietary',
 
514
  hellaswag:86.7,
515
  arc:70.14,
516
  nothallucination: 90.7,
517
+ alpacaeval: 23.7,
518
  parameters: '45B (MOE)',
519
  organization: 'Mistral',
520
  license: 'Apache 2.0',
 
530
  hellaswag:null,
531
  arc:null,
532
  nothallucination: null,
533
+ alpacaeval: null,
534
  parameters: "33B",
535
  organization: 'xAI',
536
  license: 'Proprietary',
 
546
  hellaswag:89.0,
547
  arc:68.9,
548
  nothallucination: null,
549
+ alpacaeval: null,
550
  parameters: null,
551
  organization: 'Databricks',
552
  license: 'Databricks Open Model',
 
562
  hellaswag:85.69,
563
  arc:64.59,
564
  nothallucination: null,
565
+ alpacaeval: 27.2,
566
  parameters: '34B',
567
  organization: '01 AI',
568
  license: 'Yi License',
 
578
  hellaswag:null,
579
  arc:null,
580
  nothallucination: null,
581
+ alpacaeval: null,
582
  parameters: '70B',
583
  organization: 'Perplexity AI',
584
  license: 'Proprietary',
 
594
  hellaswag:null,
595
  arc:null,
596
  nothallucination: 94.9,
597
+ alpacaeval: null,
598
  parameters: '70B',
599
  organization: 'Perplexity AI',
600
  license: 'Proprietary',
 
613
  '<th>TruthfulQA</th>' +
614
  '<th>HellaSwag</th>' +
615
  '<th>ARC</th>' +
616
+ '<th>AlpacaEval</th>' +
617
  '<th>Not hallucination</th>' +
618
  '<th>Parameters</th>' +
619
  '<th>Organization</th>' +
 
631
  '<td>' + item.truthfulqa + '</td>' +
632
  '<td>' + item.hellaswag + '</td>' +
633
  '<td>' + item.arc + '</td>' +
634
+ '<td>' + item.alpacaeval + '%'+ '</td>' +
635
  '<td>' + item.nothallucination + '%'+ '</td>' +
636
  '<td>' + item.parameters + '</td>' +
637
  '<td>' + item.organization + '</td>' +
 
673
  const hellaswagMultiplier = 100/hellaswagMaxValue;
674
  const arcMaxValue = getBenchmarkMaxValue("arc",data);
675
  const arcMultiplier = 100/arcMaxValue;
676
+ const alpacaevalMaxValue = getBenchmarkMaxValue("alpacaeval",data);
677
+ const alpacaevalMultiplier = 100/alpacaevalMaxValue;
678
+ const notHallucinationMaxValue = getBenchmarkMaxValue("nothallucination",data);
679
+ const notHallucinationMultiplier = 100/notHallucinationMaxValue;
680
  let dataset = [];
681
  for (let i = 0; i < data.length; i++) {
682
  dataset.push({
 
690
  (data[i].truthfulqa*truthfulqaMultiplier),
691
  (data[i].hellaswag*hellaswagMultiplier),
692
  (data[i].arc*arcMultiplier),
693
+ (data[i].alpacaeval*alpacaevalMultiplier),
694
+ (data[i].nothallucination*notHallucinationMultiplier)
695
  ],
696
  borderWidth: 2
697
  })
 
700
  }
701
  const dataSetRadar = getDataSetRadar(benchmarkData);
702
  let data = {
703
+ labels: ['MMLU', 'MT-bench','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC','AlpacaEval','Not Hallucination'],
704
  datasets: getDataSetRadar(benchmarkData)
705
  };
706
 
 
760
  updateChart('winograndeChart','winogrande');
761
  updateChart('arcChart','arc');
762
  updateChart('mtbenchChart','mtbench');
763
+ updateChart('alpacaevalChart','alpacaeval');
764
 
765
  </script>
766
  </body>