luisrguerra
committed on
Commit
•
04313bf
1
Parent(s):
38e21f8
Update index.html
Browse files- index.html +51 -11
index.html
CHANGED
@@ -38,6 +38,7 @@
|
|
38 |
<div><canvas id="winograndeChart" height="150"></canvas></div>
|
39 |
<div><canvas id="arcChart" height="150"></canvas></div>
|
40 |
<div><canvas id="mtbenchChart" height="150"></canvas></div>
|
|
|
41 |
<p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
|
42 |
<p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
|
43 |
<p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p>
|
@@ -68,13 +69,12 @@
|
|
68 |
</ul>
|
69 |
<h4>Models with the best cost benefit:</h4>
|
70 |
<ul>
|
71 |
-
<li>Gemini Pro 1.0</li>
|
72 |
<li>Gemini Pro 1.5</li>
|
|
|
73 |
<li>gpt-3.5-turbo-0613</li>
|
74 |
-
<li>gpt-3.5-turbo-1106</li>
|
75 |
<li>Claude 3 Haiku</li>
|
76 |
-
<li>Claude Instant 1-1.2</li>
|
77 |
<li>Mixtral 8x7B Instruct</li>
|
|
|
78 |
</ul>
|
79 |
<h4>Models with fewer hallucinations:</h4>
|
80 |
<ul>
|
@@ -101,6 +101,7 @@
|
|
101 |
<li>Mistral 7B</li>
|
102 |
<li>Yi 34B</li>
|
103 |
<li>Grok 1</li>
|
|
|
104 |
<li>Llama 2 7-70B</li>
|
105 |
<li>Gemma 2-7B</li>
|
106 |
</ul>
|
@@ -123,7 +124,7 @@
|
|
123 |
<li>gpt-4-0314 - OpenAI</li>
|
124 |
<li>gpt-3.5-turbo-1106 - OpenAI</li>
|
125 |
<li>gpt-4-0314 - OpenAI</li>
|
126 |
-
<li>Gemini Pro 1.0 - Openrouter with compatibility with OpenAI api, Google api service.</li>
|
127 |
<li>Claude 3 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
|
128 |
<li>Claude 2-2.1 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
|
129 |
<li>Claude Instant 1-1.2 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
|
@@ -135,24 +136,22 @@
|
|
135 |
<ul>
|
136 |
<li>Claude 3 Opus</li>
|
137 |
</ul>
|
138 |
-
<h4>Models with the same level of GPT-4:</h4>
|
139 |
<ul>
|
140 |
<li>Gemini Ultra</li>
|
141 |
<li>Gemini Pro 1.5</li>
|
142 |
<li>Gemini Pro (Bard/Online)</li>
|
143 |
-
<li>Claude 3 Opus</li>
|
144 |
<li>Claude 3 Sonnet</li>
|
145 |
</ul>
|
146 |
<h4>Models with the same level or better than GPT-3.5 but lower than GPT-4:</h4>
|
147 |
<ul>
|
148 |
-
<li>Gemini Pro 1.0 without web access</li>
|
149 |
-
<li>Claude 3 Sonnet</li>
|
150 |
<li>Claude 3 Haiku</li>
|
151 |
<li>Claude 2-2.1</li>
|
152 |
<li>Claude 1</li>
|
153 |
<li>Claude Instant 1-1.2</li>
|
154 |
-
<li>Mistral Medium</li>
|
155 |
<li>Mistral Large</li>
|
|
|
|
|
156 |
</ul>
|
157 |
<h4>Versions of models already surpassed by fine-tune, new versions or new architectures:</h4>
|
158 |
<ul>
|
@@ -174,6 +173,12 @@
|
|
174 |
<li>StableLM Tuned Alpha</li>
|
175 |
<li>Stable Beluga 2</li>
|
176 |
</ul>
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
|
179 |
<script>
|
@@ -189,6 +194,7 @@
|
|
189 |
hellaswag:null,
|
190 |
arc:null,
|
191 |
nothallucination: null,
|
|
|
192 |
parameters: 'Probably smaller than GPT-4',
|
193 |
organization: 'OpenAI',
|
194 |
license: 'Proprietary',
|
@@ -204,6 +210,7 @@
|
|
204 |
hellaswag:92.7,
|
205 |
arc:94.2,
|
206 |
nothallucination: 97.0,
|
|
|
207 |
parameters: 'Probably smaller than GPT-4',
|
208 |
organization: 'OpenAI',
|
209 |
license: 'Proprietary',
|
@@ -219,6 +226,7 @@
|
|
219 |
hellaswag:91.9,
|
220 |
arc:94.6,
|
221 |
nothallucination: 97.0,
|
|
|
222 |
parameters: '1T (questionable)',
|
223 |
organization: 'OpenAI',
|
224 |
license: 'Proprietary',
|
@@ -234,6 +242,7 @@
|
|
234 |
hellaswag:95.4,
|
235 |
arc:96.3,
|
236 |
nothallucination: 97.0,
|
|
|
237 |
parameters: '1T (questionable)',
|
238 |
organization: 'OpenAI',
|
239 |
license: 'Proprietary',
|
@@ -249,6 +258,7 @@
|
|
249 |
hellaswag:79.4,
|
250 |
arc:81.7,
|
251 |
nothallucination: 96.5,
|
|
|
252 |
parameters: '20B - 175B (not confirmed)',
|
253 |
organization: 'OpenAI',
|
254 |
license: 'Proprietary',
|
@@ -264,6 +274,7 @@
|
|
264 |
hellaswag:85.5,
|
265 |
arc:85.2,
|
266 |
nothallucination: 96.5,
|
|
|
267 |
parameters: '20B - 175B (not confirmed)',
|
268 |
organization: 'OpenAI',
|
269 |
license: 'Proprietary',
|
@@ -279,6 +290,7 @@
|
|
279 |
hellaswag:60.8,
|
280 |
arc:79.1,
|
281 |
nothallucination: 96.5,
|
|
|
282 |
parameters: '20B - 175B (not confirmed)',
|
283 |
organization: 'OpenAI',
|
284 |
license: 'Proprietary',
|
@@ -294,6 +306,7 @@
|
|
294 |
hellaswag:95.4,
|
295 |
arc:96.4,
|
296 |
nothallucination: 92.6,
|
|
|
297 |
parameters: null,
|
298 |
organization: 'Anthropic',
|
299 |
license: 'Proprietary',
|
@@ -309,6 +322,7 @@
|
|
309 |
hellaswag:null,
|
310 |
arc:89.0,
|
311 |
nothallucination: 94,
|
|
|
312 |
parameters: null,
|
313 |
organization: 'Anthropic',
|
314 |
license: 'Proprietary',
|
@@ -324,6 +338,7 @@
|
|
324 |
hellaswag:null,
|
325 |
arc:85.9,
|
326 |
nothallucination: 92.4,
|
|
|
327 |
parameters: null,
|
328 |
organization: 'Anthropic',
|
329 |
license: 'Proprietary',
|
@@ -339,6 +354,7 @@
|
|
339 |
hellaswag:null,
|
340 |
arc:null,
|
341 |
nothallucination: 91.5,
|
|
|
342 |
parameters: '137B',
|
343 |
organization: 'Anthropic',
|
344 |
license: 'Proprietary',
|
@@ -354,6 +370,7 @@
|
|
354 |
hellaswag:null,
|
355 |
arc:91,
|
356 |
nothallucination: 91.5,
|
|
|
357 |
parameters: '137B',
|
358 |
organization: 'Anthropic',
|
359 |
license: 'Proprietary',
|
@@ -369,6 +386,7 @@
|
|
369 |
hellaswag:null,
|
370 |
arc:null,
|
371 |
nothallucination: null,
|
|
|
372 |
parameters: null,
|
373 |
organization: 'Anthropic',
|
374 |
license: 'Proprietary',
|
@@ -384,6 +402,7 @@
|
|
384 |
hellaswag:null,
|
385 |
arc:null,
|
386 |
nothallucination: null,
|
|
|
387 |
parameters: null,
|
388 |
organization: 'Anthropic',
|
389 |
license: 'Proprietary',
|
@@ -399,6 +418,7 @@
|
|
399 |
hellaswag:92.5,
|
400 |
arc:null,
|
401 |
nothallucination: null,
|
|
|
402 |
parameters: null,
|
403 |
organization: 'Google',
|
404 |
license: 'Proprietary',
|
@@ -414,6 +434,7 @@
|
|
414 |
hellaswag:87.8,
|
415 |
arc:null,
|
416 |
nothallucination: null,
|
|
|
417 |
parameters: null,
|
418 |
organization: 'Google',
|
419 |
license: 'Proprietary',
|
@@ -429,6 +450,7 @@
|
|
429 |
hellaswag:null,
|
430 |
arc:null,
|
431 |
nothallucination: null,
|
|
|
432 |
parameters: null,
|
433 |
organization: 'Google',
|
434 |
license: 'Proprietary',
|
@@ -437,13 +459,14 @@
|
|
437 |
name: 'Gemini Pro',
|
438 |
mmlu: 71.8,
|
439 |
mtbench: null,
|
440 |
-
arenaelo:
|
441 |
gsm8k: 77.9,
|
442 |
winogrande: null,
|
443 |
truthfulqa: null,
|
444 |
hellaswag:84.7,
|
445 |
arc:null,
|
446 |
nothallucination: 95.2,
|
|
|
447 |
parameters: null,
|
448 |
organization: 'Google',
|
449 |
license: 'Proprietary',
|
@@ -459,6 +482,7 @@
|
|
459 |
hellaswag:89.2,
|
460 |
arc:94.2,
|
461 |
nothallucination: null,
|
|
|
462 |
parameters: null,
|
463 |
organization: 'Mistral',
|
464 |
license: 'Proprietary',
|
@@ -474,6 +498,7 @@
|
|
474 |
hellaswag:null,
|
475 |
arc:null,
|
476 |
nothallucination: null,
|
|
|
477 |
parameters: null,
|
478 |
organization: 'Mistral',
|
479 |
license: 'Proprietary',
|
@@ -489,6 +514,7 @@
|
|
489 |
hellaswag:86.7,
|
490 |
arc:70.14,
|
491 |
nothallucination: 90.7,
|
|
|
492 |
parameters: '45B (MOE)',
|
493 |
organization: 'Mistral',
|
494 |
license: 'Apache 2.0',
|
@@ -504,6 +530,7 @@
|
|
504 |
hellaswag:null,
|
505 |
arc:null,
|
506 |
nothallucination: null,
|
|
|
507 |
parameters: "33B",
|
508 |
organization: 'xAI',
|
509 |
license: 'Proprietary',
|
@@ -519,6 +546,7 @@
|
|
519 |
hellaswag:89.0,
|
520 |
arc:68.9,
|
521 |
nothallucination: null,
|
|
|
522 |
parameters: null,
|
523 |
organization: 'Databricks',
|
524 |
license: 'Databricks Open Model',
|
@@ -534,6 +562,7 @@
|
|
534 |
hellaswag:85.69,
|
535 |
arc:64.59,
|
536 |
nothallucination: null,
|
|
|
537 |
parameters: '34B',
|
538 |
organization: '01 AI',
|
539 |
license: 'Yi License',
|
@@ -549,6 +578,7 @@
|
|
549 |
hellaswag:null,
|
550 |
arc:null,
|
551 |
nothallucination: null,
|
|
|
552 |
parameters: '70B',
|
553 |
organization: 'Perplexity AI',
|
554 |
license: 'Proprietary',
|
@@ -564,6 +594,7 @@
|
|
564 |
hellaswag:null,
|
565 |
arc:null,
|
566 |
nothallucination: 94.9,
|
|
|
567 |
parameters: '70B',
|
568 |
organization: 'Perplexity AI',
|
569 |
license: 'Proprietary',
|
@@ -582,6 +613,7 @@
|
|
582 |
'<th>TruthfulQA</th>' +
|
583 |
'<th>HellaSwag</th>' +
|
584 |
'<th>ARC</th>' +
|
|
|
585 |
'<th>Not hallucination</th>' +
|
586 |
'<th>Parameters</th>' +
|
587 |
'<th>Organization</th>' +
|
@@ -599,6 +631,7 @@
|
|
599 |
'<td>' + item.truthfulqa + '</td>' +
|
600 |
'<td>' + item.hellaswag + '</td>' +
|
601 |
'<td>' + item.arc + '</td>' +
|
|
|
602 |
'<td>' + item.nothallucination + '%'+ '</td>' +
|
603 |
'<td>' + item.parameters + '</td>' +
|
604 |
'<td>' + item.organization + '</td>' +
|
@@ -640,6 +673,10 @@
|
|
640 |
const hellaswagMultiplier = 100/hellaswagMaxValue;
|
641 |
const arcMaxValue = getBenchmarkMaxValue("arc",data);
|
642 |
const arcMultiplier = 100/arcMaxValue;
|
|
|
|
|
|
|
|
|
643 |
let dataset = [];
|
644 |
for (let i = 0; i < data.length; i++) {
|
645 |
dataset.push({
|
@@ -653,6 +690,8 @@
|
|
653 |
(data[i].truthfulqa*truthfulqaMultiplier),
|
654 |
(data[i].hellaswag*hellaswagMultiplier),
|
655 |
(data[i].arc*arcMultiplier),
|
|
|
|
|
656 |
],
|
657 |
borderWidth: 2
|
658 |
})
|
@@ -661,7 +700,7 @@
|
|
661 |
}
|
662 |
const dataSetRadar = getDataSetRadar(benchmarkData);
|
663 |
let data = {
|
664 |
-
labels: ['MMLU', 'MT-bench','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC'],
|
665 |
datasets: getDataSetRadar(benchmarkData)
|
666 |
};
|
667 |
|
@@ -721,6 +760,7 @@
|
|
721 |
updateChart('winograndeChart','winogrande');
|
722 |
updateChart('arcChart','arc');
|
723 |
updateChart('mtbenchChart','mtbench');
|
|
|
724 |
|
725 |
</script>
|
726 |
</body>
|
|
|
38 |
<div><canvas id="winograndeChart" height="150"></canvas></div>
|
39 |
<div><canvas id="arcChart" height="150"></canvas></div>
|
40 |
<div><canvas id="mtbenchChart" height="150"></canvas></div>
|
41 |
+
<div><canvas id="alpacaevalChart" height="150"></canvas></div>
|
42 |
<p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
|
43 |
<p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
|
44 |
<p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p>
|
|
|
69 |
</ul>
|
70 |
<h4>Models with the best cost benefit:</h4>
|
71 |
<ul>
|
|
|
72 |
<li>Gemini Pro 1.5</li>
|
73 |
+
<li>gpt-3.5-turbo-0125</li>
|
74 |
<li>gpt-3.5-turbo-0613</li>
|
|
|
75 |
<li>Claude 3 Haiku</li>
|
|
|
76 |
<li>Mixtral 8x7B Instruct</li>
|
77 |
+
<li>OpenChat</li>
|
78 |
</ul>
|
79 |
<h4>Models with fewer hallucinations:</h4>
|
80 |
<ul>
|
|
|
101 |
<li>Mistral 7B</li>
|
102 |
<li>Yi 34B</li>
|
103 |
<li>Grok 1</li>
|
104 |
+
<li>DBRX Instruct</li>
|
105 |
<li>Llama 2 7-70B</li>
|
106 |
<li>Gemma 2-7B</li>
|
107 |
</ul>
|
|
|
124 |
<li>gpt-4-0314 - OpenAI</li>
|
125 |
<li>gpt-3.5-turbo-1106 - OpenAI</li>
|
126 |
<li>gpt-4-0314 - OpenAI</li>
|
127 |
+
<li>Gemini Pro 1.0-1.5 - Openrouter with compatibility with OpenAI api, Google api service.</li>
|
128 |
<li>Claude 3 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
|
129 |
<li>Claude 2-2.1 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
|
130 |
<li>Claude Instant 1-1.2 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
|
|
|
136 |
<ul>
|
137 |
<li>Claude 3 Opus</li>
|
138 |
</ul>
|
139 |
+
<h4>Models with the same level of GPT-4 but lower than GPT-4 Turbo:</h4>
|
140 |
<ul>
|
141 |
<li>Gemini Ultra</li>
|
142 |
<li>Gemini Pro 1.5</li>
|
143 |
<li>Gemini Pro (Bard/Online)</li>
|
|
|
144 |
<li>Claude 3 Sonnet</li>
|
145 |
</ul>
|
146 |
<h4>Models with the same level or better than GPT-3.5 but lower than GPT-4:</h4>
|
147 |
<ul>
|
|
|
|
|
148 |
<li>Claude 3 Haiku</li>
|
149 |
<li>Claude 2-2.1</li>
|
150 |
<li>Claude 1</li>
|
151 |
<li>Claude Instant 1-1.2</li>
|
|
|
152 |
<li>Mistral Large</li>
|
153 |
+
<li>Mistral Medium</li>
|
154 |
+
<li>Gemini Pro 1.0 without web access</li>
|
155 |
</ul>
|
156 |
<h4>Versions of models already surpassed by fine-tune, new versions or new architectures:</h4>
|
157 |
<ul>
|
|
|
173 |
<li>StableLM Tuned Alpha</li>
|
174 |
<li>Stable Beluga 2</li>
|
175 |
</ul>
|
176 |
+
<h4>Best OpenAI Models:</h4>
|
177 |
+
<ul>
|
178 |
+
<li>gpt-4-1106-preview (turbo)</li>
|
179 |
+
<li>gpt-3.5-turbo-0613</li>
|
180 |
+
<li>gpt-3.5-turbo-0125</li>
|
181 |
+
</ul>
|
182 |
|
183 |
|
184 |
<script>
|
|
|
194 |
hellaswag:null,
|
195 |
arc:null,
|
196 |
nothallucination: null,
|
197 |
+
alpacaeval: null,
|
198 |
parameters: 'Probably smaller than GPT-4',
|
199 |
organization: 'OpenAI',
|
200 |
license: 'Proprietary',
|
|
|
210 |
hellaswag:92.7,
|
211 |
arc:94.2,
|
212 |
nothallucination: 97.0,
|
213 |
+
alpacaeval: 50,
|
214 |
parameters: 'Probably smaller than GPT-4',
|
215 |
organization: 'OpenAI',
|
216 |
license: 'Proprietary',
|
|
|
226 |
hellaswag:91.9,
|
227 |
arc:94.6,
|
228 |
nothallucination: 97.0,
|
229 |
+
alpacaeval: 30.2,
|
230 |
parameters: '1T (questionable)',
|
231 |
organization: 'OpenAI',
|
232 |
license: 'Proprietary',
|
|
|
242 |
hellaswag:95.4,
|
243 |
arc:96.3,
|
244 |
nothallucination: 97.0,
|
245 |
+
alpacaeval: 35.3,
|
246 |
parameters: '1T (questionable)',
|
247 |
organization: 'OpenAI',
|
248 |
license: 'Proprietary',
|
|
|
258 |
hellaswag:79.4,
|
259 |
arc:81.7,
|
260 |
nothallucination: 96.5,
|
261 |
+
alpacaeval: 22.7,
|
262 |
parameters: '20B - 175B (not confirmed)',
|
263 |
organization: 'OpenAI',
|
264 |
license: 'Proprietary',
|
|
|
274 |
hellaswag:85.5,
|
275 |
arc:85.2,
|
276 |
nothallucination: 96.5,
|
277 |
+
alpacaeval: 18.1,
|
278 |
parameters: '20B - 175B (not confirmed)',
|
279 |
organization: 'OpenAI',
|
280 |
license: 'Proprietary',
|
|
|
290 |
hellaswag:60.8,
|
291 |
arc:79.1,
|
292 |
nothallucination: 96.5,
|
293 |
+
alpacaeval: 19.3,
|
294 |
parameters: '20B - 175B (not confirmed)',
|
295 |
organization: 'OpenAI',
|
296 |
license: 'Proprietary',
|
|
|
306 |
hellaswag:95.4,
|
307 |
arc:96.4,
|
308 |
nothallucination: 92.6,
|
309 |
+
alpacaeval: 40.4,
|
310 |
parameters: null,
|
311 |
organization: 'Anthropic',
|
312 |
license: 'Proprietary',
|
|
|
322 |
hellaswag:null,
|
323 |
arc:89.0,
|
324 |
nothallucination: 94,
|
325 |
+
alpacaeval: 34.9,
|
326 |
parameters: null,
|
327 |
organization: 'Anthropic',
|
328 |
license: 'Proprietary',
|
|
|
338 |
hellaswag:null,
|
339 |
arc:85.9,
|
340 |
nothallucination: 92.4,
|
341 |
+
alpacaeval: null,
|
342 |
parameters: null,
|
343 |
organization: 'Anthropic',
|
344 |
license: 'Proprietary',
|
|
|
354 |
hellaswag:null,
|
355 |
arc:null,
|
356 |
nothallucination: 91.5,
|
357 |
+
alpacaeval: 25.3,
|
358 |
parameters: '137B',
|
359 |
organization: 'Anthropic',
|
360 |
license: 'Proprietary',
|
|
|
370 |
hellaswag:null,
|
371 |
arc:91,
|
372 |
nothallucination: 91.5,
|
373 |
+
alpacaeval: 28.2,
|
374 |
parameters: '137B',
|
375 |
organization: 'Anthropic',
|
376 |
license: 'Proprietary',
|
|
|
386 |
hellaswag:null,
|
387 |
arc:null,
|
388 |
nothallucination: null,
|
389 |
+
alpacaeval: 27.3,
|
390 |
parameters: null,
|
391 |
organization: 'Anthropic',
|
392 |
license: 'Proprietary',
|
|
|
402 |
hellaswag:null,
|
403 |
arc:null,
|
404 |
nothallucination: null,
|
405 |
+
alpacaeval: null,
|
406 |
parameters: null,
|
407 |
organization: 'Anthropic',
|
408 |
license: 'Proprietary',
|
|
|
418 |
hellaswag:92.5,
|
419 |
arc:null,
|
420 |
nothallucination: null,
|
421 |
+
alpacaeval: null,
|
422 |
parameters: null,
|
423 |
organization: 'Google',
|
424 |
license: 'Proprietary',
|
|
|
434 |
hellaswag:87.8,
|
435 |
arc:null,
|
436 |
nothallucination: null,
|
437 |
+
alpacaeval: null,
|
438 |
parameters: null,
|
439 |
organization: 'Google',
|
440 |
license: 'Proprietary',
|
|
|
450 |
hellaswag:null,
|
451 |
arc:null,
|
452 |
nothallucination: null,
|
453 |
+
alpacaeval: null,
|
454 |
parameters: null,
|
455 |
organization: 'Google',
|
456 |
license: 'Proprietary',
|
|
|
459 |
name: 'Gemini Pro',
|
460 |
mmlu: 71.8,
|
461 |
mtbench: null,
|
462 |
+
arenaelo:1127,
|
463 |
gsm8k: 77.9,
|
464 |
winogrande: null,
|
465 |
truthfulqa: null,
|
466 |
hellaswag:84.7,
|
467 |
arc:null,
|
468 |
nothallucination: 95.2,
|
469 |
+
alpacaeval: 24.4,
|
470 |
parameters: null,
|
471 |
organization: 'Google',
|
472 |
license: 'Proprietary',
|
|
|
482 |
hellaswag:89.2,
|
483 |
arc:94.2,
|
484 |
nothallucination: null,
|
485 |
+
alpacaeval: 32.7,
|
486 |
parameters: null,
|
487 |
organization: 'Mistral',
|
488 |
license: 'Proprietary',
|
|
|
498 |
hellaswag:null,
|
499 |
arc:null,
|
500 |
nothallucination: null,
|
501 |
+
alpacaeval: 28.6,
|
502 |
parameters: null,
|
503 |
organization: 'Mistral',
|
504 |
license: 'Proprietary',
|
|
|
514 |
hellaswag:86.7,
|
515 |
arc:70.14,
|
516 |
nothallucination: 90.7,
|
517 |
+
alpacaeval: 23.7,
|
518 |
parameters: '45B (MOE)',
|
519 |
organization: 'Mistral',
|
520 |
license: 'Apache 2.0',
|
|
|
530 |
hellaswag:null,
|
531 |
arc:null,
|
532 |
nothallucination: null,
|
533 |
+
alpacaeval: null,
|
534 |
parameters: "33B",
|
535 |
organization: 'xAI',
|
536 |
license: 'Proprietary',
|
|
|
546 |
hellaswag:89.0,
|
547 |
arc:68.9,
|
548 |
nothallucination: null,
|
549 |
+
alpacaeval: null,
|
550 |
parameters: null,
|
551 |
organization: 'Databricks',
|
552 |
license: 'Databricks Open Model',
|
|
|
562 |
hellaswag:85.69,
|
563 |
arc:64.59,
|
564 |
nothallucination: null,
|
565 |
+
alpacaeval: 27.2,
|
566 |
parameters: '34B',
|
567 |
organization: '01 AI',
|
568 |
license: 'Yi License',
|
|
|
578 |
hellaswag:null,
|
579 |
arc:null,
|
580 |
nothallucination: null,
|
581 |
+
alpacaeval: null,
|
582 |
parameters: '70B',
|
583 |
organization: 'Perplexity AI',
|
584 |
license: 'Proprietary',
|
|
|
594 |
hellaswag:null,
|
595 |
arc:null,
|
596 |
nothallucination: 94.9,
|
597 |
+
alpacaeval: null,
|
598 |
parameters: '70B',
|
599 |
organization: 'Perplexity AI',
|
600 |
license: 'Proprietary',
|
|
|
613 |
'<th>TruthfulQA</th>' +
|
614 |
'<th>HellaSwag</th>' +
|
615 |
'<th>ARC</th>' +
|
616 |
+
'<th>AlpacaEval</th>' +
|
617 |
'<th>Not hallucination</th>' +
|
618 |
'<th>Parameters</th>' +
|
619 |
'<th>Organization</th>' +
|
|
|
631 |
'<td>' + item.truthfulqa + '</td>' +
|
632 |
'<td>' + item.hellaswag + '</td>' +
|
633 |
'<td>' + item.arc + '</td>' +
|
634 |
+
'<td>' + item.alpacaeval + '%'+ '</td>' +
|
635 |
'<td>' + item.nothallucination + '%'+ '</td>' +
|
636 |
'<td>' + item.parameters + '</td>' +
|
637 |
'<td>' + item.organization + '</td>' +
|
|
|
673 |
const hellaswagMultiplier = 100/hellaswagMaxValue;
|
674 |
const arcMaxValue = getBenchmarkMaxValue("arc",data);
|
675 |
const arcMultiplier = 100/arcMaxValue;
|
676 |
+
const alpacaevalMaxValue = getBenchmarkMaxValue("alpacaeval",data);
|
677 |
+
const alpacaevalMultiplier = 100/alpacaevalMaxValue;
|
678 |
+
const notHallucinationMaxValue = getBenchmarkMaxValue("nothallucination",data);
|
679 |
+
const notHallucinationMultiplier = 100/notHallucinationMaxValue;
|
680 |
let dataset = [];
|
681 |
for (let i = 0; i < data.length; i++) {
|
682 |
dataset.push({
|
|
|
690 |
(data[i].truthfulqa*truthfulqaMultiplier),
|
691 |
(data[i].hellaswag*hellaswagMultiplier),
|
692 |
(data[i].arc*arcMultiplier),
|
693 |
+
(data[i].alpacaeval*alpacaevalMultiplier),
|
694 |
+
(data[i].nothallucination*notHallucinationMultiplier)
|
695 |
],
|
696 |
borderWidth: 2
|
697 |
})
|
|
|
700 |
}
|
701 |
const dataSetRadar = getDataSetRadar(benchmarkData);
|
702 |
let data = {
|
703 |
+
labels: ['MMLU', 'MT-bench','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC','AlpacaEval','Not Hallucination'],
|
704 |
datasets: getDataSetRadar(benchmarkData)
|
705 |
};
|
706 |
|
|
|
760 |
updateChart('winograndeChart','winogrande');
|
761 |
updateChart('arcChart','arc');
|
762 |
updateChart('mtbenchChart','mtbench');
|
763 |
+
updateChart('alpacaevalChart','alpacaeval');
|
764 |
|
765 |
</script>
|
766 |
</body>
|