luisrguerra
committed on
Commit
•
3e2aedd
1
Parent(s):
2867ede
Update index.html
Browse files - index.html +48 -0
index.html
CHANGED
@@ -135,6 +135,7 @@
|
|
135 |
truthfulqa: null,
|
136 |
hellaswag:null,
|
137 |
arc:null,
|
|
|
138 |
parameters: 'Probably smaller than GPT-4',
|
139 |
organization: 'OpenAI',
|
140 |
license: 'Proprietary',
|
@@ -149,6 +150,7 @@
|
|
149 |
truthfulqa: null,
|
150 |
hellaswag:null,
|
151 |
arc:null,
|
|
|
152 |
parameters: '1T (questionable)',
|
153 |
organization: 'OpenAI',
|
154 |
license: 'Proprietary',
|
@@ -163,6 +165,7 @@
|
|
163 |
truthfulqa: 59,
|
164 |
hellaswag:95.4,
|
165 |
arc:96.3,
|
|
|
166 |
parameters: '1T (questionable)',
|
167 |
organization: 'OpenAI',
|
168 |
license: 'Proprietary',
|
@@ -177,6 +180,7 @@
|
|
177 |
truthfulqa: null,
|
178 |
hellaswag:null,
|
179 |
arc:null,
|
|
|
180 |
parameters: '20B - 175B (not confirmed)',
|
181 |
organization: 'OpenAI',
|
182 |
license: 'Proprietary',
|
@@ -191,6 +195,7 @@
|
|
191 |
truthfulqa: 47,
|
192 |
hellaswag:85.5,
|
193 |
arc:85.2,
|
|
|
194 |
parameters: '20B - 175B (not confirmed)',
|
195 |
organization: 'OpenAI',
|
196 |
license: 'Proprietary',
|
@@ -205,6 +210,7 @@
|
|
205 |
truthfulqa: null,
|
206 |
hellaswag:null,
|
207 |
arc:null,
|
|
|
208 |
parameters: '20B - 175B (not confirmed)',
|
209 |
organization: 'OpenAI',
|
210 |
license: 'Proprietary',
|
@@ -219,6 +225,7 @@
|
|
219 |
truthfulqa: null,
|
220 |
hellaswag:null,
|
221 |
arc:null,
|
|
|
222 |
parameters: '137B',
|
223 |
organization: 'Anthropic',
|
224 |
license: 'Proprietary',
|
@@ -233,6 +240,7 @@
|
|
233 |
truthfulqa: 69,
|
234 |
hellaswag:null,
|
235 |
arc:91,
|
|
|
236 |
parameters: '137B',
|
237 |
organization: 'Anthropic',
|
238 |
license: 'Proprietary',
|
@@ -247,6 +255,7 @@
|
|
247 |
truthfulqa: null,
|
248 |
hellaswag:null,
|
249 |
arc:null,
|
|
|
250 |
parameters: null,
|
251 |
organization: 'Anthropic',
|
252 |
license: 'Proprietary',
|
@@ -261,6 +270,7 @@
|
|
261 |
truthfulqa: null,
|
262 |
hellaswag:null,
|
263 |
arc:null,
|
|
|
264 |
parameters: null,
|
265 |
organization: 'Anthropic',
|
266 |
license: 'Proprietary',
|
@@ -275,6 +285,7 @@
|
|
275 |
truthfulqa: null,
|
276 |
hellaswag:87.8,
|
277 |
arc:null,
|
|
|
278 |
parameters: null,
|
279 |
organization: 'Google',
|
280 |
license: 'Proprietary',
|
@@ -289,6 +300,7 @@
|
|
289 |
truthfulqa: null,
|
290 |
hellaswag:84.7,
|
291 |
arc:null,
|
|
|
292 |
parameters: null,
|
293 |
organization: 'Google',
|
294 |
license: 'Proprietary',
|
@@ -303,6 +315,7 @@
|
|
303 |
truthfulqa: null,
|
304 |
hellaswag:null,
|
305 |
arc:null,
|
|
|
306 |
parameters: null,
|
307 |
organization: 'Mistral',
|
308 |
license: 'Proprietary',
|
@@ -317,6 +330,7 @@
|
|
317 |
truthfulqa: 46.7,
|
318 |
hellaswag:86.7,
|
319 |
arc:70.14,
|
|
|
320 |
parameters: '45B (MOE)',
|
321 |
organization: 'Mistral',
|
322 |
license: 'Apache 2.0',
|
@@ -331,6 +345,7 @@
|
|
331 |
truthfulqa: null,
|
332 |
hellaswag:null,
|
333 |
arc:null,
|
|
|
334 |
parameters: "33B",
|
335 |
organization: 'xAI',
|
336 |
license: 'Proprietary',
|
@@ -345,10 +360,41 @@
|
|
345 |
truthfulqa: 56.23,
|
346 |
hellaswag:85.69,
|
347 |
arc:64.59,
|
|
|
348 |
parameters: '34B',
|
349 |
organization: '01 AI',
|
350 |
license: 'Yi License',
|
351 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
352 |
]
|
353 |
|
354 |
function setBenchmarkTable(data) {
|
@@ -363,6 +409,7 @@
|
|
363 |
'<th>TruthfulQA</th>' +
|
364 |
'<th>HellaSwag</th>' +
|
365 |
'<th>ARC</th>' +
|
|
|
366 |
'<th>Parameters</th>' +
|
367 |
'<th>Organization</th>' +
|
368 |
'<th>License</th>' +
|
@@ -379,6 +426,7 @@
|
|
379 |
'<td>' + item.truthfulqa + '</td>' +
|
380 |
'<td>' + item.hellaswag + '</td>' +
|
381 |
'<td>' + item.arc + '</td>' +
|
|
|
382 |
'<td>' + item.parameters + '</td>' +
|
383 |
'<td>' + item.organization + '</td>' +
|
384 |
'<td>' + item.license + '</td>' +
|
|
|
135 |
truthfulqa: null,
|
136 |
hellaswag:null,
|
137 |
arc:null,
|
138 |
+
nothallucination: 97.0,
|
139 |
parameters: 'Probably smaller than GPT-4',
|
140 |
organization: 'OpenAI',
|
141 |
license: 'Proprietary',
|
|
|
150 |
truthfulqa: null,
|
151 |
hellaswag:null,
|
152 |
arc:null,
|
153 |
+
nothallucination: 97.0,
|
154 |
parameters: '1T (questionable)',
|
155 |
organization: 'OpenAI',
|
156 |
license: 'Proprietary',
|
|
|
165 |
truthfulqa: 59,
|
166 |
hellaswag:95.4,
|
167 |
arc:96.3,
|
168 |
+
nothallucination: 97.0,
|
169 |
parameters: '1T (questionable)',
|
170 |
organization: 'OpenAI',
|
171 |
license: 'Proprietary',
|
|
|
180 |
truthfulqa: null,
|
181 |
hellaswag:null,
|
182 |
arc:null,
|
183 |
+
nothallucination: 96.5,
|
184 |
parameters: '20B - 175B (not confirmed)',
|
185 |
organization: 'OpenAI',
|
186 |
license: 'Proprietary',
|
|
|
195 |
truthfulqa: 47,
|
196 |
hellaswag:85.5,
|
197 |
arc:85.2,
|
198 |
+
nothallucination: 96.5,
|
199 |
parameters: '20B - 175B (not confirmed)',
|
200 |
organization: 'OpenAI',
|
201 |
license: 'Proprietary',
|
|
|
210 |
truthfulqa: null,
|
211 |
hellaswag:null,
|
212 |
arc:null,
|
213 |
+
nothallucination: 96.5,
|
214 |
parameters: '20B - 175B (not confirmed)',
|
215 |
organization: 'OpenAI',
|
216 |
license: 'Proprietary',
|
|
|
225 |
truthfulqa: null,
|
226 |
hellaswag:null,
|
227 |
arc:null,
|
228 |
+
nothallucination: 91.5,
|
229 |
parameters: '137B',
|
230 |
organization: 'Anthropic',
|
231 |
license: 'Proprietary',
|
|
|
240 |
truthfulqa: 69,
|
241 |
hellaswag:null,
|
242 |
arc:91,
|
243 |
+
nothallucination: 91.5,
|
244 |
parameters: '137B',
|
245 |
organization: 'Anthropic',
|
246 |
license: 'Proprietary',
|
|
|
255 |
truthfulqa: null,
|
256 |
hellaswag:null,
|
257 |
arc:null,
|
258 |
+
nothallucination: null,
|
259 |
parameters: null,
|
260 |
organization: 'Anthropic',
|
261 |
license: 'Proprietary',
|
|
|
270 |
truthfulqa: null,
|
271 |
hellaswag:null,
|
272 |
arc:null,
|
273 |
+
nothallucination: null,
|
274 |
parameters: null,
|
275 |
organization: 'Anthropic',
|
276 |
license: 'Proprietary',
|
|
|
285 |
truthfulqa: null,
|
286 |
hellaswag:87.8,
|
287 |
arc:null,
|
288 |
+
nothallucination: null,
|
289 |
parameters: null,
|
290 |
organization: 'Google',
|
291 |
license: 'Proprietary',
|
|
|
300 |
truthfulqa: null,
|
301 |
hellaswag:84.7,
|
302 |
arc:null,
|
303 |
+
nothallucination: 95.2,
|
304 |
parameters: null,
|
305 |
organization: 'Google',
|
306 |
license: 'Proprietary',
|
|
|
315 |
truthfulqa: null,
|
316 |
hellaswag:null,
|
317 |
arc:null,
|
318 |
+
nothallucination: null,
|
319 |
parameters: null,
|
320 |
organization: 'Mistral',
|
321 |
license: 'Proprietary',
|
|
|
330 |
truthfulqa: 46.7,
|
331 |
hellaswag:86.7,
|
332 |
arc:70.14,
|
333 |
+
nothallucination: 90.7,
|
334 |
parameters: '45B (MOE)',
|
335 |
organization: 'Mistral',
|
336 |
license: 'Apache 2.0',
|
|
|
345 |
truthfulqa: null,
|
346 |
hellaswag:null,
|
347 |
arc:null,
|
348 |
+
nothallucination: null,
|
349 |
parameters: "33B",
|
350 |
organization: 'xAI',
|
351 |
license: 'Proprietary',
|
|
|
360 |
truthfulqa: 56.23,
|
361 |
hellaswag:85.69,
|
362 |
arc:64.59,
|
363 |
+
nothallucination: null,
|
364 |
parameters: '34B',
|
365 |
organization: '01 AI',
|
366 |
license: 'Yi License',
|
367 |
},
|
368 |
+
{
|
369 |
+
name: 'PPLX 70B Online',
|
370 |
+
mmlu: null,
|
371 |
+
mtbench: null,
|
372 |
+
arenaelo:1073,
|
373 |
+
gsm8k: null,
|
374 |
+
winogrande: null,
|
375 |
+
truthfulqa: null,
|
376 |
+
hellaswag:null,
|
377 |
+
arc:null,
|
378 |
+
nothallucination: null,
|
379 |
+
parameters: '70B',
|
380 |
+
organization: 'Perplexity AI',
|
381 |
+
license: 'Proprietary',
|
382 |
+
},
|
383 |
+
{
|
384 |
+
name: 'Llama 70B Chat',
|
385 |
+
mmlu: 63,
|
386 |
+
mtbench: 6.86,
|
387 |
+
arenaelo:1079,
|
388 |
+
gsm8k: null,
|
389 |
+
winogrande: null,
|
390 |
+
truthfulqa: null,
|
391 |
+
hellaswag:null,
|
392 |
+
arc:null,
|
393 |
+
nothallucination: 94.9,
|
394 |
+
parameters: '70B',
|
395 |
+
organization: 'Perplexity AI',
|
396 |
+
license: 'Proprietary',
|
397 |
+
},
|
398 |
]
|
399 |
|
400 |
function setBenchmarkTable(data) {
|
|
|
409 |
'<th>TruthfulQA</th>' +
|
410 |
'<th>HellaSwag</th>' +
|
411 |
'<th>ARC</th>' +
|
412 |
+
'<th>Not hallucination</th>' +
|
413 |
'<th>Parameters</th>' +
|
414 |
'<th>Organization</th>' +
|
415 |
'<th>License</th>' +
|
|
|
426 |
'<td>' + item.truthfulqa + '</td>' +
|
427 |
'<td>' + item.hellaswag + '</td>' +
|
428 |
'<td>' + item.arc + '</td>' +
|
429 |
+
'<td>' + item.nothallucination + '%'+ '</td>' +
|
430 |
'<td>' + item.parameters + '</td>' +
|
431 |
'<td>' + item.organization + '</td>' +
|
432 |
'<td>' + item.license + '</td>' +
|