luisrguerra's picture
Update index.html
c302561 verified
raw
history blame
18.7 kB
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Guerra LLM Ranking</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
<style>
body{
font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
color:hsl(0, 0%, 25%);
}
table{
width: 100%;
}
table, th, td {
border: 1px solid;
border-color: hsl(0, 0%, 60%);
border-collapse: collapse;
}
th, td {
padding: 6px;
text-align: left;
}
</style>
</head>
<body>
<div><canvas id="radarChart" height="750"></canvas></div>
<p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
<p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
<p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p>
<p>Vectara's Hallucination Evaluation Model. This evaluates how often an LLM introduces hallucinations when summarizing a document.</p>
<div id="tableBenchMark"></div>
<h4>Best models for solving math problems:</h4>
<ul>
<li>gpt-4-0125-preview (turbo)</li>
<li>gpt-4-1106-preview (turbo)</li>
<li>gpt-4-0613</li>
<li>gpt-4-0314</li>
<li>Gemini Ultra 1.0</li>
<li>Gemini Pro 1.5</li>
<li>Claude 3 Opus</li>
</ul>
<h4>Best models for large text:</h4>
<ul>
<li>gpt-4-0125-preview (turbo)</li>
<li>gpt-4-1106-preview (turbo)</li>
<li>Gemini Ultra</li>
<li>Gemini Pro 1.5</li>
<li>Claude 3 Opus</li>
<li>Claude 3 Sonnet</li>
<li>Claude 3 Haiku</li>
<li>Claude 2-2.1</li>
<li>Claude Instant 1-1.2</li>
</ul>
<h4>Models with the best cost benefit:</h4>
<ul>
<li>Gemini Pro 1.0</li>
<li>Gemini Pro 1.5</li>
<li>gpt-3.5-turbo-0613</li>
<li>gpt-3.5-turbo-1106</li>
<li>Claude 3 Haiku</li>
<li>Claude Instant 1-1.2</li>
<li>Mixtral 8x7B Instruct</li>
</ul>
<h4>Models with fewer hallucinations:</h4>
<ul>
<li>gpt-4-0125-preview (turbo)</li>
<li>gpt-4-1106-preview (turbo)</li>
<li>gpt-4-0613</li>
<li>gpt-4-0314</li>
<li>Gemini Ultra 1.0</li>
<li>Gemini Pro 1.5</li>
<li>Claude 2.1</li>
<li>Intel Neural Chat 7B</li>
</ul>
<h4>Models with a high level of hallucinations:</h4>
<ul>
<li>Microsoft Phi 2</li>
<li>Mistral 7B</li>
<li>Google Palm 2</li>
<li>Mixtral 8x7B Instruct</li>
<li>Yi 34B</li>
</ul>
<h4>Open Models:</h4>
<ul>
<li>Mixtral 8x7B Instruct</li>
<li>Yi 34B</li>
</ul>
<h4>Can be trained in online service:</h4>
<ul>
<li>gpt-3.5-turbo-1106</li>
<li>gpt-3.5-turbo-0613</li>
<li>gpt-4-0613</li>
</ul>
<h4>Can be trained locally:</h4>
<ul>
<li>Mixtral 8x7B Instruct</li>
<li>Yi 34B</li>
</ul>
<h4>Has widely available api service:</h4>
<ul>
<li>gpt-4-0125-preview (turbo) - OpenAI</li>
<li>gpt-4-1106-preview (turbo) - OpenAI</li>
<li>gpt-4-0613 - OpenAI</li>
<li>gpt-4-0314 - OpenAI</li>
<li>gpt-3.5-turbo-1106 - OpenAI</li>
<li>gpt-3.5-turbo-0613 - OpenAI</li>
<li>Gemini Pro 1.0 - Openrouter with compatibility with OpenAI api, Google api service.</li>
<li>Claude 3 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
<li>Claude 2-2.1 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
<li>Claude Instant 1-1.2 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li>
<li>Mistral Medium - Openrouter with compatibility with OpenAI api, Mistral service has a waiting list.</li>
<li>Mixtral 8x7B Instruct - Deepinfra with compatibility with OpenAI api.</li>
<li>Yi 34B - Deepinfra with compatibility with OpenAI api.</li>
</ul>
<h4>Models with the same level of GPT-4:</h4>
<ul>
<li>Gemini Ultra</li>
<li>Gemini Pro 1.5</li>
<li>Gemini Pro (Bard/Online)</li>
<li>Claude 3 Opus</li>
</ul>
<h4>Models with the same level or better than GPT-3.5 but lower than GPT-4:</h4>
<ul>
<li>Gemini Pro</li>
<li>Claude 3 Sonnet</li>
<li>Claude 3 Haiku</li>
<li>Claude 2-2.1</li>
<li>Claude 1</li>
<li>Claude Instant 1-1.2</li>
<li>Mistral Medium</li>
</ul>
<h4>Versions of models already surpassed by fine-tune, new versions or new architectures:</h4>
<ul>
<li>gpt-4-0314</li>
<li>Claude 2-2.1</li>
<li>Claude Instant 1-1.2</li>
<li>Falcon 180B</li>
<li>Llama 1 and Llama 2</li>
<li>Guanaco 65B</li>
<li>Palm 2 Chat Bison</li>
<li>Dolly V2</li>
<li>Alpaca</li>
<li>CodeLlama-34b-Instruct-hf</li>
<li>Mistral-7B-v0.1</li>
<li>MythoMax-L2</li>
<li>Zephyr 7B Alpha and Beta</li>
<li>Airoboros 70b</li>
<li>OpenChat-3.5-1210</li>
<li>StableLM Tuned Alpha</li>
<li>Stable Beluga 2</li>
</ul>
<script>
// Benchmark scores per model. A `null` score means no published/known value.
// Field legend:
//   mmlu             - MMLU, 57-task knowledge benchmark (% correct)
//   mtbench          - MT-Bench score (0-10 scale, GPT-4-judged)
//   arenaelo         - Chatbot Arena Elo rating
//   gsm8k            - GSM8K grade-school math benchmark (% correct)
//   winogrande, truthfulqa, hellaswag, arc - standard LLM benchmarks (% correct)
//   nothallucination - Vectara hallucination eval: % of summaries WITHOUT hallucination
//   parameters       - reported/rumored parameter count (free-form string; may be unconfirmed)
//   organization     - model vendor
//   license          - license / availability class
const benchmarkData = [
{
name: 'gpt-4-0125-preview (turbo)',
mmlu: null,
mtbench: null,
arenaelo:1253,
gsm8k: null,
winogrande: null,
truthfulqa: null,
hellaswag:null,
arc:null,
nothallucination: null,
parameters: 'Probably smaller than GPT-4',
organization: 'OpenAI',
license: 'Proprietary',
},
{
name: 'gpt-4-1106-preview (turbo)',
mmlu: null,
mtbench: 9.32,
arenaelo:1254,
gsm8k: null,
winogrande: 81.8,
truthfulqa: 75.7,
hellaswag:92.7,
arc:94.2,
nothallucination: 97.0,
parameters: 'Probably smaller than GPT-4',
organization: 'OpenAI',
license: 'Proprietary',
},
{
name: 'gpt-4-0613',
mmlu: null,
mtbench: 9.18,
arenaelo:1160,
gsm8k: 96.8,
winogrande: 87.1,
truthfulqa: 79.7,
hellaswag:91.9,
arc:94.6,
nothallucination: 97.0,
parameters: '1T (questionable)',
organization: 'OpenAI',
license: 'Proprietary',
},
{
name: 'gpt-4-0314',
mmlu: 86.4,
mtbench: 8.96,
arenaelo:1190,
gsm8k: 92,
winogrande: 87.5,
truthfulqa: 59,
hellaswag:95.4,
arc:96.3,
nothallucination: 97.0,
parameters: '1T (questionable)',
organization: 'OpenAI',
license: 'Proprietary',
},
{
name: 'gpt-3.5-turbo-0613',
mmlu: null,
mtbench: 8.39,
arenaelo:1116,
gsm8k: null,
winogrande: 55.3,
truthfulqa: 61.4,
hellaswag:79.4,
arc:81.7,
nothallucination: 96.5,
parameters: '20B - 175B (not confirmed)',
organization: 'OpenAI',
license: 'Proprietary',
},
{
name: 'gpt-3.5-turbo-0301',
mmlu: 70,
mtbench: 7.94,
arenaelo:1104,
gsm8k: 57.1,
winogrande: 81.6,
truthfulqa: 47,
hellaswag:85.5,
arc:85.2,
nothallucination: 96.5,
parameters: '20B - 175B (not confirmed)',
organization: 'OpenAI',
license: 'Proprietary',
},
{
name: 'gpt-3.5-turbo-1106',
mmlu: null,
mtbench: 8.32,
arenaelo:1072,
gsm8k: null,
winogrande: 54,
truthfulqa: 60.7,
hellaswag:60.8,
arc:79.1,
nothallucination: 96.5,
parameters: '20B - 175B (not confirmed)',
organization: 'OpenAI',
license: 'Proprietary',
},
{
name: 'Claude 2.1',
mmlu: null,
mtbench: 8.18,
arenaelo:1119,
gsm8k: 88,
winogrande: null,
truthfulqa: null,
hellaswag:null,
arc:null,
nothallucination: 91.5,
parameters: '137B',
organization: 'Anthropic',
license: 'Proprietary',
},
{
name: 'Claude 2.0',
mmlu: 78.5,
mtbench: 8.06,
arenaelo:1131,
gsm8k: 71.2,
winogrande: null,
truthfulqa: 69,
hellaswag:null,
arc:91,
nothallucination: 91.5,
parameters: '137B',
organization: 'Anthropic',
license: 'Proprietary',
},
{
name: 'Claude 1.0',
mmlu: 77,
mtbench: 7.9,
arenaelo:1149,
gsm8k: null,
winogrande: null,
truthfulqa: null,
hellaswag:null,
arc:null,
nothallucination: null,
parameters: null,
organization: 'Anthropic',
license: 'Proprietary',
},
{
name: 'Claude Instant 1',
mmlu: 73.4,
mtbench: 7.85,
arenaelo:1109,
gsm8k: 86.7,
winogrande: null,
truthfulqa: null,
hellaswag:null,
arc:null,
nothallucination: null,
parameters: null,
organization: 'Anthropic',
license: 'Proprietary',
},
{
name: 'Gemini Pro 1.5',
mmlu: 81.9,
mtbench: null,
arenaelo:null,
gsm8k: 91.7,
winogrande: null,
truthfulqa: null,
hellaswag:92.5,
arc:null,
nothallucination: null,
parameters: null,
organization: 'Google',
license: 'Proprietary',
},
{
name: 'Gemini Ultra',
mmlu: 83.7,
mtbench: null,
arenaelo:null,
gsm8k: 88.9,
winogrande: null,
truthfulqa: null,
hellaswag:87.8,
arc:null,
nothallucination: null,
parameters: null,
organization: 'Google',
license: 'Proprietary',
},
{
name: 'Gemini Pro',
mmlu: 71.8,
mtbench: null,
arenaelo:1114,
gsm8k: 77.9,
winogrande: null,
truthfulqa: null,
hellaswag:84.7,
arc:null,
nothallucination: 95.2,
parameters: null,
organization: 'Google',
license: 'Proprietary',
},
{
name: 'Mistral Medium',
mmlu: 75.3,
mtbench: 8.61,
arenaelo:1150,
gsm8k: null,
winogrande: null,
truthfulqa: null,
hellaswag:null,
arc:null,
nothallucination: null,
parameters: null,
organization: 'Mistral',
license: 'Proprietary',
},
{
name: 'Mixtral 8x7B Instruct',
mmlu: 70.6,
mtbench: 8.3,
arenaelo:1123,
gsm8k: 58.4,
winogrande: 81.2,
truthfulqa: 46.7,
hellaswag:86.7,
arc:70.14,
nothallucination: 90.7,
parameters: '45B (MOE)',
organization: 'Mistral',
license: 'Apache 2.0',
},
{
// NOTE(review): the "33B" parameter count is unverified and looks low for
// Grok-1 — confirm against xAI's published figures.
name: 'Grok 1',
mmlu: 73,
mtbench: null,
arenaelo:null,
gsm8k: 72.9,
winogrande: null,
truthfulqa: null,
hellaswag:null,
arc:null,
nothallucination: null,
parameters: "33B",
organization: 'xAI',
license: 'Proprietary',
},
{
name: 'Yi 34B',
mmlu: 73.5,
mtbench: null,
arenaelo:1111,
gsm8k: 50.64,
winogrande: 83.03,
truthfulqa: 56.23,
hellaswag:85.69,
arc:64.59,
nothallucination: null,
parameters: '34B',
organization: '01 AI',
license: 'Yi License',
},
{
name: 'PPLX 70B Online',
mmlu: null,
mtbench: null,
arenaelo:1073,
gsm8k: null,
winogrande: null,
truthfulqa: null,
hellaswag:null,
arc:null,
nothallucination: null,
parameters: '70B',
organization: 'Perplexity AI',
license: 'Proprietary',
},
{
// NOTE(review): organization/license below look copy-pasted from the
// 'PPLX 70B Online' entry above — Llama 2 70B Chat is a Meta model under
// the Llama 2 Community License; verify and correct.
name: 'Llama 70B Chat',
mmlu: 63,
mtbench: 6.86,
arenaelo:1079,
gsm8k: null,
winogrande: null,
truthfulqa: null,
hellaswag:null,
arc:null,
nothallucination: 94.9,
parameters: '70B',
organization: 'Perplexity AI',
license: 'Proprietary',
},
]
/**
 * Render the benchmark comparison table into the #tableBenchMark element.
 *
 * Fix over the original: missing measurements (null/undefined) used to be
 * stringified as the literal text "null" (and "null%" for the hallucination
 * column); they are now shown as '-'.
 *
 * @param {Array<Object>} data - benchmark records (see benchmarkData fields).
 */
function setBenchmarkTable(data) {
  // Show '-' for absent values instead of stringifying null/undefined.
  const cell = (value) => (value ?? '-');
  // Append '%' only when a hallucination score is actually present.
  const percentCell = (value) => (value == null ? '-' : value + '%');
  // Header order must match the per-row cell order below.
  const headers = ['Name', 'MMLU', 'MT-Bench', 'Arena Elo', 'GSM8k',
    'Winogrande', 'TruthfulQA', 'HellaSwag', 'ARC', 'Not hallucination',
    'Parameters', 'Organization', 'License'];
  let tableHTML = '<table border="1"><tr>' +
    headers.map((h) => '<th>' + h + '</th>').join('') +
    '</tr>';
  data.forEach(function (item) {
    tableHTML += '<tr>' +
      '<td>' + cell(item.name) + '</td>' +
      '<td>' + cell(item.mmlu) + '</td>' +
      '<td>' + cell(item.mtbench) + '</td>' +
      '<td>' + cell(item.arenaelo) + '</td>' +
      '<td>' + cell(item.gsm8k) + '</td>' +
      '<td>' + cell(item.winogrande) + '</td>' +
      '<td>' + cell(item.truthfulqa) + '</td>' +
      '<td>' + cell(item.hellaswag) + '</td>' +
      '<td>' + cell(item.arc) + '</td>' +
      '<td>' + percentCell(item.nothallucination) + '</td>' +
      '<td>' + cell(item.parameters) + '</td>' +
      '<td>' + cell(item.organization) + '</td>' +
      '<td>' + cell(item.license) + '</td>' +
      '</tr>';
  });
  tableHTML += '</table>';
  document.getElementById('tableBenchMark').innerHTML = tableHTML;
}
// Render the benchmark table immediately when the script loads.
setBenchmarkTable(benchmarkData);
/**
 * Largest value of the given benchmark field across all records.
 * Records whose value is null/undefined (or not greater than the running
 * maximum) are skipped; returns 0 when no record qualifies.
 *
 * @param {string} benchmarkName - key to read from each record.
 * @param {Array<Object>} data - benchmark records.
 * @returns {number} maximum observed value, or 0.
 */
function getBenchmarkMaxValue(benchmarkName, data) {
  let maxValue = 0;
  for (const record of data) {
    const value = record[benchmarkName];
    if (value > maxValue) {
      maxValue = value;
    }
  }
  return maxValue;
}
/**
 * Build Chart.js radar datasets, normalizing each benchmark to a 0-100
 * scale where 100 is the best score observed for that benchmark in `data`.
 *
 * Refactor over the original: the eight copy-pasted max/multiplier pairs
 * are replaced by a single loop over a metric table. Behavior is unchanged,
 * including the quirk that a null score contributes 0 on that axis
 * (null * multiplier === 0).
 *
 * @param {Array<Object>} data - benchmark records.
 * @returns {Array<Object>} Chart.js dataset objects ({label, data, borderWidth}).
 */
function getDataSetRadar(data) {
  // Axis order must match the chart's `labels` array.
  const metrics = ['mmlu', 'mtbench', 'arenaelo', 'gsm8k',
    'winogrande', 'truthfulqa', 'hellaswag', 'arc'];
  // One normalization multiplier per metric: 100 / (best observed score).
  const multipliers = {};
  for (const metric of metrics) {
    multipliers[metric] = 100 / getBenchmarkMaxValue(metric, data);
  }
  return data.map((item) => ({
    label: item.name,
    data: metrics.map((metric) => item[metric] * multipliers[metric]),
    borderWidth: 2,
  }));
}
// Build the radar datasets once and reuse them. The original computed
// getDataSetRadar(benchmarkData) twice: once into this (unused) variable
// and a second time inline in `data.datasets`.
const dataSetRadar = getDataSetRadar(benchmarkData);
const data = {
  // Axis order must match the `metrics` order in getDataSetRadar.
  labels: ['MMLU', 'MT-bench', 'Arena Elo', 'GSM8k', 'Winogrande', 'TruthfulQA', 'HellaSwag', 'ARC'],
  datasets: dataSetRadar,
};
const options = {
  responsive: true,
  maintainAspectRatio: false,
  // Chart.js v4 (loaded from the CDN above) configures the radial axis via
  // `scales.r`; the v2-era `scale` option used originally is ignored by v4.
  scales: {
    r: {
      ticks: {
        stepSize: 10,
      },
    },
  },
};
const ctx = document.getElementById('radarChart').getContext('2d');
new Chart(ctx, {
  type: 'radar',
  data: data,
  options: options,
});
</script>
</body>
</html>