|
<!DOCTYPE html> |
|
<html lang="en"> |
|
<head> |
|
<meta charset="UTF-8"> |
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
<title>Guerra LLM Ranking</title> |
|
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script> |
|
|
|
<style>
  body {
    font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
    color: hsl(0, 0%, 25%);
  }

  table {
    width: 100%;
  }

  table, th, td {
    border: 1px solid;
    border-color: hsl(0, 0%, 60%);
    border-collapse: collapse;
  }

  th, td {
    padding: 6px;
    text-align: left;
  }
</style>

</head>

<body>
|
<div><canvas id="radarChart" height="750"></canvas></div> |
|
<div><canvas id="mmluChart" height="150"></canvas></div> |
|
<div><canvas id="gsm8kChart" height="150"></canvas></div> |
|
<div><canvas id="arenaeloChart" height="150"></canvas></div> |
|
<div><canvas id="nothallucinationChart" height="150"></canvas></div> |
|
<div><canvas id="truthfulqaChart" height="150"></canvas></div> |
|
<div><canvas id="hellaSwagChart" height="150"></canvas></div> |
|
<div><canvas id="winograndeChart" height="150"></canvas></div> |
|
<div><canvas id="arcChart" height="150"></canvas></div> |
|
<div><canvas id="mtbenchChart" height="150"></canvas></div> |
|
<div><canvas id="alpacaevalChart" height="150"></canvas></div> |
|
<p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p> |
|
<p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p> |
|
<p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p> |
|
<p>Vectara's Hallucination Evaluation Model. This evaluates how often an LLM introduces hallucinations when summarizing a document.</p> |
|
<div id="tableBenchMark"></div> |
|
<h4>Best models for solving math problems:</h4> |
|
<ul> |
|
<li>gpt-4o-2024-05-13</li> |
|
<li>gpt-4-Turbo-2024-04-09</li> |
|
<li>gpt-4-0125-preview (turbo)</li> |
|
<li>gpt-4-1106-preview (turbo)</li> |
|
<li>gpt-4-0613</li> |
|
<li>gpt-4-0314</li> |
|
<li>Gemini Ultra 1.0</li> |
|
<li>Gemini Pro 1.5</li> |
|
<li>Gemini Advanced</li> |
|
<li>Claude 3 Opus</li> |
|
<li>Claude 3 Sonnet</li> |
|
</ul> |
|
<h4>Best models for large text:</h4> |
|
<ul> |
|
<li>gpt-4o-2024-05-13</li> |
|
<li>gpt-4-Turbo-2024-04-09</li> |
|
<li>gpt-4-0125-preview (turbo)</li> |
|
<li>gpt-4-1106-preview (turbo)</li> |
|
<li>Gemini Ultra 1.0</li> |
|
<li>Gemini Pro 1.5</li> |
|
<li>Gemini Advanced</li> |
|
<li>Claude 3 Opus</li> |
|
<li>Claude 3 Sonnet</li> |
|
<li>Claude 3 Haiku</li> |
|
<li>Claude 2-2.1</li> |
|
<li>Claude Instant 1-1.2</li> |
|
</ul> |
|
<h4>Models with the best cost benefit:</h4> |
|
<ul> |
|
<li>gpt-4o-2024-05-13</li> |
|
<li>Gemini Pro 1.5</li> |
|
<li>gpt-3.5-turbo-0125</li> |
|
<li>gpt-3.5-turbo-0613</li> |
|
<li>Claude 3 Haiku</li> |
|
<li>Meta Llama 3 70B Instruct</li> |
|
</ul> |
|
<h4>Models with fewer hallucinations:</h4> |
|
<ul> |
|
<li>gpt-4o-2024-05-13</li> |
|
<li>gpt-4-0125-preview (turbo)</li> |
|
<li>gpt-4-1106-preview (turbo)</li> |
|
<li>gpt-4-0613</li> |
|
<li>gpt-4-0314</li> |
|
<li>Gemini Ultra 1.0</li> |
|
<li>Gemini Pro 1.5</li> |
|
<li>Claude 2.1</li> |
|
<li>Snowflake Arctic Instruct</li> |
|
<li>Intel Neural Chat 7B</li> |
|
</ul> |
|
<h4>Models with a high level of hallucinations:</h4> |
|
<ul> |
|
<li>Gemma 1-1.1 7B</li> |
|
<li>DBRX Instruct</li> |
|
<li>Microsoft Phi 2</li> |
|
<li>Mistral 7B</li> |
|
<li>Google Palm 2</li> |
|
<li>Mixtral 8x7B Instruct</li> |
|
<li>Yi 34B</li> |
|
</ul> |
|
<h4>Open Models:</h4> |
|
<ul> |
|
<li>Mixtral 8x7B Instruct</li> |
|
<li>Mistral 7B</li> |
|
<li>Phi-3</li> |
|
<li>Yi 34B</li> |
|
<li>Grok 1</li> |
|
<li>DBRX Instruct</li> |
|
<li>Llama 3 8-70B</li> |
|
<li>Gemma 2-7B</li> |
|
</ul> |
|
<h4>Can be trained in online service:</h4> |
|
<ul> |
|
<li>gpt-3.5-turbo-1106</li> |
|
<li>gpt-3.5-turbo-0613</li> |
|
<li>gpt-4-0613</li> |
|
</ul> |
|
<h4>Can be trained locally:</h4> |
|
<ul> |
|
<li>Llama 3 8-70B</li> |
|
<li>Mixtral 8x7B Instruct</li> |
|
<li>Yi 34B</li> |
|
</ul> |
|
<h4>Has widely available api service:</h4> |
|
<ul> |
|
<li>gpt-4-0125-preview (turbo) - OpenAI</li> |
|
<li>gpt-4-1106-preview (turbo) - OpenAI</li> |
|
<li>gpt-4-0613 - OpenAI</li> |
|
<li>gpt-4-0314 - OpenAI</li> |
|
<li>gpt-3.5-turbo-1106 - OpenAI</li> |
|
<li>gpt-4-0314 - OpenAI</li> |
|
<li>Gemini Pro 1.0-1.5 - Openrouter with compatibility with OpenAI api, Google api service.</li> |
|
<li>Claude 3 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li> |
|
<li>Claude 2-2.1 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li> |
|
<li>Claude Instant 1-1.2 - Openrouter with compatibility with OpenAI api, Anthropic api service.</li> |
|
<li>Mistral Medium - Openrouter with compatibility with OpenAI api, Mistral service has a waiting list.</li> |
|
<li>Mixtral 8x7B Instruct - Deepinfra with compatibility with OpenAI api.</li> |
|
<li>Yi 34B - Deepinfra with compatibility with OpenAI api.</li> |
|
</ul> |
|
<h4>Models with the same level of GPT-4 Turbo:</h4> |
|
<ul> |
|
<li>Claude 3 Opus</li> |
|
</ul> |
|
<h4>Models with the same level of GPT-4 but lower than GPT-4 Turbo:</h4> |
|
<ul> |
|
<li>Gemini Ultra 1.0</li> |
|
<li>Gemini Pro 1.5</li> |
|
<li>Gemini Advanced</li> |
|
<li>Gemini Pro (Bard/Online)</li> |
|
<li>Claude 3 Sonnet</li> |
|
</ul> |
|
<h4>Models with the same level or better than GPT-3.5 but lower than GPT-4:</h4> |
|
<ul> |
|
<li>Claude 3 Haiku</li> |
|
<li>Claude 2-2.1</li> |
|
<li>Claude 1</li> |
|
<li>Claude Instant 1-1.2</li> |
|
<li>Phi-3 Medium</li> |
|
<li>Llama 3 70B Instruct</li> |
|
<li>Gemini-1.5-Flash-API-0514</li> |
|
<li>Command R+</li> |
|
</ul> |
|
<h4>Versions of models already surpassed by fine-tune, new versions or new architectures:</h4> |
|
<ul> |
|
<li>gpt-4-0613</li> |
|
<li>gpt-4-0314</li> |
|
<li>Gemini Pro 1.0</li> |
|
<li>Grok 1</li> |
|
<li>Phi-2</li> |
|
<li>DBRX Instruct</li> |
|
<li>Mistral Medium</li> |
|
<li>Gemma 1.0 7B</li> |
|
<li>Zephyr-ORPO-141b-A35b-v0.1</li> |
|
<li>Yi 1.0 34B</li> |
|
<li>gpt-4-0613</li> |
|
<li>gpt-4-0314</li> |
|
<li>Claude 2-2.1</li> |
|
<li>Claude Instant 1-1.2</li> |
|
<li>Qwen 1.0</li> |
|
<li>Falcon 180B</li> |
|
<li>Llama 1 and Llama 2</li> |
|
<li>Guanaco 65B</li> |
|
<li>Palm 2 Chat Bison</li> |
|
<li>Dolly V2</li> |
|
<li>Alpaca</li> |
|
<li>CodeLlama-34b-Instruct-hf</li> |
|
<li>SOLAR-10.7B-Instruct-v1.0</li> |
|
<li>Mistral-7B-v0.2</li> |
|
<li>Mistral-7B-v0.1</li> |
|
<li>MythoMax-L2</li> |
|
<li>Zephyr 7B Alpha and Beta</li> |
|
<li>Airoboros 70b</li> |
|
<li>OpenChat-3.5-1210</li> |
|
<li>StableLM Tuned Alpha</li> |
|
<li>Stable Beluga 2</li> |
|
</ul> |
|
<h4>Best OpenAI Models:</h4> |
|
<ul> |
|
<li>gpt-4o-2024-05-13</li> |
|
<li>gpt-4-Turbo-2024-04-09</li> |
|
<li>gpt-4-0125-preview (turbo)</li> |
|
<li>gpt-4-1106-preview (turbo)</li> |
|
<li>gpt-3.5-turbo-0613</li> |
|
<li>gpt-3.5-turbo-0125</li> |
|
</ul> |
|
<h4>API services:</h4> |
|
<ul> |
|
<li>Openrouter</li> |
|
<li>OpenAI</li> |
|
<li>Google Cloud</li> |
|
<li>Anthropic</li> |
|
<li>Azure</li> |
|
<li>Deepinfra</li> |
|
<li>Together</li> |
|
<li>OctoAI</li> |
|
<li>Lepton</li> |
|
<li>Fireworks</li> |
|
<li>Perplexity</li> |
|
<li>Groq</li> |
|
<li>Mistral</li> |
|
<li>NovitaAI</li> |
|
<li>Cohere</li> |
|
<li>DeepSeek</li> |
|
</ul> |
|
|
|
|
|
<script> |
|
const benchmarkData = [ |
|
{ |
|
name: 'GPT-4o-2024-05-13', |
|
mmlu: 88.7, |
|
mtbench: null, |
|
arenaelo:1287, |
|
gsm8k: null, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: 57.5, |
|
parameters: 'Unknown',
|
organization: 'OpenAI', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'gpt-4-0125-preview (turbo)', |
|
mmlu: null, |
|
mtbench: null, |
|
arenaelo:1249, |
|
gsm8k: null, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: null, |
|
parameters: 'Probably smaller than GPT-4', |
|
organization: 'OpenAI', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'gpt-4-1106-preview (turbo)', |
|
mmlu: null, |
|
mtbench: 9.32, |
|
arenaelo:1252, |
|
gsm8k: 95.3, |
|
winogrande: 81.8, |
|
truthfulqa: 75.7, |
|
hellaswag:92.7, |
|
arc:94.2, |
|
nothallucination: 97.0, |
|
alpacaeval: 50, |
|
parameters: 'Probably smaller than GPT-4', |
|
organization: 'OpenAI', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'gpt-4-0613', |
|
mmlu: null, |
|
mtbench: 9.18, |
|
arenaelo:1160, |
|
gsm8k: 96.8, |
|
winogrande: 87.1, |
|
truthfulqa: 79.7, |
|
hellaswag:91.9, |
|
arc:94.6, |
|
nothallucination: 97.0, |
|
alpacaeval: 30.2, |
|
parameters: '1T (questionable)', |
|
organization: 'OpenAI', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'gpt-4-0314', |
|
mmlu: 86.4, |
|
mtbench: 8.96, |
|
arenaelo:1185, |
|
gsm8k: 92, |
|
winogrande: 87.5, |
|
truthfulqa: 59, |
|
hellaswag:95.4, |
|
arc:96.3, |
|
nothallucination: 97.0, |
|
alpacaeval: 35.3, |
|
parameters: '1T (questionable)', |
|
organization: 'OpenAI', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'gpt-3.5-turbo-0613', |
|
mmlu: null, |
|
mtbench: 8.39, |
|
arenaelo:1115, |
|
gsm8k: null, |
|
winogrande: 55.3, |
|
truthfulqa: 61.4, |
|
hellaswag:79.4, |
|
arc:81.7, |
|
nothallucination: 96.5, |
|
alpacaeval: 22.7, |
|
parameters: '20B - 175B (not confirmed)', |
|
organization: 'OpenAI', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'gpt-3.5-turbo-0301', |
|
mmlu: 70, |
|
mtbench: 7.94, |
|
arenaelo:1103, |
|
gsm8k: 57.1, |
|
winogrande: 81.6, |
|
truthfulqa: 47, |
|
hellaswag:85.5, |
|
arc:85.2, |
|
nothallucination: 96.5, |
|
alpacaeval: 18.1, |
|
parameters: '20B - 175B (not confirmed)', |
|
organization: 'OpenAI', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'gpt-3.5-turbo-1106', |
|
mmlu: null, |
|
mtbench: 8.32, |
|
arenaelo:1069, |
|
gsm8k: null, |
|
winogrande: 54, |
|
truthfulqa: 60.7, |
|
hellaswag:60.8, |
|
arc:79.1, |
|
nothallucination: 96.5, |
|
alpacaeval: 19.3, |
|
parameters: '20B - 175B (not confirmed)', |
|
organization: 'OpenAI', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Claude 3 Opus', |
|
mmlu: 86.8, |
|
mtbench: null, |
|
arenaelo:1249, |
|
gsm8k: 95.0, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:95.4, |
|
arc:96.4, |
|
nothallucination: 92.6, |
|
alpacaeval: 40.4, |
|
parameters: null, |
|
organization: 'Anthropic', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Claude 3 Sonnet', |
|
mmlu: 79.0, |
|
mtbench: null, |
|
arenaelo:1200, |
|
gsm8k: 92.3, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:89.0, |
|
nothallucination: 94, |
|
alpacaeval: 34.9, |
|
parameters: null, |
|
organization: 'Anthropic', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Claude 3 Haiku', |
|
mmlu: 75.2, |
|
mtbench: null, |
|
arenaelo:1177, |
|
gsm8k: 88.9, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:85.9, |
|
nothallucination: 92.4, |
|
alpacaeval: null, |
|
parameters: null, |
|
organization: 'Anthropic', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Claude 2.1', |
|
mmlu: null, |
|
mtbench: 8.18, |
|
arenaelo:1116, |
|
gsm8k: 88, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: 91.5, |
|
alpacaeval: 25.3, |
|
parameters: '137B', |
|
organization: 'Anthropic', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Claude 2.0', |
|
mmlu: 78.5, |
|
mtbench: 8.06, |
|
arenaelo:1127, |
|
gsm8k: 71.2, |
|
winogrande: null, |
|
truthfulqa: 69, |
|
hellaswag:null, |
|
arc:91, |
|
nothallucination: 91.5, |
|
alpacaeval: 28.2, |
|
parameters: '137B', |
|
organization: 'Anthropic', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Claude 1.0', |
|
mmlu: 77, |
|
mtbench: 7.9, |
|
arenaelo:1146, |
|
gsm8k: null, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: 27.3, |
|
parameters: null, |
|
organization: 'Anthropic', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Claude Instant 1', |
|
mmlu: 73.4, |
|
mtbench: 7.85, |
|
arenaelo:1105, |
|
gsm8k: 86.7, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: null, |
|
parameters: null, |
|
organization: 'Anthropic', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Gemini-Advanced-0514', |
|
mmlu: null, |
|
mtbench: null, |
|
arenaelo:1267, |
|
gsm8k: null, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: null, |
|
parameters: null, |
|
organization: 'Google', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Gemini-1.5-Flash-API-0514', |
|
mmlu: 78.9, |
|
mtbench: null, |
|
arenaelo:1230, |
|
gsm8k: null, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: null, |
|
parameters: null, |
|
organization: 'Google', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Gemini-1.5-Pro-API-0514', |
|
mmlu: 85.9, |
|
mtbench: null, |
|
arenaelo:1265, |
|
gsm8k: null, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: null, |
|
parameters: null, |
|
organization: 'Google', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Gemini-1.5-Pro-API-0409-Preview', |
|
mmlu: 81.9, |
|
mtbench: null, |
|
arenaelo:1258, |
|
gsm8k: 91.7, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:92.5, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: null, |
|
parameters: null, |
|
organization: 'Google', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Gemini Ultra', |
|
mmlu: 83.7, |
|
mtbench: null, |
|
arenaelo:null, |
|
gsm8k: 88.9, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:87.8, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: null, |
|
parameters: null, |
|
organization: 'Google', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Gemini Pro Online', |
|
mmlu: null, |
|
mtbench: null, |
|
arenaelo:1204, |
|
gsm8k: null, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: null, |
|
parameters: null, |
|
organization: 'Google', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Gemini Pro', |
|
mmlu: 71.8, |
|
mtbench: null, |
|
arenaelo:1127, |
|
gsm8k: 77.9, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:84.7, |
|
arc:null, |
|
nothallucination: 95.2, |
|
alpacaeval: 24.4, |
|
parameters: null, |
|
organization: 'Google', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Mistral Large', |
|
mmlu: 81.2, |
|
mtbench: null, |
|
arenaelo:1157, |
|
gsm8k: null, |
|
winogrande: 86.7, |
|
truthfulqa: 50.5, |
|
hellaswag:89.2, |
|
arc:94.2, |
|
nothallucination: null, |
|
alpacaeval: 32.7, |
|
parameters: null, |
|
organization: 'Mistral', |
|
license: 'Proprietary', |
|
}, |
|
{ |
|
name: 'Mistral Medium', |
|
mmlu: 75.3, |
|
mtbench: 8.61, |
|
arenaelo:1146, |
|
gsm8k: null, |
|
winogrande: null, |
|
truthfulqa: null, |
|
hellaswag:null, |
|
arc:null, |
|
nothallucination: null, |
|
alpacaeval: 28.6, |
|
parameters: null, |
|
organization: 'Mistral', |
|
license: 'Proprietary', |
|
}, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{ |
|
name: 'Meta Llama 3 70B Instruct', |
|
mmlu: 80.06, |
|
mtbench: null, |
|
arenaelo:1207, |
|
gsm8k: 85.44, |
|
winogrande: 82.87, |
|
truthfulqa: 61.81, |
|
hellaswag:85.69, |
|
arc:71.42, |
|
nothallucination: 95.5, |
|
alpacaeval: 34.4, |
|
parameters: '70B', |
|
organization: 'Meta', |
|
license: 'Open Model', |
|
}, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
] |
|
|
|
// Formats a single benchmark value for a table cell.
// The data uses null for "not measured"; render that as '-' instead of the
// literal string "null" (the original code printed "null" and even "null%").
function formatBenchmarkValue(value, suffix) {
  return (value === null || value === undefined) ? '-' : value + (suffix || '');
}

// Builds the benchmark <table> as an HTML string for the given entries.
// Pure function (no DOM access) so it can be tested in isolation.
// Column order here must match the row order below.
function buildBenchmarkTableHTML(data) {
  let tableHTML = '<table border="1">' +
    '<tr>' +
    '<th>Name</th>' +
    '<th>MMLU</th>' +
    '<th>MT-Bench</th>' +
    '<th>Arena Elo</th>' +
    '<th>GSM8k</th>' +
    '<th>Winogrande</th>' +
    '<th>TruthfulQA</th>' +
    '<th>HellaSwag</th>' +
    '<th>ARC</th>' +
    '<th>AlpacaEval</th>' +
    '<th>Not hallucination</th>' +
    '<th>Parameters</th>' +
    '<th>Organization</th>' +
    '<th>License</th>' +
    '</tr>';

  data.forEach(function (item) {
    tableHTML += '<tr>' +
      '<td>' + item.name + '</td>' +
      '<td>' + formatBenchmarkValue(item.mmlu) + '</td>' +
      '<td>' + formatBenchmarkValue(item.mtbench) + '</td>' +
      '<td>' + formatBenchmarkValue(item.arenaelo) + '</td>' +
      '<td>' + formatBenchmarkValue(item.gsm8k) + '</td>' +
      '<td>' + formatBenchmarkValue(item.winogrande) + '</td>' +
      '<td>' + formatBenchmarkValue(item.truthfulqa) + '</td>' +
      '<td>' + formatBenchmarkValue(item.hellaswag) + '</td>' +
      '<td>' + formatBenchmarkValue(item.arc) + '</td>' +
      // AlpacaEval and the hallucination score are percentages.
      '<td>' + formatBenchmarkValue(item.alpacaeval, '%') + '</td>' +
      '<td>' + formatBenchmarkValue(item.nothallucination, '%') + '</td>' +
      '<td>' + formatBenchmarkValue(item.parameters) + '</td>' +
      '<td>' + item.organization + '</td>' +
      '<td>' + item.license + '</td>' +
      '</tr>';
  });

  return tableHTML + '</table>';
}

// Renders the benchmark comparison table into the #tableBenchMark container.
function setBenchmarkTable(data) {
  document.getElementById('tableBenchMark').innerHTML = buildBenchmarkTableHTML(data);
}
|
|
|
setBenchmarkTable(benchmarkData); |
|
|
|
// Returns the largest value of the named benchmark across all entries.
// Starts from 0, so entries whose value is null (or non-positive) never win
// the comparison — missing scores are effectively ignored.
function getBenchmarkMaxValue(benchmarkName, data) {
  let maxValue = 0;
  for (const entry of data) {
    const value = entry[benchmarkName];
    if (value > maxValue) {
      maxValue = value;
    }
  }
  return maxValue;
}
|
|
|
// Benchmarks plotted on the radar chart, in axis order. This order must match
// the `labels` array passed to the radar Chart below.
const RADAR_BENCHMARKS = [
  'mmlu', 'arenaelo', 'gsm8k', 'winogrande', 'truthfulqa',
  'hellaswag', 'arc', 'alpacaeval', 'nothallucination'
];

// Builds one Chart.js radar dataset per model, with every benchmark
// normalized to a 0-100 range by scaling against the best score seen for
// that benchmark. Replaces nine copy-pasted max/multiplier pairs with a loop.
function getDataSetRadar(data) {
  // Per-benchmark scale factor: best score maps to 100.
  const multipliers = {};
  for (const benchmark of RADAR_BENCHMARKS) {
    multipliers[benchmark] = 100 / getBenchmarkMaxValue(benchmark, data);
  }
  return data.map(item => ({
    label: item.name,
    // null * multiplier coerces to 0, matching the original behavior of
    // plotting missing scores at the origin.
    data: RADAR_BENCHMARKS.map(benchmark => item[benchmark] * multipliers[benchmark]),
    borderWidth: 2
  }));
}
|
// Build the radar datasets once; the original computed this constant and then
// never used it, calling getDataSetRadar a second time for the chart.
const dataSetRadar = getDataSetRadar(benchmarkData);

let data = {
  // Axis labels — order must match the benchmark order used in getDataSetRadar.
  labels: ['MMLU', 'Arena Elo', 'GSM8k', 'Winogrande', 'TruthfulQA', 'HellaSwag', 'ARC', 'AlpacaEval', 'Not Hallucination'],
  datasets: dataSetRadar
};

let options = {
  responsive: true,
  maintainAspectRatio: false,
  // Chart.js v4 (the CDN build in <head>) configures the radar's radial axis
  // under scales.r; the old top-level `scale` option was removed in v3 and
  // was being silently ignored.
  scales: {
    r: {
      ticks: {
        stepSize: 10
      }
    }
  }
};

let ctx = document.getElementById('radarChart').getContext('2d');
new Chart(ctx, {
  type: 'radar',
  data: data,
  options: options
});
|
|
|
|
|
// Renders a bar chart of one benchmark into the canvas with the given id.
// Models are shown in descending order of their score; models whose value is
// null for this benchmark are omitted from the chart.
function updateChart(id, benchmarkName) {
  // Sort a copy: the original sorted benchmarkData in place and assigned the
  // result to an undeclared `benchmarkData2` (an implicit global that would
  // throw in strict mode and reordered the shared array as a side effect).
  const sorted = [...benchmarkData].sort((a, b) => b[benchmarkName] - a[benchmarkName]);
  const entries = sorted.filter(item => item[benchmarkName] !== null);
  const labels = entries.map(item => item.name);
  const values = entries.map(item => item[benchmarkName]);

  const context = document.getElementById(id).getContext('2d');
  new Chart(context, {
    type: 'bar',
    data: {
      labels: labels,
      datasets: [{
        label: benchmarkName,
        data: values
      }]
    },
    options: {
      maintainAspectRatio: false
    }
  });
}
|
// Render one bar chart per benchmark metric; each id matches a <canvas> in <body>.
updateChart('mmluChart','mmlu');
updateChart('gsm8kChart','gsm8k');
updateChart('arenaeloChart','arenaelo');
updateChart('nothallucinationChart','nothallucination');
updateChart('truthfulqaChart','truthfulqa');
updateChart('hellaSwagChart','hellaswag');
updateChart('winograndeChart','winogrande');
updateChart('arcChart','arc');
updateChart('mtbenchChart','mtbench');
updateChart('alpacaevalChart','alpacaeval');
|
|
|
</script> |
|
</body> |
|
</html> |