guerra-llm-ai-leaderboard

Running

App Files Files Community

luisrguerra committed on Jan 14

Commit

512c89a

•

1 Parent(s): 3b7ce3b

Update index.html

Browse files

Files changed (1) hide show

index.html +328 -65

index.html CHANGED Viewed

@@ -7,74 +7,337 @@
     <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
 </head>
-<body>
-    <canvas id="radarChart" height="750"></canvas>
     <script>
         let data = {
-            labels: ['MMLU', 'MT-bench','Arena Elo'],
-            datasets: [
-              {
-                label: 'GPT-4-Turbo',
-                data: [null, 93.2,124.9],
-                borderWidth: 2
-              },
-              {
-                label: 'GPT-4-0314',
-                data: [86.4, 86.4,119.0],
-                borderWidth: 2
-              },
-              {
-                label: 'GPT-3.5-Turbo-0314',
-                data: [70.0, 79.4,112.3],
-                borderWidth: 2
-              },
-              {
-                label: 'Mistral Medium',
-                data: [75.3, 86.1,115.0],
-                borderWidth: 2
-              },
-              {
-                label: 'Mixtral 8x7B Instruct v0.1',
-                data: [70.6, 83.0,112.3],
-                borderWidth: 2
-              },
-              {
-                label: 'Claude 2.0',
-                data: [78.5, 80.6,113.1],
-                borderWidth: 2
-              },
-              {
-                label: 'Claude 1.0',
-                data: [77.0, 79.0,114.9],
-                borderWidth: 2
-              },
-              {
-                label: 'Claude Instant 1',
-                data: [73.4, 78.5,110.9],
-                borderWidth: 2
-              },
-              {
-                label: 'Gemini Pro',
-                data: [71.8, null,111.4],
-                borderWidth: 2
-              },
-              {
-                label: 'Yi 34B Chat',
-                data: [73.5, null,111.1],
-                borderWidth: 2
-              },
-              {
-                label: 'Falcon 180B Chat',
-                data: [68.0, null,103.1],
-                borderWidth: 2
-              },
-              {
-                label: 'LLama 2 70B Chat',
-                data: [63.0, 68.6,107.9],
-                borderWidth: 2
-              },
-            ]
         };
         let options = {

     <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.1/dist/chart.umd.min.js"></script>
 </head>
+<style>
+  /* Page-wide font stack and dark-grey text colour. */
+  body{
+    font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
+    color:hsl(0, 0%, 25%);
+  }
+  /* The table stretches across the full width of its container. */
+  table{
+    width: 100%;
+  }
+  /* Thin grey borders, collapsed so adjacent cells share a single line. */
+  table, th, td {
+    border: 1px solid;
+    border-color: hsl(0, 0%, 60%);
+    border-collapse: collapse;
+  }
+  /* Cell spacing and left-aligned content for header and data cells alike. */
+  th, td {
+    padding: 6px;
+    text-align: left;
+  }
+</style>
+<body>
+    <div><canvas id="radarChart" height="750"></canvas></div>
+    <p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
+    <p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
+    <div id="tableBenchMark"></div>
     <script>
+        // Benchmark records, one object per model.
+        // The numeric fields mmlu, mtbench, arenaelo, gsm8k, winogrande, truthfulqa,
+        // hellaswag and arc are read by both the table and the radar chart;
+        // parameters, organization and license are shown only in the table.
+        // null marks a value that is not recorded in this list.
+        const benchmarkData = [
+        {
+            name: 'gpt-4-1106-preview',
+            mmlu: null,
+            mtbench: 9.32,
+            arenaelo:1249,
+            gsm8k: 	null,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:null,
+            arc:null,
+            parameters: 'Probably smaller than GPT-4',
+            organization: 'OpenAI',
+            license: 'Proprietary',
+          },
+          {
+            name: 'gpt-4-0613',
+            mmlu: null,
+            mtbench: 9.18,
+            arenaelo:1160,
+            gsm8k: 	96.8,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:null,
+            arc:null,
+            parameters: '1T (questionable)',
+            organization: 'OpenAI',
+            license: 'Proprietary',
+          },
+          {
+            name: 'gpt-4-0314',
+            mmlu: 86.4,
+            mtbench: 8.96,
+            arenaelo:1190,
+            gsm8k: 92,
+            winogrande: 87.5,
+            truthfulqa: 59,
+            hellaswag:95.4,
+            arc:96.3,
+            parameters: '1T (questionable)',
+            organization: 'OpenAI',
+            license: 'Proprietary',
+          },
+          {
+            name: 'gpt-3.5-turbo-0613',
+            mmlu: null,
+            mtbench: 8.39,
+            arenaelo:1116,
+            gsm8k: null,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:null,
+            arc:null,
+            parameters: '20B - 175B (not confirmed)',
+            organization: 'OpenAI',
+            license: 'Proprietary',
+          },
+          {
+            name: 'gpt-3.5-turbo-0301',
+            mmlu: 70,
+            mtbench: 7.94,
+            arenaelo:1104,
+            gsm8k: 57.1,
+            winogrande: 81.6,
+            truthfulqa: 47,
+            hellaswag:85.5,
+            arc:85.2,
+            parameters: '20B - 175B (not confirmed)',
+            organization: 'OpenAI',
+            license: 'Proprietary',
+          },
+          {
+            name: 'Claude 2.1',
+            mmlu: null,
+            mtbench: 8.18,
+            arenaelo:1119,
+            gsm8k: 88,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:null,
+            arc:null,
+            parameters: '137B',
+            organization: 'Anthropic',
+            license: 'Proprietary',
+          },
+          {
+            name: 'Claude 2.0',
+            mmlu: 78.5,
+            mtbench: 8.06,
+            arenaelo:1131,
+            gsm8k: 71.2,
+            winogrande: null,
+            truthfulqa: 69,
+            hellaswag:null,
+            arc:91,
+            parameters: '137B',
+            organization: 'Anthropic',
+            license: 'Proprietary',
+          },
+          {
+            name: 'Claude 1.0',
+            mmlu: 77,
+            mtbench: 7.9,
+            arenaelo:1149,
+            gsm8k: null,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:null,
+            arc:null,
+            parameters: null,
+            organization: 'Anthropic',
+            license: 'Proprietary',
+          },
+          {
+            name: 'Claude Instant 1',
+            mmlu: 73.4,
+            mtbench: 7.85,
+            arenaelo:1109,
+            gsm8k: 86.7,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:null,
+            arc:null,
+            parameters: null,
+            organization: 'Anthropic',
+            license: 'Proprietary',
+          },
+          {
+            name: 'Gemini Ultra',
+            mmlu: 83.7,
+            mtbench: null,
+            arenaelo:null,
+            gsm8k: 94.4,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:87.8,
+            arc:null,
+            parameters: null,
+            organization: 'Google',
+            license: 'Proprietary',
+          },
+          {
+            name: 'Gemini Pro',
+            mmlu: 71.8,
+            mtbench: null,
+            arenaelo:1114,
+            gsm8k: 86.5,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:84.7,
+            arc:null,
+            parameters: null,
+            organization: 'Google',
+            license: 'Proprietary',
+          },
+          {
+            name: 'Mistral Medium',
+            mmlu: 75.3,
+            mtbench: 8.61,
+            arenaelo:1150,
+            gsm8k: null,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:null,
+            arc:null,
+            parameters: null,
+            organization: 'Mistral',
+            license: 'Proprietary',
+          },
+          {
+            name: 'Mixtral 8x7B Instruct',
+            mmlu: 70.6,
+            mtbench: 8.3,
+            arenaelo:1123,
+            gsm8k: 58.4,
+            winogrande: 81.2,
+            truthfulqa: 46.7,
+            hellaswag:86.7,
+            arc:70.14,
+            parameters: '45B (MOE)',
+            organization: 'Mistral',
+            license: 'Apache 2.0',
+          },
+          {
+            name: 'Grok 1',
+            mmlu: 73,
+            mtbench: null,
+            arenaelo:null,
+            gsm8k: 72.9,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:null,
+            arc:null,
+            parameters: "33B",
+            organization: 'xAI',
+            license: 'Proprietary',
+          },
+          {
+            name: 'Yi 34B',
+            mmlu: 73.5,
+            mtbench: null,
+            arenaelo:1111,
+            gsm8k: 50.64,
+            winogrande: 83.03,
+            truthfulqa: 56.23,
+            hellaswag:85.69,
+            arc:64.59,
+            parameters: '34B',
+            organization: '01 AI',
+            license: 'Yi License',
+          },
+        ]
+        function setBenchmarkTable(data) {
+          let tableHTML = '<table border="1">' +
+                              '<tr>' +
+                                  '<th>Name</th>' +
+                                  '<th>MMLU</th>' +
+                                  '<th>MT-Bench</th>' +
+                                  '<th>Arena Elo</th>' +
+                                  '<th>GSM-8k</th>' +
+                                  '<th>Winogrande</th>' +
+                                  '<th>TruthfulQA</th>' +
+                                  '<th>HellaSwag</th>' +
+                                  '<th>ARC</th>' +
+                                  '<th>Parameters</th>' +
+                                  '<th>Organization</th>' +
+                                  '<th>License</th>' +
+                              '</tr>';
+          data.forEach(function(item) {
+              tableHTML += '<tr>' +
+                              '<td>' + item.name + '</td>' +
+                              '<td>' + item.mmlu + '</td>' +
+                              '<td>' + item.mtbench + '</td>' +
+                              '<td>' + item.arenaelo + '</td>' +
+                              '<td>' + item.gsm8k + '</td>' +
+                              '<td>' + item.winogrande + '</td>' +
+                              '<td>' + item.truthfulqa + '</td>' +
+                              '<td>' + item.hellaswag + '</td>' +
+                              '<td>' + item.arc + '</td>' +
+                              '<td>' + item.parameters + '</td>' +
+                              '<td>' + item.organization + '</td>' +
+                              '<td>' + item.license + '</td>' +
+                           '</tr>';
+          });
+          tableHTML += '</table>';
+          document.getElementById('tableBenchMark').innerHTML = tableHTML;
+        }
+        setBenchmarkTable(benchmarkData);
+        function getBenchmarkMaxValue(benchmarkName,data) {
+          let maxValue = 0;
+          for (let i = 0; i < data.length; i++) {
+            if (data[i][benchmarkName] > maxValue) {
+              maxValue = data[i][benchmarkName];
+            }
+          }
+          return maxValue;
+        }
+        function getDataSetRadar(data) {
+          const mmluMaxValue = getBenchmarkMaxValue("mmlu",data);
+          const mmluMultiplier = 100/mmluMaxValue;
+          const mtbenchMaxValue = getBenchmarkMaxValue("mtbench",data);
+          const mtbenchMultiplier = 100/mtbenchMaxValue;
+          const arenaeloMaxValue = getBenchmarkMaxValue("arenaelo",data);
+          const arenaeloMultiplier = 100/arenaeloMaxValue;
+          const gsm8kMaxValue = getBenchmarkMaxValue("gsm8k",data);
+          const gsm8kMultiplier = 100/gsm8kMaxValue;
+          const winograndeMaxValue = getBenchmarkMaxValue("winogrande",data);
+          const winograndeMultiplier = 100/winograndeMaxValue;
+          const truthfulqaMaxValue = getBenchmarkMaxValue("truthfulqa",data);
+          const truthfulqaMultiplier = 100/truthfulqaMaxValue;
+          const hellaswagMaxValue = getBenchmarkMaxValue("hellaswag",data);
+          const hellaswagMultiplier = 100/hellaswagMaxValue;
+          const arcMaxValue = getBenchmarkMaxValue("arc",data);
+          const arcMultiplier = 100/arcMaxValue;
+          let dataset = [];
+          for (let i = 0; i < data.length; i++) {
+            dataset.push({
+              label: data[i].name,
+              data: [
+                (data[i].mmlu*mmluMultiplier),
+                (data[i].mtbench*mtbenchMultiplier),
+                (data[i].arenaelo*arenaeloMultiplier),
+                (data[i].gsm8k*gsm8kMultiplier),
+                (data[i].winogrande*winograndeMultiplier),
+                (data[i].truthfulqa*truthfulqaMultiplier),
+                (data[i].hellaswag*hellaswagMultiplier),
+                (data[i].arc*arcMultiplier),
+              ],
+              borderWidth: 2
+            })
+          }
+          return dataset;
+        }
+        // NOTE(review): dataSetRadar is not referenced in the lines visible here; the
+        // chart's datasets below come from a second getDataSetRadar call. Confirm
+        // whether this constant is used further down before removing either one.
+        const dataSetRadar = getDataSetRadar(benchmarkData);
         let data = {
+            // Axis labels, in the same order as the values in each dataset's data array.
+            labels: ['MMLU', 'MT-bench','Arena Elo','GSM-8k','Winogrande','TruthfulQA','HellaSwag','ARC'],
+            datasets: getDataSetRadar(benchmarkData)
         };
         let options = {