guerra-llm-ai-leaderboard

Running

App Files Community

luisrguerra committed on Jun 18

Commit

cc187e8

•

1 Parent(s): 04313bf

Update index.html

Browse files

Files changed (1) hide show

index.html +47 -16

index.html CHANGED Viewed

@@ -73,8 +73,7 @@
       <li>gpt-3.5-turbo-0125</li>
       <li>gpt-3.5-turbo-0613</li>
       <li>Claude 3 Haiku</li>
-      <li>Mixtral 8x7B Instruct</li>
-      <li>OpenChat</li>
     </ul>
     <h4>Models with fewer hallucinations:</h4>
     <ul>
@@ -183,6 +182,22 @@
     <script>
         const benchmarkData = [
         {
             name: 'gpt-4-0125-preview (turbo)',
             mmlu: null,
@@ -299,7 +314,7 @@
             name: 'Claude 3 Opus',
             mmlu: 86.8,
             mtbench: null,
-            arenaelo:1255,
             gsm8k: 95.0,
             winogrande: null,
             truthfulqa: null,
@@ -503,7 +518,7 @@
             organization: 'Mistral',
             license: 'Proprietary',
           },
-          {
             name: 'Mixtral 8x7B Instruct',
             mmlu: 70.6,
             mtbench: 8.3,
@@ -518,8 +533,8 @@
             parameters: '45B (MOE)',
             organization: 'Mistral',
             license: 'Apache 2.0',
-          },
-          {
             name: 'Grok 1',
             mmlu: 73,
             mtbench: null,
@@ -534,8 +549,8 @@
             parameters: "33B",
             organization: 'xAI',
             license: 'Proprietary',
-          },
-          {
             name: 'DBRX Instruct',
             mmlu: 73.7,
             mtbench: null,
@@ -550,8 +565,8 @@
             parameters: null,
             organization: 'Databricks',
             license: 'Databricks Open Model',
-          },
-          {
             name: 'Yi 34B',
             mmlu: 73.5,
             mtbench: null,
@@ -566,8 +581,8 @@
             parameters: '34B',
             organization: '01 AI',
             license: 'Yi License',
-          },
-          {
             name: 'PPLX 70B Online',
             mmlu: null,
             mtbench: null,
@@ -582,8 +597,24 @@
             parameters: '70B',
             organization: 'Perplexity AI',
             license: 'Proprietary',
-          },
           {
             name: 'Llama 70B Chat',
             mmlu: 63,
             mtbench: 6.86,
@@ -596,9 +627,9 @@
             nothallucination: 94.9,
             alpacaeval: null,
             parameters: '70B',
-            organization: 'Perplexity AI',
-            license: 'Proprietary',
-          },
         ]
         function setBenchmarkTable(data) {

       <li>gpt-3.5-turbo-0125</li>
       <li>gpt-3.5-turbo-0613</li>
       <li>Claude 3 Haiku</li>
+      <li>Meta Llama 3 70B Instruct</li>
     </ul>
     <h4>Models with fewer hallucinations:</h4>
     <ul>
     <script>
         const benchmarkData = [
+        {
+            name: 'GPT-4o-2024-05-13',
+            mmlu: 88.7,
+            mtbench: null,
+            arenaelo:1287,
+            gsm8k: 	null,
+            winogrande: null,
+            truthfulqa: null,
+            hellaswag:null,
+            arc:null,
+            nothallucination: null,
+            alpacaeval: 57.5,
+            parameters: 'Unkonwn',
+            organization: 'OpenAI',
+            license: 'Proprietary',
+        },
         {
             name: 'gpt-4-0125-preview (turbo)',
             mmlu: null,
             name: 'Claude 3 Opus',
             mmlu: 86.8,
             mtbench: null,
+            arenaelo:1249,
             gsm8k: 95.0,
             winogrande: null,
             truthfulqa: null,
             organization: 'Mistral',
             license: 'Proprietary',
           },
+          /*{
             name: 'Mixtral 8x7B Instruct',
             mmlu: 70.6,
             mtbench: 8.3,
             parameters: '45B (MOE)',
             organization: 'Mistral',
             license: 'Apache 2.0',
+          },*/
+          /*{
             name: 'Grok 1',
             mmlu: 73,
             mtbench: null,
             parameters: "33B",
             organization: 'xAI',
             license: 'Proprietary',
+          },*/
+          /*{
             name: 'DBRX Instruct',
             mmlu: 73.7,
             mtbench: null,
             parameters: null,
             organization: 'Databricks',
             license: 'Databricks Open Model',
+          },*/
+          /*{
             name: 'Yi 34B',
             mmlu: 73.5,
             mtbench: null,
             parameters: '34B',
             organization: '01 AI',
             license: 'Yi License',
+          },*/
+          /*{
             name: 'PPLX 70B Online',
             mmlu: null,
             mtbench: null,
             parameters: '70B',
             organization: 'Perplexity AI',
             license: 'Proprietary',
+          },*/
           {
+            name: 'Meta Llama 3 70B Instruct',
+            mmlu: 80.06,
+            mtbench: null,
+            arenaelo:1207,
+            gsm8k: 85.44,
+            winogrande: 82.87,
+            truthfulqa: 61.81,
+            hellaswag:85.69,
+            arc:71.42,
+            nothallucination: 95.5,
+            alpacaeval: 34.4,
+            parameters: '70B',
+            organization: 'Meta',
+            license: 'Open Model',
+          },
+          /*{
             name: 'Llama 70B Chat',
             mmlu: 63,
             mtbench: 6.86,
             nothallucination: 94.9,
             alpacaeval: null,
             parameters: '70B',
+            organization: 'Meta',
+            license: 'Open Model',
+          },*/
         ]
         function setBenchmarkTable(data) {