CallComply
/

zephyr-7b-beta-128k

Text Generation

Transformers

Safetensors

English

mistral

Generated from Trainer

conversational

Eval Results

text-generation-inference

Inference Endpoints

Model card | Files and versions | Community

perlthoughts

leaderboard-pr-bot committed on Mar 4

Commit

7179ae7

•

1 Parent(s): 5f20240

Adding Evaluation Results (#3)

Browse files

- Adding Evaluation Results (f4c0ed9c9b1810a470e146d8c930e65f40cad164)

Co-authored-by: Open LLM Leaderboard PR Bot <leaderboard-pr-bot@users.noreply.huggingface.co>

Files changed (1) hide show

README.md +94 -66

README.md CHANGED Viewed

@@ -1,23 +1,37 @@
 ---
 tags:
 - generated_from_trainer
-license: mit
 datasets:
 - HuggingFaceH4/ultrachat_200k
 - HuggingFaceH4/ultrafeedback_binarized
-language:
-- en
 base_model: mistralai/Mistral-7B-v0.1
 widget:
-  - text: "<|system|>\nYou are a pirate chatbot who always responds with Arr!</s>\n<|user|>\nThere's a llama on my lawn, how can I get rid of him?</s>\n<|assistant|>\n"
-    output:
-      text: "Arr! 'Tis a puzzlin' matter, me hearty! A llama on yer lawn be a rare sight, but I've got a plan that might help ye get rid of 'im. Ye'll need to gather some carrots and hay, and then lure the llama away with the promise of a tasty treat. Once he's gone, ye can clean up yer lawn and enjoy the peace and quiet once again. But beware, me hearty, for there may be more llamas where that one came from! Arr!"
 pipeline_tag: text-generation
 model-index:
 - name: zephyr-7b-beta
   results:
-  # AI2 Reasoning Challenge (25-Shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -28,15 +42,16 @@ model-index:
       args:
         num_few_shot: 25
     metrics:
-       - type: acc_norm
-         name: normalized accuracy
-         value: 62.03071672354948
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
-  # HellaSwag (10-shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -46,15 +61,16 @@ model-index:
       args:
         num_few_shot: 10
     metrics:
-       - type: acc_norm
-         name: normalized accuracy
-         value: 84.35570603465445
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
-  # DROP (3-shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -64,15 +80,13 @@ model-index:
       args:
         num_few_shot: 3
     metrics:
-       - type: f1
-         name: f1 score
-         value: 9.662437080536909
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
-  # TruthfulQA (0-shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -83,14 +97,14 @@ model-index:
       args:
         num_few_shot: 0
     metrics:
-       - type: mc2
-         value: 57.44916942762855
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
-  # GSM8k (5-shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -101,15 +115,16 @@ model-index:
       args:
         num_few_shot: 5
     metrics:
-       - type: acc
-         name: accuracy
-         value: 12.736921910538287
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
-  # MMLU (5-Shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -120,15 +135,16 @@ model-index:
       args:
         num_few_shot: 5
     metrics:
-       - type: acc
-         name: accuracy
-         value: 61.07
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
-  # Winogrande (5-shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -139,38 +155,37 @@ model-index:
       args:
         num_few_shot: 5
     metrics:
-       - type: acc
-         name: accuracy
-         value: 77.74269928966061
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
-  # AlpacaEval (taken from model card)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
       name: AlpacaEval
       type: tatsu-lab/alpaca_eval
     metrics:
-       - type: unknown
-         name: win rate
-         value: 0.9060
     source:
       url: https://tatsu-lab.github.io/alpaca_eval/
-  # MT-Bench (taken from model card)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
       name: MT-Bench
       type: unknown
     metrics:
-       - type: unknown
-         name: score
-         value: 7.34
     source:
       url: https://huggingface.co/spaces/lmsys/mt-bench
 ---
@@ -407,4 +422,17 @@ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-le
 | TruthfulQA (0-shot)   | 57.45   |
 | Winogrande (5-shot)   | 77.74   |
 | GSM8K (5-shot)        | 12.74        |
-| DROP (3-shot)         | 9.66         |

 ---
+language:
+- en
+license: mit
 tags:
 - generated_from_trainer
 datasets:
 - HuggingFaceH4/ultrachat_200k
 - HuggingFaceH4/ultrafeedback_binarized
 base_model: mistralai/Mistral-7B-v0.1
 widget:
+- text: '<|system|>
+    You are a pirate chatbot who always responds with Arr!</s>
+    <|user|>
+    There''s a llama on my lawn, how can I get rid of him?</s>
+    <|assistant|>
+    '
+  output:
+    text: Arr! 'Tis a puzzlin' matter, me hearty! A llama on yer lawn be a rare sight,
+      but I've got a plan that might help ye get rid of 'im. Ye'll need to gather
+      some carrots and hay, and then lure the llama away with the promise of a tasty
+      treat. Once he's gone, ye can clean up yer lawn and enjoy the peace and quiet
+      once again. But beware, me hearty, for there may be more llamas where that one
+      came from! Arr!
 pipeline_tag: text-generation
 model-index:
 - name: zephyr-7b-beta
   results:
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 25
     metrics:
+    - type: acc_norm
+      value: 62.03071672354948
+      name: normalized accuracy
+    - type: acc_norm
+      value: 58.28
+      name: normalized accuracy
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 10
     metrics:
+    - type: acc_norm
+      value: 84.35570603465445
+      name: normalized accuracy
+    - type: acc_norm
+      value: 81.0
+      name: normalized accuracy
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 3
     metrics:
+    - type: f1
+      value: 9.66243708053691
+      name: f1 score
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 0
     metrics:
+    - type: mc2
+      value: 57.44916942762855
+    - type: mc2
+      value: 46.1
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 5
     metrics:
+    - type: acc
+      value: 12.736921910538287
+      name: accuracy
+    - type: acc
+      value: 13.04
+      name: accuracy
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 5
     metrics:
+    - type: acc
+      value: 61.07
+      name: accuracy
+    - type: acc
+      value: 53.57
+      name: accuracy
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 5
     metrics:
+    - type: acc
+      value: 77.7426992896606
+      name: accuracy
+    - type: acc
+      value: 74.74
+      name: accuracy
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=HuggingFaceH4/zephyr-7b-beta
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       name: AlpacaEval
       type: tatsu-lab/alpaca_eval
     metrics:
+    - type: unknown
+      value: 0.906
+      name: win rate
     source:
       url: https://tatsu-lab.github.io/alpaca_eval/
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       name: MT-Bench
       type: unknown
     metrics:
+    - type: unknown
+      value: 7.34
+      name: score
     source:
       url: https://huggingface.co/spaces/lmsys/mt-bench
 ---
 | TruthfulQA (0-shot)   | 57.45   |
 | Winogrande (5-shot)   | 77.74   |
 | GSM8K (5-shot)        | 12.74        |
+| DROP (3-shot)         | 9.66         |
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_CallComply__zephyr-7b-beta-128k)
+|             Metric              |Value|
+|---------------------------------|----:|
+|Avg.                             |54.45|
+|AI2 Reasoning Challenge (25-Shot)|58.28|
+|HellaSwag (10-Shot)              |81.00|
+|MMLU (5-Shot)                    |53.57|
+|TruthfulQA (0-shot)              |46.10|
+|Winogrande (5-shot)              |74.74|
+|GSM8k (5-shot)                   |13.04|