Update README.md
README.md
CHANGED
@@ -8,8 +8,82 @@ model-index:
  results: []
---

<details><summary>Evals</summary>

```
hf (pretrained=/workspace/axolotl/dolphin-2.9.4-llama3.1-8b-hf,dtype=bfloat16), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (4)
| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
|-----------------------------------------------------------|-------|------|-----:|-----------------------|---|-----:|---|------|
|leaderboard |N/A |none | 0|acc |↑ |0.2926|± |0.0041|
| | |none | 0|acc_norm |↑ |0.4513|± |0.0053|
| | |none | 0|exact_match |↑ |0.0982|± |0.0079|
| | |none | 0|inst_level_loose_acc |↑ |0.3825|± |N/A |
| | |none | 0|inst_level_strict_acc |↑ |0.3597|± |N/A |
| | |none | 0|prompt_level_loose_acc |↑ |0.2421|± |0.0184|
| | |none | 0|prompt_level_strict_acc|↑ |0.2181|± |0.0178|
| - leaderboard_bbh |N/A |none | 3|acc_norm |↑ |0.4931|± |0.0061|
| - leaderboard_bbh_boolean_expressions | 0|none | 3|acc_norm |↑ |0.8000|± |0.0253|
| - leaderboard_bbh_causal_judgement | 0|none | 3|acc_norm |↑ |0.5615|± |0.0364|
| - leaderboard_bbh_date_understanding | 0|none | 3|acc_norm |↑ |0.4520|± |0.0315|
| - leaderboard_bbh_disambiguation_qa | 0|none | 3|acc_norm |↑ |0.6640|± |0.0299|
| - leaderboard_bbh_formal_fallacies | 0|none | 3|acc_norm |↑ |0.5600|± |0.0315|
| - leaderboard_bbh_geometric_shapes | 0|none | 3|acc_norm |↑ |0.3640|± |0.0305|
| - leaderboard_bbh_hyperbaton | 0|none | 3|acc_norm |↑ |0.6320|± |0.0306|
| - leaderboard_bbh_logical_deduction_five_objects | 0|none | 3|acc_norm |↑ |0.4600|± |0.0316|
| - leaderboard_bbh_logical_deduction_seven_objects | 0|none | 3|acc_norm |↑ |0.4360|± |0.0314|
| - leaderboard_bbh_logical_deduction_three_objects | 0|none | 3|acc_norm |↑ |0.6160|± |0.0308|
| - leaderboard_bbh_movie_recommendation | 0|none | 3|acc_norm |↑ |0.7880|± |0.0259|
| - leaderboard_bbh_navigate | 0|none | 3|acc_norm |↑ |0.5200|± |0.0317|
| - leaderboard_bbh_object_counting | 0|none | 3|acc_norm |↑ |0.4520|± |0.0315|
| - leaderboard_bbh_penguins_in_a_table | 0|none | 3|acc_norm |↑ |0.5205|± |0.0415|
| - leaderboard_bbh_reasoning_about_colored_objects | 0|none | 3|acc_norm |↑ |0.5120|± |0.0317|
| - leaderboard_bbh_ruin_names | 0|none | 3|acc_norm |↑ |0.6320|± |0.0306|
| - leaderboard_bbh_salient_translation_error_detection | 0|none | 3|acc_norm |↑ |0.4320|± |0.0314|
| - leaderboard_bbh_snarks | 0|none | 3|acc_norm |↑ |0.5843|± |0.0370|
| - leaderboard_bbh_sports_understanding | 0|none | 3|acc_norm |↑ |0.7040|± |0.0289|
| - leaderboard_bbh_temporal_sequences | 0|none | 3|acc_norm |↑ |0.1440|± |0.0222|
| - leaderboard_bbh_tracking_shuffled_objects_five_objects | 0|none | 3|acc_norm |↑ |0.1560|± |0.0230|
| - leaderboard_bbh_tracking_shuffled_objects_seven_objects| 0|none | 3|acc_norm |↑ |0.1320|± |0.0215|
| - leaderboard_bbh_tracking_shuffled_objects_three_objects| 0|none | 3|acc_norm |↑ |0.2840|± |0.0286|
| - leaderboard_bbh_web_of_lies | 0|none | 3|acc_norm |↑ |0.4840|± |0.0317|
| - leaderboard_gpqa |N/A |none | 0|acc_norm |↑ |0.2903|± |0.0132|
| - leaderboard_gpqa_diamond | 1|none | 0|acc_norm |↑ |0.2980|± |0.0326|
| - leaderboard_gpqa_extended | 1|none | 0|acc_norm |↑ |0.2839|± |0.0193|
| - leaderboard_gpqa_main | 1|none | 0|acc_norm |↑ |0.2946|± |0.0216|
| - leaderboard_ifeval | 2|none | 0|inst_level_loose_acc |↑ |0.3825|± |N/A |
| | |none | 0|inst_level_strict_acc |↑ |0.3597|± |N/A |
| | |none | 0|prompt_level_loose_acc |↑ |0.2421|± |0.0184|
| | |none | 0|prompt_level_strict_acc|↑ |0.2181|± |0.0178|
| - leaderboard_math_algebra_hard | 1|none | 4|exact_match |↑ |0.1596|± |0.0209|
| - leaderboard_math_counting_and_prob_hard | 1|none | 4|exact_match |↑ |0.0488|± |0.0195|
| - leaderboard_math_geometry_hard | 1|none | 4|exact_match |↑ |0.0530|± |0.0196|
| - leaderboard_math_hard |N/A |none | 4|exact_match |↑ |0.0982|± |0.0079|
| - leaderboard_math_intermediate_algebra_hard | 1|none | 4|exact_match |↑ |0.0143|± |0.0071|
| - leaderboard_math_num_theory_hard | 1|none | 4|exact_match |↑ |0.0455|± |0.0168|
| - leaderboard_math_prealgebra_hard | 1|none | 4|exact_match |↑ |0.2591|± |0.0316|
| - leaderboard_math_precalculus_hard | 1|none | 4|exact_match |↑ |0.0519|± |0.0192|
| - leaderboard_mmlu_pro | 0.1|none | 5|acc |↑ |0.2926|± |0.0041|
| - leaderboard_musr |N/A |none | 0|acc_norm |↑ |0.3862|± |0.0173|
| - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm |↑ |0.5280|± |0.0316|
| - leaderboard_musr_object_placements | 1|none | 0|acc_norm |↑ |0.3594|± |0.0300|
| - leaderboard_musr_team_allocation | 1|none | 0|acc_norm |↑ |0.2720|± |0.0282|

| Groups |Version|Filter|n-shot| Metric | |Value | |Stderr|
|------------------------|-------|------|-----:|-----------------------|---|-----:|---|------|
|leaderboard |N/A |none | 0|acc |↑ |0.2926|± |0.0041|
| | |none | 0|acc_norm |↑ |0.4513|± |0.0053|
| | |none | 0|exact_match |↑ |0.0982|± |0.0079|
| | |none | 0|inst_level_loose_acc |↑ |0.3825|± |N/A |
| | |none | 0|inst_level_strict_acc |↑ |0.3597|± |N/A |
| | |none | 0|prompt_level_loose_acc |↑ |0.2421|± |0.0184|
| | |none | 0|prompt_level_strict_acc|↑ |0.2181|± |0.0178|
| - leaderboard_bbh |N/A |none | 3|acc_norm |↑ |0.4931|± |0.0061|
| - leaderboard_gpqa |N/A |none | 0|acc_norm |↑ |0.2903|± |0.0132|
| - leaderboard_math_hard|N/A |none | 4|exact_match |↑ |0.0982|± |0.0079|
| - leaderboard_musr |N/A |none | 0|acc_norm |↑ |0.3862|± |0.0173|
```

</details>

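The tables above are the raw output of EleutherAI's lm-evaluation-harness on the `leaderboard` task group, using the settings recorded in the first log line (`pretrained=/workspace/axolotl/dolphin-2.9.4-llama3.1-8b-hf`, `dtype=bfloat16`, `batch_size: auto`). As a minimal sketch of how a comparable run could be launched from the harness's Python API — the checkpoint path and dtype are copied from that log line; the harness version and everything about your local environment are assumptions:

```python
# Hedged sketch only: assumes lm-evaluation-harness >= 0.4.3 (pip install lm-eval),
# a local copy of the checkpoint at the path from the log above, and a GPU with
# enough memory for bfloat16 inference.
import json

import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",  # Hugging Face transformers backend, matching the "hf (pretrained=...)" log line
    model_args="pretrained=/workspace/axolotl/dolphin-2.9.4-llama3.1-8b-hf,dtype=bfloat16",
    tasks=["leaderboard"],  # Open LLM Leaderboard v2 group: bbh, gpqa, ifeval, math_hard, mmlu_pro, musr
    batch_size="auto",      # let the harness pick a fitting batch size, as in "batch_size: auto (4)"
    limit=None,             # full test sets, matching "limit: None" above
)

# results["results"] holds the per-task metrics that populate the tables above.
print(json.dumps(results["results"], indent=2, default=str))
```
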
[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>