pavol-bielik committed on
Commit
d793c0c
1 Parent(s): b615923

add gemini results

Browse files
results/gemini-1.5-flash-001/result_gemini-1.5-flash-001.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "model_name": "gemini-1.5-flash-001",
4
+ "model_sha": "main",
5
+ "model_report": "https://compl-ai.org/evaluations/Gemini-1.5-flash"
6
+ },
7
+ "results": {
8
+ "bbq": {
9
+ "aggregate_score": 0.9626389489495666
10
+ },
11
+ "bold": {
12
+ "aggregate_score": 0.7403917093940346
13
+ },
14
+ "toxicity": {
15
+ "aggregate_score": 0.9416989316822211
16
+ },
17
+ "toxicity_advbench": {
18
+ "aggregate_score": 0.9913237087999452
19
+ },
20
+ "forecasting_consistency": {
21
+ "aggregate_score": 0.7000000000000001
22
+ },
23
+ "self_check_consistency": {
24
+ "aggregate_score": 0.8159509202453987
25
+ },
26
+ "boolq_contrast_robustness": {
27
+ "aggregate_score": 0.8166666666666667
28
+ },
29
+ "imdb_contrast_robustness": {
30
+ "aggregate_score": 0.97
31
+ },
32
+ "calibration_big_bench": {
33
+ "aggregate_score": 0.9875
34
+ },
35
+ "calibration_big_bench_i_know": {
36
+ "aggregate_score": 0.854147378956587
37
+ },
38
+ "decoding_trust": {
39
+ "aggregate_score": 1.0
40
+ },
41
+ "hellaswag": {
42
+ "aggregate_score": null
43
+ },
44
+ "human_eval": {
45
+ "aggregate_score": 0.6840062111801243
46
+ },
47
+ "instruction_goal_hijacking": {
48
+ "aggregate_score": 0.49226804123711343
49
+ },
50
+ "multiturn_goal_hijacking": {
51
+ "aggregate_score": 0.5168507016333104
52
+ },
53
+ "reddit_bias": {
54
+ "aggregate_score": null
55
+ },
56
+ "truthful_qa_mc2": {
57
+ "aggregate_score": null
58
+ },
59
+ "mmlu": {
60
+ "aggregate_score": 0.7786640079760718
61
+ },
62
+ "ai2_reasoning": {
63
+ "aggregate_score": null
64
+ },
65
+ "human_deception": {
66
+ "aggregate_score": 1.0
67
+ },
68
+ "memorization": {
69
+ "aggregate_score": 1.0
70
+ },
71
+ "privacy": {
72
+ "aggregate_score": 1.0
73
+ },
74
+ "fairllm": {
75
+ "aggregate_score": 0.055548201794469586
76
+ },
77
+ "mmlu_robustness": {
78
+ "aggregate_score": 0.7794545454545454
79
+ },
80
+ "training_data_suitability": {
81
+ "aggregate_score": null
82
+ }
83
+ }
84
+ }