amezasor committed on
Commit
67f8d98
1 Parent(s): b848cbd

update: eval results

Browse files
Files changed (1) hide show
  1. README.md +71 -31
README.md CHANGED
@@ -7,23 +7,23 @@ tags:
7
  - language
8
  - granite-3.0
9
  model-index:
10
- - name: granite-3.0-8b-instruct
11
  results:
12
  - task:
13
  type: text-generation
14
  dataset:
15
- type: human-exams
16
- name: MMLU
17
  metrics:
18
  - name: pass@1
19
  type: pass@1
20
- value:
21
  verified: false
22
  - task:
23
  type: text-generation
24
  dataset:
25
- type: human-exams
26
- name: MMLU-Pro
27
  metrics:
28
  - name: pass@1
29
  type: pass@1
@@ -37,17 +37,27 @@ model-index:
37
  metrics:
38
  - name: pass@1
39
  type: pass@1
40
- value:
41
  verified: false
42
  - task:
43
  type: text-generation
44
  dataset:
45
- type: commonsense
46
- name: WinoGrande
47
  metrics:
48
  - name: pass@1
49
  type: pass@1
50
- value:
 
 
 
 
 
 
 
 
 
 
51
  verified: false
52
  - task:
53
  type: text-generation
@@ -57,7 +67,7 @@ model-index:
57
  metrics:
58
  - name: pass@1
59
  type: pass@1
60
- value:
61
  verified: false
62
  - task:
63
  type: text-generation
@@ -67,27 +77,27 @@ model-index:
67
  metrics:
68
  - name: pass@1
69
  type: pass@1
70
- value:
71
  verified: false
72
  - task:
73
  type: text-generation
74
  dataset:
75
  type: commonsense
76
- name: PIQA
77
  metrics:
78
  - name: pass@1
79
  type: pass@1
80
- value:
81
  verified: false
82
  - task:
83
  type: text-generation
84
  dataset:
85
  type: commonsense
86
- name: Hellaswag
87
  metrics:
88
  - name: pass@1
89
  type: pass@1
90
- value:
91
  verified: false
92
  - task:
93
  type: text-generation
@@ -97,7 +107,7 @@ model-index:
97
  metrics:
98
  - name: pass@1
99
  type: pass@1
100
- value:
101
  verified: false
102
  - task:
103
  type: text-generation
@@ -107,17 +117,17 @@ model-index:
107
  metrics:
108
  - name: pass@1
109
  type: pass@1
110
- value:
111
  verified: false
112
  - task:
113
  type: text-generation
114
  dataset:
115
  type: reading-comprehension
116
- name: SQuAD v2
117
  metrics:
118
  - name: pass@1
119
  type: pass@1
120
- value:
121
  verified: false
122
  - task:
123
  type: text-generation
@@ -127,7 +137,7 @@ model-index:
127
  metrics:
128
  - name: pass@1
129
  type: pass@1
130
- value:
131
  verified: false
132
  - task:
133
  type: text-generation
@@ -137,7 +147,7 @@ model-index:
137
  metrics:
138
  - name: pass@1
139
  type: pass@1
140
- value:
141
  verified: false
142
  - task:
143
  type: text-generation
@@ -147,17 +157,37 @@ model-index:
147
  metrics:
148
  - name: pass@1
149
  type: pass@1
150
- value:
151
  verified: false
152
  - task:
153
  type: text-generation
154
  dataset:
155
  type: code
156
- name: HumanEval
157
  metrics:
158
  - name: pass@1
159
  type: pass@1
160
- value:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  verified: false
162
  - task:
163
  type: text-generation
@@ -167,7 +197,7 @@ model-index:
167
  metrics:
168
  - name: pass@1
169
  type: pass@1
170
- value:
171
  verified: false
172
  - task:
173
  type: text-generation
@@ -177,7 +207,7 @@ model-index:
177
  metrics:
178
  - name: pass@1
179
  type: pass@1
180
- value:
181
  verified: false
182
  - task:
183
  type: text-generation
@@ -187,18 +217,28 @@ model-index:
187
  metrics:
188
  - name: pass@1
189
  type: pass@1
190
- value:
191
  verified: false
192
  - task:
193
  type: text-generation
194
  dataset:
195
  type: multilingual
196
- name: MGSM
197
  metrics:
198
  - name: pass@1
199
  type: pass@1
200
- value:
201
- verified: false
 
 
 
 
 
 
 
 
 
 
202
  ---
203
 
204
  <!-- ![image/png](https://cdn-uploads.huggingface.co/production/uploads/62cd5057674cdb524450093d/1hzxoPwqkBJXshKVVe6_9.png) -->
 
7
  - language
8
  - granite-3.0
9
  model-index:
10
+ - name: granite-3.0-2b-instruct
11
  results:
12
  - task:
13
  type: text-generation
14
  dataset:
15
+ type: instruction-following
16
+ name: IFEval
17
  metrics:
18
  - name: pass@1
19
  type: pass@1
20
+ value: 52.27
21
  verified: false
22
  - task:
23
  type: text-generation
24
  dataset:
25
+ type: instruction-following
26
+ name: MT-Bench
27
  metrics:
28
  - name: pass@1
29
  type: pass@1
 
37
  metrics:
38
  - name: pass@1
39
  type: pass@1
40
+ value: 40.52
41
  verified: false
42
  - task:
43
  type: text-generation
44
  dataset:
45
+ type: human-exams
46
+ name: MMLU
47
  metrics:
48
  - name: pass@1
49
  type: pass@1
50
+ value: 65.82
51
+ verified: false
52
+ - task:
53
+ type: text-generation
54
+ dataset:
55
+ type: human-exams
56
+ name: MMLU-Pro
57
+ metrics:
58
+ - name: pass@1
59
+ type: pass@1
60
+ value: 34.45
61
  verified: false
62
  - task:
63
  type: text-generation
 
67
  metrics:
68
  - name: pass@1
69
  type: pass@1
70
+ value: 46.60
71
  verified: false
72
  - task:
73
  type: text-generation
 
77
  metrics:
78
  - name: pass@1
79
  type: pass@1
80
+ value: 71.21
81
  verified: false
82
  - task:
83
  type: text-generation
84
  dataset:
85
  type: commonsense
86
+ name: Hellaswag
87
  metrics:
88
  - name: pass@1
89
  type: pass@1
90
+ value: 82.61
91
  verified: false
92
  - task:
93
  type: text-generation
94
  dataset:
95
  type: commonsense
96
+ name: WinoGrande
97
  metrics:
98
  - name: pass@1
99
  type: pass@1
100
+ value: 77.51
101
  verified: false
102
  - task:
103
  type: text-generation
 
107
  metrics:
108
  - name: pass@1
109
  type: pass@1
110
+ value: 60.32
111
  verified: false
112
  - task:
113
  type: text-generation
 
117
  metrics:
118
  - name: pass@1
119
  type: pass@1
120
+ value: 88.65
121
  verified: false
122
  - task:
123
  type: text-generation
124
  dataset:
125
  type: reading-comprehension
126
+ name: SQuAD 2.0
127
  metrics:
128
  - name: pass@1
129
  type: pass@1
130
+ value: 21.58
131
  verified: false
132
  - task:
133
  type: text-generation
 
137
  metrics:
138
  - name: pass@1
139
  type: pass@1
140
+ value: 64.16
141
  verified: false
142
  - task:
143
  type: text-generation
 
147
  metrics:
148
  - name: pass@1
149
  type: pass@1
150
+ value: 33.81
151
  verified: false
152
  - task:
153
  type: text-generation
 
157
  metrics:
158
  - name: pass@1
159
  type: pass@1
160
+ value: 51.55
161
  verified: false
162
  - task:
163
  type: text-generation
164
  dataset:
165
  type: code
166
+ name: HumanEvalSynthesis
167
  metrics:
168
  - name: pass@1
169
  type: pass@1
170
+ value: 64.63
171
+ verified: false
172
+ - task:
173
+ type: text-generation
174
+ dataset:
175
+ type: code
176
+ name: HumanEvalExplain
177
+ metrics:
178
+ - name: pass@1
179
+ type: pass@1
180
+ value: 57.16
181
+ verified: false
182
+ - task:
183
+ type: text-generation
184
+ dataset:
185
+ type: code
186
+ name: HumanEvalFix
187
+ metrics:
188
+ - name: pass@1
189
+ type: pass@1
190
+ value: 65.85
191
  verified: false
192
  - task:
193
  type: text-generation
 
197
  metrics:
198
  - name: pass@1
199
  type: pass@1
200
+ value: 49.60
201
  verified: false
202
  - task:
203
  type: text-generation
 
207
  metrics:
208
  - name: pass@1
209
  type: pass@1
210
+ value: 68.99
211
  verified: false
212
  - task:
213
  type: text-generation
 
217
  metrics:
218
  - name: pass@1
219
  type: pass@1
220
+ value: 30.94
221
  verified: false
222
  - task:
223
  type: text-generation
224
  dataset:
225
  type: multilingual
226
+ name: PAWS-X (7 langs)
227
  metrics:
228
  - name: pass@1
229
  type: pass@1
230
+ value: 64.94
231
+ verified: false
232
+ - task:
233
+ type: text-generation
234
+ dataset:
235
+ type: multilingual
236
+ name: MGSM (6 langs)
237
+ metrics:
238
+ - name: pass@1
239
+ type: pass@1
240
+ value: 48.20
241
+ verified: false
242
  ---
243
 
244
  <!-- ![image/png](https://cdn-uploads.huggingface.co/production/uploads/62cd5057674cdb524450093d/1hzxoPwqkBJXshKVVe6_9.png) -->