JingweiZuo committed on
Commit
441a876
1 Parent(s): da37de5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +48 -55
README.md CHANGED
@@ -105,97 +105,90 @@ We report in the following table our internal pipeline benchmarks. For the bench
105
  <tr>
106
  <td rowspan="3">General</td>
107
  <td>MMLU (5-shot)</td>
108
- <td>30.6%</td>
109
- <td>68.7%</td>
110
- <td>55.9%</td>
111
- <td>65.3%</td>
112
  </tr>
113
  <tr>
114
  <td>MMLU-PRO (5-shot)*</td>
115
- <td>32.4%</td>
116
- <td>31.6%</td>
117
- <td>21.8%</td>
118
- <td>26.3%</td>
119
  </tr>
120
  <tr>
121
  <td>IFEval</td>
122
- <td>69.9%</td>
123
- <td>65.7%</td>
124
- <td>78.8%</td>
125
- <td>71.7%</td>
126
  </tr>
127
  <tr>
128
  <td rowspan="2">Math</td>
129
  <td>GSM8K (5-shot)</td>
130
- <td>0%</td>
131
- <td>74.9%</td>
132
- <td>19.2%</td>
133
- <td>65.2%</td>
134
  </tr>
135
  <tr>
136
  <td>MATH Lvl-5 (4-shot)</td>
137
- <td>13.6%</td>
138
- <td>6.9%</td>
139
- <td>10.4%</td>
140
- <td>27.3%</td>
141
  </tr>
142
  <tr>
143
  <td rowspan="4">Reasoning</td>
144
  <td>Arc Challenge (25-shot)</td>
145
- <td>54%</td>
146
- <td>54.3%</td>
147
- <td>46.6%</td>
148
- <td>53.7%</td>
149
  </tr>
150
  <tr>
151
  <td>GPQA (0-shot)*</td>
152
- <td>10.3%</td>
153
- <td>11.1%</td>
154
- <td>33.6%</td>
155
- <td>7.2%</td>
156
  </tr>
157
  <tr>
158
  <td>MUSR (0-shot)*</td>
159
- <td>8.2%</td>
160
- <td>12.2%</td>
161
- <td>38.6%</td>
162
- <td>8.3%</td>
163
  </tr>
164
  <tr>
165
  <td>BBH (3-shot)*</td>
166
- <td>33.3%</td>
167
- <td>35.3%</td>
168
- <td>43.7%</td>
169
- <td>25.2%</td>
170
  </tr>
171
  <tr>
172
  <td rowspan="4">CommonSense Understanding</td>
173
  <td>PIQA (0-shot)</td>
174
- <td>75.6%</td>
175
- <td>82.3%</td>
176
- <td>78.9%</td>
177
- <td>80.9%</td>
178
  </tr>
179
  <tr>
180
  <td>SciQ (0-shot)</td>
181
- <td>29.2%</td>
182
- <td>94.9%</td>
183
- <td>80.2%</td>
184
- <td>93.6%</td>
185
- </tr>
186
- <tr>
187
- <td>Winogrande (0-shot)</td>
188
- <td>75.9%</td>
189
- <td>64.5%</td>
190
- <td>-</td>
191
- <td>-</td>
192
  </tr>
193
  <tr>
194
  <td>OpenbookQA (0-shot)</td>
195
- <td>45.6%</td>
196
- <td>34.6%</td>
197
- <td>46.2%</td>
198
- <td>47.2%</td>
199
  </tr>
200
  </tbody>
201
  </table>
 
105
  <tr>
106
  <td rowspan="3">General</td>
107
  <td>MMLU (5-shot)</td>
108
+ <td>30.6</td>
109
+ <td>68.7</td>
110
+ <td>55.9</td>
111
+ <td>65.3</td>
112
  </tr>
113
  <tr>
114
  <td>MMLU-PRO (5-shot)*</td>
115
+ <td>32.4</td>
116
+ <td>31.6</td>
117
+ <td>21.8</td>
118
+ <td>26.3</td>
119
  </tr>
120
  <tr>
121
  <td>IFEval</td>
122
+ <td>69.9</td>
123
+ <td>65.7</td>
124
+ <td>78.8</td>
125
+ <td>71.7</td>
126
  </tr>
127
  <tr>
128
  <td rowspan="2">Math</td>
129
  <td>GSM8K (5-shot)</td>
130
+ <td>0</td>
131
+ <td>74.9</td>
132
+ <td>19.2</td>
133
+ <td>65.2</td>
134
  </tr>
135
  <tr>
136
  <td>MATH Lvl-5 (4-shot)</td>
137
+ <td>13.6</td>
138
+ <td>6.9</td>
139
+ <td>10.4</td>
140
+ <td>27.3</td>
141
  </tr>
142
  <tr>
143
  <td rowspan="4">Reasoning</td>
144
  <td>Arc Challenge (25-shot)</td>
145
+ <td>54</td>
146
+ <td>54.3</td>
147
+ <td>46.6</td>
148
+ <td>53.7</td>
149
  </tr>
150
  <tr>
151
  <td>GPQA (0-shot)*</td>
152
+ <td>10.3</td>
153
+ <td>11.1</td>
154
+ <td>6.2</td>
155
+ <td>7.2</td>
156
  </tr>
157
  <tr>
158
  <td>MUSR (0-shot)*</td>
159
+ <td>8.2</td>
160
+ <td>12.2</td>
161
+ <td>38.6</td>
162
+ <td>8.3</td>
163
  </tr>
164
  <tr>
165
  <td>BBH (3-shot)*</td>
166
+ <td>33.3</td>
167
+ <td>35.3</td>
168
+ <td>43.7</td>
169
+ <td>25.2</td>
170
  </tr>
171
  <tr>
172
  <td rowspan="4">CommonSense Understanding</td>
173
  <td>PIQA (0-shot)</td>
174
+ <td>75.6</td>
175
+ <td>82.3</td>
176
+ <td>78.9</td>
177
+ <td>80.9</td>
178
  </tr>
179
  <tr>
180
  <td>SciQ (0-shot)</td>
181
+ <td>29.2</td>
182
+ <td>94.9</td>
183
+ <td>80.2</td>
184
+ <td>93.6</td>
 
 
 
 
 
 
 
185
  </tr>
186
  <tr>
187
  <td>OpenbookQA (0-shot)</td>
188
+ <td>45.6</td>
189
+ <td>34.6</td>
190
+ <td>46.2</td>
191
+ <td>47.2</td>
192
  </tr>
193
  </tbody>
194
  </table>