JingweiZuo
committed on
Commit
•
441a876
1
Parent(s):
da37de5
Update README.md
Browse files
README.md
CHANGED
@@ -105,97 +105,90 @@ We report in the following table our internal pipeline benchmarks. For the bench
|
|
105 |
<tr>
|
106 |
<td rowspan="3">General</td>
|
107 |
<td>MMLU (5-shot)</td>
|
108 |
-
<td>30.6
|
109 |
-
<td>68.7
|
110 |
-
<td>55.9
|
111 |
-
<td>65.3
|
112 |
</tr>
|
113 |
<tr>
|
114 |
<td>MMLU-PRO (5-shot)*</td>
|
115 |
-
<td>32.4
|
116 |
-
<td>31.6
|
117 |
-
<td>21.8
|
118 |
-
<td>26.3
|
119 |
</tr>
|
120 |
<tr>
|
121 |
<td>IFEval</td>
|
122 |
-
<td>69.9
|
123 |
-
<td>65.7
|
124 |
-
<td>78.8
|
125 |
-
<td>71.7
|
126 |
</tr>
|
127 |
<tr>
|
128 |
<td rowspan="2">Math</td>
|
129 |
<td>GSM8K (5-shot)</td>
|
130 |
-
<td>0
|
131 |
-
<td>74.9
|
132 |
-
<td>19.2
|
133 |
-
<td>65.2
|
134 |
</tr>
|
135 |
<tr>
|
136 |
<td>MATH Lvl-5 (4-shot)</td>
|
137 |
-
<td>13.6
|
138 |
-
<td>6.9
|
139 |
-
<td>10.4
|
140 |
-
<td>27.3
|
141 |
</tr>
|
142 |
<tr>
|
143 |
<td rowspan="4">Reasoning</td>
|
144 |
<td>Arc Challenge (25-shot)</td>
|
145 |
-
<td>54
|
146 |
-
<td>54.3
|
147 |
-
<td>46.6
|
148 |
-
<td>53.7
|
149 |
</tr>
|
150 |
<tr>
|
151 |
<td>GPQA (0-shot)*</td>
|
152 |
-
<td>10.3
|
153 |
-
<td>11.1
|
154 |
-
<td>
|
155 |
-
<td>7.2
|
156 |
</tr>
|
157 |
<tr>
|
158 |
<td>MUSR (0-shot)*</td>
|
159 |
-
<td>8.2
|
160 |
-
<td>12.2
|
161 |
-
<td>38.6
|
162 |
-
<td>8.3
|
163 |
</tr>
|
164 |
<tr>
|
165 |
<td>BBH (3-shot)*</td>
|
166 |
-
<td>33.3
|
167 |
-
<td>35.3
|
168 |
-
<td>43.7
|
169 |
-
<td>25.2
|
170 |
</tr>
|
171 |
<tr>
|
172 |
<td rowspan="4">CommonSense Understanding</td>
|
173 |
<td>PIQA (0-shot)</td>
|
174 |
-
<td>75.6
|
175 |
-
<td>82.3
|
176 |
-
<td>78.9
|
177 |
-
<td>80.9
|
178 |
</tr>
|
179 |
<tr>
|
180 |
<td>SciQ (0-shot)</td>
|
181 |
-
<td>29.2
|
182 |
-
<td>94.9
|
183 |
-
<td>80.2
|
184 |
-
<td>93.6
|
185 |
-
</tr>
|
186 |
-
<tr>
|
187 |
-
<td>Winogrande (0-shot)</td>
|
188 |
-
<td>75.9%</td>
|
189 |
-
<td>64.5%</td>
|
190 |
-
<td>-</td>
|
191 |
-
<td>-</td>
|
192 |
</tr>
|
193 |
<tr>
|
194 |
<td>OpenbookQA (0-shot)</td>
|
195 |
-
<td>45.6
|
196 |
-
<td>34.6
|
197 |
-
<td>46.2
|
198 |
-
<td>47.2
|
199 |
</tr>
|
200 |
</tbody>
|
201 |
</table>
|
|
|
105 |
<tr>
|
106 |
<td rowspan="3">General</td>
|
107 |
<td>MMLU (5-shot)</td>
|
108 |
+
<td>30.6</td>
|
109 |
+
<td>68.7</td>
|
110 |
+
<td>55.9</td>
|
111 |
+
<td>65.3</td>
|
112 |
</tr>
|
113 |
<tr>
|
114 |
<td>MMLU-PRO (5-shot)*</td>
|
115 |
+
<td>32.4</td>
|
116 |
+
<td>31.6</td>
|
117 |
+
<td>21.8</td>
|
118 |
+
<td>26.3</td>
|
119 |
</tr>
|
120 |
<tr>
|
121 |
<td>IFEval</td>
|
122 |
+
<td>69.9</td>
|
123 |
+
<td>65.7</td>
|
124 |
+
<td>78.8</td>
|
125 |
+
<td>71.7</td>
|
126 |
</tr>
|
127 |
<tr>
|
128 |
<td rowspan="2">Math</td>
|
129 |
<td>GSM8K (5-shot)</td>
|
130 |
+
<td>0</td>
|
131 |
+
<td>74.9</td>
|
132 |
+
<td>19.2</td>
|
133 |
+
<td>65.2</td>
|
134 |
</tr>
|
135 |
<tr>
|
136 |
<td>MATH Lvl-5 (4-shot)</td>
|
137 |
+
<td>13.6</td>
|
138 |
+
<td>6.9</td>
|
139 |
+
<td>10.4</td>
|
140 |
+
<td>27.3</td>
|
141 |
</tr>
|
142 |
<tr>
|
143 |
<td rowspan="4">Reasoning</td>
|
144 |
<td>Arc Challenge (25-shot)</td>
|
145 |
+
<td>54</td>
|
146 |
+
<td>54.3</td>
|
147 |
+
<td>46.6</td>
|
148 |
+
<td>53.7</td>
|
149 |
</tr>
|
150 |
<tr>
|
151 |
<td>GPQA (0-shot)*</td>
|
152 |
+
<td>10.3</td>
|
153 |
+
<td>11.1</td>
|
154 |
+
<td>6.2</td>
|
155 |
+
<td>7.2</td>
|
156 |
</tr>
|
157 |
<tr>
|
158 |
<td>MUSR (0-shot)*</td>
|
159 |
+
<td>8.2</td>
|
160 |
+
<td>12.2</td>
|
161 |
+
<td>38.6</td>
|
162 |
+
<td>8.3</td>
|
163 |
</tr>
|
164 |
<tr>
|
165 |
<td>BBH (3-shot)*</td>
|
166 |
+
<td>33.3</td>
|
167 |
+
<td>35.3</td>
|
168 |
+
<td>43.7</td>
|
169 |
+
<td>25.2</td>
|
170 |
</tr>
|
171 |
<tr>
|
172 |
<td rowspan="4">CommonSense Understanding</td>
|
173 |
<td>PIQA (0-shot)</td>
|
174 |
+
<td>75.6</td>
|
175 |
+
<td>82.3</td>
|
176 |
+
<td>78.9</td>
|
177 |
+
<td>80.9</td>
|
178 |
</tr>
|
179 |
<tr>
|
180 |
<td>SciQ (0-shot)</td>
|
181 |
+
<td>29.2</td>
|
182 |
+
<td>94.9</td>
|
183 |
+
<td>80.2</td>
|
184 |
+
<td>93.6</td>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
</tr>
|
186 |
<tr>
|
187 |
<td>OpenbookQA (0-shot)</td>
|
188 |
+
<td>45.6</td>
|
189 |
+
<td>34.6</td>
|
190 |
+
<td>46.2</td>
|
191 |
+
<td>47.2</td>
|
192 |
</tr>
|
193 |
</tbody>
|
194 |
</table>
|