puneeshkhanna
committed on
Commit
•
940edf8
1
Parent(s):
4feba72
Update README.md
Browse files
Update benchmarks
README.md
CHANGED
@@ -126,10 +126,10 @@ We report in the following table our internal pipeline benchmarks:
|
|
126 |
<tr>
|
127 |
<td rowspan="4">Reasoning</td>
|
128 |
<td>Arc Challenge (25-shot)</td>
|
129 |
-
<td>
|
130 |
-
<td>
|
131 |
-
<td>
|
132 |
-
<td>
|
133 |
</tr>
|
134 |
<tr>
|
135 |
<td>GPQA (0-shot)</td>
|
@@ -155,17 +155,17 @@ We report in the following table our internal pipeline benchmarks:
|
|
155 |
<tr>
|
156 |
<td rowspan="4">CommonSense Understanding</td>
|
157 |
<td>PIQA (0-shot)</td>
|
158 |
-
<td>
|
159 |
-
<td>
|
160 |
-
<td>
|
161 |
-
<td>
|
162 |
</tr>
|
163 |
<tr>
|
164 |
<td>SciQ (0-shot)</td>
|
165 |
-
<td>
|
166 |
-
<td>
|
167 |
-
<td>
|
168 |
-
<td>
|
169 |
</tr>
|
170 |
<tr>
|
171 |
<td>Winogrande (0-shot)</td>
|
@@ -176,10 +176,10 @@ We report in the following table our internal pipeline benchmarks:
|
|
176 |
</tr>
|
177 |
<tr>
|
178 |
<td>OpenbookQA (0-shot)</td>
|
179 |
-
<td>
|
180 |
-
<td>
|
181 |
-
<td>
|
182 |
-
<td>
|
183 |
</tr>
|
184 |
</tbody>
|
185 |
</table>
|
|
|
126 |
<tr>
|
127 |
<td rowspan="4">Reasoning</td>
|
128 |
<td>Arc Challenge (25-shot)</td>
|
129 |
+
<td>58.2</td>
|
130 |
+
<td>63.2</td>
|
131 |
+
<td>67.5</td>
|
132 |
+
<td>63.1</td>
|
133 |
</tr>
|
134 |
<tr>
|
135 |
<td>GPQA (0-shot)</td>
|
|
|
155 |
<tr>
|
156 |
<td rowspan="4">CommonSense Understanding</td>
|
157 |
<td>PIQA (0-shot)</td>
|
158 |
+
<td>81.2</td>
|
159 |
+
<td>79.9</td>
|
160 |
+
<td>79.1</td>
|
161 |
+
<td>82.9</td>
|
162 |
</tr>
|
163 |
<tr>
|
164 |
<td>SciQ (0-shot)</td>
|
165 |
+
<td>94.6</td>
|
166 |
+
<td>95.2</td>
|
167 |
+
<td>92.4</td>
|
168 |
+
<td>97.1</td>
|
169 |
</tr>
|
170 |
<tr>
|
171 |
<td>Winogrande (0-shot)</td>
|
|
|
176 |
</tr>
|
177 |
<tr>
|
178 |
<td>OpenbookQA (0-shot)</td>
|
179 |
+
<td>44.8</td>
|
180 |
+
<td>47.0</td>
|
181 |
+
<td>43.8</td>
|
182 |
+
<td>47.2</td>
|
183 |
</tr>
|
184 |
</tbody>
|
185 |
</table>
|