puneeshkhanna committed on
Commit
f422761
1 Parent(s): 8ac821c

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +13 -13
README.md CHANGED
@@ -90,14 +90,14 @@ We report in the following table our internal pipeline benchmarks:
90
  <td rowspan="3">General</td>
91
  <td>MMLU (5-shot)</td>
92
  <td>65.2</td>
93
- <td>74.2</td>
94
  <td>70.8</td>
95
  <td>67.5</td>
96
  </tr>
97
  <tr>
98
  <td>MMLU-PRO (5-shot)</td>
99
  <td>32.7</td>
100
- <td>43.5</td>
101
  <td>41.4</td>
102
  <td>39.2</td>
103
  </tr>
@@ -106,13 +106,13 @@ We report in the following table our internal pipeline benchmarks:
106
  <td>12.0</td>
107
  <td>33.9</td>
108
  <td>21.2</td>
109
- <td>34.3</td>
110
  </tr>
111
  <tr>
112
  <td rowspan="2">Math</td>
113
  <td>GSM8K (5-shot)</td>
114
  <td>49.4</td>
115
- <td>82.9</td>
116
  <td>69.1</td>
117
  <td>76.2</td>
118
  </tr>
@@ -121,14 +121,14 @@ We report in the following table our internal pipeline benchmarks:
121
  <td>4.1</td>
122
  <td>15.5</td>
123
  <td>10.5</td>
124
- <td>18.0</td>
125
  </tr>
126
  <tr>
127
  <td rowspan="4">Reasoning</td>
128
  <td>Arc Challenge (25-shot)</td>
129
  <td>58.2</td>
130
  <td>63.2</td>
131
- <td>67.5</td>
132
  <td>63.1</td>
133
  </tr>
134
  <tr>
@@ -136,20 +136,20 @@ We report in the following table our internal pipeline benchmarks:
136
  <td>31.0</td>
137
  <td>33.0</td>
138
  <td>33.4</td>
139
- <td>35.5</td>
140
  </tr>
141
  <tr>
142
  <td>MUSR (0-shot)</td>
143
  <td>38.0</td>
144
  <td>44.2</td>
145
  <td>45.3</td>
146
- <td>47.3</td>
147
  </tr>
148
  <tr>
149
  <td>BBH (3-shot)</td>
150
  <td>46.5</td>
151
  <td>54.0</td>
152
- <td>54.3</td>
153
  <td>51.0</td>
154
  </tr>
155
  <tr>
@@ -157,28 +157,28 @@ We report in the following table our internal pipeline benchmarks:
157
  <td>PIQA (0-shot)</td>
158
  <td>81.2</td>
159
  <td>79.9</td>
160
- <td>82.9</td>
161
  <td>79.1</td>
162
  </tr>
163
  <tr>
164
  <td>SciQ (0-shot)</td>
165
  <td>94.6</td>
166
  <td>95.2</td>
167
- <td>97.1</td>
168
  <td>92.4</td>
169
  </tr>
170
  <tr>
171
  <td>Winogrande (0-shot)</td>
172
  <td>74.0</td>
173
  <td>72.9</td>
174
- <td>74.2</td>
175
  <td>71.0</td>
176
  </tr>
177
  <tr>
178
  <td>OpenbookQA (0-shot)</td>
179
  <td>44.8</td>
180
  <td>47.0</td>
181
- <td>47.2</td>
182
  <td>43.8</td>
183
  </tr>
184
  </tbody>
 
90
  <td rowspan="3">General</td>
91
  <td>MMLU (5-shot)</td>
92
  <td>65.2</td>
93
+ <td><b>74.2</b></td>
94
  <td>70.8</td>
95
  <td>67.5</td>
96
  </tr>
97
  <tr>
98
  <td>MMLU-PRO (5-shot)</td>
99
  <td>32.7</td>
100
+ <td><b>43.5</b></td>
101
  <td>41.4</td>
102
  <td>39.2</td>
103
  </tr>
 
106
  <td>12.0</td>
107
  <td>33.9</td>
108
  <td>21.2</td>
109
+ <td><b>34.3</b></td>
110
  </tr>
111
  <tr>
112
  <td rowspan="2">Math</td>
113
  <td>GSM8K (5-shot)</td>
114
  <td>49.4</td>
115
+ <td><b>82.9</b></td>
116
  <td>69.1</td>
117
  <td>76.2</td>
118
  </tr>
 
121
  <td>4.1</td>
122
  <td>15.5</td>
123
  <td>10.5</td>
124
+ <td><b>18.0</b></td>
125
  </tr>
126
  <tr>
127
  <td rowspan="4">Reasoning</td>
128
  <td>Arc Challenge (25-shot)</td>
129
  <td>58.2</td>
130
  <td>63.2</td>
131
+ <td><b>67.5</b></td>
132
  <td>63.1</td>
133
  </tr>
134
  <tr>
 
136
  <td>31.0</td>
137
  <td>33.0</td>
138
  <td>33.4</td>
139
+ <td><b>35.5</b></td>
140
  </tr>
141
  <tr>
142
  <td>MUSR (0-shot)</td>
143
  <td>38.0</td>
144
  <td>44.2</td>
145
  <td>45.3</td>
146
+ <td><b>47.3</b></td>
147
  </tr>
148
  <tr>
149
  <td>BBH (3-shot)</td>
150
  <td>46.5</td>
151
  <td>54.0</td>
152
+ <td><b>54.3</b></td>
153
  <td>51.0</td>
154
  </tr>
155
  <tr>
 
157
  <td>PIQA (0-shot)</td>
158
  <td>81.2</td>
159
  <td>79.9</td>
160
+ <td><b>82.9</b></td>
161
  <td>79.1</td>
162
  </tr>
163
  <tr>
164
  <td>SciQ (0-shot)</td>
165
  <td>94.6</td>
166
  <td>95.2</td>
167
+ <td><b>97.1</b></td>
168
  <td>92.4</td>
169
  </tr>
170
  <tr>
171
  <td>Winogrande (0-shot)</td>
172
  <td>74.0</td>
173
  <td>72.9</td>
174
+ <td><b>74.2</b></td>
175
  <td>71.0</td>
176
  </tr>
177
  <tr>
178
  <td>OpenbookQA (0-shot)</td>
179
  <td>44.8</td>
180
  <td>47.0</td>
181
+ <td><b>47.2</b></td>
182
  <td>43.8</td>
183
  </tr>
184
  </tbody>