Corey Morris
committed on
Commit
•
1f8cc2a
1
Parent(s):
a79afe8
Added finding from moral scenarios about threshold
Browse files
app.py
CHANGED
@@ -156,7 +156,8 @@ def create_plot(df, arc_column, moral_column, models=None):
|
|
156 |
|
157 |
# Custom scatter plots
|
158 |
st.header('Custom scatter plots')
|
159 |
-
st.write("
|
|
|
160 |
selected_x_column = st.selectbox('Select x-axis', filtered_data.columns.tolist(), index=0)
|
161 |
selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=3)
|
162 |
|
@@ -168,27 +169,25 @@ else:
|
|
168 |
|
169 |
# end of custom scatter plots
|
170 |
st.markdown("## Notable findings and plots")
|
171 |
-
st.markdown("### Moral Scenarios Performance")
|
172 |
|
|
|
|
|
|
|
173 |
|
174 |
-
fig = create_plot(filtered_data, '
|
175 |
st.plotly_chart(fig)
|
176 |
|
|
|
|
|
|
|
|
|
177 |
fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios')
|
178 |
st.plotly_chart(fig)
|
179 |
|
180 |
fig = px.histogram(filtered_data, x="MMLU_moral_scenarios", marginal="rug", hover_data=filtered_data.columns)
|
181 |
st.plotly_chart(fig)
|
182 |
|
183 |
-
|
184 |
-
st.write("Small models showed surprisingly strong performance on the abstract algebra task. A 6 Billion parameter model is tied for the best performance on this task and there are a number of other small models in the top 10.")
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
# Usage example:
|
189 |
-
plot_top_n(filtered_data, 'MMLU_abstract_algebra', 10)
|
190 |
-
|
191 |
-
fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
|
192 |
st.plotly_chart(fig)
|
193 |
|
194 |
st.markdown("***Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations.***")
|
|
|
156 |
|
157 |
# Custom scatter plots
|
158 |
st.header('Custom scatter plots')
|
159 |
+
st.write("As expected, there is a strong positive relationship between the number of parameters and average performance on the MMLU evaluation.")
|
160 |
+
st.markdown("***The dashed red line indicates random chance accuracy of 0.25 as the MMLU evaluation is multiple choice with 4 response options.***")
|
161 |
selected_x_column = st.selectbox('Select x-axis', filtered_data.columns.tolist(), index=0)
|
162 |
selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=3)
|
163 |
|
|
|
169 |
|
170 |
# end of custom scatter plots
|
171 |
st.markdown("## Notable findings and plots")
|
|
|
172 |
|
173 |
+
st.markdown('### Abstract Algebra Performance')
|
174 |
+
st.write("Small models showed surprisingly strong performance on the abstract algebra task. A 6 Billion parameter model is tied for the best performance on this task and there are a number of other small models in the top 10.")
|
175 |
+
plot_top_n(filtered_data, 'MMLU_abstract_algebra', 10)
|
176 |
|
177 |
+
fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
|
178 |
st.plotly_chart(fig)
|
179 |
|
180 |
+
st.markdown("### Moral Scenarios Performance")
|
181 |
+
st.write("While smaller models can perform well at many tasks, the model size threshold for decent performance on moral scenarios is much higher. There are no models with less than 13 billion parameters with performance much better than random chance.")
|
182 |
+
|
183 |
+
st.write("Impact of Parameter Count on Accuracy for Moral Scenarios")
|
184 |
fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios')
|
185 |
st.plotly_chart(fig)
|
186 |
|
187 |
fig = px.histogram(filtered_data, x="MMLU_moral_scenarios", marginal="rug", hover_data=filtered_data.columns)
|
188 |
st.plotly_chart(fig)
|
189 |
|
190 |
+
fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
st.plotly_chart(fig)
|
192 |
|
193 |
st.markdown("***Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations.***")
|