File size: 15,888 Bytes
c12bd84
 
dfa14a8
a79afe8
 
9695a47
9444cd2
d7b89ce
 
1a1910c
 
d7b89ce
a79afe8
8ef77e5
 
 
a79afe8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c12bd84
9695a47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f52387e
 
 
9695a47
 
 
 
f52387e
9695a47
 
 
 
 
 
 
 
 
 
 
 
 
 
e79bcf3
dc21a69
d506f10
 
dc21a69
 
 
 
 
 
 
 
 
 
9695a47
a5fb364
383dc16
f839734
96ffe12
a5fb364
3f507e0
a5fb364
18ec1ba
 
 
a5fb364
 
43b4e29
1a1910c
f839734
1a1910c
8ef77e5
 
1a1910c
28d4d6a
43b4e29
0a33874
28e8799
0a33874
 
28e8799
e3642ff
8488477
0a33874
8488477
 
28e8799
8488477
 
e3642ff
0a33874
8488477
 
28e8799
8488477
e3642ff
a34a60b
28e8799
 
43b4e29
8474e43
 
0a33874
8474e43
 
 
 
 
 
 
0a33874
8474e43
 
 
0a33874
3abc48f
 
 
 
8474e43
3abc48f
8474e43
 
3abc48f
 
e1345be
3abc48f
 
246a992
3abc48f
 
2a7f691
3abc48f
 
a34a60b
6a7ad7c
 
 
a34a60b
 
 
 
 
 
 
 
9695a47
bdad6e6
337b761
8488477
337b761
b94ee8f
bdad6e6
b94ee8f
337b761
8488477
bdad6e6
 
337b761
 
 
bdad6e6
f9a0f38
 
 
 
 
bdad6e6
 
 
 
 
f9a0f38
2b16774
 
 
bdad6e6
 
337b761
bdad6e6
 
 
7ed3839
bdad6e6
 
7ed3839
bdad6e6
7ed3839
 
 
 
 
 
 
 
 
 
 
bdad6e6
7ed3839
 
 
 
 
 
 
 
 
 
 
337b761
 
bdad6e6
7ed3839
ca8d4b9
cb21769
 
 
 
1f8cc2a
cb21769
 
 
 
25bce6d
 
ca8d4b9
 
 
 
 
 
 
fb25b1e
 
 
 
 
9695a47
a450af5
 
 
 
 
 
 
7b77065
 
 
 
 
2db58a0
618dcce
 
 
 
 
627e0f9
 
 
dc21a69
 
 
fb25b1e
 
 
627e0f9
 
 
2db58a0
627e0f9
 
9695a47
12a9766
 
 
 
e7c50af
19c7c67
 
 
 
 
 
e7c50af
19c7c67
12a9766
e7c50af
 
 
12a9766
 
e7c50af
 
12a9766
 
 
 
 
 
 
 
 
41d7691
 
 
12a9766
41d7691
 
12a9766
a5fb364
4fbdb10
 
ea8703d
4fbdb10
18ec1ba
ea8703d
 
4fbdb10
ea8703d
4fbdb10
ea8703d
4fbdb10
ea8703d
4fbdb10
ea8703d
4fbdb10
ea8703d
4fbdb10
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
import streamlit as st
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go

# Configure the page to use Streamlit's "wide" layout.
st.set_page_config(layout="wide")

def load_csv_data(file_path):
    """Read a CSV into a pandas DataFrame.

    Args:
        file_path: Location of the CSV; passed unchanged to
            ``pandas.read_csv`` with that function's default options.

    Returns:
        The parsed DataFrame.
    """
    frame = pd.read_csv(file_path)
    return frame





def plot_top_n(df, target_column, n=10):
    """Render a grouped bar chart for the ``n`` rows scoring highest on ``target_column``.

    For every selected row three bars are drawn side by side: the
    ``target_column`` value and ``MMLU_average`` against the left y-axis, and
    ``Parameters`` against a secondary (right) y-axis. The chart is written to
    the Streamlit app with ``st.pyplot``; nothing is returned.

    Args:
        df: DataFrame holding ``target_column``, ``MMLU_average`` and
            ``Parameters`` columns. Its index values become the x tick labels.
        target_column: Name of the column used to rank the rows.
        n: Number of top rows to plot (also used verbatim in the title).
    """
    top_n = df.nlargest(n, target_column)

    fig, ax1 = plt.subplots(figsize=(10, 5))
    try:
        # One slot per row; the three bars of a group sit at ind - width, ind, ind + width.
        width = 0.28
        ind = np.arange(len(top_n))

        # Scores share the primary y-axis.
        ax1.bar(ind - width, top_n[target_column], width=width, color='blue', label=target_column)
        ax1.bar(ind, top_n['MMLU_average'], width=width, color='orange', label='MMLU_average')

        ax1.set_title(f'Top {n} performing models on {target_column}')
        ax1.set_xlabel('Model')
        ax1.set_ylabel('Score')

        # Parameters get their own axis on the right.
        ax2 = ax1.twinx()
        ax2.bar(ind + width, top_n['Parameters'], width=width, color='red', label='Parameters')
        ax2.set_ylabel('Parameters', color='red')
        ax2.tick_params(axis='y', labelcolor='red')

        ax1.set_xticks(ind)
        ax1.set_xticklabels(top_n.index, rotation=45, ha="right")

        # Lay out the axes first, then place the combined legend to the right of them.
        fig.tight_layout()
        fig.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        st.pyplot(fig)
    finally:
        # Figures created through pyplot stay registered until closed; without
        # this every call (i.e. every script rerun) would keep one more figure alive.
        plt.close(fig)

# Build an unfilled radar (polar) chart comparing several models
def create_radar_chart_unfilled(df, model_names, metrics):
    """Return a Plotly polar chart with one unfilled trace per model.

    Args:
        df: DataFrame whose index contains ``model_names`` and whose columns
            contain ``metrics``.
        model_names: Index labels of the rows to draw, one trace each.
        metrics: Column names used as the angular categories.

    Returns:
        A ``go.Figure`` (800x600, legend shown) whose radial axis spans the
        smallest to the largest value found among the selected rows/columns.
    """
    # The radial range is taken over every selected cell so all traces fit.
    selected_scores = df.loc[model_names, metrics]
    axis_floor = selected_scores.min().min()
    axis_ceiling = selected_scores.max().max()

    fig = go.Figure()
    for name in model_names:
        trace = go.Scatterpolar(r=df.loc[name, metrics], theta=metrics, name=name)
        fig.add_trace(trace)

    radial_axis = dict(visible=True, range=[axis_floor, axis_ceiling])
    fig.update_layout(
        polar=dict(radialaxis=radial_axis),
        showlegend=True,
        width=800,
        height=600,
    )
    return fig



# Build a line chart with one line per model across the given metrics
def create_line_chart(df, model_names, metrics):
    """Return a Plotly line chart comparing models metric by metric.

    Args:
        df: DataFrame whose index contains ``model_names`` and whose columns
            contain ``metrics``.
        model_names: Index labels of the rows to plot, one line each.
        metrics: Column names placed along the x-axis.

    Returns:
        The Plotly Express figure, titled 'Comparison of Models', with solid
        lines coloured by model and the legend shown.
    """
    # Reshape to long form: one record per (model, metric) pair.
    records = [
        {'Model': name, 'Metric': metric, 'Value': score}
        for name in model_names
        for metric, score in zip(metrics, df.loc[name, metrics])
    ]
    long_form = pd.DataFrame(records)

    fig = px.line(
        long_form,
        x='Metric',
        y='Value',
        color='Model',
        title='Comparison of Models',
        line_dash_sequence=['solid'],
    )
    fig.update_layout(showlegend=True)
    return fig

def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=None):
    """Find the tasks on which ``closest_models`` differ most from ``target_model``.

    Args:
        df: DataFrame indexed by model, one column per task/attribute.
        target_model: Index label of the reference row.
        closest_models: Index labels of the rows compared against the reference.
        num_differences: How many of the largest (model, task) differences to keep.
        exclude_columns: Columns dropped before comparing. Defaults to
            ``['Parameters', 'organization']`` when not given.

    Returns:
        A tuple ``(table, tasks)`` where ``table`` is a DataFrame with columns
        ``'Task'`` and ``'Difference'`` (largest difference first; a task can
        appear several times, once per model), and ``tasks`` is the list of
        distinct task names in that same largest-first order.
    """
    if exclude_columns is None:
        exclude_columns = ['Parameters', 'organization']

    task_scores = df.drop(columns=list(exclude_columns))

    # Absolute per-task distance of every compared model from the target model.
    differences = task_scores.loc[closest_models].sub(task_scores.loc[target_model]).abs()

    # Unstacking yields a Series keyed by (task, model); keep the largest entries.
    top_differences = differences.unstack().nlargest(num_differences)

    top_differences_table = pd.DataFrame({
        'Task': top_differences.index.get_level_values(0).tolist(),
        'Difference': top_differences.values,
    })

    # De-duplicate while preserving the ranking. A set would return the tasks
    # in arbitrary order, making downstream tables/charts vary between runs.
    unique_top_differences_tasks = list(dict.fromkeys(top_differences_table['Task']))
    return top_differences_table, unique_top_differences_tasks

# Page title and introductory text, rendered top to bottom in statement order.
# st.title('Model Evaluation Results including MMLU by task')
st.title('Interactive Portal for Analyzing Open Source Large Language Models')
st.markdown("""***Last updated October 6th***""")
st.markdown("""**Models that are suspected to have training data contaminated with evaluation data have been removed.**""")
st.markdown("""
            Hugging Face runs evaluations on open source models and provides results on a
            [publicly available leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [dataset](https://huggingface.co/datasets/open-llm-leaderboard/results). 
            The Hugging Face leaderboard currently displays the overall result for Measuring Massive Multitask Language Understanding (MMLU), but not the results for individual tasks.
            This app provides a way to explore the results for individual tasks and compare models across tasks.
            There are 57 tasks in the MMLU evaluation that cover a wide variety of subjects including Science, Math, Humanities, Social Science, Applied Science, Logic, and Security.
            [Preliminary analysis of MMLU-by-Task data](https://coreymorrisdata.medium.com/preliminary-analysis-of-mmlu-evaluation-data-insights-from-500-open-source-models-e67885aa364b)
            """)

# Load the data into memory
data_path = "processed_data_2023-10-06.csv"
data_df = load_csv_data(data_path)
# The "Unnamed: 0" column (presumably the index column written when the CSV
# was saved -- verify against the file) becomes the "Model Name" index.
data_df.rename(columns={"Unnamed: 0": "Model Name"}, inplace=True)
data_df.set_index("Model Name", inplace=True)

# Optional pickers for models and columns.
# NOTE(review): the selections collected here are never applied -- the table
# below is built from the full `data_df`. Confirm whether they should filter it.
filters = st.checkbox('Select Models and/or Evaluations')

# Initialize selected columns with "Parameters" and "MMLU_average" if filters are checked
selected_columns = ['Parameters', 'MMLU_average'] if filters else data_df.columns.tolist()

# Initialize selected models as empty if filters are checked
selected_models = [] if filters else data_df.index.tolist()

if filters:
    # Create multi-select for columns with default selection
    selected_columns = st.multiselect(
        'Select Columns',
        data_df.columns.tolist(),
        default=selected_columns
    )

    # Create multi-select for models without default selection
    selected_models = st.multiselect(
        'Select Models',
        data_df.index.tolist()
    )

# Start from every model, best MMLU average first.
filtered_data = data_df.sort_values(by=['MMLU_average'], ascending=False)

# Select box for filtering by Parameters
parameter_threshold = st.selectbox(
    'Filter by Parameters (Less Than or Equal To):',
    options=[3, 7, 13, 35, 'No threshold'],
    index=4,  # Set the default selected option to 'No threshold'
    format_func=lambda x: f"{x}" if isinstance(x, int) else x
)

# Filter the DataFrame based on the selected parameter threshold if not 'No threshold'
if isinstance(parameter_threshold, int):
    filtered_data = filtered_data[filtered_data['Parameters'] <= parameter_threshold]


# Search box
search_query = st.text_input("Filter by Model Name:", "")

# Filter the DataFrame based on the search query in the index (model name).
# The text is matched literally (regex=False): treating raw user input as a
# regular expression made characters such as "(" or "+" raise an error.
# na=False keeps the mask strictly boolean if a model name is missing.
if search_query:
    name_matches = filtered_data.index.str.contains(search_query, case=False, regex=False, na=False)
    filtered_data = filtered_data[name_matches]


# Search box for columns: comma-separated, spaces ignored
column_search_query = st.text_input("Filter by Column/Task Name:", "").replace(" ", "").split(',')

# Drop empty tokens (empty box, trailing or doubled commas); an empty token is
# a substring of every name and would silently match all columns.
column_search_terms = [query.lower() for query in column_search_query if query]

# Get the columns that contain any search term; with no terms, show every column
if column_search_terms:
    matching_columns = [
        col for col in filtered_data.columns
        if any(term in col.lower() for term in column_search_terms)
    ]
else:
    matching_columns = filtered_data.columns.tolist()

# Display the DataFrame with only the matching columns
st.markdown("## Sortable Results")
st.dataframe(filtered_data[matching_columns])

# CSV download (all columns of the filtered rows, not only the matching ones)

filtered_data.index.name = "Model Name"

csv = filtered_data.to_csv(index=True)
st.download_button(
    label="Download data as CSV",
    data=csv,
    file_name="model_evaluation_results.csv",
    mime="text/csv",
)


def create_plot(df, x_values, y_values, models=None, title=None):
    """Build a Plotly scatter plot of two columns with an OLS trendline.

    Args:
        df: DataFrame containing the ``x_values`` and ``y_values`` columns;
            its index values are exposed as ``Model`` in the hover data.
        x_values: Name of the column plotted on the x-axis.
        y_values: Name of the column plotted on the y-axis.
        models: Optional collection of index labels; when given, only those
            rows are plotted.
        title: Optional figure title; defaults to "<x_values> vs. <y_values>".

    Returns:
        The Plotly figure (1000x500, legend hidden). For each axis whose
        column name starts with 'MMLU', a red dashed line is drawn at 0.25,
        spanning the data range of the other axis.
    """
    rows = df if models is None else df[df.index.isin(models)]

    # Points missing either coordinate cannot be plotted.
    rows = rows.dropna(subset=[x_values, y_values])

    plot_data = pd.DataFrame({
        'Model': rows.index,
        x_values: rows[x_values],
        y_values: rows[y_values],
    })
    plot_data['color'] = 'purple'
    fig = px.scatter(
        plot_data,
        x=x_values,
        y=y_values,
        color='color',
        hover_data=['Model'],
        trendline="ols",
    )

    if title is None:
        title = x_values + " vs. " + y_values

    fig.update_layout(
        showlegend=False,
        xaxis_title=x_values,
        yaxis_title=y_values,
        xaxis=dict(),
        yaxis=dict(),
        title=title,
        height=500,
        width=1000,
    )

    # Data extents bound the reference lines drawn below.
    x_lo, x_hi = rows[x_values].min(), rows[x_values].max()
    y_lo, y_hi = rows[y_values].min(), rows[y_values].max()
    reference_style = dict(color='red', width=2, dash='dash')

    # Vertical line at x = 0.25 when the x column is an MMLU score.
    if x_values.startswith('MMLU'):
        fig.add_shape(type='line', x0=0.25, x1=0.25, y0=y_lo, y1=y_hi, line=reference_style)

    # Horizontal line at y = 0.25 when the y column is an MMLU score.
    if y_values.startswith('MMLU'):
        fig.add_shape(type='line', x0=x_lo, x1=x_hi, y0=0.25, y1=0.25, line=reference_style)

    return fig


# Custom scatter plots: the user picks any two columns of the filtered table
# and they are plotted against each other with create_plot.
st.header('Custom scatter plots')
st.write("""
         The scatter plot is useful to identify models that outperform or underperform on a particular task in relation to their size or overall performance.
         Identifying these models is a first step to better understand what training strategies result in better performance on a particular task.
         """)
st.markdown("***The dashed red line indicates random chance accuracy of 0.25 as the MMLU evaluation is multiple choice with 4 response options.***")
# add a line separating the writing
st.markdown("***")
st.write("As expected, there is a strong positive relationship between the number of parameters and average performance on the MMLU evaluation.")

# Defaults are the columns at positions 1 and 4 of the table.
# NOTE(review): presumably Parameters and MMLU_average, matching the sentence
# above -- verify against the CSV's column order.
selected_x_column = st.selectbox('Select x-axis', filtered_data.columns.tolist(), index=1)
selected_y_column = st.selectbox('Select y-axis', filtered_data.columns.tolist(), index=4)

if selected_x_column != selected_y_column:    # Avoid creating a plot with the same column on both axes
    fig = create_plot(filtered_data, selected_x_column, selected_y_column)
    st.plotly_chart(fig)
else:
    st.write("Please select different columns for the x and y axes.")




# end of custom scatter plots

# Section to select a model and display a table and radar chart of the models
# closest to it in MMLU average
st.header("Compare a Selected Model to the 5 Models Closest in MMLU Average Performance")
st.write("""
         This comparison highlights the nuances in model performance across different tasks. 
         While the overall MMLU average score provides a general understanding of a model's capabilities, 
         examining the closest models reveals variations in performance on individual tasks. 
         Such an analysis can uncover specific strengths and weaknesses and guide further exploration and improvement.
         """)

# The filters above can leave no rows at all. In that case there is no model
# to select and the lookups below would fail with a KeyError, so explain the
# situation and end this script run instead.
if filtered_data.empty:
    st.warning("No models match the current filters. Adjust the filters above to compare models.")
    st.stop()

default_model_name = "GPT-JT-6B-v0"

# Preselect the default model when it survived the filters, else the first row.
default_model_index = filtered_data.index.tolist().index(default_model_name) if default_model_name in filtered_data.index else 0
selected_model_name = st.selectbox("Select a Model:", filtered_data.index.tolist(), index=default_model_index)

# Get the 5 models with the smallest absolute gap in MMLU_average, with unique
# indices. The selected model has a gap of 0 to itself, so it is normally part
# of this list.
closest_models_diffs = filtered_data['MMLU_average'].sub(filtered_data.loc[selected_model_name, 'MMLU_average']).abs()
closest_models = closest_models_diffs.nsmallest(5, keep='first').index.drop_duplicates().tolist()


# Find the top 10 tasks with the largest differences and convert to a DataFrame
top_differences_table, top_differences_tasks = find_top_differences_table(filtered_data, selected_model_name, closest_models)

# Display the DataFrame for the closest models and the top differences tasks
st.dataframe(filtered_data.loc[closest_models, top_differences_tasks])

# Create a radar chart for the tasks with the largest differences
fig_radar_top_differences = create_radar_chart_unfilled(filtered_data, closest_models, top_differences_tasks)

# Display the radar chart
st.plotly_chart(fig_radar_top_differences)


# Heading for the hand-picked findings rendered in the rest of the page.
st.markdown("## Notable findings and plots")

# Moral scenarios plots
st.markdown("### MMLU’s Moral Scenarios Benchmark Doesn’t Measure What You Think it Measures")
def show_random_moral_scenarios_question():
    """Show one randomly sampled moral-scenarios question inside an expander.

    Reads 'moral_scenarios_questions.csv', samples a single row and writes the
    value of its 'query' column into a Streamlit expander.
    """
    questions = pd.read_csv('moral_scenarios_questions.csv')
    sampled_row = questions.sample()
    with st.expander("Show a random moral scenarios question"):
        st.write(sampled_row['query'].values[0])



# Commentary on the moral scenarios task, followed by a sample question and
# two scatter plots of that task's accuracy.
st.write("""
         After a deeper dive into the moral scenarios task, it appears that benchmark is not a valid measurement of moral judgement.
         The challenges these models face are not rooted in understanding each scenario, but rather in the structure of the task itself.
         I would recommend using a different benchmark for moral judgement. More details of the analysis can be found here: [MMLU’s Moral Scenarios Benchmark Doesn’t Measure What You Think it Measures ](https://medium.com/p/74fd6e512521)
            """)

show_random_moral_scenarios_question()

# Moral scenarios accuracy against model size.
fig = create_plot(filtered_data, 'Parameters', 'MMLU_moral_scenarios', title="Impact of Parameter Count on Accuracy for Moral Scenarios")
st.plotly_chart(fig)
# NOTE(review): st.write() is called with no arguments here; it appears to
# render nothing -- confirm whether it can be removed.
st.write()



# Moral scenarios accuracy against overall MMLU average (default title).
fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
st.plotly_chart(fig)

# Abstract algebra: bar chart of the top 10 models, then accuracy against model size.
st.markdown('### Abstract Algebra Performance')
st.write("Small models showed surprisingly strong performance on the abstract algebra task.  A 6 Billion parameter model is tied for the best performance on this task and there are a number of other small models in the top 10.")
plot_top_n(filtered_data, 'MMLU_abstract_algebra', 10)

fig = create_plot(filtered_data, 'Parameters', 'MMLU_abstract_algebra')
st.plotly_chart(fig)

# Acknowledgement and citation list closing the page.
st.markdown("***Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations.***")

st.markdown("""
# Citation

1. Corey Morris (2023). *Exploring the Characteristics of Large Language Models: An Interactive Portal for Analyzing 700+ Open Source Models Across 57 Diverse Evaluation Tasks*. [link](https://huggingface.co/spaces/CoreyMorris/MMLU-by-task-Leaderboard)
            
2. Edward Beeching, Clémentine Fourrier, Nathan Habib, Sheon Han, Nathan Lambert, Nazneen Rajani, Omar Sanseviero, Lewis Tunstall, Thomas Wolf. (2023). *Open LLM Leaderboard*. Hugging Face. [link](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)

3. Gao, Leo et al. (2021). *A framework for few-shot language model evaluation*. Zenodo. [link](https://doi.org/10.5281/zenodo.5371628)

4. Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, Oyvind Tafjord. (2018). *Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge*. arXiv. [link](https://arxiv.org/abs/1803.05457)

5. Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, Yejin Choi. (2019). *HellaSwag: Can a Machine Really Finish Your Sentence?*. arXiv. [link](https://arxiv.org/abs/1905.07830)

6. Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, Jacob Steinhardt. (2021). *Measuring Massive Multitask Language Understanding*. arXiv. [link](https://arxiv.org/abs/2009.03300)

7. Stephanie Lin, Jacob Hilton, Owain Evans. (2022). *TruthfulQA: Measuring How Models Mimic Human Falsehoods*. arXiv. [link](https://arxiv.org/abs/2109.07958)
""")