CoreyMorris committed
Commit 36799a9 • 1 Parent(s): 28fcccf
Updated data and added notes about the site.
- app.py +4 -3
- processed_data_2023-09-29.csv +0 -0
- results +1 -1
app.py CHANGED
@@ -112,8 +112,9 @@ def find_top_differences_table(df, target_model, closest_models, num_differences
 
 # st.title('Model Evaluation Results including MMLU by task')
 st.title('Interactive Portal for Analyzing Open Source Large Language Models')
-st.markdown("""***Last updated
-st.markdown("""**
+st.markdown("""***Last updated March 17th 2024***""")
+st.markdown("""**It has not been updated to correctly extract the parameter number from mixture of experts models.**""")
+st.markdown("""**As of 04-17-2024, this data was not generated using the chat templates. Smaller models are especially sensitive to this and other aspects related to the format of the inputs.**""")
 st.markdown("""
 This page provides a way to explore the results for individual tasks and compare models across tasks. Data for the benchmarks hellaswag, arc_challenge, and truthfulQA have also been included for comparison.
 There are 57 tasks in the MMLU evaluation that cover a wide variety of subjects including Science, Math, Humanities, Social Science, Applied Science, Logic, and Security.
@@ -121,7 +122,7 @@ st.markdown("""
 """)
 
 # Load the data into memory
-data_path = "
+data_path = "processed_data_2024-04-16.csv"
 data_df = load_csv_data(data_path)
 # drop the column Unnamed: 0
 data_df.rename(columns={'Unnamed: 0': "Model Name"}, inplace=True)
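For context, the snippet below is a minimal, self-contained sketch of the data-loading step this commit touches. The definition of load_csv_data is not part of the diff, so its body here (a cached pandas read) is an assumption about how a Streamlit app would typically implement it; only the data_path value and the column rename come from the diff above.

import pandas as pd
import streamlit as st


@st.cache_data
def load_csv_data(path: str) -> pd.DataFrame:
    # Assumed implementation: load_csv_data is not shown in this diff.
    # A cached pandas read keeps Streamlit from re-reading the file on every rerun.
    return pd.read_csv(path)


# Path to the refreshed results file referenced in the diff above.
data_path = "processed_data_2024-04-16.csv"
data_df = load_csv_data(data_path)

# The CSV's unnamed index column holds the model identifiers,
# so rename it for display in the portal.
data_df.rename(columns={'Unnamed: 0': "Model Name"}, inplace=True)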
processed_data_2023-09-29.csv ADDED
The diff for this file is too large to render. See the raw diff.
results CHANGED
@@ -1 +1 @@
-Subproject commit
+Subproject commit ae58c7715592b2f354a89b5b64f3d2d12335dc89