Spaces: Running
natolambert committed · Commit 8799e00 · 1 Parent(s): 4e61a96
LFG
Browse files:
- app.py +70 -45
- src/constants.py +57 -0
- src/md.py +6 -0
- src/utils.py +5 -0
app.py
CHANGED

@@ -6,6 +6,7 @@ from datasets import load_dataset
 from src.utils import load_all_data
 from src.md import ABOUT_TEXT, TOP_TEXT
 from src.plt import plot_avg_correlation
+from src.constants import subset_mapping, length_categories, example_counts
 import numpy as np
 
 api = HfApi()

@@ -33,54 +34,34 @@ repo = snapshot_download(
 def avg_over_herm(dataframe):
     """
     Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
+
+    We average over 4 core sections (per prompt weighting):
+    1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
+    2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
+    3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
+    4. Code: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
+
     """
     new_df = dataframe.copy()
-    [old unweighted per-subset loop; body cut off in the diff view]
-        new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
-    keep_columns = ["model",
+
+    # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
+    for subset, sub_subsets in subset_mapping.items():
+        subset_cols = [col for col in new_df.columns if col in sub_subsets]
+        sub_data = new_df[subset_cols].values  # take the relevant column values
+        sub_counts = [example_counts[s] for s in sub_subsets]  # take the example counts
+        new_df[subset] = np.round(np.average(sub_data, axis=1, weights=sub_counts), 2)  # take the weighted average
+        # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
+
+    keep_columns = ["model",] + list(subset_mapping.keys())
+    # keep_columns = ["model", "average"] + subsets
     new_df = new_df[keep_columns]
-
-    new_df["average"] = np.round(np.nanmean(new_df[subsets].values, axis=1), 2)
-    # rename column "hep" to "hep (code)"
-    new_df = new_df.rename(columns={"hep": "hep (code)"})
+
     return new_df
 
 def expand_subsets(dataframe):
     # TODO need to modify data/ script to do this
     pass
 
-# reference for length bias categories
-length_categories = {
-    'alpacaeval-easy': 'True',
-    'alpacaeval-hard': 'True',
-    'alpacaeval-length': 'Neutral',
-    'donotanswer': 'False',
-    'hep-cpp': 'Neutral',
-    'hep-go': 'Neutral',
-    'hep-java': 'Neutral',
-    'hep-js': 'Neutral',
-    'hep-python': 'Neutral',
-    'hep-rust': 'Neutral',
-    'llmbar-adver-GPTInst': 'False',
-    'llmbar-adver-GPTOut': 'Neutral',
-    'llmbar-adver-manual': 'False',
-    'llmbar-adver-neighbor': 'False',
-    'llmbar-natural': 'Neutral',
-    'mt-bench-easy': 'False',
-    'mt-bench-hard': 'False',
-    'mt-bench-med': 'Neutral',
-    'refusals-dangerous': 'False',
-    'refusals-offensive': 'False',
-    'xstest-should-refuse': 'False',
-    'xstest-should-respond': 'True'
-}
 
 def length_bias_check(dataframe):
     """
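For reference, the new avg_over_herm above computes each section column as the mean of its subsets' win rates weighted by example_counts, i.e. the same number you would get by pooling every prompt in that section. A minimal sketch of the computation (the two models and their scores are made up; subset names and counts follow src/constants.py):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "model": ["rm-a", "rm-b"],          # hypothetical models, illustrative scores only
    "alpacaeval-easy": [0.98, 0.90],
    "alpacaeval-length": [0.90, 0.85],
    "alpacaeval-hard": [0.95, 0.80],
    "mt-bench-easy": [1.00, 0.96],
    "mt-bench-med": [0.95, 0.90],
})
chat_subsets = ["alpacaeval-easy", "alpacaeval-length", "alpacaeval-hard", "mt-bench-easy", "mt-bench-med"]
chat_counts = [100, 95, 95, 28, 40]     # example_counts for the Chat section
# same call as in avg_over_herm: per-prompt weighting of the subset accuracies
df["Chat"] = np.round(np.average(df[chat_subsets].values, axis=1, weights=chat_counts), 2)
print(df[["model", "Chat"]])

One behavioral difference from the commented-out np.nanmean line: np.average does not skip NaNs, so a model missing any subset in a section will get NaN for that section's average.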
@@ -119,7 +100,7 @@ def length_bias_check(dataframe):
 
 
 herm_data = load_all_data(repo_dir_herm, subdir="eval-set").sort_values(by='average', ascending=False)
-herm_data_avg = avg_over_herm(herm_data).sort_values(by='
+herm_data_avg = avg_over_herm(herm_data).sort_values(by='Chat', ascending=False)
 herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
 prefs_data = load_all_data(repo_dir_herm, subdir="pref-sets").sort_values(by='average', ascending=False)
 # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
@@ -149,10 +130,23 @@ def random_sample(r: gr.Request, subset):
 
 subsets = eval_set.unique("subset")
 
+def regex_table(dataframe, regex):
+    """
+    Takes a model name as a regex, then returns only the rows that has that in it.
+    """
+    # Split regex statement by comma and trim whitespace around regexes
+    regex_list = [x.strip() for x in regex.split(",")]
+    # Join the list into a single regex pattern with '|' acting as OR
+    combined_regex = '|'.join(regex_list)
+    # Filter the dataframe such that 'model' contains any of the regex patterns
+    return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
+
+
 with gr.Blocks() as app:
     # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
     with gr.Row():
        gr.Markdown(TOP_TEXT)
+       search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM Eval Set - Overview"):
            with gr.Row():
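The regex_table helper above is what the new search box calls: the comma-delimited query is stripped, joined with '|', and matched case-insensitively against the model column. A small usage sketch (model names are only illustrative):

import pandas as pd

def regex_table(dataframe, regex):
    # same logic as the helper added in this commit
    regex_list = [x.strip() for x in regex.split(",")]
    combined_regex = '|'.join(regex_list)
    return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]

df = pd.DataFrame({
    "model": ["org-a/ultra-rm-13b", "org-b/starling-rm-7b", "random-baseline"],  # illustrative names
    "Chat": [0.95, 0.97, 0.50],
})
# "ultra, starling" becomes the pattern "ultra|starling" and keeps the first two rows
print(regex_table(df, "ultra, starling"))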
@@ -163,24 +157,45 @@ with gr.Blocks() as app:
                    elem_id="herm_dataframe_avg",
                    height=1000,
                )
+               # backup reference data
+               herm_table_hidden = gr.Dataframe(
+                   herm_data_avg.values,
+                   datatype=col_types_herm_avg,
+                   headers=herm_data_avg.columns.tolist(),
+                   visible=False,
+               )
        with gr.TabItem("HERM Eval Set - Detailed"):
            with gr.Row():
-               [removed line cut off in the diff view]
+               herm_table_detailed = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                    height=1000,
                )
+               # backup
+               herm_table_detailed_hidden = gr.Dataframe(
+                   herm_data.values,
+                   datatype=col_types_herm,
+                   headers=herm_data.columns.tolist(),
+                   visible=False,
+               )
        with gr.TabItem("HERM Eval Set - Length Bias"):
            with gr.Row():
-               [removed line cut off in the diff view]
+               herm_table_len = gr.Dataframe(
                    herm_data_length.values,
                    datatype=cols_herm_data_length,
                    headers=herm_data_length.columns.tolist(),
                    elem_id="herm_dataframe_length",
                    height=1000,
                )
+               # backup
+               herm_table_len_hidden = gr.Dataframe(
+                   herm_data_length.values,
+                   datatype=cols_herm_data_length,
+                   headers=herm_data_length.columns.tolist(),
+                   visible=False,
+               )
        with gr.TabItem("Known Pref. Sets"):
            with gr.Row():
                PREF_SET_TEXT = """
@@ -195,6 +210,13 @@ with gr.Blocks() as app:
                    elem_id="prefs_dataframe",
                    height=1000,
                )
+               # backup
+               pref_sets_table_hidden = gr.Dataframe(
+                   prefs_data.values,
+                   datatype=col_types_prefs,
+                   headers=prefs_data.columns.tolist(),
+                   visible=False,
+               )
 
        with gr.TabItem("About"):
            with gr.Row():

@@ -216,6 +238,11 @@ with gr.Blocks() as app:
    # with gr.Row():
    #     plot = plot_avg_correlation(herm_data_avg, prefs_data)
    #     gr.Plot(plot)
+
+   search.change(regex_table, inputs=[herm_table_hidden, search], outputs=herm_table)
+   search.change(regex_table, inputs=[herm_table_detailed_hidden, search], outputs=herm_table_detailed)
+   search.change(regex_table, inputs=[herm_table_len_hidden, search], outputs=herm_table_len)
+   search.change(regex_table, inputs=[pref_sets_table_hidden, search], outputs=pref_sets_table)
 
    # Load data when app starts, TODO make this used somewhere...
    # def load_data_on_start():
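The search wiring above relies on a hidden copy of each table: the visible gr.Dataframe is the filter's output, while an invisible twin created with visible=False keeps the full rows, so every new query filters from the complete data rather than the previously filtered view. A stripped-down sketch of the same pattern (component names and data are simplified placeholders, not the app's actual layout):

import gradio as gr
import pandas as pd

data = pd.DataFrame({"model": ["rm-a", "rm-b", "rm-c"], "Chat": [0.9, 0.8, 0.7]})

def filter_table(dataframe, query):
    # dataframe arrives as the hidden table's (full, unfiltered) value
    return dataframe[dataframe["model"].str.contains(query, case=False, na=False)]

with gr.Blocks() as demo:
    search = gr.Textbox(label="Model Search")
    table = gr.Dataframe(data.values, headers=data.columns.tolist())  # what the user sees
    table_hidden = gr.Dataframe(data.values, headers=data.columns.tolist(), visible=False)  # unfiltered backup
    search.change(filter_table, inputs=[table_hidden, search], outputs=table)

# demo.launch()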
@@ -231,6 +258,4 @@ with gr.Blocks() as app:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
 scheduler.start()
-
-
-app.queue().launch()
+app.launch() # had .queue() before launch before... not sure if that's necessary
src/constants.py
ADDED

@@ -0,0 +1,57 @@
+# reference for length bias categories
+length_categories = {
+    'alpacaeval-easy': 'True',
+    'alpacaeval-hard': 'True',
+    'alpacaeval-length': 'Neutral',
+    'donotanswer': 'False',
+    'hep-cpp': 'Neutral',
+    'hep-go': 'Neutral',
+    'hep-java': 'Neutral',
+    'hep-js': 'Neutral',
+    'hep-python': 'Neutral',
+    'hep-rust': 'Neutral',
+    'llmbar-adver-GPTInst': 'False',
+    'llmbar-adver-GPTOut': 'Neutral',
+    'llmbar-adver-manual': 'False',
+    'llmbar-adver-neighbor': 'False',
+    'llmbar-natural': 'Neutral',
+    'mt-bench-easy': 'False',
+    'mt-bench-hard': 'False',
+    'mt-bench-med': 'Neutral',
+    'refusals-dangerous': 'False',
+    'refusals-offensive': 'False',
+    'xstest-should-refuse': 'False',
+    'xstest-should-respond': 'True'
+}
+
+example_counts = {
+    "alpacaeval-easy": 100,
+    "alpacaeval-length": 95,
+    "alpacaeval-hard": 95,
+    "mt-bench-easy": 28,
+    "mt-bench-med": 40,
+    "mt-bench-hard": 37,
+    "refusals-dangerous": 100,
+    "refusals-offensive": 100,
+    "llmbar-natural": 100,
+    "llmbar-adver-neighbor": 134,
+    "llmbar-adver-GPTInst": 92,
+    "llmbar-adver-GPTOut": 47,
+    "llmbar-adver-manual": 46,
+    "xstest-should-refuse": 250,
+    "xstest-should-respond": 154,
+    "donotanswer": 136,
+    "hep-cpp": 164,
+    "hep-go": 164,
+    "hep-java": 164,
+    "hep-js": 164,
+    "hep-python": 164,
+    "hep-rust": 164
+}
+
+subset_mapping = {
+    "Chat": ["alpacaeval-easy", "alpacaeval-length", "alpacaeval-hard", "mt-bench-easy", "mt-bench-med"],
+    "Chat Hard": ["mt-bench-hard", "llmbar-natural", "llmbar-adver-neighbor", "llmbar-adver-GPTInst", "llmbar-adver-GPTOut", "llmbar-adver-manual"],
+    "Safety": ["refusals-dangerous", "refusals-offensive", "xstest-should-refuse", "xstest-should-respond", "donotanswer"],
+    "Code": ["hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust"]
+}
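These constants can be cross-checked against each other and against the totals quoted in src/md.py: the subsets listed in subset_mapping, the keys of example_counts, and the keys of length_categories are the same 22 subset names, and the example counts sum to the 2538 prompts mentioned in the About text. A quick sanity-check sketch (assumes the Space's root directory is on the import path):

from src.constants import subset_mapping, example_counts, length_categories

mapped = [s for subsets in subset_mapping.values() for s in subsets]
assert set(mapped) == set(example_counts) == set(length_categories)  # same 22 subset names everywhere
assert sum(example_counts.values()) == 2538  # matches "Total number of the prompts" in src/md.py
# prompts per section, i.e. the weights behind each section average
print({section: sum(example_counts[s] for s in subsets) for section, subsets in subset_mapping.items()})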
src/md.py
CHANGED

@@ -2,6 +2,12 @@ ABOUT_TEXT = """
 We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
 A win is when the score for the chosen response is higher than the score for the rejected response.
 
+We average over 4 core sections (per prompt weighting):
+1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
+2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
+3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
+4. Code: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
+
 ## Subset Summary
 
 Total number of the prompts is: 2538, filtered from 4676.
src/utils.py
CHANGED

@@ -61,6 +61,11 @@ def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to p
     # select all columns except "model"
     cols = df.columns.tolist()
     cols.remove("model")
+    # remove model_beaker from dataframe
+    if "model_beaker" in cols:
+        cols.remove("model_beaker")
+        df = df.drop(columns=["model_beaker"])
+
     # round
     df[cols] = df[cols].round(2)
     avg = np.nanmean(df[cols].values,axis=1).round(2)