future-xy
committed on
Commit
·
d6d7ec6
1
Parent(s):
2d754ab
formatting code
Browse files
- app.py +75 -56
- backend-cli.py +161 -44
- cli/analysis-cli.py +93 -76
- cli/averitec-upload-cli.py +8 -6
- cli/beta-cli.py +12 -6
- cli/completed-cli.py +20 -8
- cli/eval-cli.py +18 -8
- cli/fever-upload-cli.py +15 -18
- cli/fix-requests-cli.py +8 -8
- cli/halueval-upload-cli.py +23 -23
- cli/isp-upload-cli.py +4 -4
- cli/nqswap-upload-cli.py +2 -6
- cli/shroom-upload-cli.py +3 -3
- cli/submit-cli.py +39 -16
- cli/sync-open-llm-cli.py +29 -11
- cli/truefalse-upload-cli.py +3 -3
- src/backend/envs.py +1 -1
- src/backend/huggingface_generate_until.py +4 -5
- src/backend/manage_requests.py +37 -20
- src/backend/moe_infinity.py +12 -9
- src/backend/run_eval_suite.py +30 -16
- src/backend/tasks/cnndm/task.py +42 -18
- src/backend/tasks/cnndm/task_v2.py +47 -19
- src/backend/tasks/faithdial/utils.py +3 -2
- src/backend/tasks/halueval/utils.py +21 -3
- src/backend/tasks/selfcheckgpt/task.py +48 -36
- src/backend/tasks/xsum/task.py +51 -10
- src/backend/tasks/xsum/task_v2.py +56 -11
- src/browse.py +65 -59
- src/display/utils.py +1 -0
- src/leaderboard/filter_models.py +3 -3
- src/leaderboard/read_evals.py +34 -23
- src/populate.py +11 -7
- src/submission/check_validity.py +22 -6
- src/submission/submit.py +3 -1
- src/utils.py +8 -5
app.py
CHANGED
@@ -19,7 +19,7 @@ from src.display.about import (
|
|
19 |
LLM_BENCHMARKS_TEXT,
|
20 |
LLM_BENCHMARKS_DETAILS,
|
21 |
FAQ_TEXT,
|
22 |
-
TITLE
|
23 |
)
|
24 |
|
25 |
from src.display.css_html_js import custom_css
|
@@ -35,7 +35,7 @@ from src.display.utils import (
|
|
35 |
ModelType,
|
36 |
fields,
|
37 |
WeightType,
|
38 |
-
Precision
|
39 |
)
|
40 |
|
41 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
@@ -47,7 +47,9 @@ from src.utils import get_dataset_summary_table
|
|
47 |
def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
|
48 |
try:
|
49 |
print(local_dir)
|
50 |
-
snapshot_download(
|
|
|
|
|
51 |
except Exception as e:
|
52 |
restart_space()
|
53 |
|
@@ -57,15 +59,21 @@ def restart_space():
|
|
57 |
|
58 |
|
59 |
def init_space():
|
60 |
-
dataset_df = get_dataset_summary_table(file_path=
|
61 |
|
62 |
-
if socket.gethostname() not in {
|
63 |
# sync model_type with open-llm-leaderboard
|
64 |
-
ui_snapshot_download(
|
65 |
-
|
|
|
|
|
|
|
|
|
66 |
raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, "", COLS, BENCHMARK_COLS)
|
67 |
|
68 |
-
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
|
|
|
|
|
69 |
return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
|
70 |
|
71 |
|
@@ -74,12 +82,9 @@ leaderboard_df = original_df.copy()
|
|
74 |
|
75 |
|
76 |
# Searching and filtering
|
77 |
-
def update_table(
|
78 |
-
|
79 |
-
|
80 |
-
precision_query: list,
|
81 |
-
size_query: list,
|
82 |
-
query: str):
|
83 |
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
|
84 |
filtered_df = filter_queries(query, filtered_df)
|
85 |
df = select_columns(filtered_df, columns)
|
@@ -99,7 +104,9 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
|
99 |
# We use COLS to maintain sorting
|
100 |
filtered_df = df[
|
101 |
# always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
|
102 |
-
always_here_cols
|
|
|
|
|
103 |
]
|
104 |
return filtered_df
|
105 |
|
@@ -121,10 +128,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
|
|
121 |
return filtered_df
|
122 |
|
123 |
|
124 |
-
def filter_models(df: pd.DataFrame,
|
125 |
-
type_query: list,
|
126 |
-
size_query: list,
|
127 |
-
precision_query: list) -> pd.DataFrame:
|
128 |
# Show all models
|
129 |
filtered_df = df
|
130 |
|
@@ -152,15 +156,15 @@ with demo:
|
|
152 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
153 |
|
154 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
155 |
-
with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark",
|
156 |
-
elem_id="llm-benchmark-tab-table",
|
157 |
-
id=0):
|
158 |
with gr.Row():
|
159 |
with gr.Column():
|
160 |
with gr.Row():
|
161 |
-
search_bar = gr.Textbox(
|
162 |
-
|
163 |
-
|
|
|
|
|
164 |
with gr.Row():
|
165 |
shown_columns = gr.CheckboxGroup(
|
166 |
choices=[
|
@@ -175,7 +179,8 @@ with demo:
|
|
175 |
],
|
176 |
label="Select columns to show",
|
177 |
elem_id="column-select",
|
178 |
-
interactive=True
|
|
|
179 |
|
180 |
with gr.Column(min_width=320):
|
181 |
filter_columns_type = gr.CheckboxGroup(
|
@@ -183,40 +188,51 @@ with demo:
|
|
183 |
choices=[t.to_str() for t in ModelType],
|
184 |
value=[t.to_str() for t in ModelType],
|
185 |
interactive=True,
|
186 |
-
elem_id="filter-columns-type"
|
|
|
187 |
|
188 |
filter_columns_precision = gr.CheckboxGroup(
|
189 |
label="Precision",
|
190 |
choices=[i.value.name for i in Precision],
|
191 |
value=[i.value.name for i in Precision],
|
192 |
interactive=True,
|
193 |
-
elem_id="filter-columns-precision"
|
|
|
194 |
|
195 |
filter_columns_size = gr.CheckboxGroup(
|
196 |
label="Model sizes (in billions of parameters)",
|
197 |
choices=list(NUMERIC_INTERVALS.keys()),
|
198 |
value=list(NUMERIC_INTERVALS.keys()),
|
199 |
interactive=True,
|
200 |
-
elem_id="filter-columns-size"
|
|
|
201 |
|
202 |
# breakpoint()
|
203 |
|
204 |
leaderboard_table = gr.components.Dataframe(
|
205 |
-
value=
|
206 |
-
[
|
207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
209 |
datatype=TYPES,
|
210 |
elem_id="leaderboard-table",
|
211 |
interactive=False,
|
212 |
-
visible=True
|
|
|
213 |
|
214 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
215 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
216 |
value=original_df[COLS] if original_df.empty is False else original_df,
|
217 |
headers=COLS,
|
218 |
datatype=TYPES,
|
219 |
-
visible=False
|
|
|
220 |
|
221 |
search_bar.submit(
|
222 |
update_table,
|
@@ -228,7 +244,8 @@ with demo:
|
|
228 |
filter_columns_size,
|
229 |
search_bar,
|
230 |
],
|
231 |
-
leaderboard_table
|
|
|
232 |
|
233 |
# Check query parameter once at startup and update search bar
|
234 |
demo.load(load_query, inputs=[], outputs=[search_bar])
|
@@ -245,7 +262,8 @@ with demo:
|
|
245 |
search_bar,
|
246 |
],
|
247 |
leaderboard_table,
|
248 |
-
queue=True
|
|
|
249 |
|
250 |
with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
|
251 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
@@ -253,11 +271,12 @@ with demo:
|
|
253 |
dataset_table = gr.components.Dataframe(
|
254 |
value=dataset_df,
|
255 |
headers=list(dataset_df.columns),
|
256 |
-
datatype=[
|
257 |
elem_id="dataset-table",
|
258 |
interactive=False,
|
259 |
visible=True,
|
260 |
-
column_widths=["15%", "20%"]
|
|
|
261 |
|
262 |
gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
|
263 |
gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
|
@@ -271,26 +290,20 @@ with demo:
|
|
271 |
with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
|
272 |
with gr.Row():
|
273 |
finished_eval_table = gr.components.Dataframe(
|
274 |
-
value=finished_eval_queue_df,
|
275 |
-
|
276 |
-
datatype=EVAL_TYPES,
|
277 |
-
row_count=5)
|
278 |
|
279 |
with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
|
280 |
with gr.Row():
|
281 |
running_eval_table = gr.components.Dataframe(
|
282 |
-
value=running_eval_queue_df,
|
283 |
-
|
284 |
-
datatype=EVAL_TYPES,
|
285 |
-
row_count=5)
|
286 |
|
287 |
with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
|
288 |
with gr.Row():
|
289 |
pending_eval_table = gr.components.Dataframe(
|
290 |
-
value=pending_eval_queue_df,
|
291 |
-
|
292 |
-
datatype=EVAL_TYPES,
|
293 |
-
row_count=5)
|
294 |
|
295 |
with gr.Row():
|
296 |
gr.Markdown("# Submit your model here", elem_classes="markdown-text")
|
@@ -305,7 +318,8 @@ with demo:
|
|
305 |
label="Model type",
|
306 |
multiselect=False,
|
307 |
value=None,
|
308 |
-
interactive=True
|
|
|
309 |
|
310 |
with gr.Column():
|
311 |
precision = gr.Dropdown(
|
@@ -313,14 +327,16 @@ with demo:
|
|
313 |
label="Precision",
|
314 |
multiselect=False,
|
315 |
value="float32",
|
316 |
-
interactive=True
|
|
|
317 |
|
318 |
weight_type = gr.Dropdown(
|
319 |
choices=[i.value.name for i in WeightType],
|
320 |
label="Weights type",
|
321 |
multiselect=False,
|
322 |
value="Original",
|
323 |
-
interactive=True
|
|
|
324 |
|
325 |
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
326 |
|
@@ -337,7 +353,8 @@ with demo:
|
|
337 |
weight_type,
|
338 |
model_type,
|
339 |
],
|
340 |
-
submission_result
|
|
|
341 |
|
342 |
with gr.Row():
|
343 |
with gr.Accordion("Citing this leaderboard", open=False):
|
@@ -346,7 +363,8 @@ with demo:
|
|
346 |
label=CITATION_BUTTON_LABEL,
|
347 |
lines=20,
|
348 |
elem_id="citation-button",
|
349 |
-
show_copy_button=True
|
|
|
350 |
|
351 |
scheduler = BackgroundScheduler()
|
352 |
|
@@ -356,7 +374,8 @@ scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
|
|
356 |
def launch_backend():
|
357 |
import subprocess
|
358 |
from src.backend.envs import DEVICE
|
359 |
-
|
|
|
360 |
_ = subprocess.run(["python", "backend-cli.py"])
|
361 |
|
362 |
|
|
|
19 |
LLM_BENCHMARKS_TEXT,
|
20 |
LLM_BENCHMARKS_DETAILS,
|
21 |
FAQ_TEXT,
|
22 |
+
TITLE,
|
23 |
)
|
24 |
|
25 |
from src.display.css_html_js import custom_css
|
|
|
35 |
ModelType,
|
36 |
fields,
|
37 |
WeightType,
|
38 |
+
Precision,
|
39 |
)
|
40 |
|
41 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
|
|
47 |
def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
|
48 |
try:
|
49 |
print(local_dir)
|
50 |
+
snapshot_download(
|
51 |
+
repo_id=repo_id, local_dir=local_dir, repo_type=repo_type, tqdm_class=tqdm_class, etag_timeout=etag_timeout
|
52 |
+
)
|
53 |
except Exception as e:
|
54 |
restart_space()
|
55 |
|
|
|
59 |
|
60 |
|
61 |
def init_space():
|
62 |
+
dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
|
63 |
|
64 |
+
if socket.gethostname() not in {"neuromancer"}:
|
65 |
# sync model_type with open-llm-leaderboard
|
66 |
+
ui_snapshot_download(
|
67 |
+
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
68 |
+
)
|
69 |
+
ui_snapshot_download(
|
70 |
+
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
71 |
+
)
|
72 |
raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, "", COLS, BENCHMARK_COLS)
|
73 |
|
74 |
+
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
|
75 |
+
EVAL_REQUESTS_PATH, EVAL_COLS
|
76 |
+
)
|
77 |
return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
|
78 |
|
79 |
|
|
|
82 |
|
83 |
|
84 |
# Searching and filtering
|
85 |
+
def update_table(
|
86 |
+
hidden_df: pd.DataFrame, columns: list, type_query: list, precision_query: list, size_query: list, query: str
|
87 |
+
):
|
|
|
|
|
|
|
88 |
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
|
89 |
filtered_df = filter_queries(query, filtered_df)
|
90 |
df = select_columns(filtered_df, columns)
|
|
|
104 |
# We use COLS to maintain sorting
|
105 |
filtered_df = df[
|
106 |
# always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
|
107 |
+
always_here_cols
|
108 |
+
+ [c for c in COLS if c in df.columns and c in columns]
|
109 |
+
+ dummy_col
|
110 |
]
|
111 |
return filtered_df
|
112 |
|
|
|
128 |
return filtered_df
|
129 |
|
130 |
|
131 |
+
def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precision_query: list) -> pd.DataFrame:
|
|
|
|
|
|
|
132 |
# Show all models
|
133 |
filtered_df = df
|
134 |
|
|
|
156 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
157 |
|
158 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
159 |
+
with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
|
|
|
|
160 |
with gr.Row():
|
161 |
with gr.Column():
|
162 |
with gr.Row():
|
163 |
+
search_bar = gr.Textbox(
|
164 |
+
placeholder=" 🔍 Model search (separate multiple queries with `;`)",
|
165 |
+
show_label=False,
|
166 |
+
elem_id="search-bar",
|
167 |
+
)
|
168 |
with gr.Row():
|
169 |
shown_columns = gr.CheckboxGroup(
|
170 |
choices=[
|
|
|
179 |
],
|
180 |
label="Select columns to show",
|
181 |
elem_id="column-select",
|
182 |
+
interactive=True,
|
183 |
+
)
|
184 |
|
185 |
with gr.Column(min_width=320):
|
186 |
filter_columns_type = gr.CheckboxGroup(
|
|
|
188 |
choices=[t.to_str() for t in ModelType],
|
189 |
value=[t.to_str() for t in ModelType],
|
190 |
interactive=True,
|
191 |
+
elem_id="filter-columns-type",
|
192 |
+
)
|
193 |
|
194 |
filter_columns_precision = gr.CheckboxGroup(
|
195 |
label="Precision",
|
196 |
choices=[i.value.name for i in Precision],
|
197 |
value=[i.value.name for i in Precision],
|
198 |
interactive=True,
|
199 |
+
elem_id="filter-columns-precision",
|
200 |
+
)
|
201 |
|
202 |
filter_columns_size = gr.CheckboxGroup(
|
203 |
label="Model sizes (in billions of parameters)",
|
204 |
choices=list(NUMERIC_INTERVALS.keys()),
|
205 |
value=list(NUMERIC_INTERVALS.keys()),
|
206 |
interactive=True,
|
207 |
+
elem_id="filter-columns-size",
|
208 |
+
)
|
209 |
|
210 |
# breakpoint()
|
211 |
|
212 |
leaderboard_table = gr.components.Dataframe(
|
213 |
+
value=(
|
214 |
+
leaderboard_df[
|
215 |
+
[c.name for c in fields(AutoEvalColumn) if c.never_hidden]
|
216 |
+
+ shown_columns.value
|
217 |
+
+ [AutoEvalColumn.dummy.name]
|
218 |
+
]
|
219 |
+
if leaderboard_df.empty is False
|
220 |
+
else leaderboard_df
|
221 |
+
),
|
222 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
223 |
datatype=TYPES,
|
224 |
elem_id="leaderboard-table",
|
225 |
interactive=False,
|
226 |
+
visible=True,
|
227 |
+
) # column_widths=["2%", "20%"]
|
228 |
|
229 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
230 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
231 |
value=original_df[COLS] if original_df.empty is False else original_df,
|
232 |
headers=COLS,
|
233 |
datatype=TYPES,
|
234 |
+
visible=False,
|
235 |
+
)
|
236 |
|
237 |
search_bar.submit(
|
238 |
update_table,
|
|
|
244 |
filter_columns_size,
|
245 |
search_bar,
|
246 |
],
|
247 |
+
leaderboard_table,
|
248 |
+
)
|
249 |
|
250 |
# Check query parameter once at startup and update search bar
|
251 |
demo.load(load_query, inputs=[], outputs=[search_bar])
|
|
|
262 |
search_bar,
|
263 |
],
|
264 |
leaderboard_table,
|
265 |
+
queue=True,
|
266 |
+
)
|
267 |
|
268 |
with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
|
269 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
271 |
dataset_table = gr.components.Dataframe(
|
272 |
value=dataset_df,
|
273 |
headers=list(dataset_df.columns),
|
274 |
+
datatype=["str", "markdown", "str", "str", "str"],
|
275 |
elem_id="dataset-table",
|
276 |
interactive=False,
|
277 |
visible=True,
|
278 |
+
column_widths=["15%", "20%"],
|
279 |
+
)
|
280 |
|
281 |
gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
|
282 |
gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
|
|
|
290 |
with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
|
291 |
with gr.Row():
|
292 |
finished_eval_table = gr.components.Dataframe(
|
293 |
+
value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
|
294 |
+
)
|
|
|
|
|
295 |
|
296 |
with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
|
297 |
with gr.Row():
|
298 |
running_eval_table = gr.components.Dataframe(
|
299 |
+
value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
|
300 |
+
)
|
|
|
|
|
301 |
|
302 |
with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
|
303 |
with gr.Row():
|
304 |
pending_eval_table = gr.components.Dataframe(
|
305 |
+
value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
|
306 |
+
)
|
|
|
|
|
307 |
|
308 |
with gr.Row():
|
309 |
gr.Markdown("# Submit your model here", elem_classes="markdown-text")
|
|
|
318 |
label="Model type",
|
319 |
multiselect=False,
|
320 |
value=None,
|
321 |
+
interactive=True,
|
322 |
+
)
|
323 |
|
324 |
with gr.Column():
|
325 |
precision = gr.Dropdown(
|
|
|
327 |
label="Precision",
|
328 |
multiselect=False,
|
329 |
value="float32",
|
330 |
+
interactive=True,
|
331 |
+
)
|
332 |
|
333 |
weight_type = gr.Dropdown(
|
334 |
choices=[i.value.name for i in WeightType],
|
335 |
label="Weights type",
|
336 |
multiselect=False,
|
337 |
value="Original",
|
338 |
+
interactive=True,
|
339 |
+
)
|
340 |
|
341 |
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
342 |
|
|
|
353 |
weight_type,
|
354 |
model_type,
|
355 |
],
|
356 |
+
submission_result,
|
357 |
+
)
|
358 |
|
359 |
with gr.Row():
|
360 |
with gr.Accordion("Citing this leaderboard", open=False):
|
|
|
363 |
label=CITATION_BUTTON_LABEL,
|
364 |
lines=20,
|
365 |
elem_id="citation-button",
|
366 |
+
show_copy_button=True,
|
367 |
+
)
|
368 |
|
369 |
scheduler = BackgroundScheduler()
|
370 |
|
|
|
374 |
def launch_backend():
|
375 |
import subprocess
|
376 |
from src.backend.envs import DEVICE
|
377 |
+
|
378 |
+
if DEVICE not in {"cpu"}:
|
379 |
_ = subprocess.run(["python", "backend-cli.py"])
|
380 |
|
381 |
|
backend-cli.py
CHANGED
@@ -32,7 +32,9 @@ import pprint
|
|
32 |
def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
|
33 |
for i in range(10):
|
34 |
try:
|
35 |
-
set_eval_request(
|
|
|
|
|
36 |
return
|
37 |
except Exception as e:
|
38 |
print(f"Error setting eval request to {set_to_status}: {e}. Retrying in 60 seconds")
|
@@ -53,19 +55,32 @@ FAILED_STATUS = "FAILED"
|
|
53 |
TASKS_HARNESS = [task.value for task in Tasks]
|
54 |
|
55 |
|
56 |
-
my_snapshot_download(
|
57 |
-
|
|
|
|
|
|
|
|
|
58 |
|
59 |
|
60 |
def sanity_checks():
|
61 |
-
print(f
|
62 |
|
63 |
# pull the eval dataset from the hub and parse any eval requests
|
64 |
# check completed evals and set them to finished
|
65 |
-
my_snapshot_download(
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
return
|
70 |
|
71 |
|
@@ -97,29 +112,51 @@ def request_to_result_name(request: EvalRequest) -> str:
|
|
97 |
def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
|
98 |
batch_size = 2
|
99 |
try:
|
100 |
-
results = run_evaluation(
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
except RuntimeError as e:
|
103 |
if "No executable batch size found" in str(e):
|
104 |
batch_size = 1
|
105 |
-
results = run_evaluation(
|
106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
else:
|
108 |
raise
|
109 |
|
110 |
-
print(
|
111 |
|
112 |
-
dumped = json.dumps(results, indent=2, default=lambda o:
|
113 |
print(dumped)
|
114 |
|
115 |
-
output_path = os.path.join(
|
|
|
|
|
116 |
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
117 |
with open(output_path, "w") as f:
|
118 |
f.write(dumped)
|
119 |
|
120 |
-
my_snapshot_download(
|
121 |
-
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
return results
|
124 |
|
125 |
|
@@ -129,7 +166,9 @@ def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = Non
|
|
129 |
current_finished_status = [FINISHED_STATUS, FAILED_STATUS]
|
130 |
|
131 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
132 |
-
eval_requests: list[EvalRequest] = get_eval_requests(
|
|
|
|
|
133 |
# Sort the evals by priority (first submitted, first run)
|
134 |
eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
|
135 |
|
@@ -145,7 +184,9 @@ def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = Non
|
|
145 |
result_name: str = request_to_result_name(eval_request)
|
146 |
|
147 |
# Check the corresponding result
|
148 |
-
eval_result: Optional[EvalResult] =
|
|
|
|
|
149 |
|
150 |
# breakpoint()
|
151 |
|
@@ -163,13 +204,37 @@ def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = Non
|
|
163 |
if (eval_result is None or task_name not in eval_result.results) and do_run_task:
|
164 |
eval_request: EvalRequest = result_name_to_request[result_name]
|
165 |
|
166 |
-
my_snapshot_download(
|
167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
results = process_evaluation(task, eval_request)
|
170 |
|
171 |
-
my_snapshot_download(
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
|
174 |
return True
|
175 |
|
@@ -182,7 +247,9 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
|
|
182 |
current_finished_status = [PENDING_STATUS, FINISHED_STATUS, FAILED_STATUS]
|
183 |
|
184 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
185 |
-
eval_requests: list[EvalRequest] = get_eval_requests(
|
|
|
|
|
186 |
# Sort the evals by priority (first submitted, first run)
|
187 |
eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
|
188 |
|
@@ -198,7 +265,9 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
|
|
198 |
result_name: str = request_to_result_name(eval_request)
|
199 |
|
200 |
# Check the corresponding result
|
201 |
-
eval_result: Optional[EvalResult] =
|
|
|
|
|
202 |
|
203 |
task_lst = TASKS_HARNESS.copy()
|
204 |
random.shuffle(task_lst)
|
@@ -211,18 +280,46 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
|
|
211 |
if hard_task_lst is None or any(ss in task_name for ss in hard_task_lst):
|
212 |
do_run_task = True
|
213 |
|
214 |
-
task_lst = [
|
215 |
-
if (
|
216 |
-
|
|
|
|
|
|
|
|
|
217 |
eval_request: EvalRequest = result_name_to_request[result_name]
|
218 |
|
219 |
-
my_snapshot_download(
|
220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
|
222 |
results = process_evaluation(task, eval_request)
|
223 |
|
224 |
-
my_snapshot_download(
|
225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
|
227 |
return True
|
228 |
|
@@ -235,7 +332,9 @@ def process_pending_requests() -> bool:
|
|
235 |
current_pending_status = [PENDING_STATUS]
|
236 |
|
237 |
# Get all eval request that are PENDING, if you want to run other evals, change this parameter
|
238 |
-
eval_requests = get_eval_requests(
|
|
|
|
|
239 |
# Sort the evals by priority (first submitted, first run)
|
240 |
eval_requests = sort_models_by_priority(api=API, models=eval_requests)
|
241 |
|
@@ -249,8 +348,16 @@ def process_pending_requests() -> bool:
|
|
249 |
eval_request = eval_requests[0]
|
250 |
pp.pprint(eval_request)
|
251 |
|
252 |
-
my_snapshot_download(
|
253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
|
255 |
task_lst = TASKS_HARNESS.copy()
|
256 |
random.shuffle(task_lst)
|
@@ -258,34 +365,44 @@ def process_pending_requests() -> bool:
|
|
258 |
for task in task_lst:
|
259 |
results = process_evaluation(task, eval_request)
|
260 |
|
261 |
-
my_snapshot_download(
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
return True
|
265 |
|
266 |
|
267 |
def get_args():
|
268 |
-
parser = argparse.ArgumentParser(description=
|
269 |
-
parser.add_argument(
|
270 |
return parser.parse_args()
|
271 |
|
272 |
|
273 |
if __name__ == "__main__":
|
274 |
args = get_args()
|
275 |
local_debug = args.debug
|
276 |
-
#debug specific task by ping
|
277 |
if local_debug:
|
278 |
-
debug_model_names = [
|
279 |
# debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
|
280 |
# debug_task_name = 'ifeval'
|
281 |
-
debug_task_name =
|
282 |
task_lst = TASKS_HARNESS.copy()
|
283 |
for task in task_lst:
|
284 |
for debug_model_name in debug_model_names:
|
285 |
task_name = task.benchmark
|
286 |
if task_name != debug_task_name:
|
287 |
continue
|
288 |
-
eval_request = EvalRequest(
|
|
|
|
|
289 |
results = process_evaluation(task, eval_request)
|
290 |
|
291 |
while True:
|
|
|
32 |
def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
|
33 |
for i in range(10):
|
34 |
try:
|
35 |
+
set_eval_request(
|
36 |
+
api=api, eval_request=eval_request, set_to_status=set_to_status, hf_repo=hf_repo, local_dir=local_dir
|
37 |
+
)
|
38 |
return
|
39 |
except Exception as e:
|
40 |
print(f"Error setting eval request to {set_to_status}: {e}. Retrying in 60 seconds")
|
|
|
55 |
TASKS_HARNESS = [task.value for task in Tasks]
|
56 |
|
57 |
|
58 |
+
my_snapshot_download(
|
59 |
+
repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
60 |
+
)
|
61 |
+
my_snapshot_download(
|
62 |
+
repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
63 |
+
)
|
64 |
|
65 |
|
66 |
def sanity_checks():
|
67 |
+
print(f"Device: {DEVICE}")
|
68 |
|
69 |
# pull the eval dataset from the hub and parse any eval requests
|
70 |
# check completed evals and set them to finished
|
71 |
+
my_snapshot_download(
|
72 |
+
repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
73 |
+
)
|
74 |
+
check_completed_evals(
|
75 |
+
api=API,
|
76 |
+
checked_status=RUNNING_STATUS,
|
77 |
+
completed_status=FINISHED_STATUS,
|
78 |
+
failed_status=FAILED_STATUS,
|
79 |
+
hf_repo=QUEUE_REPO,
|
80 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
81 |
+
hf_repo_results=RESULTS_REPO,
|
82 |
+
local_dir_results=EVAL_RESULTS_PATH_BACKEND,
|
83 |
+
)
|
84 |
return
|
85 |
|
86 |
|
|
|
112 |
def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
|
113 |
batch_size = 2
|
114 |
try:
|
115 |
+
results = run_evaluation(
|
116 |
+
eval_request=eval_request,
|
117 |
+
task_names=[task.benchmark],
|
118 |
+
num_fewshot=task.num_fewshot,
|
119 |
+
batch_size=batch_size,
|
120 |
+
device=DEVICE,
|
121 |
+
use_cache=None,
|
122 |
+
limit=LIMIT,
|
123 |
+
)
|
124 |
except RuntimeError as e:
|
125 |
if "No executable batch size found" in str(e):
|
126 |
batch_size = 1
|
127 |
+
results = run_evaluation(
|
128 |
+
eval_request=eval_request,
|
129 |
+
task_names=[task.benchmark],
|
130 |
+
num_fewshot=task.num_fewshot,
|
131 |
+
batch_size=batch_size,
|
132 |
+
device=DEVICE,
|
133 |
+
use_cache=None,
|
134 |
+
limit=LIMIT,
|
135 |
+
)
|
136 |
else:
|
137 |
raise
|
138 |
|
139 |
+
print("RESULTS", results)
|
140 |
|
141 |
+
dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
|
142 |
print(dumped)
|
143 |
|
144 |
+
output_path = os.path.join(
|
145 |
+
EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json"
|
146 |
+
)
|
147 |
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
148 |
with open(output_path, "w") as f:
|
149 |
f.write(dumped)
|
150 |
|
151 |
+
my_snapshot_download(
|
152 |
+
repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
153 |
+
)
|
154 |
+
API.upload_file(
|
155 |
+
path_or_fileobj=output_path,
|
156 |
+
path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
|
157 |
+
repo_id=RESULTS_REPO,
|
158 |
+
repo_type="dataset",
|
159 |
+
)
|
160 |
return results
|
161 |
|
162 |
|
|
|
166 |
current_finished_status = [FINISHED_STATUS, FAILED_STATUS]
|
167 |
|
168 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
169 |
+
eval_requests: list[EvalRequest] = get_eval_requests(
|
170 |
+
job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
|
171 |
+
)
|
172 |
# Sort the evals by priority (first submitted, first run)
|
173 |
eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
|
174 |
|
|
|
184 |
result_name: str = request_to_result_name(eval_request)
|
185 |
|
186 |
# Check the corresponding result
|
187 |
+
eval_result: Optional[EvalResult] = (
|
188 |
+
result_name_to_result[result_name] if result_name in result_name_to_result else None
|
189 |
+
)
|
190 |
|
191 |
# breakpoint()
|
192 |
|
|
|
204 |
if (eval_result is None or task_name not in eval_result.results) and do_run_task:
|
205 |
eval_request: EvalRequest = result_name_to_request[result_name]
|
206 |
|
207 |
+
my_snapshot_download(
|
208 |
+
repo_id=QUEUE_REPO,
|
209 |
+
revision="main",
|
210 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
211 |
+
repo_type="dataset",
|
212 |
+
max_workers=60,
|
213 |
+
)
|
214 |
+
my_set_eval_request(
|
215 |
+
api=API,
|
216 |
+
eval_request=eval_request,
|
217 |
+
set_to_status=RUNNING_STATUS,
|
218 |
+
hf_repo=QUEUE_REPO,
|
219 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
220 |
+
)
|
221 |
|
222 |
results = process_evaluation(task, eval_request)
|
223 |
|
224 |
+
my_snapshot_download(
|
225 |
+
repo_id=QUEUE_REPO,
|
226 |
+
revision="main",
|
227 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
228 |
+
repo_type="dataset",
|
229 |
+
max_workers=60,
|
230 |
+
)
|
231 |
+
my_set_eval_request(
|
232 |
+
api=API,
|
233 |
+
eval_request=eval_request,
|
234 |
+
set_to_status=FINISHED_STATUS,
|
235 |
+
hf_repo=QUEUE_REPO,
|
236 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
237 |
+
)
|
238 |
|
239 |
return True
|
240 |
|
|
|
247 |
current_finished_status = [PENDING_STATUS, FINISHED_STATUS, FAILED_STATUS]
|
248 |
|
249 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
250 |
+
eval_requests: list[EvalRequest] = get_eval_requests(
|
251 |
+
job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
|
252 |
+
)
|
253 |
# Sort the evals by priority (first submitted, first run)
|
254 |
eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
|
255 |
|
|
|
265 |
result_name: str = request_to_result_name(eval_request)
|
266 |
|
267 |
# Check the corresponding result
|
268 |
+
eval_result: Optional[EvalResult] = (
|
269 |
+
result_name_to_result[result_name] if result_name in result_name_to_result else None
|
270 |
+
)
|
271 |
|
272 |
task_lst = TASKS_HARNESS.copy()
|
273 |
random.shuffle(task_lst)
|
|
|
280 |
if hard_task_lst is None or any(ss in task_name for ss in hard_task_lst):
|
281 |
do_run_task = True
|
282 |
|
283 |
+
task_lst = ["nq", "trivia", "tqa", "self"]
|
284 |
+
if (
|
285 |
+
eval_result is None
|
286 |
+
or do_run_task
|
287 |
+
or task_name not in eval_result.results
|
288 |
+
or any(ss in task_name for ss in task_lst)
|
289 |
+
):
|
290 |
eval_request: EvalRequest = result_name_to_request[result_name]
|
291 |
|
292 |
+
my_snapshot_download(
|
293 |
+
repo_id=QUEUE_REPO,
|
294 |
+
revision="main",
|
295 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
296 |
+
repo_type="dataset",
|
297 |
+
max_workers=60,
|
298 |
+
)
|
299 |
+
my_set_eval_request(
|
300 |
+
api=API,
|
301 |
+
eval_request=eval_request,
|
302 |
+
set_to_status=RUNNING_STATUS,
|
303 |
+
hf_repo=QUEUE_REPO,
|
304 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
305 |
+
)
|
306 |
|
307 |
results = process_evaluation(task, eval_request)
|
308 |
|
309 |
+
my_snapshot_download(
|
310 |
+
repo_id=QUEUE_REPO,
|
311 |
+
revision="main",
|
312 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
313 |
+
repo_type="dataset",
|
314 |
+
max_workers=60,
|
315 |
+
)
|
316 |
+
my_set_eval_request(
|
317 |
+
api=API,
|
318 |
+
eval_request=eval_request,
|
319 |
+
set_to_status=FINISHED_STATUS,
|
320 |
+
hf_repo=QUEUE_REPO,
|
321 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
322 |
+
)
|
323 |
|
324 |
return True
|
325 |
|
|
|
332 |
current_pending_status = [PENDING_STATUS]
|
333 |
|
334 |
# Get all eval request that are PENDING, if you want to run other evals, change this parameter
|
335 |
+
eval_requests = get_eval_requests(
|
336 |
+
job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
|
337 |
+
)
|
338 |
# Sort the evals by priority (first submitted, first run)
|
339 |
eval_requests = sort_models_by_priority(api=API, models=eval_requests)
|
340 |
|
|
|
348 |
eval_request = eval_requests[0]
|
349 |
pp.pprint(eval_request)
|
350 |
|
351 |
+
my_snapshot_download(
|
352 |
+
repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
353 |
+
)
|
354 |
+
my_set_eval_request(
|
355 |
+
api=API,
|
356 |
+
eval_request=eval_request,
|
357 |
+
set_to_status=RUNNING_STATUS,
|
358 |
+
hf_repo=QUEUE_REPO,
|
359 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
360 |
+
)
|
361 |
|
362 |
task_lst = TASKS_HARNESS.copy()
|
363 |
random.shuffle(task_lst)
|
|
|
365 |
for task in task_lst:
|
366 |
results = process_evaluation(task, eval_request)
|
367 |
|
368 |
+
my_snapshot_download(
|
369 |
+
repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
370 |
+
)
|
371 |
+
my_set_eval_request(
|
372 |
+
api=API,
|
373 |
+
eval_request=eval_request,
|
374 |
+
set_to_status=FINISHED_STATUS,
|
375 |
+
hf_repo=QUEUE_REPO,
|
376 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
377 |
+
)
|
378 |
|
379 |
return True
|
380 |
|
381 |
|
382 |
def get_args():
|
383 |
+
parser = argparse.ArgumentParser(description="Run the backend")
|
384 |
+
parser.add_argument("--debug", action="store_true", help="Run in debug mode")
|
385 |
return parser.parse_args()
|
386 |
|
387 |
|
388 |
if __name__ == "__main__":
|
389 |
args = get_args()
|
390 |
local_debug = args.debug
|
391 |
+
# debug specific task by ping
|
392 |
if local_debug:
|
393 |
+
debug_model_names = ["mistralai/Mixtral-8x7B-Instruct-v0.1"]
|
394 |
# debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
|
395 |
# debug_task_name = 'ifeval'
|
396 |
+
debug_task_name = "mmlu"
|
397 |
task_lst = TASKS_HARNESS.copy()
|
398 |
for task in task_lst:
|
399 |
for debug_model_name in debug_model_names:
|
400 |
task_name = task.benchmark
|
401 |
if task_name != debug_task_name:
|
402 |
continue
|
403 |
+
eval_request = EvalRequest(
|
404 |
+
model=debug_model_name, private=False, status="", json_filepath="", precision="float16"
|
405 |
+
)
|
406 |
results = process_evaluation(task, eval_request)
|
407 |
|
408 |
while True:
|
cli/analysis-cli.py
CHANGED
@@ -77,19 +77,19 @@ def sanitise_dataset(name: str) -> str:
|
|
77 |
return res
|
78 |
|
79 |
|
80 |
-
cache_file =
|
81 |
|
82 |
|
83 |
def load_data_map_from_cache(cache_file):
|
84 |
if os.path.exists(cache_file):
|
85 |
-
with open(cache_file,
|
86 |
return pickle.load(f)
|
87 |
else:
|
88 |
return None
|
89 |
|
90 |
|
91 |
def save_data_map_to_cache(data_map, cache_file):
|
92 |
-
with open(cache_file,
|
93 |
pickle.dump(data_map, f)
|
94 |
|
95 |
|
@@ -98,8 +98,12 @@ data_map = load_data_map_from_cache(cache_file)
|
|
98 |
|
99 |
|
100 |
if data_map is None:
|
101 |
-
my_snapshot_download(
|
102 |
-
|
|
|
|
|
|
|
|
|
103 |
|
104 |
result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
|
105 |
request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
|
@@ -107,7 +111,7 @@ if data_map is None:
|
|
107 |
model_name_to_model_map = {}
|
108 |
|
109 |
for path in request_path_lst:
|
110 |
-
with open(path,
|
111 |
data = json.load(f)
|
112 |
model_name_to_model_map[data["model"]] = data
|
113 |
|
@@ -117,7 +121,7 @@ if data_map is None:
|
|
117 |
data_map = {}
|
118 |
|
119 |
for path in result_path_lst:
|
120 |
-
with open(path,
|
121 |
data = json.load(f)
|
122 |
model_name = data["config"]["model_name"]
|
123 |
for dataset_name, results_dict in data["results"].items():
|
@@ -127,42 +131,42 @@ if data_map is None:
|
|
127 |
|
128 |
to_add = True
|
129 |
|
130 |
-
if
|
131 |
to_add = False
|
132 |
|
133 |
-
if
|
134 |
to_add = False
|
135 |
|
136 |
-
if
|
137 |
to_add = False
|
138 |
|
139 |
-
if
|
140 |
to_add = False
|
141 |
|
142 |
-
if
|
143 |
to_add = False
|
144 |
|
145 |
-
if
|
146 |
-
if
|
147 |
to_add = False
|
148 |
|
149 |
-
if
|
150 |
-
if
|
151 |
to_add = False
|
152 |
|
153 |
-
if
|
154 |
-
if
|
155 |
to_add = False
|
156 |
|
157 |
-
if
|
158 |
# to_add = False
|
159 |
-
if
|
160 |
to_add = False
|
161 |
|
162 |
-
if
|
163 |
to_add = False
|
164 |
|
165 |
-
if (
|
166 |
to_add = False
|
167 |
|
168 |
if isinstance(value, str):
|
@@ -172,25 +176,36 @@ if data_map is None:
|
|
172 |
to_add = False
|
173 |
|
174 |
if to_add:
|
175 |
-
if
|
176 |
value /= 100.0
|
177 |
|
178 |
-
if
|
179 |
value /= 100.0
|
180 |
|
181 |
sanitised_metric_name = metric_name
|
182 |
if "," in sanitised_metric_name:
|
183 |
-
sanitised_metric_name = sanitised_metric_name.split(
|
184 |
sanitised_metric_name = sanitise_metric(sanitised_metric_name)
|
185 |
sanitised_dataset_name = sanitise_dataset(dataset_name)
|
186 |
|
187 |
-
model_dataset_metric_to_result_map[
|
|
|
|
|
188 |
|
189 |
if model_name not in data_map:
|
190 |
data_map[model_name] = {}
|
191 |
data_map[model_name][(sanitised_dataset_name, sanitised_metric_name)] = value
|
192 |
|
193 |
-
print(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
save_data_map_to_cache(data_map, cache_file)
|
196 |
|
@@ -202,7 +217,7 @@ for model_name in model_name_lst:
|
|
202 |
if len(data_map[model_name]) < nb_max_metrics - 5:
|
203 |
del data_map[model_name]
|
204 |
|
205 |
-
plot_type_lst = [
|
206 |
|
207 |
for plot_type in plot_type_lst:
|
208 |
|
@@ -212,39 +227,39 @@ for plot_type in plot_type_lst:
|
|
212 |
if dataset_metric not in data_map_v2:
|
213 |
data_map_v2[dataset_metric] = {}
|
214 |
|
215 |
-
if plot_type in {
|
216 |
to_add = True
|
217 |
-
if
|
218 |
to_add = False
|
219 |
-
if
|
220 |
to_add = False
|
221 |
-
if
|
222 |
to_add = False
|
223 |
-
if
|
224 |
to_add = False
|
225 |
if to_add is True:
|
226 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
227 |
-
elif plot_type in {
|
228 |
-
if
|
229 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
230 |
-
elif plot_type in {
|
231 |
-
if
|
232 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
233 |
-
elif plot_type in {
|
234 |
-
if
|
235 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
236 |
-
elif plot_type in {
|
237 |
-
if
|
238 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
239 |
-
elif plot_type in {
|
240 |
-
if
|
241 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
242 |
else:
|
243 |
assert False, f"Unknown plot type: {plot_type}"
|
244 |
|
245 |
# df = pd.DataFrame.from_dict(data_map, orient='index') # Invert the y-axis (rows)
|
246 |
-
df = pd.DataFrame.from_dict(data_map_v2, orient=
|
247 |
-
df.index = [
|
248 |
|
249 |
o_df = df.copy(deep=True)
|
250 |
|
@@ -263,7 +278,7 @@ for plot_type in plot_type_lst:
|
|
263 |
|
264 |
# Calculate dimensions based on the DataFrame size
|
265 |
cell_height = 1.0 # Height of each cell in inches
|
266 |
-
cell_width = 1.0
|
267 |
|
268 |
n_rows = len(df.index) # Datasets and Metrics
|
269 |
n_cols = len(df.columns) # Models
|
@@ -277,60 +292,62 @@ for plot_type in plot_type_lst:
|
|
277 |
|
278 |
sns.set_context("notebook", font_scale=1.3)
|
279 |
|
280 |
-
dendrogram_ratio = (.1, .1)
|
281 |
|
282 |
-
if plot_type in {
|
283 |
fig_width = cell_width * n_cols - 2
|
284 |
fig_height = cell_height * n_rows + 5.2
|
285 |
-
dendrogram_ratio = (.1, .2)
|
286 |
|
287 |
-
if plot_type in {
|
288 |
fig_width = cell_width * n_cols - 2
|
289 |
fig_height = cell_height * n_rows + 5.2
|
290 |
-
dendrogram_ratio = (.1, .4)
|
291 |
|
292 |
-
if plot_type in {
|
293 |
fig_width = cell_width * n_cols - 2
|
294 |
fig_height = cell_height * n_rows + 4
|
295 |
-
dendrogram_ratio = (.1, .2)
|
296 |
|
297 |
-
if plot_type in {
|
298 |
fig_width = cell_width * n_cols - 2
|
299 |
fig_height = cell_height * n_rows + 2.0
|
300 |
-
dendrogram_ratio = (.1, .1)
|
301 |
row_cluster = False
|
302 |
|
303 |
-
if plot_type in {
|
304 |
fig_width = cell_width * n_cols - 2
|
305 |
fig_height = cell_height * n_rows + 5.2
|
306 |
-
dendrogram_ratio = (.1, .4)
|
307 |
|
308 |
-
print(
|
309 |
|
310 |
-
o_df.to_json(f
|
311 |
|
312 |
-
print(f
|
313 |
|
314 |
-
for cmap in [None,
|
315 |
-
fig = sns.clustermap(
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
|
|
|
|
326 |
|
327 |
# Adjust the size of the cells (less wide)
|
328 |
plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
|
329 |
plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
|
330 |
|
331 |
-
cmap_suffix =
|
332 |
|
333 |
# Save the clustermap to file
|
334 |
-
fig.savefig(f
|
335 |
-
fig.savefig(f
|
336 |
-
fig.savefig(f
|
|
|
77 |
return res
|
78 |
|
79 |
|
80 |
+
cache_file = "data_map_cache.pkl"
|
81 |
|
82 |
|
83 |
def load_data_map_from_cache(cache_file):
|
84 |
if os.path.exists(cache_file):
|
85 |
+
with open(cache_file, "rb") as f:
|
86 |
return pickle.load(f)
|
87 |
else:
|
88 |
return None
|
89 |
|
90 |
|
91 |
def save_data_map_to_cache(data_map, cache_file):
|
92 |
+
with open(cache_file, "wb") as f:
|
93 |
pickle.dump(data_map, f)
|
94 |
|
95 |
|
|
|
98 |
|
99 |
|
100 |
if data_map is None:
|
101 |
+
my_snapshot_download(
|
102 |
+
repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
103 |
+
)
|
104 |
+
my_snapshot_download(
|
105 |
+
repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
106 |
+
)
|
107 |
|
108 |
result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
|
109 |
request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
|
|
|
111 |
model_name_to_model_map = {}
|
112 |
|
113 |
for path in request_path_lst:
|
114 |
+
with open(path, "r") as f:
|
115 |
data = json.load(f)
|
116 |
model_name_to_model_map[data["model"]] = data
|
117 |
|
|
|
121 |
data_map = {}
|
122 |
|
123 |
for path in result_path_lst:
|
124 |
+
with open(path, "r") as f:
|
125 |
data = json.load(f)
|
126 |
model_name = data["config"]["model_name"]
|
127 |
for dataset_name, results_dict in data["results"].items():
|
|
|
131 |
|
132 |
to_add = True
|
133 |
|
134 |
+
if "f1" in metric_name:
|
135 |
to_add = False
|
136 |
|
137 |
+
if "stderr" in metric_name:
|
138 |
to_add = False
|
139 |
|
140 |
+
if "memo-trap_v2" in dataset_name:
|
141 |
to_add = False
|
142 |
|
143 |
+
if "faithdial" in dataset_name:
|
144 |
to_add = False
|
145 |
|
146 |
+
if "truthfulqa_gen" in dataset_name:
|
147 |
to_add = False
|
148 |
|
149 |
+
if "bertscore" in metric_name:
|
150 |
+
if "precision" not in metric_name:
|
151 |
to_add = False
|
152 |
|
153 |
+
if "halueval" in dataset_name:
|
154 |
+
if "acc" not in metric_name:
|
155 |
to_add = False
|
156 |
|
157 |
+
if "ifeval" in dataset_name:
|
158 |
+
if "prompt_level_strict_acc" not in metric_name:
|
159 |
to_add = False
|
160 |
|
161 |
+
if "squad" in dataset_name:
|
162 |
# to_add = False
|
163 |
+
if "best_exact" in metric_name:
|
164 |
to_add = False
|
165 |
|
166 |
+
if "fever" in dataset_name:
|
167 |
to_add = False
|
168 |
|
169 |
+
if ("xsum" in dataset_name or "cnn" in dataset_name) and "v2" not in dataset_name:
|
170 |
to_add = False
|
171 |
|
172 |
if isinstance(value, str):
|
|
|
176 |
to_add = False
|
177 |
|
178 |
if to_add:
|
179 |
+
if "rouge" in metric_name:
|
180 |
value /= 100.0
|
181 |
|
182 |
+
if "squad" in dataset_name:
|
183 |
value /= 100.0
|
184 |
|
185 |
sanitised_metric_name = metric_name
|
186 |
if "," in sanitised_metric_name:
|
187 |
+
sanitised_metric_name = sanitised_metric_name.split(",")[0]
|
188 |
sanitised_metric_name = sanitise_metric(sanitised_metric_name)
|
189 |
sanitised_dataset_name = sanitise_dataset(dataset_name)
|
190 |
|
191 |
+
model_dataset_metric_to_result_map[
|
192 |
+
(model_name, sanitised_dataset_name, sanitised_metric_name)
|
193 |
+
] = value
|
194 |
|
195 |
if model_name not in data_map:
|
196 |
data_map[model_name] = {}
|
197 |
data_map[model_name][(sanitised_dataset_name, sanitised_metric_name)] = value
|
198 |
|
199 |
+
print(
|
200 |
+
"model_name",
|
201 |
+
model_name,
|
202 |
+
"dataset_name",
|
203 |
+
sanitised_dataset_name,
|
204 |
+
"metric_name",
|
205 |
+
sanitised_metric_name,
|
206 |
+
"value",
|
207 |
+
value,
|
208 |
+
)
|
209 |
|
210 |
save_data_map_to_cache(data_map, cache_file)
|
211 |
|
|
|
217 |
if len(data_map[model_name]) < nb_max_metrics - 5:
|
218 |
del data_map[model_name]
|
219 |
|
220 |
+
plot_type_lst = ["all", "summ", "qa", "instr", "detect", "rc"]
|
221 |
|
222 |
for plot_type in plot_type_lst:
|
223 |
|
|
|
227 |
if dataset_metric not in data_map_v2:
|
228 |
data_map_v2[dataset_metric] = {}
|
229 |
|
230 |
+
if plot_type in {"all"}:
|
231 |
to_add = True
|
232 |
+
if "ROUGE" in dataset_metric[1] and "ROUGE-L" not in dataset_metric[1]:
|
233 |
to_add = False
|
234 |
+
if "SQuAD" in dataset_metric[0] and "EM" not in dataset_metric[1]:
|
235 |
to_add = False
|
236 |
+
if "SelfCheckGPT" in dataset_metric[0] and "MAX" not in dataset_metric[1]:
|
237 |
to_add = False
|
238 |
+
if "64-shot" in dataset_metric[0]:
|
239 |
to_add = False
|
240 |
if to_add is True:
|
241 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
242 |
+
elif plot_type in {"summ"}:
|
243 |
+
if "CNN" in dataset_metric[0] or "XSum" in dataset_metric[0]:
|
244 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
245 |
+
elif plot_type in {"qa"}:
|
246 |
+
if "TriviaQA" in dataset_metric[0] or "NQ" in dataset_metric[0] or "TruthfulQA" in dataset_metric[0]:
|
247 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
248 |
+
elif plot_type in {"instr"}:
|
249 |
+
if "MemoTrap" in dataset_metric[0] or "IFEval" in dataset_metric[0]:
|
250 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
251 |
+
elif plot_type in {"detect"}:
|
252 |
+
if "HaluEval" in dataset_metric[0] or "SelfCheck" in dataset_metric[0]:
|
253 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
254 |
+
elif plot_type in {"rc"}:
|
255 |
+
if "RACE" in dataset_metric[0] or "SQuAD" in dataset_metric[0]:
|
256 |
data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
|
257 |
else:
|
258 |
assert False, f"Unknown plot type: {plot_type}"
|
259 |
|
260 |
# df = pd.DataFrame.from_dict(data_map, orient='index') # Invert the y-axis (rows)
|
261 |
+
df = pd.DataFrame.from_dict(data_map_v2, orient="index") # Invert the y-axis (rows)
|
262 |
+
df.index = [", ".join(map(str, idx)) for idx in df.index]
|
263 |
|
264 |
o_df = df.copy(deep=True)
|
265 |
|
|
|
278 |
|
279 |
# Calculate dimensions based on the DataFrame size
|
280 |
cell_height = 1.0 # Height of each cell in inches
|
281 |
+
cell_width = 1.0 # Width of each cell in inches
|
282 |
|
283 |
n_rows = len(df.index) # Datasets and Metrics
|
284 |
n_cols = len(df.columns) # Models
|
|
|
292 |
|
293 |
sns.set_context("notebook", font_scale=1.3)
|
294 |
|
295 |
+
dendrogram_ratio = (0.1, 0.1)
|
296 |
|
297 |
+
if plot_type in {"detect"}:
|
298 |
fig_width = cell_width * n_cols - 2
|
299 |
fig_height = cell_height * n_rows + 5.2
|
300 |
+
dendrogram_ratio = (0.1, 0.2)
|
301 |
|
302 |
+
if plot_type in {"instr"}:
|
303 |
fig_width = cell_width * n_cols - 2
|
304 |
fig_height = cell_height * n_rows + 5.2
|
305 |
+
dendrogram_ratio = (0.1, 0.4)
|
306 |
|
307 |
+
if plot_type in {"qa"}:
|
308 |
fig_width = cell_width * n_cols - 2
|
309 |
fig_height = cell_height * n_rows + 4
|
310 |
+
dendrogram_ratio = (0.1, 0.2)
|
311 |
|
312 |
+
if plot_type in {"summ"}:
|
313 |
fig_width = cell_width * n_cols - 2
|
314 |
fig_height = cell_height * n_rows + 2.0
|
315 |
+
dendrogram_ratio = (0.1, 0.1)
|
316 |
row_cluster = False
|
317 |
|
318 |
+
if plot_type in {"rc"}:
|
319 |
fig_width = cell_width * n_cols - 2
|
320 |
fig_height = cell_height * n_rows + 5.2
|
321 |
+
dendrogram_ratio = (0.1, 0.4)
|
322 |
|
323 |
+
print("figsize", (fig_width, fig_height))
|
324 |
|
325 |
+
o_df.to_json(f"plots/clustermap_{plot_type}.json", orient="split")
|
326 |
|
327 |
+
print(f"Generating the clustermaps for {plot_type}")
|
328 |
|
329 |
+
for cmap in [None, "coolwarm", "viridis"]:
|
330 |
+
fig = sns.clustermap(
|
331 |
+
df,
|
332 |
+
method="ward",
|
333 |
+
metric="euclidean",
|
334 |
+
cmap=cmap,
|
335 |
+
figsize=(fig_width, fig_height), # figsize=(24, 16),
|
336 |
+
annot=True,
|
337 |
+
mask=o_df.isnull(),
|
338 |
+
dendrogram_ratio=dendrogram_ratio,
|
339 |
+
fmt=".2f",
|
340 |
+
col_cluster=col_cluster,
|
341 |
+
row_cluster=row_cluster,
|
342 |
+
)
|
343 |
|
344 |
# Adjust the size of the cells (less wide)
|
345 |
plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
|
346 |
plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
|
347 |
|
348 |
+
cmap_suffix = "" if cmap is None else f"_{cmap}"
|
349 |
|
350 |
# Save the clustermap to file
|
351 |
+
fig.savefig(f"blog/figures/clustermap_{plot_type}{cmap_suffix}.pdf")
|
352 |
+
fig.savefig(f"blog/figures/clustermap_{plot_type}{cmap_suffix}.png")
|
353 |
+
fig.savefig(f"blog/figures/clustermap_{plot_type}{cmap_suffix}_t.png", transparent=True, facecolor="none")
|
cli/averitec-upload-cli.py
CHANGED
@@ -2,11 +2,13 @@
|
|
2 |
|
3 |
from datasets import load_dataset
|
4 |
|
5 |
-
path =
|
6 |
|
7 |
-
ds = load_dataset(
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
12 |
ds.push_to_hub(path)
|
|
|
2 |
|
3 |
from datasets import load_dataset
|
4 |
|
5 |
+
path = "pminervini/averitec"
|
6 |
|
7 |
+
ds = load_dataset(
|
8 |
+
"json",
|
9 |
+
data_files={
|
10 |
+
"train": "/Users/pasquale/workspace/AVeriTeC/data/train.json",
|
11 |
+
"dev": "/Users/pasquale/workspace/AVeriTeC/data/dev.json",
|
12 |
+
},
|
13 |
+
)
|
14 |
ds.push_to_hub(path)
|
cli/beta-cli.py
CHANGED
@@ -14,8 +14,12 @@ from src.leaderboard.read_evals import get_raw_eval_results
|
|
14 |
from src.backend.manage_requests import EvalRequest
|
15 |
from src.leaderboard.read_evals import EvalResult
|
16 |
|
17 |
-
snapshot_download(
|
18 |
-
|
|
|
|
|
|
|
|
|
19 |
|
20 |
PENDING_STATUS = "PENDING"
|
21 |
RUNNING_STATUS = "RUNNING"
|
@@ -40,7 +44,9 @@ def request_to_result_name(request: EvalRequest) -> str:
|
|
40 |
|
41 |
|
42 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
43 |
-
eval_requests: list[EvalRequest] = get_eval_requests(
|
|
|
|
|
44 |
# Sort the evals by priority (first submitted first run)
|
45 |
eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
|
46 |
|
@@ -49,8 +55,8 @@ eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_RE
|
|
49 |
result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
|
50 |
result_name_to_result = {r.eval_name: r for r in eval_results}
|
51 |
|
52 |
-
print(
|
53 |
-
print(
|
54 |
|
55 |
for eval_request in eval_requests:
|
56 |
result_name: str = request_to_result_name(eval_request)
|
@@ -63,7 +69,7 @@ for eval_request in eval_requests:
|
|
63 |
task_name = task.benchmark
|
64 |
|
65 |
if task_name not in eval_result.results:
|
66 |
-
print(
|
67 |
|
68 |
raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
|
69 |
all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
|
|
|
14 |
from src.backend.manage_requests import EvalRequest
|
15 |
from src.leaderboard.read_evals import EvalResult
|
16 |
|
17 |
+
snapshot_download(
|
18 |
+
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
19 |
+
)
|
20 |
+
snapshot_download(
|
21 |
+
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
22 |
+
)
|
23 |
|
24 |
PENDING_STATUS = "PENDING"
|
25 |
RUNNING_STATUS = "RUNNING"
|
|
|
44 |
|
45 |
|
46 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
47 |
+
eval_requests: list[EvalRequest] = get_eval_requests(
|
48 |
+
job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
|
49 |
+
)
|
50 |
# Sort the evals by priority (first submitted first run)
|
51 |
eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
|
52 |
|
|
|
55 |
result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
|
56 |
result_name_to_result = {r.eval_name: r for r in eval_results}
|
57 |
|
58 |
+
print("Requests", sorted(result_name_to_request.keys()))
|
59 |
+
print("Results", sorted(result_name_to_result.keys()))
|
60 |
|
61 |
for eval_request in eval_requests:
|
62 |
result_name: str = request_to_result_name(eval_request)
|
|
|
69 |
task_name = task.benchmark
|
70 |
|
71 |
if task_name not in eval_result.results:
|
72 |
+
print("RUN THIS ONE!", result_name, task_name)
|
73 |
|
74 |
raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
|
75 |
all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
|
cli/completed-cli.py
CHANGED
@@ -26,8 +26,12 @@ FAILED_STATUS = "FAILED"
|
|
26 |
|
27 |
TASKS_HARNESS = [task.value for task in Tasks]
|
28 |
|
29 |
-
snapshot_download(
|
30 |
-
|
|
|
|
|
|
|
|
|
31 |
|
32 |
|
33 |
def request_to_result_name(request: EvalRequest) -> str:
|
@@ -48,9 +52,10 @@ def process_finished_requests() -> bool:
|
|
48 |
if False:
|
49 |
import os
|
50 |
import dateutil
|
|
|
51 |
model_result_filepaths = []
|
52 |
-
results_path = f
|
53 |
-
requests_path = f
|
54 |
|
55 |
for root, _, files in os.walk(results_path):
|
56 |
# We should only have json files in model results
|
@@ -72,7 +77,7 @@ def process_finished_requests() -> bool:
|
|
72 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
73 |
eval_result.update_with_request_file(requests_path)
|
74 |
|
75 |
-
print(
|
76 |
|
77 |
# Store results of same eval together
|
78 |
eval_name = eval_result.eval_name
|
@@ -86,7 +91,9 @@ def process_finished_requests() -> bool:
|
|
86 |
return True
|
87 |
|
88 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
89 |
-
eval_requests: list[EvalRequest] = get_eval_requests(
|
|
|
|
|
90 |
# Sort the evals by priority (first submitted first run)
|
91 |
eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
|
92 |
|
@@ -94,9 +101,11 @@ def process_finished_requests() -> bool:
|
|
94 |
# eval_requests = [r for r in eval_requests if 'neo-1.3B' in r.model]
|
95 |
|
96 |
import random
|
|
|
97 |
random.shuffle(eval_requests)
|
98 |
|
99 |
from src.leaderboard.read_evals import get_raw_eval_results
|
|
|
100 |
eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
|
101 |
|
102 |
result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
|
@@ -107,7 +116,10 @@ def process_finished_requests() -> bool:
|
|
107 |
|
108 |
# Check the corresponding result
|
109 |
from typing import Optional
|
110 |
-
|
|
|
|
|
|
|
111 |
|
112 |
# Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
|
113 |
for task in TASKS_HARNESS:
|
@@ -117,7 +129,7 @@ def process_finished_requests() -> bool:
|
|
117 |
eval_request: EvalRequest = result_name_to_request[result_name]
|
118 |
|
119 |
# print(eval_result)
|
120 |
-
print(result_name,
|
121 |
|
122 |
|
123 |
if __name__ == "__main__":
|
|
|
26 |
|
27 |
TASKS_HARNESS = [task.value for task in Tasks]
|
28 |
|
29 |
+
snapshot_download(
|
30 |
+
repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
31 |
+
)
|
32 |
+
snapshot_download(
|
33 |
+
repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
34 |
+
)
|
35 |
|
36 |
|
37 |
def request_to_result_name(request: EvalRequest) -> str:
|
|
|
52 |
if False:
|
53 |
import os
|
54 |
import dateutil
|
55 |
+
|
56 |
model_result_filepaths = []
|
57 |
+
results_path = f"{EVAL_RESULTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B"
|
58 |
+
requests_path = f"{EVAL_REQUESTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B_eval_request_False_False_False.json"
|
59 |
|
60 |
for root, _, files in os.walk(results_path):
|
61 |
# We should only have json files in model results
|
|
|
77 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
78 |
eval_result.update_with_request_file(requests_path)
|
79 |
|
80 |
+
print("XXX", eval_result)
|
81 |
|
82 |
# Store results of same eval together
|
83 |
eval_name = eval_result.eval_name
|
|
|
91 |
return True
|
92 |
|
93 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
94 |
+
eval_requests: list[EvalRequest] = get_eval_requests(
|
95 |
+
job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
|
96 |
+
)
|
97 |
# Sort the evals by priority (first submitted first run)
|
98 |
eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
|
99 |
|
|
|
101 |
# eval_requests = [r for r in eval_requests if 'neo-1.3B' in r.model]
|
102 |
|
103 |
import random
|
104 |
+
|
105 |
random.shuffle(eval_requests)
|
106 |
|
107 |
from src.leaderboard.read_evals import get_raw_eval_results
|
108 |
+
|
109 |
eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
|
110 |
|
111 |
result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
|
|
|
116 |
|
117 |
# Check the corresponding result
|
118 |
from typing import Optional
|
119 |
+
|
120 |
+
eval_result: Optional[EvalResult] = (
|
121 |
+
result_name_to_result[result_name] if result_name in result_name_to_result else None
|
122 |
+
)
|
123 |
|
124 |
# Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
|
125 |
for task in TASKS_HARNESS:
|
|
|
129 |
eval_request: EvalRequest = result_name_to_request[result_name]
|
130 |
|
131 |
# print(eval_result)
|
132 |
+
print(result_name, "is incomplete -- missing task:", task_name, eval_result, eval_request.likes)
|
133 |
|
134 |
|
135 |
if __name__ == "__main__":
|
cli/eval-cli.py
CHANGED
@@ -35,12 +35,11 @@ def main():
|
|
35 |
status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
|
36 |
|
37 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
38 |
-
eval_requests: list[EvalRequest] = get_eval_requests(
|
39 |
-
|
40 |
-
|
41 |
-
do_download=False)
|
42 |
# eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
|
43 |
-
eval_request = [r for r in eval_requests if
|
44 |
|
45 |
# my_task = Task("memo-trap", "acc", "memo-trap", 0)
|
46 |
# my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
|
@@ -56,6 +55,7 @@ def main():
|
|
56 |
|
57 |
eval_logger = utils.eval_logger
|
58 |
import logging
|
|
|
59 |
eval_logger.setLevel(getattr(logging, "DEBUG"))
|
60 |
|
61 |
TASKS_HARNESS = [my_task]
|
@@ -75,9 +75,19 @@ def main():
|
|
75 |
import torch
|
76 |
|
77 |
# breakpoint()
|
78 |
-
results = evaluator.simple_evaluate(
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
breakpoint()
|
83 |
|
|
|
35 |
status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
|
36 |
|
37 |
# Get all eval request that are FINISHED, if you want to run other evals, change this parameter
|
38 |
+
eval_requests: list[EvalRequest] = get_eval_requests(
|
39 |
+
job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND, do_download=False
|
40 |
+
)
|
|
|
41 |
# eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
|
42 |
+
eval_request = [r for r in eval_requests if "meta-llama/Llama-2-7b-hf" in r.model][0]
|
43 |
|
44 |
# my_task = Task("memo-trap", "acc", "memo-trap", 0)
|
45 |
# my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
|
|
|
55 |
|
56 |
eval_logger = utils.eval_logger
|
57 |
import logging
|
58 |
+
|
59 |
eval_logger.setLevel(getattr(logging, "DEBUG"))
|
60 |
|
61 |
TASKS_HARNESS = [my_task]
|
|
|
75 |
import torch
|
76 |
|
77 |
# breakpoint()
|
78 |
+
results = evaluator.simple_evaluate(
|
79 |
+
model="hf",
|
80 |
+
model_args=eval_request.get_model_args(),
|
81 |
+
tasks=[task.benchmark],
|
82 |
+
num_fewshot=task.num_fewshot,
|
83 |
+
batch_size=1,
|
84 |
+
device="mps",
|
85 |
+
use_cache=None,
|
86 |
+
limit=2,
|
87 |
+
write_out=True,
|
88 |
+
task_manager=task_manager,
|
89 |
+
)
|
90 |
+
print("AAA", results["results"])
|
91 |
|
92 |
breakpoint()
|
93 |
|
cli/fever-upload-cli.py
CHANGED
@@ -18,12 +18,9 @@ def convert(list_of_dicts):
|
|
18 |
|
19 |
|
20 |
v10 = load_dataset("fever", "v1.0")
|
21 |
-
name_lst = [
|
22 |
|
23 |
-
old_to_new_label_map = {
|
24 |
-
'SUPPORTS': 'supported',
|
25 |
-
'REFUTES': 'refuted'
|
26 |
-
}
|
27 |
|
28 |
data_map = {}
|
29 |
|
@@ -31,28 +28,28 @@ for name in name_lst:
|
|
31 |
instance_lst = []
|
32 |
|
33 |
for entry in tqdm(v10[name]):
|
34 |
-
id_ = entry[
|
35 |
-
label = entry[
|
36 |
-
claim = entry[
|
37 |
|
38 |
-
evidence_id = entry[
|
39 |
-
evidence_wiki_url = entry[
|
40 |
|
41 |
if evidence_id != -1:
|
42 |
-
assert label in {
|
43 |
|
44 |
-
instance = {
|
45 |
instance_lst.append(instance)
|
46 |
|
47 |
-
key =
|
48 |
|
49 |
-
instance_lst = sorted([dict(t) for t in {tuple(d.items()) for d in instance_lst}], key=lambda d: d[
|
50 |
|
51 |
label_to_instance_lst = {}
|
52 |
for e in instance_lst:
|
53 |
-
if e[
|
54 |
-
label_to_instance_lst[e[
|
55 |
-
label_to_instance_lst[e[
|
56 |
|
57 |
min_len = min(len(v) for k, v in label_to_instance_lst.items())
|
58 |
|
@@ -63,7 +60,7 @@ for name in name_lst:
|
|
63 |
random.Random(42).shuffle(new_instance_lst)
|
64 |
data_map[key] = new_instance_lst
|
65 |
|
66 |
-
ds_path =
|
67 |
|
68 |
task_to_ds_map = {k: Dataset.from_dict(convert(v)) for k, v in data_map.items()}
|
69 |
ds_dict = DatasetDict(task_to_ds_map)
|
|
|
18 |
|
19 |
|
20 |
v10 = load_dataset("fever", "v1.0")
|
21 |
+
name_lst = ["train", "labelled_dev"]
|
22 |
|
23 |
+
old_to_new_label_map = {"SUPPORTS": "supported", "REFUTES": "refuted"}
|
|
|
|
|
|
|
24 |
|
25 |
data_map = {}
|
26 |
|
|
|
28 |
instance_lst = []
|
29 |
|
30 |
for entry in tqdm(v10[name]):
|
31 |
+
id_ = entry["id"]
|
32 |
+
label = entry["label"]
|
33 |
+
claim = entry["claim"]
|
34 |
|
35 |
+
evidence_id = entry["evidence_id"]
|
36 |
+
evidence_wiki_url = entry["evidence_wiki_url"]
|
37 |
|
38 |
if evidence_id != -1:
|
39 |
+
assert label in {"SUPPORTS", "REFUTES"}
|
40 |
|
41 |
+
instance = {"id": id_, "label": old_to_new_label_map[label], "claim": claim}
|
42 |
instance_lst.append(instance)
|
43 |
|
44 |
+
key = "dev" if name in {"labelled_dev"} else name
|
45 |
|
46 |
+
instance_lst = sorted([dict(t) for t in {tuple(d.items()) for d in instance_lst}], key=lambda d: d["claim"])
|
47 |
|
48 |
label_to_instance_lst = {}
|
49 |
for e in instance_lst:
|
50 |
+
if e["label"] not in label_to_instance_lst:
|
51 |
+
label_to_instance_lst[e["label"]] = []
|
52 |
+
label_to_instance_lst[e["label"]].append(e)
|
53 |
|
54 |
min_len = min(len(v) for k, v in label_to_instance_lst.items())
|
55 |
|
|
|
60 |
random.Random(42).shuffle(new_instance_lst)
|
61 |
data_map[key] = new_instance_lst
|
62 |
|
63 |
+
ds_path = "pminervini/hl-fever"
|
64 |
|
65 |
task_to_ds_map = {k: Dataset.from_dict(convert(v)) for k, v in data_map.items()}
|
66 |
ds_dict = DatasetDict(task_to_ds_map)
|
cli/fix-requests-cli.py
CHANGED
@@ -10,12 +10,12 @@ from huggingface_hub import HfApi
|
|
10 |
def find_json_files(directory):
|
11 |
matches = []
|
12 |
for root, dirnames, filenames in os.walk(directory):
|
13 |
-
for filename in fnmatch.filter(filenames,
|
14 |
matches.append(os.path.join(root, filename))
|
15 |
return matches
|
16 |
|
17 |
|
18 |
-
directory_path =
|
19 |
json_files = find_json_files(directory_path)
|
20 |
|
21 |
api = HfApi()
|
@@ -26,29 +26,29 @@ model_lst = [m for m in model_lst]
|
|
26 |
id_to_model = {m.id: m for m in model_lst}
|
27 |
|
28 |
for path in json_files:
|
29 |
-
with open(path,
|
30 |
data = json.load(fr)
|
31 |
|
32 |
-
model_id = data[
|
33 |
if model_id in id_to_model:
|
34 |
model = id_to_model[model_id]
|
35 |
|
36 |
to_overwrite = False
|
37 |
|
38 |
-
is_finetuned = any(tag.startswith(
|
39 |
|
40 |
if is_finetuned:
|
41 |
data["model_type"] = "fine-tuned"
|
42 |
to_overwrite = True
|
43 |
|
44 |
-
is_instruction_tuned = (
|
45 |
if is_instruction_tuned:
|
46 |
data["model_type"] = "instruction-tuned"
|
47 |
to_overwrite = True
|
48 |
|
49 |
if to_overwrite is True:
|
50 |
-
with open(path,
|
51 |
json.dump(data, fw)
|
52 |
|
53 |
else:
|
54 |
-
print(f
|
|
|
10 |
def find_json_files(directory):
|
11 |
matches = []
|
12 |
for root, dirnames, filenames in os.walk(directory):
|
13 |
+
for filename in fnmatch.filter(filenames, "*.json"):
|
14 |
matches.append(os.path.join(root, filename))
|
15 |
return matches
|
16 |
|
17 |
|
18 |
+
directory_path = "/Users/pasquale/workspace/eval/requests"
|
19 |
json_files = find_json_files(directory_path)
|
20 |
|
21 |
api = HfApi()
|
|
|
26 |
id_to_model = {m.id: m for m in model_lst}
|
27 |
|
28 |
for path in json_files:
|
29 |
+
with open(path, "r") as fr:
|
30 |
data = json.load(fr)
|
31 |
|
32 |
+
model_id = data["model"]
|
33 |
if model_id in id_to_model:
|
34 |
model = id_to_model[model_id]
|
35 |
|
36 |
to_overwrite = False
|
37 |
|
38 |
+
is_finetuned = any(tag.startswith("base_model:") for tag in id_to_model[data["model"]].tags)
|
39 |
|
40 |
if is_finetuned:
|
41 |
data["model_type"] = "fine-tuned"
|
42 |
to_overwrite = True
|
43 |
|
44 |
+
is_instruction_tuned = ("nstruct" in model_id) or ("chat" in model_id)
|
45 |
if is_instruction_tuned:
|
46 |
data["model_type"] = "instruction-tuned"
|
47 |
to_overwrite = True
|
48 |
|
49 |
if to_overwrite is True:
|
50 |
+
with open(path, "w") as fw:
|
51 |
json.dump(data, fw)
|
52 |
|
53 |
else:
|
54 |
+
print(f"Model {model_id} not found")
|
cli/halueval-upload-cli.py
CHANGED
@@ -6,20 +6,20 @@ import requests
|
|
6 |
from datasets import load_dataset, Dataset, DatasetDict
|
7 |
|
8 |
|
9 |
-
path =
|
10 |
|
11 |
API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"
|
12 |
response = requests.get(API_URL)
|
13 |
res_json = response.json()
|
14 |
|
15 |
-
gold_splits = {
|
16 |
|
17 |
-
available_splits = {split[
|
18 |
|
19 |
name_to_ds = dict()
|
20 |
|
21 |
for name in gold_splits:
|
22 |
-
ds = load_dataset("json", data_files={
|
23 |
name_to_ds[name] = ds
|
24 |
# if name not in available_splits:
|
25 |
ds.push_to_hub(path, config_name=name)
|
@@ -35,38 +35,38 @@ def list_to_dict(lst: list) -> dict:
|
|
35 |
return res
|
36 |
|
37 |
|
38 |
-
for name in
|
39 |
random.seed(42)
|
40 |
ds = name_to_ds[name]
|
41 |
new_entry_lst = []
|
42 |
-
|
43 |
-
for entry in ds[
|
44 |
is_hallucinated = random.random() > 0.5
|
45 |
new_entry = None
|
46 |
-
if name in {
|
47 |
new_entry = {
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
}
|
53 |
-
if name in {
|
54 |
new_entry = {
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
}
|
60 |
-
if name in {
|
61 |
new_entry = {
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
}
|
66 |
assert new_entry is not None
|
67 |
new_entry_lst += [new_entry]
|
68 |
new_ds_map = list_to_dict(new_entry_lst)
|
69 |
new_ds = Dataset.from_dict(new_ds_map)
|
70 |
-
new_dsd = DatasetDict({
|
71 |
|
72 |
-
new_dsd.push_to_hub(path, config_name=f
|
|
|
6 |
from datasets import load_dataset, Dataset, DatasetDict
|
7 |
|
8 |
|
9 |
+
path = "pminervini/HaluEval"
|
10 |
|
11 |
API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"
|
12 |
response = requests.get(API_URL)
|
13 |
res_json = response.json()
|
14 |
|
15 |
+
gold_splits = {"dialogue", "qa", "summarization", "general"}
|
16 |
|
17 |
+
available_splits = {split["config"] for split in res_json["splits"]} if "splits" in res_json else set()
|
18 |
|
19 |
name_to_ds = dict()
|
20 |
|
21 |
for name in gold_splits:
|
22 |
+
ds = load_dataset("json", data_files={"data": f"data/{name}_data.json"})
|
23 |
name_to_ds[name] = ds
|
24 |
# if name not in available_splits:
|
25 |
ds.push_to_hub(path, config_name=name)
|
|
|
35 |
return res
|
36 |
|
37 |
|
38 |
+
for name in gold_splits - {"general"}:
|
39 |
random.seed(42)
|
40 |
ds = name_to_ds[name]
|
41 |
new_entry_lst = []
|
42 |
+
|
43 |
+
for entry in ds["data"]:
|
44 |
is_hallucinated = random.random() > 0.5
|
45 |
new_entry = None
|
46 |
+
if name in {"qa"}:
|
47 |
new_entry = {
|
48 |
+
"knowledge": entry["knowledge"],
|
49 |
+
"question": entry["question"],
|
50 |
+
"answer": entry[f'{"hallucinated" if is_hallucinated else "right"}_answer'],
|
51 |
+
"hallucination": "yes" if is_hallucinated else "no",
|
52 |
}
|
53 |
+
if name in {"dialogue"}:
|
54 |
new_entry = {
|
55 |
+
"knowledge": entry["knowledge"],
|
56 |
+
"dialogue_history": entry["dialogue_history"],
|
57 |
+
"response": entry[f'{"hallucinated" if is_hallucinated else "right"}_response'],
|
58 |
+
"hallucination": "yes" if is_hallucinated else "no",
|
59 |
}
|
60 |
+
if name in {"summarization"}:
|
61 |
new_entry = {
|
62 |
+
"document": entry["document"],
|
63 |
+
"summary": entry[f'{"hallucinated" if is_hallucinated else "right"}_summary'],
|
64 |
+
"hallucination": "yes" if is_hallucinated else "no",
|
65 |
}
|
66 |
assert new_entry is not None
|
67 |
new_entry_lst += [new_entry]
|
68 |
new_ds_map = list_to_dict(new_entry_lst)
|
69 |
new_ds = Dataset.from_dict(new_ds_map)
|
70 |
+
new_dsd = DatasetDict({"data": new_ds})
|
71 |
|
72 |
+
new_dsd.push_to_hub(path, config_name=f"{name}_samples")
|
cli/isp-upload-cli.py
CHANGED
@@ -5,16 +5,16 @@ import os
|
|
5 |
|
6 |
from datasets import load_dataset
|
7 |
|
8 |
-
folder_path =
|
9 |
|
10 |
# Search for all .json files in the folder
|
11 |
-
json_files = glob.glob(os.path.join(folder_path,
|
12 |
|
13 |
-
path =
|
14 |
|
15 |
for json_path in json_files:
|
16 |
base_name = os.path.basename(json_path)
|
17 |
name = base_name.split("_")[0]
|
18 |
|
19 |
-
ds = load_dataset("json", data_files={
|
20 |
ds.push_to_hub(path, config_name=name)
|
|
|
5 |
|
6 |
from datasets import load_dataset
|
7 |
|
8 |
+
folder_path = "isp-data-json/" # Replace with your folder path
|
9 |
|
10 |
# Search for all .json files in the folder
|
11 |
+
json_files = glob.glob(os.path.join(folder_path, "*.jsonl"))
|
12 |
|
13 |
+
path = "pminervini/inverse-scaling"
|
14 |
|
15 |
for json_path in json_files:
|
16 |
base_name = os.path.basename(json_path)
|
17 |
name = base_name.split("_")[0]
|
18 |
|
19 |
+
ds = load_dataset("json", data_files={"data": json_path})
|
20 |
ds.push_to_hub(path, config_name=name)
|
cli/nqswap-upload-cli.py
CHANGED
@@ -2,11 +2,7 @@
|
|
2 |
|
3 |
from datasets import load_dataset
|
4 |
|
5 |
-
path =
|
6 |
|
7 |
-
ds = load_dataset("json",
|
8 |
-
data_files={
|
9 |
-
'original': 'nqswap/original.jsonl',
|
10 |
-
'substituted': 'nqswap/substituted.jsonl'
|
11 |
-
})
|
12 |
ds.push_to_hub(path)
|
|
|
2 |
|
3 |
from datasets import load_dataset
|
4 |
|
5 |
+
path = "pminervini/NQ-Swap"
|
6 |
|
7 |
+
ds = load_dataset("json", data_files={"original": "nqswap/original.jsonl", "substituted": "nqswap/substituted.jsonl"})
|
|
|
|
|
|
|
|
|
8 |
ds.push_to_hub(path)
|
cli/shroom-upload-cli.py
CHANGED
@@ -4,9 +4,9 @@ import json
|
|
4 |
from datasets import Dataset, DatasetDict
|
5 |
|
6 |
file_path = "shroom-data/val.model-agnostic.json"
|
7 |
-
ds_path =
|
8 |
|
9 |
-
with open(file_path,
|
10 |
data = json.load(file)
|
11 |
|
12 |
|
@@ -15,7 +15,7 @@ def convert(list_of_dicts):
|
|
15 |
for d in list_of_dicts:
|
16 |
for key, value in d.items():
|
17 |
dict_of_lists.setdefault(key, []).append(value)
|
18 |
-
return
|
19 |
|
20 |
|
21 |
task_to_data_map = {}
|
|
|
4 |
from datasets import Dataset, DatasetDict
|
5 |
|
6 |
file_path = "shroom-data/val.model-agnostic.json"
|
7 |
+
ds_path = "pminervini/shroom"
|
8 |
|
9 |
+
with open(file_path, "r") as file:
|
10 |
data = json.load(file)
|
11 |
|
12 |
|
|
|
15 |
for d in list_of_dicts:
|
16 |
for key, value in d.items():
|
17 |
dict_of_lists.setdefault(key, []).append(value)
|
18 |
+
return dict_of_lists
|
19 |
|
20 |
|
21 |
task_to_data_map = {}
|
cli/submit-cli.py
CHANGED
@@ -15,7 +15,9 @@ from src.backend.manage_requests import get_eval_requests
|
|
15 |
from src.backend.manage_requests import EvalRequest
|
16 |
|
17 |
|
18 |
-
def add_new_eval(
|
|
|
|
|
19 |
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
20 |
|
21 |
user_name = ""
|
@@ -37,7 +39,9 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, pri
|
|
37 |
|
38 |
# Is the model on the hub?
|
39 |
if weight_type in ["Delta", "Adapter"]:
|
40 |
-
base_model_on_hub, error, _ = is_model_on_hub(
|
|
|
|
|
41 |
if not base_model_on_hub:
|
42 |
print(f'Base model "{base_model}" {error}')
|
43 |
return
|
@@ -57,7 +61,7 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, pri
|
|
57 |
|
58 |
model_size = get_model_size(model_info=model_info, precision=precision)
|
59 |
|
60 |
-
license =
|
61 |
try:
|
62 |
license = model_info.cardData["license"]
|
63 |
except Exception:
|
@@ -101,13 +105,20 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, pri
|
|
101 |
f.write(json.dumps(eval_entry))
|
102 |
|
103 |
print("Uploading eval file")
|
104 |
-
API.upload_file(
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
# Remove the local file
|
108 |
os.remove(out_path)
|
109 |
|
110 |
-
print(
|
|
|
|
|
111 |
return
|
112 |
|
113 |
|
@@ -122,12 +133,14 @@ def main():
|
|
122 |
def custom_filter(m) -> bool:
|
123 |
# res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False
|
124 |
# res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False and 'mistralai/' in m.id
|
125 |
-
res =
|
126 |
return res
|
127 |
|
128 |
filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
|
129 |
|
130 |
-
snapshot_download(
|
|
|
|
|
131 |
|
132 |
PENDING_STATUS = "PENDING"
|
133 |
RUNNING_STATUS = "RUNNING"
|
@@ -137,7 +150,9 @@ def main():
|
|
137 |
status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
|
138 |
|
139 |
# Get all eval requests
|
140 |
-
eval_requests: list[EvalRequest] = get_eval_requests(
|
|
|
|
|
141 |
|
142 |
requested_model_names = {e.model for e in eval_requests}
|
143 |
|
@@ -146,25 +161,33 @@ def main():
|
|
146 |
for i in range(min(200, len(filtered_model_lst))):
|
147 |
model = filtered_model_lst[i]
|
148 |
|
149 |
-
print(f
|
150 |
|
151 |
-
is_finetuned = any(tag.startswith(
|
152 |
|
153 |
-
model_type =
|
154 |
if is_finetuned:
|
155 |
model_type = "fine-tuned"
|
156 |
|
157 |
-
is_instruction_tuned =
|
158 |
if is_instruction_tuned:
|
159 |
model_type = "instruction-tuned"
|
160 |
|
161 |
if model.id not in requested_model_names:
|
162 |
|
163 |
-
if
|
164 |
-
add_new_eval(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
time.sleep(10)
|
166 |
else:
|
167 |
-
print(f
|
168 |
|
169 |
|
170 |
if __name__ == "__main__":
|
|
|
15 |
from src.backend.manage_requests import EvalRequest
|
16 |
|
17 |
|
18 |
+
def add_new_eval(
|
19 |
+
model: str, base_model: str, revision: str, precision: str, private: bool, weight_type: str, model_type: str
|
20 |
+
):
|
21 |
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
22 |
|
23 |
user_name = ""
|
|
|
39 |
|
40 |
# Is the model on the hub?
|
41 |
if weight_type in ["Delta", "Adapter"]:
|
42 |
+
base_model_on_hub, error, _ = is_model_on_hub(
|
43 |
+
model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True
|
44 |
+
)
|
45 |
if not base_model_on_hub:
|
46 |
print(f'Base model "{base_model}" {error}')
|
47 |
return
|
|
|
61 |
|
62 |
model_size = get_model_size(model_info=model_info, precision=precision)
|
63 |
|
64 |
+
license = "none"
|
65 |
try:
|
66 |
license = model_info.cardData["license"]
|
67 |
except Exception:
|
|
|
105 |
f.write(json.dumps(eval_entry))
|
106 |
|
107 |
print("Uploading eval file")
|
108 |
+
API.upload_file(
|
109 |
+
path_or_fileobj=out_path,
|
110 |
+
path_in_repo=out_path.split("eval-queue/")[1],
|
111 |
+
repo_id=QUEUE_REPO,
|
112 |
+
repo_type="dataset",
|
113 |
+
commit_message=f"Add {model} to eval queue",
|
114 |
+
)
|
115 |
|
116 |
# Remove the local file
|
117 |
os.remove(out_path)
|
118 |
|
119 |
+
print(
|
120 |
+
"Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
|
121 |
+
)
|
122 |
return
|
123 |
|
124 |
|
|
|
133 |
def custom_filter(m) -> bool:
|
134 |
# res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False
|
135 |
# res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False and 'mistralai/' in m.id
|
136 |
+
res = "mistralai/" in m.id
|
137 |
return res
|
138 |
|
139 |
filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
|
140 |
|
141 |
+
snapshot_download(
|
142 |
+
repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
|
143 |
+
)
|
144 |
|
145 |
PENDING_STATUS = "PENDING"
|
146 |
RUNNING_STATUS = "RUNNING"
|
|
|
150 |
status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
|
151 |
|
152 |
# Get all eval requests
|
153 |
+
eval_requests: list[EvalRequest] = get_eval_requests(
|
154 |
+
job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
|
155 |
+
)
|
156 |
|
157 |
requested_model_names = {e.model for e in eval_requests}
|
158 |
|
|
|
161 |
for i in range(min(200, len(filtered_model_lst))):
|
162 |
model = filtered_model_lst[i]
|
163 |
|
164 |
+
print(f"Considering {model.id} ..")
|
165 |
|
166 |
+
is_finetuned = any(tag.startswith("base_model:") for tag in model.tags)
|
167 |
|
168 |
+
model_type = "pretrained"
|
169 |
if is_finetuned:
|
170 |
model_type = "fine-tuned"
|
171 |
|
172 |
+
is_instruction_tuned = "nstruct" in model.id
|
173 |
if is_instruction_tuned:
|
174 |
model_type = "instruction-tuned"
|
175 |
|
176 |
if model.id not in requested_model_names:
|
177 |
|
178 |
+
if "mage" not in model.id:
|
179 |
+
add_new_eval(
|
180 |
+
model=model.id,
|
181 |
+
base_model="",
|
182 |
+
revision="main",
|
183 |
+
precision="float32",
|
184 |
+
private=False,
|
185 |
+
weight_type="Original",
|
186 |
+
model_type=model_type,
|
187 |
+
)
|
188 |
time.sleep(10)
|
189 |
else:
|
190 |
+
print(f"Model {model.id} already added, not adding it to the queue again.")
|
191 |
|
192 |
|
193 |
if __name__ == "__main__":
|
cli/sync-open-llm-cli.py
CHANGED
@@ -10,6 +10,7 @@ from src.envs import QUEUE_REPO, API
|
|
10 |
from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
|
11 |
from src.utils import my_snapshot_download
|
12 |
|
|
|
13 |
def my_set_eval_request(api, json_filepath, hf_repo, local_dir):
|
14 |
for i in range(10):
|
15 |
try:
|
@@ -29,8 +30,12 @@ def set_eval_request(api: HfApi, json_filepath: str, hf_repo: str, local_dir: st
|
|
29 |
with open(json_filepath, "w") as f:
|
30 |
f.write(json.dumps(data))
|
31 |
|
32 |
-
api.upload_file(
|
33 |
-
|
|
|
|
|
|
|
|
|
34 |
|
35 |
|
36 |
def get_request_file_for_model(data, requests_path):
|
@@ -54,6 +59,7 @@ def get_request_file_for_model(data, requests_path):
|
|
54 |
request_file = tmp_request_file
|
55 |
return request_file
|
56 |
|
|
|
57 |
def update_model_type(data, requests_path):
|
58 |
open_llm_request_file = get_request_file_for_model(data, requests_path)
|
59 |
|
@@ -71,21 +77,33 @@ def read_and_write_json_files(directory, requests_path_open_llm):
|
|
71 |
for subdir, dirs, files in tqdm(os.walk(directory), desc="updating model type according to open llm leaderboard"):
|
72 |
for file in files:
|
73 |
# Check if the file is a JSON file
|
74 |
-
if file.endswith(
|
75 |
file_path = os.path.join(subdir, file)
|
76 |
# Open and read the JSON file
|
77 |
-
with open(file_path,
|
78 |
data = json.load(json_file)
|
79 |
sucess, data = update_model_type(data, requests_path_open_llm)
|
80 |
if sucess:
|
81 |
-
with open(file_path,
|
82 |
json.dump(data, json_file)
|
83 |
-
my_set_eval_request(
|
84 |
-
|
85 |
-
|
86 |
|
87 |
|
88 |
if __name__ == "__main__":
|
89 |
-
my_snapshot_download(
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
|
11 |
from src.utils import my_snapshot_download
|
12 |
|
13 |
+
|
14 |
def my_set_eval_request(api, json_filepath, hf_repo, local_dir):
|
15 |
for i in range(10):
|
16 |
try:
|
|
|
30 |
with open(json_filepath, "w") as f:
|
31 |
f.write(json.dumps(data))
|
32 |
|
33 |
+
api.upload_file(
|
34 |
+
path_or_fileobj=json_filepath,
|
35 |
+
path_in_repo=json_filepath.replace(local_dir, ""),
|
36 |
+
repo_id=hf_repo,
|
37 |
+
repo_type="dataset",
|
38 |
+
)
|
39 |
|
40 |
|
41 |
def get_request_file_for_model(data, requests_path):
|
|
|
59 |
request_file = tmp_request_file
|
60 |
return request_file
|
61 |
|
62 |
+
|
63 |
def update_model_type(data, requests_path):
|
64 |
open_llm_request_file = get_request_file_for_model(data, requests_path)
|
65 |
|
|
|
77 |
for subdir, dirs, files in tqdm(os.walk(directory), desc="updating model type according to open llm leaderboard"):
|
78 |
for file in files:
|
79 |
# Check if the file is a JSON file
|
80 |
+
if file.endswith(".json"):
|
81 |
file_path = os.path.join(subdir, file)
|
82 |
# Open and read the JSON file
|
83 |
+
with open(file_path, "r") as json_file:
|
84 |
data = json.load(json_file)
|
85 |
sucess, data = update_model_type(data, requests_path_open_llm)
|
86 |
if sucess:
|
87 |
+
with open(file_path, "w") as json_file:
|
88 |
json.dump(data, json_file)
|
89 |
+
my_set_eval_request(
|
90 |
+
api=API, json_filepath=file_path, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC
|
91 |
+
)
|
92 |
|
93 |
|
94 |
if __name__ == "__main__":
|
95 |
+
my_snapshot_download(
|
96 |
+
repo_id=QUEUE_REPO_OPEN_LLM,
|
97 |
+
revision="main",
|
98 |
+
local_dir=EVAL_REQUESTS_PATH_OPEN_LLM,
|
99 |
+
repo_type="dataset",
|
100 |
+
max_workers=60,
|
101 |
+
)
|
102 |
+
my_snapshot_download(
|
103 |
+
repo_id=QUEUE_REPO,
|
104 |
+
revision="main",
|
105 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC,
|
106 |
+
repo_type="dataset",
|
107 |
+
max_workers=60,
|
108 |
+
)
|
109 |
+
read_and_write_json_files(EVAL_REQUESTS_PATH_BACKEND_SYNC, EVAL_REQUESTS_PATH_OPEN_LLM)
|
cli/truefalse-upload-cli.py
CHANGED
@@ -5,11 +5,11 @@ import os
|
|
5 |
|
6 |
from datasets import load_dataset
|
7 |
|
8 |
-
path =
|
9 |
-
folder_path =
|
10 |
|
11 |
# Search for all .json files in the folder
|
12 |
-
csv_files = glob.glob(os.path.join(folder_path,
|
13 |
|
14 |
ds = load_dataset("csv", data_files={os.path.basename(path).split("_")[0]: path for path in csv_files})
|
15 |
ds.push_to_hub(path)
|
|
|
5 |
|
6 |
from datasets import load_dataset
|
7 |
|
8 |
+
path = "pminervini/true-false"
|
9 |
+
folder_path = "true-false-data/" # Replace with your folder path
|
10 |
|
11 |
# Search for all .json files in the folder
|
12 |
+
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
|
13 |
|
14 |
ds = load_dataset("csv", data_files={os.path.basename(path).split("_")[0]: path for path in csv_files})
|
15 |
ds.push_to_hub(path)
|
src/backend/envs.py
CHANGED
@@ -63,6 +63,6 @@ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
|
63 |
EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
|
64 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
65 |
|
66 |
-
DEVICE = "cuda" if torch.cuda.is_available() else
|
67 |
|
68 |
LIMIT = None # Testing; needs to be None
|
|
|
63 |
EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
|
64 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
65 |
|
66 |
+
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
67 |
|
68 |
LIMIT = None # Testing; needs to be None
|
src/backend/huggingface_generate_until.py
CHANGED
@@ -5,7 +5,8 @@ import transformers
|
|
5 |
from lm_eval.models.huggingface import HFLM
|
6 |
from lm_eval.api.registry import register_model
|
7 |
|
8 |
-
|
|
|
9 |
class HFLMwithChatTemplate(HFLM):
|
10 |
def __init__(self, use_chat_template=True, **kwargs):
|
11 |
super().__init__(**kwargs)
|
@@ -49,9 +50,7 @@ class HFLMwithChatTemplate(HFLM):
|
|
49 |
)
|
50 |
if left_truncate_len:
|
51 |
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
|
52 |
-
encoding["attention_mask"] = encoding["attention_mask"][
|
53 |
-
:, -left_truncate_len:
|
54 |
-
]
|
55 |
self.tokenizer.padding_side = old_padding_side
|
56 |
|
57 |
-
return encoding["input_ids"], encoding["attention_mask"]
|
|
|
5 |
from lm_eval.models.huggingface import HFLM
|
6 |
from lm_eval.api.registry import register_model
|
7 |
|
8 |
+
|
9 |
+
@register_model("hf-chat")
|
10 |
class HFLMwithChatTemplate(HFLM):
|
11 |
def __init__(self, use_chat_template=True, **kwargs):
|
12 |
super().__init__(**kwargs)
|
|
|
50 |
)
|
51 |
if left_truncate_len:
|
52 |
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
|
53 |
+
encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
|
|
|
|
|
54 |
self.tokenizer.padding_side = old_padding_side
|
55 |
|
56 |
+
return encoding["input_ids"], encoding["attention_mask"]
|
src/backend/manage_requests.py
CHANGED
@@ -17,24 +17,27 @@ class EvalRequest:
|
|
17 |
weight_type: str = "Original"
|
18 |
model_type: str = "" # pretrained, finetuned, with RL
|
19 |
precision: str = "" # float16, bfloat16
|
20 |
-
base_model: Optional[str] = None
|
21 |
-
revision: str = "main"
|
22 |
-
submitted_time: Optional[str] =
|
|
|
|
|
23 |
model_type: Optional[str] = None
|
24 |
likes: Optional[int] = 0
|
25 |
params: Optional[int] = None
|
26 |
license: Optional[str] = ""
|
|
|
27 |
def get_model_args(self) -> str:
|
28 |
-
model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"
|
29 |
|
30 |
if self.precision in ["float16", "float32", "bfloat16"]:
|
31 |
model_args += f",dtype={self.precision}"
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
# A GPTQ model does not need dtype to be specified,
|
39 |
# it will be inferred from the config
|
40 |
pass
|
@@ -55,8 +58,12 @@ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
|
|
55 |
with open(json_filepath, "w") as f:
|
56 |
f.write(json.dumps(data))
|
57 |
|
58 |
-
api.upload_file(
|
59 |
-
|
|
|
|
|
|
|
|
|
60 |
|
61 |
|
62 |
def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_download: bool = True) -> list[EvalRequest]:
|
@@ -68,7 +75,9 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_downloa
|
|
68 |
`list[EvalRequest]`: a list of model info dicts.
|
69 |
"""
|
70 |
if do_download:
|
71 |
-
my_snapshot_download(
|
|
|
|
|
72 |
|
73 |
json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
|
74 |
|
@@ -81,8 +90,8 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_downloa
|
|
81 |
# breakpoint()
|
82 |
data["json_filepath"] = json_filepath
|
83 |
|
84 |
-
if
|
85 |
-
del data[
|
86 |
|
87 |
eval_request = EvalRequest(**data)
|
88 |
eval_requests.append(eval_request)
|
@@ -90,10 +99,20 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_downloa
|
|
90 |
return eval_requests
|
91 |
|
92 |
|
93 |
-
def check_completed_evals(
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
"""Checks if the currently running evals are completed, if yes, update their status on the hub."""
|
96 |
-
my_snapshot_download(
|
|
|
|
|
97 |
|
98 |
running_evals = get_eval_requests([checked_status], hf_repo=hf_repo, local_dir=local_dir)
|
99 |
|
@@ -109,5 +128,3 @@ def check_completed_evals(api: HfApi, hf_repo: str, local_dir: str, checked_stat
|
|
109 |
if output_file_exists:
|
110 |
print(f"EXISTS output file exists for {model} setting it to {completed_status}")
|
111 |
set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
|
112 |
-
|
113 |
-
|
|
|
17 |
weight_type: str = "Original"
|
18 |
model_type: str = "" # pretrained, finetuned, with RL
|
19 |
precision: str = "" # float16, bfloat16
|
20 |
+
base_model: Optional[str] = None # for adapter models
|
21 |
+
revision: str = "main" # commit
|
22 |
+
submitted_time: Optional[str] = (
|
23 |
+
"2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
|
24 |
+
)
|
25 |
model_type: Optional[str] = None
|
26 |
likes: Optional[int] = 0
|
27 |
params: Optional[int] = None
|
28 |
license: Optional[str] = ""
|
29 |
+
|
30 |
def get_model_args(self) -> str:
|
31 |
+
model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
|
32 |
|
33 |
if self.precision in ["float16", "float32", "bfloat16"]:
|
34 |
model_args += f",dtype={self.precision}"
|
35 |
+
# Quantized models need some added config, the install of bits and bytes, etc
|
36 |
+
# elif self.precision == "8bit":
|
37 |
+
# model_args += ",load_in_8bit=True"
|
38 |
+
# elif self.precision == "4bit":
|
39 |
+
# model_args += ",load_in_4bit=True"
|
40 |
+
# elif self.precision == "GPTQ":
|
41 |
# A GPTQ model does not need dtype to be specified,
|
42 |
# it will be inferred from the config
|
43 |
pass
|
|
|
58 |
with open(json_filepath, "w") as f:
|
59 |
f.write(json.dumps(data))
|
60 |
|
61 |
+
api.upload_file(
|
62 |
+
path_or_fileobj=json_filepath,
|
63 |
+
path_in_repo=json_filepath.replace(local_dir, ""),
|
64 |
+
repo_id=hf_repo,
|
65 |
+
repo_type="dataset",
|
66 |
+
)
|
67 |
|
68 |
|
69 |
def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_download: bool = True) -> list[EvalRequest]:
|
|
|
75 |
`list[EvalRequest]`: a list of model info dicts.
|
76 |
"""
|
77 |
if do_download:
|
78 |
+
my_snapshot_download(
|
79 |
+
repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60
|
80 |
+
)
|
81 |
|
82 |
json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
|
83 |
|
|
|
90 |
# breakpoint()
|
91 |
data["json_filepath"] = json_filepath
|
92 |
|
93 |
+
if "job_id" in data:
|
94 |
+
del data["job_id"]
|
95 |
|
96 |
eval_request = EvalRequest(**data)
|
97 |
eval_requests.append(eval_request)
|
|
|
99 |
return eval_requests
|
100 |
|
101 |
|
102 |
+
def check_completed_evals(
|
103 |
+
api: HfApi,
|
104 |
+
hf_repo: str,
|
105 |
+
local_dir: str,
|
106 |
+
checked_status: str,
|
107 |
+
completed_status: str,
|
108 |
+
failed_status: str,
|
109 |
+
hf_repo_results: str,
|
110 |
+
local_dir_results: str,
|
111 |
+
):
|
112 |
"""Checks if the currently running evals are completed, if yes, update their status on the hub."""
|
113 |
+
my_snapshot_download(
|
114 |
+
repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60
|
115 |
+
)
|
116 |
|
117 |
running_evals = get_eval_requests([checked_status], hf_repo=hf_repo, local_dir=local_dir)
|
118 |
|
|
|
128 |
if output_file_exists:
|
129 |
print(f"EXISTS output file exists for {model} setting it to {completed_status}")
|
130 |
set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
|
|
|
|
src/backend/moe_infinity.py
CHANGED
@@ -8,17 +8,18 @@ from typing import List, Tuple, Optional, Union
|
|
8 |
from lm_eval.models.huggingface import HFLM
|
9 |
from lm_eval.api.registry import register_model
|
10 |
|
11 |
-
|
|
|
12 |
class MoEHFLM(HFLM):
|
13 |
def __init__(
|
14 |
self,
|
15 |
pretrained: str = "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
16 |
moe_config: dict = None,
|
17 |
-
offload_path
|
18 |
-
device_memory_ratio
|
19 |
use_chat_template=True,
|
20 |
*args,
|
21 |
-
**kwargs
|
22 |
):
|
23 |
# Initialize parent class without calling _create_model in the parent's __init__
|
24 |
self.checkpoint = pretrained
|
@@ -28,7 +29,9 @@ class MoEHFLM(HFLM):
|
|
28 |
self.use_chat_template = use_chat_template
|
29 |
if "device" in kwargs:
|
30 |
kwargs.pop("device")
|
31 |
-
super().__init__(
|
|
|
|
|
32 |
# self._create_model()
|
33 |
|
34 |
def _create_model(self, *args, **kwargs):
|
@@ -43,7 +46,9 @@ class MoEHFLM(HFLM):
|
|
43 |
# Update default config with any user-provided config
|
44 |
final_moe_config = {**default_moe_config, **self.moe_config}
|
45 |
# self._model = MoE(self.checkpoint, final_moe_config)
|
46 |
-
self._model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
47 |
|
48 |
@property
|
49 |
def max_length(self):
|
@@ -94,9 +99,7 @@ class MoEHFLM(HFLM):
|
|
94 |
)
|
95 |
if left_truncate_len:
|
96 |
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
|
97 |
-
encoding["attention_mask"] = encoding["attention_mask"][
|
98 |
-
:, -left_truncate_len:
|
99 |
-
]
|
100 |
self.tokenizer.padding_side = old_padding_side
|
101 |
|
102 |
return encoding["input_ids"], encoding["attention_mask"]
|
|
|
8 |
from lm_eval.models.huggingface import HFLM
|
9 |
from lm_eval.api.registry import register_model
|
10 |
|
11 |
+
|
12 |
+
@register_model("moe-infinity")
|
13 |
class MoEHFLM(HFLM):
|
14 |
def __init__(
|
15 |
self,
|
16 |
pretrained: str = "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
17 |
moe_config: dict = None,
|
18 |
+
offload_path=os.path.expanduser("~"),
|
19 |
+
device_memory_ratio=0.75,
|
20 |
use_chat_template=True,
|
21 |
*args,
|
22 |
+
**kwargs,
|
23 |
):
|
24 |
# Initialize parent class without calling _create_model in the parent's __init__
|
25 |
self.checkpoint = pretrained
|
|
|
29 |
self.use_chat_template = use_chat_template
|
30 |
if "device" in kwargs:
|
31 |
kwargs.pop("device")
|
32 |
+
super().__init__(
|
33 |
+
*args, **kwargs, pretrained=pretrained, device_map="cuda:0"
|
34 |
+
) # Assuming HFLM accepts a 'pretrained' arg and handles it
|
35 |
# self._create_model()
|
36 |
|
37 |
def _create_model(self, *args, **kwargs):
|
|
|
46 |
# Update default config with any user-provided config
|
47 |
final_moe_config = {**default_moe_config, **self.moe_config}
|
48 |
# self._model = MoE(self.checkpoint, final_moe_config)
|
49 |
+
self._model = AutoModelForCausalLM.from_pretrained(
|
50 |
+
self.checkpoint, torch_dtype=torch.float16, device_map="auto"
|
51 |
+
)
|
52 |
|
53 |
@property
|
54 |
def max_length(self):
|
|
|
99 |
)
|
100 |
if left_truncate_len:
|
101 |
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
|
102 |
+
encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
|
|
|
|
|
103 |
self.tokenizer.padding_side = old_padding_side
|
104 |
|
105 |
return encoding["input_ids"], encoding["attention_mask"]
|
src/backend/run_eval_suite.py
CHANGED
@@ -14,7 +14,17 @@ from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
|
|
14 |
from src.backend.huggingface_generate_until import HFLMwithChatTemplate
|
15 |
from src.backend.moe_infinity import MoEHFLM
|
16 |
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
if limit:
|
19 |
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
|
20 |
|
@@ -33,30 +43,34 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
|
|
33 |
|
34 |
print(f"Selected Tasks: {task_names}")
|
35 |
print(f"Eval Request: {eval_request.get_model_args()}")
|
36 |
-
print(
|
|
|
|
|
37 |
# hf-chat is implemented to use apply_chat_template
|
38 |
-
results = evaluator.simple_evaluate(
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
49 |
|
50 |
results["config"]["model_dtype"] = eval_request.precision
|
51 |
results["config"]["model_name"] = eval_request.model
|
52 |
results["config"]["model_sha"] = eval_request.revision
|
53 |
|
54 |
if max_nb_samples is not None:
|
55 |
-
if
|
56 |
-
samples = results[
|
57 |
for task_name in samples.keys():
|
58 |
if len(samples[task_name]) > max_nb_samples:
|
59 |
-
results[
|
60 |
|
61 |
# print(evaluator.make_table(results))
|
62 |
|
|
|
14 |
from src.backend.huggingface_generate_until import HFLMwithChatTemplate
|
15 |
from src.backend.moe_infinity import MoEHFLM
|
16 |
|
17 |
+
|
18 |
+
def run_evaluation(
|
19 |
+
eval_request: EvalRequest,
|
20 |
+
task_names,
|
21 |
+
num_fewshot,
|
22 |
+
batch_size,
|
23 |
+
device,
|
24 |
+
use_cache=None,
|
25 |
+
limit=None,
|
26 |
+
max_nb_samples=100,
|
27 |
+
) -> dict:
|
28 |
if limit:
|
29 |
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
|
30 |
|
|
|
43 |
|
44 |
print(f"Selected Tasks: {task_names}")
|
45 |
print(f"Eval Request: {eval_request.get_model_args()}")
|
46 |
+
print(
|
47 |
+
f"Num Fewshot: {num_fewshot}, Batch Size: {batch_size}, Device: {device}, Use Cache: {use_cache}, Limit: {limit}"
|
48 |
+
)
|
49 |
# hf-chat is implemented to use apply_chat_template
|
50 |
+
results = evaluator.simple_evaluate(
|
51 |
+
model="moe-infinity", # "hf-causal-experimental", # "hf-causal", hf-chat
|
52 |
+
model_args=eval_request.get_model_args(),
|
53 |
+
tasks=task_names,
|
54 |
+
num_fewshot=num_fewshot,
|
55 |
+
batch_size=batch_size,
|
56 |
+
max_batch_size=8,
|
57 |
+
device=device,
|
58 |
+
use_cache=use_cache,
|
59 |
+
limit=limit,
|
60 |
+
write_out=True,
|
61 |
+
task_manager=task_manager,
|
62 |
+
)
|
63 |
|
64 |
results["config"]["model_dtype"] = eval_request.precision
|
65 |
results["config"]["model_name"] = eval_request.model
|
66 |
results["config"]["model_sha"] = eval_request.revision
|
67 |
|
68 |
if max_nb_samples is not None:
|
69 |
+
if "samples" in results:
|
70 |
+
samples = results["samples"]
|
71 |
for task_name in samples.keys():
|
72 |
if len(samples[task_name]) > max_nb_samples:
|
73 |
+
results["samples"][task_name] = results["samples"][task_name][:max_nb_samples]
|
74 |
|
75 |
# print(evaluator.make_table(results))
|
76 |
|
src/backend/tasks/cnndm/task.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from lm_eval.api.task import ConfigurableTask
|
2 |
from lm_eval.api.instance import Instance
|
|
|
3 |
# from lm_eval.api.registry import register_task
|
4 |
from lm_eval.api.metrics import mean
|
5 |
|
@@ -66,7 +67,7 @@ class CNNDM(ConfigurableTask):
|
|
66 |
DATASET_NAME = "3.0.0"
|
67 |
|
68 |
def __init__(self):
|
69 |
-
super().__init__(config={
|
70 |
self.factkb_tokenizer = None
|
71 |
self.factkb_model = None
|
72 |
self.bert_score = None
|
@@ -74,12 +75,18 @@ class CNNDM(ConfigurableTask):
|
|
74 |
def maybe_init_factkb(self):
|
75 |
if self.factkb_tokenizer is None or self.factkb_model is None:
|
76 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
77 |
-
|
78 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
def maybe_init_bertscore(self):
|
81 |
if self.bert_score is None:
|
82 |
from evaluate import load
|
|
|
83 |
self.bert_score = load("bertscore")
|
84 |
|
85 |
def has_training_docs(self):
|
@@ -125,15 +132,7 @@ class CNNDM(ConfigurableTask):
|
|
125 |
part of the document for `doc`.
|
126 |
"""
|
127 |
|
128 |
-
return [
|
129 |
-
Instance(
|
130 |
-
request_type="generate_until",
|
131 |
-
doc=doc,
|
132 |
-
arguments=(ctx, {"until": ["\n"]}),
|
133 |
-
idx=0,
|
134 |
-
**kwargs
|
135 |
-
)
|
136 |
-
]
|
137 |
|
138 |
def process_results(self, doc, results):
|
139 |
completion = results[0]
|
@@ -157,12 +156,16 @@ class CNNDM(ConfigurableTask):
|
|
157 |
|
158 |
self.maybe_init_factkb()
|
159 |
input_factkb = [[completion, document]]
|
160 |
-
factkb_tokens = self.factkb_tokenizer(
|
|
|
|
|
161 |
factkb_logits = self.factkb_model(**factkb_tokens).logits
|
162 |
factkb_res = torch.softmax(factkb_logits, dim=1)
|
163 |
|
164 |
self.maybe_init_bertscore()
|
165 |
-
bert_score_res = self.bert_score.compute(
|
|
|
|
|
166 |
|
167 |
res = {
|
168 |
"rouge1": rouge1_scores[0],
|
@@ -171,7 +174,7 @@ class CNNDM(ConfigurableTask):
|
|
171 |
"factKB": float(factkb_res[0][1]),
|
172 |
"bertscore_precision": float(bert_score_res["precision"][0]),
|
173 |
"bertscore_recall": float(bert_score_res["recall"][0]),
|
174 |
-
"bertscore_f1": float(bert_score_res["f1"][0])
|
175 |
}
|
176 |
|
177 |
return res
|
@@ -182,7 +185,18 @@ class CNNDM(ConfigurableTask):
|
|
182 |
A dictionary where keys are the names of submetrics and values are
|
183 |
functions that aggregate a list of metrics
|
184 |
"""
|
185 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
def higher_is_better(self):
|
188 |
"""
|
@@ -190,5 +204,15 @@ class CNNDM(ConfigurableTask):
|
|
190 |
A dictionary where keys are the names of submetrics and values are
|
191 |
whether a higher value of the submetric is better
|
192 |
"""
|
193 |
-
return {
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from lm_eval.api.task import ConfigurableTask
|
2 |
from lm_eval.api.instance import Instance
|
3 |
+
|
4 |
# from lm_eval.api.registry import register_task
|
5 |
from lm_eval.api.metrics import mean
|
6 |
|
|
|
67 |
DATASET_NAME = "3.0.0"
|
68 |
|
69 |
def __init__(self):
|
70 |
+
super().__init__(config={"metadata": {"version": self.VERSION}})
|
71 |
self.factkb_tokenizer = None
|
72 |
self.factkb_model = None
|
73 |
self.bert_score = None
|
|
|
75 |
def maybe_init_factkb(self):
|
76 |
if self.factkb_tokenizer is None or self.factkb_model is None:
|
77 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
78 |
+
|
79 |
+
self.factkb_tokenizer = AutoTokenizer.from_pretrained(
|
80 |
+
"roberta-base", padding="max_length", truncation=True
|
81 |
+
)
|
82 |
+
self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
|
83 |
+
"bunsenfeng/FactKB", num_labels=2, device_map="auto"
|
84 |
+
)
|
85 |
|
86 |
def maybe_init_bertscore(self):
|
87 |
if self.bert_score is None:
|
88 |
from evaluate import load
|
89 |
+
|
90 |
self.bert_score = load("bertscore")
|
91 |
|
92 |
def has_training_docs(self):
|
|
|
132 |
part of the document for `doc`.
|
133 |
"""
|
134 |
|
135 |
+
return [Instance(request_type="generate_until", doc=doc, arguments=(ctx, {"until": ["\n"]}), idx=0, **kwargs)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
def process_results(self, doc, results):
|
138 |
completion = results[0]
|
|
|
156 |
|
157 |
self.maybe_init_factkb()
|
158 |
input_factkb = [[completion, document]]
|
159 |
+
factkb_tokens = self.factkb_tokenizer(
|
160 |
+
input_factkb, return_tensors="pt", padding="max_length", truncation=True
|
161 |
+
).to(self.factkb_model.device)
|
162 |
factkb_logits = self.factkb_model(**factkb_tokens).logits
|
163 |
factkb_res = torch.softmax(factkb_logits, dim=1)
|
164 |
|
165 |
self.maybe_init_bertscore()
|
166 |
+
bert_score_res = self.bert_score.compute(
|
167 |
+
predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
|
168 |
+
)
|
169 |
|
170 |
res = {
|
171 |
"rouge1": rouge1_scores[0],
|
|
|
174 |
"factKB": float(factkb_res[0][1]),
|
175 |
"bertscore_precision": float(bert_score_res["precision"][0]),
|
176 |
"bertscore_recall": float(bert_score_res["recall"][0]),
|
177 |
+
"bertscore_f1": float(bert_score_res["f1"][0]),
|
178 |
}
|
179 |
|
180 |
return res
|
|
|
185 |
A dictionary where keys are the names of submetrics and values are
|
186 |
functions that aggregate a list of metrics
|
187 |
"""
|
188 |
+
return {
|
189 |
+
k: mean
|
190 |
+
for k in [
|
191 |
+
"rouge1",
|
192 |
+
"rouge2",
|
193 |
+
"rougeL",
|
194 |
+
"factKB",
|
195 |
+
"bertscore_precision",
|
196 |
+
"bertscore_recall",
|
197 |
+
"bertscore_f1",
|
198 |
+
]
|
199 |
+
}
|
200 |
|
201 |
def higher_is_better(self):
|
202 |
"""
|
|
|
204 |
A dictionary where keys are the names of submetrics and values are
|
205 |
whether a higher value of the submetric is better
|
206 |
"""
|
207 |
+
return {
|
208 |
+
k: True
|
209 |
+
for k in [
|
210 |
+
"rouge1",
|
211 |
+
"rouge2",
|
212 |
+
"rougeL",
|
213 |
+
"factKB",
|
214 |
+
"bertscore_precision",
|
215 |
+
"bertscore_recall",
|
216 |
+
"bertscore_f1",
|
217 |
+
]
|
218 |
+
}
|
src/backend/tasks/cnndm/task_v2.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from lm_eval.api.task import ConfigurableTask
|
2 |
from lm_eval.api.instance import Instance
|
|
|
3 |
# from lm_eval.api.registry import register_task
|
4 |
from lm_eval.api.metrics import mean
|
5 |
|
@@ -66,8 +67,12 @@ class CNNDMv2(ConfigurableTask):
|
|
66 |
DATASET_NAME = "3.0.0"
|
67 |
|
68 |
def __init__(self):
|
69 |
-
super().__init__(
|
70 |
-
|
|
|
|
|
|
|
|
|
71 |
self.factkb_tokenizer = None
|
72 |
self.factkb_model = None
|
73 |
self.bert_score = None
|
@@ -75,12 +80,18 @@ class CNNDMv2(ConfigurableTask):
|
|
75 |
def maybe_init_factkb(self):
|
76 |
if self.factkb_tokenizer is None or self.factkb_model is None:
|
77 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
78 |
-
|
79 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
def maybe_init_bertscore(self):
|
82 |
if self.bert_score is None:
|
83 |
from evaluate import load
|
|
|
84 |
self.bert_score = load("bertscore")
|
85 |
|
86 |
def has_training_docs(self):
|
@@ -134,15 +145,7 @@ class CNNDMv2(ConfigurableTask):
|
|
134 |
part of the document for `doc`.
|
135 |
"""
|
136 |
|
137 |
-
return [
|
138 |
-
Instance(
|
139 |
-
request_type="generate_until",
|
140 |
-
doc=doc,
|
141 |
-
arguments=(ctx, {"until": ["\n"]}),
|
142 |
-
idx=0,
|
143 |
-
**kwargs
|
144 |
-
)
|
145 |
-
]
|
146 |
|
147 |
def process_results(self, doc, results):
|
148 |
completion = results[0]
|
@@ -166,12 +169,16 @@ class CNNDMv2(ConfigurableTask):
|
|
166 |
|
167 |
self.maybe_init_factkb()
|
168 |
input_factkb = [[completion, document]]
|
169 |
-
factkb_tokens = self.factkb_tokenizer(
|
|
|
|
|
170 |
factkb_logits = self.factkb_model(**factkb_tokens).logits
|
171 |
factkb_res = torch.softmax(factkb_logits, dim=1)
|
172 |
|
173 |
self.maybe_init_bertscore()
|
174 |
-
bert_score_res = self.bert_score.compute(
|
|
|
|
|
175 |
|
176 |
res = {
|
177 |
"rouge1": rouge1_scores[0],
|
@@ -180,7 +187,7 @@ class CNNDMv2(ConfigurableTask):
|
|
180 |
"factKB": float(factkb_res[0][1]),
|
181 |
"bertscore_precision": float(bert_score_res["precision"][0]),
|
182 |
"bertscore_recall": float(bert_score_res["recall"][0]),
|
183 |
-
"bertscore_f1": float(bert_score_res["f1"][0])
|
184 |
}
|
185 |
|
186 |
return res
|
@@ -191,7 +198,18 @@ class CNNDMv2(ConfigurableTask):
|
|
191 |
A dictionary where keys are the names of submetrics and values are
|
192 |
functions that aggregate a list of metrics
|
193 |
"""
|
194 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
|
196 |
def higher_is_better(self):
|
197 |
"""
|
@@ -199,5 +217,15 @@ class CNNDMv2(ConfigurableTask):
|
|
199 |
A dictionary where keys are the names of submetrics and values are
|
200 |
whether a higher value of the submetric is better
|
201 |
"""
|
202 |
-
return {
|
203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from lm_eval.api.task import ConfigurableTask
|
2 |
from lm_eval.api.instance import Instance
|
3 |
+
|
4 |
# from lm_eval.api.registry import register_task
|
5 |
from lm_eval.api.metrics import mean
|
6 |
|
|
|
67 |
DATASET_NAME = "3.0.0"
|
68 |
|
69 |
def __init__(self):
|
70 |
+
super().__init__(
|
71 |
+
config={
|
72 |
+
"metadata": {"version": self.VERSION},
|
73 |
+
"generation_kwargs": {"do_sample": False, "temperature": 0.0, "until": ["\n", "\n\n"]},
|
74 |
+
}
|
75 |
+
)
|
76 |
self.factkb_tokenizer = None
|
77 |
self.factkb_model = None
|
78 |
self.bert_score = None
|
|
|
80 |
def maybe_init_factkb(self):
|
81 |
if self.factkb_tokenizer is None or self.factkb_model is None:
|
82 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
83 |
+
|
84 |
+
self.factkb_tokenizer = AutoTokenizer.from_pretrained(
|
85 |
+
"roberta-base", padding="max_length", truncation=True
|
86 |
+
)
|
87 |
+
self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
|
88 |
+
"bunsenfeng/FactKB", num_labels=2, device_map="auto"
|
89 |
+
)
|
90 |
|
91 |
def maybe_init_bertscore(self):
|
92 |
if self.bert_score is None:
|
93 |
from evaluate import load
|
94 |
+
|
95 |
self.bert_score = load("bertscore")
|
96 |
|
97 |
def has_training_docs(self):
|
|
|
145 |
part of the document for `doc`.
|
146 |
"""
|
147 |
|
148 |
+
return [Instance(request_type="generate_until", doc=doc, arguments=(ctx, {"until": ["\n"]}), idx=0, **kwargs)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
|
150 |
def process_results(self, doc, results):
|
151 |
completion = results[0]
|
|
|
169 |
|
170 |
self.maybe_init_factkb()
|
171 |
input_factkb = [[completion, document]]
|
172 |
+
factkb_tokens = self.factkb_tokenizer(
|
173 |
+
input_factkb, return_tensors="pt", padding="max_length", truncation=True
|
174 |
+
).to(self.factkb_model.device)
|
175 |
factkb_logits = self.factkb_model(**factkb_tokens).logits
|
176 |
factkb_res = torch.softmax(factkb_logits, dim=1)
|
177 |
|
178 |
self.maybe_init_bertscore()
|
179 |
+
bert_score_res = self.bert_score.compute(
|
180 |
+
predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
|
181 |
+
)
|
182 |
|
183 |
res = {
|
184 |
"rouge1": rouge1_scores[0],
|
|
|
187 |
"factKB": float(factkb_res[0][1]),
|
188 |
"bertscore_precision": float(bert_score_res["precision"][0]),
|
189 |
"bertscore_recall": float(bert_score_res["recall"][0]),
|
190 |
+
"bertscore_f1": float(bert_score_res["f1"][0]),
|
191 |
}
|
192 |
|
193 |
return res
|
|
|
198 |
A dictionary where keys are the names of submetrics and values are
|
199 |
functions that aggregate a list of metrics
|
200 |
"""
|
201 |
+
return {
|
202 |
+
k: mean
|
203 |
+
for k in [
|
204 |
+
"rouge1",
|
205 |
+
"rouge2",
|
206 |
+
"rougeL",
|
207 |
+
"factKB",
|
208 |
+
"bertscore_precision",
|
209 |
+
"bertscore_recall",
|
210 |
+
"bertscore_f1",
|
211 |
+
]
|
212 |
+
}
|
213 |
|
214 |
def higher_is_better(self):
|
215 |
"""
|
|
|
217 |
A dictionary where keys are the names of submetrics and values are
|
218 |
whether a higher value of the submetric is better
|
219 |
"""
|
220 |
+
return {
|
221 |
+
k: True
|
222 |
+
for k in [
|
223 |
+
"rouge1",
|
224 |
+
"rouge2",
|
225 |
+
"rougeL",
|
226 |
+
"factKB",
|
227 |
+
"bertscore_precision",
|
228 |
+
"bertscore_recall",
|
229 |
+
"bertscore_f1",
|
230 |
+
]
|
231 |
+
}
|
src/backend/tasks/faithdial/utils.py
CHANGED
@@ -1,15 +1,16 @@
|
|
1 |
from typing import List, Union
|
|
|
2 |
ValueType = Union[str, List[str]]
|
3 |
|
4 |
|
5 |
def doc_to_text(doc: dict[str, ValueType]) -> str:
|
6 |
-
history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc[
|
7 |
doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["response"]}\n#Hallucinated#:'
|
8 |
return doc_text
|
9 |
|
10 |
|
11 |
def doc_to_text_v2(doc: dict[str, ValueType]) -> str:
|
12 |
-
history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc[
|
13 |
doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["original_response"]}\n#Hallucinated#:'
|
14 |
return doc_text
|
15 |
|
|
|
1 |
from typing import List, Union
|
2 |
+
|
3 |
ValueType = Union[str, List[str]]
|
4 |
|
5 |
|
6 |
def doc_to_text(doc: dict[str, ValueType]) -> str:
|
7 |
+
history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc["history"])])
|
8 |
doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["response"]}\n#Hallucinated#:'
|
9 |
return doc_text
|
10 |
|
11 |
|
12 |
def doc_to_text_v2(doc: dict[str, ValueType]) -> str:
|
13 |
+
history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc["history"])])
|
14 |
doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["original_response"]}\n#Hallucinated#:'
|
15 |
return doc_text
|
16 |
|
src/backend/tasks/halueval/utils.py
CHANGED
@@ -83,13 +83,31 @@ You should try your best to determine if the summary contains non-factual or hal
|
|
83 |
|
84 |
def doc_to_text_qa(doc: dict[str, str]) -> str:
|
85 |
# prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
|
86 |
-
doc_text =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
return doc_text
|
88 |
|
89 |
|
90 |
def doc_to_text_dialogue(doc: dict[str, str]) -> str:
|
91 |
# prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
|
92 |
-
doc_text =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
return doc_text
|
94 |
|
95 |
|
@@ -103,7 +121,7 @@ def doc_to_text_summarization(doc: dict[str, str]) -> str:
|
|
103 |
|
104 |
|
105 |
def doc_to_target(doc: dict[str, str]) -> str:
|
106 |
-
return doc[
|
107 |
|
108 |
|
109 |
def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
|
|
|
83 |
|
84 |
def doc_to_text_qa(doc: dict[str, str]) -> str:
|
85 |
# prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
|
86 |
+
doc_text = (
|
87 |
+
QA_INSTURCTIONS
|
88 |
+
+ "\n\n#Knowledge#: "
|
89 |
+
+ doc["knowledge"]
|
90 |
+
+ "\n#Question#: "
|
91 |
+
+ doc["question"]
|
92 |
+
+ "\n#Answer#: "
|
93 |
+
+ doc["answer"]
|
94 |
+
+ "\n#Your Judgement#:"
|
95 |
+
)
|
96 |
return doc_text
|
97 |
|
98 |
|
99 |
def doc_to_text_dialogue(doc: dict[str, str]) -> str:
|
100 |
# prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
|
101 |
+
doc_text = (
|
102 |
+
DIALOGUE_INSTRUCTIONS
|
103 |
+
+ "\n\n#Knowledge#: "
|
104 |
+
+ doc["knowledge"]
|
105 |
+
+ "\n#Dialogue History#: "
|
106 |
+
+ doc["dialogue_history"]
|
107 |
+
+ "\n#Response#: "
|
108 |
+
+ doc["response"]
|
109 |
+
+ "\n#Your Judgement#:"
|
110 |
+
)
|
111 |
return doc_text
|
112 |
|
113 |
|
|
|
121 |
|
122 |
|
123 |
def doc_to_target(doc: dict[str, str]) -> str:
|
124 |
+
return doc["hallucination"]
|
125 |
|
126 |
|
127 |
def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
|
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Union, List
|
|
3 |
|
4 |
from lm_eval.api.task import ConfigurableTask
|
5 |
from lm_eval.api.instance import Instance
|
|
|
6 |
# from lm_eval.api.registry import register_task
|
7 |
from lm_eval.api.metrics import mean
|
8 |
|
@@ -17,26 +18,31 @@ class SelfCheckGPT(ConfigurableTask):
|
|
17 |
VERSION = 0.0
|
18 |
DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
|
19 |
DATASET_NAME = None
|
20 |
-
OUTPUT_TYPE =
|
21 |
|
22 |
def __init__(self):
|
23 |
-
super().__init__(config={
|
24 |
# these end tokens are hard coded because of the current limitaion of the llm-eval.
|
25 |
self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
|
26 |
self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
|
27 |
-
self.generation_kwargs_sampling = {
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
31 |
self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
|
32 |
|
33 |
-
if self.selfcheckgpt_type ==
|
34 |
self.selfcheckgpt = SelfCheckNgram(n=1)
|
35 |
-
elif self.selfcheckgpt_type ==
|
36 |
self.selfcheckgpt = SelfCheckBERTScore(rescale_with_baseline=True)
|
37 |
-
elif self.selfcheckgpt_type ==
|
38 |
self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
|
39 |
-
elif self.selfcheckgpt_type ==
|
40 |
self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
|
41 |
self.SelfCheckNLI_error_cnt = 0
|
42 |
|
@@ -53,10 +59,10 @@ class SelfCheckGPT(ConfigurableTask):
|
|
53 |
return self.dataset["evaluation"]
|
54 |
|
55 |
def doc_to_text(self, doc):
|
56 |
-
if not hasattr(self,
|
57 |
self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
|
58 |
|
59 |
-
sentences = [x.text.strip() for x in self.selfcheckgpt_nlp(doc[
|
60 |
if len(sentences) < 2:
|
61 |
raise ValueError("This wikipedia passage is too short for self-consistency check: {sentences}")
|
62 |
# disscussed with Potsawee
|
@@ -65,18 +71,19 @@ class SelfCheckGPT(ConfigurableTask):
|
|
65 |
return doc_text
|
66 |
|
67 |
def doc_to_target(self, doc):
|
68 |
-
answer = doc[
|
69 |
return answer
|
70 |
|
71 |
def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
|
72 |
arguments = (ctx, self.generation_kwargs)
|
73 |
request_list = [
|
74 |
-
Instance(request_type=
|
75 |
]
|
76 |
sampling_arguments = (ctx, self.generation_kwargs_sampling)
|
77 |
-
request_list.extend(
|
78 |
-
|
79 |
-
|
|
|
80 |
]
|
81 |
)
|
82 |
return request_list
|
@@ -88,48 +95,53 @@ class SelfCheckGPT(ConfigurableTask):
|
|
88 |
|
89 |
sentences = self.selfcheckgpt_nlp(response_temperature_0)
|
90 |
sentences = [sent.text.strip() for sent in sentences.sents]
|
91 |
-
if self.selfcheckgpt_type ==
|
92 |
-
selfcheckgpt_scores = self.selfcheckgpt.predict(
|
|
|
|
|
93 |
return {
|
94 |
-
|
95 |
-
|
96 |
}
|
97 |
|
98 |
-
elif self.selfcheckgpt_type ==
|
99 |
selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
|
100 |
-
elif self.selfcheckgpt_type ==
|
101 |
selfcheckgpt_scores = self.selfcheckgpt.predict(
|
102 |
sentences=sentences,
|
103 |
passage=response_temperature_0,
|
104 |
sampled_passages=other_responses,
|
105 |
-
num_questions_per_sent=5,
|
106 |
-
scoring_method=
|
107 |
-
beta1=0.8,
|
108 |
-
|
|
|
|
|
109 |
selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
|
110 |
|
111 |
if len(selfcheckgpt_scores) < 2:
|
112 |
# at least two sentences
|
113 |
self.SelfCheckNLI_error_cnt += 1
|
114 |
-
result = {
|
115 |
-
'avg-selfcheckgpt': 0.0,
|
116 |
-
'max-selfcheckgpt': 0.0
|
117 |
-
}
|
118 |
|
119 |
else:
|
120 |
-
threshold = 0.7
|
121 |
# passage is hallucianted if one sentence is hallucinated. It's very strict.
|
122 |
selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
|
123 |
# passage is hallucianted if average score of all sentences is hallucinated.
|
124 |
-
selfcheckgpt_scores_avg =
|
125 |
-
|
|
|
|
|
126 |
|
127 |
return result
|
128 |
|
129 |
-
selfcheckgpt_scores_avg =
|
|
|
|
|
130 |
selfcheckgpt_scores_max = max(selfcheckgpt_scores)
|
131 |
|
132 |
-
return {
|
133 |
|
134 |
def aggregation(self):
|
135 |
"""
|
|
|
3 |
|
4 |
from lm_eval.api.task import ConfigurableTask
|
5 |
from lm_eval.api.instance import Instance
|
6 |
+
|
7 |
# from lm_eval.api.registry import register_task
|
8 |
from lm_eval.api.metrics import mean
|
9 |
|
|
|
18 |
VERSION = 0.0
|
19 |
DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
|
20 |
DATASET_NAME = None
|
21 |
+
OUTPUT_TYPE = "generate_until"
|
22 |
|
23 |
def __init__(self):
|
24 |
+
super().__init__(config={"metadata": {"version": self.VERSION}})
|
25 |
# these end tokens are hard coded because of the current limitaion of the llm-eval.
|
26 |
self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
|
27 |
self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
|
28 |
+
self.generation_kwargs_sampling = {
|
29 |
+
"temperature": 0.99,
|
30 |
+
"do_sample": True,
|
31 |
+
"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"],
|
32 |
+
"max_length": 512,
|
33 |
+
}
|
34 |
+
|
35 |
+
self.selfcheckgpt_type = os.environ.get("SELFCHECKGPTTYPE", "SelfCheckNLI")
|
36 |
+
self.selfcheckgpt_device = os.environ.get("SELFCHECKGPTDEVICE", DEVICE)
|
37 |
self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
|
38 |
|
39 |
+
if self.selfcheckgpt_type == "SelfCheckNgram":
|
40 |
self.selfcheckgpt = SelfCheckNgram(n=1)
|
41 |
+
elif self.selfcheckgpt_type == "SelfCheckBERTScore":
|
42 |
self.selfcheckgpt = SelfCheckBERTScore(rescale_with_baseline=True)
|
43 |
+
elif self.selfcheckgpt_type == "SelfCheckMQAG":
|
44 |
self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
|
45 |
+
elif self.selfcheckgpt_type == "SelfCheckNLI":
|
46 |
self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
|
47 |
self.SelfCheckNLI_error_cnt = 0
|
48 |
|
|
|
59 |
return self.dataset["evaluation"]
|
60 |
|
61 |
def doc_to_text(self, doc):
|
62 |
+
if not hasattr(self, "selfcheckgpt_nlp"):
|
63 |
self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
|
64 |
|
65 |
+
sentences = [x.text.strip() for x in self.selfcheckgpt_nlp(doc["wiki_bio_text"]).sents]
|
66 |
if len(sentences) < 2:
|
67 |
raise ValueError("This wikipedia passage is too short for self-consistency check: {sentences}")
|
68 |
# disscussed with Potsawee
|
|
|
71 |
return doc_text
|
72 |
|
73 |
def doc_to_target(self, doc):
|
74 |
+
answer = doc["wiki_bio_text"]
|
75 |
return answer
|
76 |
|
77 |
def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
|
78 |
arguments = (ctx, self.generation_kwargs)
|
79 |
request_list = [
|
80 |
+
Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
|
81 |
]
|
82 |
sampling_arguments = (ctx, self.generation_kwargs_sampling)
|
83 |
+
request_list.extend(
|
84 |
+
[
|
85 |
+
Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
|
86 |
+
for idx in range(1, self.generation_kwargs_sampling_number + 1)
|
87 |
]
|
88 |
)
|
89 |
return request_list
|
|
|
95 |
|
96 |
sentences = self.selfcheckgpt_nlp(response_temperature_0)
|
97 |
sentences = [sent.text.strip() for sent in sentences.sents]
|
98 |
+
if self.selfcheckgpt_type == "SelfCheckNgram":
|
99 |
+
selfcheckgpt_scores = self.selfcheckgpt.predict(
|
100 |
+
sentences=sentences, passage=response_temperature_0, sampled_passages=other_responses
|
101 |
+
)
|
102 |
return {
|
103 |
+
"avg-selfcheckgpt": selfcheckgpt_scores["doc_level"]["avg_neg_logprob"],
|
104 |
+
"max-selfcheckgpt": selfcheckgpt_scores["doc_level"]["avg_max_neg_logprob"],
|
105 |
}
|
106 |
|
107 |
+
elif self.selfcheckgpt_type == "SelfCheckBERTScore":
|
108 |
selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
|
109 |
+
elif self.selfcheckgpt_type == "SelfCheckMQAG":
|
110 |
selfcheckgpt_scores = self.selfcheckgpt.predict(
|
111 |
sentences=sentences,
|
112 |
passage=response_temperature_0,
|
113 |
sampled_passages=other_responses,
|
114 |
+
num_questions_per_sent=5, # number of questions to be drawn
|
115 |
+
scoring_method="bayes_with_alpha", # options = 'counting', 'bayes', 'bayes_with_alpha'
|
116 |
+
beta1=0.8,
|
117 |
+
beta2=0.8,
|
118 |
+
) # additional params depending on scoring_method
|
119 |
+
elif self.selfcheckgpt_type == "SelfCheckNLI":
|
120 |
selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
|
121 |
|
122 |
if len(selfcheckgpt_scores) < 2:
|
123 |
# at least two sentences
|
124 |
self.SelfCheckNLI_error_cnt += 1
|
125 |
+
result = {"avg-selfcheckgpt": 0.0, "max-selfcheckgpt": 0.0}
|
|
|
|
|
|
|
126 |
|
127 |
else:
|
128 |
+
threshold = 0.7 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
|
129 |
# passage is hallucianted if one sentence is hallucinated. It's very strict.
|
130 |
selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
|
131 |
# passage is hallucianted if average score of all sentences is hallucinated.
|
132 |
+
selfcheckgpt_scores_avg = (
|
133 |
+
0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
|
134 |
+
)
|
135 |
+
result = {"avg-selfcheckgpt": selfcheckgpt_scores_avg, "max-selfcheckgpt": selfcheckgpt_scores_max}
|
136 |
|
137 |
return result
|
138 |
|
139 |
+
selfcheckgpt_scores_avg = (
|
140 |
+
sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
|
141 |
+
)
|
142 |
selfcheckgpt_scores_max = max(selfcheckgpt_scores)
|
143 |
|
144 |
+
return {"avg-selfcheckgpt": selfcheckgpt_scores_avg, "max-selfcheckgpt": selfcheckgpt_scores_max}
|
145 |
|
146 |
def aggregation(self):
|
147 |
"""
|
src/backend/tasks/xsum/task.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from lm_eval.api.task import ConfigurableTask
|
2 |
from lm_eval.api.instance import Instance
|
|
|
3 |
# from lm_eval.api.registry import register_task
|
4 |
from lm_eval.api.metrics import mean
|
5 |
|
@@ -18,8 +19,16 @@ def bleu(refs, preds):
|
|
18 |
:param preds:
|
19 |
A `list` of predicted `str`s.
|
20 |
"""
|
21 |
-
score = sacrebleu.corpus_bleu(
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
return score
|
24 |
|
25 |
|
@@ -58,7 +67,7 @@ class XSum(ConfigurableTask):
|
|
58 |
DATASET_NAME = None
|
59 |
|
60 |
def __init__(self):
|
61 |
-
super().__init__(config={
|
62 |
self.factkb_tokenizer = None
|
63 |
self.factkb_model = None
|
64 |
self.bert_score = None
|
@@ -66,12 +75,18 @@ class XSum(ConfigurableTask):
|
|
66 |
def maybe_init_factkb(self):
|
67 |
if self.factkb_tokenizer is None or self.factkb_model is None:
|
68 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
69 |
-
|
70 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
def maybe_init_bertscore(self):
|
73 |
if self.bert_score is None:
|
74 |
from evaluate import load
|
|
|
75 |
self.bert_score = load("bertscore")
|
76 |
|
77 |
def has_training_docs(self):
|
@@ -124,7 +139,7 @@ class XSum(ConfigurableTask):
|
|
124 |
# arguments=(ctx, {"until": ["\n", "."]}),
|
125 |
arguments=(ctx, {"until": ["\n"]}),
|
126 |
idx=0,
|
127 |
-
**kwargs
|
128 |
)
|
129 |
]
|
130 |
|
@@ -150,12 +165,16 @@ class XSum(ConfigurableTask):
|
|
150 |
|
151 |
self.maybe_init_factkb()
|
152 |
input_factkb = [[completion, document]]
|
153 |
-
factkb_tokens = self.factkb_tokenizer(
|
|
|
|
|
154 |
factkb_logits = self.factkb_model(**factkb_tokens).logits
|
155 |
factkb_res = torch.softmax(factkb_logits, dim=1)
|
156 |
|
157 |
self.maybe_init_bertscore()
|
158 |
-
bert_score_res = self.bert_score.compute(
|
|
|
|
|
159 |
|
160 |
res = {
|
161 |
"rouge1": rouge1_scores[0],
|
@@ -177,7 +196,18 @@ class XSum(ConfigurableTask):
|
|
177 |
A dictionary where keys are the names of submetrics and values are
|
178 |
functions that aggregate a list of metrics
|
179 |
"""
|
180 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
|
182 |
def higher_is_better(self):
|
183 |
"""
|
@@ -185,4 +215,15 @@ class XSum(ConfigurableTask):
|
|
185 |
A dictionary where keys are the names of submetrics and values are
|
186 |
whether a higher value of the submetric is better
|
187 |
"""
|
188 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from lm_eval.api.task import ConfigurableTask
|
2 |
from lm_eval.api.instance import Instance
|
3 |
+
|
4 |
# from lm_eval.api.registry import register_task
|
5 |
from lm_eval.api.metrics import mean
|
6 |
|
|
|
19 |
:param preds:
|
20 |
A `list` of predicted `str`s.
|
21 |
"""
|
22 |
+
score = sacrebleu.corpus_bleu(
|
23 |
+
preds,
|
24 |
+
refs,
|
25 |
+
smooth_method="exp",
|
26 |
+
smooth_value=0.0,
|
27 |
+
force=False,
|
28 |
+
lowercase=False,
|
29 |
+
tokenize="intl",
|
30 |
+
use_effective_order=False,
|
31 |
+
).score
|
32 |
return score
|
33 |
|
34 |
|
|
|
67 |
DATASET_NAME = None
|
68 |
|
69 |
def __init__(self):
|
70 |
+
super().__init__(config={"metadata": {"version": self.VERSION}})
|
71 |
self.factkb_tokenizer = None
|
72 |
self.factkb_model = None
|
73 |
self.bert_score = None
|
|
|
75 |
def maybe_init_factkb(self):
|
76 |
if self.factkb_tokenizer is None or self.factkb_model is None:
|
77 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
78 |
+
|
79 |
+
self.factkb_tokenizer = AutoTokenizer.from_pretrained(
|
80 |
+
"roberta-base", padding="max_length", truncation=True
|
81 |
+
)
|
82 |
+
self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
|
83 |
+
"bunsenfeng/FactKB", num_labels=2, device_map="auto"
|
84 |
+
)
|
85 |
|
86 |
def maybe_init_bertscore(self):
|
87 |
if self.bert_score is None:
|
88 |
from evaluate import load
|
89 |
+
|
90 |
self.bert_score = load("bertscore")
|
91 |
|
92 |
def has_training_docs(self):
|
|
|
139 |
# arguments=(ctx, {"until": ["\n", "."]}),
|
140 |
arguments=(ctx, {"until": ["\n"]}),
|
141 |
idx=0,
|
142 |
+
**kwargs,
|
143 |
)
|
144 |
]
|
145 |
|
|
|
165 |
|
166 |
self.maybe_init_factkb()
|
167 |
input_factkb = [[completion, document]]
|
168 |
+
factkb_tokens = self.factkb_tokenizer(
|
169 |
+
input_factkb, return_tensors="pt", padding="max_length", truncation=True
|
170 |
+
).to(self.factkb_model.device)
|
171 |
factkb_logits = self.factkb_model(**factkb_tokens).logits
|
172 |
factkb_res = torch.softmax(factkb_logits, dim=1)
|
173 |
|
174 |
self.maybe_init_bertscore()
|
175 |
+
bert_score_res = self.bert_score.compute(
|
176 |
+
predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
|
177 |
+
)
|
178 |
|
179 |
res = {
|
180 |
"rouge1": rouge1_scores[0],
|
|
|
196 |
A dictionary where keys are the names of submetrics and values are
|
197 |
functions that aggregate a list of metrics
|
198 |
"""
|
199 |
+
return {
|
200 |
+
k: mean
|
201 |
+
for k in [
|
202 |
+
"rouge1",
|
203 |
+
"rouge2",
|
204 |
+
"rougeL",
|
205 |
+
"factKB",
|
206 |
+
"bertscore_precision",
|
207 |
+
"bertscore_recall",
|
208 |
+
"bertscore_f1",
|
209 |
+
]
|
210 |
+
}
|
211 |
|
212 |
def higher_is_better(self):
|
213 |
"""
|
|
|
215 |
A dictionary where keys are the names of submetrics and values are
|
216 |
whether a higher value of the submetric is better
|
217 |
"""
|
218 |
+
return {
|
219 |
+
k: True
|
220 |
+
for k in [
|
221 |
+
"rouge1",
|
222 |
+
"rouge2",
|
223 |
+
"rougeL",
|
224 |
+
"factKB",
|
225 |
+
"bertscore_precision",
|
226 |
+
"bertscore_recall",
|
227 |
+
"bertscore_f1",
|
228 |
+
]
|
229 |
+
}
|
src/backend/tasks/xsum/task_v2.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from lm_eval.api.task import ConfigurableTask
|
2 |
from lm_eval.api.instance import Instance
|
|
|
3 |
# from lm_eval.api.registry import register_task
|
4 |
from lm_eval.api.metrics import mean
|
5 |
|
@@ -18,8 +19,16 @@ def bleu(refs, preds):
|
|
18 |
:param preds:
|
19 |
A `list` of predicted `str`s.
|
20 |
"""
|
21 |
-
score = sacrebleu.corpus_bleu(
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
return score
|
24 |
|
25 |
|
@@ -59,8 +68,12 @@ class XSumv2(ConfigurableTask):
|
|
59 |
|
60 |
def __init__(self):
|
61 |
# breakpoint()
|
62 |
-
super().__init__(
|
63 |
-
|
|
|
|
|
|
|
|
|
64 |
self.factkb_tokenizer = None
|
65 |
self.factkb_model = None
|
66 |
self.bert_score = None
|
@@ -68,12 +81,18 @@ class XSumv2(ConfigurableTask):
|
|
68 |
def maybe_init_factkb(self):
|
69 |
if self.factkb_tokenizer is None or self.factkb_model is None:
|
70 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
71 |
-
|
72 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
def maybe_init_bertscore(self):
|
75 |
if self.bert_score is None:
|
76 |
from evaluate import load
|
|
|
77 |
self.bert_score = load("bertscore")
|
78 |
|
79 |
def has_training_docs(self):
|
@@ -129,7 +148,7 @@ class XSumv2(ConfigurableTask):
|
|
129 |
# arguments=(ctx, {"until": ["\n", "."]}),
|
130 |
arguments=(ctx, {"until": ["\n"]}),
|
131 |
idx=0,
|
132 |
-
**kwargs
|
133 |
)
|
134 |
]
|
135 |
|
@@ -155,12 +174,16 @@ class XSumv2(ConfigurableTask):
|
|
155 |
|
156 |
self.maybe_init_factkb()
|
157 |
input_factkb = [[completion, document]]
|
158 |
-
factkb_tokens = self.factkb_tokenizer(
|
|
|
|
|
159 |
factkb_logits = self.factkb_model(**factkb_tokens).logits
|
160 |
factkb_res = torch.softmax(factkb_logits, dim=1)
|
161 |
|
162 |
self.maybe_init_bertscore()
|
163 |
-
bert_score_res = self.bert_score.compute(
|
|
|
|
|
164 |
|
165 |
res = {
|
166 |
"rouge1": rouge1_scores[0],
|
@@ -182,7 +205,18 @@ class XSumv2(ConfigurableTask):
|
|
182 |
A dictionary where keys are the names of submetrics and values are
|
183 |
functions that aggregate a list of metrics
|
184 |
"""
|
185 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
def higher_is_better(self):
|
188 |
"""
|
@@ -190,4 +224,15 @@ class XSumv2(ConfigurableTask):
|
|
190 |
A dictionary where keys are the names of submetrics and values are
|
191 |
whether a higher value of the submetric is better
|
192 |
"""
|
193 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from lm_eval.api.task import ConfigurableTask
|
2 |
from lm_eval.api.instance import Instance
|
3 |
+
|
4 |
# from lm_eval.api.registry import register_task
|
5 |
from lm_eval.api.metrics import mean
|
6 |
|
|
|
19 |
:param preds:
|
20 |
A `list` of predicted `str`s.
|
21 |
"""
|
22 |
+
score = sacrebleu.corpus_bleu(
|
23 |
+
preds,
|
24 |
+
refs,
|
25 |
+
smooth_method="exp",
|
26 |
+
smooth_value=0.0,
|
27 |
+
force=False,
|
28 |
+
lowercase=False,
|
29 |
+
tokenize="intl",
|
30 |
+
use_effective_order=False,
|
31 |
+
).score
|
32 |
return score
|
33 |
|
34 |
|
|
|
68 |
|
69 |
def __init__(self):
|
70 |
# breakpoint()
|
71 |
+
super().__init__(
|
72 |
+
config={
|
73 |
+
"metadata": {"version": self.VERSION},
|
74 |
+
"generation_kwargs": {"do_sample": False, "temperature": 0.0, "until": ["\n", "\n\n"]},
|
75 |
+
}
|
76 |
+
)
|
77 |
self.factkb_tokenizer = None
|
78 |
self.factkb_model = None
|
79 |
self.bert_score = None
|
|
|
81 |
def maybe_init_factkb(self):
|
82 |
if self.factkb_tokenizer is None or self.factkb_model is None:
|
83 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
84 |
+
|
85 |
+
self.factkb_tokenizer = AutoTokenizer.from_pretrained(
|
86 |
+
"roberta-base", padding="max_length", truncation=True
|
87 |
+
)
|
88 |
+
self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
|
89 |
+
"bunsenfeng/FactKB", num_labels=2, device_map="auto"
|
90 |
+
)
|
91 |
|
92 |
def maybe_init_bertscore(self):
|
93 |
if self.bert_score is None:
|
94 |
from evaluate import load
|
95 |
+
|
96 |
self.bert_score = load("bertscore")
|
97 |
|
98 |
def has_training_docs(self):
|
|
|
148 |
# arguments=(ctx, {"until": ["\n", "."]}),
|
149 |
arguments=(ctx, {"until": ["\n"]}),
|
150 |
idx=0,
|
151 |
+
**kwargs,
|
152 |
)
|
153 |
]
|
154 |
|
|
|
174 |
|
175 |
self.maybe_init_factkb()
|
176 |
input_factkb = [[completion, document]]
|
177 |
+
factkb_tokens = self.factkb_tokenizer(
|
178 |
+
input_factkb, return_tensors="pt", padding="max_length", truncation=True
|
179 |
+
).to(self.factkb_model.device)
|
180 |
factkb_logits = self.factkb_model(**factkb_tokens).logits
|
181 |
factkb_res = torch.softmax(factkb_logits, dim=1)
|
182 |
|
183 |
self.maybe_init_bertscore()
|
184 |
+
bert_score_res = self.bert_score.compute(
|
185 |
+
predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
|
186 |
+
)
|
187 |
|
188 |
res = {
|
189 |
"rouge1": rouge1_scores[0],
|
|
|
205 |
A dictionary where keys are the names of submetrics and values are
|
206 |
functions that aggregate a list of metrics
|
207 |
"""
|
208 |
+
return {
|
209 |
+
k: mean
|
210 |
+
for k in [
|
211 |
+
"rouge1",
|
212 |
+
"rouge2",
|
213 |
+
"rougeL",
|
214 |
+
"factKB",
|
215 |
+
"bertscore_precision",
|
216 |
+
"bertscore_recall",
|
217 |
+
"bertscore_f1",
|
218 |
+
]
|
219 |
+
}
|
220 |
|
221 |
def higher_is_better(self):
|
222 |
"""
|
|
|
224 |
A dictionary where keys are the names of submetrics and values are
|
225 |
whether a higher value of the submetric is better
|
226 |
"""
|
227 |
+
return {
|
228 |
+
k: True
|
229 |
+
for k in [
|
230 |
+
"rouge1",
|
231 |
+
"rouge2",
|
232 |
+
"rougeL",
|
233 |
+
"factKB",
|
234 |
+
"bertscore_precision",
|
235 |
+
"bertscore_recall",
|
236 |
+
"bertscore_f1",
|
237 |
+
]
|
238 |
+
}
|
src/browse.py
CHANGED
@@ -32,6 +32,7 @@ import socket
|
|
32 |
import subprocess
|
33 |
import sys
|
34 |
import webbrowser
|
|
|
35 |
if sys.version_info >= (3, 2):
|
36 |
from html import escape
|
37 |
else:
|
@@ -42,7 +43,7 @@ except ImportError:
|
|
42 |
from urllib2 import unquote
|
43 |
from collections import namedtuple
|
44 |
|
45 |
-
Node = namedtuple(
|
46 |
|
47 |
# Ideally we'd allow you to navigate to a build edge or a build node,
|
48 |
# with appropriate views for each. But there's no way to *name* a build
|
@@ -57,16 +58,19 @@ Node = namedtuple('Node', ['inputs', 'rule', 'target', 'outputs'])
|
|
57 |
# This means there's no single view that shows you all inputs and outputs
|
58 |
# of an edge. But I think it's less confusing than alternatives.
|
59 |
|
|
|
60 |
def match_strip(line, prefix):
|
61 |
if not line.startswith(prefix):
|
62 |
return (False, line)
|
63 |
-
return (True, line[len(prefix):])
|
|
|
64 |
|
65 |
def html_escape(text):
|
66 |
return escape(text, quote=True)
|
67 |
|
|
|
68 |
def parse(text):
|
69 |
-
lines = iter(text.split(
|
70 |
|
71 |
target = None
|
72 |
rule = None
|
@@ -77,33 +81,35 @@ def parse(text):
|
|
77 |
target = next(lines)[:-1] # strip trailing colon
|
78 |
|
79 |
line = next(lines)
|
80 |
-
(match, rule) = match_strip(line,
|
81 |
if match:
|
82 |
-
(match, line) = match_strip(next(lines),
|
83 |
while match:
|
84 |
type = None
|
85 |
-
(match, line) = match_strip(line,
|
86 |
if match:
|
87 |
-
type =
|
88 |
-
(match, line) = match_strip(line,
|
89 |
if match:
|
90 |
-
type =
|
91 |
inputs.append((line, type))
|
92 |
-
(match, line) = match_strip(next(lines),
|
93 |
|
94 |
-
match, _ = match_strip(line,
|
95 |
if match:
|
96 |
-
(match, line) = match_strip(next(lines),
|
97 |
while match:
|
98 |
outputs.append(line)
|
99 |
-
(match, line) = match_strip(next(lines),
|
100 |
except StopIteration:
|
101 |
pass
|
102 |
|
103 |
return Node(inputs, rule, target, outputs)
|
104 |
|
|
|
105 |
def create_page(body):
|
106 |
-
return
|
|
|
107 |
<style>
|
108 |
body {
|
109 |
font-family: sans;
|
@@ -128,52 +134,55 @@ tt {
|
|
128 |
-webkit-columns: auto 2;
|
129 |
}
|
130 |
</style>
|
131 |
-
|
|
|
|
|
|
|
132 |
|
133 |
def generate_html(node):
|
134 |
-
document = [
|
135 |
|
136 |
if node.inputs:
|
137 |
-
document.append(
|
138 |
-
html_escape(node.rule))
|
139 |
if len(node.inputs) > 0:
|
140 |
-
document.append(
|
141 |
for input, type in sorted(node.inputs):
|
142 |
-
extra =
|
143 |
if type:
|
144 |
-
extra =
|
145 |
-
document.append(
|
146 |
-
|
147 |
-
|
|
|
148 |
|
149 |
if node.outputs:
|
150 |
-
document.append(
|
151 |
-
document.append(
|
152 |
for output in sorted(node.outputs):
|
153 |
-
document.append('<tt><a href="?%s">%s</a></tt><br>' %
|
154 |
-
|
155 |
-
|
|
|
156 |
|
157 |
-
return '\n'.join(document)
|
158 |
|
159 |
def ninja_dump(target):
|
160 |
-
cmd = [args.ninja_command,
|
161 |
-
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
|
162 |
-
universal_newlines=True)
|
163 |
return proc.communicate() + (proc.returncode,)
|
164 |
|
|
|
165 |
class RequestHandler(httpserver.BaseHTTPRequestHandler):
|
166 |
def do_GET(self):
|
167 |
-
assert self.path[0] ==
|
168 |
target = unquote(self.path[1:])
|
169 |
|
170 |
-
if target ==
|
171 |
self.send_response(302)
|
172 |
-
self.send_header(
|
173 |
self.end_headers()
|
174 |
return
|
175 |
|
176 |
-
if not target.startswith(
|
177 |
self.send_response(404)
|
178 |
self.end_headers()
|
179 |
return
|
@@ -184,48 +193,45 @@ class RequestHandler(httpserver.BaseHTTPRequestHandler):
|
|
184 |
page_body = generate_html(parse(ninja_output.strip()))
|
185 |
else:
|
186 |
# Relay ninja's error message.
|
187 |
-
page_body =
|
188 |
|
189 |
self.send_response(200)
|
190 |
self.end_headers()
|
191 |
-
self.wfile.write(create_page(page_body).encode(
|
192 |
|
193 |
def log_message(self, format, *args):
|
194 |
pass # Swallow console spam.
|
195 |
|
196 |
-
|
197 |
-
parser.
|
198 |
-
|
199 |
-
parser.add_argument(
|
200 |
-
help=
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
parser.add_argument(
|
205 |
-
|
206 |
-
parser.add_argument(
|
207 |
-
|
208 |
-
parser.add_argument('initial_target', default='all', nargs='?',
|
209 |
-
help='Initial target to show (default %(default)s)')
|
210 |
|
211 |
class HTTPServer(socketserver.ThreadingMixIn, httpserver.HTTPServer):
|
212 |
# terminate server immediately when Python exits.
|
213 |
daemon_threads = True
|
214 |
|
|
|
215 |
args = parser.parse_args()
|
216 |
port = args.port
|
217 |
hostname = args.hostname
|
218 |
-
httpd = HTTPServer((hostname,port), RequestHandler)
|
219 |
try:
|
220 |
if hostname == "":
|
221 |
hostname = socket.gethostname()
|
222 |
-
print(
|
223 |
-
print(
|
224 |
if not args.no_browser:
|
225 |
-
webbrowser.open_new(
|
226 |
httpd.serve_forever()
|
227 |
except KeyboardInterrupt:
|
228 |
print()
|
229 |
pass # Swallow console spam.
|
230 |
-
|
231 |
-
|
|
|
32 |
import subprocess
|
33 |
import sys
|
34 |
import webbrowser
|
35 |
+
|
36 |
if sys.version_info >= (3, 2):
|
37 |
from html import escape
|
38 |
else:
|
|
|
43 |
from urllib2 import unquote
|
44 |
from collections import namedtuple
|
45 |
|
46 |
+
Node = namedtuple("Node", ["inputs", "rule", "target", "outputs"])
|
47 |
|
48 |
# Ideally we'd allow you to navigate to a build edge or a build node,
|
49 |
# with appropriate views for each. But there's no way to *name* a build
|
|
|
58 |
# This means there's no single view that shows you all inputs and outputs
|
59 |
# of an edge. But I think it's less confusing than alternatives.
|
60 |
|
61 |
+
|
62 |
def match_strip(line, prefix):
|
63 |
if not line.startswith(prefix):
|
64 |
return (False, line)
|
65 |
+
return (True, line[len(prefix) :])
|
66 |
+
|
67 |
|
68 |
def html_escape(text):
|
69 |
return escape(text, quote=True)
|
70 |
|
71 |
+
|
72 |
def parse(text):
|
73 |
+
lines = iter(text.split("\n"))
|
74 |
|
75 |
target = None
|
76 |
rule = None
|
|
|
81 |
target = next(lines)[:-1] # strip trailing colon
|
82 |
|
83 |
line = next(lines)
|
84 |
+
(match, rule) = match_strip(line, " input: ")
|
85 |
if match:
|
86 |
+
(match, line) = match_strip(next(lines), " ")
|
87 |
while match:
|
88 |
type = None
|
89 |
+
(match, line) = match_strip(line, "| ")
|
90 |
if match:
|
91 |
+
type = "implicit"
|
92 |
+
(match, line) = match_strip(line, "|| ")
|
93 |
if match:
|
94 |
+
type = "order-only"
|
95 |
inputs.append((line, type))
|
96 |
+
(match, line) = match_strip(next(lines), " ")
|
97 |
|
98 |
+
match, _ = match_strip(line, " outputs:")
|
99 |
if match:
|
100 |
+
(match, line) = match_strip(next(lines), " ")
|
101 |
while match:
|
102 |
outputs.append(line)
|
103 |
+
(match, line) = match_strip(next(lines), " ")
|
104 |
except StopIteration:
|
105 |
pass
|
106 |
|
107 |
return Node(inputs, rule, target, outputs)
|
108 |
|
109 |
+
|
110 |
def create_page(body):
|
111 |
+
return (
|
112 |
+
"""<!DOCTYPE html>
|
113 |
<style>
|
114 |
body {
|
115 |
font-family: sans;
|
|
|
134 |
-webkit-columns: auto 2;
|
135 |
}
|
136 |
</style>
|
137 |
+
"""
|
138 |
+
+ body
|
139 |
+
)
|
140 |
+
|
141 |
|
142 |
def generate_html(node):
|
143 |
+
document = ["<h1><tt>%s</tt></h1>" % html_escape(node.target)]
|
144 |
|
145 |
if node.inputs:
|
146 |
+
document.append("<h2>target is built using rule <tt>%s</tt> of</h2>" % html_escape(node.rule))
|
|
|
147 |
if len(node.inputs) > 0:
|
148 |
+
document.append("<div class=filelist>")
|
149 |
for input, type in sorted(node.inputs):
|
150 |
+
extra = ""
|
151 |
if type:
|
152 |
+
extra = " (%s)" % html_escape(type)
|
153 |
+
document.append(
|
154 |
+
'<tt><a href="?%s">%s</a>%s</tt><br>' % (html_escape(input), html_escape(input), extra)
|
155 |
+
)
|
156 |
+
document.append("</div>")
|
157 |
|
158 |
if node.outputs:
|
159 |
+
document.append("<h2>dependent edges build:</h2>")
|
160 |
+
document.append("<div class=filelist>")
|
161 |
for output in sorted(node.outputs):
|
162 |
+
document.append('<tt><a href="?%s">%s</a></tt><br>' % (html_escape(output), html_escape(output)))
|
163 |
+
document.append("</div>")
|
164 |
+
|
165 |
+
return "\n".join(document)
|
166 |
|
|
|
167 |
|
168 |
def ninja_dump(target):
|
169 |
+
cmd = [args.ninja_command, "-f", args.f, "-t", "query", target]
|
170 |
+
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
|
|
|
171 |
return proc.communicate() + (proc.returncode,)
|
172 |
|
173 |
+
|
174 |
class RequestHandler(httpserver.BaseHTTPRequestHandler):
|
175 |
def do_GET(self):
|
176 |
+
assert self.path[0] == "/"
|
177 |
target = unquote(self.path[1:])
|
178 |
|
179 |
+
if target == "":
|
180 |
self.send_response(302)
|
181 |
+
self.send_header("Location", "?" + args.initial_target)
|
182 |
self.end_headers()
|
183 |
return
|
184 |
|
185 |
+
if not target.startswith("?"):
|
186 |
self.send_response(404)
|
187 |
self.end_headers()
|
188 |
return
|
|
|
193 |
page_body = generate_html(parse(ninja_output.strip()))
|
194 |
else:
|
195 |
# Relay ninja's error message.
|
196 |
+
page_body = "<h1><tt>%s</tt></h1>" % html_escape(ninja_error)
|
197 |
|
198 |
self.send_response(200)
|
199 |
self.end_headers()
|
200 |
+
self.wfile.write(create_page(page_body).encode("utf-8"))
|
201 |
|
202 |
def log_message(self, format, *args):
|
203 |
pass # Swallow console spam.
|
204 |
|
205 |
+
|
206 |
+
parser = argparse.ArgumentParser(prog="ninja -t browse")
|
207 |
+
parser.add_argument("--port", "-p", default=8000, type=int, help="Port number to use (default %(default)d)")
|
208 |
+
parser.add_argument(
|
209 |
+
"--hostname", "-a", default="localhost", type=str, help="Hostname to bind to (default %(default)s)"
|
210 |
+
)
|
211 |
+
parser.add_argument("--no-browser", action="store_true", help="Do not open a webbrowser on startup.")
|
212 |
+
|
213 |
+
parser.add_argument("--ninja-command", default="ninja", help="Path to ninja binary (default %(default)s)")
|
214 |
+
parser.add_argument("-f", default="build.ninja", help="Path to build.ninja file (default %(default)s)")
|
215 |
+
parser.add_argument("initial_target", default="all", nargs="?", help="Initial target to show (default %(default)s)")
|
216 |
+
|
|
|
|
|
217 |
|
218 |
class HTTPServer(socketserver.ThreadingMixIn, httpserver.HTTPServer):
|
219 |
# terminate server immediately when Python exits.
|
220 |
daemon_threads = True
|
221 |
|
222 |
+
|
223 |
args = parser.parse_args()
|
224 |
port = args.port
|
225 |
hostname = args.hostname
|
226 |
+
httpd = HTTPServer((hostname, port), RequestHandler)
|
227 |
try:
|
228 |
if hostname == "":
|
229 |
hostname = socket.gethostname()
|
230 |
+
print("Web server running on %s:%d, ctl-C to abort..." % (hostname, port))
|
231 |
+
print("Web server pid %d" % os.getpid(), file=sys.stderr)
|
232 |
if not args.no_browser:
|
233 |
+
webbrowser.open_new("http://%s:%s" % (hostname, port))
|
234 |
httpd.serve_forever()
|
235 |
except KeyboardInterrupt:
|
236 |
print()
|
237 |
pass # Swallow console spam.
|
|
|
|
src/display/utils.py
CHANGED
@@ -61,6 +61,7 @@ class ColumnContent:
|
|
61 |
never_hidden: bool = False
|
62 |
dummy: bool = False
|
63 |
|
|
|
64 |
auto_eval_column_dict = []
|
65 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
|
66 |
auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])
|
|
|
61 |
never_hidden: bool = False
|
62 |
dummy: bool = False
|
63 |
|
64 |
+
|
65 |
auto_eval_column_dict = []
|
66 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
|
67 |
auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])
|
src/leaderboard/filter_models.py
CHANGED
@@ -29,9 +29,9 @@ def flag_models(leaderboard_data: list[dict]):
|
|
29 |
FLAGGED_MODELS[model_data["model_name_for_query"]],
|
30 |
f"See discussion #{issue_num}",
|
31 |
)
|
32 |
-
model_data[
|
33 |
-
AutoEvalColumn.model.name
|
34 |
-
|
35 |
|
36 |
|
37 |
def remove_forbidden_models(leaderboard_data: list[dict]):
|
|
|
29 |
FLAGGED_MODELS[model_data["model_name_for_query"]],
|
30 |
f"See discussion #{issue_num}",
|
31 |
)
|
32 |
+
model_data[AutoEvalColumn.model.name] = (
|
33 |
+
f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
|
34 |
+
)
|
35 |
|
36 |
|
37 |
def remove_forbidden_models(leaderboard_data: list[dict]):
|
src/leaderboard/read_evals.py
CHANGED
@@ -5,6 +5,7 @@ from tqdm import tqdm
|
|
5 |
from dataclasses import dataclass
|
6 |
|
7 |
import dateutil
|
|
|
8 |
# import numpy as np
|
9 |
|
10 |
from src.display.formatting import make_clickable_model
|
@@ -32,13 +33,13 @@ class EvalResult:
|
|
32 |
revision: str # commit hash, "" if main
|
33 |
results: dict
|
34 |
precision: Precision = Precision.Unknown
|
35 |
-
model_type: ModelType = ModelType.Unknown
|
36 |
-
weight_type: WeightType = WeightType.Original
|
37 |
-
architecture: str = "Unknown"
|
38 |
license: str = "?"
|
39 |
likes: int = 0
|
40 |
num_params: int = 0
|
41 |
-
date: str = ""
|
42 |
still_on_hub: bool = False
|
43 |
|
44 |
@staticmethod
|
@@ -67,7 +68,9 @@ class EvalResult:
|
|
67 |
result_key = f"{org}_{model}_{precision.value.name}"
|
68 |
full_model = "/".join(org_and_model)
|
69 |
|
70 |
-
still_on_hub, error, model_config = is_model_on_hub(
|
|
|
|
|
71 |
architecture = "?"
|
72 |
if model_config is not None:
|
73 |
architectures = getattr(model_config, "architectures", None)
|
@@ -79,35 +82,43 @@ class EvalResult:
|
|
79 |
# data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}}
|
80 |
|
81 |
results = {}
|
82 |
-
for benchmark, benchmark_results in data[
|
83 |
if benchmark not in results:
|
84 |
results[benchmark] = {}
|
85 |
|
86 |
for metric, value in benchmark_results.items():
|
87 |
to_add = True
|
88 |
-
if
|
89 |
to_add = False
|
90 |
-
if
|
91 |
to_add = False
|
92 |
|
93 |
-
if
|
94 |
-
metric = metric.split(
|
95 |
metric = metric.replace("exact_match", "em")
|
96 |
|
97 |
if to_add is True:
|
98 |
multiplier = 100.0
|
99 |
-
if
|
100 |
multiplier = 1.0
|
101 |
-
if
|
102 |
multiplier = 1.0
|
103 |
|
104 |
# print('RESULTS', data['results'])
|
105 |
# print('XXX', benchmark, metric, value, multiplier)
|
106 |
results[benchmark][metric] = value * multiplier
|
107 |
|
108 |
-
res = EvalResult(
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
return res
|
113 |
|
@@ -183,6 +194,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
183 |
request_file = tmp_request_file
|
184 |
return request_file
|
185 |
|
|
|
186 |
def get_request_file_for_model_open_llm(requests_path, model_name, precision):
|
187 |
"""Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
|
188 |
request_files = os.path.join(
|
@@ -197,16 +209,16 @@ def get_request_file_for_model_open_llm(requests_path, model_name, precision):
|
|
197 |
for tmp_request_file in request_files:
|
198 |
with open(tmp_request_file, "r") as f:
|
199 |
req_content = json.load(f)
|
200 |
-
if (
|
201 |
-
req_content["status"] in ["FINISHED"]
|
202 |
-
and req_content["precision"] == precision.split(".")[-1]
|
203 |
-
):
|
204 |
request_file = tmp_request_file
|
205 |
return request_file
|
206 |
|
|
|
207 |
def update_model_type_with_open_llm_request_file(result, open_llm_requests_path):
|
208 |
"""Finds the relevant request file for the current model and updates info with it"""
|
209 |
-
request_file = get_request_file_for_model_open_llm(
|
|
|
|
|
210 |
|
211 |
if request_file:
|
212 |
try:
|
@@ -219,9 +231,8 @@ def update_model_type_with_open_llm_request_file(result, open_llm_requests_path)
|
|
219 |
pass
|
220 |
return result
|
221 |
|
222 |
-
|
223 |
-
|
224 |
-
is_backend: bool = False) -> list[EvalResult]:
|
225 |
"""From the path of the results folder root, extract all needed info for results"""
|
226 |
model_result_filepaths = []
|
227 |
|
|
|
5 |
from dataclasses import dataclass
|
6 |
|
7 |
import dateutil
|
8 |
+
|
9 |
# import numpy as np
|
10 |
|
11 |
from src.display.formatting import make_clickable_model
|
|
|
33 |
revision: str # commit hash, "" if main
|
34 |
results: dict
|
35 |
precision: Precision = Precision.Unknown
|
36 |
+
model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
|
37 |
+
weight_type: WeightType = WeightType.Original # Original or Adapter
|
38 |
+
architecture: str = "Unknown" # From config file
|
39 |
license: str = "?"
|
40 |
likes: int = 0
|
41 |
num_params: int = 0
|
42 |
+
date: str = "" # submission date of request file
|
43 |
still_on_hub: bool = False
|
44 |
|
45 |
@staticmethod
|
|
|
68 |
result_key = f"{org}_{model}_{precision.value.name}"
|
69 |
full_model = "/".join(org_and_model)
|
70 |
|
71 |
+
still_on_hub, error, model_config = is_model_on_hub(
|
72 |
+
full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
73 |
+
)
|
74 |
architecture = "?"
|
75 |
if model_config is not None:
|
76 |
architectures = getattr(model_config, "architectures", None)
|
|
|
82 |
# data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}}
|
83 |
|
84 |
results = {}
|
85 |
+
for benchmark, benchmark_results in data["results"].items():
|
86 |
if benchmark not in results:
|
87 |
results[benchmark] = {}
|
88 |
|
89 |
for metric, value in benchmark_results.items():
|
90 |
to_add = True
|
91 |
+
if "_stderr" in metric:
|
92 |
to_add = False
|
93 |
+
if "alias" in metric:
|
94 |
to_add = False
|
95 |
|
96 |
+
if "," in metric:
|
97 |
+
metric = metric.split(",")[0]
|
98 |
metric = metric.replace("exact_match", "em")
|
99 |
|
100 |
if to_add is True:
|
101 |
multiplier = 100.0
|
102 |
+
if "rouge" in metric and "truthful" not in benchmark:
|
103 |
multiplier = 1.0
|
104 |
+
if "squad" in benchmark:
|
105 |
multiplier = 1.0
|
106 |
|
107 |
# print('RESULTS', data['results'])
|
108 |
# print('XXX', benchmark, metric, value, multiplier)
|
109 |
results[benchmark][metric] = value * multiplier
|
110 |
|
111 |
+
res = EvalResult(
|
112 |
+
eval_name=result_key,
|
113 |
+
full_model=full_model,
|
114 |
+
org=org,
|
115 |
+
model=model,
|
116 |
+
results=results,
|
117 |
+
precision=precision,
|
118 |
+
revision=config.get("model_sha", ""),
|
119 |
+
still_on_hub=still_on_hub,
|
120 |
+
architecture=architecture,
|
121 |
+
)
|
122 |
|
123 |
return res
|
124 |
|
|
|
194 |
request_file = tmp_request_file
|
195 |
return request_file
|
196 |
|
197 |
+
|
198 |
def get_request_file_for_model_open_llm(requests_path, model_name, precision):
|
199 |
"""Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
|
200 |
request_files = os.path.join(
|
|
|
209 |
for tmp_request_file in request_files:
|
210 |
with open(tmp_request_file, "r") as f:
|
211 |
req_content = json.load(f)
|
212 |
+
if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
|
|
|
|
|
|
|
213 |
request_file = tmp_request_file
|
214 |
return request_file
|
215 |
|
216 |
+
|
217 |
def update_model_type_with_open_llm_request_file(result, open_llm_requests_path):
|
218 |
"""Finds the relevant request file for the current model and updates info with it"""
|
219 |
+
request_file = get_request_file_for_model_open_llm(
|
220 |
+
open_llm_requests_path, result.full_model, result.precision.value.name
|
221 |
+
)
|
222 |
|
223 |
if request_file:
|
224 |
try:
|
|
|
231 |
pass
|
232 |
return result
|
233 |
|
234 |
+
|
235 |
+
def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool = False) -> list[EvalResult]:
|
|
|
236 |
"""From the path of the results folder root, extract all needed info for results"""
|
237 |
model_result_filepaths = []
|
238 |
|
src/populate.py
CHANGED
@@ -13,17 +13,21 @@ from src.backend.envs import Tasks as BackendTasks
|
|
13 |
from src.display.utils import Tasks
|
14 |
|
15 |
|
16 |
-
def get_leaderboard_df(
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
22 |
# Returns a list of EvalResult
|
23 |
raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm)
|
24 |
if requests_path_open_llm != "":
|
25 |
for result_idx in tqdm(range(len(raw_data)), desc="updating model type with open llm leaderboard"):
|
26 |
-
raw_data[result_idx] = update_model_type_with_open_llm_request_file(
|
|
|
|
|
27 |
|
28 |
all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
|
29 |
|
|
|
13 |
from src.display.utils import Tasks
|
14 |
|
15 |
|
16 |
+
def get_leaderboard_df(
|
17 |
+
results_path: str,
|
18 |
+
requests_path: str,
|
19 |
+
requests_path_open_llm: str,
|
20 |
+
cols: list,
|
21 |
+
benchmark_cols: list,
|
22 |
+
is_backend: bool = False,
|
23 |
+
) -> tuple[list[EvalResult], pd.DataFrame]:
|
24 |
# Returns a list of EvalResult
|
25 |
raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm)
|
26 |
if requests_path_open_llm != "":
|
27 |
for result_idx in tqdm(range(len(raw_data)), desc="updating model type with open llm leaderboard"):
|
28 |
+
raw_data[result_idx] = update_model_type_with_open_llm_request_file(
|
29 |
+
raw_data[result_idx], requests_path_open_llm
|
30 |
+
)
|
31 |
|
32 |
all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
|
33 |
|
src/submission/check_validity.py
CHANGED
@@ -40,20 +40,34 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
|
|
40 |
return True, ""
|
41 |
|
42 |
|
43 |
-
def is_model_on_hub(
|
|
|
|
|
44 |
try:
|
45 |
-
config = AutoConfig.from_pretrained(
|
|
|
|
|
46 |
if test_tokenizer:
|
47 |
try:
|
48 |
-
AutoTokenizer.from_pretrained(
|
|
|
|
|
49 |
except ValueError as e:
|
50 |
return False, f"uses a tokenizer which is not in a transformers release: {e}", None
|
51 |
except Exception as e:
|
52 |
-
return
|
|
|
|
|
|
|
|
|
53 |
return True, None, config
|
54 |
|
55 |
except ValueError as e:
|
56 |
-
return
|
|
|
|
|
|
|
|
|
57 |
|
58 |
except Exception as e:
|
59 |
return False, f"was not found on hub -- {str(e)}", None
|
@@ -63,7 +77,7 @@ def get_model_size(model_info: ModelInfo, precision: str):
|
|
63 |
size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
|
64 |
try:
|
65 |
model_size = round(model_info.safetensors["total"] / 1e9, 3)
|
66 |
-
except (AttributeError, TypeError
|
67 |
try:
|
68 |
size_match = re.search(size_pattern, model_info.modelId.lower())
|
69 |
model_size = size_match.group(0)
|
@@ -75,9 +89,11 @@ def get_model_size(model_info: ModelInfo, precision: str):
|
|
75 |
model_size = size_factor * model_size
|
76 |
return model_size
|
77 |
|
|
|
78 |
def get_model_arch(model_info: ModelInfo):
|
79 |
return model_info.config.get("architectures", "Unknown")
|
80 |
|
|
|
81 |
def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
|
82 |
if org_or_user not in users_to_submission_dates:
|
83 |
return True, ""
|
|
|
40 |
return True, ""
|
41 |
|
42 |
|
43 |
+
def is_model_on_hub(
|
44 |
+
model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
|
45 |
+
) -> tuple[bool, Optional[str], Optional[AutoConfig]]:
|
46 |
try:
|
47 |
+
config = AutoConfig.from_pretrained(
|
48 |
+
model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
|
49 |
+
)
|
50 |
if test_tokenizer:
|
51 |
try:
|
52 |
+
AutoTokenizer.from_pretrained(
|
53 |
+
model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
|
54 |
+
)
|
55 |
except ValueError as e:
|
56 |
return False, f"uses a tokenizer which is not in a transformers release: {e}", None
|
57 |
except Exception as e:
|
58 |
+
return (
|
59 |
+
False,
|
60 |
+
"'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
|
61 |
+
None,
|
62 |
+
)
|
63 |
return True, None, config
|
64 |
|
65 |
except ValueError as e:
|
66 |
+
return (
|
67 |
+
False,
|
68 |
+
"needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
|
69 |
+
None,
|
70 |
+
)
|
71 |
|
72 |
except Exception as e:
|
73 |
return False, f"was not found on hub -- {str(e)}", None
|
|
|
77 |
size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
|
78 |
try:
|
79 |
model_size = round(model_info.safetensors["total"] / 1e9, 3)
|
80 |
+
except (AttributeError, TypeError):
|
81 |
try:
|
82 |
size_match = re.search(size_pattern, model_info.modelId.lower())
|
83 |
model_size = size_match.group(0)
|
|
|
89 |
model_size = size_factor * model_size
|
90 |
return model_size
|
91 |
|
92 |
+
|
93 |
def get_model_arch(model_info: ModelInfo):
    """Return the ``architectures`` entry of a model's Hub config.

    Args:
        model_info: Hub metadata object exposing a ``config`` mapping.

    Returns:
        The value stored under ``"architectures"``, or ``"Unknown"`` when the
        key -- or the config itself -- is missing.
    """
    # ``config`` may be None (presumably when the repo exposes no readable
    # config -- verify against the hub client); treat that like a missing key
    # instead of failing with AttributeError on ``None.get``.
    config = getattr(model_info, "config", None) or {}
    return config.get("architectures", "Unknown")
|
95 |
|
96 |
+
|
97 |
def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
|
98 |
if org_or_user not in users_to_submission_dates:
|
99 |
return True, ""
|
src/submission/submit.py
CHANGED
@@ -61,7 +61,9 @@ def add_new_eval(
|
|
61 |
|
62 |
# Is the model on the hub?
|
63 |
if weight_type in ["Delta", "Adapter"]:
|
64 |
-
base_model_on_hub, error, _ = is_model_on_hub(
|
|
|
|
|
65 |
if not base_model_on_hub:
|
66 |
return styled_error(f'Base model "{base_model}" {error}')
|
67 |
|
|
|
61 |
|
62 |
# Is the model on the hub?
|
63 |
if weight_type in ["Delta", "Adapter"]:
|
64 |
+
base_model_on_hub, error, _ = is_model_on_hub(
|
65 |
+
model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=False
|
66 |
+
)
|
67 |
if not base_model_on_hub:
|
68 |
return styled_error(f'Base model "{base_model}" {error}')
|
69 |
|
src/utils.py
CHANGED
@@ -5,18 +5,21 @@ from huggingface_hub import snapshot_download
|
|
5 |
def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
|
6 |
for i in range(10):
|
7 |
try:
|
8 |
-
snapshot_download(
|
|
|
|
|
9 |
return
|
10 |
except Exception as e:
|
11 |
print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
|
12 |
import time
|
|
|
13 |
time.sleep(60)
|
14 |
return
|
15 |
|
16 |
|
17 |
def get_dataset_url(row):
|
18 |
-
dataset_name = row[
|
19 |
-
dataset_url = row[
|
20 |
benchmark = f'<a target="_blank" href="{dataset_url}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{dataset_name}</a>'
|
21 |
return benchmark
|
22 |
|
@@ -24,8 +27,8 @@ def get_dataset_url(row):
|
|
24 |
def get_dataset_summary_table(file_path):
|
25 |
df = pd.read_csv(file_path)
|
26 |
|
27 |
-
df[
|
28 |
|
29 |
-
df = df[[
|
30 |
|
31 |
return df
|
|
|
5 |
def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
    """Download a repo snapshot, retrying on any failure.

    Calls ``snapshot_download`` up to 10 times, waiting 60 seconds between
    attempts. This is best-effort: each failure is printed, and if every
    attempt fails the function returns ``None`` without raising.

    Args:
        repo_id: Forwarded to ``snapshot_download``.
        revision: Forwarded to ``snapshot_download``.
        local_dir: Forwarded to ``snapshot_download``.
        repo_type: Forwarded to ``snapshot_download``.
        max_workers: Forwarded to ``snapshot_download``.
    """
    import time

    max_attempts = 10
    for attempt in range(1, max_attempts + 1):
        try:
            snapshot_download(
                repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers
            )
            return
        except Exception as e:
            print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
            # Waiting only makes sense when another attempt follows; skip the
            # sleep after the final failure so the caller is not stalled.
            if attempt < max_attempts:
                time.sleep(60)
    return
|
18 |
|
19 |
|
20 |
def get_dataset_url(row):
    """Build the HTML anchor for one row of the dataset summary table.

    Args:
        row: Mapping-like object with ``"Benchmark"`` (link text) and
            ``"Dataset Link"`` (link target) entries.

    Returns:
        An ``<a>`` element, as a string, that opens the dataset link in a new
        tab and shows the benchmark name.
    """
    dataset_name = row["Benchmark"]
    dataset_url = row["Dataset Link"]
    return f'<a target="_blank" href="{dataset_url}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{dataset_name}</a>'
|
25 |
|
|
|
27 |
def get_dataset_summary_table(file_path):
    """Load the dataset summary CSV and prepare it for display.

    The ``"Benchmark"`` column is replaced by the HTML link produced by
    ``get_dataset_url`` for each row, and only the display columns are kept.

    Args:
        file_path: Path (or buffer) accepted by ``pd.read_csv``.

    Returns:
        A DataFrame with the columns ``Category``, ``Benchmark``,
        ``Data Split``, ``Data Size`` and ``Language``.
    """
    df = pd.read_csv(file_path)

    # Build the links row by row instead of ``df.apply(..., axis=1)``: on an
    # empty frame ``apply`` is expected to hand back a DataFrame rather than a
    # Series, which cannot be assigned to a single column (TODO confirm
    # against the pinned pandas version).
    df["Benchmark"] = [get_dataset_url(row) for _, row in df.iterrows()]

    return df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]]
|