shounakpaul95 committed
Commit 647c84c • 1 Parent(s): 25db2bc

Update eval_utils.py

eval_utils.py CHANGED (+12 -11)
@@ -37,7 +37,7 @@ def evaluate_bail(gold_data, pred_data):
 
     f1 = f1_score(gold_labels, pred_labels, average="macro")
     print("Macro-F1 on HLDC-all-districts test set:", f1)
-    return f1
+    return {"mF1": f1}
 
 def get_BLEU_score(ref_text_all, machine_text_all):
     sc_all = []
@@ -89,7 +89,8 @@ def evaluate_cjpe(gold_data, pred_data):
         }
     }
     print("Explanability for ILDC Expert:", explanation_result)
-    return {**prediction_result, **explanation_result}
+    #return {**prediction_result, **explanation_result}
+    return {"mF1": f1, "ROUGE-L": rouge_score, "BLEU": bleu_score}
 
 def span2bio(txt, roles):
     roles = sorted(roles, key = lambda x:x['start'])
@@ -161,7 +162,7 @@ def evaluate_lner(gold_data, pred_data, text_data):
         results_per_fold[f"fold_{fold}"] = avg_f1
 
     print("Strict macro-F1 on L-NER Dataset:", results_per_fold)
-    return results_per_fold
+    return {"strict mF1": sum(results_per_fold.values())/len(results_per_fold)}
 
 
 def evaluate_rr(gold_data, pred_data):
@@ -187,7 +188,7 @@ def evaluate_rr(gold_data, pred_data):
 
     f1 = f1_score(all_gold_labels, all_pred_labels, average="macro")
     print(f"Macro-F1 on combined test set:", f1)
-    return f1
+    return {"mF1": f1}
 
 
 def evaluate_lsi(gold_data, pred_data):
@@ -210,7 +211,7 @@ def evaluate_lsi(gold_data, pred_data):
 
     f1 = f1_score(gold_matrix, pred_matrix, average="macro")
     print("Macro-F1 on ILSI test set:", f1)
-    return f1
+    return {"mF1": f1}
 
 
 def evaluate_pcr(gold_data, pred_data):
@@ -240,7 +241,7 @@ def evaluate_pcr(gold_data, pred_data):
 
     max_f1 = max(f1_scores)
     index_max = f1_scores.index(max_f1) + 1
-    return f"{max_f1:.2f}@{index_max}"
+    return {"muF1@K": f"{max_f1:.2f}@{index_max}"}
 
 
 def evaluate_summ(gold_data, pred_data):
@@ -262,7 +263,7 @@ def evaluate_summ(gold_data, pred_data):
 
     _, _, bs = bert_score.score(pred_summaries, gold_summaries, lang="en", verbose=True, device='cuda')
     print("BERTSCORE:", bs.mean().item())
-    return {'ROUGE': rl_scores['rouge-l']['f'], 'BERTSCORE': bs.mean().item()}
+    return {'ROUGE-L': rl_scores['rouge-l']['f'], 'BERTSCORE': bs.mean().item()}
 
 
 def evaluate_lmt(gold_data, pred_data):
@@ -323,12 +324,12 @@ def create_output_json(evaluation_results):
             "ROUGE-L": evaluation_results["cjpe"]["ROUGE-L"],
             "BLEU": evaluation_results["cjpe"]["BLEU"],
         },
-        "BAIL": {"mF1": evaluation_results["bail"]},
-        "LSI": {"mF1": evaluation_results["lsi"]},
-        "PCR": {"muF1@K": evaluation_results["pcr"]},
+        "BAIL": {"mF1": evaluation_results["bail"]["mF1"]},
+        "LSI": {"mF1": evaluation_results["lsi"]["mF1"]},
+        "PCR": {"muF1@K": evaluation_results["pcr"]["muF1@K"]},
         "SUMM": {
             "ROUGE-L": evaluation_results["summ"]["ROUGE-L"],
-            "BERTSCORE": "-", # Placeholder BERTSCORE
+            "BERTSCORE": evaluation_results["summ"]["BERTSCORE"] #"-", # Placeholder BERTSCORE
         },
         "L-MT": {
             "BLEU": evaluation_results["lmt"]["BLEU"],
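For context, a minimal sketch of how a caller might consume the new return contract: after this commit every evaluate_* function returns a dict keyed by its metric name, and create_output_json indexes into those dicts. The run_all_tasks wrapper, the gold/pred/text dicts keyed by task name, and the assumption that create_output_json returns the assembled leaderboard dict are illustrative assumptions, not part of this commit.

# Hypothetical driver (not part of this commit): the names gold, pred, text
# and their per-task keys are assumptions for illustration only.
from eval_utils import (
    evaluate_bail, evaluate_cjpe, evaluate_lner, evaluate_rr,
    evaluate_lsi, evaluate_pcr, evaluate_summ, evaluate_lmt,
    create_output_json,
)

def run_all_tasks(gold, pred, text):
    # Every evaluator now returns a dict keyed by metric name, which is
    # exactly the shape create_output_json indexes into after this commit.
    evaluation_results = {
        "bail": evaluate_bail(gold["bail"], pred["bail"]),                # {"mF1": ...}
        "cjpe": evaluate_cjpe(gold["cjpe"], pred["cjpe"]),                # dict with "mF1", "ROUGE-L", "BLEU"
        "lner": evaluate_lner(gold["lner"], pred["lner"], text["lner"]),  # {"strict mF1": ...}
        "rr": evaluate_rr(gold["rr"], pred["rr"]),                        # {"mF1": ...}
        "lsi": evaluate_lsi(gold["lsi"], pred["lsi"]),                    # {"mF1": ...}
        "pcr": evaluate_pcr(gold["pcr"], pred["pcr"]),                    # {"muF1@K": "0.42@3"}
        "summ": evaluate_summ(gold["summ"], pred["summ"]),                # dict with "ROUGE-L", "BERTSCORE"
        "lmt": evaluate_lmt(gold["lmt"], pred["lmt"]),                    # at least {"BLEU": ...}
    }
    # Assumed to assemble the leaderboard entry from the per-task dicts.
    return create_output_json(evaluation_results)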