shounakpaul95 committed
Commit: 647c84c
Parent: 25db2bc

Update eval_utils.py

Files changed (1): eval_utils.py (+12, -11)
eval_utils.py CHANGED
@@ -37,7 +37,7 @@ def evaluate_bail(gold_data, pred_data):
 
     f1 = f1_score(gold_labels, pred_labels, average="macro")
     print("Macro-F1 on HLDC-all-districts test set:", f1)
-    return f1
+    return {"mF1": f1}
 
 def get_BLEU_score(ref_text_all, machine_text_all):
     sc_all = []
@@ -89,7 +89,8 @@ def evaluate_cjpe(gold_data, pred_data):
         }
     }
     print("Explanability for ILDC Expert:", explanation_result)
-    return {**prediction_result, **explanation_result}
+    #return {**prediction_result, **explanation_result}
+    return {"mF1": f1, "ROUGE-L": rouge_score, "BLEU": bleu_score}
 
 def span2bio(txt, roles):
     roles = sorted(roles, key = lambda x:x['start'])
@@ -161,7 +162,7 @@ def evaluate_lner(gold_data, pred_data, text_data):
         results_per_fold[f"fold_{fold}"] = avg_f1
 
     print("Strict macro-F1 on L-NER Dataset:", results_per_fold)
-    return results_per_fold
+    return {"strict mF1": sum(results_per_fold.values())/len(results_per_fold)}
 
 
 def evaluate_rr(gold_data, pred_data):
@@ -187,7 +188,7 @@ def evaluate_rr(gold_data, pred_data):
 
     f1 = f1_score(all_gold_labels, all_pred_labels, average="macro")
     print(f"Macro-F1 on combined test set:", f1)
-    return f1
+    return {"mF1": f1}
 
 
 def evaluate_lsi(gold_data, pred_data):
@@ -210,7 +211,7 @@ def evaluate_lsi(gold_data, pred_data):
 
     f1 = f1_score(gold_matrix, pred_matrix, average="macro")
     print("Macro-F1 on ILSI test set:", f1)
-    return f1
+    return {"mF1": f1}
 
 
 def evaluate_pcr(gold_data, pred_data):
@@ -240,7 +241,7 @@ def evaluate_pcr(gold_data, pred_data):
 
     max_f1 = max(f1_scores)
     index_max = f1_scores.index(max_f1) + 1
-    return f"{max_f1:.2f}@{index_max}"
+    return {"muF1@K": f"{max_f1:.2f}@{index_max}"}
 
 
 def evaluate_summ(gold_data, pred_data):
@@ -262,7 +263,7 @@ def evaluate_summ(gold_data, pred_data):
 
     _, _, bs = bert_score.score(pred_summaries, gold_summaries, lang="en", verbose=True, device='cuda')
     print("BERTSCORE:", bs.mean().item())
-    return {'ROUGE': rl_scores['rouge-l']['f'], 'BERTSCORE': bs.mean().item()}
+    return {'ROUGE-L': rl_scores['rouge-l']['f'], 'BERTSCORE': bs.mean().item()}
 
 
 def evaluate_lmt(gold_data, pred_data):
@@ -323,12 +324,12 @@ def create_output_json(evaluation_results):
             "ROUGE-L": evaluation_results["cjpe"]["ROUGE-L"],
             "BLEU": evaluation_results["cjpe"]["BLEU"],
         },
-        "BAIL": {"mF1": evaluation_results["bail"]},
-        "LSI": {"mF1": evaluation_results["lsi"]},
-        "PCR": {"muF1@K": evaluation_results["pcr"]},
+        "BAIL": {"mF1": evaluation_results["bail"]["mF1"]},
+        "LSI": {"mF1": evaluation_results["lsi"]["mF1"]},
+        "PCR": {"muF1@K": evaluation_results["pcr"]["muF1@K"]},
         "SUMM": {
             "ROUGE-L": evaluation_results["summ"]["ROUGE-L"],
-            "BERTSCORE": "-", # Placeholder BERTSCORE
+            "BERTSCORE": evaluation_results["summ"]["BERTSCORE"] #"-", # Placeholder BERTSCORE
         },
         "L-MT": {
            "BLEU": evaluation_results["lmt"]["BLEU"],