shounakpaul95 committed on
Commit cf29d86
1 Parent(s): 59f79a1

Upload eval_utils.py

Files changed (1):
  1. eval_utils.py +104 -191
eval_utils.py CHANGED
@@ -3,157 +3,19 @@ import re
 from collections import defaultdict
 
 import evaluate
-
-# import nltk
 import numpy as np
 from nervaluate import Evaluator
-from rouge_score import rouge_scorer
 from sacrebleu.metrics import BLEU, CHRF
 from sklearn.metrics import f1_score
 from tqdm import tqdm
 from transformers import AutoTokenizer
-
-from transformers import AutoTokenizer
-import re
 import string
 
-
-class TF_Tokenizer:
-    def __init__(self, model_str):
-        tok = AutoTokenizer.from_pretrained(model_str)
-
-    def __call__(self, txt):
-        return self.tok.tokenize(txt)
-
-
-class WS_Tokenizer:
-    def __init__(self):
-        pass
-
-    def __call__(self, txt):
-        return re.findall(r"[{}]|\w+".format(string.punctuation), txt)
-
-
-def convert_spans_to_bio(txt, roles, tokenizer_func):
-    roles = sorted(roles, key=lambda x: x["start"])
-    roles_left = [r["start"] for r in roles]
-
-    ttxt = tokenizer_func(txt)
-
-    c = 0
-    cr = -1
-    prev = "O"
-    troles = []
-    for tok in ttxt:
-        if c >= len(txt):
-            break
-
-        while txt[c] == " ":
-            c += 1
-
-        else:
-            if c in roles_left:  # Start of a new role
-                ind = roles_left.index(c)
-                cr = roles[ind]["end"]
-                prev = "I-" + roles[ind]["label"]
-                troles.append("B-" + roles[ind]["label"])
-            else:
-                if c < cr:  # Assign previous role
-                    troles.append(prev)
-                else:  # Assign 'O'
-                    troles.append("O")
-
-        c += len(tok)
-
-    if len(ttxt) != len(troles):
-        troles += ["O"] * (len(ttxt) - len(troles))
-
-    assert len(ttxt) == len(troles)
-    return troles
-
-
-def convert_bio_to_spans(txt, troles, tokenizer_func):
-    c = 0
-    c2 = 0
-    cr = -1
-    cs = -1
-    prev = "O"
-
-    roles = []
-    ttxt = tokenizer_func(txt)
-
-    if len(ttxt) != len(troles):
-        ttxt = ttxt[: len(troles)]
-
-    for j, tok in enumerate(ttxt):
-        if c >= len(txt):
-            break
-
-        while c < len(txt) and txt[c].isspace():
-            c += 1
-
-        if tok[:2] == "##" or tok == "[UNK]":
-            c += len(tok) - 2 if tok[:2] == "##" else 1
-        else:
-            if troles[j].startswith("B-"):
-                if cs >= cr:
-                    cr = c
-                    if cs >= 0:
-                        roles.append({"start": cs, "end": c2, "label": prev})
-                cs = c
-                prev = troles[j][2:]
-            else:
-                if troles[j] == "O":
-                    if cs >= cr:
-                        cr = c
-                        if cs >= 0:
-                            roles.append({"start": cs, "end": c2, "label": prev})
-            c += len(tok)
-        c2 = c
-
-    if cs >= cr:
-        if cs >= 0:
-            roles.append({"start": cs, "end": c2, "label": prev})
-
-    return roles
-
-
-def span2bio(txt, labels):
-    roles = sorted(labels, key=lambda x: x["label"])
-    roles_left = [r["start"] for r in roles]
-
-    ttxt = re.findall(r"[{}]|\w+".format(string.punctuation), txt)
-
-    c = 0
-    cr = -1
-    prev = "O"
-    troles = []
-    for tok in ttxt:
-        if c >= len(txt):
-            break
-
-        while txt[c] == " ":
-            c += 1
-
-        else:
-            if c in roles_left:  # Start of a new role
-                ind = roles_left.index(c)
-                cr = roles[ind]["end"]
-                prev = "I-" + roles[ind]["label"]
-                troles.append("B-" + roles[ind]["label"])
-            else:
-                if c < cr:  # Assign previous role
-                    troles.append(prev)
-                else:  # Assign 'O'
-                    troles.append("O")
-
-        c += len(tok)
-
-    if len(ttxt) != len(troles):
-        troles += ["O"] * (len(ttxt) - len(troles))
-
-    assert len(ttxt) == len(troles)
-    return ttxt, troles
 
 
 def load_json(file_path):
@@ -176,9 +38,18 @@ def evaluate_bail(gold_data, pred_data):
 
     f1 = f1_score(gold_labels, pred_labels, average="macro")
     print("Macro-F1 on HLDC-all-districts test set:", f1)
 
-    return f"{f1:.2f}"
-
 
 def evaluate_cjpe(gold_data, pred_data):
     # Evaluate prediction
@@ -191,48 +62,76 @@ def evaluate_cjpe(gold_data, pred_data):
     f1 = f1_score(gold_labels, pred_labels, average="macro")
     prediction_result = {"cjpe-eval": f1}
 
-    # Evaluate explanation
-    rouge = evaluate.load("rouge")
-    bleu = evaluate.load("bleu")
-
-    gold_explanations = [exp["expert_1"] for exp in gold_data["explanation"].values()]
-    pred_explanations = [exp["expert_1"] for exp in pred_data["explanation"].values()]
-
-    rouge_scores = rouge.compute(
-        predictions=pred_explanations, references=gold_explanations
-    )
-    bleu_score = bleu.compute(
-        predictions=pred_explanations, references=gold_explanations
-    )
 
     explanation_result = {
         "cjpe-exp-eval": {
-            "rouge": [rouge_scores],
-            "bleu": [bleu_score],
         }
     }
-
     return {**prediction_result, **explanation_result}
 
 
 def evaluate_lner(gold_data, pred_data, text_data):
-    labels = [
-        "APP",
-        "RESP",
-        "A.COUNSEL",
-        "R.COUNSEL",
-        "JUDGE",
-        "WIT",
-        "AUTH",
-        "COURT",
-        "STAT",
-        "PREC",
-        "DATE",
-        "CASENO",
-    ]
 
     results_per_fold = {}
-    for fold in range(1, 4):
         gold = gold_data[f"fold_{fold}"]
         pred = pred_data[f"fold_{fold}"]
         text = text_data[f"fold_{fold}"]
@@ -251,6 +150,7 @@ def evaluate_lner(gold_data, pred_data, text_data):
             pred_labels.append(pred_bio)
 
         evaluator = Evaluator(gold_labels, pred_labels, tags=labels, loader="list")
         results, results_per_tag, _, _ = evaluator.evaluate()
 
         f1_scores = [results_per_tag[l]["strict"]["f1"] for l in results_per_tag]
@@ -258,22 +158,34 @@ def evaluate_lner(gold_data, pred_data, text_data):
         print(f"Strict Macro-F1 on Fold {fold}:", avg_f1)
         results_per_fold[f"fold_{fold}"] = avg_f1
 
-    return {"strict mF1": f"{np.mean(list(results_per_fold.values()))}:.2f"}
 
 
 def evaluate_rr(gold_data, pred_data):
     all_gold_labels = []
     all_pred_labels = []
 
     for id, gold_labels in gold_data.items():
         pred_labels = pred_data.get(id, ["None"] * len(gold_labels))
-        all_gold_labels.extend(gold_labels)
-        all_pred_labels.extend(pred_labels)
-
-    mf1 = f1_score(all_gold_labels, all_pred_labels, average="macro")
-    print(f"Macro-F1 on combined test set:", mf1)
-
-    return {"mF1": f"{mf1:.2f}"}
 
 
 def evaluate_lsi(gold_data, pred_data):
@@ -324,7 +236,7 @@ def evaluate_pcr(gold_data, pred_data):
 
         print(f"Micro-F1@{k} on IL-PCR test set:", f1)
 
-    return np.mean(f1_scores)
 
 
 def evaluate_summ(gold_data, pred_data):
@@ -339,11 +251,12 @@ def evaluate_summ(gold_data, pred_data):
         gold_summaries.append(gold_summary)
         pred_summaries.append(pred_summary)
 
-    rouge = evaluate.load("rouge")
-    rouge_scores = rouge.compute(predictions=pred_summaries, references=gold_summaries)
-    print("Rouge-L:", rouge_scores)
 
-    return {"ROUGE-L": rouge_scores, "BERTSCORE": "-"}
 
 
 def evaluate_lmt(gold_data, pred_data):
@@ -423,8 +336,8 @@ def create_output_json(evaluation_results):
 def main():
     # gold_data = load_json("IL_TUR_eval_gold.json")
     # pred_data = load_json("IL_TUR_eval_submission2.json")
-    gold_data = load_json("submissions/baseline/IL_TUR_eval_gold_small.json")
-    pred_data = load_json("submissions/baseline/IL_TUR_eval_submission_small.json")
     pred_data = gold_data
     evaluation_results = {}
 
 
@@ -3,157 +3,19 @@ import re
 from collections import defaultdict
 
 import evaluate
+import nltk
 import numpy as np
 from nervaluate import Evaluator
+# from rouge_score import rouge_scorer
 from sacrebleu.metrics import BLEU, CHRF
 from sklearn.metrics import f1_score
 from tqdm import tqdm
 from transformers import AutoTokenizer
+import rouge
+import bert_score
 import string
 
+from ner_helpers import span2bio
 
 
 def load_json(file_path):
 
@@ -176,9 +38,18 @@ def evaluate_bail(gold_data, pred_data):
 
     f1 = f1_score(gold_labels, pred_labels, average="macro")
     print("Macro-F1 on HLDC-all-districts test set:", f1)
+    return f1
 
+def get_BLEU_score(ref_text_all, machine_text_all):
+    sc_all = []
+    for i in range(len(ref_text_all)):
+        ref_text = ref_text_all[i]
+        machine_text = machine_text_all[i]
+        tok_ref_text = nltk.word_tokenize(ref_text)
+        tok_machine_text = nltk.word_tokenize(machine_text)
+        sc = nltk.translate.bleu_score.sentence_bleu([tok_ref_text], tok_machine_text, weights = (0.5,0.5))
+        sc_all.append(sc)
+    return sum(sc_all)/len(sc_all)
 
 def evaluate_cjpe(gold_data, pred_data):
     # Evaluate prediction
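The get_BLEU_score helper added above averages sentence-level BLEU with equal unigram/bigram weights over all reference/prediction pairs. A minimal usage sketch, assuming nltk is installed with its punkt tokenizer data (the example strings are invented):

    import nltk
    from nltk.translate.bleu_score import sentence_bleu

    nltk.download("punkt", quiet=True)  # newer nltk releases may also need "punkt_tab"

    refs = ["the bail application is allowed", "the appeal is dismissed"]
    preds = ["bail application is allowed", "the appeal stands dismissed"]

    # Mirrors the helper: BLEU-2 style weighting, averaged over the corpus.
    scores = [
        sentence_bleu([nltk.word_tokenize(r)], nltk.word_tokenize(p), weights=(0.5, 0.5))
        for r, p in zip(refs, preds)
    ]
    print(sum(scores) / len(scores))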
 
@@ -191,48 +62,76 @@ def evaluate_cjpe(gold_data, pred_data):
     f1 = f1_score(gold_labels, pred_labels, average="macro")
     prediction_result = {"cjpe-eval": f1}
 
+    R = []
+    B = []
+    rl_evaluator = rouge.Rouge(metrics=['rouge-l'], max_n=2, limit_length=False, apply_avg=True)
+    for x in range(1, 6):
+        gold_explanations = []
+        pred_explanations = []
+        for k,v in gold_data['explanation'].items():
+            gold_explanations.append(v[f'expert_{x}'])
+            pred_explanations.append(pred_data['explanation'][k])
+        rougex = rl_evaluator.get_scores(pred_explanations, gold_explanations)['rouge-l']['f']
+        bleux = get_BLEU_score(gold_explanations, pred_explanations)
+        R.append(rougex)
+        B.append(bleux)
+
+    rouge_score = sum(R)/len(R)
+    bleu_score = sum(B)/len(B)
 
     explanation_result = {
         "cjpe-exp-eval": {
+            "rouge": rouge_score,
+            "bleu": bleu_score,
         }
     }
+    print("Macro-F1 on ILDC test:", prediction_result)
+    print("Explanability for ILDC Expert:", explanation_result)
     return {**prediction_result, **explanation_result}
 
+def span2bio(txt, roles):
+    roles = sorted(roles, key = lambda x:x['start'])
+    roles_left = [r['start'] for r in roles]
+
+    ttxt = re.findall(r'[{}]|\w+'.format(string.punctuation), txt)
+
+    c = 0
+    cr = -1
+    prev = 'O'
+    troles = []
+    for tok in ttxt:
+        if c >= len(txt):
+            break
+
+        while txt[c] == ' ':
+            c += 1
+
+        else:
+            if c in roles_left: # Start of a new role
+                ind = roles_left.index(c)
+                cr = roles[ind]['end']
+                prev = 'I-' + roles[ind]['label']
+                troles.append('B-' + roles[ind]['label'])
+            else:
+                if c < cr: # Assign previous role
+                    troles.append(prev)
+                else: # Assign 'O'
+                    troles.append('O')
+
+        c += len(tok)
+
+    if len(ttxt) != len(troles):
+        troles += ['O'] * (len(ttxt) - len(troles))
+
+    assert len(ttxt) == len(troles)
+    return ttxt, troles
 
 def evaluate_lner(gold_data, pred_data, text_data):
+    with open("ner_labels.txt") as f:
+        labels = f.read().strip().split("\n")
 
     results_per_fold = {}
+    for fold in range(1, len(gold_data) + 1):
         gold = gold_data[f"fold_{fold}"]
         pred = pred_data[f"fold_{fold}"]
         text = text_data[f"fold_{fold}"]
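The span2bio helper defined in this version (and also exposed via ner_helpers) converts character-offset span annotations into whitespace/punctuation tokens with BIO tags. A small illustrative call, using a made-up sentence and span and assuming eval_utils and its dependencies are importable:

    from eval_utils import span2bio

    text = "Ram went to Delhi"
    spans = [{"start": 12, "end": 17, "label": "LOC"}]  # the span covering "Delhi"

    tokens, tags = span2bio(text, spans)
    print(tokens)  # ['Ram', 'went', 'to', 'Delhi']
    print(tags)    # ['O', 'O', 'O', 'B-LOC']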
 
@@ -251,6 +150,7 @@ def evaluate_lner(gold_data, pred_data, text_data):
             pred_labels.append(pred_bio)
 
         evaluator = Evaluator(gold_labels, pred_labels, tags=labels, loader="list")
+
        results, results_per_tag, _, _ = evaluator.evaluate()
 
         f1_scores = [results_per_tag[l]["strict"]["f1"] for l in results_per_tag]
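For context, the strict span-level F1 above comes from nervaluate's Evaluator with loader="list", which takes parallel lists of BIO tag sequences plus the entity tags to score. A toy sketch, assuming a nervaluate version with the tuple-returning evaluate() API used in this file (the extended unpacking tolerates versions that return two or four values):

    from nervaluate import Evaluator

    gold = [["B-JUDGE", "I-JUDGE", "O"], ["O", "B-COURT", "O"]]
    pred = [["B-JUDGE", "I-JUDGE", "O"], ["O", "O", "O"]]

    evaluator = Evaluator(gold, pred, tags=["JUDGE", "COURT"], loader="list")
    results, results_per_tag, *_ = evaluator.evaluate()

    print(results["strict"]["f1"])                    # overall strict F1
    print(results_per_tag["COURT"]["strict"]["f1"])   # per-tag strict F1, as averaged above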
 
@@ -258,22 +158,34 @@ def evaluate_lner(gold_data, pred_data, text_data):
         print(f"Strict Macro-F1 on Fold {fold}:", avg_f1)
         results_per_fold[f"fold_{fold}"] = avg_f1
 
+    print("Strict macro-F1 on L-NER Dataset:", results_per_fold)
+    return results_per_fold
 
 
 def evaluate_rr(gold_data, pred_data):
     all_gold_labels = []
     all_pred_labels = []
+    with open("rr_label_vocab.json") as f:
+        label_vocab = json.load(f)
+
 
     for id, gold_labels in gold_data.items():
         pred_labels = pred_data.get(id, ["None"] * len(gold_labels))
+        for i in range(len(gold_labels)):
+            g = gold_labels[i]
+            p = pred_labels[i]
+            if g not in label_vocab: continue
+            for pp in p.split():
+                if pp in label_vocab:
+                    p = pp
+                    break
+            if p not in label_vocab: continue
+            all_gold_labels.append([label_vocab[g]])
+            all_pred_labels.append([label_vocab[p]])
+
+    f1 = f1_score(all_gold_labels, all_pred_labels, average="macro")
+    print(f"Macro-F1 on combined test set:", f1)
+    return f1
 
 
 def evaluate_lsi(gold_data, pred_data):
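The reworked evaluate_rr maps gold and predicted rhetorical-role strings through rr_label_vocab.json, taking the first in-vocabulary word of each prediction and skipping sentences whose labels fall outside the vocabulary. A toy illustration of that normalisation with a hypothetical three-entry vocabulary (the real mapping ships with the repository):

    label_vocab = {"FAC": 0, "ARG": 1, "RATIO": 2}  # hypothetical subset of rr_label_vocab.json

    gold_labels = ["FAC", "RATIO", "OBITER"]
    pred_labels = ["the label is FAC", "ARG", "RATIO"]

    gold_ids, pred_ids = [], []
    for g, p in zip(gold_labels, pred_labels):
        if g not in label_vocab:      # gold label outside the vocabulary: skip the sentence
            continue
        for token in p.split():       # keep the first in-vocabulary word of the prediction
            if token in label_vocab:
                p = token
                break
        if p not in label_vocab:      # prediction never hit the vocabulary: skip
            continue
        gold_ids.append(label_vocab[g])
        pred_ids.append(label_vocab[p])

    print(gold_ids, pred_ids)  # [0, 2] [0, 1]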
 
@@ -324,7 +236,7 @@ def evaluate_pcr(gold_data, pred_data):
 
         print(f"Micro-F1@{k} on IL-PCR test set:", f1)
 
+    return f1_scores
 
 
 def evaluate_summ(gold_data, pred_data):
 
@@ -339,11 +251,12 @@ def evaluate_summ(gold_data, pred_data):
         gold_summaries.append(gold_summary)
         pred_summaries.append(pred_summary)
 
+    rl_evaluator = rouge.Rouge(metrics=['rouge-n','rouge-l'], max_n=2, limit_length=False, apply_avg=True)
+    rl_scores = rl_evaluator.get_scores(pred_summaries, gold_summaries)
 
+    _, _, bs = bert_score.score(pred_summaries, gold_summaries, lang="en", verbose=True, device='cuda')
+    print("Rouge:", {k:v['f'] for k,v in rl_scores.items()}, "BERTSCORE:", bs.mean().item())
+    return {'ROUGE': rl_scores['rouge-l']['f'], 'BERTSCORE': bs.mean().item()}
 
 
 def evaluate_lmt(gold_data, pred_data):
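The summarisation metrics now come from the py-rouge package (imported as rouge) and bert_score instead of the HF evaluate wrappers. A short sketch with invented one-sentence summaries; it assumes both packages are installed, downloads a scoring model on first use, and runs on CPU where the committed code passes device='cuda':

    import rouge
    import bert_score

    preds = ["The court allowed the appeal and set aside the order."]
    golds = ["The appeal was allowed and the impugned order was set aside."]

    rl = rouge.Rouge(metrics=["rouge-n", "rouge-l"], max_n=2, limit_length=False, apply_avg=True)
    rl_scores = rl.get_scores(preds, golds)
    print({k: v["f"] for k, v in rl_scores.items()})  # rouge-1 / rouge-2 / rouge-l F-scores

    P, R, F = bert_score.score(preds, golds, lang="en", verbose=False, device="cpu")
    print(F.mean().item())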
 
@@ -423,8 +336,8 @@ def create_output_json(evaluation_results):
 def main():
     # gold_data = load_json("IL_TUR_eval_gold.json")
     # pred_data = load_json("IL_TUR_eval_submission2.json")
+    gold_data = load_json("submissions/baseline/IL_TUR_eval_gold.json")
+    pred_data = load_json("submissions/baseline/IL_TUR_eval_submission_dummy.json")
     pred_data = gold_data
     evaluation_results = {}
 