abhinav-joshi committed
Commit: e1043c6
Parent(s): d1ca5fe
add prediction submission
- eval_utils.py +435 -0
- evaluation_results.json +38 -0
- labels.txt +12 -0
- lner-text.json +0 -0
- lsi_label_vocab.json +102 -0
- ner_helpers.py +141 -0
- requirements.txt +2 -1
- submissions/baseline/IL_TUR_eval_gold_small.json +0 -0
- submissions/baseline/IL_TUR_eval_submission_small.json +0 -0
- uploads.py +19 -3
eval_utils.py
ADDED
@@ -0,0 +1,435 @@
# Evaluation utilities for the IL-TUR leaderboard: per-task scorers and helpers
# that turn a predictions-format submission into leaderboard-format scores.
import json
import re
from collections import defaultdict

import evaluate
import nltk
import numpy as np
from nervaluate import Evaluator
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU, CHRF
from sklearn.metrics import f1_score
from tqdm import tqdm
from transformers import AutoTokenizer

from ner_helpers import span2bio


def load_json(file_path):
    with open(file_path, "r") as f:
        return json.load(f)


def get_micro_at_k(gold, pred, k):
    gold_set = set(gold)
    pred_set = set(pred[:k])
    return len(gold_set & pred_set), len(gold_set), len(pred_set)


def evaluate_bail(gold_data, pred_data):
    gold_labels = []
    pred_labels = []
    for id, label in gold_data.items():
        gold_labels.append(label)
        pred_labels.append(pred_data.get(id, 0))

    f1 = f1_score(gold_labels, pred_labels, average="macro")
    print("Macro-F1 on HLDC-all-districts test set:", f1)

    return f"{f1:.2f}"


def evaluate_cjpe(gold_data, pred_data):
    # Evaluate prediction
    gold_labels = []
    pred_labels = []
    for id, label in gold_data["prediction"].items():
        gold_labels.append(label)
        pred_labels.append(pred_data["prediction"].get(id, 0))

    f1 = f1_score(gold_labels, pred_labels, average="macro")
    prediction_result = {"cjpe-eval": f1}

    # Evaluate explanation
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")

    gold_explanations = [exp["expert_1"] for exp in gold_data["explanation"].values()]
    pred_explanations = [exp["expert_1"] for exp in pred_data["explanation"].values()]

    rouge_scores = rouge.compute(
        predictions=pred_explanations, references=gold_explanations
    )
    bleu_score = bleu.compute(
        predictions=pred_explanations, references=gold_explanations
    )

    explanation_result = {
        "cjpe-exp-eval": {
            "rouge": [rouge_scores],
            "bleu": [bleu_score],
        }
    }

    return {**prediction_result, **explanation_result}


def evaluate_lner(gold_data, pred_data, text_data):
    with open("labels.txt") as f:
        labels = f.read().strip().split("\n")

    results_per_fold = {}
    for fold in range(1, 4):
        gold = gold_data[f"fold_{fold}"]
        pred = pred_data[f"fold_{fold}"]
        text = text_data[f"fold_{fold}"]

        texts, gold_labels, pred_labels = [], [], []

        for id, gold_label in tqdm(gold.items()):
            txt = text[id]
            pred_label = pred.get(id, [])

            txt_seg, gold_bio = span2bio(txt, gold_label)
            _, pred_bio = span2bio(txt, pred_label)

            texts.append(txt_seg)
            gold_labels.append(gold_bio)
            pred_labels.append(pred_bio)

        evaluator = Evaluator(gold_labels, pred_labels, tags=labels, loader="list")
        results, results_per_tag, _, _ = evaluator.evaluate()

        f1_scores = [results_per_tag[l]["strict"]["f1"] for l in results_per_tag]
        avg_f1 = sum(f1_scores) / len(f1_scores)
        print(f"Strict Macro-F1 on Fold {fold}:", avg_f1)
        results_per_fold[f"fold_{fold}"] = avg_f1

    return {"strict mF1": f"{np.mean(list(results_per_fold.values())):.2f}"}


def evaluate_rr(gold_data, pred_data):
    all_gold_labels = []
    all_pred_labels = []

    for id, gold_labels in gold_data.items():
        pred_labels = pred_data.get(id, ["None"] * len(gold_labels))
        all_gold_labels.extend(gold_labels)
        all_pred_labels.extend(pred_labels)

    mf1 = f1_score(all_gold_labels, all_pred_labels, average="macro")
    print("Macro-F1 on combined test set:", mf1)

    return {"mF1": f"{mf1:.2f}"}


def evaluate_lsi(gold_data, pred_data):
    with open("lsi_label_vocab.json") as f:
        label_vocab = json.load(f)

    gold_matrix = np.zeros((len(gold_data), len(label_vocab)))
    pred_matrix = np.zeros((len(gold_data), len(label_vocab)))

    for i, (id, gold_labels) in enumerate(gold_data.items()):
        pred_labels = pred_data.get(id, [])

        for label in gold_labels:
            if label in label_vocab:
                gold_matrix[i, label_vocab[label]] = 1

        for label in pred_labels:
            if label in label_vocab:
                pred_matrix[i, label_vocab[label]] = 1

    f1 = f1_score(gold_matrix, pred_matrix, average="macro")
    print("Macro-F1 on ILSI test set:", f1)
    return f1


def evaluate_pcr(gold_data, pred_data):
    f1_scores = []
    for k in range(1, 21):
        correct, gold_total, pred_total = 0, 0, 0
        for id, gold_candidates in gold_data.items():
            pred_candidates = pred_data.get(id, [])
            gold_candidates = [c for c in gold_candidates if c != id]
            pred_candidates = [c for c in pred_candidates if c != id]

            c, g, p = get_micro_at_k(gold_candidates, pred_candidates, k)
            correct += c
            gold_total += g
            pred_total += p

        precision = correct / pred_total if pred_total > 0 else 0
        recall = correct / gold_total if gold_total > 0 else 0
        f1 = (
            2 * precision * recall / (precision + recall)
            if precision + recall > 0
            else 0
        )
        f1_scores.append(f1)

        print(f"Micro-F1@{k} on IL-PCR test set:", f1)

    return np.mean(f1_scores)


def evaluate_summ(gold_data, pred_data):
    gold_summaries = []
    pred_summaries = []

    for id, gold_summary in gold_data.items():
        if id in pred_data:
            gold_summary = re.sub(r"\s+", " ", gold_summary.replace("\n", " ")).strip()
            pred_summary = re.sub(r"\s+", " ", pred_data[id].replace("\n", " ")).strip()

            gold_summaries.append(gold_summary)
            pred_summaries.append(pred_summary)

    rouge = evaluate.load("rouge")
    rouge_scores = rouge.compute(predictions=pred_summaries, references=gold_summaries)
    print("Rouge-L:", rouge_scores)

    return {"ROUGE-L": rouge_scores, "BERTSCORE": "-"}


def evaluate_lmt(gold_data, pred_data):
    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
    bleu = BLEU()
    chrfp = CHRF(word_order=2)
    gleu = evaluate.load("google_bleu")

    G = defaultdict(lambda: defaultdict(list))
    P = defaultdict(lambda: defaultdict(list))

    for dataset in gold_data:
        for id, gold_text in gold_data[dataset].items():
            lang = id.split("/")[1].strip()
            gold_tokens = " ".join(tokenizer.tokenize(gold_text))
            pred_tokens = " ".join(tokenizer.tokenize(pred_data[dataset][id]))
            G[dataset][lang].append(gold_tokens)
            P[dataset][lang].append(pred_tokens)

    bleu_scores, chrfpp_scores, gleu_scores = [], [], []

    for dataset in G:
        print("Dataset", dataset)
        dataset_bleu, dataset_chrfpp, dataset_gleu = [], [], []

        for lang in G[dataset]:
            gold = G[dataset][lang]
            pred = P[dataset][lang]

            bleu_score = bleu.corpus_score(pred, [gold]).score
            chrfpp_score = chrfp.corpus_score(pred, [gold]).score
            gleu_score = gleu.compute(predictions=pred, references=gold)["google_bleu"]

            dataset_bleu.append(bleu_score)
            dataset_chrfpp.append(chrfpp_score)
            dataset_gleu.append(gleu_score)

        bleu_scores.append(sum(dataset_bleu) / len(dataset_bleu))
        chrfpp_scores.append(sum(dataset_chrfpp) / len(dataset_chrfpp))
        gleu_scores.append(sum(dataset_gleu) / len(dataset_gleu))

    return {
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "GLEU": sum(gleu_scores) / len(gleu_scores),
        "chrF++": sum(chrfpp_scores) / len(chrfpp_scores),
    }


def create_output_json(evaluation_results):
    output = {
        "Method": "GPT-5 (2-shot)",
        "Submitted By": "IL-TUR",
        "Github Link": "dummy submission",
        "L-NER": {"strict mF1": evaluation_results["lner"]["strict mF1"]},
        "RR": {"mF1": evaluation_results["rr"]["mF1"]},
        "CJPE": {
            "mF1": evaluation_results["cjpe"]["mF1"],
            "ROUGE-L": evaluation_results["cjpe"]["ROUGE-L"],
            "BLEU": evaluation_results["cjpe"]["BLEU"],
        },
        "BAIL": {"mF1": evaluation_results["bail"]},
        "LSI": {"mF1": evaluation_results["lsi"]},
        "PCR": {"muF1@K": evaluation_results["pcr"]},
        "SUMM": {
            "ROUGE-L": evaluation_results["summ"]["ROUGE-L"],
            "BERTSCORE": "-",  # Placeholder BERTSCORE
        },
        "L-MT": {
            "BLEU": evaluation_results["lmt"]["BLEU"],
            "GLEU": evaluation_results["lmt"]["GLEU"],
            "chrF++": evaluation_results["lmt"]["chrF++"],
        },
    }
    return [output]  # Wrap in a list to match the desired format


def main():
    # gold_data = load_json("IL_TUR_eval_gold.json")
    # pred_data = load_json("IL_TUR_eval_submission2.json")
    gold_data = load_json("submissions/baseline/IL_TUR_eval_gold_small.json")
    pred_data = load_json("submissions/baseline/IL_TUR_eval_submission_small.json")
    pred_data = gold_data  # NOTE: overrides the loaded predictions with the gold data
    evaluation_results = {}

    for task in pred_data.keys():
        print(f"Task: {task}")

        if task == "bail":
            evaluation_results[task] = evaluate_bail(gold_data[task], pred_data[task])
        elif task == "cjpe":
            evaluation_results.update(evaluate_cjpe(gold_data[task], pred_data[task]))
        elif task == "lner":
            text_data = load_json("lner-text.json")
            evaluation_results[task] = evaluate_lner(
                gold_data[task], pred_data[task], text_data
            )
        elif task == "rr":
            evaluation_results[task] = evaluate_rr(gold_data[task], pred_data[task])
        elif task == "lsi":
            evaluation_results[task] = evaluate_lsi(gold_data[task], pred_data[task])
        elif task == "pcr":
            evaluation_results[task] = evaluate_pcr(gold_data[task], pred_data[task])
        elif task == "summ":
            evaluation_results[task] = evaluate_summ(gold_data[task], pred_data[task])
        elif task == "lmt":
            evaluation_results[task] = evaluate_lmt(gold_data[task], pred_data[task])

    # convert the evaluation results to the required format
    for task, result in evaluation_results.items():
        if isinstance(result, dict):
            for subtask, subresult in result.items():
                if isinstance(subresult, dict):
                    for subsubtask, subsubresult in subresult.items():
                        evaluation_results[task][subtask][
                            subsubtask
                        ] = f"{subsubresult:.2f}"
                else:
                    if isinstance(subresult, str):
                        evaluation_results[task][subtask] = subresult
                    else:
                        evaluation_results[task][subtask] = f"{subresult:.2f}"
        else:
            if isinstance(result, str):
                evaluation_results[task] = result
            else:
                evaluation_results[task] = f"{result:.2f}"

    blank_scores = {
        "lner": {"strict mF1": "-"},
        "rr": {"mF1": "-"},
        "cjpe": {"mF1": "-", "ROUGE-L": "-", "BLEU": "-"},
        "bail": {"mF1": "-"},
        "lsi": {"mF1": "-"},
        "pcr": {"muF1@K": "-"},
        "summ": {"ROUGE-L": "-", "BERTSCORE": "-"},
        "lmt": {"BLEU": "-", "GLEU": "-", "chrF++": "-"},
    }

    print("--------------------------Evaluation Summary--------------------------")
    for task, result in evaluation_results.items():
        print(f"{task}: {result}")
    print("---------------------------------------------------------------------")

    # for tasks that were not present in the submission, add blank scores
    for task in gold_data.keys():
        if task not in pred_data:
            evaluation_results[task] = blank_scores[task]

    # Generate the output JSON
    output_json = create_output_json(evaluation_results)
    with open("evaluation_results.json", "w") as f:
        json.dump(output_json, f, indent=2)
    print("Evaluation results saved to evaluation_results.json")


def get_evaluation_scores(gold_data, submission_data):
    evaluation_results = {}

    for task in submission_data.keys():
        print(f"Task: {task}")

        if task == "bail":
            evaluation_results[task] = evaluate_bail(
                gold_data[task], submission_data[task]
            )
        elif task == "cjpe":
            evaluation_results.update(
                evaluate_cjpe(gold_data[task], submission_data[task])
            )
        elif task == "lner":
            text_data = load_json("lner-text.json")
            evaluation_results[task] = evaluate_lner(
                gold_data[task], submission_data[task], text_data
            )
        elif task == "rr":
            evaluation_results[task] = evaluate_rr(
                gold_data[task], submission_data[task]
            )
        elif task == "lsi":
            evaluation_results[task] = evaluate_lsi(
                gold_data[task], submission_data[task]
            )
        elif task == "pcr":
            evaluation_results[task] = evaluate_pcr(
                gold_data[task], submission_data[task]
            )
        elif task == "summ":
            evaluation_results[task] = evaluate_summ(
                gold_data[task], submission_data[task]
            )
        elif task == "lmt":
            evaluation_results[task] = evaluate_lmt(
                gold_data[task], submission_data[task]
            )

    # convert the evaluation results to the required format
    for task, result in evaluation_results.items():
        if isinstance(result, dict):
            for subtask, subresult in result.items():
                if isinstance(subresult, dict):
                    for subsubtask, subsubresult in subresult.items():
                        evaluation_results[task][subtask][
                            subsubtask
                        ] = f"{subsubresult:.2f}"
                else:
                    if isinstance(subresult, str):
                        evaluation_results[task][subtask] = subresult
                    else:
                        evaluation_results[task][subtask] = f"{subresult:.2f}"
        else:
            if isinstance(result, str):
                evaluation_results[task] = result
            else:
                evaluation_results[task] = f"{result:.2f}"

    blank_scores = {
        "lner": {"strict mF1": "-"},
        "rr": {"mF1": "-"},
        "cjpe": {"mF1": "-", "ROUGE-L": "-", "BLEU": "-"},
        "bail": {"mF1": "-"},
        "lsi": {"mF1": "-"},
        "pcr": {"muF1@K": "-"},
        "summ": {"ROUGE-L": "-", "BERTSCORE": "-"},
        "lmt": {"BLEU": "-", "GLEU": "-", "chrF++": "-"},
    }

    # for tasks that were not present in the submission, add blank scores
    for task in gold_data.keys():
        if task not in submission_data:
            evaluation_results[task] = blank_scores[task]

    print("--------------------------Evaluation Summary--------------------------")
    for task, result in evaluation_results.items():
        print(f"{task}: {result}")
    print("---------------------------------------------------------------------")
    output_json = create_output_json(evaluation_results)

    return output_json


if __name__ == "__main__":
    main()
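
Usage sketch (editor-added, not part of the commit): how a predictions-format submission could be scored locally with get_evaluation_scores above. The single-task submission built here is hypothetical, and it assumes the small gold file shipped with this commit contains an entry for every task key ("bail", "cjpe", "lner", "rr", "lsi", "pcr", "summ", "lmt").

import json
from eval_utils import get_evaluation_scores

with open("submissions/baseline/IL_TUR_eval_gold_small.json") as f:
    gold_data = json.load(f)

# Hypothetical bail submission: predict label 0 for every case id in the gold split.
submission_data = {"bail": {case_id: 0 for case_id in gold_data["bail"]}}

# Tasks missing from the submission fall back to the blank "-" scores.
rows = get_evaluation_scores(gold_data, submission_data)
print(json.dumps(rows, indent=2))
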
evaluation_results.json
ADDED
@@ -0,0 +1,38 @@
[
  {
    "Method": "GPT-5 (2-shot)",
    "Submitted By": "IL-TUR",
    "Github Link": "dummy submission",
    "L-NER": {
      "strict mF1": "-"
    },
    "RR": {
      "mF1": {
        "mF1": "0.10"
      }
    },
    "CJPE": {
      "mF1": "-",
      "ROUGE-L": "-",
      "BLEU": "-"
    },
    "BAIL": {
      "mF1": "0.02"
    },
    "LSI": {
      "mF1": "0.26"
    },
    "PCR": {
      "muF1@K": "0.63"
    },
    "SUMM": {
      "ROUGE-L": "-",
      "BERTSCORE": "-"
    },
    "L-MT": {
      "BLEU": "-",
      "GLEU": "-",
      "chrF++": "-"
    }
  }
]
labels.txt
ADDED
@@ -0,0 +1,12 @@
APP
RESP
A.COUNSEL
R.COUNSEL
JUDGE
WIT
AUTH
COURT
STAT
PREC
DATE
CASENO
lner-text.json
ADDED
The diff for this file is too large to render.
See raw diff
lsi_label_vocab.json
ADDED
@@ -0,0 +1,102 @@
{
  "Section 2": 0,
  "Section 3": 1,
  "Section 4": 2,
  "Section 5": 3,
  "Section 13": 4,
  "Section 34": 5,
  "Section 107": 6,
  "Section 109": 7,
  "Section 114": 8,
  "Section 120": 9,
  "Section 120B": 10,
  "Section 143": 11,
  "Section 147": 12,
  "Section 148": 13,
  "Section 149": 14,
  "Section 155": 15,
  "Section 156": 16,
  "Section 161": 17,
  "Section 164": 18,
  "Section 173": 19,
  "Section 174A": 20,
  "Section 186": 21,
  "Section 188": 22,
  "Section 190": 23,
  "Section 193": 24,
  "Section 200": 25,
  "Section 201": 26,
  "Section 228": 27,
  "Section 229A": 28,
  "Section 279": 29,
  "Section 294": 30,
  "Section 294(b)": 31,
  "Section 299": 32,
  "Section 300": 33,
  "Section 302": 34,
  "Section 304": 35,
  "Section 304A": 36,
  "Section 304B": 37,
  "Section 306": 38,
  "Section 307": 39,
  "Section 308": 40,
  "Section 313": 41,
  "Section 320": 42,
  "Section 323": 43,
  "Section 324": 44,
  "Section 325": 45,
  "Section 326": 46,
  "Section 332": 47,
  "Section 336": 48,
  "Section 337": 49,
  "Section 338": 50,
  "Section 341": 51,
  "Section 342": 52,
  "Section 353": 53,
  "Section 354": 54,
  "Section 363": 55,
  "Section 364": 56,
  "Section 365": 57,
  "Section 366": 58,
  "Section 366A": 59,
  "Section 375": 60,
  "Section 376": 61,
  "Section 376(2)": 62,
  "Section 379": 63,
  "Section 380": 64,
  "Section 384": 65,
  "Section 389": 66,
  "Section 392": 67,
  "Section 394": 68,
  "Section 395": 69,
  "Section 397": 70,
  "Section 406": 71,
  "Section 409": 72,
  "Section 411": 73,
  "Section 415": 74,
  "Section 417": 75,
  "Section 419": 76,
  "Section 420": 77,
  "Section 427": 78,
  "Section 436": 79,
  "Section 437": 80,
  "Section 438": 81,
  "Section 447": 82,
  "Section 448": 83,
  "Section 450": 84,
  "Section 452": 85,
  "Section 457": 86,
  "Section 465": 87,
  "Section 467": 88,
  "Section 468": 89,
  "Section 471": 90,
  "Section 482": 91,
  "Section 494": 92,
  "Section 498": 93,
  "Section 498A": 94,
  "Section 500": 95,
  "Section 504": 96,
  "Section 506": 97,
  "Section 509": 98,
  "Section 511": 99
}
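
Illustration (editor-added, not part of the commit): evaluate_lsi in eval_utils.py uses this vocabulary to turn each case's statute list into a 100-dimensional binary indicator vector. The two-statute label list below is made up.

import json
import numpy as np

with open("lsi_label_vocab.json") as f:
    vocab = json.load(f)

labels = ["Section 302", "Section 34"]  # hypothetical label list for one case
row = np.zeros(len(vocab))
for lab in labels:
    if lab in vocab:
        row[vocab[lab]] = 1  # "Section 302" -> index 34, "Section 34" -> index 5
print(int(row.sum()))  # 2
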
ner_helpers.py
ADDED
@@ -0,0 +1,141 @@
# Helpers for converting between character-level span annotations and
# token-level BIO tags (used by the L-NER evaluation in eval_utils.py).
from transformers import AutoTokenizer
import re
import string


class TF_Tokenizer:
    def __init__(self, model_str):
        self.tok = AutoTokenizer.from_pretrained(model_str)

    def __call__(self, txt):
        return self.tok.tokenize(txt)


class WS_Tokenizer:
    def __init__(self):
        pass

    def __call__(self, txt):
        return re.findall(r"[{}]|\w+".format(string.punctuation), txt)


def convert_spans_to_bio(txt, roles, tokenizer_func):
    roles = sorted(roles, key=lambda x: x["start"])
    roles_left = [r["start"] for r in roles]

    ttxt = tokenizer_func(txt)

    c = 0
    cr = -1
    prev = "O"
    troles = []
    for tok in ttxt:
        if c >= len(txt):
            break

        while txt[c] == " ":
            c += 1

        else:  # runs once the whitespace skip finishes
            if c in roles_left:  # Start of a new role
                ind = roles_left.index(c)
                cr = roles[ind]["end"]
                prev = "I-" + roles[ind]["label"]
                troles.append("B-" + roles[ind]["label"])
            else:
                if c < cr:  # Assign previous role
                    troles.append(prev)
                else:  # Assign 'O'
                    troles.append("O")

        c += len(tok)

    if len(ttxt) != len(troles):
        troles += ["O"] * (len(ttxt) - len(troles))

    assert len(ttxt) == len(troles)
    return troles


def convert_bio_to_spans(txt, troles, tokenizer_func):
    c = 0
    c2 = 0
    cr = -1
    cs = -1
    prev = "O"

    roles = []
    ttxt = tokenizer_func(txt)

    if len(ttxt) != len(troles):
        ttxt = ttxt[: len(troles)]

    for j, tok in enumerate(ttxt):
        if c >= len(txt):
            break

        while c < len(txt) and txt[c].isspace():
            c += 1

        if tok[:2] == "##" or tok == "[UNK]":
            c += len(tok) - 2 if tok[:2] == "##" else 1
        else:
            if troles[j].startswith("B-"):
                if cs >= cr:
                    cr = c
                    if cs >= 0:
                        roles.append({"start": cs, "end": c2, "label": prev})
                cs = c
                prev = troles[j][2:]
            else:
                if troles[j] == "O":
                    if cs >= cr:
                        cr = c
                        if cs >= 0:
                            roles.append({"start": cs, "end": c2, "label": prev})
            c += len(tok)
        c2 = c

    if cs >= cr:
        if cs >= 0:
            roles.append({"start": cs, "end": c2, "label": prev})

    return roles


def span2bio(txt, labels):
    roles = sorted(labels, key=lambda x: x["label"])
    roles_left = [r["start"] for r in roles]

    ttxt = re.findall(r"[{}]|\w+".format(string.punctuation), txt)

    c = 0
    cr = -1
    prev = "O"
    troles = []
    for tok in ttxt:
        if c >= len(txt):
            break

        while txt[c] == " ":
            c += 1

        else:  # runs once the whitespace skip finishes
            if c in roles_left:  # Start of a new role
                ind = roles_left.index(c)
                cr = roles[ind]["end"]
                prev = "I-" + roles[ind]["label"]
                troles.append("B-" + roles[ind]["label"])
            else:
                if c < cr:  # Assign previous role
                    troles.append(prev)
                else:  # Assign 'O'
                    troles.append("O")

        c += len(tok)

    if len(ttxt) != len(troles):
        troles += ["O"] * (len(ttxt) - len(troles))

    assert len(ttxt) == len(troles)
    return ttxt, troles
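
Usage sketch (editor-added, not part of the commit): span2bio maps character-level span annotations ({"start", "end", "label"} dicts, as consumed by the functions above) to one BIO tag per punctuation/word token. The sentence and span below are made up; "JUDGE" is one of the labels in labels.txt.

from ner_helpers import span2bio

text = "Judge A . Kumar heard the appeal"
spans = [{"start": 0, "end": 15, "label": "JUDGE"}]  # covers "Judge A . Kumar"

tokens, bio_tags = span2bio(text, spans)
print(list(zip(tokens, bio_tags)))
# -> B-JUDGE, I-JUDGE, I-JUDGE, I-JUDGE for the span tokens, then O for the rest
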
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ gradio
 huggingface-hub==0.18.0
 numpy==1.24.2
 APScheduler==3.10.1
-pandas==1.3.4
+pandas==1.3.4
+nervaluate==0.2.0
submissions/baseline/IL_TUR_eval_gold_small.json
ADDED
The diff for this file is too large to render.
See raw diff
submissions/baseline/IL_TUR_eval_submission_small.json
ADDED
The diff for this file is too large to render.
See raw diff
uploads.py
CHANGED
@@ -6,7 +6,11 @@ import json
 import pandas as pd
 import gradio as gr
 
+from eval_utils import get_evaluation_scores
+
+
 LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
+SUBMISSION_FORMAT = "predictions"
 # RESULTS_PATH = "Exploration-Lab/IL-TUR-Leaderboard-results"
 TOKEN = os.environ.get("TOKEN", None)
 YEAR_VERSION = "2024"
@@ -93,9 +97,21 @@ def add_new_eval(
     # upload the df to spaces
     import io
 
-
-
-
+    if SUBMISSION_FORMAT == "predictions":
+        # read the submission json file
+        with open(path_to_file, "r") as f:
+            submission_data = json.load(f)
+
+        # read the gold json file
+        with open("submissions/baseline/IL_TUR_eval_gold_small.json", "r") as f:
+            gold_data = json.load(f)
+
+        submission = get_evaluation_scores(gold_data, submission_data)
+
+    else:
+        # read the submission json file
+        with open(path_to_file, "r") as f:
+            submission = json.load(f)
 
     with open("submissions/baseline/results.json", "r") as f:
         results = json.load(f)