IL-TUR-Leaderboard

Running

shounakpaul95 committed on Jul 8

Commit

52aa0e7

•

1 Parent(s): c00c6b5

Update eval_utils.py

Files changed (1) hide show

eval_utils.py CHANGED Viewed

@@ -65,16 +65,19 @@ def evaluate_cjpe(gold_data, pred_data):
     R = []
     B = []
     rl_evaluator = rouge.Rouge(metrics=['rouge-l'], max_n=2, limit_length=False, apply_avg=True)
-    for x in tqdm(range(1, 6), desc="cjpe explanation expert-wise"):
         gold_explanations = []
         pred_explanations = []
         for k,v in gold_data['explanation'].items():
             gold_explanations.append(v[f'expert_{x}'])
             pred_explanations.append(pred_data['explanation'][k])
         rougex = rl_evaluator.get_scores(pred_explanations, gold_explanations)['rouge-l']['f']
         bleux = get_BLEU_score(gold_explanations, pred_explanations)
         R.append(rougex)
         B.append(bleux)
     rouge_score = sum(R)/len(R)
     bleu_score = sum(B)/len(B)
@@ -214,7 +217,7 @@ def evaluate_pcr(gold_data, pred_data):
     f1_scores = []
     for k in range(1, 21):
         correct, gold_total, pred_total = 0, 0, 0
-        for id, gold_candidates in gold_data.items():
             pred_candidates = pred_data.get(id, [])
             gold_candidates = [c for c in gold_candidates if c != id]
             pred_candidates = [c for c in pred_candidates if c != id]

     R = []
     B = []
     rl_evaluator = rouge.Rouge(metrics=['rouge-l'], max_n=2, limit_length=False, apply_avg=True)
+    for x in range(1, 6):
         gold_explanations = []
         pred_explanations = []
         for k,v in gold_data['explanation'].items():
             gold_explanations.append(v[f'expert_{x}'])
             pred_explanations.append(pred_data['explanation'][k])
+        print("Metrics for expert", x, "...", end=' ')
         rougex = rl_evaluator.get_scores(pred_explanations, gold_explanations)['rouge-l']['f']
         bleux = get_BLEU_score(gold_explanations, pred_explanations)
         R.append(rougex)
         B.append(bleux)
+        print("Done.")
     rouge_score = sum(R)/len(R)
     bleu_score = sum(B)/len(B)
     f1_scores = []
     for k in range(1, 21):
         correct, gold_total, pred_total = 0, 0, 0
+        for id, gold_candidates in tqdm(gold_data.items(), desc="pcr"):
             pred_candidates = pred_data.get(id, [])
             gold_candidates = [c for c in gold_candidates if c != id]
             pred_candidates = [c for c in pred_candidates if c != id]