narcolepticchicken
/

occ-stack

narcolepticchicken committed on 24 days ago

Commit

b4d00e5

verified ·

1 Parent(s): 30c4069

Upload benchmarks/benchmark_retrieval_qa.py

Files changed (1) hide show

benchmarks/benchmark_retrieval_qa.py CHANGED Viewed

@@ -407,6 +407,8 @@ class RetrievalQABenchmark:
         n = len(results)
         correct = sum(1 for r in results if r["correct"])
         abstained = sum(1 for r in results if r.get("abstained", False))
         correct_abstentions = sum(
             1 for i in unanswerable_qs if results[i].get("abstained", False)
         )

         n = len(results)
         correct = sum(1 for r in results if r["correct"])
         abstained = sum(1 for r in results if r.get("abstained", False))
+        # Count abstentions properly
+        unanswerable_qs = [i for i, r in enumerate(results) if self.questions[i].is_unanswerable]
         correct_abstentions = sum(
             1 for i in unanswerable_qs if results[i].get("abstained", False)
         )