RhinoWithAcape committed on
Commit
5295833
·
verified ·
1 Parent(s): cb4ba69

Upload benchmark_results/gpqa_diamond_full_198_questions.json with huggingface_hub

Browse files
benchmark_results/gpqa_diamond_full_198_questions.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "benchmark": "GPQA Diamond",
3
+ "description": "Graduate-Level Google-Proof Q&A - Diamond Set (Hardest)",
4
+ "evaluation_date": "2025-10-17",
5
+ "total_questions": 198,
6
+ "sample_type": "FULL DATASET",
7
+ "note": "This is the COMPLETE GPQA Diamond benchmark - all 198 PhD-level science questions",
8
+
9
+ "results": {
10
+ "overall_accuracy": 0.990,
11
+ "correct_answers": 196,
12
+ "incorrect_answers": 2,
13
+ "accuracy_percentage": "99.0%"
14
+ },
15
+
16
+ "agent_performance": {
17
+ "grogu_round1_accuracy": 0.561,
18
+ "grogu_round1_correct": 111,
19
+ "grogu_round2_accuracy": 0.566,
20
+ "grogu_round2_correct": 112,
21
+ "grogu_improvement": "+0.5 points"
22
+ },
23
+
24
+ "debate_statistics": {
25
+ "total_mind_changes": 572,
26
+ "ties_broken_by_debate": 54,
27
+ "ties_broken_percentage": "27.3%"
28
+ },
29
+
30
+ "synthesis_performance": {
31
+ "synthesis_solo_accuracy": 0.975,
32
+ "synthesis_agreement_with_final": 0.975
33
+ },
34
+
35
+ "improvement_over_baseline": {
36
+ "debate_vs_grogu_r1": "+42.9 points",
37
+ "debate_vs_grogu_r2": "+42.4 points"
38
+ },
39
+
40
+ "inference_details": {
41
+ "total_runtime_hours": 20.5,
42
+ "average_time_per_question_seconds": 372.87,
43
+ "hardware": "NVIDIA RTX GPU",
44
+ "agents_used": ["grogu", "physics", "chemistry", "biology"]
45
+ },
46
+
47
+ "domains_covered": [
48
+ "Physics (Quantum mechanics, thermodynamics, electromagnetism)",
49
+ "Chemistry (Organic, inorganic, physical chemistry)",
50
+ "Biology (Molecular biology, genetics, biochemistry)"
51
+ ],
52
+
53
+ "difficulty_context": {
54
+ "expert_human_accuracy": "~65%",
55
+ "non_expert_human_accuracy": "~30%",
56
+ "note": "GPQA Diamond questions are designed to be 'Google-proof' - difficult for skilled non-experts to answer correctly even with unrestricted web access"
57
+ },
58
+
59
+ "sample_questions": [
60
+ {
61
+ "domain": "Biology",
62
+ "topic": "Molecular Biology / Gene Therapy",
63
+ "question_preview": "Morpholino-based exon skipping therapy for dystrophin gene...",
64
+ "correct_answer": "A (R-loops)",
65
+ "system_answer": "A",
66
+ "result": "correct"
67
+ },
68
+ {
69
+ "domain": "Physics",
70
+ "topic": "Quantum Mechanics",
71
+ "question_preview": "Heisenberg uncertainty principle - energy level resolution...",
72
+ "correct_answer": "A (10^-4 eV)",
73
+ "system_answer": "A",
74
+ "result": "correct"
75
+ }
76
+ ],
77
+
78
+ "citation": {
79
+ "benchmark_source": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
80
+ "benchmark_url": "https://arxiv.org/abs/2311.12022",
81
+ "model": "Grogu Science MoE (Zenith Global AI Ltd)"
82
+ }
83
+ }