narcolepticchicken committed on
Commit
9b4c8e8
·
verified ·
1 Parent(s): ae2b06a

Upload jobs/run_eval_standalone.py

Browse files
Files changed (1) hide show
  1. jobs/run_eval_standalone.py +511 -0
jobs/run_eval_standalone.py ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Self-contained evaluation runner for OCC stack.
3
+ Includes all core classes inline + all simulated benchmarks + ablations + anti-gaming.
4
+ Runs on CPU. Outputs JSON report.
5
+ """
6
+ import json
7
+ import random
8
+ import sys
9
+ import time
10
+ from dataclasses import dataclass, field
11
+ from enum import Enum
12
+ from pathlib import Path
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ import numpy as np
16
+
17
+ # --- CORE CLASSES (INLINE) ---
18
+
19
@dataclass
class OracleResult:
    """Score record returned by ``ImpactOracle.score`` for a single action."""

    # Weighted score before the final compute-cost adjustment.
    raw_score: float
    # ``raw_score`` minus ``compute_cost * compute_penalty_rate``.
    cost_adjusted_score: float
    # Confidence reported by (or defaulted for) the agent.
    confidence: float
    # Mode-specific supporting data; may be empty.
    evidence: Dict[str, Any]
    # Short human-readable summary of how the score was produced.
    reason: str
    # Labels such as "hallucination" or "spam"; each instance gets its own list.
    failure_tags: List[str] = field(default_factory=list)
    # Value intended to be fed back as a reward signal.
    reward_value: float = 0.0
28
+
29
+
30
class ImpactOracle:
    """Convert a task outcome into an ``OracleResult``.

    Three modes are supported: ``"code"``, ``"retrieval_qa"`` and ``"debate"``.
    Any other mode produces a zero score tagged ``"unknown_mode"``.
    """

    def __init__(self, compute_penalty_rate=0.0001, calibration_weight=0.2,
                 abstention_bonus=1.0, hallucination_penalty=2.0,
                 confident_wrong_penalty=3.0, gaming_penalty=2.0,
                 code_weights=None, qa_weights=None, debate_weights=None):
        """Store the penalty rates and the per-mode weight tables.

        A weight table that is ``None`` (or empty) is replaced by the built-in
        default for that mode.
        """
        self.code_weights = code_weights or {"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001}
        self.qa_weights = qa_weights or {"correctness": 1.0, "evidence_support": 0.5, "calibration": 0.2, "abstention_utility": 1.0, "hallucination_penalty": 2.0, "confident_wrong_penalty": 3.0}
        self.debate_weights = debate_weights or {"decision_quality": 1.0, "influence_efficiency": 0.5, "throughput": 0.3, "marginal_contribution": 0.5}
        self.compute_penalty_rate = compute_penalty_rate
        self.calibration_weight = calibration_weight
        self.abstention_bonus = abstention_bonus
        self.hallucination_penalty = hallucination_penalty
        self.confident_wrong_penalty = confident_wrong_penalty
        self.gaming_penalty = gaming_penalty

    def score(self, mode, action, context, result, agent_id=""):
        """Dispatch to the scorer for ``mode`` and return its ``OracleResult``."""
        if mode == "code":
            return self._score_code(action, context, result, agent_id)
        elif mode == "retrieval_qa":
            return self._score_qa(action, context, result, agent_id)
        elif mode == "debate":
            return self._score_debate(action, context, result, agent_id)
        return OracleResult(0.0, 0.0, 0.0, {}, f"unknown mode {mode}", ["unknown_mode"])

    def _score_code(self, action, context, result, agent_id):
        """Score a code attempt described by the ``result`` dict.

        Passing public tests while failing hidden tests is tagged
        ``"gaming_hidden_tests"`` and costs ``gaming_penalty``.
        """
        correctness = result.get("correctness", 0.0)
        pass_at_k = result.get("pass_at_k", 0.0)
        regression = result.get("regression", False)
        compute_cost = result.get("compute_cost", 0.0)
        # Both pass flags fall back to correctness when the caller omits them.
        hidden_pass = result.get("hidden_tests_pass", correctness)
        public_pass = result.get("public_pass", correctness)
        tags = []
        if public_pass and not hidden_pass:
            tags.append("gaming_hidden_tests")
        raw = (correctness * self.code_weights["correctness"] +
               pass_at_k * self.code_weights["pass_at_k"] +
               (self.code_weights["regression"] if regression else 0.0) -
               compute_cost * self.code_weights.get("compute_penalty", self.compute_penalty_rate))
        if "gaming_hidden_tests" in tags:
            raw -= self.gaming_penalty
        # Compute cost is charged a second time here, at compute_penalty_rate.
        cost_adj = raw - compute_cost * self.compute_penalty_rate
        return OracleResult(raw, cost_adj, result.get("confidence", correctness),
                            {"correctness": correctness}, f"corr={correctness:.2f}, cost={compute_cost}", tags, cost_adj)

    def _score_qa(self, action, context, result, agent_id):
        """Score a retrieval-QA answer, or an abstention.

        The gold answer is read from ``context["gold_answer"]``. An abstention
        earns ``+abstention_bonus`` when ``context["is_unanswerable"]`` is true
        and ``-abstention_bonus`` otherwise.
        """
        gold = context.get("gold_answer", "")
        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.5)
        compute_cost = result.get("compute_cost", 0.0)
        abstained = action.get("abstained", False)
        if abstained:
            correct_abstention = context.get("is_unanswerable", False)
            raw = self.abstention_bonus if correct_abstention else -self.abstention_bonus
            tags = ["correct_abstention" if correct_abstention else "wrong_abstention"]
            # For abstentions reward_value is the raw score, not the cost-adjusted one.
            return OracleResult(raw, raw - compute_cost * self.compute_penalty_rate, confidence, {}, f"abstain correct={correct_abstention}", tags, raw)
        correctness = self._fuzzy_match(answer, gold)
        evidence = result.get("evidence", {})
        entailment = evidence.get("entailment_score", 0.0)
        contradiction = evidence.get("contradiction_score", 0.0)
        hallucination = contradiction > 0.5
        confident_wrong = confidence > 0.8 and correctness < 0.5
        tags = []
        if hallucination:
            tags.append("hallucination")
        if confident_wrong:
            tags.append("confident_wrong")
        if compute_cost > 2000:
            tags.append("excessive_compute")
        if compute_cost > 500 and correctness < 0.5:
            tags.append("compute_waste")
        raw = (correctness * self.qa_weights["correctness"] +
               entailment * self.qa_weights.get("evidence_support", 0.5) -
               (self.hallucination_penalty if hallucination else 0.0) -
               (self.confident_wrong_penalty if confident_wrong else 0.0) -
               compute_cost * self.compute_penalty_rate)
        # Calibration bonus: largest when confidence equals correctness.
        brier = (confidence - correctness) ** 2
        raw += (1 - brier) * self.calibration_weight
        cost_adj = raw - compute_cost * self.compute_penalty_rate
        if compute_cost > 100 and raw < 0.5:
            cost_adj -= self.gaming_penalty * 0.5
        return OracleResult(raw, cost_adj, confidence, evidence, f"corr={correctness:.2f}, conf={confidence:.2f}", tags, cost_adj)

    def _score_debate(self, action, context, result, agent_id):
        """Score one debate contribution; spam and collusion flags are penalised."""
        decision_quality = result.get("decision_quality", 0.0)
        marginal = result.get("marginal_contribution", 0.0)
        tokens = result.get("tokens", 0)
        # Compute cost defaults to the token count when not given explicitly.
        compute_cost = result.get("compute_cost", tokens)
        spam = result.get("spam", False)
        collusion = result.get("collusion", False)
        tags = []
        if spam:
            tags.append("spam")
        if collusion:
            tags.append("collusion")
        if tokens > 5000:
            tags.append("verbose_waste")
        raw = (decision_quality * self.debate_weights["decision_quality"] +
               marginal * self.debate_weights["marginal_contribution"] +
               (1.0 / max(tokens, 1)) * self.debate_weights["influence_efficiency"] -
               compute_cost * self.compute_penalty_rate)
        if spam:
            raw -= self.gaming_penalty
        if collusion:
            raw -= self.gaming_penalty * 2
        cost_adj = raw - compute_cost * self.compute_penalty_rate
        return OracleResult(raw, cost_adj, result.get("confidence", 0.5),
                            {"marginal": marginal}, f"dq={decision_quality:.2f}, tokens={tokens}", tags, cost_adj)

    def _fuzzy_match(self, a, b):
        """Return 1.0 for an exact match, 0.5 for containment, else 0.0.

        Comparison ignores case and surrounding whitespace. Empty, ``None`` or
        whitespace-only inputs never match.
        """
        if not a or not b:
            return 0.0
        a, b = a.strip().lower(), b.strip().lower()
        # A whitespace-only string strips to '', which is a substring of
        # everything and would otherwise earn partial credit.
        if not a or not b:
            return 0.0
        if a == b:
            return 1.0
        return 0.5 if (a in b or b in a) else 0.0
133
+
134
+
135
@dataclass
class LedgerEntry:
    """One row in the credit ledger: an earn, a spend, or a decay event."""

    agent_id: str
    task_id: str
    action_id: str
    # Exactly one of the next three is non-zero for entries written by CreditLedger.
    earned_credit: float
    spent_credit: float
    decayed_credit: float
    # Balance of the agent in this scope after the entry was applied.
    remaining_credit: float
    reason: str
    oracle_score: float
    compute_cost: float
    # Wall-clock time (``time.time()``) at which the entry was recorded.
    timestamp: float
    capability_scope: str = "global"
140
+
141
+
142
class CreditLedger:
    """Per-agent, per-capability credit balances with an append-only history.

    Decay is applied once per call to ``earn``, ``spend`` or ``balance``; it is
    not scaled by elapsed time. Reading a balance therefore decays it and
    appends a decay entry.
    """

    def __init__(self, decay_lambda=0.05):
        """Create an empty ledger; ``decay_lambda`` is the fraction lost per decay step."""
        self.entries = []
        self.balances = {}
        self.decay_lambda = decay_lambda

    def earn(self, agent_id, task_id, action_id, amount, oracle_score, compute_cost, reason, capability_scope="global"):
        """Decay the current balance, then credit ``amount`` and record the entry."""
        now = time.time()
        self._apply_decay(agent_id, now, capability_scope)
        current = self._get(agent_id, capability_scope)
        new_bal = current + amount
        self.entries.append(LedgerEntry(agent_id, task_id, action_id, amount, 0.0, 0.0, new_bal, reason, oracle_score, compute_cost, now, capability_scope))
        self._set(agent_id, capability_scope, new_bal)

    def spend(self, agent_id, task_id, action_id, amount, capability_scope="global", reason="spend"):
        """Debit ``amount`` if the decayed balance covers it.

        Returns ``True`` on success. Returns ``False`` without touching the
        ledger for a negative amount, and ``False`` after decay when the
        balance is insufficient.
        """
        if amount < 0:
            # A negative debit would raise the balance, i.e. mint credit.
            return False
        now = time.time()
        self._apply_decay(agent_id, now, capability_scope)
        current = self._get(agent_id, capability_scope)
        if current < amount:
            return False
        new_bal = current - amount
        self.entries.append(LedgerEntry(agent_id, task_id, action_id, 0.0, amount, 0.0, new_bal, reason, 0.0, 0.0, now, capability_scope))
        self._set(agent_id, capability_scope, new_bal)
        return True

    def transfer(self, from_agent, to_agent, amount, capability_scope="global"):
        """Always refuse: credit is non-transferable."""
        return False  # non-transferable

    def balance(self, agent_id, capability_scope="global"):
        """Apply one decay step and return the resulting balance."""
        now = time.time()
        self._apply_decay(agent_id, now, capability_scope)
        return self._get(agent_id, capability_scope)

    def _get(self, agent_id, cap):
        """Return the stored balance, 0.0 for an unknown agent or scope."""
        return self.balances.get(agent_id, {}).get(cap, 0.0)

    def _set(self, agent_id, cap, val):
        """Store ``val`` as the balance for ``agent_id`` in scope ``cap``."""
        if agent_id not in self.balances:
            self.balances[agent_id] = {}
        self.balances[agent_id][cap] = val

    def _apply_decay(self, agent_id, now, cap):
        """Shrink a positive balance by ``decay_lambda`` and log the loss."""
        current = self._get(agent_id, cap)
        if current <= 0:
            return
        decayed = current * (1 - self.decay_lambda)
        if decayed < current:
            self.entries.append(LedgerEntry(agent_id, "decay", "decay", 0.0, 0.0, current - decayed, decayed, "credit_decay", 0.0, 0.0, now, cap))
            self._set(agent_id, cap, decayed)

    def detect_collusion(self, window=10):
        """Report the agents seen in the last ``window`` entries.

        Returns ``None`` when fewer than two distinct agents appear, or when
        ``window`` is not positive. Otherwise returns a dict with the sorted
        agent ids and the number of entries inspected.
        """
        # entries[-0:] would be the whole history, so guard non-positive windows.
        recent = self.entries[-window:] if window > 0 else []
        agents = {e.agent_id for e in recent}
        if len(agents) < 2:
            return None
        # Sorted so the report does not depend on set iteration order.
        return {"suspicious_agents": sorted(agents), "count": len(recent)}
195
+
196
+
197
class Decision(Enum):
    """Possible outcomes of a resource request."""

    ALLOW = "allow"
    DENY = "deny"
    REQUIRE_APPROVAL = "require_approval"
    DOWNGRADE = "downgrade"
    ESCALATE = "escalate"
    ASK_JUSTIFICATION = "ask_justification"
200
+
201
+
202
@dataclass
class ResourceDecision:
    """Verdict for one capability request, with the reason it was reached."""

    decision: "Decision"
    reason: str
    # The capability that was requested.
    capability: str
    # Suggested cheaper capability; only set for downgrade verdicts.
    downgrade_to: Optional[str] = None
205
+
206
+
207
class ResourceBroker:
    """Decide whether an agent may use a capability, based on its credit balance."""

    # Risk class per capability; unknown capabilities are treated as "medium".
    RESOURCE_RISK = {"model_call": "medium", "retrieval_call": "low", "verifier_call": "medium",
                     "debate_turn": "low", "file_write": "high", "shell_execute": "high",
                     "memory_write": "medium", "human_escalation": "high", "larger_model": "medium"}
    # Minimum balance needed for an outright ALLOW, per risk class.
    DEFAULT_THRESHOLDS = {"low": 0.5, "medium": 2.0, "high": 5.0}

    def __init__(self, thresholds=None, urgency_boost=0.5):
        """Use ``thresholds`` when given (and non-empty), else a copy of the defaults."""
        self.thresholds = thresholds or self.DEFAULT_THRESHOLDS.copy()
        self.urgency_boost = urgency_boost
        # agent_id -> number of balance-based denials so far.
        self.denial_history = {}

    def request(self, capability, agent_id, credit_balance, task_state=None, risk_score=0.0, gaming_flags=None):
        """Return a ``ResourceDecision`` for ``agent_id`` asking for ``capability``.

        Checks run in this order: gaming flags (deny), high-risk capability
        with ``risk_score > 0.7`` (require approval), balance at or above the
        urgency-adjusted threshold (allow), balance at or above half of it
        (downgrade for medium risk, otherwise ask for justification). Below
        that the request is denied, or escalated once the agent already has
        more than three recorded denials.
        """
        task_state = task_state or {}
        gaming_flags = gaming_flags or []
        risk_class = self.RESOURCE_RISK.get(capability, "medium")
        threshold = self.thresholds.get(risk_class, 2.0)
        urgency = task_state.get("urgency", 0.0)
        # Urgency lowers the bar, but never below 0.1.
        adjusted = max(0.1, threshold - urgency * self.urgency_boost)
        if gaming_flags:
            return ResourceDecision(Decision.DENY, f"Gaming: {gaming_flags}", capability)
        if risk_class == "high" and risk_score > 0.7:
            return ResourceDecision(Decision.REQUIRE_APPROVAL, f"High risk {risk_score:.2f}", capability)
        if credit_balance >= adjusted:
            return ResourceDecision(Decision.ALLOW, f"Balance {credit_balance:.2f} >= {adjusted:.2f}", capability)
        if credit_balance >= adjusted * 0.5:
            if risk_class == "medium":
                return ResourceDecision(Decision.DOWNGRADE, f"Downgrading from {capability}", capability, "retrieval_call")
            return ResourceDecision(Decision.ASK_JUSTIFICATION, "Justification required", capability)
        denials = self.denial_history.get(agent_id, 0)
        if denials > 3:
            # Escalations do not increment the denial counter.
            return ResourceDecision(Decision.ESCALATE, f"Denied {denials} times", capability)
        self.denial_history[agent_id] = denials + 1
        return ResourceDecision(Decision.DENY, f"Balance {credit_balance:.2f} < {adjusted:.2f}", capability)
240
+
241
+
242
+ # --- BENCHMARK SIMULATIONS (INLINE) ---
243
+
244
@dataclass
class CodeProblem:
    """A synthetic coding task; the difficulty fields are floats set by the generator."""

    task_id: str
    difficulty: float
    hidden_test_difficulty: float
    public_test_difficulty: float
247
+
248
+
249
class SimulatedCodeAgent:
    """Stochastic stand-in for a code-writing agent with a fixed cost per attempt."""

    def __init__(self, agent_id, pass_rate_easy=0.9, pass_rate_hard=0.3, hidden_test_falloff=0.15, cost_per_attempt=200):
        """Record the agent's pass rates, hidden-test falloff and per-attempt cost."""
        self.agent_id = agent_id
        self.pass_rate_easy = pass_rate_easy
        self.pass_rate_hard = pass_rate_hard
        self.hidden_test_falloff = hidden_test_falloff
        self.cost_per_attempt = cost_per_attempt
        self.attempts = 0

    def solve(self, problem):
        """Simulate one attempt at ``problem``.

        Draws two numbers from the global ``random`` generator: the first
        decides the public-test outcome, the second the hidden-test outcome.
        Returns a dict with ``public_pass``, ``hidden_pass`` and ``compute_cost``.
        """
        self.attempts += 1
        # Linear interpolation between the easy and hard pass rates.
        base_acc = self.pass_rate_easy * (1 - problem.difficulty) + self.pass_rate_hard * problem.difficulty
        public_pass = random.random() < base_acc
        # Hidden tests are harder: the pass probability drops, floored at zero.
        hidden_acc = max(0.0, base_acc - self.hidden_test_falloff * problem.hidden_test_difficulty)
        hidden_pass = random.random() < hidden_acc
        return {"public_pass": public_pass, "hidden_pass": hidden_pass, "compute_cost": self.cost_per_attempt}
265
+
266
+
267
def gen_code_problems(n, seed):
    """Return ``n`` random ``CodeProblem`` instances named ``task_0`` .. ``task_{n-1}``.

    Reseeds the global ``random`` and NumPy generators with ``seed``, so any
    later draw from those generators is also affected by this call.
    """
    random.seed(seed)
    np.random.seed(seed)
    problems = []
    for i in range(n):
        # Draw order matters for reproducibility: difficulty, hidden, public.
        difficulty = random.random()
        hidden_difficulty = random.random()
        public_difficulty = random.random()
        problems.append(CodeProblem(f"task_{i}", difficulty, hidden_difficulty, public_difficulty))
    return problems
270
+
271
+
272
def run_code_baseline(problems, agent):
    """Run ``agent`` once on every problem and summarise the outcome.

    Returns a dict with ``accuracy`` (share of public-test passes),
    ``total_compute`` and ``mean_compute``. An empty problem list yields zeros
    instead of dividing by zero.
    """
    results = [agent.solve(p) for p in problems]
    if not results:
        return {"accuracy": 0.0, "total_compute": 0, "mean_compute": 0.0}
    total = sum(r["compute_cost"] for r in results)
    passed = sum(1 for r in results if r["public_pass"])
    return {"accuracy": passed / len(results), "total_compute": total, "mean_compute": total / len(results)}
280
+
281
+
282
def run_code_occ(problems, agents, oracle, ledger, broker, max_attempts=3):
    """Solve each problem by trying agents from cheapest-per-quality upward.

    Every agent is first seeded with credit proportional to its mean pass
    rate. For each problem, agents are tried in ranked order until one passes
    the public tests, one passes the hidden tests, or ``max_attempts`` agents
    have been used. Each attempt is scored by ``oracle``; a positive raw score
    earns credit, anything else spends one credit.

    ``broker`` is accepted for signature parity with the other runners but is
    not consulted here.

    Returns a dict with ``accuracy``, ``total_compute``, ``mean_compute`` and
    ``mean_agents``; all zero for an empty problem list.
    """
    for a in agents:
        q = (a.pass_rate_easy + a.pass_rate_hard) / 2
        ledger.earn(a.agent_id, "seed", "seed", q * 20, 0.0, 0.0, "initial", "model_call")

    # The ranking depends only on static agent attributes, so compute it once.
    ranked = sorted(agents, key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2))

    total = 0
    results = []
    for p in problems:
        solved = False
        cost = 0
        used = []
        for agent in ranked:
            if solved or len(used) >= max_attempts:
                break
            r = agent.solve(p)
            cost += r["compute_cost"]
            total += r["compute_cost"]
            used.append(agent.agent_id)
            solved = r["public_pass"]
            hidden = r["hidden_pass"]
            # The oracle is charged the cumulative cost spent on this problem so far.
            ora = oracle.score("code", {"attempt": len(used)}, {},
                               {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
                                "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
                               agent_id=agent.agent_id)
            if ora.raw_score > 0:
                ledger.earn(agent.agent_id, p.task_id, "solve", ora.raw_score * 5, ora.raw_score, cost, "pass", "model_call")
            else:
                ledger.spend(agent.agent_id, p.task_id, "solve", 1.0, "model_call", "fail")
            if hidden:
                break
        results.append({"solved": solved, "cost": cost, "agents": used})

    if not results:
        return {"accuracy": 0.0, "total_compute": total, "mean_compute": 0.0, "mean_agents": 0.0}
    acc = sum(1 for r in results if r["solved"]) / len(results)
    return {"accuracy": acc, "total_compute": total, "mean_compute": total / len(results),
            "mean_agents": sum(len(r["agents"]) for r in results) / len(results)}
306
+
307
+
308
def create_qa_dataset(seed=42, n=50):
    """Build ``n`` synthetic QA items with ids ``q_0`` .. ``q_{n-1}``.

    Reseeds the global ``random`` generator with ``seed``. Each item is a dict
    with ``id``, ``question``, ``type``, ``answer``, ``evidence`` (1 to 3
    strings) and ``is_unanswerable`` (true only for type ``"unanswerable"``).
    """
    random.seed(seed)
    evidence_pool = ["alpha", "beta", "gamma", "delta"]
    question_types = ["answerable", "unanswerable", "misleading", "incomplete", "conflicting"]
    answers = ["paris", "42", "yes", "no", "tokyo"]
    questions = []
    for i in range(n):
        # Draw order is fixed (type, answer, evidence size, evidence) so a
        # given seed always yields the same dataset.
        q_type = random.choice(question_types)
        answer = random.choice(answers)
        evidence_count = random.randint(1, 3)
        evidence = random.sample(evidence_pool, k=evidence_count)
        questions.append({
            "id": f"q_{i}",
            "question": f"Q{i}",
            "type": q_type,
            "answer": answer,
            "evidence": evidence,
            "is_unanswerable": q_type == "unanswerable",
        })
    return questions
318
+
319
+
320
def run_qa_occ(dataset, agent_params, oracle, ledger, broker):
    """Run the simulated QA agent over ``dataset`` under broker and ledger control.

    The agent is seeded with 20 credits. For each item the broker is asked for
    a ``retrieval_call``; a denied item is skipped but still counts in the
    denominators. The agent answers an answerable item with the dataset answer
    with probability ``agent_params["acc"]`` and abstains otherwise. A positive
    raw oracle score counts as correct and earns credit; anything else spends
    0.5 credit.

    Returns a dict with ``accuracy``, ``total_compute`` and ``mean_compute``;
    all zero for an empty dataset.
    """
    total_compute = 0
    correct = 0
    ledger.earn("qa_agent", "seed", "seed", 20, 0.0, 0.0, "initial", "retrieval_call")
    for item in dataset:
        balance = ledger.balance("qa_agent", "retrieval_call")
        dec = broker.request("retrieval_call", "qa_agent", balance, task_state={"urgency": 0.5})
        if dec.decision == Decision.DENY:
            continue
        # Anything short of a full ALLOW gets the reduced token budget.
        tokens = 200 if dec.decision == Decision.ALLOW else 100
        total_compute += tokens
        should_answer = item["type"] != "unanswerable"
        ans = item["answer"] if (should_answer and random.random() < agent_params["acc"]) else None
        conf = 0.9 if ans else 0.3
        # The oracle reads the reference answer from "gold_answer", while
        # dataset items store it under "answer". Without this mapping every
        # answer is compared against "" and scored as confidently wrong.
        context = item if "gold_answer" in item else {**item, "gold_answer": item.get("answer", "")}
        ora = oracle.score("retrieval_qa", {"abstained": ans is None}, context,
                           {"answer": ans, "confidence": conf, "evidence": {}, "compute_cost": tokens}, "qa_agent")
        if ora.raw_score > 0:
            ledger.earn("qa_agent", item["id"], "ans", ora.raw_score * 3, ora.raw_score, tokens, "correct", "retrieval_call")
            correct += 1
        else:
            ledger.spend("qa_agent", item["id"], "ans", 0.5, "retrieval_call", "wrong")
    n_items = len(dataset)
    if n_items == 0:
        return {"accuracy": 0.0, "total_compute": total_compute, "mean_compute": 0.0}
    return {"accuracy": correct / n_items, "total_compute": total_compute, "mean_compute": total_compute / n_items}
341
+
342
+
343
def run_debate_occ(n_debates, n_agents, agent_configs, oracle, ledger, broker, seed=42):
    """Simulate ``n_debates`` majority-vote debates over the options A, B and C.

    Reseeds the global ``random`` generator with ``seed``. In each debate every
    configured agent asks the broker for a ``debate_turn``; denied agents sit
    out. Participants spend one credit and cast a vote. The plurality vote is
    the outcome, with ties going to the option that was voted for first.

    ``n_agents`` and ``oracle`` are accepted but not used by this function.

    Returns a dict with ``accuracy``, ``consensus_reached``, ``total_compute``
    and ``mean_compute``, each averaged over ``n_debates``; all zero when
    ``n_debates`` is 0.
    """
    random.seed(seed)
    options = ["A", "B", "C"]
    correct = 0
    total_compute = 0
    consensus = 0
    for _ in range(n_debates):
        truth = random.choice(options)
        agents = []
        for cfg in agent_configs:
            # Dishonest agents get a fresh random accuracy below 0.4 each debate.
            acc = cfg["acc"] if cfg["honest"] else random.random() * 0.4
            agents.append({"honest": cfg["honest"], "acc": acc, "id": cfg["id"], "tokens": cfg.get("tokens", 200)})
        votes = []
        for a in agents:
            balance = ledger.balance(a["id"], "debate_turn")
            dec = broker.request("debate_turn", a["id"], balance)
            if dec.decision == Decision.DENY:
                continue
            total_compute += a["tokens"]
            vote = truth if (a["honest"] and random.random() < a["acc"]) else random.choice(options)
            votes.append((a["id"], vote, a["honest"], a["acc"]))
            ledger.spend(a["id"], "debate", "turn", 1.0, "debate_turn", "participate")
        if not votes:
            continue
        # Tally in voting order. Dicts keep insertion order, so ties resolve the
        # same way on every run; iterating a set of strings would not, because
        # string hashing is randomised per process.
        tally = {}
        for _, vote, _, _ in votes:
            tally[vote] = tally.get(vote, 0) + 1
        final = max(tally, key=tally.get)
        if final == truth:
            correct += 1
            # NOTE(review): nesting reconstructed from flattened source; confirm
            # that honest voters are rewarded only when the outcome is correct.
            for vid, _, h, _ in votes:
                if h:
                    ledger.earn(vid, "debate", "consensus", 2.0, 1.0, 0, "consensus", "debate_turn")
        if len(tally) == 1:
            consensus += 1
    if n_debates == 0:
        return {"accuracy": 0.0, "consensus_reached": 0.0, "total_compute": total_compute, "mean_compute": 0.0}
    n = n_debates
    return {"accuracy": correct / n, "consensus_reached": consensus / n, "total_compute": total_compute, "mean_compute": total_compute / n}
375
+
376
+
377
+ # --- ABLATIONS & ANTI-GAMING ---
378
+
379
# Each row: (name, description, decay_lambda, gaming_penalty,
#            compute_penalty_rate, anti_gaming_enabled, broker_thresholds).
# An empty thresholds dict means "use the broker defaults".
ABLATIONS = [
    ("default", "Full OCC", 0.02, 2.0, 0.0001, True, {}),
    ("no_decay", "No credit decay", 0.0, 2.0, 0.0001, True, {}),
    ("fast_decay", "Aggressive decay", 0.1, 2.0, 0.0001, True, {}),
    ("no_gaming_penalty", "No gaming penalties", 0.02, 0.0, 0.0001, True, {}),
    ("high_gaming_penalty", "Severe gaming penalties", 0.02, 5.0, 0.0001, True, {}),
    ("lenient_broker", "Lenient broker", 0.02, 2.0, 0.0001, True, {"low": 0.25, "medium": 1.0, "high": 2.5}),
    ("strict_broker", "Strict broker", 0.02, 2.0, 0.0001, True, {"low": 1.0, "medium": 4.0, "high": 10.0}),
    ("high_compute_cost", "High compute penalty", 0.02, 2.0, 0.001, True, {}),
    ("low_compute_cost", "Low compute penalty", 0.02, 2.0, 0.00001, True, {}),
    ("anti_gaming_off", "Anti-gaming disabled", 0.02, 2.0, 0.0001, False, {}),
]
391
+
392
+
393
def run_all(output_dir="/app/occ/reports"):
    """Run every ablation and anti-gaming check, save a JSON report, print a summary.

    Args:
        output_dir: Directory that receives ``eval_runner_results.json``. It is
            created if missing.

    Returns:
        The results dict with the two top-level keys ``"ablations"`` and
        ``"anti_gaming"``.
    """
    print("=" * 60)
    print("OCC UNIFIED EVALUATION RUNNER (SELF-CONTAINED)")
    print("=" * 60)
    seed = 42
    n_problems = 100
    n_qa = 100
    n_debates = 50

    results = {"ablations": {}, "anti_gaming": {}}

    # Ablations
    for name, desc, decay, game_pen, comp_pen, anti_on, broker_thresh in ABLATIONS:
        print(f"\n--- ABLATION: {name} ---")
        oracle = ImpactOracle(compute_penalty_rate=comp_pen, gaming_penalty=game_pen if anti_on else 0.0)
        ledger = CreditLedger(decay_lambda=decay)
        broker = ResourceBroker(thresholds=broker_thresh if broker_thresh else None)

        problems = gen_code_problems(n_problems, seed)
        cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60)
        medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150)
        expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350)
        code_res = run_code_occ(problems, [cheap, medium, expensive], oracle, ledger, broker, max_attempts=3)
        print(f" Code: acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")

        qa_data = create_qa_dataset(seed=seed, n=n_qa)
        qa_res = run_qa_occ(qa_data, {"acc": 0.85}, oracle, ledger, broker)
        print(f" QA: acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")

        # The debate benchmark gets its own ledger and broker.
        ledger2 = CreditLedger(decay_lambda=decay)
        broker2 = ResourceBroker(thresholds=broker_thresh if broker_thresh else None)
        for i in range(3):
            ledger2.earn(f"f{i}", "seed", "seed", 5, 0, 0, "initial", "debate_turn")
        debate_res = run_debate_occ(n_debates, 3,
                                    [{"id": f"f{i}", "honest": True, "acc": 0.9, "tokens": 200} for i in range(3)],
                                    oracle, ledger2, broker2, seed=seed)
        print(f" Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")

        results["ablations"][name] = {"description": desc, "code": code_res, "qa": qa_res, "debate": debate_res}

    # Anti-gaming
    print("\n--- ANTI-GAMING TESTS ---")

    # Hidden-test gaming
    oracle = ImpactOracle(gaming_penalty=2.0)
    normal_res = []
    gamer_res = []
    for _ in range(50):
        public_pass = random.random() < 0.9
        # The normal agent passes hidden tests half of the time.
        hidden_pass = random.random() < 0.5
        ora_normal = oracle.score("code", {}, {}, {"correctness": 1.0 if public_pass else 0.0, "pass_at_k": 1.0 if hidden_pass else 0.0, "compute_cost": 150, "public_pass": public_pass, "hidden_tests_pass": hidden_pass})
        normal_res.append(ora_normal.raw_score)
        # Gamer: always passes public, fails hidden
        ora_gamer = oracle.score("code", {}, {}, {"correctness": 1.0, "pass_at_k": 0.0, "compute_cost": 100, "public_pass": True, "hidden_tests_pass": False})
        gamer_res.append(ora_gamer.raw_score)
    hidden_stats = {
        "normal_mean_raw": sum(normal_res) / len(normal_res),
        "gamer_mean_raw": sum(gamer_res) / len(gamer_res),
        "gamer_penalized_rate": sum(1 for r in gamer_res if r < 0) / len(gamer_res),
    }
    results["anti_gaming"]["hidden_test_gaming"] = hidden_stats
    print(f" Hidden-test gaming: normal={hidden_stats['normal_mean_raw']:.2f}, gamer={hidden_stats['gamer_mean_raw']:.2f}")

    # Collusion
    ledger = CreditLedger()
    ledger.earn("alice", "seed", "seed", 10, 0, 0, "initial")
    ledger.earn("bob", "seed", "seed", 1, 0, 0, "initial")
    ok = ledger.transfer("alice", "bob", 5.0)
    # Every balance() call applies another decay step, so read each balance
    # once and reuse it; otherwise the printed value drifts from the saved one.
    alice_balance = ledger.balance("alice")
    bob_balance = ledger.balance("bob")
    results["anti_gaming"]["collusion"] = {
        "transfer_allowed": ok,
        "alice_balance": alice_balance,
        "bob_balance": bob_balance,
        "blocked": not ok,
    }
    print(f" Collusion: transfer_allowed={ok}, alice={alice_balance:.1f}, bob={bob_balance:.1f}")

    # Over-abstention
    oracle = ImpactOracle()
    abstention_rewards = []
    for _ in range(10):
        res = oracle.score("retrieval_qa", {"abstained": True}, {"is_unanswerable": False, "gold_answer": "yes"},
                           {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50})
        abstention_rewards.append(res.reward_value)
    abstention_stats = {
        "mean_reward": sum(abstention_rewards) / len(abstention_rewards),
        "negative": sum(abstention_rewards) < 0,
    }
    results["anti_gaming"]["abstention"] = abstention_stats
    print(f" Abstention: mean_reward={abstention_stats['mean_reward']:.2f}, negative={abstention_stats['negative']}")

    # Spam
    oracle = ImpactOracle()
    spam_res = oracle.score("retrieval_qa", {}, {"gold_answer": "paris"},
                            {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000})
    results["anti_gaming"]["spam"] = {
        "reward": spam_res.reward_value,
        "tags": spam_res.failure_tags,
    }
    print(f" Spam: reward={spam_res.reward_value:.2f}, tags={spam_res.failure_tags}")

    # Save
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    with open(out / "eval_runner_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nSaved to {out / 'eval_runner_results.json'}")

    # Summary table
    print("\n" + "=" * 60)
    print("ABLATION SUMMARY")
    print("=" * 60)
    print(f"{'Name':<20} {'Code Acc':>10} {'Code Comp':>10} {'QA Acc':>10} {'QA Comp':>10} {'Deb Acc':>10} {'Deb Comp':>10}")
    for name, data in results["ablations"].items():
        print(f"{name:<20} {data['code']['accuracy']:>10.3f} {data['code']['total_compute']:>10.0f} "
              f"{data['qa']['accuracy']:>10.3f} {data['qa']['total_compute']:>10.0f} "
              f"{data['debate']['accuracy']:>10.3f} {data['debate']['total_compute']:>10.0f}")

    return results
508
+
509
+
510
# Run the full evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    run_all()