# occ-stack / jobs / run_eval_standalone.py
#
# Upload jobs/run_eval_standalone.py
#
# 9b4c8e8 verified 20 days ago
#
# 25.5 kB

	"""
	Self-contained evaluation runner for OCC stack.
	Includes all core classes inline + all simulated benchmarks + ablations + anti-gaming.
	Runs on CPU. Outputs JSON report.
	"""
	import json
	import random
	import sys
	import time
	from dataclasses import dataclass, field
	from enum import Enum
	from pathlib import Path
	from typing import Any, Dict, List, Optional

	import numpy as np

	# --- CORE CLASSES (INLINE) ---

	@dataclass
	class OracleResult:
	    """Outcome of a single oracle scoring call."""

	    # Score before the final compute-cost adjustment.
	    raw_score: float
	    # Score after subtracting the compute-cost adjustment.
	    cost_adjusted_score: float
	    # Confidence associated with the scored action.
	    confidence: float
	    # Free-form supporting data for the score.
	    evidence: Dict[str, Any]
	    # Short human-readable explanation of the score.
	    reason: str
	    # Labels for detected failure modes; a fresh list per instance.
	    failure_tags: List[str] = field(default_factory=list)
	    # Reward value attached to this result.
	    reward_value: float = 0.0


	class ImpactOracle:
	    """Score an agent action in ``code``, ``retrieval_qa`` or ``debate`` mode.

	    Each scoring method returns an ``OracleResult`` holding a raw score, a
	    cost-adjusted score, a confidence, an evidence dict, a short reason
	    string, a list of failure tags and a reward value.
	    """

	    def __init__(self, compute_penalty_rate=0.0001, calibration_weight=0.2,
	                 abstention_bonus=1.0, hallucination_penalty=2.0,
	                 confident_wrong_penalty=3.0, gaming_penalty=2.0,
	                 code_weights=None, qa_weights=None, debate_weights=None):
	        """Store penalty rates and the per-mode weight tables.

	        A weight table that is ``None`` or empty is replaced by the built-in
	        default table for that mode.
	        """
	        self.code_weights = code_weights or {"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001}
	        self.qa_weights = qa_weights or {"correctness": 1.0, "evidence_support": 0.5, "calibration": 0.2, "abstention_utility": 1.0, "hallucination_penalty": 2.0, "confident_wrong_penalty": 3.0}
	        self.debate_weights = debate_weights or {"decision_quality": 1.0, "influence_efficiency": 0.5, "throughput": 0.3, "marginal_contribution": 0.5}
	        self.compute_penalty_rate = compute_penalty_rate
	        self.calibration_weight = calibration_weight
	        self.abstention_bonus = abstention_bonus
	        self.hallucination_penalty = hallucination_penalty
	        self.confident_wrong_penalty = confident_wrong_penalty
	        self.gaming_penalty = gaming_penalty

	    def score(self, mode, action, context, result, agent_id=""):
	        """Dispatch to the scorer for ``mode``.

	        An unrecognised mode yields an all-zero ``OracleResult`` tagged
	        ``"unknown_mode"`` instead of raising.
	        """
	        if mode == "code":
	            return self._score_code(action, context, result, agent_id)
	        elif mode == "retrieval_qa":
	            return self._score_qa(action, context, result, agent_id)
	        elif mode == "debate":
	            return self._score_debate(action, context, result, agent_id)
	        return OracleResult(0.0, 0.0, 0.0, {}, f"unknown mode {mode}", ["unknown_mode"])

	    def _score_code(self, action, context, result, agent_id):
	        """Score a code attempt from the fields of ``result``.

	        Passing the public tests while failing the hidden ones is tagged
	        ``"gaming_hidden_tests"`` and costs ``gaming_penalty``.
	        """
	        correctness = result.get("correctness", 0.0)
	        pass_at_k = result.get("pass_at_k", 0.0)
	        regression = result.get("regression", False)
	        compute_cost = result.get("compute_cost", 0.0)
	        # Both pass flags fall back to the correctness value when absent.
	        hidden_pass = result.get("hidden_tests_pass", correctness)
	        public_pass = result.get("public_pass", correctness)
	        tags = []
	        if public_pass and not hidden_pass:
	            tags.append("gaming_hidden_tests")
	        raw = (correctness * self.code_weights["correctness"] +
	               pass_at_k * self.code_weights["pass_at_k"] +
	               (self.code_weights["regression"] if regression else 0.0) -
	               compute_cost * self.code_weights.get("compute_penalty", self.compute_penalty_rate))
	        if "gaming_hidden_tests" in tags:
	            raw -= self.gaming_penalty
	        cost_adj = raw - compute_cost * self.compute_penalty_rate
	        return OracleResult(raw, cost_adj, result.get("confidence", correctness),
	                            {"correctness": correctness}, f"corr={correctness:.2f}, cost={compute_cost}", tags, cost_adj)

	    def _score_qa(self, action, context, result, agent_id):
	        """Score a retrieval-QA answer or abstention.

	        The gold answer is read from ``context["gold_answer"]``.  An
	        abstention earns ``+abstention_bonus`` when
	        ``context["is_unanswerable"]`` is true and ``-abstention_bonus``
	        otherwise.
	        """
	        gold = context.get("gold_answer", "")
	        answer = result.get("answer", "")
	        confidence = result.get("confidence", 0.5)
	        compute_cost = result.get("compute_cost", 0.0)
	        abstained = action.get("abstained", False)
	        if abstained:
	            correct_abstention = context.get("is_unanswerable", False)
	            raw = self.abstention_bonus if correct_abstention else -self.abstention_bonus
	            tags = ["correct_abstention" if correct_abstention else "wrong_abstention"]
	            return OracleResult(raw, raw - compute_cost * self.compute_penalty_rate, confidence, {}, f"abstain correct={correct_abstention}", tags, raw)
	        correctness = self._fuzzy_match(answer, gold)
	        evidence = result.get("evidence", {})
	        entailment = evidence.get("entailment_score", 0.0)
	        contradiction = evidence.get("contradiction_score", 0.0)
	        hallucination = contradiction > 0.5
	        confident_wrong = confidence > 0.8 and correctness < 0.5
	        tags = []
	        if hallucination:
	            tags.append("hallucination")
	        if confident_wrong:
	            tags.append("confident_wrong")
	        if compute_cost > 2000:
	            tags.append("excessive_compute")
	        if compute_cost > 500 and correctness < 0.5:
	            tags.append("compute_waste")
	        raw = (correctness * self.qa_weights["correctness"] +
	               entailment * self.qa_weights.get("evidence_support", 0.5) -
	               (self.hallucination_penalty if hallucination else 0.0) -
	               (self.confident_wrong_penalty if confident_wrong else 0.0) -
	               compute_cost * self.compute_penalty_rate)
	        # Calibration term: squared gap between confidence and correctness.
	        brier = (confidence - correctness) ** 2
	        raw += (1 - brier) * self.calibration_weight
	        cost_adj = raw - compute_cost * self.compute_penalty_rate
	        if compute_cost > 100 and raw < 0.5:
	            cost_adj -= self.gaming_penalty * 0.5
	        return OracleResult(raw, cost_adj, confidence, evidence, f"corr={correctness:.2f}, conf={confidence:.2f}", tags, cost_adj)

	    def _score_debate(self, action, context, result, agent_id):
	        """Score a debate contribution from the fields of ``result``.

	        ``compute_cost`` defaults to the token count.  Spam costs one
	        ``gaming_penalty`` and collusion costs two.
	        """
	        decision_quality = result.get("decision_quality", 0.0)
	        marginal = result.get("marginal_contribution", 0.0)
	        tokens = result.get("tokens", 0)
	        compute_cost = result.get("compute_cost", tokens)
	        spam = result.get("spam", False)
	        collusion = result.get("collusion", False)
	        tags = []
	        if spam:
	            tags.append("spam")
	        if collusion:
	            tags.append("collusion")
	        if tokens > 5000:
	            tags.append("verbose_waste")
	        raw = (decision_quality * self.debate_weights["decision_quality"] +
	               marginal * self.debate_weights["marginal_contribution"] +
	               (1.0 / max(tokens, 1)) * self.debate_weights["influence_efficiency"] -
	               compute_cost * self.compute_penalty_rate)
	        if spam:
	            raw -= self.gaming_penalty
	        if collusion:
	            raw -= self.gaming_penalty * 2
	        cost_adj = raw - compute_cost * self.compute_penalty_rate
	        return OracleResult(raw, cost_adj, result.get("confidence", 0.5),
	                            {"marginal": marginal}, f"dq={decision_quality:.2f}, tokens={tokens}", tags, cost_adj)

	    def _fuzzy_match(self, a, b):
	        """Return 1.0 for an exact match, 0.5 for containment, else 0.0.

	        Comparison is case-insensitive and ignores surrounding whitespace.
	        Falsy inputs score 0.0; other non-string inputs are compared by
	        their ``str()`` form.
	        """
	        if not a or not b:
	            return 0.0
	        a, b = str(a).strip().lower(), str(b).strip().lower()
	        # Re-check after stripping: an empty string is a substring of every
	        # string, so a whitespace-only input would otherwise score 0.5.
	        if not a or not b:
	            return 0.0
	        if a == b:
	            return 1.0
	        return 0.5 if (a in b or b in a) else 0.0


	@dataclass
	class LedgerEntry:
	    """One row of the credit ledger's log."""

	    agent_id: str
	    task_id: str
	    action_id: str
	    earned_credit: float
	    spent_credit: float
	    decayed_credit: float
	    # Balance left in the scope after this entry was applied.
	    remaining_credit: float
	    reason: str
	    oracle_score: float
	    compute_cost: float
	    timestamp: float
	    capability_scope: str = "global"


	class CreditLedger:
	    """Per-agent, per-capability credit balances with an append-only log.

	    Decay is applied per operation, not per unit of time: every ``earn``
	    and every accepted or rejected-for-funds ``spend`` first multiplies a
	    positive balance by ``1 - decay_lambda``.  Reading a balance never
	    changes it.
	    """

	    def __init__(self, decay_lambda=0.05):
	        """Create an empty ledger with the given per-operation decay."""
	        self.entries = []
	        self.balances = {}
	        self.decay_lambda = decay_lambda

	    def earn(self, agent_id, task_id, action_id, amount, oracle_score, compute_cost, reason, capability_scope="global"):
	        """Decay the scope's balance, then add ``amount`` and log the entry."""
	        now = time.time()
	        self._apply_decay(agent_id, now, capability_scope)
	        current = self._get(agent_id, capability_scope)
	        new_bal = current + amount
	        self.entries.append(LedgerEntry(agent_id, task_id, action_id, amount, 0.0, 0.0, new_bal, reason, oracle_score, compute_cost, now, capability_scope))
	        self._set(agent_id, capability_scope, new_bal)

	    def spend(self, agent_id, task_id, action_id, amount, capability_scope="global", reason="spend"):
	        """Decay the scope's balance, then deduct ``amount`` if affordable.

	        Returns ``True`` when the deduction was made.  Returns ``False``
	        when the balance is too small, or when ``amount`` is negative; in
	        the negative case nothing is decayed or logged.
	        """
	        # A negative spend would raise the balance, i.e. mint credit.
	        if amount < 0:
	            return False
	        now = time.time()
	        self._apply_decay(agent_id, now, capability_scope)
	        current = self._get(agent_id, capability_scope)
	        if current < amount:
	            return False
	        new_bal = current - amount
	        self.entries.append(LedgerEntry(agent_id, task_id, action_id, 0.0, amount, 0.0, new_bal, reason, 0.0, 0.0, now, capability_scope))
	        self._set(agent_id, capability_scope, new_bal)
	        return True

	    def transfer(self, from_agent, to_agent, amount, capability_scope="global"):
	        """Always refuse: credits are non-transferable."""
	        return False  # non-transferable

	    def balance(self, agent_id, capability_scope="global"):
	        """Return the stored balance for the scope (0.0 if unknown).

	        This is a pure read.  It does not apply decay or append a log
	        entry, so repeated calls return the same value.
	        """
	        return self._get(agent_id, capability_scope)

	    def _get(self, agent_id, cap):
	        """Look up a balance, defaulting to 0.0."""
	        return self.balances.get(agent_id, {}).get(cap, 0.0)

	    def _set(self, agent_id, cap, val):
	        """Store a balance, creating the agent's scope map on first use."""
	        if agent_id not in self.balances:
	            self.balances[agent_id] = {}
	        self.balances[agent_id][cap] = val

	    def _apply_decay(self, agent_id, now, cap):
	        """Apply one decay step to a positive balance and log the loss."""
	        current = self._get(agent_id, cap)
	        if current <= 0:
	            return
	        decayed = current * (1 - self.decay_lambda)
	        # With decay_lambda == 0 nothing changes, so nothing is logged.
	        if decayed < current:
	            self.entries.append(LedgerEntry(agent_id, "decay", "decay", 0.0, 0.0, current - decayed, decayed, "credit_decay", 0.0, 0.0, now, cap))
	            self._set(agent_id, cap, decayed)

	    def detect_collusion(self, window=10):
	        """Report the agents seen in the last ``window`` log entries.

	        Returns ``None`` when ``window`` is not positive or fewer than two
	        distinct agents appear.  Otherwise returns a dict with the agents
	        in first-seen order and the number of entries inspected.
	        """
	        # entries[-0:] would select the whole log, so reject it explicitly.
	        if window <= 0:
	            return None
	        recent = self.entries[-window:]
	        # dict.fromkeys de-duplicates while keeping a deterministic order.
	        agents = list(dict.fromkeys(e.agent_id for e in recent))
	        if len(agents) < 2:
	            return None
	        return {"suspicious_agents": agents, "count": len(recent)}


	class Decision(Enum):
	    """Verdicts that can be attached to a resource request."""

	    ALLOW = "allow"
	    DENY = "deny"
	    REQUIRE_APPROVAL = "require_approval"
	    DOWNGRADE = "downgrade"
	    ESCALATE = "escalate"
	    ASK_JUSTIFICATION = "ask_justification"


	@dataclass
	class ResourceDecision:
	    """A verdict for one capability request, with its explanation."""

	    decision: Decision
	    reason: str
	    # The capability that was requested.
	    capability: str
	    # Substitute capability; left as None unless one is supplied.
	    downgrade_to: Optional[str] = None


	class ResourceBroker:
	    """Decide whether an agent may use a capability.

	    The verdict depends on the capability's risk class, the agent's credit
	    balance, the task urgency, a risk score and any gaming flags.
	    """

	    # Risk class per capability; unlisted capabilities count as "medium".
	    RESOURCE_RISK = {
	        "model_call": "medium",
	        "retrieval_call": "low",
	        "verifier_call": "medium",
	        "debate_turn": "low",
	        "file_write": "high",
	        "shell_execute": "high",
	        "memory_write": "medium",
	        "human_escalation": "high",
	        "larger_model": "medium",
	    }
	    # Credit needed per risk class before any urgency discount.
	    DEFAULT_THRESHOLDS = {"low": 0.5, "medium": 2.0, "high": 5.0}

	    def __init__(self, thresholds=None, urgency_boost=0.5):
	        """Use ``thresholds`` if given, else a copy of the defaults."""
	        self.thresholds = thresholds or self.DEFAULT_THRESHOLDS.copy()
	        self.urgency_boost = urgency_boost
	        # Number of balance-based denials recorded per agent.
	        self.denial_history = {}

	    def request(self, capability, agent_id, credit_balance, task_state=None, risk_score=0.0, gaming_flags=None):
	        """Return a ``ResourceDecision`` for one capability request.

	        Checks run in this order: gaming flags (deny), a high-risk
	        capability with ``risk_score`` above 0.7 (require approval),
	        sufficient balance (allow), at least half the required balance
	        (downgrade for medium risk, otherwise ask for justification), and
	        finally deny, or escalate once the agent already has more than
	        three recorded denials.
	        """
	        state = task_state or {}
	        flags = gaming_flags or []
	        tier = self.RESOURCE_RISK.get(capability, "medium")
	        base = self.thresholds.get(tier, 2.0)
	        pressure = state.get("urgency", 0.0)
	        # Urgency lowers the bar, but never below 0.1 credits.
	        required = max(0.1, base - pressure * self.urgency_boost)

	        if flags:
	            return ResourceDecision(Decision.DENY, f"Gaming: {flags}", capability)

	        if tier == "high" and risk_score > 0.7:
	            return ResourceDecision(Decision.REQUIRE_APPROVAL, f"High risk {risk_score:.2f}", capability)

	        if credit_balance >= required:
	            return ResourceDecision(Decision.ALLOW, f"Balance {credit_balance:.2f} >= {required:.2f}", capability)

	        if credit_balance >= required * 0.5:
	            if tier == "medium":
	                return ResourceDecision(Decision.DOWNGRADE, f"Downgrading from {capability}", capability, "retrieval_call")
	            return ResourceDecision(Decision.ASK_JUSTIFICATION, "Justification required", capability)

	        prior = self.denial_history.get(agent_id, 0)
	        if prior > 3:
	            # Escalation does not add to the denial count.
	            return ResourceDecision(Decision.ESCALATE, f"Denied {prior} times", capability)
	        self.denial_history[agent_id] = prior + 1
	        return ResourceDecision(Decision.DENY, f"Balance {credit_balance:.2f} < {required:.2f}", capability)


	# --- BENCHMARK SIMULATIONS (INLINE) ---

	@dataclass
	class CodeProblem:
	    """A simulated coding task described by three difficulty values."""

	    task_id: str
	    difficulty: float
	    hidden_test_difficulty: float
	    public_test_difficulty: float


	class SimulatedCodeAgent:
	    """Stochastic stand-in for a code-writing agent."""

	    def __init__(self, agent_id, pass_rate_easy=0.9, pass_rate_hard=0.3, hidden_test_falloff=0.15, cost_per_attempt=200):
	        """Record the agent's pass rates, hidden-test falloff and cost."""
	        self.agent_id = agent_id
	        self.pass_rate_easy = pass_rate_easy
	        self.pass_rate_hard = pass_rate_hard
	        self.hidden_test_falloff = hidden_test_falloff
	        self.cost_per_attempt = cost_per_attempt
	        # Number of solve() calls made so far.
	        self.attempts = 0

	    def solve(self, problem):
	        """Simulate one attempt and return its pass flags and cost.

	        The public pass probability interpolates between the easy and hard
	        pass rates by ``problem.difficulty``.  The hidden pass probability
	        is that value reduced by the falloff scaled with
	        ``problem.hidden_test_difficulty``, floored at 0.  Two random
	        numbers are drawn: public first, then hidden.
	        """
	        self.attempts += 1
	        public_rate = (self.pass_rate_easy * (1 - problem.difficulty)
	                       + self.pass_rate_hard * problem.difficulty)
	        passed_public = random.random() < public_rate
	        hidden_rate = max(0.0, public_rate - self.hidden_test_falloff * problem.hidden_test_difficulty)
	        passed_hidden = random.random() < hidden_rate
	        return {
	            "public_pass": passed_public,
	            "hidden_pass": passed_hidden,
	            "compute_cost": self.cost_per_attempt,
	        }


	def gen_code_problems(n, seed):
	    """Seed both RNGs and return ``n`` randomly generated code problems.

	    Problems are named ``task_0`` .. ``task_{n-1}``.  Each one takes three
	    consecutive ``random.random()`` draws for its difficulty fields.
	    """
	    random.seed(seed)
	    np.random.seed(seed)
	    problems = []
	    for idx in range(n):
	        overall = random.random()
	        hidden = random.random()
	        public = random.random()
	        problems.append(CodeProblem(f"task_{idx}", overall, hidden, public))
	    return problems


	def run_code_baseline(problems, agent):
	    """Run a single agent once over every problem and summarise the run.

	    Args:
	        problems: Sequence of problems passed one by one to ``agent.solve``.
	        agent: Object whose ``solve(problem)`` returns a dict with
	            ``"public_pass"`` and ``"compute_cost"`` entries.

	    Returns:
	        Dict with ``"accuracy"`` (fraction of public passes),
	        ``"total_compute"`` and ``"mean_compute"``.  An empty problem list
	        yields zeros instead of dividing by zero.
	    """
	    results = [agent.solve(p) for p in problems]
	    total = sum(r["compute_cost"] for r in results)
	    if not results:
	        return {"accuracy": 0.0, "total_compute": total, "mean_compute": 0.0}
	    passed = sum(1 for r in results if r["public_pass"])
	    count = len(results)
	    return {"accuracy": passed / count, "total_compute": total, "mean_compute": total / count}


	def run_code_occ(problems, agents, oracle, ledger, broker, max_attempts=3):
	    """Run the code benchmark with agents tried cheapest-per-quality first.

	    Each agent is seeded with credit in the ``"model_call"`` scope.  For
	    every problem, agents are tried in ranked order until one passes the
	    public tests, one passes the hidden tests, or ``max_attempts`` agents
	    have been used.  Each attempt is scored by the oracle and then credited
	    to or charged against the agent in the ledger.

	    Args:
	        problems: Sequence of problems exposing ``task_id``.
	        agents: Agents exposing ``agent_id``, ``pass_rate_easy``,
	            ``pass_rate_hard``, ``cost_per_attempt`` and ``solve``.
	        oracle: Object whose ``score("code", ...)`` result has ``raw_score``.
	        ledger: Object with ``earn`` and ``spend``.
	        broker: Accepted for signature parity; not consulted here.
	        max_attempts: Maximum number of agents tried per problem.

	    Returns:
	        Dict with ``"accuracy"``, ``"total_compute"``, ``"mean_compute"``
	        and ``"mean_agents"``.  An empty problem list yields zeros instead
	        of dividing by zero.
	    """
	    def mean_rate(agent):
	        return (agent.pass_rate_easy + agent.pass_rate_hard) / 2

	    total = 0
	    results = []
	    for a in agents:
	        ledger.earn(a.agent_id, "seed", "seed", mean_rate(a) * 20, 0.0, 0.0, "initial", "model_call")

	    # The ranking depends only on fixed agent attributes, so sort once.
	    ranked = sorted(agents, key=lambda a: a.cost_per_attempt / max(0.1, mean_rate(a)))

	    for p in problems:
	        solved = False
	        cost = 0
	        used = []
	        for agent in ranked:
	            if solved or len(used) >= max_attempts:
	                break
	            r = agent.solve(p)
	            cost += r["compute_cost"]
	            total += r["compute_cost"]
	            used.append(agent.agent_id)
	            solved = r["public_pass"]
	            hidden = r["hidden_pass"]
	            # The oracle is given the cumulative cost spent on this problem.
	            ora = oracle.score("code", {"attempt": len(used)}, {},
	                               {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
	                                "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
	                               agent_id=agent.agent_id)
	            if ora.raw_score > 0:
	                ledger.earn(agent.agent_id, p.task_id, "solve", ora.raw_score * 5, ora.raw_score, cost, "pass", "model_call")
	            else:
	                ledger.spend(agent.agent_id, p.task_id, "solve", 1.0, "model_call", "fail")
	            # NOTE(review): nesting was reconstructed from a source whose
	            # indentation was flattened; confirm this break belongs to the
	            # per-agent loop.
	            if hidden:
	                break
	        results.append({"solved": solved, "cost": cost, "agents": used})

	    count = len(results)
	    if count == 0:
	        return {"accuracy": 0.0, "total_compute": total, "mean_compute": 0.0, "mean_agents": 0.0}
	    acc = sum(1 for r in results if r["solved"]) / count
	    return {"accuracy": acc, "total_compute": total, "mean_compute": total / count,
	            "mean_agents": sum(len(r["agents"]) for r in results) / count}


	def create_qa_dataset(seed=42, n=50):
	    """Seed ``random`` and build ``n`` synthetic QA items.

	    Each item is a dict with ``id``, ``question``, ``type``, ``answer``,
	    ``evidence`` (one to three entries sampled from a fixed pool) and
	    ``is_unanswerable``, which is true exactly when the type is
	    ``"unanswerable"``.
	    """
	    random.seed(seed)
	    evidence_pool = ["alpha", "beta", "gamma", "delta"]
	    kinds = ["answerable", "unanswerable", "misleading", "incomplete", "conflicting"]
	    candidate_answers = ["paris", "42", "yes", "no", "tokyo"]

	    dataset = []
	    for idx in range(n):
	        # Draw order matters for reproducibility: type, answer, size, sample.
	        kind = random.choice(kinds)
	        gold = random.choice(candidate_answers)
	        sample_size = random.randint(1, 3)
	        support = random.sample(evidence_pool, k=sample_size)
	        dataset.append({
	            "id": f"q_{idx}",
	            "question": f"Q{idx}",
	            "type": kind,
	            "answer": gold,
	            "evidence": support,
	            "is_unanswerable": kind == "unanswerable",
	        })
	    return dataset


	def run_qa_occ(dataset, agent_params, oracle, ledger, broker):
	    """Run the retrieval-QA benchmark for a single simulated agent.

	    The agent ``"qa_agent"`` is seeded with 20 credits in the
	    ``"retrieval_call"`` scope.  For each item the broker is asked for a
	    retrieval call; a DENY skips the item, ALLOW costs 200 tokens and any
	    other verdict costs 100.  The agent answers answerable items with
	    probability ``agent_params["acc"]`` and abstains otherwise.

	    Args:
	        dataset: Sequence of item dicts with ``id``, ``type`` and
	            ``answer`` keys.
	        agent_params: Dict with an ``"acc"`` probability.
	        oracle: Object whose ``score("retrieval_qa", ...)`` result has
	            ``raw_score``.
	        ledger: Object with ``earn``, ``spend`` and ``balance``.
	        broker: Object whose ``request(...)`` result has ``decision``.

	    Returns:
	        Dict with ``"accuracy"`` (items scored above zero over all items),
	        ``"total_compute"`` and ``"mean_compute"``.  An empty dataset
	        yields zeros instead of dividing by zero.
	    """
	    total_compute = 0
	    correct = 0
	    ledger.earn("qa_agent", "seed", "seed", 20, 0.0, 0.0, "initial", "retrieval_call")
	    for item in dataset:
	        balance = ledger.balance("qa_agent", "retrieval_call")
	        dec = broker.request("retrieval_call", "qa_agent", balance, task_state={"urgency": 0.5})
	        if dec.decision == Decision.DENY:
	            continue
	        tokens = 200 if dec.decision == Decision.ALLOW else 100
	        total_compute += tokens
	        should_answer = item["type"] != "unanswerable"
	        ans = item["answer"] if (should_answer and random.random() < agent_params["acc"]) else None
	        conf = 0.9 if ans else 0.3
	        # The oracle reads the gold answer from "gold_answer", while items
	        # carry it under "answer"; bridge the two so answers can match.
	        context = item if "gold_answer" in item else {**item, "gold_answer": item["answer"]}
	        ora = oracle.score("retrieval_qa", {"abstained": ans is None}, context,
	                           {"answer": ans, "confidence": conf, "evidence": {}, "compute_cost": tokens}, "qa_agent")
	        if ora.raw_score > 0:
	            ledger.earn("qa_agent", item["id"], "ans", ora.raw_score * 3, ora.raw_score, tokens, "correct", "retrieval_call")
	            correct += 1
	        else:
	            ledger.spend("qa_agent", item["id"], "ans", 0.5, "retrieval_call", "wrong")
	    count = len(dataset)
	    if count == 0:
	        return {"accuracy": 0.0, "total_compute": total_compute, "mean_compute": 0.0}
	    return {"accuracy": correct / count, "total_compute": total_compute, "mean_compute": total_compute / count}


	def run_debate_occ(n_debates, n_agents, agent_configs, oracle, ledger, broker, seed=42):
	    """Simulate ``n_debates`` majority-vote debates gated by credit.

	    In each debate every configured agent asks the broker for a
	    ``"debate_turn"``.  Agents that are not denied spend one credit and
	    cast a vote; an honest agent votes for the truth with probability
	    ``acc`` and every other vote is uniformly random.  The option with the
	    most votes wins, and ties go to the option that was voted for first, so
	    a seeded run is reproducible.

	    Args:
	        n_debates: Number of debates to simulate.
	        n_agents: Accepted for signature parity; the agents actually used
	            are those listed in ``agent_configs``.
	        agent_configs: Dicts with ``id``, ``honest``, ``acc`` and an
	            optional ``tokens`` (default 200).
	        oracle: Accepted for signature parity; not consulted here.
	        ledger: Object with ``balance``, ``spend`` and ``earn``.
	        broker: Object whose ``request(...)`` result has ``decision``.
	        seed: Seed passed to ``random.seed``.

	    Returns:
	        Dict with ``"accuracy"``, ``"consensus_reached"``,
	        ``"total_compute"`` and ``"mean_compute"``, the ratios taken over
	        ``n_debates``.  A non-positive ``n_debates`` yields zeros instead of
	        dividing by zero.
	    """
	    random.seed(seed)
	    correct = 0
	    total_compute = 0
	    consensus = 0
	    for _ in range(n_debates):
	        truth = random.choice(["A", "B", "C"])
	        agents = []
	        for cfg in agent_configs:
	            # Dishonest agents get a freshly drawn accuracy below 0.4.
	            acc = cfg["acc"] if cfg["honest"] else random.random() * 0.4
	            agents.append({"honest": cfg["honest"], "acc": acc, "id": cfg["id"], "tokens": cfg.get("tokens", 200)})
	        votes = []
	        for a in agents:
	            balance = ledger.balance(a["id"], "debate_turn")
	            dec = broker.request("debate_turn", a["id"], balance)
	            if dec.decision == Decision.DENY:
	                continue
	            total_compute += a["tokens"]
	            vote = truth if (a["honest"] and random.random() < a["acc"]) else random.choice(["A", "B", "C"])
	            votes.append((a["id"], vote, a["honest"], a["acc"]))
	            ledger.spend(a["id"], "debate", "turn", 1.0, "debate_turn", "participate")
	        if not votes:
	            continue
	        # Tally in casting order.  Dicts keep insertion order and max()
	        # returns the first maximal key, so ties resolve deterministically
	        # rather than by set/hash order.
	        tally = {}
	        for _, choice, _, _ in votes:
	            tally[choice] = tally.get(choice, 0) + 1
	        final = max(tally, key=tally.get)
	        if final == truth:
	            correct += 1
	            # NOTE(review): nesting was reconstructed from a source whose
	            # indentation was flattened; confirm that honest voters are
	            # rewarded only when the final decision is correct.
	            for vid, _, h, _ in votes:
	                if h:
	                    ledger.earn(vid, "debate", "consensus", 2.0, 1.0, 0, "consensus", "debate_turn")
	        if len(tally) == 1:
	            consensus += 1
	    if n_debates <= 0:
	        return {"accuracy": 0.0, "consensus_reached": 0.0, "total_compute": total_compute, "mean_compute": 0.0}
	    n = n_debates
	    return {"accuracy": correct / n, "consensus_reached": consensus / n, "total_compute": total_compute, "mean_compute": total_compute / n}


	# --- ABLATIONS & ANTI-GAMING ---

	# Ablation configurations.  Each tuple is laid out as:
	#   (name, description, credit decay, gaming penalty, compute penalty rate,
	#    anti-gaming enabled, broker threshold overrides)
	# An empty override dict means the broker's default thresholds are used.
	ABLATIONS = [
	    ("default", "Full OCC",
	     0.02, 2.0, 0.0001, True, {}),
	    ("no_decay", "No credit decay",
	     0.0, 2.0, 0.0001, True, {}),
	    ("fast_decay", "Aggressive decay",
	     0.1, 2.0, 0.0001, True, {}),
	    ("no_gaming_penalty", "No gaming penalties",
	     0.02, 0.0, 0.0001, True, {}),
	    ("high_gaming_penalty", "Severe gaming penalties",
	     0.02, 5.0, 0.0001, True, {}),
	    ("lenient_broker", "Lenient broker",
	     0.02, 2.0, 0.0001, True, {"low": 0.25, "medium": 1.0, "high": 2.5}),
	    ("strict_broker", "Strict broker",
	     0.02, 2.0, 0.0001, True, {"low": 1.0, "medium": 4.0, "high": 10.0}),
	    ("high_compute_cost", "High compute penalty",
	     0.02, 2.0, 0.001, True, {}),
	    ("low_compute_cost", "Low compute penalty",
	     0.02, 2.0, 0.00001, True, {}),
	    ("anti_gaming_off", "Anti-gaming disabled",
	     0.02, 2.0, 0.0001, False, {}),
	]


	def run_all(output_dir="/app/occ/reports"):
	    """Run every ablation and anti-gaming check, save a JSON report, print a table.

	    Args:
	        output_dir: Directory that receives ``eval_runner_results.json``.
	            It is created if missing.

	    Returns:
	        Dict with an ``"ablations"`` entry keyed by ablation name and an
	        ``"anti_gaming"`` entry keyed by check name.
	    """
	    print("=" * 60)
	    print("OCC UNIFIED EVALUATION RUNNER (SELF-CONTAINED)")
	    print("=" * 60)
	    seed = 42
	    n_problems = 100
	    n_qa = 100
	    n_debates = 50

	    results = {"ablations": {}, "anti_gaming": {}}
	    results["ablations"] = _run_ablations(seed, n_problems, n_qa, n_debates)
	    results["anti_gaming"] = _run_anti_gaming()

	    # Save
	    out = Path(output_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    with open(out / "eval_runner_results.json", "w", encoding="utf-8") as f:
	        json.dump(results, f, indent=2, default=str)
	    print(f"\nSaved to {out / 'eval_runner_results.json'}")

	    _print_ablation_summary(results["ablations"])
	    return results


	def _run_ablations(seed, n_problems, n_qa, n_debates):
	    """Run the code, QA and debate benchmarks once per ABLATIONS entry."""
	    ablations = {}
	    for name, desc, decay, game_pen, comp_pen, anti_on, broker_thresh in ABLATIONS:
	        print(f"\n--- ABLATION: {name} ---")
	        oracle = ImpactOracle(compute_penalty_rate=comp_pen, gaming_penalty=game_pen if anti_on else 0.0)
	        ledger = CreditLedger(decay_lambda=decay)
	        broker = ResourceBroker(thresholds=broker_thresh if broker_thresh else None)

	        problems = gen_code_problems(n_problems, seed)
	        cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60)
	        medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150)
	        expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350)
	        code_res = run_code_occ(problems, [cheap, medium, expensive], oracle, ledger, broker, max_attempts=3)
	        print(f" Code: acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")

	        # The QA run shares the ledger and broker used by the code run.
	        qa_data = create_qa_dataset(seed=seed, n=n_qa)
	        qa_res = run_qa_occ(qa_data, {"acc": 0.85}, oracle, ledger, broker)
	        print(f" QA: acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")

	        # The debate run gets its own ledger and broker, seeded per agent.
	        ledger2 = CreditLedger(decay_lambda=decay)
	        broker2 = ResourceBroker(thresholds=broker_thresh if broker_thresh else None)
	        for i in range(3):
	            ledger2.earn(f"f{i}", "seed", "seed", 5, 0, 0, "initial", "debate_turn")
	        debate_res = run_debate_occ(n_debates, 3,
	                                    [{"id": f"f{i}", "honest": True, "acc": 0.9, "tokens": 200} for i in range(3)],
	                                    oracle, ledger2, broker2, seed=seed)
	        print(f" Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")

	        ablations[name] = {"description": desc, "code": code_res, "qa": qa_res, "debate": debate_res}
	    return ablations


	def _run_anti_gaming():
	    """Run the hidden-test, collusion, abstention and spam checks."""
	    print("\n--- ANTI-GAMING TESTS ---")
	    report = {}

	    # Hidden-test gaming
	    oracle = ImpactOracle(gaming_penalty=2.0)
	    normal_res = []
	    gamer_res = []
	    for _ in range(50):
	        public_pass = random.random() < 0.9
	        hidden_pass = random.random() < 0.5
	        ora_normal = oracle.score("code", {}, {}, {"correctness": 1.0 if public_pass else 0.0, "pass_at_k": 1.0 if hidden_pass else 0.0, "compute_cost": 150, "public_pass": public_pass, "hidden_tests_pass": hidden_pass})
	        normal_res.append(ora_normal.raw_score)
	        # Gamer: always passes public, fails hidden
	        ora_gamer = oracle.score("code", {}, {}, {"correctness": 1.0, "pass_at_k": 0.0, "compute_cost": 100, "public_pass": True, "hidden_tests_pass": False})
	        gamer_res.append(ora_gamer.raw_score)
	    normal_mean = sum(normal_res) / len(normal_res)
	    gamer_mean = sum(gamer_res) / len(gamer_res)
	    report["hidden_test_gaming"] = {
	        "normal_mean_raw": normal_mean,
	        "gamer_mean_raw": gamer_mean,
	        "gamer_penalized_rate": sum(1 for r in gamer_res if r < 0) / len(gamer_res),
	    }
	    print(f" Hidden-test gaming: normal={normal_mean:.2f}, gamer={gamer_mean:.2f}")

	    # Collusion
	    ledger = CreditLedger()
	    ledger.earn("alice", "seed", "seed", 10, 0, 0, "initial")
	    ledger.earn("bob", "seed", "seed", 1, 0, 0, "initial")
	    ok = ledger.transfer("alice", "bob", 5.0)
	    # Read each balance once so the report and the console line agree.
	    alice_balance = ledger.balance("alice")
	    bob_balance = ledger.balance("bob")
	    report["collusion"] = {
	        "transfer_allowed": ok,
	        "alice_balance": alice_balance,
	        "bob_balance": bob_balance,
	        "blocked": not ok,
	    }
	    print(f" Collusion: transfer_allowed={ok}, alice={alice_balance:.1f}, bob={bob_balance:.1f}")

	    # Over-abstention
	    oracle = ImpactOracle()
	    abstention_rewards = []
	    for _ in range(10):
	        res = oracle.score("retrieval_qa", {"abstained": True}, {"is_unanswerable": False, "gold_answer": "yes"},
	                           {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50})
	        abstention_rewards.append(res.reward_value)
	    report["abstention"] = {
	        "mean_reward": sum(abstention_rewards) / len(abstention_rewards),
	        "negative": sum(abstention_rewards) < 0,
	    }
	    print(f" Abstention: mean_reward={report['abstention']['mean_reward']:.2f}, negative={report['abstention']['negative']}")

	    # Spam
	    oracle = ImpactOracle()
	    spam_res = oracle.score("retrieval_qa", {}, {"gold_answer": "paris"},
	                            {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000})
	    report["spam"] = {
	        "reward": spam_res.reward_value,
	        "tags": spam_res.failure_tags,
	    }
	    print(f" Spam: reward={spam_res.reward_value:.2f}, tags={spam_res.failure_tags}")
	    return report


	def _print_ablation_summary(ablations):
	    """Print one table row of accuracy and compute per ablation."""
	    print("\n" + "=" * 60)
	    print("ABLATION SUMMARY")
	    print("=" * 60)
	    print(f"{'Name':<20} {'Code Acc':>10} {'Code Comp':>10} {'QA Acc':>10} {'QA Comp':>10} {'Deb Acc':>10} {'Deb Comp':>10}")
	    for name, data in ablations.items():
	        print(f"{name:<20} {data['code']['accuracy']:>10.3f} {data['code']['total_compute']:>10.0f} "
	              f"{data['qa']['accuracy']:>10.3f} {data['qa']['total_compute']:>10.0f} "
	              f"{data['debate']['accuracy']:>10.3f} {data['debate']['total_compute']:>10.0f}")


	# Script entry point: run the full evaluation when executed directly.
	if __name__ == "__main__":
	    run_all()