"""
Self-contained evaluation runner for OCC stack.
Includes all core classes inline + all simulated benchmarks + ablations + anti-gaming.
Runs on CPU. Outputs JSON report.
"""
import json
import random
import sys
import time
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np

# --- CORE CLASSES (INLINE) ---

@dataclass
class OracleResult:
    """Result record returned by every ``ImpactOracle`` scoring path.

    The oracle builds instances positionally, so the order of the fields
    below is part of the contract.
    """
    raw_score: float  # weighted score before the final cost adjustment
    cost_adjusted_score: float  # raw_score minus compute_cost * compute_penalty_rate (QA may deduct more)
    confidence: float  # confidence value attached to the scored result
    evidence: Dict[str, Any]  # supporting data; contents depend on the scoring mode
    reason: str  # short human-readable summary of how the score was derived
    failure_tags: List[str] = field(default_factory=list)  # e.g. "hallucination", "spam", "unknown_mode"
    reward_value: float = 0.0  # reward figure; stays 0.0 when the oracle does not supply one


class ImpactOracle:
    """Turn benchmark outcomes into ``OracleResult`` scores.

    Three modes are reachable through :meth:`score`: ``"code"``,
    ``"retrieval_qa"`` and ``"debate"``.  Every mode combines its own weight
    table with the shared penalty settings passed to the constructor.
    """

    def __init__(self, compute_penalty_rate=0.0001, calibration_weight=0.2,
                 abstention_bonus=1.0, hallucination_penalty=2.0,
                 confident_wrong_penalty=3.0, gaming_penalty=2.0,
                 code_weights=None, qa_weights=None, debate_weights=None):
        """Store the penalty settings and the per-mode weight tables.

        A weight table that is ``None`` (or otherwise falsy) is replaced by
        the built-in default for that mode.
        """
        if not code_weights:
            code_weights = {"correctness": 1.0, "pass_at_k": 0.3,
                            "regression": -0.5, "compute_penalty": 0.001}
        if not qa_weights:
            qa_weights = {"correctness": 1.0, "evidence_support": 0.5,
                          "calibration": 0.2, "abstention_utility": 1.0,
                          "hallucination_penalty": 2.0,
                          "confident_wrong_penalty": 3.0}
        if not debate_weights:
            debate_weights = {"decision_quality": 1.0, "influence_efficiency": 0.5,
                              "throughput": 0.3, "marginal_contribution": 0.5}
        self.code_weights = code_weights
        self.qa_weights = qa_weights
        self.debate_weights = debate_weights

        self.compute_penalty_rate = compute_penalty_rate
        self.calibration_weight = calibration_weight
        self.abstention_bonus = abstention_bonus
        self.hallucination_penalty = hallucination_penalty
        self.confident_wrong_penalty = confident_wrong_penalty
        self.gaming_penalty = gaming_penalty

    def score(self, mode, action, context, result, agent_id=""):
        """Dispatch to the scorer for ``mode``.

        An unrecognised mode yields an all-zero result tagged
        ``"unknown_mode"`` instead of raising.
        """
        if mode == "code":
            return self._score_code(action, context, result, agent_id)
        if mode == "retrieval_qa":
            return self._score_qa(action, context, result, agent_id)
        if mode == "debate":
            return self._score_debate(action, context, result, agent_id)
        return OracleResult(0.0, 0.0, 0.0, {}, f"unknown mode {mode}", ["unknown_mode"])

    def _score_code(self, action, context, result, agent_id):
        """Score a code attempt described by the ``result`` dict.

        Passing the public tests while failing the hidden ones is tagged
        ``"gaming_hidden_tests"`` and costs ``gaming_penalty``.
        """
        weights = self.code_weights
        correctness = result.get("correctness", 0.0)
        pass_at_k = result.get("pass_at_k", 0.0)
        regressed = result.get("regression", False)
        compute_cost = result.get("compute_cost", 0.0)
        # Both pass flags fall back to the correctness value when absent.
        hidden_ok = result.get("hidden_tests_pass", correctness)
        public_ok = result.get("public_pass", correctness)

        gamed = bool(public_ok and not hidden_ok)
        tags = ["gaming_hidden_tests"] if gamed else []

        correctness_term = correctness * weights["correctness"]
        pass_term = pass_at_k * weights["pass_at_k"]
        regression_term = weights["regression"] if regressed else 0.0
        compute_term = compute_cost * weights.get("compute_penalty", self.compute_penalty_rate)
        raw = correctness_term + pass_term + regression_term - compute_term
        if gamed:
            raw -= self.gaming_penalty

        cost_adj = raw - compute_cost * self.compute_penalty_rate
        return OracleResult(
            raw_score=raw,
            cost_adjusted_score=cost_adj,
            confidence=result.get("confidence", correctness),
            evidence={"correctness": correctness},
            reason=f"corr={correctness:.2f}, cost={compute_cost}",
            failure_tags=tags,
            reward_value=cost_adj,
        )

    def _score_qa(self, action, context, result, agent_id):
        """Score a retrieval-QA answer, or an abstention.

        Abstaining earns ``abstention_bonus`` when the context is marked
        unanswerable and loses the same amount otherwise.  An actual answer
        is compared with ``context["gold_answer"]`` and adjusted for evidence
        support, hallucination, over-confidence, compute and calibration.
        """
        gold = context.get("gold_answer", "")
        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.5)
        compute_cost = result.get("compute_cost", 0.0)

        if action.get("abstained", False):
            justified = context.get("is_unanswerable", False)
            raw = self.abstention_bonus if justified else -self.abstention_bonus
            tag = "correct_abstention" if justified else "wrong_abstention"
            # On this path the reward is the raw value, not the cost-adjusted one.
            return OracleResult(raw, raw - compute_cost * self.compute_penalty_rate,
                                confidence, {}, f"abstain correct={justified}", [tag], raw)

        correctness = self._fuzzy_match(answer, gold)
        evidence = result.get("evidence", {})
        entailment = evidence.get("entailment_score", 0.0)
        contradiction = evidence.get("contradiction_score", 0.0)
        hallucination = contradiction > 0.5
        confident_wrong = confidence > 0.8 and correctness < 0.5

        tag_rules = [
            (hallucination, "hallucination"),
            (confident_wrong, "confident_wrong"),
            (compute_cost > 2000, "excessive_compute"),
            (compute_cost > 500 and correctness < 0.5, "compute_waste"),
        ]
        tags = [label for triggered, label in tag_rules if triggered]

        correctness_term = correctness * self.qa_weights["correctness"]
        support_term = entailment * self.qa_weights.get("evidence_support", 0.5)
        hallucination_term = self.hallucination_penalty if hallucination else 0.0
        overconfidence_term = self.confident_wrong_penalty if confident_wrong else 0.0
        compute_term = compute_cost * self.compute_penalty_rate
        raw = correctness_term + support_term - hallucination_term - overconfidence_term - compute_term

        # Calibration bonus: the closer confidence is to correctness, the larger it is.
        brier = (confidence - correctness) ** 2
        raw = raw + (1 - brier) * self.calibration_weight

        cost_adj = raw - compute_cost * self.compute_penalty_rate
        if compute_cost > 100 and raw < 0.5:
            # Non-trivial compute spent on a weak result costs half a gaming penalty.
            cost_adj -= self.gaming_penalty * 0.5
        return OracleResult(raw, cost_adj, confidence, evidence,
                            f"corr={correctness:.2f}, conf={confidence:.2f}", tags, cost_adj)

    def _score_debate(self, action, context, result, agent_id):
        """Score a debate contribution; spam and collusion flags are penalised."""
        weights = self.debate_weights
        decision_quality = result.get("decision_quality", 0.0)
        marginal = result.get("marginal_contribution", 0.0)
        tokens = result.get("tokens", 0)
        # Without an explicit compute cost, the token count stands in for it.
        compute_cost = result.get("compute_cost", tokens)
        is_spam = result.get("spam", False)
        is_collusion = result.get("collusion", False)

        tag_rules = [
            (is_spam, "spam"),
            (is_collusion, "collusion"),
            (tokens > 5000, "verbose_waste"),
        ]
        tags = [label for triggered, label in tag_rules if triggered]

        quality_term = decision_quality * weights["decision_quality"]
        marginal_term = marginal * weights["marginal_contribution"]
        efficiency_term = (1.0 / max(tokens, 1)) * weights["influence_efficiency"]
        compute_term = compute_cost * self.compute_penalty_rate
        raw = quality_term + marginal_term + efficiency_term - compute_term
        if is_spam:
            raw -= self.gaming_penalty
        if is_collusion:
            raw -= self.gaming_penalty * 2

        cost_adj = raw - compute_cost * self.compute_penalty_rate
        return OracleResult(raw, cost_adj, result.get("confidence", 0.5),
                            {"marginal": marginal},
                            f"dq={decision_quality:.2f}, tokens={tokens}", tags, cost_adj)

    def _fuzzy_match(self, a, b):
        """Return 1.0 for a case-insensitive exact match, 0.5 when one string
        contains the other, and 0.0 otherwise (including empty/None input)."""
        if not (a and b):
            return 0.0
        left = a.strip().lower()
        right = b.strip().lower()
        if left == right:
            return 1.0
        if left in right or right in left:
            return 0.5
        return 0.0


@dataclass
class LedgerEntry:
    """One row of the ``CreditLedger`` journal (an earn, spend or decay event)."""
    agent_id: str
    task_id: str
    action_id: str
    earned_credit: float      # credit added by this event
    spent_credit: float       # credit removed by an explicit spend
    decayed_credit: float     # credit removed by decay
    remaining_credit: float   # balance after the event
    reason: str
    oracle_score: float
    compute_cost: float
    timestamp: float
    capability_scope: str = "global"


class CreditLedger:
    """Per-agent, per-capability credit balances with a journal of changes.

    NOTE: decay is applied on every ``earn``/``spend``/``balance`` call for
    the touched (agent, capability) pair, regardless of elapsed time, so
    reading a positive balance also shrinks it.
    """

    def __init__(self, decay_lambda=0.05):
        """Start with no entries and no balances.

        ``decay_lambda`` is the fraction of a positive balance removed on
        each access.
        """
        self.decay_lambda = decay_lambda
        self.balances = {}
        self.entries = []

    def earn(self, agent_id, task_id, action_id, amount, oracle_score, compute_cost, reason, capability_scope="global"):
        """Decay the current balance, then credit ``amount`` and journal it."""
        stamp = time.time()
        self._apply_decay(agent_id, stamp, capability_scope)
        updated = self._get(agent_id, capability_scope) + amount
        self.entries.append(LedgerEntry(
            agent_id=agent_id, task_id=task_id, action_id=action_id,
            earned_credit=amount, spent_credit=0.0, decayed_credit=0.0,
            remaining_credit=updated, reason=reason, oracle_score=oracle_score,
            compute_cost=compute_cost, timestamp=stamp,
            capability_scope=capability_scope,
        ))
        self._set(agent_id, capability_scope, updated)

    def spend(self, agent_id, task_id, action_id, amount, capability_scope="global", reason="spend"):
        """Decay, then debit ``amount`` if affordable.

        Returns ``True`` on success; ``False`` (with no debit entry) when the
        decayed balance is smaller than ``amount``.
        """
        stamp = time.time()
        self._apply_decay(agent_id, stamp, capability_scope)
        available = self._get(agent_id, capability_scope)
        if available < amount:
            return False
        updated = available - amount
        self.entries.append(LedgerEntry(
            agent_id=agent_id, task_id=task_id, action_id=action_id,
            earned_credit=0.0, spent_credit=amount, decayed_credit=0.0,
            remaining_credit=updated, reason=reason, oracle_score=0.0,
            compute_cost=0.0, timestamp=stamp,
            capability_scope=capability_scope,
        ))
        self._set(agent_id, capability_scope, updated)
        return True

    def transfer(self, from_agent, to_agent, amount, capability_scope="global"):
        """Always refuse: credits are non-transferable."""
        return False

    def balance(self, agent_id, capability_scope="global"):
        """Return the balance after applying one decay step to it."""
        self._apply_decay(agent_id, time.time(), capability_scope)
        return self._get(agent_id, capability_scope)

    def _get(self, agent_id, cap):
        """Stored balance for (agent, capability); 0.0 when unknown."""
        per_agent = self.balances.get(agent_id, {})
        return per_agent.get(cap, 0.0)

    def _set(self, agent_id, cap, val):
        """Overwrite the stored balance for (agent, capability)."""
        self.balances.setdefault(agent_id, {})[cap] = val

    def _apply_decay(self, agent_id, now, cap):
        """Shrink a positive balance by ``decay_lambda`` and journal the loss."""
        before = self._get(agent_id, cap)
        if before <= 0:
            return
        after = before * (1 - self.decay_lambda)
        if not (after < before):
            # Nothing was lost (e.g. decay_lambda == 0): leave no journal row.
            return
        self.entries.append(LedgerEntry(
            agent_id, "decay", "decay", 0.0, 0.0, before - after, after,
            "credit_decay", 0.0, 0.0, now, cap,
        ))
        self._set(agent_id, cap, after)

    def detect_collusion(self, window=10):
        """Report the agents seen in the last ``window`` journal rows.

        Returns ``None`` when fewer than two distinct agents appear there,
        otherwise a dict with the agent ids and the number of rows inspected.
        """
        recent = self.entries[-window:]
        agents = {row.agent_id for row in recent}
        if len(agents) >= 2:
            return {"suspicious_agents": list(agents), "count": len(recent)}
        return None


class Decision(Enum):
    """Possible outcomes of a ``ResourceBroker`` request."""
    ALLOW = "allow"
    DENY = "deny"
    REQUIRE_APPROVAL = "require_approval"
    DOWNGRADE = "downgrade"
    ESCALATE = "escalate"
    ASK_JUSTIFICATION = "ask_justification"


@dataclass
class ResourceDecision:
    """Verdict produced by ``ResourceBroker.request``."""
    decision: Decision                  # outcome chosen by the broker
    reason: str                         # explanation string for the outcome
    capability: str                     # capability that was requested
    downgrade_to: Optional[str] = None  # alternative capability offered on DOWNGRADE


class ResourceBroker:
    """Gate capability requests on an agent's credit balance and risk."""

    # Risk class per capability; capabilities not listed are treated as "medium".
    RESOURCE_RISK = {
        "model_call": "medium",
        "retrieval_call": "low",
        "verifier_call": "medium",
        "debate_turn": "low",
        "file_write": "high",
        "shell_execute": "high",
        "memory_write": "medium",
        "human_escalation": "high",
        "larger_model": "medium",
    }
    # Credit balance required for an outright ALLOW, per risk class.
    DEFAULT_THRESHOLDS = {"low": 0.5, "medium": 2.0, "high": 5.0}

    def __init__(self, thresholds=None, urgency_boost=0.5):
        """Use ``thresholds`` when given (and non-empty), else a copy of the defaults."""
        self.thresholds = thresholds if thresholds else self.DEFAULT_THRESHOLDS.copy()
        self.urgency_boost = urgency_boost
        self.denial_history = {}

    def request(self, capability, agent_id, credit_balance, task_state=None, risk_score=0.0, gaming_flags=None):
        """Decide whether ``agent_id`` may use ``capability``.

        Checks run in this order: gaming flags (DENY), high-risk capability
        with ``risk_score`` above 0.7 (REQUIRE_APPROVAL), balance at or above
        the urgency-adjusted threshold (ALLOW), balance at or above half of
        it (DOWNGRADE for medium risk, otherwise ASK_JUSTIFICATION), more
        than three earlier balance denials (ESCALATE), and finally DENY,
        which also increments the agent's denial count.
        """
        if not task_state:
            task_state = {}
        if not gaming_flags:
            gaming_flags = []

        risk_class = self.RESOURCE_RISK.get(capability, "medium")
        base_threshold = self.thresholds.get(risk_class, 2.0)
        urgency = task_state.get("urgency", 0.0)
        # Urgency lowers the bar, but never below 0.1.
        adjusted = max(0.1, base_threshold - urgency * self.urgency_boost)

        if gaming_flags:
            return ResourceDecision(decision=Decision.DENY,
                                    reason=f"Gaming: {gaming_flags}",
                                    capability=capability)

        if risk_class == "high" and risk_score > 0.7:
            return ResourceDecision(decision=Decision.REQUIRE_APPROVAL,
                                    reason=f"High risk {risk_score:.2f}",
                                    capability=capability)

        if credit_balance >= adjusted:
            return ResourceDecision(decision=Decision.ALLOW,
                                    reason=f"Balance {credit_balance:.2f} >= {adjusted:.2f}",
                                    capability=capability)

        if credit_balance >= adjusted * 0.5:
            if risk_class != "medium":
                return ResourceDecision(decision=Decision.ASK_JUSTIFICATION,
                                        reason="Justification required",
                                        capability=capability)
            return ResourceDecision(decision=Decision.DOWNGRADE,
                                    reason=f"Downgrading from {capability}",
                                    capability=capability,
                                    downgrade_to="retrieval_call")

        denials = self.denial_history.get(agent_id, 0)
        if denials > 3:
            return ResourceDecision(decision=Decision.ESCALATE,
                                    reason=f"Denied {denials} times",
                                    capability=capability)

        self.denial_history[agent_id] = denials + 1
        return ResourceDecision(decision=Decision.DENY,
                                reason=f"Balance {credit_balance:.2f} < {adjusted:.2f}",
                                capability=capability)


# --- BENCHMARK SIMULATIONS (INLINE) ---

@dataclass
class CodeProblem:
    """A synthetic coding task with its difficulty knobs."""
    task_id: str
    difficulty: float
    hidden_test_difficulty: float
    public_test_difficulty: float


class SimulatedCodeAgent:
    """Stochastic stand-in for a code-writing agent."""

    def __init__(self, agent_id, pass_rate_easy=0.9, pass_rate_hard=0.3, hidden_test_falloff=0.15, cost_per_attempt=200):
        """Record the agent's pass rates, hidden-test falloff and cost per attempt."""
        self.agent_id = agent_id
        self.attempts = 0
        self.cost_per_attempt = cost_per_attempt
        self.hidden_test_falloff = hidden_test_falloff
        self.pass_rate_easy = pass_rate_easy
        self.pass_rate_hard = pass_rate_hard

    def solve(self, problem):
        """Simulate one attempt and return the public/hidden outcome and its cost.

        The pass probability interpolates between the easy and hard pass
        rates by ``problem.difficulty``; the hidden-test probability is that
        value reduced by ``hidden_test_falloff * problem.hidden_test_difficulty``
        and floored at 0.  Two random draws are made, public first.
        """
        self.attempts += 1
        base_acc = self.pass_rate_easy * (1 - problem.difficulty) + self.pass_rate_hard * problem.difficulty
        public_roll = random.random()
        hidden_acc = max(0.0, base_acc - self.hidden_test_falloff * problem.hidden_test_difficulty)
        hidden_roll = random.random()
        return {
            "public_pass": public_roll < base_acc,
            "hidden_pass": hidden_roll < hidden_acc,
            "compute_cost": self.cost_per_attempt,
        }


def gen_code_problems(n, seed):
    """Build ``n`` random ``CodeProblem`` instances named ``task_0`` ... ``task_{n-1}``.

    Reseeds both the global ``random`` module and NumPy's global RNG with
    ``seed``.  Per problem, three uniform draws are taken in the order
    difficulty, hidden-test difficulty, public-test difficulty.
    """
    random.seed(seed)
    np.random.seed(seed)
    problems = []
    for index in range(n):
        difficulty = random.random()
        hidden_difficulty = random.random()
        public_difficulty = random.random()
        problems.append(CodeProblem(f"task_{index}", difficulty, hidden_difficulty, public_difficulty))
    return problems


def run_code_baseline(problems, agent):
    """Run a single agent once over every problem, with no OCC machinery.

    Args:
        problems: iterable of problem objects handed to ``agent.solve``.
        agent: object whose ``solve(problem)`` returns a dict containing at
            least ``"public_pass"`` and ``"compute_cost"``.

    Returns:
        Dict with ``accuracy`` (fraction of public passes), ``total_compute``
        and ``mean_compute``.  An empty problem set yields zeros instead of
        raising ``ZeroDivisionError``.
    """
    results = [agent.solve(problem) for problem in problems]
    total = sum(r["compute_cost"] for r in results)
    attempted = len(results)
    if attempted == 0:
        return {"accuracy": 0.0, "total_compute": 0, "mean_compute": 0.0}
    passed = sum(1 for r in results if r["public_pass"])
    return {"accuracy": passed / attempted, "total_compute": total, "mean_compute": total / attempted}


def run_code_occ(problems, agents, oracle, ledger, broker, max_attempts=3):
    """Run the code benchmark with agents tried cheapest-per-quality first.

    Each agent is seeded with ``"model_call"`` credit proportional to its
    mean pass rate.  For every problem, agents are tried in ranked order
    until the public tests pass, the hidden tests pass, or ``max_attempts``
    agents have been used.  Every attempt is scored by the oracle; a positive
    raw score earns credit, anything else spends one credit.

    Args:
        problems: sequence of problems (each needs a ``task_id``).
        agents: agents exposing ``agent_id``, ``pass_rate_easy``,
            ``pass_rate_hard``, ``cost_per_attempt`` and ``solve``.
        oracle: object with ``score(mode, action, context, result, agent_id=...)``.
        ledger: object with ``earn`` and ``spend``.
        broker: accepted for signature parity with the other runners; it is
            not consulted here.
        max_attempts: maximum number of agents tried per problem.

    Returns:
        Dict with ``accuracy``, ``total_compute``, ``mean_compute`` and
        ``mean_agents``.  An empty problem set yields zeros instead of
        raising ``ZeroDivisionError``.
    """
    total = 0
    results = []
    for a in agents:
        q = (a.pass_rate_easy + a.pass_rate_hard) / 2
        ledger.earn(a.agent_id, "seed", "seed", q * 20, 0.0, 0.0, "initial", "model_call")

    # The ranking depends only on static agent attributes, so compute it once.
    ranked = sorted(agents, key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2))

    for p in problems:
        solved = False
        cost = 0
        used = []
        for agent in ranked:
            if solved or len(used) >= max_attempts:
                break
            r = agent.solve(p)
            cost += r["compute_cost"]
            total += r["compute_cost"]
            used.append(agent.agent_id)
            solved = r["public_pass"]
            hidden = r["hidden_pass"]
            # The oracle sees the cumulative cost spent on this problem so far.
            ora = oracle.score("code", {"attempt": len(used)}, {},
                               {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
                                "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
                               agent_id=agent.agent_id)
            if ora.raw_score > 0:
                ledger.earn(agent.agent_id, p.task_id, "solve", ora.raw_score * 5, ora.raw_score, cost, "pass", "model_call")
            else:
                ledger.spend(agent.agent_id, p.task_id, "solve", 1.0, "model_call", "fail")
            if hidden:
                break
        results.append({"solved": solved, "cost": cost, "agents": used})

    attempted = len(results)
    if attempted == 0:
        return {"accuracy": 0.0, "total_compute": total, "mean_compute": 0.0, "mean_agents": 0.0}
    acc = sum(1 for r in results if r["solved"]) / attempted
    return {"accuracy": acc, "total_compute": total, "mean_compute": total / attempted,
            "mean_agents": sum(len(r["agents"]) for r in results) / attempted}


def create_qa_dataset(seed=42, n=50):
    """Generate ``n`` synthetic QA items after reseeding the global RNG.

    Every item carries an id, a placeholder question, a randomly chosen
    type and answer, one to three evidence snippets, and an
    ``is_unanswerable`` flag that is true exactly for type ``"unanswerable"``.
    """
    random.seed(seed)
    question_types = ["answerable", "unanswerable", "misleading", "incomplete", "conflicting"]
    candidate_answers = ["paris", "42", "yes", "no", "tokyo"]
    evidence_pool = ["alpha", "beta", "gamma", "delta"]

    dataset = []
    for idx in range(n):
        # Draw order (type, answer, evidence size, evidence) fixes the output for a seed.
        kind = random.choice(question_types)
        gold = random.choice(candidate_answers)
        snippet_count = random.randint(1, 3)
        snippets = random.sample(evidence_pool, k=snippet_count)
        dataset.append({
            "id": f"q_{idx}",
            "question": f"Q{idx}",
            "type": kind,
            "answer": gold,
            "evidence": snippets,
            "is_unanswerable": kind == "unanswerable",
        })
    return dataset


def run_qa_occ(dataset, agent_params, oracle, ledger, broker):
    """Run the retrieval-QA benchmark for a single simulated ``"qa_agent"``.

    The agent is seeded with 20 ``"retrieval_call"`` credits.  For each item
    the broker is asked for a retrieval call; a DENY skips the item.  The
    agent abstains on unanswerable items and on a random miss (probability
    ``1 - agent_params["acc"]``), otherwise it returns the item's answer.
    The oracle scores the outcome; a positive raw score earns credit and
    counts as correct, anything else spends 0.5 credit.

    Args:
        dataset: sequence of item dicts as produced by ``create_qa_dataset``
            (``"id"``, ``"type"``, ``"answer"``, ``"is_unanswerable"``, ...).
        agent_params: dict with the answer accuracy under ``"acc"``.
        oracle: object with ``score(mode, action, context, result, agent_id)``.
        ledger: object with ``earn``, ``spend`` and ``balance``.
        broker: object with ``request``.

    Returns:
        Dict with ``accuracy``, ``total_compute`` and ``mean_compute``, all
        averaged over the full dataset (skipped items included).  An empty
        dataset yields zeros instead of raising ``ZeroDivisionError``.
    """
    total_compute = 0
    correct = 0
    ledger.earn("qa_agent", "seed", "seed", 20, 0.0, 0.0, "initial", "retrieval_call")
    for item in dataset:
        balance = ledger.balance("qa_agent", "retrieval_call")
        dec = broker.request("retrieval_call", "qa_agent", balance, task_state={"urgency": 0.5})
        if dec.decision == Decision.DENY:
            continue
        # Anything other than a plain ALLOW runs with a reduced token budget.
        tokens = 200 if dec.decision == Decision.ALLOW else 100
        total_compute += tokens
        should_answer = item["type"] != "unanswerable"
        ans = item["answer"] if (should_answer and random.random() < agent_params["acc"]) else None
        conf = 0.9 if ans else 0.3
        # The oracle reads the reference answer from "gold_answer", while the
        # dataset stores it under "answer".  Passing the item unchanged made
        # every real answer compare against "" and score as confidently wrong.
        context = dict(item)
        context.setdefault("gold_answer", item["answer"])
        ora = oracle.score("retrieval_qa", {"abstained": ans is None}, context,
                           {"answer": ans, "confidence": conf, "evidence": {}, "compute_cost": tokens}, "qa_agent")
        if ora.raw_score > 0:
            ledger.earn("qa_agent", item["id"], "ans", ora.raw_score * 3, ora.raw_score, tokens, "correct", "retrieval_call")
            correct += 1
        else:
            ledger.spend("qa_agent", item["id"], "ans", 0.5, "retrieval_call", "wrong")

    n_items = len(dataset)
    if n_items == 0:
        return {"accuracy": 0.0, "total_compute": total_compute, "mean_compute": 0.0}
    return {"accuracy": correct / n_items, "total_compute": total_compute, "mean_compute": total_compute / n_items}


def run_debate_occ(n_debates, n_agents, agent_configs, oracle, ledger, broker, seed=42):
    """Simulate ``n_debates`` majority-vote debates gated by credit.

    Each debate picks a hidden truth among ``"A"``, ``"B"``, ``"C"``.  Every
    configured agent asks the broker for a ``"debate_turn"``; a DENY skips
    the agent, otherwise the agent votes and pays one credit.  Honest agents
    vote for the truth with probability ``acc`` and randomly otherwise;
    dishonest agents always vote randomly.  When the majority matches the
    truth, each honest voter earns two credits.

    Ties in the majority vote go to the option that was voted for first.
    The previous ``max(set(...))`` form depended on set iteration order,
    which for strings varies between interpreter runs (hash randomization),
    so results were not reproducible for a fixed ``seed``.

    Args:
        n_debates: number of debates to simulate.
        n_agents: accepted for interface compatibility; the agents are taken
            from ``agent_configs``.
        agent_configs: list of dicts with ``"id"``, ``"honest"``, ``"acc"``
            and optionally ``"tokens"`` (default 200).
        oracle: accepted for interface compatibility; not consulted here.
        ledger: object with ``balance``, ``spend`` and ``earn``.
        broker: object with ``request``.
        seed: seed for the global ``random`` module.

    Returns:
        Dict with ``accuracy``, ``consensus_reached``, ``total_compute`` and
        ``mean_compute``, averaged over ``n_debates``.  Zero debates yields
        zeros instead of raising ``ZeroDivisionError``.
    """
    random.seed(seed)
    correct = 0
    total_compute = 0
    consensus = 0
    for _round in range(n_debates):
        truth = random.choice(["A", "B", "C"])
        agents = []
        for cfg in agent_configs:
            # The draw for dishonest agents is kept so the RNG sequence is unchanged.
            acc = cfg["acc"] if cfg["honest"] else random.random() * 0.4
            agents.append({"honest": cfg["honest"], "acc": acc, "id": cfg["id"], "tokens": cfg.get("tokens", 200)})
        votes = []
        for a in agents:
            balance = ledger.balance(a["id"], "debate_turn")
            dec = broker.request("debate_turn", a["id"], balance)
            if dec.decision == Decision.DENY:
                continue
            total_compute += a["tokens"]
            vote = truth if (a["honest"] and random.random() < a["acc"]) else random.choice(["A", "B", "C"])
            votes.append((a["id"], vote, a["honest"], a["acc"]))
            ledger.spend(a["id"], "debate", "turn", 1.0, "debate_turn", "participate")
        if not votes:
            continue
        # Dicts keep insertion order and max() returns the first maximum, so
        # a tie is resolved in favour of the earliest-cast option.
        tally = {}
        for _, vote, _, _ in votes:
            tally[vote] = tally.get(vote, 0) + 1
        final = max(tally, key=tally.get)
        if final == truth:
            correct += 1
            for vid, _, honest, _ in votes:
                if honest:
                    ledger.earn(vid, "debate", "consensus", 2.0, 1.0, 0, "consensus", "debate_turn")
        if len(tally) == 1:
            consensus += 1
    if n_debates <= 0:
        return {"accuracy": 0.0, "consensus_reached": 0.0, "total_compute": total_compute, "mean_compute": 0.0}
    n = n_debates
    return {"accuracy": correct / n, "consensus_reached": consensus / n, "total_compute": total_compute, "mean_compute": total_compute / n}


# --- ABLATIONS & ANTI-GAMING ---

# Ablation configurations consumed by run_all().  Each tuple is unpacked
# positionally as:
#   (name, description, decay, game_pen, comp_pen, anti_on, broker_thresh)
# decay         -> CreditLedger(decay_lambda=...)
# game_pen      -> ImpactOracle(gaming_penalty=...), forced to 0.0 when anti_on is False
# comp_pen      -> ImpactOracle(compute_penalty_rate=...)
# broker_thresh -> ResourceBroker(thresholds=...); an empty dict selects the defaults
ABLATIONS = [
    ("default", "Full OCC", 0.02, 2.0, 0.0001, True, {}),
    ("no_decay", "No credit decay", 0.0, 2.0, 0.0001, True, {}),
    ("fast_decay", "Aggressive decay", 0.1, 2.0, 0.0001, True, {}),
    ("no_gaming_penalty", "No gaming penalties", 0.02, 0.0, 0.0001, True, {}),
    ("high_gaming_penalty", "Severe gaming penalties", 0.02, 5.0, 0.0001, True, {}),
    ("lenient_broker", "Lenient broker", 0.02, 2.0, 0.0001, True, {"low": 0.25, "medium": 1.0, "high": 2.5}),
    ("strict_broker", "Strict broker", 0.02, 2.0, 0.0001, True, {"low": 1.0, "medium": 4.0, "high": 10.0}),
    ("high_compute_cost", "High compute penalty", 0.02, 2.0, 0.001, True, {}),
    ("low_compute_cost", "Low compute penalty", 0.02, 2.0, 0.00001, True, {}),
    ("anti_gaming_off", "Anti-gaming disabled", 0.02, 2.0, 0.0001, False, {}),
]


def run_all(output_dir="/app/occ/reports"):
    """Run every ablation and anti-gaming check, save a JSON report, print a summary.

    For each entry of ``ABLATIONS`` the code, QA and debate benchmarks are
    run with a freshly configured oracle, ledger and broker.  Four
    anti-gaming probes follow (hidden-test gaming, credit transfer,
    over-abstention, compute spam).  The collected results are written to
    ``<output_dir>/eval_runner_results.json``.

    Args:
        output_dir: directory for the JSON report; created if missing.
            Defaults to the previously hard-coded ``/app/occ/reports``.

    Returns:
        The results dict with the top-level keys ``"ablations"`` and
        ``"anti_gaming"``.
    """
    print("=" * 60)
    print("OCC UNIFIED EVALUATION RUNNER (SELF-CONTAINED)")
    print("=" * 60)
    seed = 42
    n_problems = 100
    n_qa = 100
    n_debates = 50

    results = {"ablations": {}, "anti_gaming": {}}

    # Ablations
    for name, desc, decay, game_pen, comp_pen, anti_on, broker_thresh in ABLATIONS:
        print(f"\n--- ABLATION: {name} ---")
        oracle = ImpactOracle(compute_penalty_rate=comp_pen, gaming_penalty=game_pen if anti_on else 0.0)
        ledger = CreditLedger(decay_lambda=decay)
        broker = ResourceBroker(thresholds=broker_thresh if broker_thresh else None)

        problems = gen_code_problems(n_problems, seed)
        cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60)
        medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150)
        expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350)
        code_res = run_code_occ(problems, [cheap, medium, expensive], oracle, ledger, broker, max_attempts=3)
        print(f"  Code: acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")

        qa_data = create_qa_dataset(seed=seed, n=n_qa)
        qa_res = run_qa_occ(qa_data, {"acc": 0.85}, oracle, ledger, broker)
        print(f"  QA:   acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")

        # The debate benchmark gets its own ledger and broker, seeded with 5 credits per agent.
        ledger2 = CreditLedger(decay_lambda=decay)
        broker2 = ResourceBroker(thresholds=broker_thresh if broker_thresh else None)
        for i in range(3):
            ledger2.earn(f"f{i}", "seed", "seed", 5, 0, 0, "initial", "debate_turn")
        debate_res = run_debate_occ(n_debates, 3,
            [{"id": f"f{i}", "honest": True, "acc": 0.9, "tokens": 200} for i in range(3)],
            oracle, ledger2, broker2, seed=seed)
        print(f"  Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")

        results["ablations"][name] = {"description": desc, "code": code_res, "qa": qa_res, "debate": debate_res}

    # Anti-gaming
    print("\n--- ANTI-GAMING TESTS ---")

    # Hidden-test gaming
    oracle = ImpactOracle(gaming_penalty=2.0)
    normal_res = []
    gamer_res = []
    for _ in range(50):
        public_pass = random.random() < 0.9
        # The former "... if True else ..." alternative could never be taken.
        hidden_pass = random.random() < 0.5
        ora_normal = oracle.score("code", {}, {}, {"correctness": 1.0 if public_pass else 0.0, "pass_at_k": 1.0 if hidden_pass else 0.0, "compute_cost": 150, "public_pass": public_pass, "hidden_tests_pass": hidden_pass})
        normal_res.append(ora_normal.raw_score)
        # Gamer: always passes public, fails hidden
        ora_gamer = oracle.score("code", {}, {}, {"correctness": 1.0, "pass_at_k": 0.0, "compute_cost": 100, "public_pass": True, "hidden_tests_pass": False})
        gamer_res.append(ora_gamer.raw_score)
    results["anti_gaming"]["hidden_test_gaming"] = {
        "normal_mean_raw": sum(normal_res) / len(normal_res),
        "gamer_mean_raw": sum(gamer_res) / len(gamer_res),
        "gamer_penalized_rate": sum(1 for r in gamer_res if r < 0) / len(gamer_res),
    }
    print(f"  Hidden-test gaming: normal={results['anti_gaming']['hidden_test_gaming']['normal_mean_raw']:.2f}, gamer={results['anti_gaming']['hidden_test_gaming']['gamer_mean_raw']:.2f}")

    # Collusion
    ledger = CreditLedger()
    ledger.earn("alice", "seed", "seed", 10, 0, 0, "initial")
    ledger.earn("bob", "seed", "seed", 1, 0, 0, "initial")
    ok = ledger.transfer("alice", "bob", 5.0)
    # balance() applies decay on every read, so read each balance exactly once
    # and reuse it; otherwise the printed values differ from the saved ones.
    alice_balance = ledger.balance("alice")
    bob_balance = ledger.balance("bob")
    results["anti_gaming"]["collusion"] = {
        "transfer_allowed": ok,
        "alice_balance": alice_balance,
        "bob_balance": bob_balance,
        "blocked": not ok,
    }
    print(f"  Collusion: transfer_allowed={ok}, alice={alice_balance:.1f}, bob={bob_balance:.1f}")

    # Over-abstention
    oracle = ImpactOracle()
    abstention_rewards = []
    for _ in range(10):
        res = oracle.score("retrieval_qa", {"abstained": True}, {"is_unanswerable": False, "gold_answer": "yes"},
                           {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50})
        abstention_rewards.append(res.reward_value)
    results["anti_gaming"]["abstention"] = {
        "mean_reward": sum(abstention_rewards) / len(abstention_rewards),
        "negative": sum(abstention_rewards) < 0,
    }
    print(f"  Abstention: mean_reward={results['anti_gaming']['abstention']['mean_reward']:.2f}, negative={results['anti_gaming']['abstention']['negative']}")

    # Spam
    oracle = ImpactOracle()
    spam_res = oracle.score("retrieval_qa", {}, {"gold_answer": "paris"},
                            {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000})
    results["anti_gaming"]["spam"] = {
        "reward": spam_res.reward_value,
        "tags": spam_res.failure_tags,
    }
    print(f"  Spam: reward={spam_res.reward_value:.2f}, tags={spam_res.failure_tags}")

    # Save
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    report_path = out / "eval_runner_results.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nSaved to {report_path}")

    # Summary table
    print("\n" + "=" * 60)
    print("ABLATION SUMMARY")
    print("=" * 60)
    print(f"{'Name':<20} {'Code Acc':>10} {'Code Comp':>10} {'QA Acc':>10} {'QA Comp':>10} {'Deb Acc':>10} {'Deb Comp':>10}")
    for name, data in results["ablations"].items():
        print(f"{name:<20} {data['code']['accuracy']:>10.3f} {data['code']['total_compute']:>10.0f} "
              f"{data['qa']['accuracy']:>10.3f} {data['qa']['total_compute']:>10.0f} "
              f"{data['debate']['accuracy']:>10.3f} {data['debate']['total_compute']:>10.0f}")

    return results


# Run the full evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    run_all()