Upload jobs/run_eval_standalone.py
Browse files- jobs/run_eval_standalone.py +511 -0
jobs/run_eval_standalone.py
ADDED
|
@@ -0,0 +1,511 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Self-contained evaluation runner for OCC stack.
|
| 3 |
+
Includes all core classes inline + all simulated benchmarks + ablations + anti-gaming.
|
| 4 |
+
Runs on CPU. Outputs JSON report.
|
| 5 |
+
"""
|
| 6 |
+
import json
|
| 7 |
+
import random
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
from enum import Enum
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Any, Dict, List, Optional
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
|
| 17 |
+
# --- CORE CLASSES (INLINE) ---
|
| 18 |
+
|
| 19 |
+
@dataclass
class OracleResult:
    """Outcome of one ``ImpactOracle`` scoring call.

    Instances are built positionally by the oracle, so the field order below
    is part of the contract.
    """

    # Score as computed by the mode-specific formula.
    raw_score: float
    # ``raw_score`` after the oracle subtracts its compute-cost term.
    cost_adjusted_score: float
    # Confidence reported with the result (or the oracle's fallback value).
    confidence: float
    # Free-form supporting data attached by the scoring routine.
    evidence: Dict[str, Any]
    # Short human-readable summary of how the score was reached.
    reason: str
    # Labels such as "hallucination" or "spam"; a fresh list per instance.
    failure_tags: List[str] = field(default_factory=list)
    # Value the caller should treat as the reward for this action.
    reward_value: float = 0.0
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ImpactOracle:
    """Rule-based scorer that turns a benchmark result dict into an ``OracleResult``.

    ``score`` dispatches on ``mode`` to one of three formulas: ``"code"``,
    ``"retrieval_qa"`` or ``"debate"``.  Each formula reads plain dicts
    (``action``, ``context``, ``result``) with ``.get`` so missing keys fall
    back to neutral defaults.

    NOTE(review): every formula subtracts a compute term inside ``raw`` and
    then subtracts ``compute_cost * compute_penalty_rate`` again for the
    cost-adjusted score, so compute is effectively charged twice in the
    adjusted value.  Kept as-is; confirm this is intended.
    """

    def __init__(self, compute_penalty_rate=0.0001, calibration_weight=0.2,
                 abstention_bonus=1.0, hallucination_penalty=2.0,
                 confident_wrong_penalty=3.0, gaming_penalty=2.0,
                 code_weights=None, qa_weights=None, debate_weights=None):
        """Store penalty rates and per-mode weight tables.

        A weight table that is ``None`` or empty is replaced by the built-in
        defaults below.  Of the QA table only ``"correctness"`` and
        ``"evidence_support"`` are read by the scoring code; the QA penalties
        and the calibration weight come from the scalar arguments instead.
        The debate table's ``"throughput"`` entry is not read either.
        """
        self.code_weights = code_weights or {"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001}
        self.qa_weights = qa_weights or {"correctness": 1.0, "evidence_support": 0.5, "calibration": 0.2, "abstention_utility": 1.0, "hallucination_penalty": 2.0, "confident_wrong_penalty": 3.0}
        self.debate_weights = debate_weights or {"decision_quality": 1.0, "influence_efficiency": 0.5, "throughput": 0.3, "marginal_contribution": 0.5}
        self.compute_penalty_rate = compute_penalty_rate
        self.calibration_weight = calibration_weight
        self.abstention_bonus = abstention_bonus
        self.hallucination_penalty = hallucination_penalty
        self.confident_wrong_penalty = confident_wrong_penalty
        self.gaming_penalty = gaming_penalty

    def score(self, mode, action, context, result, agent_id=""):
        """Score one action and return an ``OracleResult``.

        Args:
            mode: ``"code"``, ``"retrieval_qa"`` or ``"debate"``.
            action: dict describing what the agent did (QA reads ``"abstained"``).
            context: dict describing the task (QA reads ``"gold_answer"`` and
                ``"is_unanswerable"``).
            result: dict with the measured outcome for the chosen mode.
            agent_id: forwarded to the mode scorer; not used in any formula.

        Returns:
            The mode-specific result, or an all-zero result tagged
            ``"unknown_mode"`` when ``mode`` is not recognised.
        """
        if mode == "code":
            return self._score_code(action, context, result, agent_id)
        elif mode == "retrieval_qa":
            return self._score_qa(action, context, result, agent_id)
        elif mode == "debate":
            return self._score_debate(action, context, result, agent_id)
        return OracleResult(0.0, 0.0, 0.0, {}, f"unknown mode {mode}", ["unknown_mode"])

    def _score_code(self, action, context, result, agent_id):
        """Score a code attempt from correctness, pass@k, regression and cost."""
        correctness = result.get("correctness", 0.0)
        pass_at_k = result.get("pass_at_k", 0.0)
        regression = result.get("regression", False)
        compute_cost = result.get("compute_cost", 0.0)
        # Both pass flags default to the correctness value when not reported.
        hidden_pass = result.get("hidden_tests_pass", correctness)
        public_pass = result.get("public_pass", correctness)
        tags = []
        # Passing the public tests while failing the hidden ones is treated as gaming.
        if public_pass and not hidden_pass:
            tags.append("gaming_hidden_tests")
        raw = (correctness * self.code_weights["correctness"] +
               pass_at_k * self.code_weights["pass_at_k"] +
               (self.code_weights["regression"] if regression else 0.0) -
               compute_cost * self.code_weights.get("compute_penalty", self.compute_penalty_rate))
        if "gaming_hidden_tests" in tags:
            raw -= self.gaming_penalty
        cost_adj = raw - compute_cost * self.compute_penalty_rate
        return OracleResult(raw, cost_adj, result.get("confidence", correctness),
                            {"correctness": correctness}, f"corr={correctness:.2f}, cost={compute_cost}", tags, cost_adj)

    def _score_qa(self, action, context, result, agent_id):
        """Score a retrieval-QA answer or abstention."""
        gold = context.get("gold_answer", "")
        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.5)
        compute_cost = result.get("compute_cost", 0.0)
        abstained = action.get("abstained", False)
        if abstained:
            # Abstaining earns the bonus only when the question is unanswerable.
            correct_abstention = context.get("is_unanswerable", False)
            raw = self.abstention_bonus if correct_abstention else -self.abstention_bonus
            cost_adj = raw - compute_cost * self.compute_penalty_rate
            tags = ["correct_abstention" if correct_abstention else "wrong_abstention"]
            # The reward is the cost-adjusted score, as on every other path.
            return OracleResult(raw, cost_adj, confidence, {}, f"abstain correct={correct_abstention}", tags, cost_adj)
        correctness = self._fuzzy_match(answer, gold)
        evidence = result.get("evidence", {})
        entailment = evidence.get("entailment_score", 0.0)
        contradiction = evidence.get("contradiction_score", 0.0)
        hallucination = contradiction > 0.5
        confident_wrong = confidence > 0.8 and correctness < 0.5
        tags = []
        if hallucination:
            tags.append("hallucination")
        if confident_wrong:
            tags.append("confident_wrong")
        if compute_cost > 2000:
            tags.append("excessive_compute")
        if compute_cost > 500 and correctness < 0.5:
            tags.append("compute_waste")
        raw = (correctness * self.qa_weights["correctness"] +
               entailment * self.qa_weights.get("evidence_support", 0.5) -
               (self.hallucination_penalty if hallucination else 0.0) -
               (self.confident_wrong_penalty if confident_wrong else 0.0) -
               compute_cost * self.compute_penalty_rate)
        # Calibration bonus: squared gap between confidence and correctness.
        brier = (confidence - correctness) ** 2
        raw += (1 - brier) * self.calibration_weight
        cost_adj = raw - compute_cost * self.compute_penalty_rate
        # Extra charge for spending noticeable compute on a low-scoring answer.
        if compute_cost > 100 and raw < 0.5:
            cost_adj -= self.gaming_penalty * 0.5
        return OracleResult(raw, cost_adj, confidence, evidence, f"corr={correctness:.2f}, conf={confidence:.2f}", tags, cost_adj)

    def _score_debate(self, action, context, result, agent_id):
        """Score a debate turn from decision quality, contribution and verbosity."""
        decision_quality = result.get("decision_quality", 0.0)
        marginal = result.get("marginal_contribution", 0.0)
        tokens = result.get("tokens", 0)
        # Compute cost defaults to the token count when not reported separately.
        compute_cost = result.get("compute_cost", tokens)
        spam = result.get("spam", False)
        collusion = result.get("collusion", False)
        tags = []
        if spam:
            tags.append("spam")
        if collusion:
            tags.append("collusion")
        if tokens > 5000:
            tags.append("verbose_waste")
        raw = (decision_quality * self.debate_weights["decision_quality"] +
               marginal * self.debate_weights["marginal_contribution"] +
               (1.0 / max(tokens, 1)) * self.debate_weights["influence_efficiency"] -
               compute_cost * self.compute_penalty_rate)
        if spam:
            raw -= self.gaming_penalty
        if collusion:
            raw -= self.gaming_penalty * 2
        cost_adj = raw - compute_cost * self.compute_penalty_rate
        return OracleResult(raw, cost_adj, result.get("confidence", 0.5),
                            {"marginal": marginal}, f"dq={decision_quality:.2f}, tokens={tokens}", tags, cost_adj)

    def _fuzzy_match(self, a, b):
        """Return 1.0 for an exact match, 0.5 for a substring match, else 0.0.

        Both values are converted to text, stripped and lower-cased first.
        ``None`` or blank input on either side scores 0.0.  The blank check
        runs *after* stripping because an empty string is a substring of
        every string and would otherwise earn partial credit.
        """
        if a is None or b is None:
            return 0.0
        a, b = str(a).strip().lower(), str(b).strip().lower()
        if not a or not b:
            return 0.0
        if a == b:
            return 1.0
        return 0.5 if (a in b or b in a) else 0.0
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
@dataclass
class LedgerEntry:
    """One row of the ``CreditLedger`` log (an earn, spend or decay event)."""

    agent_id: str
    task_id: str
    action_id: str
    # Exactly one of the next three is non-zero for the events the ledger writes.
    earned_credit: float
    spent_credit: float
    decayed_credit: float
    # Balance for (agent, capability) after this event was applied.
    remaining_credit: float
    reason: str
    oracle_score: float
    compute_cost: float
    # Wall-clock time of the event, as produced by time.time() in the ledger.
    timestamp: float
    capability_scope: str = "global"
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class CreditLedger:
    """Per-agent, per-capability credit balances with an append-only event log.

    ``balances`` maps ``agent_id -> {capability_scope -> balance}`` and
    ``entries`` records every earn, spend and decay as a ``LedgerEntry``.

    NOTE(review): decay is applied once per ``earn``/``spend``/``balance``
    call, multiplying a positive balance by ``1 - decay_lambda``; the
    timestamp is only recorded, not used to scale the decay.  Reading a
    balance therefore also reduces it.  Kept as-is because the benchmark
    loops rely on it; confirm this is intended.
    """

    def __init__(self, decay_lambda=0.05):
        """Create an empty ledger; ``decay_lambda`` is the per-call decay fraction."""
        self.entries = []
        self.balances = {}
        self.decay_lambda = decay_lambda

    def earn(self, agent_id, task_id, action_id, amount, oracle_score, compute_cost, reason, capability_scope="global"):
        """Decay the current balance, then credit ``amount`` and log the event."""
        now = time.time()
        self._apply_decay(agent_id, now, capability_scope)
        current = self._get(agent_id, capability_scope)
        new_bal = current + amount
        self.entries.append(LedgerEntry(agent_id, task_id, action_id, amount, 0.0, 0.0, new_bal, reason, oracle_score, compute_cost, now, capability_scope))
        self._set(agent_id, capability_scope, new_bal)

    def spend(self, agent_id, task_id, action_id, amount, capability_scope="global", reason="spend"):
        """Decay the balance, then debit ``amount`` if it is affordable.

        Returns:
            ``True`` when the debit was applied and logged, ``False`` when the
            (already decayed) balance is smaller than ``amount``.
        """
        now = time.time()
        self._apply_decay(agent_id, now, capability_scope)
        current = self._get(agent_id, capability_scope)
        if current < amount:
            return False
        new_bal = current - amount
        self.entries.append(LedgerEntry(agent_id, task_id, action_id, 0.0, amount, 0.0, new_bal, reason, 0.0, 0.0, now, capability_scope))
        self._set(agent_id, capability_scope, new_bal)
        return True

    def transfer(self, from_agent, to_agent, amount, capability_scope="global"):
        """Always refuse: credits are non-transferable between agents."""
        return False  # non-transferable

    def balance(self, agent_id, capability_scope="global"):
        """Apply one decay step and return the resulting balance (0.0 if unknown)."""
        now = time.time()
        self._apply_decay(agent_id, now, capability_scope)
        return self._get(agent_id, capability_scope)

    def _get(self, agent_id, cap):
        """Return the stored balance without side effects; 0.0 when absent."""
        return self.balances.get(agent_id, {}).get(cap, 0.0)

    def _set(self, agent_id, cap, val):
        """Store ``val`` as the balance for ``(agent_id, cap)``."""
        self.balances.setdefault(agent_id, {})[cap] = val

    def _apply_decay(self, agent_id, now, cap):
        """Shrink a positive balance by ``decay_lambda`` and log the loss.

        ``now`` is only used as the timestamp of the logged entry.
        """
        current = self._get(agent_id, cap)
        if current <= 0:
            return
        decayed = current * (1 - self.decay_lambda)
        # With decay_lambda == 0 nothing changes, so no entry is written.
        if decayed < current:
            self.entries.append(LedgerEntry(agent_id, "decay", "decay", 0.0, 0.0, current - decayed, decayed, "credit_decay", 0.0, 0.0, now, cap))
            self._set(agent_id, cap, decayed)

    def detect_collusion(self, window=10):
        """Report the agents appearing in the last ``window`` log entries.

        Returns:
            ``None`` when fewer than two distinct agents appear (or when
            ``window`` is not positive); otherwise a dict with the sorted
            agent ids and the number of entries inspected.
        """
        # entries[-0:] would be the whole log, so a non-positive window must
        # be handled explicitly as "nothing to inspect".
        if window <= 0:
            return None
        recent = self.entries[-window:]
        agents = {e.agent_id for e in recent}
        if len(agents) < 2:
            return None
        # Sorted so the report does not depend on set iteration order.
        return {"suspicious_agents": sorted(agents), "count": len(recent)}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
class Decision(Enum):
    """Possible outcomes of a ``ResourceBroker`` request."""

    ALLOW = "allow"
    DENY = "deny"
    REQUIRE_APPROVAL = "require_approval"
    DOWNGRADE = "downgrade"
    ESCALATE = "escalate"
    ASK_JUSTIFICATION = "ask_justification"
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
@dataclass
class ResourceDecision:
    """Verdict returned by ``ResourceBroker.request`` for one capability."""

    # What the broker decided.
    decision: Decision
    # Human-readable explanation of the verdict.
    reason: str
    # The capability that was requested.
    capability: str
    # Alternative capability offered with a DOWNGRADE verdict; otherwise None.
    downgrade_to: Optional[str] = None
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
class ResourceBroker:
    """Gate capability requests on an agent's credit balance and risk class."""

    # Risk class per capability; anything unlisted is treated as "medium".
    RESOURCE_RISK = {
        "model_call": "medium",
        "retrieval_call": "low",
        "verifier_call": "medium",
        "debate_turn": "low",
        "file_write": "high",
        "shell_execute": "high",
        "memory_write": "medium",
        "human_escalation": "high",
        "larger_model": "medium",
    }
    # Credit required per risk class before any urgency discount.
    DEFAULT_THRESHOLDS = {"low": 0.5, "medium": 2.0, "high": 5.0}

    def __init__(self, thresholds=None, urgency_boost=0.5):
        """Use ``thresholds`` when given and non-empty, else a copy of the defaults."""
        if thresholds:
            self.thresholds = thresholds
        else:
            self.thresholds = self.DEFAULT_THRESHOLDS.copy()
        self.urgency_boost = urgency_boost
        # Number of plain DENY verdicts issued so far, per agent.
        self.denial_history = {}

    def request(self, capability, agent_id, credit_balance, task_state=None, risk_score=0.0, gaming_flags=None):
        """Decide whether ``agent_id`` may use ``capability``.

        Checks run in this order: gaming flags (deny), high-risk capability
        with ``risk_score`` above 0.7 (approval), sufficient balance (allow),
        at least half the required balance (downgrade for medium risk,
        otherwise ask for justification), more than three earlier denials
        (escalate), and finally a recorded denial.

        Returns:
            A ``ResourceDecision`` describing the verdict.
        """
        state = task_state if task_state else {}
        flags = gaming_flags if gaming_flags else []
        tier = self.RESOURCE_RISK.get(capability, "medium")
        base = self.thresholds.get(tier, 2.0)
        # Urgency lowers the bar, but never below 0.1 credits.
        required = max(0.1, base - state.get("urgency", 0.0) * self.urgency_boost)

        fallback = None
        if flags:
            verdict, why = Decision.DENY, f"Gaming: {flags}"
        elif tier == "high" and risk_score > 0.7:
            verdict, why = Decision.REQUIRE_APPROVAL, f"High risk {risk_score:.2f}"
        elif credit_balance >= required:
            verdict, why = Decision.ALLOW, f"Balance {credit_balance:.2f} >= {required:.2f}"
        elif credit_balance >= required * 0.5:
            if tier == "medium":
                verdict, why = Decision.DOWNGRADE, f"Downgrading from {capability}"
                fallback = "retrieval_call"
            else:
                verdict, why = Decision.ASK_JUSTIFICATION, "Justification required"
        else:
            prior_denials = self.denial_history.get(agent_id, 0)
            if prior_denials > 3:
                verdict, why = Decision.ESCALATE, f"Denied {prior_denials} times"
            else:
                # Only a plain denial is counted towards later escalation.
                self.denial_history[agent_id] = prior_denials + 1
                verdict, why = Decision.DENY, f"Balance {credit_balance:.2f} < {required:.2f}"
        return ResourceDecision(verdict, why, capability, fallback)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
# --- BENCHMARK SIMULATIONS (INLINE) ---
|
| 243 |
+
|
| 244 |
+
@dataclass
class CodeProblem:
    """A simulated coding task described only by difficulty numbers."""

    task_id: str
    # Blends an agent's easy and hard pass rates in SimulatedCodeAgent.solve.
    difficulty: float
    # Scales how much the hidden-test pass chance drops below the public one.
    hidden_test_difficulty: float
    public_test_difficulty: float
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
class SimulatedCodeAgent:
    """Stochastic stand-in for a code-writing agent."""

    def __init__(self, agent_id, pass_rate_easy=0.9, pass_rate_hard=0.3, hidden_test_falloff=0.15, cost_per_attempt=200):
        """Record the agent's pass rates, hidden-test falloff and per-attempt cost."""
        self.agent_id = agent_id
        self.pass_rate_easy = pass_rate_easy
        self.pass_rate_hard = pass_rate_hard
        self.hidden_test_falloff = hidden_test_falloff
        self.cost_per_attempt = cost_per_attempt
        # Number of solve() calls made so far.
        self.attempts = 0

    def solve(self, problem):
        """Simulate one attempt and return its public/hidden outcome and cost.

        Draws exactly two numbers from the global ``random`` stream: first for
        the public tests, then for the hidden tests.
        """
        self.attempts += 1
        hardness = problem.difficulty
        # Interpolate between the easy and hard pass rates by difficulty.
        p_public = self.pass_rate_easy * (1 - hardness) + self.pass_rate_hard * hardness
        passed_public = random.random() < p_public
        # Hidden tests are harder: reduce the chance, but never below zero.
        p_hidden = max(0.0, p_public - self.hidden_test_falloff * problem.hidden_test_difficulty)
        passed_hidden = random.random() < p_hidden
        return {
            "public_pass": passed_public,
            "hidden_pass": passed_hidden,
            "compute_cost": self.cost_per_attempt,
        }
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def gen_code_problems(n, seed):
    """Build ``n`` random ``CodeProblem`` objects named ``task_0`` .. ``task_{n-1}``.

    Reseeds both the global ``random`` and ``numpy.random`` generators with
    ``seed`` before drawing, so the same arguments give the same problems.
    """
    random.seed(seed)
    np.random.seed(seed)
    problems = []
    for idx in range(n):
        # Three draws per problem, in field order.
        difficulty = random.random()
        hidden_difficulty = random.random()
        public_difficulty = random.random()
        problems.append(CodeProblem(f"task_{idx}", difficulty, hidden_difficulty, public_difficulty))
    return problems
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def run_code_baseline(problems, agent):
    """Run a single agent once on every problem and summarise the outcome.

    Args:
        problems: sequence of problems handed to ``agent.solve`` one by one.
        agent: object whose ``solve(problem)`` returns a dict with
            ``"public_pass"`` and ``"compute_cost"``.

    Returns:
        Dict with ``accuracy`` (share of problems passing the public tests),
        ``total_compute`` and ``mean_compute``.  An empty ``problems`` yields
        zeros instead of dividing by zero.
    """
    outcomes = [agent.solve(p) for p in problems]
    if not outcomes:
        return {"accuracy": 0.0, "total_compute": 0, "mean_compute": 0.0}
    total = sum(r["compute_cost"] for r in outcomes)
    passed = sum(1 for r in outcomes if r["public_pass"])
    return {"accuracy": passed / len(outcomes), "total_compute": total, "mean_compute": total / len(outcomes)}
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def run_code_occ(problems, agents, oracle, ledger, broker, max_attempts=3):
    """Solve each problem with a cost-ordered cascade of agents under OCC.

    Every agent first receives seed credit proportional to its mean pass
    rate.  For each problem the agents are tried cheapest-per-expected-pass
    first until the public tests pass, the hidden tests pass, or
    ``max_attempts`` agents have been used.  Each attempt is scored by
    ``oracle``; a positive raw score earns credit, anything else costs one
    credit.

    Args:
        problems: sequence of problems (each needs a ``task_id``).
        agents: agents exposing ``agent_id``, ``pass_rate_easy``,
            ``pass_rate_hard``, ``cost_per_attempt`` and ``solve``.
        oracle: scorer with a ``score(mode, action, context, result, agent_id=...)`` method.
        ledger: credit ledger with ``earn`` and ``spend``.
        broker: accepted for signature parity with the other runners; unused here.
        max_attempts: maximum number of agents tried per problem.

    Returns:
        Dict with ``accuracy``, ``total_compute``, ``mean_compute`` and
        ``mean_agents``; all zeros when ``problems`` is empty.
    """
    def _mean_pass_rate(agent):
        return (agent.pass_rate_easy + agent.pass_rate_hard) / 2

    for a in agents:
        ledger.earn(a.agent_id, "seed", "seed", _mean_pass_rate(a) * 20, 0.0, 0.0, "initial", "model_call")

    if not problems:
        return {"accuracy": 0.0, "total_compute": 0, "mean_compute": 0.0, "mean_agents": 0.0}

    # The ranking depends only on static agent attributes, so sort once.
    ranked = sorted(agents, key=lambda a: a.cost_per_attempt / max(0.1, _mean_pass_rate(a)))

    total = 0
    results = []
    for p in problems:
        solved = False
        cost = 0
        used = []
        for agent in ranked:
            if solved or len(used) >= max_attempts:
                break
            r = agent.solve(p)
            cost += r["compute_cost"]
            total += r["compute_cost"]
            used.append(agent.agent_id)
            solved = r["public_pass"]
            hidden = r["hidden_pass"]
            # The oracle sees the cumulative cost spent on this problem so far.
            ora = oracle.score("code", {"attempt": len(used)}, {},
                               {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
                                "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
                               agent_id=agent.agent_id)
            if ora.raw_score > 0:
                ledger.earn(agent.agent_id, p.task_id, "solve", ora.raw_score * 5, ora.raw_score, cost, "pass", "model_call")
            else:
                ledger.spend(agent.agent_id, p.task_id, "solve", 1.0, "model_call", "fail")
            if hidden:
                break
        results.append({"solved": solved, "cost": cost, "agents": used})

    acc = sum(1 for r in results if r["solved"]) / len(results)
    return {"accuracy": acc, "total_compute": total, "mean_compute": total / len(problems), "mean_agents": sum(len(r["agents"]) for r in results) / len(results)}
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def create_qa_dataset(seed=42, n=50):
    """Generate ``n`` synthetic QA items after reseeding the global ``random``.

    Each item is a dict with ``id``, ``question``, ``type``, ``answer``,
    ``gold_answer``, ``evidence`` and ``is_unanswerable``.  ``gold_answer``
    duplicates ``answer`` because ``ImpactOracle._score_qa`` looks the
    reference answer up under that key when the item is passed as context;
    without it every answered question is scored against an empty string.

    Args:
        seed: seed for the global ``random`` generator.
        n: number of questions to create.

    Returns:
        List of question dicts (empty when ``n`` is 0).
    """
    random.seed(seed)
    evidence_pool = ["alpha", "beta", "gamma", "delta"]
    questions = []
    for i in range(n):
        # Draw order (type, answer, evidence size, evidence) fixes the dataset for a seed.
        q_type = random.choice(["answerable", "unanswerable", "misleading", "incomplete", "conflicting"])
        answer = random.choice(["paris", "42", "yes", "no", "tokyo"])
        evidence = random.sample(evidence_pool, k=random.randint(1, 3))
        questions.append({
            "id": f"q_{i}",
            "question": f"Q{i}",
            "type": q_type,
            "answer": answer,
            "gold_answer": answer,
            "evidence": evidence,
            "is_unanswerable": q_type == "unanswerable",
        })
    return questions
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def run_qa_occ(dataset, agent_params, oracle, ledger, broker):
    """Run the simulated retrieval-QA agent over ``dataset`` under OCC.

    The agent ``"qa_agent"`` is seeded with 20 credits.  For every item the
    broker is asked for a ``retrieval_call``; a DENY skips the item, an ALLOW
    costs 200 tokens and any other verdict costs 100.  The agent answers
    correctly with probability ``agent_params["acc"]`` on items that are not
    of type ``"unanswerable"`` and abstains otherwise.  A positive oracle
    raw score counts as correct and earns credit; anything else costs 0.5.

    Args:
        dataset: list of item dicts (``id``, ``type``, ``answer``, ...).
        agent_params: dict with the answer probability under ``"acc"``.
        oracle: scorer with a ``score`` method.
        ledger: credit ledger with ``earn``, ``spend`` and ``balance``.
        broker: resource broker with ``request``.

    Returns:
        Dict with ``accuracy``, ``total_compute`` and ``mean_compute``,
        all relative to ``len(dataset)``; zeros for an empty dataset.
    """
    total_compute = 0
    correct = 0
    ledger.earn("qa_agent", "seed", "seed", 20, 0.0, 0.0, "initial", "retrieval_call")
    for item in dataset:
        balance = ledger.balance("qa_agent", "retrieval_call")
        dec = broker.request("retrieval_call", "qa_agent", balance, task_state={"urgency": 0.5})
        if dec.decision == Decision.DENY:
            continue
        tokens = 200 if dec.decision == Decision.ALLOW else 100
        total_compute += tokens
        should_answer = item["type"] != "unanswerable"
        ans = item["answer"] if (should_answer and random.random() < agent_params["acc"]) else None
        conf = 0.9 if ans else 0.3
        # The oracle reads the reference answer from context["gold_answer"];
        # supply it from "answer" when the item does not carry it, otherwise
        # every answer is compared against "" and scored as wrong.
        if "gold_answer" in item:
            context = item
        else:
            context = {**item, "gold_answer": item.get("answer", "")}
        ora = oracle.score("retrieval_qa", {"abstained": ans is None}, context,
                           {"answer": ans, "confidence": conf, "evidence": {}, "compute_cost": tokens}, "qa_agent")
        if ora.raw_score > 0:
            ledger.earn("qa_agent", item["id"], "ans", ora.raw_score * 3, ora.raw_score, tokens, "correct", "retrieval_call")
            correct += 1
        else:
            ledger.spend("qa_agent", item["id"], "ans", 0.5, "retrieval_call", "wrong")

    n_items = len(dataset)
    if n_items == 0:
        return {"accuracy": 0.0, "total_compute": 0, "mean_compute": 0.0}
    return {"accuracy": correct / n_items, "total_compute": total_compute, "mean_compute": total_compute / n_items}
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def run_debate_occ(n_debates, n_agents, agent_configs, oracle, ledger, broker, seed=42):
    """Simulate ``n_debates`` majority-vote debates under OCC.

    Each debate draws a hidden truth from ``"A"``/``"B"``/``"C"``.  Every
    configured agent asks the broker for a ``debate_turn``; denied agents sit
    out, the others pay one credit and vote.  Honest agents vote for the
    truth with probability ``acc`` and otherwise (like dishonest agents) vote
    at random.  When the majority matches the truth, each honest voter earns
    two credits.

    Args:
        n_debates: number of debates to run.
        n_agents: unused; the participants are defined by ``agent_configs``.
        agent_configs: list of dicts with ``id``, ``honest``, ``acc`` and
            optional ``tokens`` (default 200).
        oracle: unused by this runner.
        ledger: credit ledger with ``balance``, ``spend`` and ``earn``.
        broker: resource broker with ``request``.
        seed: seed for the global ``random`` generator.

    Returns:
        Dict with ``accuracy``, ``consensus_reached``, ``total_compute`` and
        ``mean_compute``, all relative to ``n_debates``; zeros when
        ``n_debates`` is not positive.
    """
    random.seed(seed)
    correct = 0
    total_compute = 0
    consensus = 0
    for _ in range(n_debates):
        truth = random.choice(["A", "B", "C"])
        agents = []
        for cfg in agent_configs:
            # Dishonest agents get a fresh random accuracy below 0.4 each debate.
            acc = cfg["acc"] if cfg["honest"] else random.random() * 0.4
            agents.append({"honest": cfg["honest"], "acc": acc, "id": cfg["id"], "tokens": cfg.get("tokens", 200)})
        votes = []
        for a in agents:
            balance = ledger.balance(a["id"], "debate_turn")
            dec = broker.request("debate_turn", a["id"], balance)
            if dec.decision == Decision.DENY:
                continue
            total_compute += a["tokens"]
            vote = truth if (a["honest"] and random.random() < a["acc"]) else random.choice(["A", "B", "C"])
            votes.append((a["id"], vote, a["honest"], a["acc"]))
            ledger.spend(a["id"], "debate", "turn", 1.0, "debate_turn", "participate")
        if not votes:
            continue
        ballots = [v for _, v, _, _ in votes]
        # Majority vote.  Candidates are visited in first-cast order, so ties
        # go to the option that was voted for first; iterating a set of
        # strings here would make tied outcomes depend on hash randomization.
        final = max(dict.fromkeys(ballots), key=ballots.count)
        if final == truth:
            correct += 1
            for vid, _, h, _ in votes:
                if h:
                    ledger.earn(vid, "debate", "consensus", 2.0, 1.0, 0, "consensus", "debate_turn")
        if len(set(ballots)) == 1:
            consensus += 1

    if n_debates <= 0:
        return {"accuracy": 0.0, "consensus_reached": 0.0, "total_compute": 0, "mean_compute": 0.0}
    n = n_debates
    return {"accuracy": correct / n, "consensus_reached": consensus / n, "total_compute": total_compute, "mean_compute": total_compute / n}
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
# --- ABLATIONS & ANTI-GAMING ---
|
| 378 |
+
|
| 379 |
+
# Ablation grid iterated by run_all().  Each row is a 7-tuple, unpacked there as:
#   (name, description, decay_lambda, gaming_penalty, compute_penalty_rate,
#    anti_gaming_enabled, broker_thresholds)
# - decay_lambda goes to CreditLedger, compute_penalty_rate to ImpactOracle.
# - gaming_penalty is replaced by 0.0 when anti_gaming_enabled is False.
# - An empty broker_thresholds dict makes run_all() pass thresholds=None,
#   i.e. ResourceBroker falls back to its default thresholds.
ABLATIONS = [
    ("default", "Full OCC", 0.02, 2.0, 0.0001, True, {}),
    ("no_decay", "No credit decay", 0.0, 2.0, 0.0001, True, {}),
    ("fast_decay", "Aggressive decay", 0.1, 2.0, 0.0001, True, {}),
    ("no_gaming_penalty", "No gaming penalties", 0.02, 0.0, 0.0001, True, {}),
    ("high_gaming_penalty", "Severe gaming penalties", 0.02, 5.0, 0.0001, True, {}),
    ("lenient_broker", "Lenient broker", 0.02, 2.0, 0.0001, True, {"low": 0.25, "medium": 1.0, "high": 2.5}),
    ("strict_broker", "Strict broker", 0.02, 2.0, 0.0001, True, {"low": 1.0, "medium": 4.0, "high": 10.0}),
    ("high_compute_cost", "High compute penalty", 0.02, 2.0, 0.001, True, {}),
    ("low_compute_cost", "Low compute penalty", 0.02, 2.0, 0.00001, True, {}),
    ("anti_gaming_off", "Anti-gaming disabled", 0.02, 2.0, 0.0001, False, {}),
]
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
def run_all(output_dir="/app/occ/reports"):
    """Run every ablation and anti-gaming check, save a JSON report, print a summary.

    For each row of ``ABLATIONS`` the code, QA and debate benchmarks are run
    with freshly configured oracle, ledger and broker objects.  Four
    anti-gaming probes follow (hidden-test gaming, credit transfer,
    over-abstention, compute spam).  The combined results are written to
    ``eval_runner_results.json`` inside ``output_dir``.

    Args:
        output_dir: directory for the JSON report; created if missing.
            Defaults to the path the runner has always used.

    Returns:
        The results dict with the keys ``"ablations"`` and ``"anti_gaming"``.
    """
    banner = "=" * 60
    print(banner)
    print("OCC UNIFIED EVALUATION RUNNER (SELF-CONTAINED)")
    print(banner)
    seed = 42
    n_problems = 100
    n_qa = 100
    n_debates = 50

    results = {"ablations": {}, "anti_gaming": {}}

    # Ablations
    for name, desc, decay, game_pen, comp_pen, anti_on, broker_thresh in ABLATIONS:
        print(f"\n--- ABLATION: {name} ---")
        oracle = ImpactOracle(compute_penalty_rate=comp_pen, gaming_penalty=game_pen if anti_on else 0.0)
        ledger = CreditLedger(decay_lambda=decay)
        broker = ResourceBroker(thresholds=broker_thresh if broker_thresh else None)

        problems = gen_code_problems(n_problems, seed)
        cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60)
        medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150)
        expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350)
        code_res = run_code_occ(problems, [cheap, medium, expensive], oracle, ledger, broker, max_attempts=3)
        print(f" Code: acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")

        # QA reuses the code benchmark's ledger and broker under its own agent id.
        qa_data = create_qa_dataset(seed=seed, n=n_qa)
        qa_res = run_qa_occ(qa_data, {"acc": 0.85}, oracle, ledger, broker)
        print(f" QA: acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")

        # The debate gets a fresh ledger and broker, seeded with 5 credits per agent.
        ledger2 = CreditLedger(decay_lambda=decay)
        broker2 = ResourceBroker(thresholds=broker_thresh if broker_thresh else None)
        for i in range(3):
            ledger2.earn(f"f{i}", "seed", "seed", 5, 0, 0, "initial", "debate_turn")
        debate_res = run_debate_occ(n_debates, 3,
                                    [{"id": f"f{i}", "honest": True, "acc": 0.9, "tokens": 200} for i in range(3)],
                                    oracle, ledger2, broker2, seed=seed)
        print(f" Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")

        results["ablations"][name] = {"description": desc, "code": code_res, "qa": qa_res, "debate": debate_res}

    # Anti-gaming
    print("\n--- ANTI-GAMING TESTS ---")

    # Hidden-test gaming
    oracle = ImpactOracle(gaming_penalty=2.0)
    normal_res = []
    gamer_res = []
    for _ in range(50):
        public_pass = random.random() < 0.9
        # The original expression had a dead "if True else ..." branch; only
        # the 0.5 hidden pass rate was ever used.
        hidden_pass = random.random() < 0.5
        ora_normal = oracle.score("code", {}, {}, {"correctness": 1.0 if public_pass else 0.0, "pass_at_k": 1.0 if hidden_pass else 0.0, "compute_cost": 150, "public_pass": public_pass, "hidden_tests_pass": hidden_pass})
        normal_res.append(ora_normal.raw_score)
        # Gamer: always passes public, fails hidden
        ora_gamer = oracle.score("code", {}, {}, {"correctness": 1.0, "pass_at_k": 0.0, "compute_cost": 100, "public_pass": True, "hidden_tests_pass": False})
        gamer_res.append(ora_gamer.raw_score)
    hidden_stats = {
        "normal_mean_raw": sum(normal_res) / len(normal_res),
        "gamer_mean_raw": sum(gamer_res) / len(gamer_res),
        "gamer_penalized_rate": sum(1 for r in gamer_res if r < 0) / len(gamer_res),
    }
    results["anti_gaming"]["hidden_test_gaming"] = hidden_stats
    print(f" Hidden-test gaming: normal={hidden_stats['normal_mean_raw']:.2f}, gamer={hidden_stats['gamer_mean_raw']:.2f}")

    # Collusion
    ledger = CreditLedger()
    ledger.earn("alice", "seed", "seed", 10, 0, 0, "initial")
    ledger.earn("bob", "seed", "seed", 1, 0, 0, "initial")
    ok = ledger.transfer("alice", "bob", 5.0)
    # CreditLedger.balance() applies decay on every call, so read each balance
    # once and reuse it; otherwise the printed values differ from the saved ones.
    alice_balance = ledger.balance("alice")
    bob_balance = ledger.balance("bob")
    results["anti_gaming"]["collusion"] = {
        "transfer_allowed": ok,
        "alice_balance": alice_balance,
        "bob_balance": bob_balance,
        "blocked": not ok,
    }
    print(f" Collusion: transfer_allowed={ok}, alice={alice_balance:.1f}, bob={bob_balance:.1f}")

    # Over-abstention
    oracle = ImpactOracle()
    abstention_rewards = []
    for _ in range(10):
        res = oracle.score("retrieval_qa", {"abstained": True}, {"is_unanswerable": False, "gold_answer": "yes"},
                           {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50})
        abstention_rewards.append(res.reward_value)
    abstention_stats = {
        "mean_reward": sum(abstention_rewards) / len(abstention_rewards),
        "negative": sum(abstention_rewards) < 0,
    }
    results["anti_gaming"]["abstention"] = abstention_stats
    print(f" Abstention: mean_reward={abstention_stats['mean_reward']:.2f}, negative={abstention_stats['negative']}")

    # Spam
    oracle = ImpactOracle()
    spam_res = oracle.score("retrieval_qa", {}, {"gold_answer": "paris"},
                            {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000})
    results["anti_gaming"]["spam"] = {
        "reward": spam_res.reward_value,
        "tags": spam_res.failure_tags,
    }
    print(f" Spam: reward={spam_res.reward_value:.2f}, tags={spam_res.failure_tags}")

    # Save
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    report_path = out / "eval_runner_results.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nSaved to {report_path}")

    # Summary table
    print("\n" + banner)
    print("ABLATION SUMMARY")
    print(banner)
    print(f"{'Name':<20} {'Code Acc':>10} {'Code Comp':>10} {'QA Acc':>10} {'QA Comp':>10} {'Deb Acc':>10} {'Deb Comp':>10}")
    for name, data in results["ablations"].items():
        print(f"{name:<20} {data['code']['accuracy']:>10.3f} {data['code']['total_compute']:>10.0f} "
              f"{data['qa']['accuracy']:>10.3f} {data['qa']['total_compute']:>10.0f} "
              f"{data['debate']['accuracy']:>10.3f} {data['debate']['total_compute']:>10.0f}")

    return results
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
# Script entry point: run the full evaluation only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    run_all()
|