narcolepticchicken committed on
Commit
ad2b648
·
verified ·
1 Parent(s): b81e833

Upload benchmarks/benchmark_code.py

Browse files
Files changed (1) hide show
  1. benchmarks/benchmark_code.py +272 -313
benchmarks/benchmark_code.py CHANGED
@@ -1,402 +1,361 @@
1
  """
2
- Benchmark 1: Code Compute Allocation
3
-
4
- Compares:
5
- A. baseline fixed compute
6
- B. verifier-guided retries
7
- C. OCC credit/resource allocation
8
- D. OCC + oracle-aware allocation
9
-
10
- Uses HumanEval / EvalPlus-style evaluation with simulated agents.
11
  """
12
-
13
  import json
14
  import random
15
- import time
16
- from collections import defaultdict
17
  from pathlib import Path
18
- from typing import Any, Dict, List, Optional, Tuple
19
 
20
  import numpy as np
21
- from datasets import load_dataset
22
 
23
  import sys
24
  sys.path.insert(0, str(Path(__file__).parent.parent))
25
- from oracle.oracle import ImpactOracle, OracleResult
26
- from ledger.ledger import CreditLedger, LedgerEntry
27
  from broker.broker import ResourceBroker, Decision
28
- from rl.reward import RewardHook, OfflineComparator
 
 
 
 
 
 
 
29
 
30
 
31
class SimulatedCodeAgent:
    """Simulated code-generation agent with a fixed success probability.

    ``quality`` is the probability that a single attempt passes the public
    tests. No code is generated or executed; outcomes are drawn from the
    module-level ``random`` generator.

    Attributes:
        agent_id: Identifier passed through to the oracle/ledger/broker.
        quality: Per-attempt probability of passing the public tests.
        cost_per_attempt: Base token cost charged for every attempt.
        verbose_padding_prob: Probability that an attempt is "padded" and
            costs three times the base amount.
        gaming_mode: When true, only about half of the publicly passing
            attempts also pass the hidden tests.
        attempts_made: Number of ``generate`` calls so far.
        tokens_used: Total tokens charged so far.
    """

    def __init__(
        self,
        agent_id: str,
        quality: float = 0.3,
        cost_per_attempt: float = 100.0,
        verbose_padding_prob: float = 0.0,
        gaming_mode: bool = False,
    ):
        self.agent_id = agent_id
        self.quality = quality
        self.cost_per_attempt = cost_per_attempt
        self.verbose_padding_prob = verbose_padding_prob
        self.gaming_mode = gaming_mode
        self.attempts_made = 0
        self.tokens_used = 0

    def generate(self, problem: Dict, oracle: "ImpactOracle", context: Dict) -> Dict:
        """Simulate one code attempt and return a result dict for oracle scoring.

        Args:
            problem: Problem record; not inspected by the simulation.
            oracle: Accepted for interface compatibility; not used here.
            context: Accepted for interface compatibility; not used here.

        Returns:
            Dict with keys ``passed`` (public tests), ``hidden_passed``,
            ``compute_cost``, ``k`` (always 1), ``passes`` (1 if the hidden
            tests passed, else 0) and ``tokens_used``.
        """
        self.attempts_made += 1

        tokens = self.cost_per_attempt
        if random.random() < self.verbose_padding_prob:
            tokens *= 3.0  # verbose but low-value output costs more

        self.tokens_used += tokens

        # Simulate correctness on the public tests.
        passed = random.random() < self.quality
        hidden_passed = passed
        if self.gaming_mode:
            # Gaming: the attempt may pass public tests yet fail hidden ones.
            # The draw is made unconditionally so the number of random values
            # consumed per call does not depend on the outcome, and it is
            # AND-ed with ``passed`` so an attempt that failed the public
            # tests can never be reported as passing the hidden ones.
            # Overall hidden pass rate stays at quality * 0.5.
            survives_hidden = random.random() < 0.5
            hidden_passed = passed and survives_hidden

        return {
            "passed": passed,
            "hidden_passed": hidden_passed,
            "compute_cost": tokens,
            "k": 1,
            "passes": 1 if hidden_passed else 0,
            "tokens_used": tokens,
        }
77
 
78
 
79
class CodeBenchmark:
    """Run the code compute-allocation benchmark with multiple strategies.

    Three strategies are implemented: a fixed number of attempts per agent,
    verifier-guided retries, and OCC allocation gated by a credit ledger and
    a resource broker. Each strategy reseeds ``random`` and ``numpy.random``
    with ``seed`` before it starts.
    """

    def __init__(
        self,
        dataset_name: str = "openai/openai_humaneval",
        split: str = "test",
        max_problems: int = 50,
        seed: int = 42,
    ):
        self.dataset_name = dataset_name
        self.split = split
        self.max_problems = max_problems
        self.seed = seed
        self.problems: List[Dict] = []
        self.oracle = ImpactOracle(compute_budget=1e5)
        # NOTE(review): these instance-level ledger/broker objects are not used
        # by any strategy in this class (run_occ_allocation builds its own).
        self.ledger = CreditLedger(decay_lambda=0.05)
        self.broker = ResourceBroker()

    def load_data(self):
        """Load up to ``max_problems`` rows of the dataset into ``self.problems``."""
        ds = load_dataset(self.dataset_name, split=self.split)
        limit = min(self.max_problems, len(ds))
        self.problems = [
            {
                "task_id": row["task_id"],
                "prompt": row["prompt"],
                "canonical_solution": row.get("canonical_solution", ""),
                "entry_point": row["entry_point"],
                "test": row.get("test", ""),
            }
            for row in ds.select(range(limit))
        ]

    def _reseed(self) -> None:
        """Reset both RNGs so every strategy starts from the same seed."""
        random.seed(self.seed)
        np.random.seed(self.seed)

    def _score(self, agent, result: Dict, best_hidden: bool, action: Dict):
        """Score one attempt with the oracle, passing the best hidden result so far."""
        return self.oracle.score(
            mode="code",
            action=action,
            context={"previous_passed": best_hidden},
            result=result,
            agent_id=agent.agent_id,
        )

    @staticmethod
    def _estimated_value(agent, history: List[bool]) -> float:
        """Return observed success rate per unit cost (0.3 prior without history)."""
        rate = sum(history) / len(history) if history else 0.3
        return rate / max(1.0, agent.cost_per_attempt)

    @staticmethod
    def _small_call_denied(broker, ledger, agent, progress: float) -> bool:
        """Ask the broker for a small model call; return True when it is denied."""
        balance = ledger.balance(agent.agent_id, "general", "global")
        dec = broker.request(
            "model_call_small",
            agent.agent_id,
            balance,
            task_state={"progress": progress, "urgency": 0.5},
        )
        return dec.decision == Decision.DENY

    def run_baseline_fixed(
        self,
        agents: List["SimulatedCodeAgent"],
        fixed_attempts: int = 3,
    ) -> Dict:
        """Baseline: each agent gets a fixed number of attempts per problem."""
        self._reseed()

        results = []
        total_compute = 0.0

        for problem in self.problems:
            best_score = 0.0
            best_hidden = False
            attempts = 0

            for agent in agents:
                for _ in range(fixed_attempts):
                    result = agent.generate(problem, self.oracle, {})
                    oracle_res = self._score(agent, result, best_hidden, action={})
                    best_score = max(best_score, oracle_res.raw_score)
                    best_hidden = best_hidden or result["hidden_passed"]
                    attempts += 1
                    total_compute += result["compute_cost"]

            results.append({
                "task_id": problem["task_id"],
                "pass": best_hidden,
                "raw_score": best_score,
                "attempts": attempts,
            })

        return self._summarize(results, total_compute, "baseline_fixed")

    def run_verifier_retries(
        self,
        agents: List["SimulatedCodeAgent"],
        max_attempts: int = 5,
        verifier_budget: int = 2,
    ) -> Dict:
        """Verifier-guided: retry only while the verifier (public test) says fail.

        ``verifier_budget`` is accepted for interface compatibility but is not
        consulted; every attempt triggers one verifier call.
        """
        self._reseed()

        results = []
        total_compute = 0.0

        for problem in self.problems:
            best_score = 0.0
            best_hidden = False
            attempts = 0
            verifier_calls = 0

            for agent in agents:
                for _ in range(max_attempts):
                    result = agent.generate(problem, self.oracle, {})
                    attempts += 1
                    total_compute += result["compute_cost"]

                    # Verifier: check the public test result.
                    verifier_calls += 1
                    if not result["passed"]:
                        continue

                    # Hidden tests are only scored once the public test passed.
                    oracle_res = self._score(agent, result, best_hidden, action={})
                    best_score = max(best_score, oracle_res.raw_score)
                    best_hidden = best_hidden or result["hidden_passed"]
                    break  # stop retrying this agent

            results.append({
                "task_id": problem["task_id"],
                "pass": best_hidden,
                "raw_score": best_score,
                "attempts": attempts,
                "verifier_calls": verifier_calls,
            })

        return self._summarize(results, total_compute, "verifier_retries")

    def run_occ_allocation(
        self,
        agents: List["SimulatedCodeAgent"],
        max_attempts: int = 5,
        credit_threshold: float = 2.0,
    ) -> Dict:
        """OCC: allocate attempts based on agent credits and learned success rate.

        Key differences from the baseline:
        - per-agent hidden-test success is tracked across problems;
        - agents are tried in order of success rate per unit cost;
        - an agent stops as soon as it passes the hidden tests;
        - the broker may deny further attempts;
        - once any agent succeeds, the remaining agents are skipped.

        ``credit_threshold`` is accepted for interface compatibility but is
        not consulted.
        """
        self._reseed()

        results = []
        total_compute = 0.0
        ledger = CreditLedger(decay_lambda=0.05)
        broker = ResourceBroker()

        # Per-agent history of hidden-test outcomes across all problems.
        agent_success: Dict[str, List[bool]] = {a.agent_id: [] for a in agents}

        for problem in self.problems:
            best_score = 0.0
            best_hidden = False
            attempts = 0

            # Seed each agent with a small credit so it can make a trial attempt.
            for agent in agents:
                ledger.earn(
                    agent_id=agent.agent_id,
                    task_id=problem["task_id"],
                    action_id="seed",
                    amount=3.0,
                    oracle_score=0.0,
                    compute_cost=0.0,
                    reason="initial_trial_credit",
                )

            # Rank agents by estimated value = success_rate / cost.
            ranked_agents = sorted(
                agents,
                key=lambda a: self._estimated_value(a, agent_success.get(a.agent_id, [])),
                reverse=True,
            )

            # Try ranked agents in turn, escalating when one fails.
            for agent in ranked_agents:
                if self._small_call_denied(broker, ledger, agent, best_score):
                    continue

                for attempt_idx in range(max_attempts):
                    result = agent.generate(problem, self.oracle, {})
                    attempts += 1
                    total_compute += result["compute_cost"]

                    oracle_res = self._score(
                        agent,
                        result,
                        best_hidden,
                        action={"tokens_used": result["tokens_used"]},
                    )

                    # Credits are earned only for sufficiently high oracle scores.
                    if oracle_res.raw_score >= 0.5:
                        ledger.earn(
                            agent_id=agent.agent_id,
                            task_id=problem["task_id"],
                            action_id=f"attempt_{attempt_idx}",
                            amount=oracle_res.reward_value * 5.0,
                            oracle_score=oracle_res.raw_score,
                            compute_cost=result["compute_cost"],
                            reason=oracle_res.reason,
                        )

                    best_score = max(best_score, oracle_res.raw_score)
                    best_hidden = best_hidden or result["hidden_passed"]
                    agent_success[agent.agent_id].append(result["hidden_passed"])

                    # Stop if this attempt solved the problem.
                    if result["hidden_passed"]:
                        break

                    # After at least two attempts, give up on an agent whose
                    # last three recorded outcomes show a very low success rate.
                    history = agent_success[agent.agent_id]
                    if len(history) >= 3:
                        recent_rate = sum(history[-3:]) / 3.0
                        if recent_rate < 0.15 and attempt_idx >= 1:
                            break

                    # Ask the broker whether another attempt is allowed.
                    if self._small_call_denied(broker, ledger, agent, best_score):
                        break

                # Already solved: skip the remaining agents to save compute.
                if best_hidden:
                    break

            results.append({
                "task_id": problem["task_id"],
                "pass": best_hidden,
                "raw_score": best_score,
                "attempts": attempts,
            })

        return self._summarize(results, total_compute, "occ_allocation")

    def _summarize(self, results: List[Dict], total_compute: float, label: str) -> Dict:
        """Aggregate per-problem results into a summary dict.

        All ratios are reported as 0.0 when ``results`` is empty, so the
        summary never contains NaN (which is not valid JSON).
        """
        n = len(results)
        passes = sum(1 for r in results if r["pass"])
        total_attempts = sum(r["attempts"] for r in results)
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        mean_score = float(np.mean([r["raw_score"] for r in results])) if n else 0.0

        return {
            "label": label,
            "n_problems": n,
            "pass@1": passes / n if n else 0.0,
            "mean_raw_score": mean_score,
            "total_attempts": total_attempts,
            "mean_attempts_per_problem": total_attempts / n if n else 0.0,
            "total_compute": float(total_compute),
            "compute_per_problem": float(total_compute / n) if n else 0.0,
            "results": results,
        }

    def run_all(
        self,
        agents: Optional[List["SimulatedCodeAgent"]] = None,
    ) -> Dict[str, Dict]:
        """Run every strategy and return their summaries keyed by strategy name.

        Loads the dataset first when no problems are loaded yet. When
        ``agents`` is omitted, three default agents with different quality
        and cost are used.
        """
        if not self.problems:
            self.load_data()

        if agents is None:
            # Varied quality and cost to show compute allocation tradeoffs.
            agents = [
                SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80),
                SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60),
                SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120),
            ]

        return {
            "baseline_fixed": self.run_baseline_fixed(agents, fixed_attempts=3),
            "verifier_retries": self.run_verifier_retries(agents, max_attempts=5),
            "occ_allocation": self.run_occ_allocation(agents, max_attempts=5),
        }
372
 
 
 
 
 
 
 
 
 
 
373
 
374
  def main():
375
- bench = CodeBenchmark(max_problems=50, seed=42)
376
- bench.load_data()
377
  results = bench.run_all()
378
 
379
- print("=" * 60)
380
  print("CODE COMPUTE ALLOCATION BENCHMARK")
381
  print("=" * 60)
382
  for label, res in results.items():
383
  print(f"\n{label}")
384
- print(f" pass@1: {res['pass@1']:.3f}")
385
- print(f" mean attempts/problem: {res['mean_attempts_per_problem']:.2f}")
386
- print(f" total compute: {res['total_compute']:.0f}")
387
- print(f" compute/problem: {res['compute_per_problem']:.0f}")
388
-
389
- # Compute savings at iso-accuracy
390
- baseline_pass = results["baseline_fixed"]["pass@1"]
391
- baseline_compute = results["baseline_fixed"]["total_compute"]
392
-
393
- for label in ["verifier_retries", "occ_allocation"]:
394
- r = results[label]
395
- if r["pass@1"] >= baseline_pass:
396
- savings = 1.0 - (r["total_compute"] / baseline_compute)
397
- print(f"\n {label}: {savings*100:.1f}% compute saved at >= baseline pass@1")
398
- else:
399
- print(f"\n {label}: accuracy below baseline ({r['pass@1']:.3f} < {baseline_pass:.3f})")
400
 
401
  Path("/app/occ/reports").mkdir(parents=True, exist_ok=True)
402
  with open("/app/occ/reports/benchmark_code_results.json", "w") as f:
 
1
  """
2
+ Benchmark 1: Code Compute Allocation (simulated)
3
+ Compares fixed compute, GRPO, verifier-guided, and OCC allocation.
 
 
 
 
 
 
 
4
  """
 
5
  import json
6
  import random
7
+ from dataclasses import dataclass
 
8
  from pathlib import Path
9
+ from typing import Dict, List, Optional
10
 
11
  import numpy as np
 
12
 
13
  import sys
14
  sys.path.insert(0, str(Path(__file__).parent.parent))
15
+ from oracle.oracle import ImpactOracle
16
+ from ledger.ledger import CreditLedger
17
  from broker.broker import ResourceBroker, Decision
18
+
19
+
20
@dataclass
class CodeProblem:
    """One synthetic coding task used by the simulated benchmark."""

    # Identifier of the task.
    task_id: str
    # Overall difficulty of the task: 0 = easy, 1 = hard.
    difficulty: float
    # Difficulty of the hidden test suite.
    hidden_test_difficulty: float
    # Difficulty of the public test suite.
    public_test_difficulty: float
26
 
27
 
28
class SimulatedCodeAgent:
    """Simulated code generation agent with quality/cost tradeoffs.

    No code is generated or executed: pass/fail outcomes are drawn from the
    module-level ``random`` generator using accuracies interpolated from the
    problem's difficulty.

    Attributes:
        agent_id: Identifier used for ledger and broker calls.
        pass_rate_easy: Public-test pass probability at difficulty 0.
        pass_rate_hard: Public-test pass probability at difficulty 1.
        hidden_test_falloff: Accuracy lost on hidden tests, scaled by the
            problem's ``hidden_test_difficulty``.
        cost_per_attempt: Compute charged for every executed attempt.
        cost_per_verifier: Extra compute charged when the verifier runs.
        attempts: Number of attempts actually executed.
        verifier_calls: Number of verifier runs.
        tokens_used: Total compute charged so far.
    """

    def __init__(
        self,
        agent_id: str,
        pass_rate_easy: float = 0.9,
        pass_rate_hard: float = 0.3,
        hidden_test_falloff: float = 0.15,
        cost_per_attempt: float = 200.0,
        cost_per_verifier: float = 50.0,
    ):
        self.agent_id = agent_id
        self.pass_rate_easy = pass_rate_easy
        self.pass_rate_hard = pass_rate_hard
        self.hidden_test_falloff = hidden_test_falloff
        self.cost_per_attempt = cost_per_attempt
        self.cost_per_verifier = cost_per_verifier
        self.attempts = 0
        self.verifier_calls = 0
        self.tokens_used = 0

    def solve(
        self,
        problem: "CodeProblem",
        use_verifier: bool = False,
        use_occ: bool = False,
        broker: Optional["ResourceBroker"] = None,
        ledger: Optional["CreditLedger"] = None,
    ) -> Dict:
        """Simulate one attempt at ``problem``.

        Args:
            problem: Task to attempt; ``difficulty`` and
                ``hidden_test_difficulty`` are read.
            use_verifier: When true, a verifier run is charged for every
                attempt that passes the public tests.
            use_occ: When true and both ``broker`` and ``ledger`` are given,
                the broker is asked for permission before the attempt runs.
            broker: Resource broker consulted in OCC mode.
            ledger: Credit ledger whose balance is shown to the broker.

        Returns:
            Dict with ``public_pass``, ``hidden_pass``, ``compute_cost``,
            ``tokens`` and ``blocked``. A denied attempt is reported with
            ``blocked=True``, both pass flags false and zero cost.
        """
        # Ask for permission *before* doing any work: a denied call must not
        # consume compute, random draws, or count as an attempt.
        if use_occ and broker is not None and ledger is not None:
            balance = ledger.balance(self.agent_id, "model_call", "global")
            dec = broker.request("model_call", self.agent_id, balance)
            if dec.decision == Decision.DENY:
                return {
                    "public_pass": False,
                    "hidden_pass": False,
                    "compute_cost": 0.0,
                    "tokens": 0.0,
                    "blocked": True,
                }

        self.attempts += 1
        self.tokens_used += self.cost_per_attempt
        compute_cost = self.cost_per_attempt

        # Base accuracy interpolates between the easy and hard pass rates.
        base_acc = (
            self.pass_rate_easy * (1 - problem.difficulty)
            + self.pass_rate_hard * problem.difficulty
        )
        # Hidden tests are harder than the public ones.
        hidden_acc = max(
            0.0,
            base_acc - self.hidden_test_falloff * problem.hidden_test_difficulty,
        )

        # One shared draw decides both outcomes, so a solution can only pass
        # the hidden tests if it also passed the public ones.
        roll = random.random()
        public_pass = roll < base_acc
        hidden_pass = public_pass and roll < hidden_acc

        if use_verifier and public_pass:
            self.verifier_calls += 1
            self.tokens_used += self.cost_per_verifier
            compute_cost += self.cost_per_verifier

        return {
            "public_pass": public_pass,
            "hidden_pass": hidden_pass,
            "compute_cost": compute_cost,
            # Tokens charged for this call: attempt plus verifier, if it ran.
            "tokens": compute_cost,
            "blocked": False,
        }
94
 
95
 
96
class CodeBenchmark:
    """Benchmark code compute allocation strategies on synthetic problems.

    Problems are generated once in ``__init__`` from the seeded module-level
    ``random`` generator; the strategies then draw from the same generator.
    """

    def __init__(self, n_problems: int = 50, seed: int = 42):
        self.n_problems = n_problems
        self.seed = seed
        random.seed(seed)
        np.random.seed(seed)
        self.oracle = ImpactOracle(
            code_weights={
                "correctness": 1.0,
                "pass_at_k": 0.3,
                "regression": -0.5,
                "compute_penalty": 0.001,
            }
        )
        self.problems = self._generate_problems()

    def _generate_problems(self) -> List["CodeProblem"]:
        """Create ``n_problems`` tasks with uniformly random difficulties."""
        return [
            CodeProblem(
                task_id=f"task_{i}",
                difficulty=random.random(),
                hidden_test_difficulty=random.random(),
                public_test_difficulty=random.random(),
            )
            for i in range(self.n_problems)
        ]

    @staticmethod
    def _rate(results: List[Dict], key: str) -> float:
        """Return the fraction of ``results`` whose ``key`` is truthy (0.0 if empty)."""
        if not results:
            return 0.0
        return sum(1 for r in results if r[key]) / len(results)

    @staticmethod
    def _mean(total: float, count: int) -> float:
        """Return ``total / count``, or 0.0 when ``count`` is zero."""
        return total / count if count else 0.0

    @staticmethod
    def _expected_quality(agent: "SimulatedCodeAgent") -> float:
        """Return the midpoint of the agent's easy and hard pass rates."""
        return (agent.pass_rate_easy + agent.pass_rate_hard) / 2

    def run_fixed_budget(self, agent: "SimulatedCodeAgent", max_attempts: int = 1) -> Dict:
        """Baseline: spend exactly ``max_attempts`` attempts on every problem.

        There is no early stopping; a problem counts as passed when any of
        its attempts passed. ``n_attempts`` and ``verifier_calls`` cover this
        run only, even if ``agent`` was used before.
        """
        attempts_before = agent.attempts
        verifier_before = agent.verifier_calls

        results = []
        total_compute = 0

        for problem in self.problems:
            public_pass = False
            hidden_pass = False
            cost = 0

            for _ in range(max_attempts):
                r = agent.solve(problem, use_verifier=False)
                cost += r["compute_cost"]
                public_pass = public_pass or r["public_pass"]
                hidden_pass = hidden_pass or r["hidden_pass"]

            total_compute += cost
            results.append({
                "task_id": problem.task_id,
                "public_pass": public_pass,
                "hidden_pass": hidden_pass,
                "compute_cost": cost,
            })

        return {
            "strategy": "fixed_budget",
            "pass_at_1": self._rate(results, "public_pass"),
            "hidden_pass": self._rate(results, "hidden_pass"),
            "total_compute": total_compute,
            "mean_compute": self._mean(total_compute, len(results)),
            "n_attempts": agent.attempts - attempts_before,
            "verifier_calls": agent.verifier_calls - verifier_before,
        }

    def run_verifier_guided(self, agent: "SimulatedCodeAgent", max_attempts: int = 3) -> Dict:
        """Verifier-guided: retry on public test failure, up to ``max_attempts``.

        The hidden-test result of the last attempt is the one recorded. It is
        reported under both ``hidden_pass`` (the key the other strategies use)
        and ``pass_at_k`` (kept for existing consumers).
        """
        attempts_before = agent.attempts
        verifier_before = agent.verifier_calls

        results = []
        total_compute = 0

        for problem in self.problems:
            passed = False
            hidden_passed = False
            attempts = 0
            cost = 0

            while attempts < max_attempts and not passed:
                attempts += 1
                r = agent.solve(problem, use_verifier=True)
                cost += r["compute_cost"]
                passed = r["public_pass"]
                hidden_passed = r["hidden_pass"]

            total_compute += cost
            results.append({
                "task_id": problem.task_id,
                "public_pass": passed,
                "hidden_pass": hidden_passed,
                "attempts": attempts,
                "compute_cost": cost,
            })

        hidden_rate = self._rate(results, "hidden_pass")
        return {
            "strategy": "verifier_guided",
            "pass_at_1": self._rate(results, "public_pass"),
            "hidden_pass": hidden_rate,
            "pass_at_k": hidden_rate,
            "total_compute": total_compute,
            "mean_compute": self._mean(total_compute, len(results)),
            "mean_attempts": self._mean(sum(r["attempts"] for r in results), len(results)),
            "n_attempts": agent.attempts - attempts_before,
            "verifier_calls": agent.verifier_calls - verifier_before,
        }

    def run_occ_allocation(self, agents: List["SimulatedCodeAgent"], max_attempts: int = 3) -> Dict:
        """OCC: try the most cost-effective agent first, escalate on failure.

        At most ``max_attempts`` agents actually run per problem. An agent the
        broker blocks is skipped: it is not scored, not debited, and not
        listed in ``agents_used``. ``n_attempts`` and ``verifier_calls`` cover
        this run only.
        """
        ledger = CreditLedger(decay_lambda=0.002)
        broker = ResourceBroker()

        attempts_before = sum(a.attempts for a in agents)
        verifier_before = sum(a.verifier_calls for a in agents)

        # Seed agents with credits proportional to their expected quality.
        for agent in agents:
            ledger.earn(
                agent_id=agent.agent_id,
                task_id="seed",
                action_id="seed",
                amount=self._expected_quality(agent) * 20,
                oracle_score=0.0,
                compute_cost=0.0,
                reason="initial_quality_estimate",
                capability_scope="model_call",
            )

        # Lowest cost per unit of expected quality first. The ranking depends
        # only on fixed agent attributes, so it is computed once.
        ranked = sorted(
            agents,
            key=lambda a: a.cost_per_attempt / max(0.1, self._expected_quality(a)),
        )

        results = []
        total_compute = 0

        for problem in self.problems:
            solved = False
            hidden_passed = False
            cost = 0
            used_agents = []

            for agent in ranked:
                if solved or len(used_agents) >= max_attempts:
                    break

                r = agent.solve(problem, use_occ=True, broker=broker, ledger=ledger)
                cost += r["compute_cost"]

                # A blocked agent never ran, so there is no outcome to score
                # and it must not be charged for a "failed" solve.
                if r["blocked"]:
                    continue

                used_agents.append(agent.agent_id)
                solved = r["public_pass"]
                hidden_passed = r["hidden_pass"]

                # Credit update.
                # NOTE(review): compute_cost below is the cumulative cost for
                # this problem, including earlier agents' attempts - confirm
                # that is what the oracle and ledger expect.
                oracle_res = self.oracle.score(
                    mode="code",
                    action={"attempt": len(used_agents)},
                    context={"difficulty": problem.difficulty},
                    result={
                        "correctness": 1.0 if solved else 0.0,
                        "pass_at_k": 1.0 if hidden_passed else 0.0,
                        "compute_cost": cost,
                        "public_pass": solved,
                        "hidden_tests_pass": hidden_passed,
                    },
                    agent_id=agent.agent_id,
                )

                if oracle_res.raw_score > 0:
                    ledger.earn(
                        agent_id=agent.agent_id,
                        task_id=problem.task_id,
                        action_id="solve",
                        amount=oracle_res.raw_score * 5,
                        oracle_score=oracle_res.raw_score,
                        compute_cost=cost,
                        reason="successful_solve",
                        capability_scope="model_call",
                    )
                else:
                    ledger.spend(
                        agent_id=agent.agent_id,
                        task_id=problem.task_id,
                        action_id="solve",
                        amount=1.0,
                        capability_scope="model_call",
                        reason="failed_solve",
                    )

                # Stop immediately if hidden tests pass (can't improve further).
                if hidden_passed:
                    break

            total_compute += cost
            results.append({
                "task_id": problem.task_id,
                "public_pass": solved,
                "hidden_pass": hidden_passed,
                "compute_cost": cost,
                "agents_used": used_agents,
            })

        return {
            "strategy": "occ_allocation",
            "pass_at_1": self._rate(results, "public_pass"),
            "hidden_pass": self._rate(results, "hidden_pass"),
            "total_compute": total_compute,
            "mean_compute": self._mean(total_compute, len(results)),
            "mean_agents": self._mean(sum(len(r["agents_used"]) for r in results), len(results)),
            "n_attempts": sum(a.attempts for a in agents) - attempts_before,
            "verifier_calls": sum(a.verifier_calls for a in agents) - verifier_before,
        }

    def run_all(self) -> Dict[str, Dict]:
        """Run all strategies and compare.

        Key design: the baseline uses the expensive agent (simulating
        always-GPT-4), while OCC tries cheap first and escalates only on
        failure. The OCC entry additionally receives ``compute_savings`` and
        ``accuracy_delta`` relative to the baseline when the baseline spent
        any compute.
        """
        cheap_agent = SimulatedCodeAgent(
            "cheap", pass_rate_easy=0.65, pass_rate_hard=0.15,
            cost_per_attempt=60, hidden_test_falloff=0.20,
        )
        medium_agent = SimulatedCodeAgent(
            "medium", pass_rate_easy=0.85, pass_rate_hard=0.35,
            cost_per_attempt=150, hidden_test_falloff=0.15,
        )
        expensive_agent = SimulatedCodeAgent(
            "expensive", pass_rate_easy=0.95, pass_rate_hard=0.65,
            cost_per_attempt=350, hidden_test_falloff=0.10,
        )

        # Baseline: always use the best (expensive) agent - simulates always-GPT-4.
        baseline = self.run_fixed_budget(expensive_agent, max_attempts=1)

        # Verifier-guided: expensive agent with retries.
        verifier = self.run_verifier_guided(
            SimulatedCodeAgent(
                "verifier", pass_rate_easy=0.95, pass_rate_hard=0.65,
                cost_per_attempt=350, hidden_test_falloff=0.10,
            ),
            max_attempts=3,
        )

        # OCC: tiered escalation cheap -> medium -> expensive.
        occ = self.run_occ_allocation([cheap_agent, medium_agent, expensive_agent], max_attempts=3)

        results = {
            "baseline_fixed": baseline,
            "verifier_guided": verifier,
            "occ_allocation": occ,
        }

        # Compute savings relative to the baseline.
        baseline_compute = baseline["total_compute"]
        if baseline_compute > 0:
            occ_compute = occ["total_compute"]
            occ["compute_savings"] = 1.0 - (occ_compute / baseline_compute)
            occ["accuracy_delta"] = occ["pass_at_1"] - baseline["pass_at_1"]

        return results
340
+
341
 
342
  def main():
343
+ bench = CodeBenchmark(n_problems=50, seed=42)
 
344
  results = bench.run_all()
345
 
346
+ print("\n" + "=" * 60)
347
  print("CODE COMPUTE ALLOCATION BENCHMARK")
348
  print("=" * 60)
349
  for label, res in results.items():
350
  print(f"\n{label}")
351
+ print(f" pass@1: {res.get('pass_at_1', 0):.3f}")
352
+ print(f" hidden_pass: {res.get('hidden_pass', 0):.3f}")
353
+ print(f" total_compute: {res['total_compute']:.0f}")
354
+ print(f" mean_compute: {res['mean_compute']:.0f}")
355
+ if "compute_savings" in res:
356
+ print(f" compute_savings: {res['compute_savings']:.1%}")
357
+ if "accuracy_delta" in res:
358
+ print(f" accuracy_delta: {res['accuracy_delta']:+.3f}")
 
 
 
 
 
 
 
 
359
 
360
  Path("/app/occ/reports").mkdir(parents=True, exist_ok=True)
361
  with open("/app/occ/reports/benchmark_code_results.json", "w") as f: