benediktstroebl committed on
Commit
178673f
1 Parent(s): bd0b3ec

minor tweaks

Browse files
Files changed (4) hide show
  1. agent_monitor/monitor.py +5 -0
  2. app.py +3 -3
  3. utils/viz.py +2 -1
  4. verified_agents.yaml +16 -4
agent_monitor/monitor.py CHANGED
@@ -23,6 +23,11 @@ async def analyze_agent_steps(processed_calls, llm_client, llm_eval=False):
23
  task_calls[call['weave_task_id']].append(call)
24
 
25
  for task_id in task_calls:
 
 
 
 
 
26
  task_calls[task_id].sort(key=lambda x: x['created_timestamp'])
27
 
28
  tasks = [analyze_task(calls, llm_client, llm_eval) for task_id, calls in task_calls.items()]
 
23
  task_calls[call['weave_task_id']].append(call)
24
 
25
  for task_id in task_calls:
26
+
27
+ # sort calls by timestamp and handle null timestamps
28
+ for call in task_calls[task_id]:
29
+ if call['created_timestamp'] is None:
30
+ call['created_timestamp'] = 0
31
  task_calls[task_id].sort(key=lambda x: x['created_timestamp'])
32
 
33
  tasks = [analyze_task(calls, llm_client, llm_eval) for task_id, calls in task_calls.items()]
app.py CHANGED
@@ -16,7 +16,7 @@ import re
16
  import markdown
17
  import asyncio
18
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
19
- import weave
20
  from utils.db import TracePreprocessor
21
  from gradio.themes.soft import Soft
22
 
@@ -771,7 +771,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
771
  demo.load(
772
  lambda: create_task_success_heatmap(
773
  preprocessor.get_task_success_data('swebench_verified_mini'),
774
- 'SWE-bench Verified'
775
  ),
776
  outputs=[task_success_heatmap]
777
  )
@@ -1454,5 +1454,5 @@ async def main():
1454
  await demo.launch(favicon_path="hal.png")
1455
 
1456
  if __name__ == "__main__":
1457
- weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
1458
  asyncio.run(main())
 
16
  import markdown
17
  import asyncio
18
  from apscheduler.schedulers.asyncio import AsyncIOScheduler
19
+ # import weave
20
  from utils.db import TracePreprocessor
21
  from gradio.themes.soft import Soft
22
 
 
771
  demo.load(
772
  lambda: create_task_success_heatmap(
773
  preprocessor.get_task_success_data('swebench_verified_mini'),
774
+ 'SWE-bench Verified (Mini)'
775
  ),
776
  outputs=[task_success_heatmap]
777
  )
 
1454
  await demo.launch(favicon_path="hal.png")
1455
 
1456
  if __name__ == "__main__":
1457
+ # weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
1458
  asyncio.run(main())
utils/viz.py CHANGED
@@ -41,7 +41,8 @@ def create_task_success_heatmap(df, benchmark_name):
41
  tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
42
  # Total number of tasks (columns)
43
  total_tasks = len(pivot_df.columns)
44
- if 'SWE-bench' in benchmark_name:
 
45
  total_tasks = 50 # TODO - remove hardcoding
46
 
47
  # Add the new row to the pivot table
 
41
  tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
42
  # Total number of tasks (columns)
43
  total_tasks = len(pivot_df.columns)
44
+ print(benchmark_name)
45
+ if benchmark_name == "SWE-bench Verified (Mini)":
46
  total_tasks = 50 # TODO - remove hardcoding
47
 
48
  # Add the new row to the pivot table
verified_agents.yaml CHANGED
@@ -48,12 +48,24 @@ usaco:
48
  - agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
49
  verification_date: 2024-08-24
50
 
51
-
52
- swebench_verified:
53
- - agent_name: "Agentless (gpt-4o-mini-2024-07-18) (50 Instances)"
54
  verification_date: 2024-08-17
55
- - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1) (50 Instances)"
56
  verification_date: 2024-08-19
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  mlagentbench:
59
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
 
48
  - agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
49
  verification_date: 2024-08-24
50
 
51
+ swebench_verified_mini:
52
+ - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
 
53
  verification_date: 2024-08-17
54
+ - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
55
  verification_date: 2024-08-19
56
+ - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
57
+ verification_date: 2024-10-30
58
+ - agent_name: "Moatless (gpt-4o-2024-08-06)"
59
+ verification_date: 2024-10-30
60
+ - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
61
+ verification_date: 2024-10-30
62
+ - agent_name: "Agentless (o1-mini-2024-09-12)"
63
+ verification_date: 2024-10-30
64
+
65
+
66
+ swebench_verified:
67
+ - agent_name: "Moatless (gpt-4o-2024-08-06)"
68
+ verification_date: 2024-10-30
69
 
70
  mlagentbench:
71
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"