Spaces:
Running
Running
benediktstroebl
committed on
Commit
•
178673f
1
Parent(s):
bd0b3ec
minor tweaks
Browse files
- agent_monitor/monitor.py +5 -0
- app.py +3 -3
- utils/viz.py +2 -1
- verified_agents.yaml +16 -4
agent_monitor/monitor.py
CHANGED
@@ -23,6 +23,11 @@ async def analyze_agent_steps(processed_calls, llm_client, llm_eval=False):
|
|
23 |
task_calls[call['weave_task_id']].append(call)
|
24 |
|
25 |
for task_id in task_calls:
|
|
|
|
|
|
|
|
|
|
|
26 |
task_calls[task_id].sort(key=lambda x: x['created_timestamp'])
|
27 |
|
28 |
tasks = [analyze_task(calls, llm_client, llm_eval) for task_id, calls in task_calls.items()]
|
|
|
23 |
task_calls[call['weave_task_id']].append(call)
|
24 |
|
25 |
for task_id in task_calls:
|
26 |
+
|
27 |
+
# sort calls by timestamp and handle null timestamps
|
28 |
+
for call in task_calls[task_id]:
|
29 |
+
if call['created_timestamp'] is None:
|
30 |
+
call['created_timestamp'] = 0
|
31 |
task_calls[task_id].sort(key=lambda x: x['created_timestamp'])
|
32 |
|
33 |
tasks = [analyze_task(calls, llm_client, llm_eval) for task_id, calls in task_calls.items()]
|
app.py
CHANGED
@@ -16,7 +16,7 @@ import re
|
|
16 |
import markdown
|
17 |
import asyncio
|
18 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
19 |
-
import weave
|
20 |
from utils.db import TracePreprocessor
|
21 |
from gradio.themes.soft import Soft
|
22 |
|
@@ -771,7 +771,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
771 |
demo.load(
|
772 |
lambda: create_task_success_heatmap(
|
773 |
preprocessor.get_task_success_data('swebench_verified_mini'),
|
774 |
-
'SWE-bench Verified'
|
775 |
),
|
776 |
outputs=[task_success_heatmap]
|
777 |
)
|
@@ -1454,5 +1454,5 @@ async def main():
|
|
1454 |
await demo.launch(favicon_path="hal.png")
|
1455 |
|
1456 |
if __name__ == "__main__":
|
1457 |
-
weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
|
1458 |
asyncio.run(main())
|
|
|
16 |
import markdown
|
17 |
import asyncio
|
18 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
19 |
+
# import weave
|
20 |
from utils.db import TracePreprocessor
|
21 |
from gradio.themes.soft import Soft
|
22 |
|
|
|
771 |
demo.load(
|
772 |
lambda: create_task_success_heatmap(
|
773 |
preprocessor.get_task_success_data('swebench_verified_mini'),
|
774 |
+
'SWE-bench Verified (Mini)'
|
775 |
),
|
776 |
outputs=[task_success_heatmap]
|
777 |
)
|
|
|
1454 |
await demo.launch(favicon_path="hal.png")
|
1455 |
|
1456 |
if __name__ == "__main__":
|
1457 |
+
# weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
|
1458 |
asyncio.run(main())
|
utils/viz.py
CHANGED
@@ -41,7 +41,8 @@ def create_task_success_heatmap(df, benchmark_name):
|
|
41 |
tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
|
42 |
# Total number of tasks (columns)
|
43 |
total_tasks = len(pivot_df.columns)
|
44 |
-
|
|
|
45 |
total_tasks = 50 # TODO - remove hardcoding
|
46 |
|
47 |
# Add the new row to the pivot table
|
|
|
41 |
tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
|
42 |
# Total number of tasks (columns)
|
43 |
total_tasks = len(pivot_df.columns)
|
44 |
+
print(benchmark_name)
|
45 |
+
if benchmark_name == "SWE-bench Verified (Mini)":
|
46 |
total_tasks = 50 # TODO - remove hardcoding
|
47 |
|
48 |
# Add the new row to the pivot table
|
verified_agents.yaml
CHANGED
@@ -48,12 +48,24 @@ usaco:
|
|
48 |
- agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
|
49 |
verification_date: 2024-08-24
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
- agent_name: "Agentless (gpt-4o-mini-2024-07-18) (50 Instances)"
|
54 |
verification_date: 2024-08-17
|
55 |
-
- agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)
|
56 |
verification_date: 2024-08-19
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
mlagentbench:
|
59 |
- agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
|
|
|
48 |
- agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
|
49 |
verification_date: 2024-08-24
|
50 |
|
51 |
+
swebench_verified_mini:
|
52 |
+
- agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
|
|
|
53 |
verification_date: 2024-08-17
|
54 |
+
- agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
|
55 |
verification_date: 2024-08-19
|
56 |
+
- agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
|
57 |
+
verification_date: 2024-10-30
|
58 |
+
- agent_name: "Moatless (gpt-4o-2024-08-06)"
|
59 |
+
verification_date: 2024-10-30
|
60 |
+
- agent_name: "Moatless (claude-3-5-sonnet-20241022)"
|
61 |
+
verification_date: 2024-10-30
|
62 |
+
- agent_name: "Agentless (o1-mini-2024-09-12)"
|
63 |
+
verification_date: 2024-10-30
|
64 |
+
|
65 |
+
|
66 |
+
swebench_verified:
|
67 |
+
- agent_name: "Moatless (gpt-4o-2024-08-06)"
|
68 |
+
verification_date: 2024-10-30
|
69 |
|
70 |
mlagentbench:
|
71 |
- agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
|