Spaces:

agent-evals
/

leaderboard

Running

benediktstroebl committed on 25 days ago

Commit

178673f

•

1 Parent(s): bd0b3ec

minor tweaks

Files changed (4) hide show

agent_monitor/monitor.py CHANGED Viewed

@@ -23,6 +23,11 @@ async def analyze_agent_steps(processed_calls, llm_client, llm_eval=False):
         task_calls[call['weave_task_id']].append(call)
     for task_id in task_calls:
         task_calls[task_id].sort(key=lambda x: x['created_timestamp'])
     tasks = [analyze_task(calls, llm_client, llm_eval) for task_id, calls in task_calls.items()]

         task_calls[call['weave_task_id']].append(call)
     for task_id in task_calls:
+        # sort calls by timestamp and handle null timestamps
+        for call in task_calls[task_id]:
+            if call['created_timestamp'] is None:
+                call['created_timestamp'] = 0
         task_calls[task_id].sort(key=lambda x: x['created_timestamp'])
     tasks = [analyze_task(calls, llm_client, llm_eval) for task_id, calls in task_calls.items()]

app.py CHANGED Viewed

@@ -16,7 +16,7 @@ import re
 import markdown
 import asyncio
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
-import weave
 from utils.db import TracePreprocessor
 from gradio.themes.soft import Soft
@@ -771,7 +771,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
             demo.load(
             lambda: create_task_success_heatmap(
                 preprocessor.get_task_success_data('swebench_verified_mini'),
-                'SWE-bench Verified'
             ),
             outputs=[task_success_heatmap]
             )
@@ -1454,5 +1454,5 @@ async def main():
     await demo.launch(favicon_path="hal.png")
 if __name__ == "__main__":
-    weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
     asyncio.run(main())

 import markdown
 import asyncio
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
+# import weave
 from utils.db import TracePreprocessor
 from gradio.themes.soft import Soft
             demo.load(
             lambda: create_task_success_heatmap(
                 preprocessor.get_task_success_data('swebench_verified_mini'),
+                'SWE-bench Verified (Mini)'
             ),
             outputs=[task_success_heatmap]
             )
     await demo.launch(favicon_path="hal.png")
 if __name__ == "__main__":
+    # weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
     asyncio.run(main())

utils/viz.py CHANGED Viewed

@@ -41,7 +41,8 @@ def create_task_success_heatmap(df, benchmark_name):
     tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
     # Total number of tasks (columns)
     total_tasks = len(pivot_df.columns)
-    if 'SWE-bench' in benchmark_name:
         total_tasks = 50 # TODO - remove hardcoding
     # Add the new row to the pivot table

     tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
     # Total number of tasks (columns)
     total_tasks = len(pivot_df.columns)
+    print(benchmark_name)
+    if benchmark_name == "SWE-bench Verified (Mini)":
         total_tasks = 50 # TODO - remove hardcoding
     # Add the new row to the pivot table

verified_agents.yaml CHANGED Viewed

@@ -48,12 +48,24 @@ usaco:
   - agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
     verification_date: 2024-08-24
-swebench_verified:
-  - agent_name: "Agentless (gpt-4o-mini-2024-07-18) (50 Instances)"
     verification_date: 2024-08-17
-  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1) (50 Instances)"
     verification_date: 2024-08-19
 mlagentbench:
   - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"

   - agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
     verification_date: 2024-08-24
+swebench_verified_mini:
+  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
     verification_date: 2024-08-17
+  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
     verification_date: 2024-08-19
+  - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
+    verification_date: 2024-10-30
+  - agent_name: "Moatless (gpt-4o-2024-08-06)"
+    verification_date: 2024-10-30
+  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
+    verification_date: 2024-10-30
+  - agent_name: "Agentless (o1-mini-2024-09-12)"
+    verification_date: 2024-10-30
+swebench_verified:
+  - agent_name: "Moatless (gpt-4o-2024-08-06)"
+    verification_date: 2024-10-30
 mlagentbench:
   - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"