---
# This file contains information about verified agent results for different benchmarks.
# Format:
# benchmark_name:
#   - agent_name: "Name of the agent"
#     verification_date: YYYY-MM-DD
# Verified agents for the "usaco" benchmark: one list entry per agent, with
# the date on which its result was verified (plain ISO date, YYYY-MM-DD).
usaco:
  - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-12
  - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-11
  - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-12
  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "USACO Reflexion + Episodic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "USACO Reflexion + Semantic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (2x) (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (3x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (2x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (5x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Warming (3 Steps) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "USACO Episodic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-24
  - agent_name: "USACO Semantic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-24
  - agent_name: "Zero-shot Retrial (2x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "Zero-shot Retrial (3x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "Zero-shot Retrial (5x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "USACO Zero-shot (gpt-4o-2024-05-13)"
    verification_date: 2024-08-24
# Verified agents for the "swebench_verified_mini" benchmark: one list entry
# per agent, with the date on which its result was verified.
swebench_verified_mini:
  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-17
  # This name contains ": ", so it must stay quoted to remain a single scalar.
  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
    verification_date: 2024-08-19
  - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-10-30
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
    verification_date: 2024-10-30
  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
    verification_date: 2024-10-30
  - agent_name: "Agentless (o1-mini-2024-09-12)"
    verification_date: 2024-10-30
# Verified agents for the "swebench_verified" benchmark: one list entry per
# agent, with the date on which its result was verified.
swebench_verified:
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
    verification_date: 2024-10-30
# Verified agents for the "mlagentbench" benchmark: one list entry per agent,
# with the date on which its result was verified.
mlagentbench:
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-19
# Verified agents for the "corebench_easy" benchmark: one list entry per
# agent, with the date on which its result was verified.
corebench_easy:
  - agent_name: "AutoGPT (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "AutoGPT (GPT-4o-mini)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o-mini)"
    verification_date: 2024-09-28
# Verified agents for the "corebench_medium" benchmark: one list entry per
# agent, with the date on which its result was verified.
corebench_medium:
  - agent_name: "AutoGPT (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "AutoGPT (GPT-4o-mini)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o-mini)"
    verification_date: 2024-09-28
# Verified agents for the "corebench_hard" benchmark: one list entry per
# agent, with the date on which its result was verified.
corebench_hard:
  - agent_name: "AutoGPT (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "AutoGPT (GPT-4o-mini)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o-mini)"
    verification_date: 2024-09-28