---
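# This file lists the agent results that have been verified for each benchmark.
# Format:
#   benchmark_name:
#     - agent_name: "Name of the agent"
#       verification_date: YYYY-MM-DD
# NOTE(review): verification_date is written unquoted, so YAML 1.1 loaders
# (e.g. PyYAML) parse it as a date rather than a string - confirm that is
# what the consumer expects before quoting or reformatting these values.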

# Verified agents for the usaco benchmark.
# Every agent_name is double-quoted, matching the format shown in the file
# header, so names with parentheses and '+' are always read as plain strings.
usaco:
  - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-12
  - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-11
  - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-12
  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "USACO Reflexion + Episodic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "USACO Reflexion + Semantic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (2x) (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (3x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (2x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (5x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Warming (3 Steps) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "USACO Episodic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-24
  - agent_name: "USACO Semantic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-24
  - agent_name: "Zero-shot Retrial (2x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "Zero-shot Retrial (3x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "Zero-shot Retrial (5x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "USACO Zero-shot (gpt-4o-2024-05-13)"
    verification_date: 2024-08-24

# Verified agents for the swebench_verified_mini benchmark.
swebench_verified_mini:
  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-17
  # The quotes are required here: the name contains ": ", which would
  # otherwise be parsed as a nested mapping.
  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
    verification_date: 2024-08-19
  - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-10-30
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
    verification_date: 2024-10-30
  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
    verification_date: 2024-10-30
  - agent_name: "Agentless (o1-mini-2024-09-12)"
    verification_date: 2024-10-30


# Verified agents for the swebench_verified benchmark.
# The same agent name also appears under swebench_verified_mini; the two
# entries are separate list items under different benchmark keys.
swebench_verified:
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
    verification_date: 2024-10-30

# Verified agents for the mlagentbench benchmark.
mlagentbench:
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-19


# Verified agents for the corebench_easy benchmark.
# Model names in these entries are written without a dated snapshot suffix
# (GPT-4o, GPT-4o-mini), unlike most other benchmarks in this file.
corebench_easy:
  - agent_name: "AutoGPT (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "AutoGPT (GPT-4o-mini)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o-mini)"
    verification_date: 2024-09-28

# Verified agents for the corebench_medium benchmark.
# Lists the same four agent names and date as corebench_easy and
# corebench_hard; each benchmark key keeps its own copy of the entries.
corebench_medium:
  - agent_name: "AutoGPT (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "AutoGPT (GPT-4o-mini)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o-mini)"
    verification_date: 2024-09-28

# Verified agents for the corebench_hard benchmark.
# Lists the same four agent names and date as corebench_easy and
# corebench_medium; each benchmark key keeps its own copy of the entries.
corebench_hard:
  - agent_name: "AutoGPT (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "AutoGPT (GPT-4o-mini)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o-mini)"
    verification_date: 2024-09-28