meghsn committed on
Commit
2705446
·
1 Parent(s): ea81237

Removed old results files

Browse files
app.py CHANGED
@@ -130,7 +130,7 @@ def check_sanity(agent):
130
  return True
131
 
132
  def main():
133
- st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")
134
 
135
  all_agents = os.listdir("results")
136
  all_results = {}
 
130
  return True
131
 
132
  def main():
133
+ st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide")
134
 
135
  all_agents = os.listdir("results")
136
  all_results = {}
results/Bgym-GPT-3.5/results.json DELETED
@@ -1,53 +0,0 @@
1
- [
2
- {
3
- "benchmark": "WorkArena-L1",
4
- "score": 6.1,
5
- "std_err": 0.3,
6
- "benchmark_specific": "No",
7
- "benchmark_tuned": "No",
8
- "followed_evaluation_protocol": "Yes",
9
- "reproducible": "Yes",
10
- "reproduced": [["aug 2025", 0.65, 0.05, "study_id"]],
11
- "comments": "NA"
12
- },
13
- {
14
- "benchmark": "WorkArena++-L2",
15
- "score": 0.0,
16
- "std_err": 0.0,
17
- "benchmark_specific": "No",
18
- "benchmark_tuned": "No",
19
- "followed_evaluation_protocol": "Yes",
20
- "reproducible": "Yes",
21
- "comments": "NA"
22
- },
23
- {
24
- "benchmark": "WorkArena++-L3",
25
- "score": 0.0,
26
- "std_err": 0.0,
27
- "benchmark_specific": "No",
28
- "benchmark_tuned": "No",
29
- "followed_evaluation_protocol": "Yes",
30
- "reproducible": "Yes",
31
- "comments": "NA"
32
- },
33
- {
34
- "benchmark": "MiniWoB",
35
- "score": 43.4,
36
- "std_err": 0.1,
37
- "benchmark_specific": "No",
38
- "benchmark_tuned": "No",
39
- "followed_evaluation_protocol": "Yes",
40
- "reproducible": "Yes",
41
- "comments": "NA"
42
- },
43
- {
44
- "benchmark": "WebArena",
45
- "score": 6.7,
46
- "std_err": 0.2,
47
- "benchmark_specific": "No",
48
- "benchmark_tuned": "No",
49
- "followed_evaluation_protocol": "Yes",
50
- "reproducible": "Yes",
51
- "comments": "NA"
52
- }
53
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-4o-V/results.json DELETED
@@ -1,52 +0,0 @@
1
- [
2
- {
3
- "benchmark": "WorkArena-L1",
4
- "score": 41.8,
5
- "std_err": 0.4,
6
- "benchmark_specific": "No",
7
- "benchmark_tuned": "No",
8
- "followed_evaluation_protocol": "Yes",
9
- "reproducible": "Yes",
10
- "comments": "NA"
11
- },
12
- {
13
- "benchmark": "WorkArena++-L2",
14
- "score": 3.8,
15
- "std_err": 0.6,
16
- "benchmark_specific": "No",
17
- "benchmark_tuned": "No",
18
- "followed_evaluation_protocol": "Yes",
19
- "reproducible": "Yes",
20
- "comments": "NA"
21
- },
22
- {
23
- "benchmark": "WorkArena++-L3",
24
- "score": 0.0,
25
- "std_err": 0.0,
26
- "benchmark_specific": "No",
27
- "benchmark_tuned": "No",
28
- "followed_evaluation_protocol": "Yes",
29
- "reproducible": "Yes",
30
- "comments": "NA"
31
- },
32
- {
33
- "benchmark": "MiniWoB",
34
- "score": 72.5,
35
- "std_err": 0.5,
36
- "benchmark_specific": "No",
37
- "benchmark_tuned": "No",
38
- "followed_evaluation_protocol": "Yes",
39
- "reproducible": "Yes",
40
- "comments": "NA"
41
- },
42
- {
43
- "benchmark": "WebArena",
44
- "score": 24.0,
45
- "std_err": 0.4,
46
- "benchmark_specific": "No",
47
- "benchmark_tuned": "No",
48
- "followed_evaluation_protocol": "Yes",
49
- "reproducible": "Yes",
50
- "comments": "NA"
51
- }
52
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-4o/results.json DELETED
@@ -1,52 +0,0 @@
1
- [
2
- {
3
- "benchmark": "WorkArena-L1",
4
- "score": 42.7,
5
- "std_err": 0.4,
6
- "benchmark_specific": "No",
7
- "benchmark_tuned": "No",
8
- "followed_evaluation_protocol": "Yes",
9
- "reproducible": "Yes",
10
- "comments": "NA"
11
- },
12
- {
13
- "benchmark": "WorkArena++-L2",
14
- "score": 3.0,
15
- "std_err": 0.6,
16
- "benchmark_specific": "No",
17
- "benchmark_tuned": "No",
18
- "followed_evaluation_protocol": "Yes",
19
- "reproducible": "Yes",
20
- "comments": "NA"
21
- },
22
- {
23
- "benchmark": "WorkArena++-L3",
24
- "score": 0.0,
25
- "std_err": 0.0,
26
- "benchmark_specific": "No",
27
- "benchmark_tuned": "No",
28
- "followed_evaluation_protocol": "Yes",
29
- "reproducible": "Yes",
30
- "comments": "NA"
31
- },
32
- {
33
- "benchmark": "MiniWoB",
34
- "score": 71.3,
35
- "std_err": 0.5,
36
- "benchmark_specific": "No",
37
- "benchmark_tuned": "No",
38
- "followed_evaluation_protocol": "Yes",
39
- "reproducible": "Yes",
40
- "comments": "NA"
41
- },
42
- {
43
- "benchmark": "WebArena",
44
- "score": 23.5,
45
- "std_err": 0.4,
46
- "benchmark_specific": "No",
47
- "benchmark_tuned": "No",
48
- "followed_evaluation_protocol": "Yes",
49
- "reproducible": "Yes",
50
- "comments": "NA"
51
- }
52
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Llama-3-70b/results.json DELETED
@@ -1,52 +0,0 @@
1
- [
2
- {
3
- "benchmark": "WorkArena-L1",
4
- "score": 17.9,
5
- "std_err": 0.6,
6
- "benchmark_specific": "No",
7
- "benchmark_tuned": "No",
8
- "followed_evaluation_protocol": "Yes",
9
- "reproducible": "Yes",
10
- "comments": "NA"
11
- },
12
- {
13
- "benchmark": "WorkArena++-L2",
14
- "score": 0.0,
15
- "std_err": 0.0,
16
- "benchmark_specific": "No",
17
- "benchmark_tuned": "No",
18
- "followed_evaluation_protocol": "Yes",
19
- "reproducible": "Yes",
20
- "comments": "NA"
21
- },
22
- {
23
- "benchmark": "WorkArena++-L3",
24
- "score": 0.0,
25
- "std_err": 0.0,
26
- "benchmark_specific": "No",
27
- "benchmark_tuned": "No",
28
- "followed_evaluation_protocol": "Yes",
29
- "reproducible": "Yes",
30
- "comments": "NA"
31
- },
32
- {
33
- "benchmark": "MiniWoB",
34
- "score": 68.2,
35
- "std_err": 0.7,
36
- "benchmark_specific": "No",
37
- "benchmark_tuned": "No",
38
- "followed_evaluation_protocol": "Yes",
39
- "reproducible": "Yes",
40
- "comments": "NA"
41
- },
42
- {
43
- "benchmark": "WebArena",
44
- "score": 11.0,
45
- "std_err": 0.3,
46
- "benchmark_specific": "No",
47
- "benchmark_tuned": "No",
48
- "followed_evaluation_protocol": "Yes",
49
- "reproducible": "Yes",
50
- "comments": "NA"
51
- }
52
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Mixtral-8x22b/results.json DELETED
@@ -1,52 +0,0 @@
1
- [
2
- {
3
- "benchmark": "WorkArena-L1",
4
- "score": 12.4,
5
- "std_err": 0.7,
6
- "benchmark_specific": "No",
7
- "benchmark_tuned": "No",
8
- "followed_evaluation_protocol": "Yes",
9
- "reproducible": "Yes",
10
- "comments": "NA"
11
- },
12
- {
13
- "benchmark": "WorkArena++-L2",
14
- "score": 0.0,
15
- "std_err": 0.0,
16
- "benchmark_specific": "No",
17
- "benchmark_tuned": "No",
18
- "followed_evaluation_protocol": "Yes",
19
- "reproducible": "Yes",
20
- "comments": "NA"
21
- },
22
- {
23
- "benchmark": "WorkArena++-L3",
24
- "score": 0.0,
25
- "std_err": 0.0,
26
- "benchmark_specific": "No",
27
- "benchmark_tuned": "No",
28
- "followed_evaluation_protocol": "Yes",
29
- "reproducible": "Yes",
30
- "comments": "NA"
31
- },
32
- {
33
- "benchmark": "MiniWoB",
34
- "score": 62.4,
35
- "std_err": 0.5,
36
- "benchmark_specific": "No",
37
- "benchmark_tuned": "No",
38
- "followed_evaluation_protocol": "Yes",
39
- "reproducible": "Yes",
40
- "comments": "NA"
41
- },
42
- {
43
- "benchmark": "WebArena",
44
- "score": 12.6,
45
- "std_err": 0.9,
46
- "benchmark_specific": "No",
47
- "benchmark_tuned": "No",
48
- "followed_evaluation_protocol": "Yes",
49
- "reproducible": "Yes",
50
- "comments": "NA"
51
- }
52
- ]