Spaces:

ServiceNow
/

browsergym-leaderboard

Running

App Files Files Community

meghsn committed on Nov 21, 2024

Commit

2705446

1 Parent(s): ea81237

Removed old results files

Browse files

Files changed (6) hide show

app.py +1 -1
results/Bgym-GPT-3.5/results.json +0 -53
results/Bgym-GPT-4o-V/results.json +0 -52
results/Bgym-GPT-4o/results.json +0 -52
results/Bgym-Llama-3-70b/results.json +0 -52
results/Bgym-Mixtral-8x22b/results.json +0 -52

app.py CHANGED Viewed

@@ -130,7 +130,7 @@ def check_sanity(agent):
     return True
 def main():
-    st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")
     all_agents = os.listdir("results")
     all_results = {}

     return True
 def main():
+    st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide")
     all_agents = os.listdir("results")
     all_results = {}

results/Bgym-GPT-3.5/results.json DELETED Viewed

@@ -1,53 +0,0 @@
-[
-    {
-        "benchmark": "WorkArena-L1",
-        "score": 6.1,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "reproduced": [["aug 2025", 0.65, 0.05, "study_id"]],
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L2",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "MiniWoB",
-        "score": 43.4,
-        "std_err": 0.1,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WebArena",
-        "score": 6.7,
-        "std_err": 0.2,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    }
-]

results/Bgym-GPT-4o-V/results.json DELETED Viewed

@@ -1,52 +0,0 @@
-[
-    {
-        "benchmark": "WorkArena-L1",
-        "score": 41.8,
-        "std_err": 0.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L2",
-        "score": 3.8,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "MiniWoB",
-        "score": 72.5,
-        "std_err": 0.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WebArena",
-        "score": 24.0,
-        "std_err": 0.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    }
-]

results/Bgym-GPT-4o/results.json DELETED Viewed

@@ -1,52 +0,0 @@
-[
-    {
-        "benchmark": "WorkArena-L1",
-        "score": 42.7,
-        "std_err": 0.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L2",
-        "score": 3.0,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "MiniWoB",
-        "score": 71.3,
-        "std_err": 0.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WebArena",
-        "score": 23.5,
-        "std_err": 0.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    }
-]

results/Bgym-Llama-3-70b/results.json DELETED Viewed

@@ -1,52 +0,0 @@
-[
-    {
-        "benchmark": "WorkArena-L1",
-        "score": 17.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L2",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "MiniWoB",
-        "score": 68.2,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WebArena",
-        "score": 11.0,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    }
-]

results/Bgym-Mixtral-8x22b/results.json DELETED Viewed

@@ -1,52 +0,0 @@
-[
-    {
-        "benchmark": "WorkArena-L1",
-        "score": 12.4,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L2",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WorkArena++-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "MiniWoB",
-        "score": 62.4,
-        "std_err": 0.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    },
-    {
-        "benchmark": "WebArena",
-        "score": 12.6,
-        "std_err": 0.9,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA"
-    }
-]