leaderboard / config.py
benediktstroebl's picture
big update with dynamic pricing, agent metadata, about page on top, and new benchmarks
import pandas as pd
"Agent Name",
"Total Cost",
# SWE-bench: column names listed for search.
# NOTE(review): how these lists are consumed is not visible here — confirm
# against the leaderboard code that imports them.
SWEBENCH_SEARCH_COLUMNS = [
    "Total Cost",
    "Agent Name",
]

# SWE-bench: column names listed for hiding (order preserved from the
# original single-line literal).
SWEBENCH_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]
"Agent Name",
"Total Cost",
# USACO: column names listed for search.
# NOTE(review): how these lists are consumed is not visible here — confirm
# against the leaderboard code that imports them.
USACO_SEARCH_COLUMNS = [
    "Total Cost",
    "Agent Name",
]

# USACO: column names listed for hiding (order preserved from the original
# single-line literal).
USACO_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]
"Agent Name",
"Total Cost",
# CORE-Bench: column names listed for search.
# NOTE(review): how these lists are consumed is not visible here — confirm
# against the leaderboard code that imports them.
COREBENCH_SEARCH_COLUMNS = [
    "Total Cost",
    "Agent Name",
]

# CORE-Bench: column names listed for hiding (order preserved from the
# original single-line literal).
COREBENCH_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]
"Agent Name",
"Overall Score",
"Total Cost",
# MLAgentBench: column names listed for search.
# NOTE(review): how these lists are consumed is not visible here — confirm
# against the leaderboard code that imports them.
MLAGENTBENCH_SEARCH_COLUMNS = [
    "Total Cost",
    "Agent Name",
]

# MLAgentBench: column names listed for hiding. Unlike the other benchmark
# hide lists in this file, this one contains "Accuracy" and none of the
# per-task "... Score" columns.
MLAGENTBENCH_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Accuracy",
]
"?": pd.Interval(-1, 0, closed="right"),
"~1.5": pd.Interval(0, 2, closed="right"),
"~3": pd.Interval(2, 4, closed="right"),
"~7": pd.Interval(4, 9, closed="right"),
"~13": pd.Interval(9, 20, closed="right"),
"~35": pd.Interval(20, 45, closed="right"),
"~60": pd.Interval(45, 70, closed="right"),
"70+": pd.Interval(70, 10000, closed="right"),
"Agent Name",
"Total Cost",
# Cybench: column names listed for search.
# NOTE(review): how these lists are consumed is not visible here — confirm
# against the leaderboard code that imports them.
CYBENCH_SEARCH_COLUMNS = [
    "Total Cost",
    "Agent Name",
]

# Cybench: column names listed for hiding (order preserved from the original
# single-line literal).
CYBENCH_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]
"Agent Name",
"Total Cost",
"Scenario Goal Completion"
# AppWorld: column names listed for search.
# NOTE(review): how these lists are consumed is not visible here — confirm
# against the leaderboard code that imports them.
APPWORLD_SEARCH_COLUMNS = [
    "Total Cost",
    "Agent Name",
]

# AppWorld: column names listed for hiding (order preserved from the original
# single-line literal).
APPWORLD_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
]
"Agent Name",
"Level 1 Accuracy",
"Level 2 Accuracy",
"Level 3 Accuracy",
"Total Cost",
# GAIA: column names listed for search.
# NOTE(review): how these lists are consumed is not visible here — confirm
# against the leaderboard code that imports them.
GAIA_SEARCH_COLUMNS = [
    "Total Cost",
    "Agent Name",
]

# GAIA: column names listed for hiding. This list does not include the
# "Level N Accuracy" entries that the other benchmark hide lists carry.
GAIA_HIDE_COLUMNS = [
    "F1 Score",
    "AUC",
    "Precision",
    "Recall",
    "benchmark_name",
    "Overall Score",
    "Vectorization Score",
    "Fathomnet Score",
    "Feedback Score",
    "House Price Score",
    "Spaceship Titanic Score",
    "AMP Parkinsons Disease Progression Prediction Score",
    "CIFAR10 Score",
    "IMDB Score",
]