Spaces:

agent-evals
/

leaderboard

Running

App Files Files Community

leaderboard / config.py

benediktstroebl

big update with dynamic pricing, agent metadata, about page on top, and new benchmarks

56a86ce 3 months ago

raw

history blame contribute delete

3.8 kB

	import pandas as pd

	# Type tags: one "str" entry followed by two "number" entries.
	# NOTE(review): presumably per-column datatypes for a table widget, but only
	# three are given while the *_ON_LOAD_COLUMNS lists in this file name three
	# to seven columns -- confirm which table consumes this.
	TYPES = ["str", "number", "number"]

	# SWEBENCH column configuration.
	# NOTE(review): going by the names, ON_LOAD lists the columns shown
	# initially, SEARCH the columns a search applies to, and HIDE the columns
	# kept out of the table -- confirm against the code that reads them.
	SWEBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]
	SWEBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
	SWEBENCH_HIDE_COLUMNS = [
		# Classification-style metrics.
		"F1 Score",
		"AUC",
		"Precision",
		"Recall",
		# Bookkeeping column.
		"benchmark_name",
		# Score columns ("Overall Score" appears in MLAGENTBENCH_ON_LOAD_COLUMNS).
		"Overall Score",
		"Vectorization Score",
		"Fathomnet Score",
		"Feedback Score",
		"House Price Score",
		"Spaceship Titanic Score",
		"AMP Parkinsons Disease Progression Prediction Score",
		"CIFAR10 Score",
		"IMDB Score",
		# Per-level accuracy columns (listed in GAIA_ON_LOAD_COLUMNS).
		"Level 1 Accuracy",
		"Level 2 Accuracy",
		"Level 3 Accuracy",
	]

	# USACO column configuration.
	# NOTE(review): going by the names, ON_LOAD lists the columns shown
	# initially, SEARCH the columns a search applies to, and HIDE the columns
	# kept out of the table -- confirm against the code that reads them.
	USACO_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]
	USACO_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
	USACO_HIDE_COLUMNS = [
		# Classification-style metrics.
		"F1 Score",
		"AUC",
		"Precision",
		"Recall",
		# Bookkeeping column.
		"benchmark_name",
		# Score columns ("Overall Score" appears in MLAGENTBENCH_ON_LOAD_COLUMNS).
		"Overall Score",
		"Vectorization Score",
		"Fathomnet Score",
		"Feedback Score",
		"House Price Score",
		"Spaceship Titanic Score",
		"AMP Parkinsons Disease Progression Prediction Score",
		"CIFAR10 Score",
		"IMDB Score",
		# Per-level accuracy columns (listed in GAIA_ON_LOAD_COLUMNS).
		"Level 1 Accuracy",
		"Level 2 Accuracy",
		"Level 3 Accuracy",
	]

	# COREBENCH column configuration.
	# NOTE(review): going by the names, ON_LOAD lists the columns shown
	# initially, SEARCH the columns a search applies to, and HIDE the columns
	# kept out of the table -- confirm against the code that reads them.
	COREBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]
	COREBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
	COREBENCH_HIDE_COLUMNS = [
		# Classification-style metrics.
		"F1 Score",
		"AUC",
		"Precision",
		"Recall",
		# Bookkeeping column.
		"benchmark_name",
		# Score columns ("Overall Score" appears in MLAGENTBENCH_ON_LOAD_COLUMNS).
		"Overall Score",
		"Vectorization Score",
		"Fathomnet Score",
		"Feedback Score",
		"House Price Score",
		"Spaceship Titanic Score",
		"AMP Parkinsons Disease Progression Prediction Score",
		"CIFAR10 Score",
		"IMDB Score",
		# Per-level accuracy columns (listed in GAIA_ON_LOAD_COLUMNS).
		"Level 1 Accuracy",
		"Level 2 Accuracy",
		"Level 3 Accuracy",
	]



	# MLAGENTBENCH column configuration. Unlike the other benchmark groups in
	# this file, the on-load list names "Overall Score" rather than "Accuracy"
	# and has no "Runs" entry, and "Accuracy" is placed in the hide list.
	# NOTE(review): going by the names, ON_LOAD lists the columns shown
	# initially, SEARCH the columns a search applies to, and HIDE the columns
	# kept out of the table -- confirm against the code that reads them.
	MLAGENTBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Overall Score", "Total Cost"]
	MLAGENTBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
	MLAGENTBENCH_HIDE_COLUMNS = [
		# Classification-style metrics.
		"F1 Score",
		"AUC",
		"Precision",
		"Recall",
		# Bookkeeping column.
		"benchmark_name",
		# Headline metric used by the other benchmark groups.
		"Accuracy",
	]


	# Maps a display label to a right-closed pandas Interval, i.e. a value x
	# falls in a bucket when low < x <= high. Consecutive buckets share their
	# endpoints, so together they cover (-1, 10000] with no gaps or overlaps.
	# NOTE(review): the unit of the bounds is not stated in this file -- confirm
	# with whatever code filters on these buckets.
	NUMERIC_INTERVALS = {
		bucket_label: pd.Interval(low, high, closed="right")
		for bucket_label, low, high in (
			("?", -1, 0),
			("~1.5", 0, 2),
			("~3", 2, 4),
			("~7", 4, 9),
			("~13", 9, 20),
			("~35", 20, 45),
			("~60", 45, 70),
			("70+", 70, 10000),
		)
	}

	# CYBENCH column configuration.
	# NOTE(review): going by the names, ON_LOAD lists the columns shown
	# initially, SEARCH the columns a search applies to, and HIDE the columns
	# kept out of the table -- confirm against the code that reads them.
	CYBENCH_ON_LOAD_COLUMNS = ["Agent Name", "Accuracy", "Total Cost", "Runs"]
	CYBENCH_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
	CYBENCH_HIDE_COLUMNS = [
		# Classification-style metrics.
		"F1 Score",
		"AUC",
		"Precision",
		"Recall",
		# Bookkeeping column.
		"benchmark_name",
		# Score columns ("Overall Score" appears in MLAGENTBENCH_ON_LOAD_COLUMNS).
		"Overall Score",
		"Vectorization Score",
		"Fathomnet Score",
		"Feedback Score",
		"House Price Score",
		"Spaceship Titanic Score",
		"AMP Parkinsons Disease Progression Prediction Score",
		"CIFAR10 Score",
		"IMDB Score",
		# Per-level accuracy columns (listed in GAIA_ON_LOAD_COLUMNS).
		"Level 1 Accuracy",
		"Level 2 Accuracy",
		"Level 3 Accuracy",
	]

	# APPWORLD column configuration. The on-load list carries one extra entry,
	# "Scenario Goal Completion", after the four names used by most other
	# benchmark groups in this file.
	# NOTE(review): going by the names, ON_LOAD lists the columns shown
	# initially, SEARCH the columns a search applies to, and HIDE the columns
	# kept out of the table -- confirm against the code that reads them.
	APPWORLD_ON_LOAD_COLUMNS = [
		"Agent Name", "Accuracy", "Total Cost", "Runs", "Scenario Goal Completion",
	]
	APPWORLD_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
	APPWORLD_HIDE_COLUMNS = [
		# Classification-style metrics.
		"F1 Score",
		"AUC",
		"Precision",
		"Recall",
		# Bookkeeping column.
		"benchmark_name",
		# Score columns ("Overall Score" appears in MLAGENTBENCH_ON_LOAD_COLUMNS).
		"Overall Score",
		"Vectorization Score",
		"Fathomnet Score",
		"Feedback Score",
		"House Price Score",
		"Spaceship Titanic Score",
		"AMP Parkinsons Disease Progression Prediction Score",
		"CIFAR10 Score",
		"IMDB Score",
		# Per-level accuracy columns (listed in GAIA_ON_LOAD_COLUMNS).
		"Level 1 Accuracy",
		"Level 2 Accuracy",
		"Level 3 Accuracy",
	]

	# GAIA column configuration. The three "Level N Accuracy" names sit in the
	# on-load list here and are therefore absent from the hide list.
	# NOTE(review): going by the names, ON_LOAD lists the columns shown
	# initially, SEARCH the columns a search applies to, and HIDE the columns
	# kept out of the table -- confirm against the code that reads them.
	GAIA_ON_LOAD_COLUMNS = [
		"Agent Name", "Accuracy",
		"Level 1 Accuracy", "Level 2 Accuracy", "Level 3 Accuracy",
		"Total Cost", "Runs",
	]
	GAIA_SEARCH_COLUMNS = ["Total Cost", "Agent Name"]
	GAIA_HIDE_COLUMNS = [
		# Classification-style metrics.
		"F1 Score",
		"AUC",
		"Precision",
		"Recall",
		# Bookkeeping column.
		"benchmark_name",
		# Score columns ("Overall Score" appears in MLAGENTBENCH_ON_LOAD_COLUMNS).
		"Overall Score",
		"Vectorization Score",
		"Fathomnet Score",
		"Feedback Score",
		"House Price Score",
		"Spaceship Titanic Score",
		"AMP Parkinsons Disease Progression Prediction Score",
		"CIFAR10 Score",
		"IMDB Score",
	]