Spaces:

allenai
/

ZebraLogic

Running

App Files Files Community

ZebraLogic / constants.py

yuchenlin

inti commit

1c919b3 5 months ago

raw

history blame

7.1 kB

	from pathlib import Path
	from collections import OrderedDict

	# Default "K" value (the infinity sign); "1500" was the alternative that the
	# original kept commented out.
	DEFAULT_K = "∞"

	# Address of the banner image embedded by BANNER below.
	banner_url = "https://allenai.github.io/WildBench/gray_banner.png"

	# HTML snippet: a left-aligned flex container holding the banner image.
	BANNER = (
	    '<div style="display: flex; justify-content: flex-start;">'
	    + '<img src="{src}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 800px;"> '.format(src=banner_url)
	    + '</div>'
	)

	# Page title markup: a centered <h1> heading inside a minimal HTML document.
	# The heading is terminated with </h1>; a mismatched closing tag here would
	# leave the <h1> element open for the rest of the body.
	TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </h1> </body> </html>"

	# HTML snippet that shows the pairwise win-fraction heatmap image at full width.
	WINRATE_HEATMAP = (
	    "<div>"
	    "<img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true'"
	    " style='width:100%;'>"
	    "</div>"
	)

	# BibTeX entry for the WildBench paper (arXiv 2406.04770). The literal is kept
	# verbatim because its exact text is what gets displayed/copied.
	CITATION_TEXT = """@misc{lin2024wildbench,
	title={WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild},
	author={Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Nouha Dziri and Ronan Le Bras and Yejin Choi},
	year={2024},
	eprint={2406.04770},
	archivePrefix={arXiv},
	primaryClass={cs.CL},
	url={https://arxiv.org/abs/2406.04770}
	}
	"""

	# Ordered mapping from raw column name to the name shown for it. Every active
	# column maps to itself; the insertion order below is the mapping's order.
	# Disabled (commented out in the original): "Total Puzzles", "Reason Lens".
	column_names = OrderedDict(
	    (label, label)
	    for label in (
	        "Model",
	        "Mode",
	        "Puzzle Acc",
	        "Cell Acc",
	        "No answer",
	        "Easy Puzzle Acc",
	        "Hard Puzzle Acc",
	    )
	)



	# Remark text describing how the "WB Reward" is computed for a pairwise
	# comparison (+/-1, +/-0.5, or 0 for a tie).
	# NOTE(review): the wording refers to WildBench pairwise rewards, while the
	# column names in this file are puzzle accuracies — confirm this is still used.
	LEADERBOARD_REMARKS = """WB Reward: for each comparison (A vs B), a reward for A is +/-1 if A is much better/worse than B, and +/-0.5 if A is slightly better/worse than B; when there is a Tie, the reward is 0.
	"""

	# Earlier remark text, kept only as comments (not part of any string):
	# WB Reward: for each pairwise comparison, a reward for A is +/-1 if A is much better/worse than B, and +/-0.5 if A is slightly better/worse than B; 0 for a Tie.
	# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
	# WB Score individually scores each model based on checklists.
	# Evaluator is GPT-4-Turbo.
	# Main-page remarks; currently whitespace only (a newline plus the indent
	# before the closing quotes).
	LEADERBOARD_REMARKS_MAIN = """
	"""

	# Name of the ranking column (presumably the default sort key — confirm
	# against the code that consumes it).
	RANKING_COLUMN = "Puzzle Acc"

	# Column names in their intended order: identity columns first, then the
	# puzzle-level accuracies, then cell accuracy and the no-answer column.
	ORDERED_COLUMN_NAMES = [
	    "Model", "Mode",
	    "Puzzle Acc", "Easy Puzzle Acc", "Hard Puzzle Acc",
	    "Cell Acc", "No answer",
	]


	# JavaScript source (as a string) defining refresh(): it forces the
	# `__theme=light` query parameter, then prepends a bold "Rank by: " label to
	# the element with id "rank-column-radio" by wrapping its children in a flex
	# container. The element lookup is null-checked so that pages without that
	# element do not raise a TypeError.
	js_light = """
	function refresh() {
	    const url = new URL(window.location);

	    if (url.searchParams.get('__theme') !== 'light') {
	        url.searchParams.set('__theme', 'light');
	        window.location.href = url.href;
	    }

	    // Find the fieldset with the given id
	    const fieldset = document.getElementById("rank-column-radio");

	    // getElementById returns null when no such element exists; there is
	    // nothing to decorate in that case, so stop instead of throwing below.
	    if (!fieldset) {
	        return;
	    }

	    // Create a new span element with the text "Rank by:"
	    const rankBySpan = document.createElement("span");
	    rankBySpan.textContent = "Rank by: ";
	    rankBySpan.style.fontWeight = "bold"; // Optional: make the text bold
	    rankBySpan.style.fontSize = "19px"; // Larger font size
	    rankBySpan.style.paddingRight = "18px"; // Add padding on the right

	    // Wrap the span and the labels in a flex container
	    const flexContainer = document.createElement("div");
	    flexContainer.style.display = "flex";
	    flexContainer.style.alignItems = "center";

	    // Insert the rankBySpan at the beginning of the flex container
	    flexContainer.appendChild(rankBySpan);

	    // Move all existing labels into the flex container
	    while (fieldset.firstChild) {
	        flexContainer.appendChild(fieldset.firstChild);
	    }

	    // Append the flex container back to the fieldset
	    fieldset.appendChild(flexContainer);
	}
	"""

	# JavaScript source (as a string) defining scroll_top(): it logs a greeting to
	# the console, then sets scrollTop to 0 on every element matching
	# '.bubble-wrap', staggering the resets by 100 ms per element index.
	js_code = """
	function scroll_top() {
	console.log("Hello from Gradio!");
	const bubbles = document.querySelectorAll('.bubble-wrap');
	bubbles.forEach((bubble, index) => {
	setTimeout(() => {
	bubble.scrollTop = 0;
	}, index * 100); // Delay of 100ms between each iteration
	});

	}
	"""


	# Human-readable legend of task types and their abbreviations, rendered as a
	# single comma-separated line prefixed with "Tasks: ".
	TASK_TYPE_STR = "Tasks: " + ", ".join(
	    (
	        "Info seeking (InfoSek)",
	        "Creative Writing (CrtWrt)",
	        "Coding&Debugging (Code)",
	        "Reasoning (Reason)",
	        "Editing (Edit)",
	        "Math",
	        "Planning (Plan)",
	        "Brainstorming (Brnstrm)",
	        "Role playing (RolPly)",
	        "Advice seeking (AdvSek)",
	        "Data Analysis (DataAna)",
	    )
	)

	# Stylesheet text for the app UI (presumably handed to the UI framework as
	# custom CSS — confirm against the caller). Only valid CSS is used here:
	#   * comments use /* ... */ (a leading "#" is not a CSS comment),
	#   * italics are set with font-style and boldness with font-weight
	#     ("font-decoration" and "font-color" are not CSS properties),
	#   * each selector is declared once, so no rule silently overrides another.
	css = """
	code {
	    font-size: large;
	}
	footer {visibility: hidden}
	.top-left-LP{
	    margin-top: 6px;
	    margin-left: 5px;
	}
	.no_margin{
	    margin-top: 0px;
	    margin-left: 0px;
	    margin-right: 0px;
	    margin-bottom: 0px;
	    padding-top: 0px;
	    padding-left: 0px;
	    padding-right: 0px;
	    padding-bottom: 0px;
	}
	.markdown-text{font-size: 14pt}
	.markdown-text-small{font-size: 13pt}
	.markdown-text-tiny{font-size: 12pt}
	.markdown-text-tiny-red{
	    font-size: 12pt;
	    color: red;
	    background-color: yellow;
	    font-weight: bold;
	}
	th {
	    text-align: center;
	    font-size: 17px; /* Adjust the font size as needed */
	}
	td {
	    font-size: 15px; /* Adjust the font size as needed */
	    text-align: center;
	}

	.sample_button{
	    border: 2px solid #000000;
	    border-radius: 10px;
	    padding: 10px;
	    font-size: 17pt;
	    font-weight: bold;
	    margin: 5px;
	    background-color: #D8BFD8;
	}

	.chat-common{
	    height: auto;
	    max-height: 400px;
	    min-height: 100px;
	}
	.chat-specific{
	    height: auto;
	    max-height: 600px;
	    min-height: 200px;
	}
	#od-benchmark-tab-table-button{
	    font-size: 15pt;
	    font-weight: bold;
	}

	.btn_boderline{
	    border: 1px solid #000000;
	    border-radius: 5px;
	    padding: 5px;
	    margin: 5px;
	    font-size: 15pt;
	    font-weight: bold;
	}

	.btn_boderline_next{
	    border: 0.1px solid #000000;
	    border-radius: 5px;
	    padding: 5px;
	    margin: 5px;
	    font-size: 15pt;
	    font-weight: bold;
	}

	.btn_boderline_gray{
	    border: 0.5px solid gray;
	    border-radius: 5px;
	    padding: 5px;
	    margin: 5px;
	    font-size: 15pt;
	    font-style: italic;
	}
	.btn_boderline_selected{
	    border: 2px solid purple;
	    background-color: #f2f2f2;
	    border-radius: 5px;
	    padding: 5px;
	    margin: 5px;
	    font-size: 15pt;
	    font-weight: bold;
	}
	.accordion-label button span{
	    font-size: 14pt;
	    font-weight: bold;
	}

	#show-task-categorized span{
	    font-size: 13pt;
	    font-weight: bold;
	}

	#show-open-source-models span{
	    font-size: 13pt;
	    font-weight: bold;
	}

	#select-models span{
	    font-size: 10pt;
	}

	#select-tasks span{
	    font-size: 10pt;
	}


	.markdown-text-details{
	    margin: 10px;
	    padding: 10px;
	}


	button.selected[role="tab"][aria-selected="true"] {
	    font-size: 18px; /* or any other size you prefer */
	    font-weight: bold;
	}

	#od-benchmark-tab-table-ablation-button {
	    font-size: larger; /* Adjust the font size as needed */
	}


	.plotly-plot{
	    height: auto;
	    max-height: 600px;
	    min-height: 600px;
	}

	#length-margin-radio{
	    font-size: 10pt;
	    /* padding: 0px; */
	    /* margin: 1px; */
	}

	#show-task-categorized{
	    font-size: 12pt;
	    font-weight: bold;
	}

	#show-open-source-models{
	    font-size: 12pt;
	    font-weight: bold;
	}
	"""