Spaces:

felixz
/

meta_open_llm_leaderboard

Running

felix

add placeholder

e12cc65 over 1 year ago

5.53 kB

	# app.py
	import glob
	import html
	import json
	import os
	from datetime import datetime

	import streamlit as st

	# Wide page layout is currently disabled; uncomment to enable it.
	#st.set_page_config(layout="wide")
	st.title('Meta Open LLM leaderboard')

	# Each entry under ./data is expected to be a snapshot directory named
	# YYYYmmdd_HHMM (the format parsed by format_dir_date below).
	# os.listdir() returns entries in arbitrary order, while the date picker
	# pre-selects the LAST entry as the default. Sorting the fixed-width
	# timestamp names puts them in chronological order, so the newest snapshot
	# ends up last. Stray non-directory entries are skipped because they cannot
	# hold a snapshot.
	directories = sorted(
	    entry
	    for entry in os.listdir("./data")
	    if os.path.isdir(os.path.join("./data", entry))
	)

	def format_dir_date(data_dir):
	    """Turn a snapshot directory name into a human-readable timestamp.

	    Args:
	        data_dir: Directory name in ``YYYYmmdd_HHMM`` form,
	            e.g. ``"20231005_1430"``.

	    Returns:
	        The same moment formatted like ``"Oct 05, 2023 14:30"``.

	    Raises:
	        ValueError: If ``data_dir`` does not match the expected pattern.
	    """
	    dir_name_pattern = "%Y%m%d_%H%M"
	    snapshot_time = datetime.strptime(data_dir, dir_name_pattern)
	    # A datetime format spec inside an f-string is applied via strftime().
	    return f"{snapshot_time:%b %d, %Y %H:%M}"

	# Without any snapshot the selectbox below would receive an invalid default
	# index (-1), so report the problem and end this script run instead.
	if not directories:
	    st.error("No leaderboard snapshots found in ./data")
	    st.stop()

	# Snapshot picker: entries are displayed as readable dates and the last
	# entry of the list is pre-selected.
	data_dir = st.selectbox(
	    'Select different Date',
	    directories,
	    format_func=format_dir_date,
	    index=len(directories) - 1,
	)

	# Captions for the comparison charts, keyed by image file name.
	# (The OpenCompass caption previously carried a stray "</a>" tag.)
	captions_map = {
	    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
	    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
	    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
	    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
	    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare",
	}

	st.write("Generated on: <b>" + format_dir_date(data_dir) + "</b>", unsafe_allow_html=True)
	st.divider()

	data_path = os.path.join('./data', data_dir)

	# glob.glob() yields matches in arbitrary, filesystem-dependent order, but the
	# rest of the page picks diagrams by position (hf_llm_diagrams[0] .. [3]).
	# Sorting makes that mapping deterministic across machines and runs.
	# NOTE(review): confirm the diagram file names sort in the order the page
	# expects (size chart, the two license charts, then the TruthfulQA chart).
	imgs = sorted(glob.glob(os.path.join(data_path, '*.png')))

	# Images whose file name contains "hf_llm_diagram"
	hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]

	# All other images (the leaderboard comparison charts)
	remaining_imgs = [img for img in imgs if 'hf_llm_diagram' not in os.path.basename(img)]

	def _model_links_html(model_ids, max_label_len=35):
	    """Render model ids as a ``<ul>`` of links to their Hugging Face pages."""
	    items = []
	    for model_id in model_ids:
	        # Long ids are shortened to their trailing characters for display;
	        # the link target always uses the full id.
	        if len(model_id) <= max_label_len:
	            label = model_id
	        else:
	            label = '...' + model_id[-max_label_len:]
	        # The ids come from a data file and the result is rendered with
	        # unsafe_allow_html=True, so escape them before embedding in markup.
	        href = "https://huggingface.co/" + html.escape(model_id, quote=True)
	        items.append(f'<li><a href="{href}">{html.escape(label)}</a></li>')
	    return "<ul>" + "".join(items) + "</ul>"


	def print_model_list(file_name, st, split_into_two=False):
	    """Write the list of models that accompanies a diagram image.

	    The model ids are read from the JSON file that sits next to the image,
	    i.e. ``file_name`` with its extension replaced by ``.json``. That file
	    must contain a JSON list of model id strings.

	    Args:
	        file_name: Path of the diagram image (e.g. ``.../hf_llm_diagram.png``).
	        st: Render target. Any object exposing
	            ``write(body, unsafe_allow_html=...)`` and ``columns(n)``;
	            callers pass either the streamlit module or a column object.
	        split_into_two: When true, the list is split across two columns
	            created on ``st``; the left column receives the extra item if
	            the count is odd. Otherwise a single list is written to ``st``.

	    Raises:
	        OSError: If the JSON file cannot be opened.
	        json.JSONDecodeError: If the file does not contain valid JSON.
	    """
	    # splitext copes with any extension length, unlike slicing off 4 chars.
	    json_path = os.path.splitext(file_name)[0] + '.json'
	    with open(json_path, 'r', encoding='utf-8') as file:
	        model_ids = json.load(file)

	    if split_into_two:
	        # Round up so the left column gets the extra item of an odd count.
	        midpoint = (len(model_ids) + 1) // 2
	        cols = st.columns(2)
	        cols[0].write(_model_links_html(model_ids[:midpoint]), unsafe_allow_html=True)
	        cols[1].write(_model_links_html(model_ids[midpoint:]), unsafe_allow_html=True)
	    else:
	        st.write(_model_links_html(model_ids), unsafe_allow_html=True)

	# --- HF Open LLM leaderboard broken down by model size ---
	st.write("HuggingFace Open LLM leaderboard by Model Size")
	st.image(hf_llm_diagrams[0], use_column_width="auto")
	print_model_list(hf_llm_diagrams[0], st, split_into_two=True)

	# Two license-filtered variants side by side, each followed by its own
	# model list inside the same column. The diagram is looked up inside the
	# loop so each panel is rendered before the next index is touched.
	license_panels = (
	    (1, "Other or commercially permissive licenses only"),
	    (2, "Commercially permissive license only"),
	)
	for panel, (diagram_idx, panel_caption) in zip(st.columns(2), license_panels):
	    panel.image(hf_llm_diagrams[diagram_idx], caption=panel_caption, use_column_width="auto")
	    print_model_list(hf_llm_diagrams[diagram_idx], panel)

	# --- Cross-leaderboard comparison charts ---
	st.divider()
	st.write("HuggingFace and Other Leaderboards: A Comparative Model Evaluation")
	st.caption("Only models evaluated on both leaderboards are included.")
	compare_cols = st.columns(2)

	# Alternate the charts between the left and right column.
	for position, img_path in enumerate(remaining_imgs):
	    target_col = compare_cols[position % 2]
	    # Charts without an entry in captions_map get an empty caption.
	    chart_caption = captions_map.get(os.path.basename(img_path), "")
	    target_col.image(img_path, caption=chart_caption, width=None)

	# Static list of the leaderboards that feed this page.
	leaderboards_html = """
	<p>Leaderboards tracked:</p>
	<ul>
	<li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
	<li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench</a> GPT4 judged evaluation of models</li>
	<li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
	<li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
	<li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>
	<li><a href="https://llmbench.ai/data">AgentBench</a> Benchmark evaluating Agent abilities</li>
	</ul>
	"""
	st.write(leaderboards_html, unsafe_allow_html=True)
	st.divider()

	# --- TruthfulQA-weighted variant of the model-size chart ---
	st.write("TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size")
	st.image(hf_llm_diagrams[3], use_column_width="auto")
	print_model_list(hf_llm_diagrams[3], st, split_into_two=True)

	# --- About ---
	st.divider()
	st.subheader('About')
	about_html = 'This meta leaderboard is built and maintained by Felix Zaslavskiy. For feedback, correction, suggestions please reach out on X at <a href="https://twitter.com/FZaslavskiy" >@FZaslavskiy</a> or here via community discussions.'
	st.write(about_html, unsafe_allow_html=True)