# Page header captured from the Hugging Face file viewer (not part of the program):
#   last commit by Weyaxi: "minor error fix" (d2fc390), file size 10.3 kB
import re
import json
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from huggingface_hub import HfApi, list_models, list_datasets, list_spaces
import gradio as gr
# Shared Hub API client; get_models() uses it to list models, datasets and spaces.
api = HfApi()
def get_models(org_name, which_one):
    """Return id, download and like counts for every repository of one author.

    Despite its name, this lists models, datasets or spaces depending on
    ``which_one``.

    Args:
        org_name: User or organization name, passed to the Hub API as the
            ``author`` filter.
        which_one: Kind of repository to list: ``"models"``, ``"datasets"``
            or ``"spaces"``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``downloads`` and
        ``likes``, one row per repository. It has zero rows (but still these
        three columns) when nothing is listed. Spaces always get
        ``downloads`` set to 0.

    Raises:
        ValueError: If ``which_one`` is not one of the three supported kinds.
    """
    listers = {
        "models": api.list_models,
        "datasets": api.list_datasets,
        "spaces": api.list_spaces,
    }
    if which_one not in listers:
        raise ValueError(
            f"which_one must be one of {sorted(listers)}, got {which_one!r}"
        )

    rows = []
    for info in listers[which_one](author=org_name):
        if which_one == "spaces":
            downloads = 0
        else:
            # A missing or None count is treated as 0 so the totals still add up.
            downloads = getattr(info, "downloads", None) or 0
        rows.append({
            "id": info.id,
            "downloads": downloads,
            "likes": getattr(info, "likes", None) or 0,
        })

    # Explicit columns keep the frame's shape stable even when it is empty.
    return pd.DataFrame(rows, columns=["id", "downloads", "likes"])
def get_most(df_for_most_function):
    """Pick the most-downloaded and the most-liked row of a repository table.

    Args:
        df_for_most_function: DataFrame with ``id``, ``downloads`` and
            ``likes`` columns and at least one row.

    Returns:
        ``{"Most Download": {...}, "Most Likes": {...}}`` where each inner
        dict holds the ``id``, ``downloads`` and ``likes`` of the winning row.
    """
    def _top_row(column):
        # Sort descending on the requested column and keep only the first row.
        ranked = df_for_most_function.sort_values(by=[column], ascending=False)
        winner = ranked.iloc[0]
        return {field: winner[field] for field in ("id", "downloads", "likes")}

    return {
        "Most Download": _top_row("downloads"),
        "Most Likes": _top_row("likes"),
    }
def get_sum(df_for_sum_function):
    """Add up the ``downloads`` and ``likes`` columns of a repository table.

    Args:
        df_for_sum_function: DataFrame with ``downloads`` and ``likes``
            columns.

    Returns:
        ``{"Downloads": <total downloads>, "Likes": <total likes>}``.
    """
    totals = {}
    for label, column in (("Downloads", "downloads"), ("Likes", "likes")):
        totals[label] = sum(df_for_sum_function[column].tolist())
    return totals
def get_openllm_leaderboard():
    """Scrape the Open LLM Leaderboard Space and list its models in table order.

    The page's second ``<script>`` element is parsed as JSON, and the table is
    read from a fixed component index of that config. Both positions are
    hard-coded, so this breaks whenever the page layout changes.

    Returns:
        A list of ``"owner/model"`` strings in the order the rows appear.
        Reading stops at the first row that is too short or carries no
        ``href`` link, so the list may be only a prefix of the table. An empty
        list is returned when the config has fewer components than expected.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            response.
    """
    url = 'https://huggingfaceh4-open-llm-leaderboard.hf.space/'
    # Without a timeout a stalled Space would block the caller indefinitely.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    script_elements = soup.find_all('script')
    # NOTE(review): the slice presumably strips "<script>window.gradio_config = "
    # (31 chars) and ";</script>" (10 chars) around the JSON -- confirm.
    data = json.loads(str(script_elements[1])[31:-10])

    component_index = 11  # position of the leaderboard table in the config
    name_column = 1       # cell that holds the model link
    link_pattern = re.compile(r'href="([^"]*)"')

    try:
        rows = data['components'][component_index]['props']['value']['data']
    except IndexError:
        # Fewer components than expected: nothing to report.
        return []

    result_list = []
    for row in rows:
        try:
            cell = row[name_column].rstrip("\n")
            link = link_pattern.search(cell).group(1)
        except (IndexError, AttributeError):
            # Short row, non-string cell or no link: stop here so the
            # positions collected so far still match the table order.
            break
        # Keep only the trailing "owner/model" part of the URL.
        result_list.append("/".join(link.split("/")[-2:]))
    return result_list
def get_ranking(model_list, target_org):
    """Find the first model in ``model_list`` that belongs to ``target_org``.

    The owner is the part of each entry before the first ``/`` and is
    compared case-insensitively.

    Args:
        model_list: Model names, typically ``"owner/model"`` strings.
        target_org: Owner name to look for.

    Returns:
        ``[position, model]`` with a 1-based position for the first match, or
        the string ``"Not Found"`` when no entry matches.
    """
    matches = (
        [position, name]
        for position, name in enumerate(model_list, start=1)
        if name.partition("/")[0].lower() == target_org.lower()
    )
    return next(matches, "Not Found")
def make_leaderboard(orgs, which_one):
    """Build the per-organization leaderboard table for one repository kind.

    Organizations for which ``get_models`` returns no rows are left out.

    Args:
        orgs: Iterable of organization names.
        which_one: ``"models"``, ``"datasets"`` or ``"spaces"``.

    Returns:
        A ``pandas.DataFrame`` sorted descending by ``"Total Downloads"``
        (``"Total Likes"`` for spaces), with a 1-based ``"Serial Number"``
        column inserted first. When no organization contributes a row, the
        frame is empty but still carries every column.

    Raises:
        ValueError: If ``which_one`` is not one of the three supported kinds.
    """
    nouns = {"models": "Model", "datasets": "Dataset", "spaces": "Space"}
    if which_one not in nouns:
        raise ValueError(
            f"which_one must be one of {sorted(nouns)}, got {which_one!r}"
        )
    noun = nouns[which_one]
    is_models = which_one == "models"
    has_downloads = which_one != "spaces"  # spaces report no download counts

    # Column order matters: the UI labels the columns by position.
    columns = ["Organization Name"]
    if has_downloads:
        columns.append("Total Downloads")
    columns += ["Total Likes", f"Number of {noun}s"]
    if is_models:
        columns += ["Best Model On Open LLM Leaderboard",
                    "Best Rank On Open LLM Leaderboard"]
    if has_downloads:
        columns.append(f"Average Downloads per {noun}")
    columns.append(f"Average Likes per {noun}")
    if has_downloads:
        columns += [f"Most Downloaded {noun}", "Most Download Count"]
    columns += [f"Most Liked {noun}", "Most Like Count"]

    # The Open LLM Leaderboard is only relevant for models; scrape it once.
    open_llm_leaderboard = get_openllm_leaderboard() if is_models else None

    data_rows = []
    for org in tqdm(orgs, desc=f"Scraping Organizations ({which_one})", position=0, leave=True):
        df = get_models(org, which_one)
        num_things = len(df)
        if num_things == 0:
            continue

        sum_info = get_sum(df)
        most_info = get_most(df)

        best_model = best_rank = "Not Found"
        if is_models:
            ranking = get_ranking(open_llm_leaderboard, org)
            if ranking != "Not Found":
                best_rank, best_model = ranking

        # Superset of every possible column; only those in `columns` are kept.
        values = {
            "Organization Name": org,
            "Total Downloads": sum_info["Downloads"],
            "Total Likes": sum_info["Likes"],
            f"Number of {noun}s": num_things,
            "Best Model On Open LLM Leaderboard": best_model,
            "Best Rank On Open LLM Leaderboard": best_rank,
            f"Average Downloads per {noun}": int(sum_info["Downloads"] / num_things),
            f"Average Likes per {noun}": int(sum_info["Likes"] / num_things),
            f"Most Downloaded {noun}": most_info["Most Download"]["id"],
            "Most Download Count": most_info["Most Download"]["downloads"],
            f"Most Liked {noun}": most_info["Most Likes"]["id"],
            "Most Like Count": most_info["Most Likes"]["likes"],
        }
        data_rows.append({name: values[name] for name in columns})

    # Passing the columns explicitly keeps the sort below valid even when no
    # organization produced a row.
    leaderboard = pd.DataFrame(data_rows, columns=columns)
    sort_column = "Total Downloads" if has_downloads else "Total Likes"
    leaderboard = leaderboard.sort_values(by=[sort_column], ascending=False)
    leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
    return leaderboard
# Organization names to rank, one per line of org_names.txt. Surrounding
# whitespace is dropped and blank lines are skipped so that they are not
# scraped as if they were organization names.
with open("org_names.txt", "r", encoding="utf-8") as f:
    org_names_in_list = [name for name in (line.strip() for line in f) if name]

# Introductory markdown shown at the top of the page.
markdown_main_text = """
🎯 The Organization Leaderboard aims to track organizations ranking. This space is inspired by [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
## Dataframes Available:
- 🏛️ Models
- 📊 Datasets
- 🚀 Spaces
## Backend
🛠️ The leaderboard's backend mainly runs the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
🛠️ Organization names are being retrieved using web scraping ([Hugging Face Organizations](https://huggingface.co/organizations))
**🌐 Note:** In the models dataframe there are some columns related to [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved with web scraping.
"""
def clickable(x, which_one):
    """Wrap a Hub identifier in an HTML link to its huggingface.co page.

    Args:
        x: Identifier to link (organization name or repository id).
        which_one: ``"models"`` links to ``huggingface.co/<x>``; any other
            value is used as a path prefix, ``huggingface.co/<which_one>/<x>``.

    Returns:
        The anchor markup as a string. For ``"models"`` the placeholder
        ``"Not Found"`` is returned unchanged instead of being linked.
    """
    if which_one == "models":
        if x == "Not Found":
            return "Not Found"
        path = f"{x}"
    else:
        path = f"{which_one}/{x}"
    return f'<a target="_blank" href="https://huggingface.co/{path}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{x}</a>'
def df_to_clickable(df, columns, which_one):
    """Replace the values of the given columns with HTML links, in place.

    Args:
        df: DataFrame to modify.
        columns: Names of the columns whose values should become links.
        which_one: Link kind handed to ``clickable`` for every column except
            ``"Organization Name"``, which is always linked as ``"models"``.

    Returns:
        The same ``df`` object, after modification.
    """
    for name in columns:
        link_kind = "models" if name == "Organization Name" else which_one
        # Bind the kind as a default so each lambda keeps its own value.
        df[name] = df[name].apply(lambda cell, kind=link_kind: clickable(cell, kind))
    return df
def _render_leaderboard_tab(tab_label, tab_id, which_one, link_columns, headers, datatype, interactive):
    """Create one tab holding the leaderboard table for ``which_one``.

    Builds the leaderboard for the module-level organization list, turns the
    columns named in ``link_columns`` into links and shows the result in a
    ``gr.Dataframe``. Returns the DataFrame that is displayed.
    """
    with gr.TabItem(tab_label, id=tab_id):
        table = make_leaderboard(org_names_in_list, which_one)
        table = df_to_clickable(table, link_columns, which_one)
        gr.Dataframe(table, headers=headers, interactive=interactive, datatype=datatype)
    return table


# Page layout: title, introduction, then one tab per repository kind.
with gr.Blocks() as demo:
    gr.Markdown("""<h1 align="center" id="space-title">🤗 Organization Leaderboard</h1>""")
    gr.Markdown(markdown_main_text, elem_classes="markdown-text")

    # NOTE(review): this is the only table created with interactive=True;
    # the other two are read-only -- confirm that this is intended.
    models_df = _render_leaderboard_tab(
        "🏛️ Models",
        1,
        "models",
        ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model"],
        [
            "🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes",
            "🤖 Number of Models", "🏆 Best Model On Open LLM Leaderboard",
            "🥇 Best Rank On Open LLM Leaderboard", "📊 Average Downloads per Model",
            "📈 Average Likes per Model", "🚀 Most Downloaded Model", "📈 Most Download Count",
            "❤ Most Liked Model", "👍 Most Like Count",
        ],
        [
            "str", "markdown", "str", "str", "str", "markdown", "str", "str", "str",
            "markdown", "str", "markdown", "str",
        ],
        True,
    )

    dataset_df = _render_leaderboard_tab(
        "📊 Dataset",
        2,
        "datasets",
        ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset"],
        [
            "🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes",
            "📊 Number of Datasets", "📊 Average Downloads per Dataset",
            "📈 Average Likes per Dataset", "🚀 Most Downloaded Dataset",
            "📈 Most Download Count", "❤ Most Liked Dataset", "👍 Most Like Count",
        ],
        [
            "str", "markdown", "str", "str", "str", "str", "str", "markdown", "str",
            "markdown", "str",
        ],
        False,
    )

    spaces_df = _render_leaderboard_tab(
        "🚀 Spaces",
        3,
        "spaces",
        ["Organization Name", "Most Liked Space"],
        [
            "🔢 Serial Number", "🏢 Organization Name", "👍 Total Likes", "🚀 Number of Spaces",
            "📈 Average Likes per Space", "❤ Most Liked Space", "👍 Most Like Count",
        ],
        ["str", "markdown", "str", "str", "str", "markdown", "str"],
        False,
    )

demo.launch()