|
import re |
|
import json |
|
import requests |
|
import pandas as pd |
|
from tqdm import tqdm |
|
from bs4 import BeautifulSoup |
|
from huggingface_hub import HfApi, list_models, list_datasets, list_spaces |
|
import gradio as gr |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
import datetime |
|
|
|
|
|
# Shared Hugging Face Hub client; get_models() and restart_space() call it.
api = HfApi()
|
|
|
|
|
def get_most(df_for_most_function):
    """Return the most-downloaded and the most-liked row of a repository table.

    Args:
        df_for_most_function: DataFrame with ``id``, ``downloads`` and
            ``likes`` columns (numeric counts), one row per repository.

    Returns:
        dict: ``{"Most Download": {...}, "Most Likes": {...}}`` where each
        inner dict carries the ``id``, ``downloads`` and ``likes`` of the
        winning row. When several rows share the maximum, the one that
        appears first in the frame wins.

    Raises:
        IndexError: If the frame has no rows.
    """
    if len(df_for_most_function) == 0:
        raise IndexError("get_most() needs at least one row")

    def _top_row_summary(column):
        # A single positional argmax replaces sorting the whole frame; being
        # positional, it is unaffected by duplicate or non-default index labels.
        winner = df_for_most_function.iloc[df_for_most_function[column].argmax()]
        return {
            "id": winner["id"],
            "downloads": winner["downloads"],
            "likes": winner["likes"],
        }

    return {
        "Most Download": _top_row_summary("downloads"),
        "Most Likes": _top_row_summary("likes"),
    }
|
|
|
def get_sum(df_for_sum_function):
    """Total the ``downloads`` and ``likes`` columns of a repository table.

    Args:
        df_for_sum_function: DataFrame with ``downloads`` and ``likes``
            columns.

    Returns:
        dict: ``{"Downloads": int, "Likes": int}``. Missing values (NaN) are
        skipped rather than propagated, and an empty frame totals to 0.
    """
    # Series.sum() skips NaN by default; int() turns the numpy scalar into a
    # plain Python int so callers can divide and convert it safely.
    total_downloads = int(df_for_sum_function["downloads"].sum())
    total_likes = int(df_for_sum_function["likes"].sum())

    return {"Downloads": total_downloads, "Likes": total_likes}
|
|
|
def get_openllm_leaderboard():
    """Scrape the Open LLM Leaderboard Space and return its table's last column.

    The page embeds its Gradio configuration as JSON inside the second
    ``<script>`` element; the table is read from the component at a fixed
    position in that configuration.

    Returns:
        list: The last cell of every table row, in table order (presumably
        the plain model name, best-ranked first -- verify against the live
        page). An empty list is returned when the request, the parsing, or
        the lookup fails; the error is printed, never raised.
    """
    url = 'https://huggingfaceh4-open-llm-leaderboard.hf.space/'
    component_index = 19  # position of the table inside data['components']

    try:
        # Without a timeout a stalled connection would block start-up forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        script_elements = soup.find_all('script')
        # The slice drops the 31 leading and 10 trailing characters that wrap
        # the JSON payload in the rendered <script> tag.
        data = json.loads(str(script_elements[1])[31:-10])

        rows = data['components'][component_index]['props']['value']['data']

        result_list = []
        for row in rows:
            try:
                result_list.append(row[-1])
            except IndexError:
                # An empty row ends the listing; keep what was collected.
                break
        return result_list
    except Exception as e:
        print("Error on open llm leaderboard: ", e)
        return []
|
|
|
|
|
def get_ranking(model_list, target_org):
    """Find the best-placed model of an organization in a ranked model list.

    Args:
        model_list: Model ids of the form ``"owner/name"``, best first. An
            empty list or ``None`` means the leaderboard could not be loaded.
        target_org: Organization name; compared case-insensitively with the
            part of each id before the first ``/``.

    Returns:
        ``[rank, model_id]`` (rank is 1-based) for the first match,
        ``"Not Found"`` when the organization has no entry, or
        ``"Error on Leaderboard"`` when ``model_list`` is empty or ``None``.
    """
    # ``not model_list`` also covers None, which would otherwise crash in
    # enumerate().
    if not model_list:
        return "Error on Leaderboard"

    wanted = target_org.lower()
    for index, model in enumerate(model_list):
        if model.split("/")[0].lower() == wanted:
            return [index + 1, model]
    return "Not Found"
|
|
|
|
|
def get_models(which_one):
    """List every namespaced repository of one kind on the Hugging Face Hub.

    Args:
        which_one: ``"models"``, ``"datasets"`` or ``"spaces"``.

    Returns:
        list[dict]: One ``{"author", "id", "downloads", "likes"}`` dict per
        repository whose id contains a ``/``. ``author`` is the part of the
        id before the first ``/``. For spaces ``downloads`` is always 0.

    Raises:
        ValueError: If ``which_one`` is not one of the three supported kinds.
    """
    listers = {
        "models": api.list_models,
        "datasets": api.list_datasets,
        "spaces": api.list_spaces,
    }
    if which_one not in listers:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            f"which_one must be 'models', 'datasets' or 'spaces', got {which_one!r}"
        )
    data = listers[which_one]()

    all_list = []
    for item in tqdm(data, desc=f"Scraping {which_one}", position=0, leave=True):
        info = item.__dict__

        repo_id = info["id"]
        author, separator, _ = repo_id.partition("/")
        if not separator:
            # Ids without a namespace have no author to attribute them to.
            continue

        # The downloads field is only read for models and datasets.
        downloads = 0 if which_one == "spaces" else info['downloads']
        all_list.append({
            "author": author,
            "id": repo_id,
            "downloads": downloads,
            "likes": info['likes'],
        })
    return all_list
|
|
|
|
|
def search(models_dict, author_name):
    """Return the entries stored under ``author_name`` as a DataFrame.

    An author that is not a key of ``models_dict`` yields an empty frame.
    """
    if author_name in models_dict:
        records = models_dict[author_name]
    else:
        records = []
    return pd.DataFrame(records)
|
|
|
|
|
def group_models_by_author(all_things):
    """Bucket repository dicts by their ``author`` key.

    Returns a plain dict mapping each author to the list of its entries, in
    the order they were encountered.
    """
    grouped = {}
    for entry in all_things:
        grouped.setdefault(entry['author'], []).append(entry)
    return grouped
|
|
|
|
|
def make_leaderboard(orgs, which_one, data):
    """Build the per-organization leaderboard table for one repository kind.

    Args:
        orgs: Iterable of organization names to include.
        which_one: ``"models"``, ``"datasets"`` or ``"spaces"``; selects the
            set of columns and the column the table is sorted by.
        data: Mapping of author name to that author's repository dicts, as
            produced by ``group_models_by_author``.

    Returns:
        pandas.DataFrame: One row per organization that owns at least one
        repository, sorted by "Total Downloads" (by "Total Likes" for spaces)
        in descending order, with a leading 1-based "Serial Number" column.
        The column order is significant: callers label columns by position.
    """
    data_rows = []
    open_llm_leaderboard = get_openllm_leaderboard() if which_one == "models" else None

    trend = get_trending_list(1, which_one)

    for org in tqdm(orgs, desc=f"Proccesing Organizations ({which_one})", position=0, leave=True):
        df = search(data, org)
        num_things = len(df)
        if num_things == 0:
            # Organizations without any repository of this kind are left out.
            continue

        rank = get_ranking_trend(trend, org)
        sum_info = get_sum(df)
        most_info = get_most(df)

        if which_one == "models":
            ranking = get_ranking(open_llm_leaderboard, org)
            if isinstance(ranking, list):
                best_rank, best_model = ranking
            else:
                # get_ranking returns a status string ("Not Found" or
                # "Error on Leaderboard") instead of [rank, model]; indexing
                # that string would put single characters in the table.
                best_model, best_rank = "Not Found", ranking

            data_rows.append({
                "Organization Name": org,
                "Total Downloads": sum_info["Downloads"],
                "Total Likes": sum_info["Likes"],
                "Number of Models": num_things,
                "Best Model On Open LLM Leaderboard": best_model,
                "Best Rank On Open LLM Leaderboard": best_rank,
                "Average Downloads per Model": int(sum_info["Downloads"] / num_things),
                "Average Likes per Model": int(sum_info["Likes"] / num_things),
                "Most Downloaded Model": most_info["Most Download"]["id"],
                "Most Download Count": most_info["Most Download"]["downloads"],
                "Most Liked Model": most_info["Most Likes"]["id"],
                "Most Like Count": most_info["Most Likes"]["likes"],
                "Trending Model": rank['id'],
                "Best Rank at Trending Models": rank['rank'],
            })
        elif which_one == "datasets":
            data_rows.append({
                "Organization Name": org,
                "Total Downloads": sum_info["Downloads"],
                "Total Likes": sum_info["Likes"],
                "Number of Datasets": num_things,
                "Average Downloads per Dataset": int(sum_info["Downloads"] / num_things),
                "Average Likes per Dataset": int(sum_info["Likes"] / num_things),
                "Most Downloaded Dataset": most_info["Most Download"]["id"],
                "Most Download Count": most_info["Most Download"]["downloads"],
                "Most Liked Dataset": most_info["Most Likes"]["id"],
                "Most Like Count": most_info["Most Likes"]["likes"],
                "Trending Dataset": rank['id'],
                "Best Rank at Trending Datasets": rank['rank'],
            })
        elif which_one == "spaces":
            # Spaces have no download counts, so those columns are omitted.
            data_rows.append({
                "Organization Name": org,
                "Total Likes": sum_info["Likes"],
                "Number of Spaces": num_things,
                "Average Likes per Space": int(sum_info["Likes"] / num_things),
                "Most Liked Space": most_info["Most Likes"]["id"],
                "Most Like Count": most_info["Most Likes"]["likes"],
                "Trending Space": rank['id'],
                "Best Rank at Trending Spaces": rank['rank'],
            })

    leaderboard = pd.DataFrame(data_rows)
    temp = ["Total Downloads"] if which_one != "spaces" else ["Total Likes"]

    leaderboard = leaderboard.sort_values(by=temp, ascending=False)
    leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
    return leaderboard
|
|
|
|
|
def clickable(x, which_one):
    """Wrap a Hub identifier in an HTML link to its huggingface.co page.

    ``"Not Found"`` is passed through untouched. For ``which_one == "models"``
    the link targets ``huggingface.co/<x>``; for any other kind it targets
    ``huggingface.co/<which_one>/<x>``.
    """
    if x == "Not Found":
        return "Not Found"

    if which_one == "models":
        return f'<a target="_blank" href="https://huggingface.co/{x}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{x}</a>'

    return f'<a target="_blank" href="https://huggingface.co/{which_one}/{x}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{x}</a>'
|
|
|
def models_df_to_clickable(df, columns, which_one):
    """Replace the values of the given columns with HTML links, in place.

    The "Organization Name" column is always linked as a "models"-style
    (top-level) page; every other column is linked according to
    ``which_one``. The same, mutated frame is returned.
    """
    for column in columns:
        link_kind = "models" if column == "Organization Name" else which_one
        df[column] = df[column].apply(lambda x, kind=link_kind: clickable(x, kind))
    return df
|
|
|
|
|
def get_trending_list(pages, which_one):
    """Fetch the first ``pages`` pages of a huggingface.co listing.

    Args:
        pages: Number of pages to request, starting at page 0.
        which_one: ``"models"``, ``"datasets"`` or ``"spaces"``; used both in
            the URL and as the key of the result list in the JSON response.

    Returns:
        list[dict]: Entries in response order. Each has ``id`` and ``likes``;
        models and datasets additionally carry ``downloads``.

    Raises:
        requests.HTTPError: If the site answers with an error status.
        requests.Timeout: If the site does not answer within the timeout.
    """
    trending_list = []
    for page in range(pages):
        # A timeout keeps a stalled connection from blocking start-up; the
        # status check surfaces HTTP errors instead of a confusing JSON error.
        response = requests.get(
            f"https://huggingface.co/{which_one}-json?p={page}", timeout=30
        )
        response.raise_for_status()
        json_data = response.json()

        for thing in json_data[which_one]:
            if which_one != "spaces":
                entry = {"id": thing["id"], "downloads": thing["downloads"], "likes": thing["likes"]}
            else:
                # No downloads field is read for spaces.
                entry = {"id": thing["id"], "likes": thing["likes"]}
            trending_list.append(entry)

    return trending_list
|
|
|
def get_ranking_trend(json_data, org_name):
    """Locate an organization's first entry in a trending list.

    ``json_data`` is a list of dicts with an ``id`` of the form
    ``"owner/name"``. The owner part is compared to ``org_name`` exactly
    (case-sensitive). Returns ``{"id": ..., "rank": ...}`` with a 1-based
    rank, or "Not Found" for both values when there is no match.
    """
    owners = [entry['id'].split("/")[0] for entry in json_data]
    repo_ids = [entry['id'] for entry in json_data]

    try:
        position = owners.index(org_name)
    except ValueError:
        return {"id": "Not Found", "rank": "Not Found"}
    return {"id": repo_ids[position], "rank": position + 1}
|
|
|
def restart_space():
    """Restart the "TFLai/organization-leaderboard" Space through the Hub API.

    The access token is taken from the ``HF_TOKEN`` environment variable.
    When the variable is unset, ``None`` is passed and the Hub client decides
    which credentials to use (presumably a locally stored token -- confirm
    against the huggingface_hub documentation).
    """
    # Local import: the module does not import ``os`` at the top.
    import os

    # No HF_TOKEN name is defined at module level, so referencing one here
    # would raise NameError every time the scheduled job fires.
    api.restart_space(
        repo_id="TFLai/organization-leaderboard",
        token=os.environ.get("HF_TOKEN"),
    )
|
|
|
|
|
# Organization names to rank, one per line of org_names.txt.
with open("org_names.txt", "r", encoding="utf-8") as f:
    # strip() also removes a trailing "\r" from CRLF files, and blank lines are
    # skipped so they do not turn into an empty organization name.
    org_names_in_list = [line.strip() for line in f if line.strip()]
|
|
|
# Start-up time shown on the page. Stored under its own name: rebinding
# ``datetime`` would hide the imported module from everything that follows.
LAST_UPDATED = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")

# Markdown shown at the top of the page.
INTRODUCTION_TEXT = f"""
🎯 The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).

## Available Dataframes:

- 🏛️ Models

- 📊 Datasets

- 🚀 Spaces

## Backend

🛠️ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).

🛠️ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co/organizations).

**🌐 Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.

**🌐 Note:** In trending models, first 300 models/datasets/spaces is being retrieved from huggingface.

## Last Update

⌛ This space is last updated in **{LAST_UPDATED}**.
"""
|
|
|
# Build the page: a title, the introduction, and one tab per repository kind.
with gr.Blocks() as demo:
    gr.Markdown("""<h1 align="center" id="space-title">🤗 Organization Leaderboard</h1>""")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # The Hub listings are fetched once, while the page is being built.
    all_models = get_models("models")
    all_datasets = get_models("datasets")
    all_spaces = get_models("spaces")

    with gr.TabItem("🏛️ Models", id=1):
        columns_to_convert = ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model", "Trending Model"]
        models_df = make_leaderboard(org_names_in_list, "models", group_models_by_author(all_models))
        models_df = models_df_to_clickable(models_df, columns_to_convert, "models")

        # Headers and datatypes are positional and must follow the column
        # order produced by make_leaderboard.
        headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "🤖 Number of Models", "🏆 Best Model On Open LLM Leaderboard", "🥇 Best Rank On Open LLM Leaderboard", "📊 Average Downloads per Model", "📈 Average Likes per Model", "🚀 Most Downloaded Model", "📈 Most Download Count", "❤️ Most Liked Model", "👍 Most Like Count", "🔥 Trending Model", "👑 Best Rank at Trending Models"]
        gr.Dataframe(models_df.head(400), headers=headers, interactive=True, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str", "markdown", "str"])

    with gr.TabItem("📊 Datasets", id=2):
        columns_to_convert = ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset", "Trending Dataset"]
        dataset_df = make_leaderboard(org_names_in_list, "datasets", group_models_by_author(all_datasets))
        dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")

        headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "📊 Number of Datasets", "📊 Average Downloads per Dataset", "📈 Average Likes per Dataset", "🚀 Most Downloaded Dataset", "📈 Most Download Count", "❤️ Most Liked Dataset", "👍 Most Like Count", "🔥 Trending Dataset", "👑 Best Rank at Trending Datasets"]
        gr.Dataframe(dataset_df.head(250), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "str", "str", "markdown", "str", "markdown", "str", "markdown", "str"])

    with gr.TabItem("🚀 Spaces", id=3):
        columns_to_convert = ["Organization Name", "Most Liked Space", "Trending Space"]

        spaces_df = make_leaderboard(org_names_in_list, "spaces", group_models_by_author(all_spaces))
        spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")

        headers = ["🔢 Serial Number", "🏢 Organization Name", "👍 Total Likes", "🚀 Number of Spaces", "📈 Average Likes per Space", "❤️ Most Liked Space", "👍 Most Like Count", "🔥 Trending Space", "👑 Best Rank at Trending Spaces"]
        gr.Dataframe(spaces_df.head(200), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"])


# Restart the Space every 21600 seconds (6 hours); the tables above are only
# computed while the page is being built, so a restart is what refreshes them.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=21600)
# A BackgroundScheduler runs no jobs until it is started.
scheduler.start()
demo.launch()