File size: 7,732 Bytes
0ca4024 182c1d0 02b09bd 182c1d0 02b09bd 182c1d0 c66e8f9 182c1d0 02b09bd a7a5300 182c1d0 02b09bd 182c1d0 3533641 02b09bd 3003b62 02b09bd 3533641 538d051 3533641 538d051 02b09bd 1e378d7 02b09bd 3533641 538d051 71f1dc0 3533641 538d051 182c1d0 1b8daa0 182c1d0 c66e8f9 1d67108 1e378d7 1b8daa0 e4f3e8d 182c1d0 1b8daa0 e4f3e8d 182c1d0 02b09bd 1b8daa0 02b09bd 1b8daa0 02b09bd 1b8daa0 02b09bd 182c1d0 00ccb92 3533641 6e94de2 054abb2 3533641 c66e8f9 02b09bd 182c1d0 02b09bd 538d051 1b8daa0 02b09bd 362494d 1b8daa0 02b09bd 4653f13 3533641 182c1d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
import html

import gradio as gr
import pandas as pd
from cachetools import TTLCache, cached
from huggingface_hub import list_models
from toolz import groupby
from tqdm.auto import tqdm
@cached(TTLCache(maxsize=10, ttl=60 * 60 * 3))
def get_all_models():
    """Return the Hub models whose download count is greater than 1.

    The listing is requested with card data, no limit, ``sort="downloads"``
    and ``direction=-1``. ``None`` entries are skipped. The result is memoised
    through a ``TTLCache`` created with ``ttl=60 * 60 * 3``.
    """
    listing = list_models(cardData=True, limit=None, sort="downloads", direction=-1)
    # tqdm wraps the iterator so progress is reported while it is drained.
    fetched = list(tqdm(iter(listing)))
    kept = []
    for model in fetched:
        if model is None:
            continue
        if model.downloads > 1:
            kept.append(model)
    return kept
def has_base_model_info(model):
    """Tell whether ``model``'s card data names a base model as a string.

    Returns ``True`` only when ``model.cardData`` is truthy and its
    ``"base_model"`` entry is a non-empty ``str``. Any ``AttributeError``
    raised while reading those values (missing ``cardData`` attribute, card
    data without ``.get``) is treated as "no base model info".
    """
    try:
        card_data = model.cardData
        if not card_data:
            return False
        declared = card_data.get("base_model")
        # Non-string values (for example a list of base models) do not count.
        return bool(declared) and isinstance(declared, str)
    except AttributeError:
        return False
# Partition every fetched model by the True/False result of
# has_base_model_info; the groups are looked up below with .get(True) and
# .get(False).
grouped_by_has_base_model_info = groupby(has_base_model_info, get_all_models())
def produce_summary():
    """Return a three-line text summary of base-model coverage.

    The summary reports how many models do and do not declare base model
    info, and the share of all models (``len(get_all_models())``) that do,
    rounded to two decimals.

    A group that is absent from ``grouped_by_has_base_model_info`` counts as
    zero models, and the percentage is reported as ``0.0`` when there are no
    models at all, so the summary never raises on sparse data.
    """
    with_info = len(grouped_by_has_base_model_info.get(True, []))
    without_info = len(grouped_by_has_base_model_info.get(False, []))
    total = len(get_all_models())
    # Guard the division: an empty model listing must not crash the page.
    percentage = round(with_info / total * 100, 2) if total else 0.0
    return f"""{with_info:,} models have base model info.
{without_info:,} models don't have base model info.
Currently {percentage}% of models have base model info."""
# Models whose card names a single base model. Default to an empty list so the
# statements below still work when no model qualifies.
models_with_base_model_info = grouped_by_has_base_model_info.get(True, [])
base_models = [
    model.cardData.get("base_model") for model in models_with_base_model_info
]
# One row per distinct base model together with how many models reference it.
df = pd.DataFrame(
    pd.DataFrame({"base_model": base_models}).value_counts()
).reset_index()
df_with_org = df.copy(deep=True)
# This list must stay in the same order as ``base_models``: the two are
# combined row-by-row into data frames further down, so sorting it would pair
# each base model with another model's task.
pipeline_tags = [x.pipeline_tag for x in models_with_base_model_info]
# Distinct, non-missing tags, sorted alphabetically for the task dropdown.
unique_pipeline_tags = sorted(
    {x.pipeline_tag for x in models_with_base_model_info if x.pipeline_tag is not None}
)
def parse_org(hub_id):
    """Return the organisation that owns ``hub_id``, or ``None`` for non-Hub ids.

    Args:
        hub_id: A base model identifier taken from a model card.

    Returns:
        * the part before the slash for ``"org/name"`` identifiers;
        * ``"huggingface"`` for identifiers without a namespace
          (``"bert-base-cased"``);
        * ``None`` when the value cannot be a Hub id: an empty string, a
          relative or absolute filesystem path (``"./x"``, ``"../x"``,
          ``"/x"``) or anything with more than one slash.
    """
    parts = hub_id.split("/")
    # Hub ids have at most one slash and never start with "", "." or "..";
    # anything else is a local path that leaked into the model card.
    if not hub_id or len(parts) > 2 or parts[0] in {"", ".", ".."}:
        return None
    if len(parts) == 2:
        return parts[0]
    return "huggingface"
def render_model_hub_link(hub_id):
    """Return an HTML anchor pointing at ``https://huggingface.co/<hub_id>``.

    ``hub_id`` originates from user-written model cards, so it is HTML-escaped
    before being placed in both the ``href`` attribute and the link text.
    Ordinary Hub ids contain no characters that need escaping and render
    exactly as before. Non-string values are converted with ``str`` first.
    """
    safe_id = html.escape(str(hub_id), quote=True)
    link = f"https://huggingface.co/{safe_id}"
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{safe_id}</a>'
# Attach the organisation parsed from each base model id, then discard the
# rows for which no organisation was found.
df_with_org = df_with_org.assign(
    org=df_with_org["base_model"].apply(parse_org)
).dropna(subset=["org"])
# Index the models by the base model their card declares.
grouped_by_base_model = groupby(
    lambda model: model.cardData.get("base_model"),
    models_with_base_model_info,
)
# Every distinct base model, in the row order of ``df``.
all_base_models = df["base_model"].tolist()
def get_grandchildren(base_model):
    """Return the models that declare a child of ``base_model`` as their base.

    Args:
        base_model: Identifier to look up in ``grouped_by_base_model``.

    Returns:
        A flat list with the children of every direct child of
        ``base_model``. The list is empty when ``base_model`` is unknown or
        none of its children has children of its own.
    """
    grandchildren = []
    # .get(..., []) keeps an unknown base model from raising KeyError, which
    # matches how the children of each child are looked up below.
    for child in tqdm(grouped_by_base_model.get(base_model, [])):
        grandchildren.extend(grouped_by_base_model.get(child.modelId, []))
    return grandchildren
def return_models_for_base_model(base_model):
    """Build the Markdown report for the models fine-tuned from ``base_model``.

    Args:
        base_model: Identifier selected in the UI. May be ``None`` or empty
            when the selection is cleared.

    Returns:
        A Markdown string with child and grandchild counts, their download
        totals, and one bullet per child sorted by downloads (highest first).
        An empty string is returned when nothing is selected, and a short
        notice when no model declares ``base_model`` as its base.
    """
    if not base_model:
        # Nothing selected: clear the report instead of failing on sorted(None).
        return ""
    models = grouped_by_base_model.get(base_model)
    if not models:
        return f"No models on the Hub declare `{base_model}` as their base model.\n\n"

    # Most downloaded children first.
    models = sorted(models, key=lambda x: x.downloads, reverse=True)
    total_download_number = sum(model.downloads for model in models)
    grandchildren = get_grandchildren(base_model)
    number_of_grandchildren = len(grandchildren)
    grandchildren_download_count = sum(model.downloads for model in grandchildren)

    # Collect the pieces and join once rather than growing a string with +=.
    sections = [
        "## Models fine-tuned from"
        f" [`{base_model}`](https://huggingface.co/{base_model}) \n\n",
        f"`{base_model}` has {len(models)} children\n\n",
        f"`{base_model}`'s children have been"
        f" downloaded {total_download_number:,} times\n\n",
        f"`{base_model}` has {number_of_grandchildren} grandchildren\n\n",
        f"`{base_model}`'s grandchildren have been"
        f" downloaded {grandchildren_download_count:,} times\n\n",
        f"Including grandchildren, `{base_model}` has {number_of_grandchildren + len(models):,} descendants\n\n",
        f"Including grandchildren, `{base_model}`'s descendants have been downloaded {grandchildren_download_count + total_download_number:,} times\n\n",
        "### Children models \n\n",
    ]
    sections.extend(
        f"- [{model.modelId}](https://huggingface.co/{model.modelId})"
        f" | number of downloads {model.downloads:,}\n\n"
        for model in models
    )
    return "".join(sections)
def return_base_model_popularity(pipeline=None):
    """Rank base models by how many models are fine-tuned from them.

    Args:
        pipeline: Optional task tag. When given, only models carrying that
            tag are counted; when ``None`` every model is counted.

    Returns:
        A data frame with the columns ``base_model`` (an HTML link) and
        ``count``, limited to the 50 most referenced base models.
    """
    frame = pd.DataFrame({"base_model": base_models, "pipeline": pipeline_tags})
    if pipeline is not None:
        frame = frame[frame["pipeline"] == pipeline]
    # Count per base model only. Counting (base_model, pipeline) pairs would
    # list one base model several times in the unfiltered ranking and drop
    # models that have no task tag.
    ranking = frame[["base_model"]].value_counts().reset_index(name="count").head(50)
    # assign() returns a new frame, so no column is written onto a slice, and
    # links are rendered for the 50 visible rows only.
    ranking = ranking.assign(
        base_model=ranking["base_model"].apply(render_model_hub_link)
    )
    return ranking[["base_model", "count"]]
def return_base_model_popularity_by_org(pipeline=None):
    """Rank organisations by how many models are fine-tuned from their models.

    Args:
        pipeline: Optional task tag. When given, only models carrying that
            tag are counted; when ``None`` every model is counted.

    Returns:
        A data frame with the columns ``org`` (an HTML link to the
        organisation) and ``count``, limited to the 50 highest counts.
    """
    frame = pd.DataFrame({"base_model": base_models, "pipeline": pipeline_tags})
    if pipeline is not None:
        frame = frame[frame["pipeline"] == pipeline]
    # Drop missing organisations before rendering links: once rendered, a
    # missing value becomes the text "None" and can no longer be dropped.
    orgs = frame["base_model"].apply(parse_org).dropna()
    ranking = (
        pd.DataFrame({"org": orgs})
        .value_counts()
        .reset_index(name="count")
        .head(50)
    )
    return ranking.assign(org=ranking["org"].apply(render_model_hub_link))
# Build the page. Components are created in the order they should appear, so
# the statement order inside this block is significant.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Base model explorer: explore the lineage of models on the 🤗 Hub"
    )
    gr.Markdown(
        """When sharing models to the Hub, it is possible to [specify a base model in the model card](https://huggingface.co/docs/hub/model-cards#specifying-a-base-model), i.e. that your model is a fine-tuned version of [bert-base-cased](https://huggingface.co/bert-base-cased).
This Space allows you to find children's models for a given base model and view the popularity of models for fine-tuning.
You can also optionally filter by the task to see rankings for a particular machine learning task.
Don't forget to ❤ if you like this space 🤗"""
    )
    # Coverage statistics, computed once while the page is being built.
    gr.Markdown(produce_summary())
    gr.Markdown("## Find all models trained from a base model")
    base_model = gr.Dropdown(all_base_models, label="Base Model")
    results = gr.Markdown()
    # Rebuild the report whenever the base model selection changes.
    base_model.change(return_models_for_base_model, base_model, results)
    gr.Markdown("## Base model rankings ")
    # One task filter drives both ranking tables below.
    dropdown = gr.Dropdown(
        choices=unique_pipeline_tags,
        value=None,
        label="Filter rankings by task pipeline",
    )
    with gr.Accordion("Base model popularity ranking", open=False):
        # Initially filled with the unfiltered ranking.
        df_popularity = gr.DataFrame(
            return_base_model_popularity(None), datatype="markdown"
        )
        dropdown.change(return_base_model_popularity, dropdown, df_popularity)
    with gr.Accordion("Base model popularity ranking by organization", open=False):
        # Initially filled with the unfiltered ranking.
        df_popularity_org = gr.DataFrame(
            return_base_model_popularity_by_org(None), datatype="markdown"
        )
        dropdown.change(
            return_base_model_popularity_by_org, dropdown, df_popularity_org
        )

# NOTE(review): the original indentation was lost, so the nesting above and the
# top-level placement of launch() follow the usual Gradio layout — confirm.
demo.launch()
|