import gradio as gr
import pandas as pd
from cachetools import TTLCache, cached
from huggingface_hub import list_models
from toolz import groupby
from tqdm.auto import tqdm


@cached(TTLCache(maxsize=10, ttl=60 * 60 * 3))
def get_all_models():
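    """Return all Hub models with more than one download, sorted by downloads.

    The full listing is slow, so results are cached for three hours via TTLCache.
    """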
    models = list(
        tqdm(list_models(cardData=True, limit=None, sort="downloads", direction=-1))
    )
    models = [model for model in models if model is not None]
    return [
        model for model in models if model.downloads > 1
    ]  # keep only models with more than one download


def has_base_model_info(model):
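    """Return True if the model's card data declares a base model as a single string."""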
    try:
        if card_data := model.cardData:
            if base_model := card_data.get("base_model"):
                if isinstance(base_model, str):
                    return True
    except AttributeError:
        return False
    return False


grouped_by_has_base_model_info = groupby(has_base_model_info, get_all_models())
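# True -> models whose card declares a base model, False -> models that don't.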


def produce_summary():
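    """Summarise how many Hub models do (and don't) declare a base model."""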
    return f"""{len(grouped_by_has_base_model_info.get(True)):,} models have base model info. 
            {len(grouped_by_has_base_model_info.get(False)):,} models don't have base model info.
            Currently {round(len(grouped_by_has_base_model_info.get(True))/len(get_all_models())*100,2)}% of models have base model info."""


models_with_base_model_info = grouped_by_has_base_model_info.get(True)
base_models = [
    model.cardData.get("base_model") for model in models_with_base_model_info
]
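# Count how often each base model is referenced as the base of another model.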
df = pd.DataFrame(
    pd.DataFrame({"base_model": base_models}).value_counts()
).reset_index()
df_with_org = df.copy(deep=True)
# Keep pipeline tags in the same order as `base_models` so the two lists stay
# aligned when they are combined into DataFrames below.
pipeline_tags = [x.pipeline_tag for x in models_with_base_model_info]
# Sorted, de-duplicated tags for the task-filter dropdown.
unique_pipeline_tags = sorted(
    {x.pipeline_tag for x in models_with_base_model_info if x.pipeline_tag is not None}
)


def parse_org(hub_id):
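    """Return the org/user part of a Hub id.

    Un-namespaced ids are attributed to "huggingface"; ids under "." return None.
    """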
    parts = hub_id.split("/")
    if len(parts) == 2:
        return parts[0] if parts[0] != "." else None
    else:
        return "huggingface"


def render_model_hub_link(hub_id):
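    """Render a Hub id as an HTML link to its page on the Hub."""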
    link = f"https://huggingface.co/{hub_id}"
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{hub_id}</a>'


df_with_org["org"] = df_with_org["base_model"].apply(parse_org)
df_with_org = df_with_org.dropna(subset=["org"])

grouped_by_base_model = groupby(
    lambda x: x.cardData.get("base_model"), models_with_base_model_info
)
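# Maps each base model id to the models fine-tuned directly from it.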

all_base_models = df["base_model"].to_list()


def get_grandchildren(base_model):
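    """Return models whose declared base model is itself a child of `base_model`."""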
    grandchildren = []
    for model in tqdm(grouped_by_base_model[base_model]):
        model_id = model.modelId
        grandchildren.extend(grouped_by_base_model.get(model_id, []))
    return grandchildren


def return_models_for_base_model(base_model):
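    """Build a markdown report of a base model's children and grandchildren, with download counts."""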
    models = grouped_by_base_model.get(base_model)
    # sort models by downloads
    models = sorted(models, key=lambda x: x.downloads, reverse=True)
    results = ""
    results += (
        "## Models fine-tuned from"
        f" [`{base_model}`](https://huggingface.co/{base_model}) \n\n"
    )
    results += f"`{base_model}` has {len(models)} children\n\n"
    total_download_number = sum(model.downloads for model in models)
    results += (
        f"`{base_model}`'s children have been"
        f" downloaded {total_download_number:,} times\n\n"
    )
    grandchildren = get_grandchildren(base_model)
    number_of_grandchildren = len(grandchildren)
    results += f"`{base_model}` has {number_of_grandchildren} grandchildren\n\n"
    grandchildren_download_count = sum(model.downloads for model in grandchildren)
    results += (
        f"`{base_model}`'s grandchildren have been"
        f" downloaded {grandchildren_download_count:,} times\n\n"
    )
    results += f"Including grandchildren, `{base_model}` has {number_of_grandchildren + len(models):,} descendants\n\n"
    results += f"Including grandchildren, `{base_model}`'s descendants have been downloaded {grandchildren_download_count + total_download_number:,} times\n\n"
    results += "### Children models \n\n"
    for model in models:
        url = f"https://huggingface.co/{model.modelId}"
        results += (
            f"- [{model.modelId}]({url}) | number of downloads {model.downloads:,}"
            + "\n\n"
        )
    return results


def return_base_model_popularity(pipeline=None):
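    """Return the top 50 base models by number of fine-tuned children, optionally filtered by task."""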
    df_with_pipeline_info = (
        pd.DataFrame({"base_model": base_models, "pipeline": pipeline_tags})
        .value_counts()
        .reset_index()
    )

    if pipeline is not None:
        df_with_pipeline_info = df_with_pipeline_info[
            df_with_pipeline_info["pipeline"] == pipeline
        ]
    keep_columns = ["base_model", "count"]
    df_with_pipeline_info["base_model"] = df_with_pipeline_info["base_model"].apply(
        render_model_hub_link
    )
    return df_with_pipeline_info[keep_columns].head(50)


def return_base_model_popularity_by_org(pipeline=None):
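    """Return the top 50 orgs by how often their models are used as base models, optionally filtered by task."""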
    df_with_pipeline_info = pd.DataFrame(
        {"base_model": base_models, "pipeline": pipeline_tags}
    )
    df_with_pipeline_info["org"] = df_with_pipeline_info["base_model"].apply(parse_org)
    df_with_pipeline_info["org"] = df_with_pipeline_info["org"].apply(
        render_model_hub_link
    )
    df_with_pipeline_info = df_with_pipeline_info.dropna(subset=["org"])
    df_with_org = df_with_pipeline_info.copy(deep=True)
    if pipeline is not None:
        df_with_org = df_with_pipeline_info[df_with_org["pipeline"] == pipeline]
    df_with_org = df_with_org.drop(columns=["pipeline"])
    df_with_org = pd.DataFrame(df_with_org.value_counts())
    return pd.DataFrame(
        df_with_org.groupby("org")["count"]
        .sum()
        .sort_values(ascending=False)
        .reset_index()
        .head(50)
    )


with gr.Blocks() as demo:
    gr.Markdown(
        "# Base model explorer: explore the lineage of models on the &#129303; Hub"
    )
    gr.Markdown(
        """When sharing models to the Hub, you can [specify a base model in the model card](https://huggingface.co/docs/hub/model-cards#specifying-a-base-model), i.e. declare that your model is a fine-tuned version of [bert-base-cased](https://huggingface.co/bert-base-cased).
        This Space lets you find the child models of a given base model and see which base models are most popular for fine-tuning.
        You can also filter by task to see the rankings for a particular machine learning task.
        Don't forget to &#10084; this Space if you like it &#129303;"""
    )

    gr.Markdown(produce_summary())
    gr.Markdown("## Find all models trained from a base model")
    base_model = gr.Dropdown(all_base_models, label="Base Model")
    results = gr.Markdown()
    base_model.change(return_models_for_base_model, base_model, results)
    gr.Markdown("## Base model rankings ")
    dropdown = gr.Dropdown(
        choices=unique_pipeline_tags,
        value=None,
        label="Filter rankings by task pipeline",
    )
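    # Changing the task filter refreshes both ranking tables below.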
    with gr.Accordion("Base model popularity ranking", open=False):
        df_popularity = gr.DataFrame(
            return_base_model_popularity(None), datatype="markdown"
        )
        dropdown.change(return_base_model_popularity, dropdown, df_popularity)
    with gr.Accordion("Base model popularity ranking by organization", open=False):
        df_popularity_org = gr.DataFrame(
            return_base_model_popularity_by_org(None), datatype="markdown"
        )
        dropdown.change(
            return_base_model_popularity_by_org, dropdown, df_popularity_org
        )


demo.launch()