File size: 17,794 Bytes
ad7917a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c223932
ad7917a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4e77f2
ad7917a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a22ef66
ad7917a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89233f9
ad7917a
 
 
 
 
 
 
 
bd97257
ad7917a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
947b7c2
ad7917a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3701087
ad7917a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4e77f2
f3bd1e0
e4d2970
 
 
 
 
 
ad7917a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1935ca
ad7917a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd97257
 
 
 
 
 
 
ad7917a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
# TODO:
# - Remove the duplication in the code used to generate markdown.
# - Periodically re-check tracked models to confirm they are all still valid and public.

import os
import re
import sys
from functools import lru_cache
from pathlib import Path
from typing import Dict, List, Set, Union

import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from cachetools import TTLCache, cached
from dotenv import load_dotenv
from huggingface_hub import (
    HfApi,
    comment_discussion,
    create_discussion,
    dataset_info,
    get_repo_discussions,
)
from huggingface_hub.utils import HFValidationError, RepositoryNotFoundError
from sqlitedict import SqliteDict
from toolz import concat, count, unique
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map

# True when the interpreter runs on macOS ("darwin"); the rest of the module
# treats that as the local-development switch.
local = bool(sys.platform.startswith("darwin"))
# NOTE(review): cache_location is not referenced again in the visible code.
cache_location = "cache/" if local else "/data/cache"

# Directory holding the SQLite files opened via f"{save_dir}/..." below.
save_dir = "test_data" if local else "/data/"
Path(save_dir).mkdir(parents=True, exist_ok=True)
# Load variables from a .env file (if any) before reading the environment.
load_dotenv()
user_agent = os.getenv("USER_AGENT")
HF_TOKEN = os.getenv("HF_TOKEN")
REPO = "librarian-bots/dataset-to-model-monitor"  # where issues land
AUTHOR = "librarian-bot"  # who makes the issues
hf_api = HfApi(user_agent=user_agent)

# Cache shared by every function decorated with @cached(cache=ten_min_cache);
# entries expire after 600 seconds.
# NOTE(review): the functions sharing this cache each take a single string
# argument and no custom key is supplied, so they presumably share one key
# space (a username equal to a dataset ID would collide) -- confirm against
# the cachetools `cached` documentation.
ten_min_cache = TTLCache(maxsize=5_000, ttl=600)


@cached(cache=ten_min_cache)
def get_datasets_for_user(username: str) -> List[str]:
    """Return the IDs of the Hub datasets authored by ``username``.

    The IDs are materialised into a list before returning because the result
    is stored in ``ten_min_cache``: a cached generator would be exhausted by
    its first consumer and every later cache hit would see no datasets.

    Args:
        username: Hub user or organisation name passed as ``author``.

    Returns:
        A list with the ``id`` of every dataset returned by the Hub API.
    """
    return [dataset.id for dataset in hf_api.list_datasets(author=username)]


@cached(cache=ten_min_cache)
def get_models_for_dataset(dataset_id):
    """Look up the models whose Hub metadata lists ``dataset_id``.

    Returns:
        A single-entry dict ``{dataset_id: model_ids}``. The model IDs are
        de-duplicated through a set, so their order is not guaranteed; the
        list is empty when the Hub returns no models.
    """
    hub_models = list(hf_api.list_models(filter=f"dataset:{dataset_id}"))
    if not hub_models:
        return {dataset_id: hub_models}
    distinct_ids = {hub_model.id for hub_model in hub_models}
    return {dataset_id: list(distinct_ids)}


def generate_dataset_model_map(
    dataset_ids: List[str],
) -> Dict[str, List[str]]:
    """Build a ``{dataset_id: [model_id, ...]}`` map for ``dataset_ids``.

    Each dataset is looked up with ``get_models_for_dataset`` through
    ``thread_map``; the single-entry dicts it returns are merged into one
    flat mapping (later entries win if a dataset ID is repeated).

    Args:
        dataset_ids: Hub dataset IDs to look up.

    Returns:
        One flat dict keyed by dataset ID whose values are lists of model IDs.
    """
    merged: Dict[str, List[str]] = {}
    for single_dataset_map in thread_map(get_models_for_dataset, dataset_ids):
        merged.update(single_dataset_map)
    return merged


def maybe_update_datasets_to_model_map(dataset_id):
    """Start recording models for ``dataset_id`` unless it is already stored.

    Returns:
        ``False`` when the dataset already has an entry in the models
        database; otherwise the number of datasets stored after adding it.
    """
    with SqliteDict(f"{save_dir}/models_to_dataset.sqlite") as models_db:
        if dataset_id in models_db:
            return False
        fetched = get_models_for_dataset(dataset_id)
        models_db[dataset_id] = list(fetched[dataset_id])
        models_db.commit()
        return len(models_db)


def datasets_tracked_by_user(username):
    """Return the dataset IDs whose stored user list contains ``username``."""
    db_path = f"{save_dir}/tracked_dataset_to_users.sqlite"
    tracked = []
    with SqliteDict(db_path) as dataset_users_db:
        for dataset_name, subscribers in dataset_users_db.items():
            if username in subscribers:
                tracked.append(dataset_name)
    return tracked


def update_tracked_dataset_to_users(dataset_id: str, username: str):
    """Record that ``username`` tracks ``dataset_id``.

    Returns:
        The list of dataset IDs tracked by ``username`` after the update.
    """
    with SqliteDict(
        f"{save_dir}/tracked_dataset_to_users.sqlite",
    ) as dataset_users_db:
        if dataset_id not in dataset_users_db:
            # First subscriber for this dataset.
            dataset_users_db[dataset_id] = [username]
            dataset_users_db.commit()
        else:
            subscribers = dataset_users_db[dataset_id]
            if username not in subscribers:
                subscribers.append(username)
                # Stored through a set so the list never holds duplicates.
                dataset_users_db[dataset_id] = list(set(subscribers))
                dataset_users_db.commit()
    return datasets_tracked_by_user(username)


# Matches "<name>/*", where <name> consists of ASCII letters, digits, "_" or "-".
HUB_ORG_OR_USERNAME_GLOB_PATTERN = re.compile(r"^([a-zA-Z0-9_-]+)\/\*$")


@lru_cache(maxsize=128)
def match_org_user_glob_pattern(hub_id):
    """Extract the user/org name from a ``name/*`` wildcard entry.

    Returns:
        The part before ``/*`` when ``hub_id`` has the wildcard form,
        otherwise ``None``.
    """
    found = HUB_ORG_OR_USERNAME_GLOB_PATTERN.match(hub_id)
    return found.group(1) if found else None


@cached(cache=TTLCache(maxsize=100, ttl=60))
def grab_dataset_ids_for_user_or_org(hub_id: str) -> List[str]:
    """List the IDs of the datasets authored by ``hub_id`` whose ``private`` flag is ``False``."""
    public_ids = []
    for dataset in hf_api.list_datasets(author=hub_id):
        # Identity check on purpose: only an explicit False counts as public.
        if dataset.private is False:
            public_ids.append(dataset.id)
    return public_ids


@cached(cache=TTLCache(maxsize=100, ttl=60))
def parse_hub_id_entry(
    hub_id: str,
) -> tuple[Union[str, List[str]], Union[str, None]]:
    """Resolve a user-supplied entry into dataset ID(s).

    Args:
        hub_id: Either a single dataset ID or a ``name/*`` wildcard.

    Returns:
        A 2-tuple ``(datasets, matched_name)``:

        * wildcard entry: the list of dataset IDs for the user/org and the
          user/org name that was matched;
        * single dataset: the unchanged ``hub_id`` string and ``None``.

    Raises:
        gr.Error: If the ID is malformed or the dataset lookup on the Hub
            reports that the repository was not found.
    """
    if match := match_org_user_glob_pattern(hub_id):
        return grab_dataset_ids_for_user_or_org(match), match
    try:
        # Called only for validation; the returned info object is not used.
        dataset_info(hub_id)
    except HFValidationError as e:
        raise gr.Error(f"Invalid format for Hugging Face Hub dataset ID. {e}") from e
    except RepositoryNotFoundError as e:
        raise gr.Error(f"{hub_id}: Invalid Hugging Face Hub dataset ID") from e
    return hub_id, match


def remove_user_from_tracking_datasets(dataset_id, profile: gr.OAuthProfile | None):
    """Stop tracking a dataset (or every dataset of a user/org) for the logged-in user.

    Args:
        dataset_id: A dataset ID or a ``name/*`` wildcard entry.
        profile: OAuth profile of the current user, or ``None`` when nobody
            is logged in.

    Returns:
        A status message for display in the UI.
    """
    # Without a profile there is no username to act on. The previous guard
    # skipped this early return when running locally and then failed on
    # `profile.preferred_username`.
    if not profile:
        return "You must be logged in to remove a dataset"
    username = profile.preferred_username
    dataset_id, match = parse_hub_id_entry(dataset_id)
    if isinstance(dataset_id, str):
        return _remove_user_from_tracking_datasets(dataset_id, username)
    # Wildcard entry: `dataset_id` is the list of datasets for the user/org.
    for dataset in dataset_id:
        _remove_user_from_tracking_datasets(dataset, username)
    return f"Stopped tracking datasets for username or org: {match}"


def _remove_user_from_tracking_datasets(dataset_id: str, username):
    """Remove ``username`` from the subscribers of ``dataset_id``.

    When the last subscriber leaves, the dataset is dropped from both the
    tracked-users database and the models database.

    Returns:
        A status message: the dataset was not tracked at all, the user was
        not subscribed to it, or the user has been removed.
    """
    with SqliteDict(
        f"{save_dir}/tracked_dataset_to_users.sqlite"
    ) as tracked_dataset_to_users_db:
        users = tracked_dataset_to_users_db.get(dataset_id)
        if users is None:
            return "Dataset not being tracked"
        if username not in users:
            return "No longer tracking dataset"
        users.remove(username)
        if users:
            tracked_dataset_to_users_db[dataset_id] = users
        else:
            del tracked_dataset_to_users_db[dataset_id]
            with SqliteDict(
                f"{save_dir}/models_to_dataset.sqlite"
            ) as dataset_to_models_db:
                # Guarded so a dataset missing from the models database does
                # not raise KeyError and abort before the commit below.
                if dataset_id in dataset_to_models_db:
                    del dataset_to_models_db[dataset_id]
                    dataset_to_models_db.commit()
        tracked_dataset_to_users_db.commit()
        return "Dataset no longer being tracked"


def user_unsubscribe_all(username):
    """Unsubscribe ``username`` from every dataset they currently track.

    Returns:
        A message stating how many datasets the user was unsubscribed from.
    """
    datasets_tracked = datasets_tracked_by_user(username)
    for dataset_id in datasets_tracked:
        # Call the username-based helper directly: the public wrapper expects
        # (dataset_id, profile) and was previously called with the arguments
        # swapped and a plain username in place of a profile.
        _remove_user_from_tracking_datasets(dataset_id, username)
    # Internal sanity check that every subscription is gone.
    assert len(datasets_tracked_by_user(username)) == 0
    return f"Unsubscribed from {len(datasets_tracked)} datasets"


def user_update(hub_id, profile: gr.OAuthProfile | None):
    """Start tracking a dataset (or every dataset of a user/org) for the logged-in user.

    Args:
        hub_id: A dataset ID or a ``name/*`` wildcard entry.
        profile: OAuth profile of the current user, or ``None`` when nobody
            is logged in.

    Returns:
        A markdown status message for display in the UI.
    """
    # Without a profile there is no username to act on. The previous guard
    # skipped this early return when running locally and then failed on
    # `profile.preferred_username`.
    if not profile:
        return "Please login to track a dataset"
    username = profile.preferred_username
    hub_id, match = parse_hub_id_entry(hub_id)
    if isinstance(hub_id, str):
        return _user_update(hub_id, username)
    return glob_update_tracked_datasets(hub_id, username, match)


def glob_update_tracked_datasets(hub_ids, username, match):
    """Track every dataset in ``hub_ids`` for ``username`` and summarise the result.

    Args:
        hub_ids: Dataset IDs to start tracking.
        username: User who should be subscribed.
        match: User/org name the IDs were expanded from (used in the message).

    Returns:
        A markdown summary listing all datasets the user now tracks.
    """
    for single_id in tqdm(hub_ids):
        _user_update(single_id, username)
    tracked_datasets = datasets_tracked_by_user(username)
    parts = [
        "## Dataset tracking summary \n\n",
        f"All datasets under the user or organization: {match} are being tracked \n\n",
        "You are currently tracking whether new models have been trained on",
        f" {len(tracked_datasets)} datasets.\n\n",
    ]
    if tracked_datasets:
        parts.append("### Datasets being tracked \n\n")
        parts.append(
            "You are currently monitoring whether new models have been trained on the"
            " following datasets:\n"
        )
        parts.extend(
            f"- [{dataset}](https://huggingface.co/datasets/{dataset})\n"
            for dataset in tracked_datasets
        )
    return "".join(parts)


def _user_update(hub_id: str, username: str) -> str:
    """Subscribe ``username`` to ``hub_id`` and build the status message.

    The message first states whether the dataset is new to the models
    database or was already stored there, then lists every dataset the user
    tracks after the update.
    """
    parts = []
    newly_tracked_total = maybe_update_datasets_to_model_map(hub_id)
    if newly_tracked_total:
        parts.append(
            "New dataset being tracked! Now tracking"
            f" {newly_tracked_total} datasets \n\n"
        )
    else:
        parts.append(f"Dataset {hub_id} is already being tracked. \n\n")
    users_datasets = update_tracked_dataset_to_users(hub_id, username)
    parts.append(
        "You are currently tracking whether new models have been trained on"
        f" {len(users_datasets)} datasets."
    )
    if not users_datasets:
        parts.append("You are not currently tracking any datasets.")
        return "".join(parts)
    parts.append(
        "\nYou are currently monitoring whether new models have been trained on the"
        " following datasets:\n"
    )
    parts.extend(
        f"- [{dataset}](https://huggingface.co/datasets/{dataset})\n"
        for dataset in users_datasets
    )
    return "".join(parts)


def check_for_new_models_for_dataset_and_update() -> Dict[str, Set[str]]:
    """Refresh the stored model lists and report models not seen before.

    For every dataset in the models database the current model list is
    fetched, compared with the stored one, and then written back.

    Returns:
        ``{dataset_id: new_model_ids}`` for each dataset that gained at least
        one model ID that was not in the stored list. Datasets without new
        models are omitted.
    """
    with SqliteDict(f"{save_dir}/models_to_dataset.sqlite") as old_results_db:
        dataset_ids = list(old_results_db.keys())
        new_results = generate_dataset_model_map(dataset_ids)
        models_to_notify_about: Dict[str, Set[str]] = {}
        for dataset_id, models in new_results.items():
            # Compare by membership, not by list length: if one model was
            # removed and another added, the length is unchanged but there is
            # still a new model to report.
            unseen = set(models).difference(old_results_db[dataset_id])
            if unseen:
                models_to_notify_about[dataset_id] = unseen
            old_results_db[dataset_id] = models
        old_results_db.commit()
        return models_to_notify_about


def get_repo_discussion_by_author_and_type(
    repo, author, token, repo_type="space", include_prs=False
):
    """Yield the discussions of ``repo`` that were opened by ``author``.

    Pull requests are skipped unless ``include_prs`` is true.
    """
    for candidate in get_repo_discussions(repo, repo_type=repo_type, token=token):
        if candidate.author != author:
            continue
        if include_prs or not candidate.is_pull_request:
            yield candidate


def create_discussion_text_body(dataset_id, new_models, users_to_notify):
    """Compose the markdown body announcing ``new_models`` for ``dataset_id``.

    The text @-mentions every user in ``users_to_notify``, links the dataset,
    lists each new model as a markdown link, and ends with a footer pointing
    back to the monitor Space.
    """
    mentions = ", ".join(f"@{username}" for username in users_to_notify)
    dataset_link = f"[{dataset_id}](https://huggingface.co/datasets/{dataset_id})"
    model_bullets = "\n".join(
        f"- {hub_id_to_huggingface_hub_url_markdown(model)}" for model in new_models
    )
    footer = """\n\n This discussion was created by the [Dataset to Model Monitor](https://huggingface.co/spaces/librarian-bots/dataset-to-model-monitor) Space. You can modify your alerts using this Space."""
    return "".join(
        [
            f"Hey {mentions}! Librarian Bot found new models trained on the",
            f" {dataset_link} dataset!\n\n",
            f"New model trained on {dataset_id}:\n",
            model_bullets,
            footer,
        ]
    )


def maybe_create_discussion(
    repo: str,
    dataset_id: str,
    new_models: Union[List, str],
    users_to_notify: List[str],
    author: str,
    token: str,
):
    """Post a notification about ``new_models`` in the discussion for ``dataset_id``.

    If ``author`` already opened a discussion with the expected title in
    ``repo`` (a Space), the notification is added there as a comment;
    otherwise a new discussion is created.

    Args:
        repo: Space repository ID that hosts the discussions.
        dataset_id: Dataset the new models were trained on.
        new_models: Model IDs to announce.
        users_to_notify: Usernames to @-mention.
        author: Only discussions opened by this account are reused.
        token: Hub token used for listing, commenting and creating.
    """
    title = f"Discussion tracking new models trained on {dataset_id}"
    description = create_discussion_text_body(dataset_id, new_models, users_to_notify)
    # Use the caller-supplied token consistently; the listing call previously
    # ignored it and read the module-level HF_TOKEN instead.
    discussions = get_repo_discussion_by_author_and_type(repo, author, token)
    existing = next(
        (discussion for discussion in discussions if title == discussion.title),
        None,
    )
    if existing is not None:
        comment_discussion(
            repo, existing.num, description, token=token, repo_type="space"
        )
    else:
        create_discussion(
            repo,
            title,
            token=token,
            description=description,
            repo_type="space",
        )


def hub_id_to_huggingface_hub_url_markdown(hub_id: str) -> str:
    """Render ``hub_id`` as a markdown link to ``https://huggingface.co/<hub_id>``."""
    target_url = "https://huggingface.co/" + hub_id
    return "[{}]({})".format(hub_id, target_url)


def notify_about_new_models():
    """Check every tracked dataset for new models and notify its subscribers.

    Subscribers are read from the tracked-users database in one pass, the
    database is closed, and then one discussion post is made per dataset that
    has new models and at least one subscriber.
    """
    print("running notifications")
    models_to_notify_about = check_for_new_models_for_dataset_and_update()
    if not models_to_notify_about:
        return
    # Read all subscriber lists up front so the database is not held open
    # while the Hub requests below are running.
    with SqliteDict(
        f"{save_dir}/tracked_dataset_to_users.sqlite"
    ) as tracked_dataset_to_users_db:
        users_by_dataset = {
            dataset_id: tracked_dataset_to_users_db.get(dataset_id) or []
            for dataset_id in models_to_notify_about
        }
    for dataset_id, new_models in models_to_notify_about.items():
        users_to_notify = users_by_dataset[dataset_id]
        if not users_to_notify:
            # A missing entry used to reach the message builder as None and
            # raise TypeError; with nobody to mention there is nothing to post.
            continue
        maybe_create_discussion(
            REPO, dataset_id, new_models, users_to_notify, AUTHOR, HF_TOKEN
        )
        print("notified about new models")


def number_of_users_tracking_datasets():
    """Count the distinct usernames found across all stored subscriber lists."""
    distinct_users = set()
    with SqliteDict(
        f"{save_dir}/tracked_dataset_to_users.sqlite"
    ) as dataset_users_db:
        for subscribers in dataset_users_db.values():
            distinct_users.update(subscribers)
        return len(distinct_users)


def number_of_datasets_tracked():
    """Return how many datasets have an entry in the models database."""
    db_path = f"{save_dir}/models_to_dataset.sqlite"
    with SqliteDict(db_path) as models_db:
        dataset_total = len(models_db)
    return dataset_total


@cached(cache=TTLCache(maxsize=1, ttl=30))
def generate_summary_stats():
    """Return a one-sentence summary of user and dataset counts (cached for 30 seconds)."""
    user_total = number_of_users_tracking_datasets()
    dataset_total = number_of_datasets_tracked()
    return (
        f"Currently there are {user_total} users tracking"
        f" datasets with a total of {dataset_total} datasets being"
        " tracked"
    )


def _user_stats(username: str):
    """Build a markdown list of the datasets ``username`` is tracking.

    Returns a short notice instead when the user tracks nothing.
    """
    tracked_datasets = datasets_tracked_by_user(username)
    if not tracked_datasets:
        return "You are not currently tracking any datasets"
    parts = [
        "You are  currently tracking whether new models have been trained on",
        f" {len(tracked_datasets)} datasets.\n\n",
        "### Datasets being tracked \n\n",
        "You are currently monitoring whether new models have been trained on the",
        " following datasets:\n",
    ]
    parts.extend(
        f"- [{dataset}](https://huggingface.co/datasets/{dataset})\n"
        for dataset in tracked_datasets
    )
    return "".join(parts)


def user_stats(profile: gr.OAuthProfile | None):
    """Return the tracked-datasets overview for the logged-in user.

    Args:
        profile: OAuth profile of the current user, or ``None`` when nobody
            is logged in.

    Returns:
        A markdown message for display in the UI.
    """
    # Without a profile there is no username to look up. The previous guard
    # skipped this early return when running locally and then failed on
    # `profile.preferred_username`.
    if not profile:
        return "You must be logged in to view datasets you are tracking"
    return _user_stats(profile.preferred_username)


# Introductory markdown rendered near the top of the UI.
# NOTE(review): step 3 says models are checked "once a day", but the
# non-local scheduler in this file uses the crontab "0 */12 * * *" -- confirm
# which one is intended.
markdown_text = """
The Hugging Face Hub allows users to specify the dataset used to train a model in the model metadata. 
This metadata allows you to find models trained on a particular dataset.
These links can be very powerful for finding models that might be suitable for a particular task.\n\n

This Gradio app allows you to track datasets hosted on the Hugging Face Hub and get a notification when new models are trained on the dataset you are tracking. 
1. Submit the Hugging Face Hub ID for the dataset you are interested in tracking. 
2. If a new model is listed as being trained on this dataset Librarian Bot will ping you in a discussion on the Hugging Face Hub to let you know. 
3. Librarian Bot will check for new models for a particular dataset once a day.


**NOTE** This app is a proof of concept and is intended to validate how much interest there is for a feature like this. 
If you have feedback please add it to this [discussion](https://huggingface.co/spaces/librarian-bots/dataset-to-model-monitor/discussions/2).

### Tips 
- You might find the [Hugging Face Datasets Semantic Search](https://huggingface.co/spaces/librarian-bots/huggingface-datasets-semantic-search) Space useful for finding datasets to track.
- You can use a wildcard `*` to track all datasets for a user or organization on the hub. For example `biglam/*` will create alerts for all the datasets under the biglam Hugging Face Organization
- You need to be logged in to your Hugging Face account to use this app. If you don't have a Hugging Face Hub account you can get one <a href="https://huggingface.co/join">here</a>. 

"""

# UI definition: header, intro text, dataset input with action buttons,
# login/stats row, and a markdown output area that every handler writes to.
with gr.Blocks() as demo:
    gr.Markdown(
        '<div style="text-align: center;"><h1> &#129302; Librarian Bot Dataset-to-Model'
        ' Monitor &#129302; </h1><i><p style="font-size: 20px;">✨ Get alerts when a new'
        " model is created from a dataset you are interested in! ✨</p></i></div>"
    )

    with gr.Row():
        gr.Markdown(markdown_text)
    with gr.Row():
        # NOTE(review): the example text is passed as the first positional
        # argument; presumably that is the initial value rather than a
        # placeholder -- confirm against the gr.Textbox signature.
        hub_id = gr.Textbox(
            "i.e. biglam/brill_iconclass",
            label="Hugging Face Hub ID for dataset to track",
            max_lines=1,
        )
        with gr.Column():
            track_button = gr.Button("Track new models for dataset")
            with gr.Row():
                remove_specific_datasets = gr.Button("Stop tracking dataset")
                # NOTE(review): no .click handler is attached to this button
                # anywhere in this block, so it currently does nothing.
                remove_all = gr.Button("⛔️ Unsubscribe from all datasets ⛔️")
    with gr.Row(variant="compact"):
        gr.LoginButton(size="sm")
        gr.LogoutButton(size="sm")
        summary_stats_btn = gr.Button(
            "Summary stats for datasets being tracked by this app", size="sm"
        )
        user_stats_btn = gr.Button("List my tracked datasets", size="sm")
    with gr.Row():
        output = gr.Markdown()
        # The handlers' `profile` parameter is not listed in the inputs;
        # presumably Gradio supplies it from the gr.OAuthProfile type hint --
        # confirm against the Gradio OAuth documentation.
        track_button.click(user_update, [hub_id], output)
        remove_specific_datasets.click(
            remove_user_from_tracking_datasets, [hub_id], output
        )
        summary_stats_btn.click(generate_summary_stats, [], output)
        user_stats_btn.click(user_stats, [], output)
# Background job that runs notify_about_new_models on a schedule while the
# Gradio app is serving.
scheduler = BackgroundScheduler()

if local:
    # Local development: run the check every 5 minutes.
    scheduler.add_job(notify_about_new_models, "interval", minutes=5)
else:
    # Otherwise: crontab "0 */12 * * *", i.e. minute 0 of every 12th hour.
    scheduler.add_job(
        notify_about_new_models,
        CronTrigger.from_crontab("0 */12 * * *"),
    )
# Start the scheduler before launching the app; this all runs at import time
# (there is no `if __name__ == "__main__":` guard).
scheduler.start()
demo.queue(max_size=5)
demo.launch()