davanstrien committed
Commit 4283f40
Parent(s): 32f75dc

Files changed (3):
  1. app.py +202 -123
  2. requirements.in +2 -1
  3. requirements.txt +23 -15
app.py CHANGED
@@ -1,21 +1,28 @@
+import asyncio
 import os
+import time
 from datetime import datetime, timedelta, timezone
 from typing import Any, Dict
 
 import gradio as gr
 import pandas as pd
+import polars as pl
 from cachetools import TTLCache, cached
+from cashews import cache
+from datasets import Dataset
 from dotenv import load_dotenv
-from httpx import Client
+from httpx import AsyncClient, Client
 from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
 from tqdm.auto import tqdm
-from tqdm.contrib.concurrent import thread_map
+
+
+cache.setup("mem://")
 
 load_dotenv()
 
-LIMIT = 5_000
+LIMIT = 15_000
 
-CACHE_TIME = 60 * 60 * 12  # 12 hours
+CACHE_TIME = 60 * 60 * 1  # 1 hour
 REMOVE_ORGS = {
     "HuggingFaceM4",
     "HuggingFaceBR4",
@@ -35,59 +42,117 @@ headers = {"authorization": f"Bearer {HF_TOKEN}", "user-agent": USER_AGENT}
 
 client = Client(
     headers=headers,
-    timeout=120,
+    timeout=30,
 )
-# LOCAL = False
-# if platform == "darwin":
-#     LOCAL = True
-# cache_dir = "cache" if LOCAL else "/data/diskcache"
-# cache = Cache(cache_dir)
+async_client = AsyncClient(
+    headers=headers,
+    timeout=30,
+    http2=True,
+)
+
 cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
 
 
-def get_three_months_ago():
-    now = datetime.now(timezone.utc)
-    return now - timedelta(days=90)
+@cached(cache)
+def get_initial_data():
+    datasets = list_datasets(
+        limit=LIMIT,
+        sort="createdAt",
+        direction=-1,
+        expand=[
+            "trendingScore",
+            "createdAt",
+            "author",
+            "downloads",
+            "likes",
+            "cardData",
+            "lastModified",
+            "private",
+        ],
+    )
+    return [d.__dict__ for d in tqdm(datasets)]
+
+
+keep_initial = [
+    "id",
+    "author",
+    "created_at",
+    "last_modified",
+    "private",
+    "downloads",
+    "likes",
+    "trending_score",
+    "card_data",
+    "cardData",
+]
+
+keep_final = [
+    "id",
+    "author",
+    "created_at",
+    "last_modified",
+    "downloads",
+    "likes",
+    "trending_score",
+]
 
 
-def add_created_data(dataset):
-    _id = dataset._id
-    created = dataset.created_at
-    dataset_dict = dataset.__dict__
-    dataset_dict["createdAt"] = created
-    return dataset_dict
+def prepare_initial_df():
+    ds = get_initial_data()
+    df = pl.LazyFrame(ds).select(keep_initial)
+    # remove private datasets
+    df = df.filter(~pl.col("private"))
+    df = df.filter(~pl.col("author").is_in(REMOVE_ORGS))
+    df = df.filter(~pl.col("id").str.contains("my-distiset"))
+    df = df.select(keep_final)
+    return df.collect()
 
 
-def get_readme_len(dataset: Dict[str, Any]):
+async def get_readme_len(row: Dict[str, Any]):
+    SEMPAHORE = asyncio.Semaphore(30)
     try:
-        url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
-        resp = client.get(url)
-        if resp.status_code == 200:
-            card = DatasetCard(resp.text)
-            dataset["len"] = len(card.text)
-        return dataset
+        url = hf_hub_url(row["id"], "README.md", repo_type="dataset")
+        async with SEMPAHORE:
+            resp = await async_client.get(url)
+        if resp.status_code == 200:
+            card = DatasetCard(resp.text)
+            row["len"] = len(card.text)
+        else:
+            row["len"] = 0  # Use 0 instead of None to avoid type issues
+        return row
     except Exception as e:
         print(e)
-        return None
+        row["len"] = 0  # Use 0 instead of None to avoid type issues
+        return row
+
+
+def prepare_data_with_readme_len(df: pl.DataFrame):
+    ds = Dataset.from_polars(df)
+    ds = ds.map(get_readme_len)
+    return ds
 
 
-def check_ds_server_valid(id):
-    url = f"https://datasets-server.huggingface.co/is-valid?dataset={id}"
-    response = client.get(url)
-    if response.status_code != 200:
-        return False
+async def check_ds_server_valid(row):
+    SEMPAHORE = asyncio.Semaphore(10)
     try:
+        url = f"https://datasets-server.huggingface.co/is-valid?dataset={row['id']}"
+        async with SEMPAHORE:
+            response = await async_client.get(url)
+        if response.status_code != 200:
+            row["has_server_preview"] = False
         data = response.json()
         preview = data.get("preview")
-        return preview is not None
+        row["has_server_preview"] = preview is not None
+        return row
     except Exception as e:
         print(e)
-        return False
+        row["has_server_preview"] = False
+        return row
 
 
-def has_server_preview(dataset):
-    dataset["server_preview"] = check_ds_server_valid(dataset["id"])
-    return dataset
+def prep_data_with_server_preview(ds):
+    ds = ds.map(check_ds_server_valid)
+    return ds.to_polars()
 
 
 def render_model_hub_link(hub_id):
@@ -98,90 +163,117 @@ def render_model_hub_link(hub_id):
     )
 
 
-@cached(cache)
-def get_datasets():
-    return list(
-        tqdm(
-            iter(list_datasets(limit=LIMIT, full=True, sort="createdAt", direction=-1))
-        )
+def prep_final_data():
+    # Check if we have a valid cached parquet file
+    cache_dir = "cache"
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Get current time and calculate cache validity
+    now = time.time()
+    cache_valid_time = (
+        now - CACHE_TIME
+    )  # Cache is valid if created within the last CACHE_TIME seconds
+
+    # Look for valid cache files
+    valid_cache_file = None
+    for filename in os.listdir(cache_dir):
+        if filename.startswith("dataset_cache_") and filename.endswith(".parquet"):
+            try:
+                # Extract timestamp from filename
+                timestamp = float(
+                    filename.replace("dataset_cache_", "").replace(".parquet", "")
+                )
+                if timestamp > cache_valid_time:
+                    valid_cache_file = os.path.join(cache_dir, filename)
+                    break
+            except ValueError:
+                continue
+
+    # If we have a valid cache file, load it
+    if valid_cache_file:
+        print(f"Loading data from cache: {valid_cache_file}")
+        return pl.read_parquet(valid_cache_file)
+
+    # Otherwise, generate the data and cache it
+    print("Generating fresh data...")
+    df = prepare_initial_df()
+    ds = prepare_data_with_readme_len(df)
+    df = prep_data_with_server_preview(ds)
+
+    # Format the ID column as HTML links using string concatenation instead of regex
+    df = df.with_columns(
+        (
+            pl.lit('<a target="_blank" href="https://huggingface.co/datasets/')
+            + pl.col("id")
+            + pl.lit(
+                '" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">'
+            )
+            + pl.col("id")
+            + pl.lit("</a>")
+        ).alias("hub_id")
     )
+    df = df.drop("id")
+    df = df.sort(by=["trending_score", "likes", "downloads", "len"], descending=True)
+    # make hub_id column first column
+    print(df.columns)
+    df = df.select(
+        [
+            "hub_id",
+            "author",
+            "created_at",
+            "last_modified",
+            "downloads",
+            "likes",
+            "trending_score",
+            "len",
+            "has_server_preview",
+        ]
+    )
+    # Save to cache
+    cache_file = os.path.join(cache_dir, f"dataset_cache_{now}.parquet")
+    df.write_parquet(cache_file)
+
+    # Clean up old cache files
+    for filename in os.listdir(cache_dir):
+        if filename.startswith("dataset_cache_") and filename.endswith(".parquet"):
+            try:
+                timestamp = float(
+                    filename.replace("dataset_cache_", "").replace(".parquet", "")
+                )
+                if timestamp <= cache_valid_time:
+                    os.remove(os.path.join(cache_dir, filename))
+            except ValueError:
+                continue
 
-
-@cached(cache)
-def load_data():
-    datasets = get_datasets()
-    datasets = [add_created_data(dataset) for dataset in tqdm(datasets)]
-    # datasets = [dataset.__dict__ for dataset in tqdm(datasets)]
-    filtered = [ds for ds in datasets if ds["createdAt"] > get_three_months_ago()]
-    ds_with_len = thread_map(get_readme_len, filtered)
-    ds_with_len = [ds for ds in ds_with_len if ds is not None]
-    ds_with_valid_status = thread_map(has_server_preview, ds_with_len)
-    ds_with_valid_status = [ds for ds in ds_with_valid_status if ds is not None]
-    return ds_with_valid_status
-
-
-columns_to_drop = [
-    "cardData",
-    "gated",
-    "sha",
-    "tags",
-    "description",
-    "siblings",
-    "disabled",
-    "_id",
-    "private",
-    "author",
-    # "citation",
-    "lastModified",
-]
+    return df
 
 
-def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to_drop):
-    ds_with_len = load_data()
-    if remove_orgs_and_users:
-        ds_with_len = [
-            ds for ds in ds_with_len if ds["author"] not in remove_orgs_and_users
-        ]
-    df = pd.DataFrame(ds_with_len)
-    df["id"] = df["id"].apply(render_model_hub_link)
-    if columns_to_drop:
-        df = df.drop(columns=columns_to_drop)
-    df = df.sort_values(by=["likes", "downloads", "len"], ascending=False)
+def filter_by_max_age(df, max_age_days):
+    df = df.filter(
+        pl.col("created_at")
+        > (datetime.now(timezone.utc) - timedelta(days=max_age_days))
+    )
     return df
 
 
-def filter_df_by_max_age(df, max_age_days=None):
-    df = df.dropna(subset=["createdAt"])
-    now = datetime.now(timezone.utc)
-    if max_age_days is not None:
-        max_date = now - timedelta(days=max_age_days)
-        df = df[df["createdAt"] >= max_date]
+def filter_by_min_len(df, min_len):
+    df = df.filter(pl.col("len") >= min_len)
     return df
 
 
-def filter_by_readme_len(df, min_len=None):
-    if min_len is not None:
-        df = df[df["len"] >= min_len]
+def filter_by_server_preview(df, needs_server_preview):
+    df = df.filter(pl.col("has_server_preview") == needs_server_preview)
     return df
 
 
-def filter_df(max_age_days=None, min_len=None, needs_server_preview: bool = False):
-    try:
-        df = prep_dataframe()
-        if needs_server_preview:
-            df = df[df["server_preview"] == True]
-        if max_age_days is not None:
-            df = filter_df_by_max_age(df, max_age_days=max_age_days)
-        if min_len is not None:
-            df = filter_by_readme_len(df, min_len=min_len)
-        df = df.sort_values(by=["likes", "downloads", "len"], ascending=False)
-        return df
-    except Exception as e:
-        print(f"Error filtering dataframe: {str(e)}")
-        # Return empty dataframe with same columns if there's an error
-        return pd.DataFrame(
-            columns=["id", "likes", "downloads", "len", "createdAt", "server_preview"]
-        )
+def filter_df(max_age_days, min_len, needs_server_preview):
+    df = prep_final_data()
+    df = df.lazy()
+    df = filter_by_max_age(df, max_age_days)
+    df = filter_by_min_len(df, min_len)
+    df = filter_by_server_preview(df, needs_server_preview)
+    df = df.sort(by=["trending_score", "likes", "downloads", "len"], descending=True)
+    return df.collect()
 
 
 with gr.Blocks() as demo:
@@ -212,29 +304,16 @@ with gr.Blocks() as demo:
         interactive=True,
     )
 
-    # gr.Markdown(
-    #     """
-    # <style>
-    # #dataset_table {
-    #     height: 1000px;
-    #     overflow: auto;
-    # }
-    # </style>
-    #     """
-    # )
-
     output = gr.DataFrame(
-        value=filter_df(7, 300, False),  # Set initial values explicitly
+        value=filter_df(7, 300, False),
        interactive=False,
        datatype="markdown",
-        min_width=160 * 2.5,
-        elem_id="dataset_table",
     )
 
     def update_df(age, length, preview):
         return filter_df(age, length, preview)
 
-    # Use a single update function for all inputs
+    # Connect the input components to the update function
     for component in [max_age_days, min_len, needs_server_preview]:
         component.change(
             fn=update_df,
requirements.in CHANGED
@@ -3,7 +3,8 @@ datasets
 datasets
 diskcache
 gradio==5.14.0
-httpx
+httpx[http2]
 huggingface_hub
 pandas
 python-dotenv
+polars
requirements.txt CHANGED
@@ -2,9 +2,9 @@
 # uv pip compile requirements.in -o requirements.txt
 aiofiles==23.2.1
     # via gradio
-aiohappyeyeballs==2.4.4
+aiohappyeyeballs==2.6.1
     # via aiohttp
-aiohttp==3.11.11
+aiohttp==3.11.13
     # via
     #   datasets
     #   fsspec
@@ -17,9 +17,9 @@ anyio==4.8.0
     #   gradio
     #   httpx
     #   starlette
-attrs==25.1.0
+attrs==25.2.0
     # via aiohttp
-cachetools==5.5.1
+cachetools==5.5.2
     # via -r requirements.in
 certifi==2025.1.31
     # via
@@ -32,7 +32,7 @@ click==8.1.8
     # via
     #   typer
     #   uvicorn
-datasets==3.2.0
+datasets==3.3.2
     # via -r requirements.in
 dill==0.3.8
     # via
@@ -40,7 +40,7 @@ dill==0.3.8
     #   multiprocess
 diskcache==5.6.3
     # via -r requirements.in
-fastapi==0.115.8
+fastapi==0.115.11
     # via gradio
 ffmpy==0.5.0
     # via gradio
@@ -52,7 +52,7 @@ frozenlist==1.5.0
     # via
     #   aiohttp
     #   aiosignal
-fsspec==2024.9.0
+fsspec==2024.12.0
     # via
     #   datasets
     #   gradio-client
@@ -65,6 +65,10 @@ h11==0.14.0
     # via
     #   httpcore
     #   uvicorn
+h2==4.2.0
+    # via httpx
+hpack==4.1.0
+    # via h2
 httpcore==1.0.7
     # via httpx
 httpx==0.28.1
@@ -73,19 +77,21 @@ httpx==0.28.1
     #   gradio
     #   gradio-client
     #   safehttpx
-huggingface-hub==0.28.1
+huggingface-hub==0.29.3
     # via
     #   -r requirements.in
     #   datasets
     #   gradio
     #   gradio-client
+hyperframe==6.1.0
+    # via h2
 idna==3.10
     # via
     #   anyio
     #   httpx
     #   requests
     #   yarl
-jinja2==3.1.5
+jinja2==3.1.6
     # via gradio
 markdown-it-py==3.0.0
     # via rich
@@ -101,7 +107,7 @@ multidict==6.1.0
     #   yarl
 multiprocess==0.70.16
     # via datasets
-numpy==2.2.2
+numpy==2.2.3
     # via
     #   datasets
     #   gradio
@@ -121,11 +127,13 @@ pandas==2.2.3
     #   gradio
 pillow==11.1.0
     # via gradio
-propcache==0.2.1
+polars==1.24.0
+    # via -r requirements.in
+propcache==0.3.0
     # via
     #   aiohttp
     #   yarl
-pyarrow==19.0.0
+pyarrow==19.0.1
     # via datasets
 pydantic==2.10.6
     # via
@@ -156,7 +164,7 @@ requests==2.32.3
     #   huggingface-hub
 rich==13.9.4
     # via typer
-ruff==0.9.4
+ruff==0.9.10
     # via gradio
 safehttpx==0.1.6
     # via gradio
@@ -168,7 +176,7 @@ six==1.17.0
     # via python-dateutil
 sniffio==1.3.1
     # via anyio
-starlette==0.45.3
+starlette==0.46.1
     # via
     #   fastapi
     #   gradio
@@ -178,7 +186,7 @@ tqdm==4.67.1
     # via
     #   datasets
     #   huggingface-hub
-typer==0.15.1
+typer==0.15.2
     # via gradio
 typing-extensions==4.12.2
     # via