Spaces:
Sleeping
Sleeping
File size: 6,280 Bytes
69765f6 9380316 7f66f08 e62ac39 9380316 7f66f08 9380316 69765f6 e62ac39 69765f6 e62ac39 eb97135 d102592 69765f6 f5faacd 69765f6 7f66f08 69765f6 7f66f08 69765f6 7f66f08 69765f6 e62ac39 69765f6 3215dde 69765f6 e62ac39 69765f6 f5faacd 69765f6 e62ac39 69765f6 f5faacd 69765f6 7f66f08 69765f6 e62ac39 69765f6 7f66f08 69765f6 9a1a3ec 69765f6 e62ac39 69765f6 325fff8 69765f6 3215dde 7f66f08 4072c1c 69765f6 4072c1c 7f66f08 69765f6 3215dde e62ac39 3215dde e62ac39 f5faacd 3215dde 75d97b4 3215dde 67b9f48 69765f6 75d97b4 3215dde 75d97b4 e62ac39 8d49daa e62ac39 3215dde ba8d4ac e62ac39 69765f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 |
import os
from datetime import datetime, timedelta
from sys import platform
from typing import Any, Dict
import gradio as gr
import pandas as pd
from cachetools import TTLCache, cached
from diskcache import Cache
from dotenv import load_dotenv
from httpx import Client
from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map
# Load variables from a local .env file (if present) before reading os.environ.
load_dotenv()

# Passed as ``limit`` to ``list_datasets`` in get_datasets.
LIMIT = None
CACHE_TIME = 60 * 60 * 12  # 12 hours
# Default set of authors whose datasets prep_dataframe filters out.
REMOVE_ORGS = {
    "HuggingFaceM4",
    "HuggingFaceBR4",
    "open-llm-leaderboard",
    "TrainingDataPro",
}
HF_TOKEN = os.getenv("HF_TOKEN")
USER_AGENT = os.getenv("USER_AGENT")

# Only send headers that are actually configured.  Python f-strings
# interpolate with ``{...}``; a ``$`` in front of the braces would be sent
# literally as part of the bearer token.  An unset variable must not end up
# as a "Bearer None" token or a ``None`` header value.
headers = {}
if HF_TOKEN:
    headers["authorization"] = f"Bearer {HF_TOKEN}"
if USER_AGENT:
    headers["user-agent"] = USER_AGENT

# Shared HTTP client used by get_readme_len and check_ds_server_valid.
client = Client(
    headers=headers,
    timeout=60,
)

# LOCAL = False
# if platform == "darwin":
# LOCAL = True
# cache_dir = "cache" if LOCAL else "/data/diskcache"
# cache = Cache(cache_dir)

# In-memory cache whose entries expire after CACHE_TIME seconds; used by the
# ``@cached`` functions in this module.
cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
def get_three_months_ago():
    """Return the naive UTC datetime 90 days before the current moment.

    The value is compared against ``createdAt`` timestamps that parse_date
    builds from ``...Z`` (UTC) strings as naive datetimes, so the cutoff is
    computed in UTC as well and stripped of its tzinfo to stay comparable.
    """
    # Local import: the module-level import only brings in datetime/timedelta.
    from datetime import timezone

    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    return utc_now - timedelta(days=90)
def parse_date(date_str):
    """Parse a creation timestamp string into a naive ``datetime``.

    The expected layout is ``2023-11-17T16:39:54.000Z``: date and time
    separated by ``T``, fractional seconds, and a literal trailing ``Z``.
    ``datetime.strptime`` raises ``ValueError`` for any other layout.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    parsed = datetime.strptime(date_str, timestamp_format)
    return parsed
def add_created_data(dataset):
    """Return the dataset's attributes as a dict with ``createdAt`` parsed.

    Args:
        dataset: An object exposing its fields as instance attributes,
            including a ``createdAt`` string accepted by parse_date.

    Returns:
        A new dict holding the object's attributes, with ``createdAt``
        replaced by the parsed ``datetime``.
    """
    # Copy instead of handing out the live ``__dict__``: later pipeline steps
    # add keys to the returned dict, and the dataset objects themselves are
    # kept in the cache by get_datasets, so they must not be mutated.
    dataset_dict = dict(vars(dataset))
    dataset_dict["createdAt"] = parse_date(dataset.createdAt)
    return dataset_dict
def get_readme_len(dataset: Dict[str, Any]):
    """Fetch the dataset's README.md and record the length of its card text.

    Args:
        dataset: Dataset record; its ``"id"`` key names the dataset repo.

    Returns:
        The same dict with a ``"len"`` key holding ``len(card.text)``, or
        ``None`` when the README request does not answer with HTTP 200 or
        any exception is raised along the way (the exception is printed).
    """
    try:
        readme_url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
        response = client.get(readme_url)
        if response.status_code != 200:
            return None
        dataset["len"] = len(DatasetCard(response.text).text)
        return dataset
    except Exception as e:
        # Best effort: a dataset whose README cannot be processed is skipped.
        print(e)
        return None
def check_ds_server_valid(id):
    """Report whether datasets-server says a preview exists for the dataset.

    Args:
        id: Hub dataset id, interpolated into the ``is-valid`` query string.

    Returns:
        ``True`` only when the endpoint answers with HTTP 200 and a truthy
        ``"preview"`` field; ``False`` for any other status, a missing or
        false ``"preview"`` value, or any error while requesting/decoding.
    """
    url = f"https://datasets-server.huggingface.co/is-valid?dataset={id}"
    try:
        # The request is inside the try block so that a timeout or transport
        # error for one dataset yields False instead of propagating out of
        # the thread pool that runs this check for every dataset.
        response = client.get(url)
        if response.status_code != 200:
            return False
        # Truthiness rather than ``is not None``: an explicit
        # ``"preview": false`` in the response must count as "no preview".
        return bool(response.json().get("preview"))
    except Exception as e:
        print(e)
        return False
def has_server_preview(dataset):
    """Annotate the dataset record with its datasets-server preview status.

    Stores the result of check_ds_server_valid for ``dataset["id"]`` under
    the ``"server_preview"`` key and returns the same (mutated) record.
    """
    preview_available = check_ds_server_valid(dataset["id"])
    dataset["server_preview"] = preview_available
    return dataset
def render_model_hub_link(hub_id):
    """Return an HTML anchor pointing at the dataset's page on the Hub.

    The link opens in a new tab, uses ``hub_id`` as its visible text and is
    styled with a dotted underline in the theme's link colour.
    """
    href = f"https://huggingface.co/datasets/{hub_id}"
    style = (
        "color: var(--link-text-color);"
        " text-decoration: underline;text-decoration-style: dotted;"
    )
    return f'<a target="_blank" href="{href}" style="{style}">{hub_id}</a>'
# An explicit key is required because ``cache`` is shared with other
# zero-argument functions: with the default key function every such call
# hashes to the same empty key, so their results would overwrite each other.
@cached(cache, key=lambda: "get_datasets")
def get_datasets():
    """Return the full dataset listing from the Hub as a list.

    Datasets are requested with ``full=True``, sorted by ``lastModified`` in
    descending order and limited to ``LIMIT`` entries; a progress bar is
    shown while the listing is consumed.  The result is memoised in the
    module-level TTL cache.
    """
    return list(
        tqdm(list_datasets(limit=LIMIT, full=True, sort="lastModified", direction=-1))
    )
# An explicit key is required because ``cache`` is shared with other
# zero-argument functions: with the default key function every such call
# hashes to the same empty key, so their results would overwrite each other.
@cached(cache, key=lambda: "load_data")
def load_data():
    """Build the list of recent dataset records shown in the app.

    Steps:
      1. Convert every dataset from get_datasets into a dict with a parsed
         ``createdAt`` (add_created_data).
      2. Keep only datasets created after get_three_months_ago().
      3. Add the README length in parallel, dropping datasets for which
         get_readme_len returned ``None``.
      4. Add the ``server_preview`` flag in parallel.

    Returns:
        A list of dataset dicts; memoised in the module-level TTL cache.
    """
    datasets = [add_created_data(dataset) for dataset in tqdm(get_datasets())]
    # Compute the cutoff once so every dataset is compared against the same
    # instant instead of re-evaluating "now" for each element.
    cutoff = get_three_months_ago()
    recent = [ds for ds in datasets if ds["createdAt"] > cutoff]
    ds_with_len = [ds for ds in thread_map(get_readme_len, recent) if ds is not None]
    ds_with_valid_status = thread_map(has_server_preview, ds_with_len)
    return [ds for ds in ds_with_valid_status if ds is not None]
# Dataset fields removed from the table before display; used as the default
# ``columns_to_drop`` argument of prep_dataframe.
columns_to_drop = [
    "cardData",
    "gated",
    "sha",
    # "paperswithcode_id",
    "tags",
    "description",
    "siblings",
    "disabled",
    "_id",
    "private",
    "author",
    "citation",
    "lastModified",
]
def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to_drop):
    """Turn the loaded dataset records into a sorted display DataFrame.

    Args:
        remove_orgs_and_users: Collection of ``author`` values whose datasets
            are excluded; a falsy value disables the exclusion.
        columns_to_drop: Column names removed from the frame; a falsy value
            keeps every column.

    Returns:
        A DataFrame whose ``id`` column holds HTML links, sorted by
        ``likes``, ``downloads`` and ``len`` in descending order.
    """
    records = load_data()
    if remove_orgs_and_users:
        records = [ds for ds in records if ds["author"] not in remove_orgs_and_users]
    df = pd.DataFrame(records)
    df["id"] = df["id"].apply(render_model_hub_link)
    if columns_to_drop:
        # errors="ignore": the columns come from the attribute dict of the
        # hub's dataset objects, so a listed field may be absent; a missing
        # column should not abort building the table.
        df = df.drop(columns=columns_to_drop, errors="ignore")
    return df.sort_values(by=["likes", "downloads", "len"], ascending=False)
def filter_df_by_max_age(df, max_age_days=None):
    """Drop rows without a creation date and, optionally, rows that are too old.

    Args:
        df: DataFrame with a ``createdAt`` column of naive datetimes.
        max_age_days: Maximum age in days; ``None`` disables the age filter
            (rows with a missing ``createdAt`` are still dropped).

    Returns:
        The filtered DataFrame.
    """
    df = df.dropna(subset=["createdAt"])
    if max_age_days is None:
        return df

    # Local import: the module-level import only brings in datetime/timedelta.
    from datetime import timezone

    # ``createdAt`` values are parsed from ``...Z`` (UTC) strings into naive
    # datetimes, so the cutoff is computed from naive UTC "now" as well.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    oldest_allowed = utc_now - timedelta(days=max_age_days)
    return df[df["createdAt"] >= oldest_allowed]
def filter_by_readme_len(df, min_len=None):
    """Keep only rows whose ``len`` value is at least ``min_len``.

    When ``min_len`` is ``None`` the DataFrame is returned unchanged.
    """
    if min_len is None:
        return df
    long_enough = df["len"] >= min_len
    return df[long_enough]
def filter_df(max_age_days=None, min_len=None, needs_server_preview: bool = False):
    """Build the display table and apply the filters selected in the UI.

    Args:
        max_age_days: If given, keep only datasets created within this many
            days (see filter_df_by_max_age).
        min_len: If given, keep only datasets whose README length is at
            least this value (see filter_by_readme_len).
        needs_server_preview: When true, keep only rows whose
            ``server_preview`` column equals ``True``.

    Returns:
        The filtered DataFrame sorted by ``likes``, ``downloads`` and
        ``len`` in descending order.
    """
    table = prep_dataframe()

    if needs_server_preview:
        preview_mask = table["server_preview"] == True  # noqa: E712
        table = table[preview_mask]

    if max_age_days is not None:
        table = filter_df_by_max_age(table, max_age_days=max_age_days)

    if min_len is not None:
        table = filter_by_readme_len(table, min_len=min_len)

    sort_keys = ["likes", "downloads", "len"]
    return table.sort_values(by=sort_keys, ascending=False)
# Gradio UI: three filter controls above a table rendered by filter_df.
with gr.Blocks() as demo:
    gr.Markdown("# Recent Datasets on the Hub")
    gr.Markdown(
        "Datasets added in the past 90 days with a README.md and some metadata."
    )
    # NOTE(review): the nesting of the controls inside this row was
    # reconstructed; confirm that the checkbox belongs in the row with the
    # two sliders rather than below it.
    with gr.Row():
        max_age_days = gr.Slider(
            label="Max Age (days)",
            value=7,
            minimum=0,
            maximum=90,
            step=1,
            interactive=True,
        )
        min_len = gr.Slider(
            label="Minimum README Length",
            value=300,
            minimum=0,
            maximum=1000,
            step=50,
            interactive=True,
        )
        needs_server_preview = gr.Checkbox(
            label="Exclude datasets without datasets-server preview?",
            value=False,
            interactive=True,
        )

    # The table's initial value is produced by calling filter_df itself.
    output = gr.DataFrame(
        filter_df, datatype="markdown", min_width=160 * 2.5, height=1000
    )

    # Every control re-runs filter_df with the current state of all three
    # controls and writes the result into the table.
    filter_inputs = [max_age_days, min_len, needs_server_preview]
    for register_listener in (
        max_age_days.input,
        min_len.input,
        needs_server_preview.change,
    ):
        register_listener(filter_df, inputs=filter_inputs, outputs=[output])

demo.launch()
|