davanstrien committed
Commit 4283f40
Parent(s): 32f75dc

Files changed (3):
  1. app.py +202 -123
  2. requirements.in +2 -1
  3. requirements.txt +23 -15
app.py CHANGED
@@ -1,21 +1,28 @@
+import asyncio
 import os
+import time
 from datetime import datetime, timedelta, timezone
 from typing import Any, Dict
 
 import gradio as gr
 import pandas as pd
+import polars as pl
 from cachetools import TTLCache, cached
+from cashews import cache
+from datasets import Dataset
 from dotenv import load_dotenv
-from httpx import Client
+from httpx import AsyncClient, Client
 from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
 from tqdm.auto import tqdm
-from tqdm.contrib.concurrent import thread_map
+
+
+cache.setup("mem://")
 
 load_dotenv()
 
-LIMIT = 5_000
+LIMIT = 15_000
 
-CACHE_TIME = 60 * 60 * 12  # 12 hours
+CACHE_TIME = 60 * 60 * 1  # 1 hour
 REMOVE_ORGS = {
     "HuggingFaceM4",
     "HuggingFaceBR4",
@@ -35,59 +42,117 @@ headers = {"authorization": f"Bearer {HF_TOKEN}", "user-agent": USER_AGENT}
 
 client = Client(
     headers=headers,
-    timeout=120,
+    timeout=30,
 )
-# LOCAL = False
-# if platform == "darwin":
-#     LOCAL = True
-# cache_dir = "cache" if LOCAL else "/data/diskcache"
-# cache = Cache(cache_dir)
+async_client = AsyncClient(
+    headers=headers,
+    timeout=30,
+    http2=True,
+)
+
 cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
 
 
-def get_three_months_ago():
-    now = datetime.now(timezone.utc)
-    return now - timedelta(days=90)
+@cached(cache)
+def get_initial_data():
+    datasets = list_datasets(
+        limit=LIMIT,
+        sort="createdAt",
+        direction=-1,
+        expand=[
+            "trendingScore",
+            "createdAt",
+            "author",
+            "downloads",
+            "likes",
+            "cardData",
+            "lastModified",
+            "private",
+        ],
+    )
+    return [d.__dict__ for d in tqdm(datasets)]
+
+
+keep_initial = [
+    "id",
+    "author",
+    "created_at",
+    "last_modified",
+    "private",
+    "downloads",
+    "likes",
+    "trending_score",
+    "card_data",
+    "cardData",
+]
+
+keep_final = [
+    "id",
+    "author",
+    "created_at",
+    "last_modified",
+    "downloads",
+    "likes",
+    "trending_score",
+]
 
 
-def add_created_data(dataset):
-    _id = dataset._id
-    created = dataset.created_at
-    dataset_dict = dataset.__dict__
-    dataset_dict["createdAt"] = created
-    return dataset_dict
+def prepare_initial_df():
+    ds = get_initial_data()
+    df = pl.LazyFrame(ds).select(keep_initial)
+    # remove private datasets
+    df = df.filter(~pl.col("private"))
+    df = df.filter(~pl.col("author").is_in(REMOVE_ORGS))
+    df = df.filter(~pl.col("id").str.contains("my-distiset"))
+    df = df.select(keep_final)
+    return df.collect()
 
 
-def get_readme_len(dataset: Dict[str, Any]):
+async def get_readme_len(row: Dict[str, Any]):
+    SEMPAHORE = asyncio.Semaphore(30)
     try:
-        url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
-        resp = client.get(url)
-        if resp.status_code == 200:
-            card = DatasetCard(resp.text)
-            dataset["len"] = len(card.text)
-        return dataset
+        url = hf_hub_url(row["id"], "README.md", repo_type="dataset")
+        async with SEMPAHORE:
+            resp = await async_client.get(url)
+        if resp.status_code == 200:
+            card = DatasetCard(resp.text)
+            row["len"] = len(card.text)
+        else:
+            row["len"] = 0  # Use 0 instead of None to avoid type issues
+        return row
     except Exception as e:
         print(e)
-        return None
+        row["len"] = 0  # Use 0 instead of None to avoid type issues
+        return row
+
+
+def prepare_data_with_readme_len(df: pl.DataFrame):
+    ds = Dataset.from_polars(df)
+    ds = ds.map(get_readme_len)
+    return ds
 
 
-def check_ds_server_valid(id):
-    url = f"https://datasets-server.huggingface.co/is-valid?dataset={id}"
-    response = client.get(url)
-    if response.status_code != 200:
-        return False
+async def check_ds_server_valid(row):
+    SEMPAHORE = asyncio.Semaphore(10)
     try:
+        url = f"https://datasets-server.huggingface.co/is-valid?dataset={row['id']}"
+        async with SEMPAHORE:
+            response = await async_client.get(url)
+        if response.status_code != 200:
+            row["has_server_preview"] = False
         data = response.json()
         preview = data.get("preview")
-        return preview is not None
+        row["has_server_preview"] = preview is not None
+        return row
     except Exception as e:
         print(e)
-        return False
+        row["has_server_preview"] = False
+        return row
 
 
-def has_server_preview(dataset):
-    dataset["server_preview"] = check_ds_server_valid(dataset["id"])
-    return dataset
+def prep_data_with_server_preview(ds):
+    ds = ds.map(check_ds_server_valid)
+    return ds.to_polars()
 
 
 def render_model_hub_link(hub_id):
@@ -98,90 +163,117 @@ def render_model_hub_link(hub_id):
     )
 
 
-@cached(cache)
-def get_datasets():
-    return list(
-        tqdm(
-            iter(list_datasets(limit=LIMIT, full=True, sort="createdAt", direction=-1))
-        )
+def prep_final_data():
+    # Check if we have a valid cached parquet file
+    cache_dir = "cache"
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Get current time and calculate cache validity
+    now = time.time()
+    cache_valid_time = (
+        now - CACHE_TIME
+    )  # Cache is valid if created within the last CACHE_TIME seconds
+
+    # Look for valid cache files
+    valid_cache_file = None
+    for filename in os.listdir(cache_dir):
+        if filename.startswith("dataset_cache_") and filename.endswith(".parquet"):
+            try:
+                # Extract timestamp from filename
+                timestamp = float(
+                    filename.replace("dataset_cache_", "").replace(".parquet", "")
+                )
+                if timestamp > cache_valid_time:
+                    valid_cache_file = os.path.join(cache_dir, filename)
+                    break
+            except ValueError:
+                continue
+
+    # If we have a valid cache file, load it
+    if valid_cache_file:
+        print(f"Loading data from cache: {valid_cache_file}")
+        return pl.read_parquet(valid_cache_file)
+
+    # Otherwise, generate the data and cache it
+    print("Generating fresh data...")
+    df = prepare_initial_df()
+    ds = prepare_data_with_readme_len(df)
+    df = prep_data_with_server_preview(ds)
+
+    # Format the ID column as HTML links using string concatenation instead of regex
+    df = df.with_columns(
+        (
+            pl.lit('<a target="_blank" href="https://huggingface.co/datasets/')
+            + pl.col("id")
+            + pl.lit(
+                '" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">'
+            )
+            + pl.col("id")
+            + pl.lit("</a>")
+        ).alias("hub_id")
     )
+    df = df.drop("id")
+    df = df.sort(by=["trending_score", "likes", "downloads", "len"], descending=True)
+    # make hub_id column first column
+    print(df.columns)
+    df = df.select(
+        [
+            "hub_id",
+            "author",
+            "created_at",
+            "last_modified",
+            "downloads",
+            "likes",
+            "trending_score",
+            "len",
+            "has_server_preview",
+        ]
+    )
+    # Save to cache
+    cache_file = os.path.join(cache_dir, f"dataset_cache_{now}.parquet")
+    df.write_parquet(cache_file)
+
+    # Clean up old cache files
+    for filename in os.listdir(cache_dir):
+        if filename.startswith("dataset_cache_") and filename.endswith(".parquet"):
+            try:
+                timestamp = float(
+                    filename.replace("dataset_cache_", "").replace(".parquet", "")
+                )
+                if timestamp <= cache_valid_time:
+                    os.remove(os.path.join(cache_dir, filename))
+            except ValueError:
+                continue
 
-
-@cached(cache)
-def load_data():
-    datasets = get_datasets()
-    datasets = [add_created_data(dataset) for dataset in tqdm(datasets)]
-    # datasets = [dataset.__dict__ for dataset in tqdm(datasets)]
-    filtered = [ds for ds in datasets if ds["createdAt"] > get_three_months_ago()]
-    ds_with_len = thread_map(get_readme_len, filtered)
-    ds_with_len = [ds for ds in ds_with_len if ds is not None]
-    ds_with_valid_status = thread_map(has_server_preview, ds_with_len)
-    ds_with_valid_status = [ds for ds in ds_with_valid_status if ds is not None]
-    return ds_with_valid_status
-
-
-columns_to_drop = [
-    "cardData",
-    "gated",
-    "sha",
-    "tags",
-    "description",
-    "siblings",
-    "disabled",
-    "_id",
-    "private",
-    "author",
-    # "citation",
-    "lastModified",
-]
+    return df
 
 
-def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to_drop):
-    ds_with_len = load_data()
-    if remove_orgs_and_users:
-        ds_with_len = [
-            ds for ds in ds_with_len if ds["author"] not in remove_orgs_and_users
-        ]
-    df = pd.DataFrame(ds_with_len)
-    df["id"] = df["id"].apply(render_model_hub_link)
-    if columns_to_drop:
-        df = df.drop(columns=columns_to_drop)
-    df = df.sort_values(by=["likes", "downloads", "len"], ascending=False)
+def filter_by_max_age(df, max_age_days):
+    df = df.filter(
+        pl.col("created_at")
+        > (datetime.now(timezone.utc) - timedelta(days=max_age_days))
+    )
     return df
 
 
-def filter_df_by_max_age(df, max_age_days=None):
-    df = df.dropna(subset=["createdAt"])
-    now = datetime.now(timezone.utc)
-    if max_age_days is not None:
-        max_date = now - timedelta(days=max_age_days)
-        df = df[df["createdAt"] >= max_date]
+def filter_by_min_len(df, min_len):
+    df = df.filter(pl.col("len") >= min_len)
     return df
 
 
-def filter_by_readme_len(df, min_len=None):
-    if min_len is not None:
-        df = df[df["len"] >= min_len]
+def filter_by_server_preview(df, needs_server_preview):
+    df = df.filter(pl.col("has_server_preview") == needs_server_preview)
     return df
 
 
-def filter_df(max_age_days=None, min_len=None, needs_server_preview: bool = False):
-    try:
-        df = prep_dataframe()
-        if needs_server_preview:
-            df = df[df["server_preview"] == True]
-        if max_age_days is not None:
-            df = filter_df_by_max_age(df, max_age_days=max_age_days)
-        if min_len is not None:
-            df = filter_by_readme_len(df, min_len=min_len)
-        df = df.sort_values(by=["likes", "downloads", "len"], ascending=False)
-        return df
-    except Exception as e:
-        print(f"Error filtering dataframe: {str(e)}")
-        # Return empty dataframe with same columns if there's an error
-        return pd.DataFrame(
-            columns=["id", "likes", "downloads", "len", "createdAt", "server_preview"]
-        )
+def filter_df(max_age_days, min_len, needs_server_preview):
+    df = prep_final_data()
+    df = df.lazy()
+    df = filter_by_max_age(df, max_age_days)
+    df = filter_by_min_len(df, min_len)
+    df = filter_by_server_preview(df, needs_server_preview)
+    df = df.sort(by=["trending_score", "likes", "downloads", "len"], descending=True)
+    return df.collect()
 
 
 with gr.Blocks() as demo:
@@ -212,29 +304,16 @@ with gr.Blocks() as demo:
         interactive=True,
     )
 
-    # gr.Markdown(
-    #     """
-    # <style>
-    # #dataset_table {
-    #     height: 1000px;
-    #     overflow: auto;
-    # }
-    # </style>
-    #     """
-    # )
-
     output = gr.DataFrame(
-        value=filter_df(7, 300, False),  # Set initial values explicitly
+        value=filter_df(7, 300, False),
        interactive=False,
        datatype="markdown",
-        min_width=160 * 2.5,
-        elem_id="dataset_table",
     )
 
     def update_df(age, length, preview):
         return filter_df(age, length, preview)
 
-    # Use a single update function for all inputs
+    # Connect the input components to the update function
     for component in [max_age_days, min_len, needs_server_preview]:
         component.change(
             fn=update_df,
requirements.in CHANGED
@@ -3,7 +3,8 @@ datasets
 datasets
 diskcache
 gradio==5.14.0
-httpx
+httpx[http2]
 huggingface_hub
 pandas
 python-dotenv
+polars
requirements.txt CHANGED
@@ -2,9 +2,9 @@
 # uv pip compile requirements.in -o requirements.txt
 aiofiles==23.2.1
     # via gradio
-aiohappyeyeballs==2.4.4
+aiohappyeyeballs==2.6.1
     # via aiohttp
-aiohttp==3.11.11
+aiohttp==3.11.13
     # via
     #   datasets
     #   fsspec
@@ -17,9 +17,9 @@ anyio==4.8.0
     #   gradio
     #   httpx
     #   starlette
-attrs==25.1.0
+attrs==25.2.0
     # via aiohttp
-cachetools==5.5.1
+cachetools==5.5.2
     # via -r requirements.in
 certifi==2025.1.31
     # via
@@ -32,7 +32,7 @@ click==8.1.8
     # via
     #   typer
     #   uvicorn
-datasets==3.2.0
+datasets==3.3.2
     # via -r requirements.in
 dill==0.3.8
     # via
@@ -40,7 +40,7 @@ dill==0.3.8
     #   multiprocess
 diskcache==5.6.3
     # via -r requirements.in
-fastapi==0.115.8
+fastapi==0.115.11
     # via gradio
 ffmpy==0.5.0
     # via gradio
@@ -52,7 +52,7 @@ frozenlist==1.5.0
     # via
     #   aiohttp
     #   aiosignal
-fsspec==2024.9.0
+fsspec==2024.12.0
     # via
     #   datasets
     #   gradio-client
@@ -65,6 +65,10 @@ h11==0.14.0
     # via
     #   httpcore
     #   uvicorn
+h2==4.2.0
+    # via httpx
+hpack==4.1.0
+    # via h2
 httpcore==1.0.7
     # via httpx
 httpx==0.28.1
@@ -73,19 +77,21 @@ httpx==0.28.1
     #   gradio
     #   gradio-client
     #   safehttpx
-huggingface-hub==0.28.1
+huggingface-hub==0.29.3
     # via
     #   -r requirements.in
     #   datasets
     #   gradio
     #   gradio-client
+hyperframe==6.1.0
+    # via h2
 idna==3.10
     # via
     #   anyio
     #   httpx
     #   requests
     #   yarl
-jinja2==3.1.5
+jinja2==3.1.6
     # via gradio
 markdown-it-py==3.0.0
     # via rich
@@ -101,7 +107,7 @@ multidict==6.1.0
     #   yarl
 multiprocess==0.70.16
     # via datasets
-numpy==2.2.2
+numpy==2.2.3
     # via
     #   datasets
     #   gradio
@@ -121,11 +127,13 @@ pandas==2.2.3
     #   gradio
 pillow==11.1.0
     # via gradio
-propcache==0.2.1
+polars==1.24.0
+    # via -r requirements.in
+propcache==0.3.0
     # via
     #   aiohttp
     #   yarl
-pyarrow==19.0.0
+pyarrow==19.0.1
     # via datasets
 pydantic==2.10.6
     # via
@@ -156,7 +164,7 @@ requests==2.32.3
     #   huggingface-hub
 rich==13.9.4
     # via typer
-ruff==0.9.4
+ruff==0.9.10
     # via gradio
 safehttpx==0.1.6
     # via gradio
@@ -168,7 +176,7 @@ six==1.17.0
     # via python-dateutil
 sniffio==1.3.1
     # via anyio
-starlette==0.45.3
+starlette==0.46.1
     # via
     #   fastapi
     #   gradio
@@ -178,7 +186,7 @@ tqdm==4.67.1
     # via
     #   datasets
     #   huggingface-hub
-typer==0.15.1
+typer==0.15.2
     # via gradio
 typing-extensions==4.12.2
     # via