davanstrien HF staff commited on
Commit
e62ac39
1 Parent(s): 75d97b4

add server preview filter

Browse files
Files changed (1) hide show
  1. app.py +61 -17
app.py CHANGED
@@ -1,19 +1,23 @@
1
  import os
2
  from datetime import datetime, timedelta
3
  from sys import platform
 
4
 
5
  import gradio as gr
6
  import pandas as pd
7
  from diskcache import Cache
8
  from dotenv import load_dotenv
9
  from httpx import Client
10
- from huggingface_hub import hf_hub_url, list_datasets
11
  from tqdm.auto import tqdm
12
  from tqdm.contrib.concurrent import thread_map
13
- from huggingface_hub import DatasetCard
14
 
15
  load_dotenv()
16
 
 
 
 
17
 
18
  HF_TOKEN = os.getenv("HF_TOKEN")
19
  USER_AGENT = os.getenv("USER_AGENT")
@@ -46,7 +50,7 @@ def get_three_months_ago():
46
  return now - timedelta(days=90)
47
 
48
 
49
- def get_readme_len(dataset):
50
  try:
51
  url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
52
  resp = client.get(url)
@@ -59,6 +63,25 @@ def get_readme_len(dataset):
59
  return None
60
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  def render_model_hub_link(hub_id):
63
  link = f"https://huggingface.co/datasets/{hub_id}"
64
  return (
@@ -67,23 +90,27 @@ def render_model_hub_link(hub_id):
67
  )
68
 
69
 
70
- @cache.memoize(expire=60 * 60 * 12)
71
  def get_datasets():
72
- return list(tqdm(iter(list_datasets(limit=None, full=True))))
 
 
 
 
 
 
73
 
74
 
75
- @cache.memoize(expire=60 * 60 * 12)
76
  def load_data():
77
  datasets = get_datasets()
78
  datasets = [add_created_data(dataset) for dataset in tqdm(datasets)]
79
- filtered = [ds for ds in datasets if ds.get("cardData")]
80
- filtered = [ds for ds in filtered if ds["created"] > get_three_months_ago()]
81
  ds_with_len = thread_map(get_readme_len, filtered)
82
  ds_with_len = [ds for ds in ds_with_len if ds is not None]
83
- return ds_with_len
84
-
85
-
86
- remove_orgs = {"HuggingFaceM4", "HuggingFaceBR4", "open-llm-leaderboard"}
87
 
88
 
89
  columns_to_drop = [
@@ -103,7 +130,7 @@ columns_to_drop = [
103
  ]
104
 
105
 
106
- def prep_dataframe(remove_orgs_and_users=remove_orgs, columns_to_drop=columns_to_drop):
107
  ds_with_len = load_data()
108
  if remove_orgs_and_users:
109
  ds_with_len = [
@@ -132,8 +159,10 @@ def filter_by_readme_len(df, min_len=None):
132
  return df
133
 
134
 
135
- def filter_df(max_age_days=None, min_len=None):
136
  df = prep_dataframe()
 
 
137
  if max_age_days is not None:
138
  df = filter_df_by_max_age(df, max_age_days=max_age_days)
139
  if min_len is not None:
@@ -164,10 +193,25 @@ with gr.Blocks() as demo:
164
  step=50,
165
  interactive=True,
166
  )
 
 
 
167
 
168
  output = gr.DataFrame(filter_df, datatype="markdown", min_width=160 * 2.5)
169
- max_age_days.input(filter_df, inputs=[max_age_days, min_len], outputs=[output])
170
- min_len.input(filter_df, inputs=[max_age_days, min_len], outputs=[output])
171
-
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  demo.launch()
 
1
  import os
2
  from datetime import datetime, timedelta
3
  from sys import platform
4
+ from typing import Any, Dict
5
 
6
  import gradio as gr
7
  import pandas as pd
8
  from diskcache import Cache
9
  from dotenv import load_dotenv
10
  from httpx import Client
11
+ from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
12
  from tqdm.auto import tqdm
13
  from tqdm.contrib.concurrent import thread_map
14
+
15
 
16
  load_dotenv()
17
 
18
# --- Module-level configuration -------------------------------------------
# LIMIT: cap on how many datasets to fetch from the Hub (None = no cap).
LIMIT = None
# CACHE_TIME: diskcache memoization TTL, in seconds (6 hours).
CACHE_TIME = 6 * 60 * 60
# REMOVE_ORGS: orgs/users whose datasets are excluded from the dashboard.
REMOVE_ORGS = {"HuggingFaceM4", "HuggingFaceBR4", "open-llm-leaderboard"}

# Credentials / client identification, read from the environment (.env).
HF_TOKEN = os.getenv("HF_TOKEN")
USER_AGENT = os.getenv("USER_AGENT")
 
50
  return now - timedelta(days=90)
51
 
52
 
53
+ def get_readme_len(dataset: Dict[str, Any]):
54
  try:
55
  url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
56
  resp = client.get(url)
 
63
  return None
64
 
65
 
66
def check_ds_server_valid(id):
    """Return True when the datasets-server reports a preview for dataset `id`.

    Queries the `is-valid` endpoint; any non-200 response, JSON decode
    failure, or missing/None `preview` field yields False (best-effort).
    """
    url = f"https://datasets-server.huggingface.co/is-valid?dataset={id}"
    response = client.get(url)
    if response.status_code != 200:
        return False
    try:
        # `preview` may be absent or None; both mean "no preview".
        preview_value = response.json().get("preview")
    except Exception as e:
        # Best-effort: log the parse failure and treat as "no preview".
        print(e)
        return False
    return preview_value is not None
78
+
79
+
80
def has_server_preview(dataset):
    """Annotate `dataset` (in place) with a boolean `server_preview` flag."""
    is_previewable = check_ds_server_valid(dataset["id"])
    dataset["server_preview"] = is_previewable
    return dataset
83
+
84
+
85
  def render_model_hub_link(hub_id):
86
  link = f"https://huggingface.co/datasets/{hub_id}"
87
  return (
 
90
  )
91
 
92
 
93
@cache.memoize(expire=CACHE_TIME)
def get_datasets():
    """Fetch metadata for Hub datasets, most recently modified first.

    Results are memoized on disk for CACHE_TIME seconds.

    Returns:
        list: dataset-info objects from `huggingface_hub.list_datasets`.
    """
    # `list_datasets` already returns an iterable, so the extra `iter()`
    # wrapper was redundant; tqdm only adds a progress bar while consuming it.
    return list(
        tqdm(list_datasets(limit=LIMIT, full=True, sort="lastModified", direction=-1))
    )
102
 
103
 
104
@cache.memoize(expire=CACHE_TIME)
def load_data():
    """Build the dashboard dataset list (disk-cached for CACHE_TIME seconds).

    Pipeline: fetch all datasets, attach a parsed creation date, keep only
    those created in the last ~90 days, then enrich each entry (in parallel)
    with README length and datasets-server preview status.
    """
    all_datasets = get_datasets()
    all_datasets = [add_created_data(ds) for ds in tqdm(all_datasets)]

    cutoff = get_three_months_ago()
    recent = [ds for ds in all_datasets if ds["created"] > cutoff]

    # README fetches are network-bound, so fan them out across threads;
    # entries whose README could not be fetched come back as None.
    enriched = thread_map(get_readme_len, recent)
    enriched = [ds for ds in enriched if ds is not None]

    with_preview = thread_map(has_server_preview, enriched)
    return [ds for ds in with_preview if ds is not None]
 
114
 
115
 
116
  columns_to_drop = [
 
130
  ]
131
 
132
 
133
+ def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to_drop):
134
  ds_with_len = load_data()
135
  if remove_orgs_and_users:
136
  ds_with_len = [
 
159
  return df
160
 
161
 
162
def filter_df(max_age_days=None, min_len=None, needs_server_preview: bool = False):
    """Return the prepared dataframe filtered by the UI controls.

    Args:
        max_age_days: keep only datasets created within this many days (None = no filter).
        min_len: keep only datasets whose README is at least this long (None = no filter).
        needs_server_preview: keep only datasets with a datasets-server preview.
    """
    df = prep_dataframe()
    if needs_server_preview:
        # BUG FIX: the original `df[df["server_preview"] is True]` compared the
        # Series *object's identity* to True (always False) instead of testing
        # each row; index with the boolean Series itself as the mask.
        df = df[df["server_preview"]]
    if max_age_days is not None:
        df = filter_df_by_max_age(df, max_age_days=max_age_days)
    if min_len is not None:
        df = filter_by_readme_len(df, min_len=min_len)
    return df
 
193
  step=50,
194
  interactive=True,
195
  )
196
+ needs_server_preview = gr.Checkbox(
197
+ label="Needs Server Preview", default=False, interactive=True
198
+ )
199
 
200
  output = gr.DataFrame(filter_df, datatype="markdown", min_width=160 * 2.5)
201
+ max_age_days.input(
202
+ filter_df,
203
+ inputs=[max_age_days, min_len, needs_server_preview],
204
+ outputs=[output],
205
+ )
206
+ min_len.input(
207
+ filter_df,
208
+ inputs=[max_age_days, min_len, needs_server_preview],
209
+ outputs=[output],
210
+ )
211
+ needs_server_preview.change(
212
+ filter_df,
213
+ inputs=[max_age_days, min_len, needs_server_preview],
214
+ outputs=[output],
215
+ )
216
 
217
  demo.launch()