asoria (HF staff) committed on
Commit 4be1a5c
1 Parent(s): cd5f2d1

Update app.py

Files changed (1)
  1. app.py +455 -434
app.py CHANGED
@@ -1,441 +1,462 @@
- import requests
- import logging
- import duckdb
- import numpy as np
- from torch import cuda
- from gradio_huggingfacehub_search import HuggingfaceHubSearch
- from bertopic import BERTopic
- from bertopic.representation import KeyBERTInspired
- from umap import UMAP
- from hdbscan import HDBSCAN
- from sklearn.feature_extraction.text import CountVectorizer
-
- from sentence_transformers import SentenceTransformer
-
- from dotenv import load_dotenv
- import os
-
- import spaces
- import gradio as gr


- """
- TODOs:
- - Try for small dataset <1000 rows
  """

- load_dotenv()
- HF_TOKEN = os.getenv("HF_TOKEN")
- assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
-
- logging.basicConfig(
- level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
- )
-
- MAX_ROWS = 5_000
- CHUNK_SIZE = 1_000
-
-
- session = requests.Session()
- sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
- keybert = KeyBERTInspired()
- vectorizer_model = CountVectorizer(stop_words="english")
-
- representation_model = KeyBERTInspired()
-
- global_topic_model = None
-
-
- def get_split_rows(dataset, config, split):
- config_size = session.get(
- f"https://datasets-server.huggingface.co/size?dataset={dataset}&config={config}",
- timeout=20,
- ).json()
- if "error" in config_size:
- raise Exception(f"Error fetching config size: {config_size['error']}")
- split_size = next(
- (s for s in config_size["size"]["splits"] if s["split"] == split),
- None,
- )
- if split_size is None:
- raise Exception(f"Error fetching split {split} in config {config}")
- return split_size["num_rows"]
-
-
- def get_parquet_urls(dataset, config, split):
- parquet_files = session.get(
- f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
- timeout=20,
- ).json()
- if "error" in parquet_files:
- raise Exception(f"Error fetching parquet files: {parquet_files['error']}")
- parquet_urls = [file["url"] for file in parquet_files["parquet_files"]]
- logging.debug(f"Parquet files: {parquet_urls}")
- return ",".join(f"'{url}'" for url in parquet_urls)
-
-
- def get_docs_from_parquet(parquet_urls, column, offset, limit):
- SQL_QUERY = f"SELECT {column} FROM read_parquet([{parquet_urls}]) LIMIT {limit} OFFSET {offset};"
- df = duckdb.sql(SQL_QUERY).to_df()
- logging.debug(f"Dataframe: {df.head(5)}")
- return df[column].tolist()
-
-
- @spaces.GPU
- def calculate_embeddings(docs):
- return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
-
-
- def calculate_n_neighbors_and_components(n_rows):
- n_neighbors = min(max(n_rows // 20, 15), 100)
- n_components = 10 if n_rows > 1000 else 5 # Higher components for larger datasets
- return n_neighbors, n_components
-
-
- @spaces.GPU
- def fit_model(docs, embeddings, n_neighbors, n_components):
- global global_topic_model
-
- umap_model = UMAP(
- n_neighbors=n_neighbors,
- n_components=n_components,
- min_dist=0.0,
- metric="cosine",
- random_state=42,
- )
-
- hdbscan_model = HDBSCAN(
- min_cluster_size=max(
- 5, n_neighbors // 2
- ), # Reducing min_cluster_size for fewer outliers
- metric="euclidean",
- cluster_selection_method="eom",
- prediction_data=True,
- )
-
- new_model = BERTopic(
- language="english",
- # Sub-models
- embedding_model=sentence_model,
- umap_model=umap_model,
- hdbscan_model=hdbscan_model,
- representation_model=representation_model,
- vectorizer_model=vectorizer_model,
- # Hyperparameters
- top_n_words=10,
- verbose=True,
- min_topic_size=n_neighbors, # Coherent with n_neighbors?
- )
- logging.info("Fitting new model")
- new_model.fit(docs, embeddings)
- logging.info("End fitting new model")
-
- global_topic_model = new_model
-
- logging.info("Global model updated")
-
-
- def generate_topics(dataset, config, split, column, nested_column):
- logging.info(
- f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
- )
-
- parquet_urls = get_parquet_urls(dataset, config, split)
- split_rows = get_split_rows(dataset, config, split)
- logging.info(f"Split rows: {split_rows}")
-
- limit = min(split_rows, MAX_ROWS)
- n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
-
- reduce_umap_model = UMAP(
- n_neighbors=n_neighbors,
- n_components=2, # For visualization, keeping it at 2 (2D)
- min_dist=0.0,
- metric="cosine",
- random_state=42,
- )
-
- offset = 0
- rows_processed = 0
-
- base_model = None
- all_docs = []
- reduced_embeddings_list = []
- topics_info, topic_plot = None, None
- yield (
- gr.DataFrame(value=[], interactive=False, visible=True),
- gr.Plot(value=None, visible=True),
- gr.Label(
- {f"⚙️ Generating topics {dataset}": rows_processed / limit}, visible=True
- ),
- )
- while offset < limit:
- docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
- if not docs:
- break
-
- logging.info(
- f"----> Processing chunk: {offset=} {CHUNK_SIZE=} with {len(docs)} docs"
- )
-
- embeddings = calculate_embeddings(docs)
- fit_model(docs, embeddings, n_neighbors, n_components)
-
- if base_model is None:
- base_model = global_topic_model
- else:
- updated_model = BERTopic.merge_models([base_model, global_topic_model])
- nr_new_topics = len(set(updated_model.topics_)) - len(
- set(base_model.topics_)
- )
- new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
- logging.info(f"The following topics are newly found: {new_topics}")
- base_model = updated_model
-
- reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
- reduced_embeddings_list.append(reduced_embeddings)
-
- all_docs.extend(docs)
-
- topics_info = base_model.get_topic_info()
- topic_plot = base_model.visualize_documents(
- all_docs,
- reduced_embeddings=np.vstack(reduced_embeddings_list),
- custom_labels=True,
- )
-
- rows_processed += len(docs)
- progress = min(rows_processed / limit, 1.0)
- logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
- yield (
- topics_info,
- topic_plot,
- gr.Label({f"⚙️ Generating topics {dataset}": progress}, visible=True),
- )
-
- offset += CHUNK_SIZE
-
- logging.info("Finished processing all data")
- yield (
- topics_info,
- topic_plot,
- gr.Label({f"✅ Generating topics {dataset}": 1.0}, visible=True),
- )
- cuda.empty_cache()
-
-
  with gr.Blocks() as demo:
- gr.Markdown("# 💠 Dataset Topic Discovery 🔭")
- gr.Markdown("## Select dataset and text column")
- with gr.Accordion("Data details", open=True):
- with gr.Row():
- with gr.Column(scale=3):
- dataset_name = HuggingfaceHubSearch(
- label="Hub Dataset ID",
- placeholder="Search for dataset id on Huggingface",
- search_type="dataset",
- )
- subset_dropdown = gr.Dropdown(label="Subset", visible=False)
- split_dropdown = gr.Dropdown(label="Split", visible=False)
-
- with gr.Accordion("Dataset preview", open=False):
-
- @gr.render(inputs=[dataset_name, subset_dropdown, split_dropdown])
- def embed(name, subset, split):
- html_code = f"""
- <iframe
- src="https://huggingface.co/datasets/{name}/embed/viewer/{subset}/{split}"
- frameborder="0"
- width="100%"
- height="600px"
- ></iframe>
- """
- return gr.HTML(value=html_code)
-
- with gr.Row():
- text_column_dropdown = gr.Dropdown(label="Text column name")
- nested_text_column_dropdown = gr.Dropdown(
- label="Nested text column name", visible=False
- )
-
- generate_button = gr.Button("Generate Topics", variant="primary")
-
- gr.Markdown("## Datamap")
- full_topics_generation_label = gr.Label(visible=False, show_label=False)
- topics_plot = gr.Plot()
- with gr.Accordion("Topics Info", open=False):
- topics_df = gr.DataFrame(interactive=False, visible=True)
- generate_button.click(
- generate_topics,
- inputs=[
- dataset_name,
- subset_dropdown,
- split_dropdown,
- text_column_dropdown,
- nested_text_column_dropdown,
- ],
- outputs=[topics_df, topics_plot, full_topics_generation_label],
- )
-
- def _resolve_dataset_selection(
- dataset: str, default_subset: str, default_split: str, text_feature
- ):
- if "/" not in dataset.strip().strip("/"):
- return {
- subset_dropdown: gr.Dropdown(visible=False),
- split_dropdown: gr.Dropdown(visible=False),
- text_column_dropdown: gr.Dropdown(label="Text column name"),
- nested_text_column_dropdown: gr.Dropdown(visible=False),
- }
- info_resp = session.get(
- f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=20
- ).json()
- if "error" in info_resp:
- return {
- subset_dropdown: gr.Dropdown(visible=False),
- split_dropdown: gr.Dropdown(visible=False),
- text_column_dropdown: gr.Dropdown(label="Text column name"),
- nested_text_column_dropdown: gr.Dropdown(visible=False),
- }
- subsets: list[str] = list(info_resp["dataset_info"])
- subset = default_subset if default_subset in subsets else subsets[0]
- splits: list[str] = list(info_resp["dataset_info"][subset]["splits"])
- split = default_split if default_split in splits else splits[0]
- features = info_resp["dataset_info"][subset]["features"]
-
- def _is_string_feature(feature):
- return isinstance(feature, dict) and feature.get("dtype") == "string"
-
- text_features = [
- feature_name
- for feature_name, feature in features.items()
- if _is_string_feature(feature)
- ]
- nested_features = [
- feature_name
- for feature_name, feature in features.items()
- if isinstance(feature, dict)
- and isinstance(next(iter(feature.values())), dict)
- ]
- nested_text_features = [
- feature_name
- for feature_name in nested_features
- if any(
- _is_string_feature(nested_feature)
- for nested_feature in features[feature_name].values()
- )
- ]
- if not text_feature:
- return {
- subset_dropdown: gr.Dropdown(
- value=subset, choices=subsets, visible=len(subsets) > 1
- ),
- split_dropdown: gr.Dropdown(
- value=split, choices=splits, visible=len(splits) > 1
- ),
- text_column_dropdown: gr.Dropdown(
- choices=text_features + nested_text_features,
- label="Text column name",
- ),
- nested_text_column_dropdown: gr.Dropdown(visible=False),
- }
- if text_feature in nested_text_features:
- nested_keys = [
- feature_name
- for feature_name, feature in features[text_feature].items()
- if _is_string_feature(feature)
- ]
- return {
- subset_dropdown: gr.Dropdown(
- value=subset, choices=subsets, visible=len(subsets) > 1
- ),
- split_dropdown: gr.Dropdown(
- value=split, choices=splits, visible=len(splits) > 1
- ),
- text_column_dropdown: gr.Dropdown(
- choices=text_features + nested_text_features,
- label="Text column name",
- ),
- nested_text_column_dropdown: gr.Dropdown(
- value=nested_keys[0],
- choices=nested_keys,
- label="Nested text column name",
- visible=True,
- ),
- }
- return {
- subset_dropdown: gr.Dropdown(
- value=subset, choices=subsets, visible=len(subsets) > 1
- ),
- split_dropdown: gr.Dropdown(
- value=split, choices=splits, visible=len(splits) > 1
- ),
- text_column_dropdown: gr.Dropdown(
- choices=text_features + nested_text_features, label="Text column name"
- ),
- nested_text_column_dropdown: gr.Dropdown(visible=False),
- }
-
- @dataset_name.change(
- inputs=[dataset_name],
- outputs=[
- subset_dropdown,
- split_dropdown,
- text_column_dropdown,
- nested_text_column_dropdown,
- ],
- )
- def show_input_from_subset_dropdown(dataset: str) -> dict:
- return _resolve_dataset_selection(
- dataset, default_subset="default", default_split="train", text_feature=None
- )
-
- @subset_dropdown.change(
- inputs=[dataset_name, subset_dropdown],
- outputs=[
- subset_dropdown,
- split_dropdown,
- text_column_dropdown,
- nested_text_column_dropdown,
- ],
- )
- def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
- return _resolve_dataset_selection(
- dataset, default_subset=subset, default_split="train", text_feature=None
- )
-
- @split_dropdown.change(
- inputs=[dataset_name, subset_dropdown, split_dropdown],
- outputs=[
- subset_dropdown,
- split_dropdown,
- text_column_dropdown,
- nested_text_column_dropdown,
- ],
- )
- def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
- return _resolve_dataset_selection(
- dataset, default_subset=subset, default_split=split, text_feature=None
- )
-
- @text_column_dropdown.change(
- inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown],
- outputs=[
- subset_dropdown,
- split_dropdown,
- text_column_dropdown,
- nested_text_column_dropdown,
- ],
- )
- def show_input_from_text_column_dropdown(
- dataset: str, subset: str, split: str, text_column
- ) -> dict:
- return _resolve_dataset_selection(
- dataset,
- default_subset=subset,
- default_split=split,
- text_feature=text_column,
- )
-

  demo.launch()

+ # import requests
+ # import logging
+ # import duckdb
+ # import numpy as np
+ # from torch import cuda
+ # from gradio_huggingfacehub_search import HuggingfaceHubSearch
+ # from bertopic import BERTopic
+ # from bertopic.representation import KeyBERTInspired
+ # from umap import UMAP
+ # from hdbscan import HDBSCAN
+ # from sklearn.feature_extraction.text import CountVectorizer
+
+ # from sentence_transformers import SentenceTransformer
+
+ # from dotenv import load_dotenv
+ # import os
+
+ # import spaces
+ # import gradio as gr
+
+
+ # """
+ # TODOs:
+ # - Try for small dataset <1000 rows
+ # """
+
+ # load_dotenv()
+ # HF_TOKEN = os.getenv("HF_TOKEN")
+ # assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
+
+ # logging.basicConfig(
+ # level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ # )
+
+ # MAX_ROWS = 5_000
+ # CHUNK_SIZE = 1_000
+
+
+ # session = requests.Session()
+ # sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
+ # keybert = KeyBERTInspired()
+ # vectorizer_model = CountVectorizer(stop_words="english")
+
+ # representation_model = KeyBERTInspired()
+
+ # global_topic_model = None
+
+
+ # def get_split_rows(dataset, config, split):
+ # config_size = session.get(
+ # f"https://datasets-server.huggingface.co/size?dataset={dataset}&config={config}",
+ # timeout=20,
+ # ).json()
+ # if "error" in config_size:
+ # raise Exception(f"Error fetching config size: {config_size['error']}")
+ # split_size = next(
+ # (s for s in config_size["size"]["splits"] if s["split"] == split),
+ # None,
+ # )
+ # if split_size is None:
+ # raise Exception(f"Error fetching split {split} in config {config}")
+ # return split_size["num_rows"]
+
+
+ # def get_parquet_urls(dataset, config, split):
+ # parquet_files = session.get(
+ # f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
+ # timeout=20,
+ # ).json()
+ # if "error" in parquet_files:
+ # raise Exception(f"Error fetching parquet files: {parquet_files['error']}")
+ # parquet_urls = [file["url"] for file in parquet_files["parquet_files"]]
+ # logging.debug(f"Parquet files: {parquet_urls}")
+ # return ",".join(f"'{url}'" for url in parquet_urls)
+
+
+ # def get_docs_from_parquet(parquet_urls, column, offset, limit):
+ # SQL_QUERY = f"SELECT {column} FROM read_parquet([{parquet_urls}]) LIMIT {limit} OFFSET {offset};"
+ # df = duckdb.sql(SQL_QUERY).to_df()
+ # logging.debug(f"Dataframe: {df.head(5)}")
+ # return df[column].tolist()
+
+
+ # @spaces.GPU
+ # def calculate_embeddings(docs):
+ # return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
+
+
+ # def calculate_n_neighbors_and_components(n_rows):
+ # n_neighbors = min(max(n_rows // 20, 15), 100)
+ # n_components = 10 if n_rows > 1000 else 5 # Higher components for larger datasets
+ # return n_neighbors, n_components
+
+
+ # @spaces.GPU
+ # def fit_model(docs, embeddings, n_neighbors, n_components):
+ # global global_topic_model
+
+ # umap_model = UMAP(
+ # n_neighbors=n_neighbors,
+ # n_components=n_components,
+ # min_dist=0.0,
+ # metric="cosine",
+ # random_state=42,
+ # )
+
+ # hdbscan_model = HDBSCAN(
+ # min_cluster_size=max(
+ # 5, n_neighbors // 2
+ # ), # Reducing min_cluster_size for fewer outliers
+ # metric="euclidean",
+ # cluster_selection_method="eom",
+ # prediction_data=True,
+ # )
+
+ # new_model = BERTopic(
+ # language="english",
+ # # Sub-models
+ # embedding_model=sentence_model,
+ # umap_model=umap_model,
+ # hdbscan_model=hdbscan_model,
+ # representation_model=representation_model,
+ # vectorizer_model=vectorizer_model,
+ # # Hyperparameters
+ # top_n_words=10,
+ # verbose=True,
+ # min_topic_size=n_neighbors, # Coherent with n_neighbors?
+ # )
+ # logging.info("Fitting new model")
+ # new_model.fit(docs, embeddings)
+ # logging.info("End fitting new model")
+
+ # global_topic_model = new_model
+
+ # logging.info("Global model updated")
+
+
+ # def generate_topics(dataset, config, split, column, nested_column):
+ # logging.info(
+ # f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
+ # )
+
+ # parquet_urls = get_parquet_urls(dataset, config, split)
+ # split_rows = get_split_rows(dataset, config, split)
+ # logging.info(f"Split rows: {split_rows}")
+
+ # limit = min(split_rows, MAX_ROWS)
+ # n_neighbors, n_components = calculate_n_neighbors_and_components(limit)
+
+ # reduce_umap_model = UMAP(
+ # n_neighbors=n_neighbors,
+ # n_components=2, # For visualization, keeping it at 2 (2D)
+ # min_dist=0.0,
+ # metric="cosine",
+ # random_state=42,
+ # )
+
+ # offset = 0
+ # rows_processed = 0
+
+ # base_model = None
+ # all_docs = []
+ # reduced_embeddings_list = []
+ # topics_info, topic_plot = None, None
+ # yield (
+ # gr.DataFrame(value=[], interactive=False, visible=True),
+ # gr.Plot(value=None, visible=True),
+ # gr.Label(
+ # {f"⚙️ Generating topics {dataset}": rows_processed / limit}, visible=True
+ # ),
+ # )
+ # while offset < limit:
+ # docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
+ # if not docs:
+ # break
+
+ # logging.info(
+ # f"----> Processing chunk: {offset=} {CHUNK_SIZE=} with {len(docs)} docs"
+ # )
+
+ # embeddings = calculate_embeddings(docs)
+ # fit_model(docs, embeddings, n_neighbors, n_components)
+
+ # if base_model is None:
+ # base_model = global_topic_model
+ # else:
+ # updated_model = BERTopic.merge_models([base_model, global_topic_model])
+ # nr_new_topics = len(set(updated_model.topics_)) - len(
+ # set(base_model.topics_)
+ # )
+ # new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
+ # logging.info(f"The following topics are newly found: {new_topics}")
+ # base_model = updated_model
+
+ # reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
+ # reduced_embeddings_list.append(reduced_embeddings)
+
+ # all_docs.extend(docs)
+
+ # topics_info = base_model.get_topic_info()
+ # topic_plot = base_model.visualize_documents(
+ # all_docs,
+ # reduced_embeddings=np.vstack(reduced_embeddings_list),
+ # custom_labels=True,
+ # )
+
+ # rows_processed += len(docs)
+ # progress = min(rows_processed / limit, 1.0)
+ # logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")
+ # yield (
+ # topics_info,
+ # topic_plot,
+ # gr.Label({f"⚙️ Generating topics {dataset}": progress}, visible=True),
+ # )
+
+ # offset += CHUNK_SIZE
+
+ # logging.info("Finished processing all data")
+ # yield (
+ # topics_info,
+ # topic_plot,
+ # gr.Label({f"✅ Generating topics {dataset}": 1.0}, visible=True),
+ # )
+ # cuda.empty_cache()
+
+
+ # with gr.Blocks() as demo:
+ # gr.Markdown("# 💠 Dataset Topic Discovery 🔭")
+ # gr.Markdown("## Select dataset and text column")
+ # with gr.Accordion("Data details", open=True):
+ # with gr.Row():
+ # with gr.Column(scale=3):
+ # dataset_name = HuggingfaceHubSearch(
+ # label="Hub Dataset ID",
+ # placeholder="Search for dataset id on Huggingface",
+ # search_type="dataset",
+ # )
+ # subset_dropdown = gr.Dropdown(label="Subset", visible=False)
+ # split_dropdown = gr.Dropdown(label="Split", visible=False)
+
+ # with gr.Accordion("Dataset preview", open=False):
+
+ # @gr.render(inputs=[dataset_name, subset_dropdown, split_dropdown])
+ # def embed(name, subset, split):
+ # html_code = f"""
+ # <iframe
+ # src="https://huggingface.co/datasets/{name}/embed/viewer/{subset}/{split}"
+ # frameborder="0"
+ # width="100%"
+ # height="600px"
+ # ></iframe>
+ # """
+ # return gr.HTML(value=html_code)
+
+ # with gr.Row():
+ # text_column_dropdown = gr.Dropdown(label="Text column name")
+ # nested_text_column_dropdown = gr.Dropdown(
+ # label="Nested text column name", visible=False
+ # )
+
+ # generate_button = gr.Button("Generate Topics", variant="primary")
+
+ # gr.Markdown("## Datamap")
+ # full_topics_generation_label = gr.Label(visible=False, show_label=False)
+ # topics_plot = gr.Plot()
+ # with gr.Accordion("Topics Info", open=False):
+ # topics_df = gr.DataFrame(interactive=False, visible=True)
+ # generate_button.click(
+ # generate_topics,
+ # inputs=[
+ # dataset_name,
+ # subset_dropdown,
+ # split_dropdown,
+ # text_column_dropdown,
+ # nested_text_column_dropdown,
+ # ],
+ # outputs=[topics_df, topics_plot, full_topics_generation_label],
+ # )
+
+ # def _resolve_dataset_selection(
+ # dataset: str, default_subset: str, default_split: str, text_feature
+ # ):
+ # if "/" not in dataset.strip().strip("/"):
+ # return {
+ # subset_dropdown: gr.Dropdown(visible=False),
+ # split_dropdown: gr.Dropdown(visible=False),
+ # text_column_dropdown: gr.Dropdown(label="Text column name"),
+ # nested_text_column_dropdown: gr.Dropdown(visible=False),
+ # }
+ # info_resp = session.get(
+ # f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=20
+ # ).json()
+ # if "error" in info_resp:
+ # return {
+ # subset_dropdown: gr.Dropdown(visible=False),
+ # split_dropdown: gr.Dropdown(visible=False),
+ # text_column_dropdown: gr.Dropdown(label="Text column name"),
+ # nested_text_column_dropdown: gr.Dropdown(visible=False),
+ # }
+ # subsets: list[str] = list(info_resp["dataset_info"])
+ # subset = default_subset if default_subset in subsets else subsets[0]
+ # splits: list[str] = list(info_resp["dataset_info"][subset]["splits"])
+ # split = default_split if default_split in splits else splits[0]
+ # features = info_resp["dataset_info"][subset]["features"]
+
+ # def _is_string_feature(feature):
+ # return isinstance(feature, dict) and feature.get("dtype") == "string"
+
+ # text_features = [
+ # feature_name
+ # for feature_name, feature in features.items()
+ # if _is_string_feature(feature)
+ # ]
+ # nested_features = [
+ # feature_name
+ # for feature_name, feature in features.items()
+ # if isinstance(feature, dict)
+ # and isinstance(next(iter(feature.values())), dict)
+ # ]
+ # nested_text_features = [
+ # feature_name
+ # for feature_name in nested_features
+ # if any(
+ # _is_string_feature(nested_feature)
+ # for nested_feature in features[feature_name].values()
+ # )
+ # ]
+ # if not text_feature:
+ # return {
+ # subset_dropdown: gr.Dropdown(
+ # value=subset, choices=subsets, visible=len(subsets) > 1
+ # ),
+ # split_dropdown: gr.Dropdown(
+ # value=split, choices=splits, visible=len(splits) > 1
+ # ),
+ # text_column_dropdown: gr.Dropdown(
+ # choices=text_features + nested_text_features,
+ # label="Text column name",
+ # ),
+ # nested_text_column_dropdown: gr.Dropdown(visible=False),
+ # }
+ # if text_feature in nested_text_features:
+ # nested_keys = [
+ # feature_name
+ # for feature_name, feature in features[text_feature].items()
+ # if _is_string_feature(feature)
+ # ]
+ # return {
+ # subset_dropdown: gr.Dropdown(
+ # value=subset, choices=subsets, visible=len(subsets) > 1
+ # ),
+ # split_dropdown: gr.Dropdown(
+ # value=split, choices=splits, visible=len(splits) > 1
+ # ),
+ # text_column_dropdown: gr.Dropdown(
+ # choices=text_features + nested_text_features,
+ # label="Text column name",
+ # ),
+ # nested_text_column_dropdown: gr.Dropdown(
+ # value=nested_keys[0],
+ # choices=nested_keys,
+ # label="Nested text column name",
+ # visible=True,
+ # ),
+ # }
+ # return {
+ # subset_dropdown: gr.Dropdown(
+ # value=subset, choices=subsets, visible=len(subsets) > 1
+ # ),
+ # split_dropdown: gr.Dropdown(
+ # value=split, choices=splits, visible=len(splits) > 1
+ # ),
+ # text_column_dropdown: gr.Dropdown(
+ # choices=text_features + nested_text_features, label="Text column name"
+ # ),
+ # nested_text_column_dropdown: gr.Dropdown(visible=False),
+ # }
+
+ # @dataset_name.change(
+ # inputs=[dataset_name],
+ # outputs=[
+ # subset_dropdown,
+ # split_dropdown,
+ # text_column_dropdown,
+ # nested_text_column_dropdown,
+ # ],
+ # )
+ # def show_input_from_subset_dropdown(dataset: str) -> dict:
+ # return _resolve_dataset_selection(
+ # dataset, default_subset="default", default_split="train", text_feature=None
+ # )
+
+ # @subset_dropdown.change(
+ # inputs=[dataset_name, subset_dropdown],
+ # outputs=[
+ # subset_dropdown,
+ # split_dropdown,
+ # text_column_dropdown,
+ # nested_text_column_dropdown,
+ # ],
+ # )
+ # def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
+ # return _resolve_dataset_selection(
+ # dataset, default_subset=subset, default_split="train", text_feature=None
+ # )
+
+ # @split_dropdown.change(
+ # inputs=[dataset_name, subset_dropdown, split_dropdown],
+ # outputs=[
+ # subset_dropdown,
+ # split_dropdown,
+ # text_column_dropdown,
+ # nested_text_column_dropdown,
+ # ],
+ # )
+ # def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
+ # return _resolve_dataset_selection(
+ # dataset, default_subset=subset, default_split=split, text_feature=None
+ # )
+
+ # @text_column_dropdown.change(
+ # inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown],
+ # outputs=[
+ # subset_dropdown,
+ # split_dropdown,
+ # text_column_dropdown,
+ # nested_text_column_dropdown,
+ # ],
+ # )
+ # def show_input_from_text_column_dropdown(
+ # dataset: str, subset: str, split: str, text_column
+ # ) -> dict:
+ # return _resolve_dataset_selection(
+ # dataset,
+ # default_subset=subset,
+ # default_split=split,
+ # text_feature=text_column,
+ # )
+
+
+ # demo.launch()

+ import gradio as gr

+ # Full HTML content
+ html_content = """
+ <h1 style="color: blue;">Welcome to My Gradio App</h1>
+ <p>This is a paragraph with <b>bold</b> and <i>italic</i> text.</p>
+ <ul>
+ <li>First item</li>
+ <li>Second item</li>
+ <li>Third item</li>
+ </ul>
+ <img src="https://via.placeholder.com/150" alt="Sample Image">
  """

+ # Create a Gradio interface
  with gr.Blocks() as demo:
+ gr.HTML(html_content)

+ # Launch the app
  demo.launch()