polinaeterna (HF staff) committed
Commit 352586a • 1 Parent(s): 44cbba4

add prop of non-ascii

Files changed (1): app.py (+37 -5)
app.py CHANGED
@@ -1,10 +1,13 @@
 import requests
 from collections import Counter
 from requests.adapters import HTTPAdapter, Retry
+import multiprocessing
 
 import gradio as gr
 import pandas as pd
 import polars as pl
+import numpy as np
+import matplotlib.pyplot as plt
 import spaces
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from huggingface_hub import PyTorchModelHubMixin
@@ -19,6 +22,21 @@ retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
 session.mount('http://', HTTPAdapter(max_retries=retries))
 
 
+def proportion_non_ascii(s):
+    """
+    Compute the proportion of non-ASCII characters in a string.
+
+    Parameters:
+        s (str): The input string.
+
+    Returns:
+        float: The proportion of non-ASCII characters in the string.
+    """
+    non_ascii_count = sum(1 for c in s if ord(c) > 127)
+    total_chars = len(s)
+    return non_ascii_count / total_chars if total_chars > 0 else 0.0
+
+
 class QualityModel(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config):
         super(QualityModel, self).__init__()
@@ -76,7 +94,7 @@ def run_quality_check(dataset, column, batch_size, num_examples):
     # config = "default"
     info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
     if "error" in info_resp:
-        yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+        yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure()
         return
     config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
     split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
@@ -87,7 +105,7 @@ def run_quality_check(dataset, column, batch_size, num_examples):
     try:
         data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
     except Exception as error:
-        yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+        yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure()
         return
     texts = data[column].to_list()
     # batch_size = 100
@@ -98,8 +116,18 @@ def run_quality_check(dataset, column, batch_size, num_examples):
         batch_predictions = predict(batch_texts)
         predictions.extend(batch_predictions)
         texts_processed.extend(batch_texts)
-        yield {"check in progress...": (i+batch_size) / num_examples}, *plot_and_df(texts_processed, predictions)
-    yield {"finished": 1.}, *plot_and_df(texts_processed, predictions)
+        yield {"check in progress...": (i+batch_size) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure()
+
+    with multiprocessing.Pool(processes=8) as pool:
+        props = pool.map(proportion_non_ascii, texts)
+
+    # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
+    plt.hist(props, bins=20, range=(0., 1.))
+    plt.title('Histogram of proportion of non-ASCII characters')
+    plt.xlabel('Proportion of non-ASCII characters')
+    plt.ylabel('Number of texts')
+
+    yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf()
 
 with gr.Blocks() as demo:
     gr.Markdown(
@@ -135,6 +163,7 @@ with gr.Blocks() as demo:
     progress_bar = gr.Label(show_label=False)
     plot = gr.BarPlot()
 
+
     with gr.Accordion("Explore some individual examples for each class", open=False):
         gr.Markdown("### Low")
         df_low = gr.DataFrame()
@@ -142,6 +171,9 @@ with gr.Blocks() as demo:
         df_medium = gr.DataFrame()
         gr.Markdown("### High")
         df_high = gr.DataFrame()
-    gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[progress_bar, plot, df_low, df_medium, df_high])
+
+    # non_ascii_hist = gr.DataFrame(visible=False)
+    non_ascii_hist = gr.Plot()
+    gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[progress_bar, plot, df_low, df_medium, df_high, non_ascii_hist])
 
 demo.launch()
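
For quick reference, a minimal sketch of how the new `proportion_non_ascii` helper behaves; the function body is copied from the diff above, while the sample strings and printed values are only illustrative:

```python
def proportion_non_ascii(s):
    """Proportion of characters in `s` outside the ASCII range (as added in this commit)."""
    non_ascii_count = sum(1 for c in s if ord(c) > 127)
    total_chars = len(s)
    return non_ascii_count / total_chars if total_chars > 0 else 0.0


# Illustrative checks (not part of the Space):
print(proportion_non_ascii("hello"))   # 0.0 -> pure ASCII
print(proportion_non_ascii("héllo"))   # 0.2 -> 1 of 5 characters is non-ASCII
print(proportion_non_ascii("日本語"))   # 1.0 -> all characters are non-ASCII
print(proportion_non_ascii(""))        # 0.0 -> empty strings avoid division by zero
```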
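The commit draws the histogram through pyplot's global state and yields `plt.gcf()` as the final figure. A hedged variant sketch of the same pool-plus-histogram step is shown below, wrapped in a helper that returns an explicit `Figure`; the `make_non_ascii_hist` name is hypothetical, not part of the commit, and `gr.Plot` can also be given a `Figure` object directly, which avoids sharing global pyplot state between runs:

```python
import multiprocessing

import matplotlib.pyplot as plt


def make_non_ascii_hist(texts, processes=8):
    """Return a matplotlib Figure with the distribution of non-ASCII proportions."""
    # Same parallel map over the dataset texts as in run_quality_check.
    with multiprocessing.Pool(processes=processes) as pool:
        props = pool.map(proportion_non_ascii, texts)

    # Build the histogram on a dedicated Figure instead of the global pyplot figure.
    fig, ax = plt.subplots()
    ax.hist(props, bins=20, range=(0.0, 1.0))
    ax.set_title("Histogram of proportion of non-ASCII characters")
    ax.set_xlabel("Proportion of non-ASCII characters")
    ax.set_ylabel("Number of texts")
    return fig  # can be yielded to the gr.Plot output in place of plt.gcf()
```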
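For context on the UI wiring, a self-contained sketch of feeding a matplotlib `Figure` into a `gr.Plot` output via `click`, analogous to the new `non_ascii_hist` component; the toy `histogram` function and component names here are assumptions for illustration, not the Space's real UI:

```python
import gradio as gr
import matplotlib.pyplot as plt


def histogram(values_csv):
    # Parse a comma-separated list of numbers in [0, 1] and plot their distribution.
    values = [float(v) for v in values_csv.split(",") if v.strip()]
    fig, ax = plt.subplots()
    ax.hist(values, bins=20, range=(0.0, 1.0))
    ax.set_title("Histogram")
    return fig


with gr.Blocks() as demo:
    inp = gr.Textbox(label="Comma-separated values in [0, 1]")
    btn = gr.Button("Plot")
    out = gr.Plot()
    # A Figure returned (or yielded) by the handler is rendered by the gr.Plot component.
    btn.click(histogram, inputs=[inp], outputs=[out])

demo.launch()
```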