Spaces:

polinaeterna
/

text_quality_checker

Running on Zero

App Files Files Community

polinaeterna HF staff committed on Sep 9

Commit

0a44dc6

•

1 Parent(s): 46c2a69

fix

Browse files

Files changed (1) hide show

app.py +49 -31

app.py CHANGED Viewed

@@ -23,21 +23,6 @@ retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
 session.mount('http://', HTTPAdapter(max_retries=retries))
-def proportion_non_ascii(s):
-    """
-    Compute the proportion of non-ASCII characters in a string.
-    Parameters:
-    s (str): The input string.
-    Returns:
-    float: The proportion of non-ASCII characters in the string.
-    """
-    non_ascii_count = sum(1 for c in s if ord(c) > 127)
-    total_chars = len(s)
-    return non_ascii_count / total_chars if total_chars > 0 else 0.0
 class QualityModel(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config):
         super(QualityModel, self).__init__()
@@ -95,7 +80,7 @@ def plot_and_df(texts, preds):
 def run_quality_check(dataset, column, batch_size, num_examples):
     info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
     if "error" in info_resp:
-        yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
         return
     config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
     split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
@@ -106,10 +91,10 @@ def run_quality_check(dataset, column, batch_size, num_examples):
         try:
             data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
         except Exception as error:
-            yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
             return
     texts = data[column].to_list()
-    texts_sample = data.sample(100, shuffle=True, seed=16).to_pandas()
     # batch_size = 100
     predictions, texts_processed = [], []
     num_examples = min(len(texts), num_examples)
@@ -118,18 +103,18 @@ def run_quality_check(dataset, column, batch_size, num_examples):
         batch_predictions = predict(batch_texts)
         predictions.extend(batch_predictions)
         texts_processed.extend(batch_texts)
-        yield {"check in progress...": min(i+batch_size, num_examples) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure(), pd.DataFrame()
-    with multiprocessing.Pool(processes=8) as pool:
-        props = pool.map(proportion_non_ascii, texts)
-    # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
-    plt.hist(props, bins=20, range=(0., 1.))
-    plt.title('Histogram of proportion of non-ASCII characters')
-    plt.xlabel('Proportion of non-ASCII characters')
-    plt.ylabel('Number of texts')
-    yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf(), texts_sample
 PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
@@ -199,12 +184,41 @@ def call_perspective_api(texts_df, column_name):#, s):
                 return req_att_scores
         if i % 10 == 0:
             plot_toxicity(req_att_scores)
-            yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame()
     plot_toxicity(req_att_scores)
     yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
 with gr.Blocks() as demo:
     gr.Markdown(
         """
@@ -248,14 +262,18 @@ with gr.Blocks() as demo:
         gr.Markdown("### High")
         df_high = gr.DataFrame()
-    non_ascii_hist = gr.Plot()
     texts_sample_df = gr.DataFrame(visible=False)
     gr_check_btn.click(
         run_quality_check,
         inputs=[dataset_name, text_column, batch_size, num_examples],
-        outputs=[progress_bar, plot, df_low, df_medium, df_high, non_ascii_hist, texts_sample_df]
     )
     gr_toxicity_btn = gr.Button("Run perpspective API to check toxicity of random samples.")
     toxicity_progress_bar = gr.Label(show_label=False)
     toxicity_hist = gr.Plot()

 session.mount('http://', HTTPAdapter(max_retries=retries))
 class QualityModel(nn.Module, PyTorchModelHubMixin):
     def __init__(self, config):
         super(QualityModel, self).__init__()
 def run_quality_check(dataset, column, batch_size, num_examples):
     info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
     if "error" in info_resp:
+        yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
         return
     config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
     split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
         try:
             data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
         except Exception as error:
+            yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(),
             return
     texts = data[column].to_list()
+    # texts_sample = data.sample(100, shuffle=True, seed=16).to_pandas()
     # batch_size = 100
     predictions, texts_processed = [], []
     num_examples = min(len(texts), num_examples)
         batch_predictions = predict(batch_texts)
         predictions.extend(batch_predictions)
         texts_processed.extend(batch_texts)
+        yield {"check in progress...": min(i+batch_size, num_examples) / num_examples}, *plot_and_df(texts_processed, predictions), pd.DataFrame()
+    # with multiprocessing.Pool(processes=8) as pool:
+    #     props = pool.map(proportion_non_ascii, texts)
+    #
+    # # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
+    # plt.hist(props, bins=20, range=(0., 1.))
+    # plt.title('Histogram of proportion of non-ASCII characters')
+    # plt.xlabel('Proportion of non-ASCII characters')
+    # plt.ylabel('Number of texts')
+    yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), data
 PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
                 return req_att_scores
         if i % 10 == 0:
             plot_toxicity(req_att_scores)
+            yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts[:i], **req_att_scores})
     plot_toxicity(req_att_scores)
     yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
+def proportion_non_ascii(s):
+    """
+    Compute the proportion of non-ASCII characters in a string.
+    A character counts as non-ASCII when its code point is above 127.
+    Parameters:
+    s (str): The input string. A missing value (anything that is not a
+        str, e.g. None from a null dataset cell) is treated like an
+        empty string.
+    Returns:
+    float: The proportion of non-ASCII characters in the string, between
+        0.0 and 1.0; 0.0 for an empty or missing input.
+    """
+    # Guard first: iterating None (or another non-string) would raise
+    # TypeError, and an empty string would divide by zero.
+    if not isinstance(s, str) or not s:
+        return 0.0
+    non_ascii_count = sum(1 for c in s if ord(c) > 127)
+    return non_ascii_count / len(s)
+def non_ascii_check(texts_df, column_name):
+    """
+    Plot a histogram of the proportion of non-ASCII characters per text.
+    Parameters:
+    texts_df: Table holding the texts; it is indexed by column_name and the
+        result is converted with .to_list(). Presumably the pandas DataFrame
+        supplied by the Gradio DataFrame component - confirm against caller.
+    column_name (str): Name of the column that contains the texts.
+    Returns:
+    A matplotlib figure with a 20-bin histogram over the range [0, 1].
+    """
+    texts = texts_df[column_name].to_list()
+    with multiprocessing.Pool(processes=8) as pool:
+        props = pool.map(proportion_non_ascii, texts)
+    # Draw on a fresh figure instead of pyplot's global "current" figure, so
+    # repeated calls do not stack bars on an earlier histogram and do not
+    # interfere with other plots in this app that rely on plt.gcf().
+    fig = plt.Figure()
+    ax = fig.subplots()
+    ax.hist(props, bins=20, range=(0., 1.))
+    ax.set_title('Histogram of proportion of non-ASCII characters')
+    ax.set_xlabel('Proportion of non-ASCII characters')
+    ax.set_ylabel('Number of texts')
+    return fig
 with gr.Blocks() as demo:
     gr.Markdown(
         """
         gr.Markdown("### High")
         df_high = gr.DataFrame()
     texts_sample_df = gr.DataFrame(visible=False)
     gr_check_btn.click(
         run_quality_check,
         inputs=[dataset_name, text_column, batch_size, num_examples],
+        outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_sample_df]
     )
+    gr_ascii_btn = gr.Button("Non ascii chars.")
+    non_ascii_hist = gr.Plot()
+    gr_ascii_btn.click(non_ascii_check, inputs=[texts_sample_df, text_column], outputs=[non_ascii_hist])
     gr_toxicity_btn = gr.Button("Run perpspective API to check toxicity of random samples.")
     toxicity_progress_bar = gr.Label(show_label=False)
     toxicity_hist = gr.Plot()