Spaces:

polinaeterna
/

text_quality_checker

Running on Zero

App Files Files Community

polinaeterna committed on Aug 28, 2024

Commit

e1c0c70

1 Parent(s): 3aad6e9

batching

Browse files

Files changed (1) hide show

app.py +24 -14

app.py CHANGED Viewed

@@ -44,20 +44,30 @@ def predict(texts: list[str]):
     return predicted_domains
-def run_quality_check(dataset, column, n_samples):
-    config = "default"
-    data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
-    texts = data[column].to_list()[:n_samples]
-    predictions = predict(texts[:n_samples])
-    texts_df = pd.DataFrame({"quality": predictions, "text": texts})
-    counts = pd.DataFrame({"quality": predictions}).value_counts().to_frame()
     counts.reset_index(inplace=True)
     return (
-        gr.BarPlot(counts, x="quality", y="count"),
-        texts_df[texts_df["quality"] == "Low"][:20],
-        texts_df[texts_df["quality"] == "Medium"][:20],
-        texts_df[texts_df["quality"] == "High"][:20],
-    )
 with gr.Blocks() as demo:
     gr.Markdown("# 💫 Dataset Quality Checker 💫")
@@ -80,12 +90,12 @@ with gr.Blocks() as demo:
             """
         return gr.HTML(value=html_code)
     text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
-    n_samples = gr.Number(label="Num first samples to run check")
     gr_check_btn = gr.Button("Check Dataset")
     plot = gr.BarPlot()
     with gr.Accordion("Explore some individual examples for each class", open=False):
         df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
-    gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, n_samples], outputs=[plot, df_low, df_medium, df_high])
 demo.launch()

     return predicted_domains
def plot_and_df(texts, preds):
    """Build the bar plot and per-class example tables for the UI.

    Args:
        texts: The texts that have been classified so far.
        preds: The predicted quality label for each text, in the same order
            as ``texts`` (both are placed side by side in one DataFrame, so
            they must have equal length).

    Returns:
        A 4-tuple ``(plot, low, medium, high)``: a ``gr.BarPlot`` of how often
        each label occurs, followed by up to 20 rows of texts labelled
        ``"Low"``, ``"Medium"`` and ``"High"`` respectively.
    """
    labelled = pd.DataFrame({"quality": preds, "text": texts})
    # The plot reads a "count" column from the value_counts() result
    # (NOTE(review): that column name is what recent pandas produces -- confirm
    # against the pinned pandas version).
    counts = pd.DataFrame({"quality": preds}).value_counts().to_frame().reset_index()

    def first_rows(label):
        # Up to the first 20 rows whose predicted label equals ``label``.
        return labelled[labelled["quality"] == label][:20]

    return (
        gr.BarPlot(counts, x="quality", y="count"),
        first_rows("Low"),
        first_rows("Medium"),
        first_rows("High"),
    )
def run_quality_check(dataset, column, batch_size, max_batches=5):
    """Classify a dataset column batch by batch, yielding updated results.

    Reads ``train/0000.parquet`` of the ``default`` config from the dataset's
    ``~parquet`` revision, runs ``predict`` on consecutive, non-overlapping
    batches of the requested column, and after every batch yields refreshed
    UI outputs covering everything processed so far.

    Args:
        dataset: Dataset repository id used to build the ``hf://datasets/``
            path.
        column: Name of the column holding the texts to classify.
        batch_size: Number of texts per ``predict`` call. The UI wires this
            to a ``gr.Number``, which may deliver a float, so it is coerced
            to ``int`` before being used as a slice bound.
        max_batches: Upper bound on the number of batches to process
            (defaults to 5, the previously hard-coded limit).

    Yields:
        The ``(plot, low, medium, high)`` tuple returned by ``plot_and_df``
        for all texts processed up to and including the current batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 after conversion.
    """
    # Slice bounds must be integers; a float here raises TypeError.
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    config = "default"
    data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
    texts = data[column].to_list()

    predictions, texts_processed = [], []
    # Advance by a whole batch each step so batches never overlap, and stop
    # at the end of the data so no empty batch is sent to the model.
    stop = min(len(texts), max_batches * batch_size)
    for start in range(0, stop, batch_size):
        batch_texts = texts[start:start + batch_size]
        predictions.extend(predict(batch_texts))
        texts_processed.extend(batch_texts)
        yield plot_and_df(texts_processed, predictions)
 with gr.Blocks() as demo:
     gr.Markdown("# 💫 Dataset Quality Checker 💫")
             """
         return gr.HTML(value=html_code)
     text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
+    batch_size = gr.Number(100, label="Batch size")
     gr_check_btn = gr.Button("Check Dataset")
     plot = gr.BarPlot()
     with gr.Accordion("Explore some individual examples for each class", open=False):
         df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
+    gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size], outputs=[plot, df_low, df_medium, df_high])
 demo.launch()