Spaces:

huggingface
/

text-data-filtering

Running

App Files Community

HugoLaurencon committed on Nov 24, 2021

Commit

4bbaeac

•

1 Parent(s): f622ed0

first draft

Browse files

Files changed (1) hide show

app.py +61 -13

app.py CHANGED Viewed

@@ -1,23 +1,71 @@
 import streamlit as st
-import datasets
-from functools import partial
-data = datasets.load_dataset("json", data_files="small_test_data.jsonl")["train"].select(range(100))
-bad_cutoff = st.slider('Bad words cutoff', 0, 1)
-stp_cutoff = st.slider('Stop words cutoff', 0, 1)
-ppl_cutoff = st.slider('ppl cutoff', 0, 1)
-def filter_ppl(examples, invert=False):
-    return [ppl < ppl_cutoff for ppl in examples["ppl"]]
-def filter_bad(examples, invert=False):
-    return [bad < bad_cutoff for bad in examples["bad_words"]]
-def filter_stp(examples, invert=False):
-    return [stp > stp_cutoff for stp in examples["stop_words"]]
-st.table(data)

 import streamlit as st
+import json
+import pandas as pd
+import numpy as np
+st.title('5k English documents from Oscar with their stats.')
+# Load the pre-computed per-document statistics. JSON text is UTF-8, so the
+# encoding is pinned instead of depending on the platform default.
+path_data = "./10K_english_examples_with_stats.json"
+with open(path_data, encoding="utf-8") as json_file:
+    data = json.load(json_file)
+# Only the first 5000 records are displayed; the "len_words" column is dropped.
+data = pd.DataFrame(data[:5000])
+del data["len_words"]
+# One slider per statistic; each default is the extreme value that filters
+# nothing out (upper bound for max cutoffs, lower bound for min cutoffs).
+st.header('Parameters of the filtering')
+cutoff_special_characters_ratio = st.slider("Max cutoff special characters ratio", 0., 1., 1., step=0.01)
+cutoff_stopwords_ratio = st.slider("Min cutoff stopwords ratio", 0., 1., 0., step=0.01)
+cutoff_badwords_ratio = st.slider("Max cutoff badwords ratio", 0., 1., 1., step=0.001)
+cutoff_lang_id_score = st.slider("Min cutoff lang id score", 0., 1., 0., step=0.01)
+cutoff_perplexity_score = st.slider("Max cutoff perplexity score", 0, 14000000, 14000000)
+# (column name, selected cutoff, is_max_cutoff). A max cutoff keeps rows whose
+# value is <= cutoff; a min cutoff keeps rows whose value is >= cutoff.
+keys = [
+    ("special_characters_ratio", cutoff_special_characters_ratio, True),
+    ("stopwords_ratio", cutoff_stopwords_ratio, False),
+    ("badwords_ratio", cutoff_badwords_ratio, True),
+    ("lang_id_score", cutoff_lang_id_score, False),
+    ("perplexity_score", cutoff_perplexity_score, True),
+]
+# Build one boolean mask per criterion, then AND them row-wise: a document is
+# kept only if it passes every cutoff.
+cond = [
+    (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
+    for key, cutoff, max_cutoff in keys
+]
+cond = np.all(cond, axis=0)
+data_keep = data.loc[cond]
+st.header('Data that we keep')
+st.markdown("Click on a column to sort by it.")
+st.markdown("Place the cursor on the text to display it.")
+st.dataframe(data_keep)
+# The complement of the mask: documents rejected by at least one cutoff.
+data_not_keep = data.loc[np.invert(cond)]
+st.header('Data that is thrown away')
+st.markdown("Click on a column to sort by it.")
+st.markdown("Place the cursor on the text to display it.")
+st.dataframe(data_not_keep)
+def plot_hist(key, num_bins=50):
+    """Render a bar-chart histogram of column ``key`` of the module-level ``data``.
+
+    The column's values are binned into ``num_bins`` equal-width bins spanning
+    0 to the column maximum; the bin width is printed under the chart.
+    """
+    # Section title: the column name with underscores shown as spaces.
+    st.header(key.replace("_", " "))
+    column = data[key].values
+    max_range = np.max(column)
+    # np.histogram returns (counts, bin_edges); only the counts are charted.
+    counts, _ = np.histogram(column, bins=num_bins, range=(0, max_range))
+    st.bar_chart(counts)
+    bin_size = max_range / num_bins
+    st.markdown(f"Each bin is of size: {bin_size}.")
+# Draw one histogram per statistic that has a filtering cutoff.
+for key, _, _ in keys:
+    plot_hist(key)
+st.header('Download data')
+# The button serves the source file at path_data as-is (not the filtered
+# subset). The encoding is pinned to UTF-8 so reading the JSON does not
+# depend on the platform default.
+with open(path_data, encoding="utf-8") as json_file:
+    btn = st.download_button(
+        label="Download data as json",
+        data=json_file,
+        file_name='data.json',
+    )