Spaces:

huggingface
/

text-data-filtering

Running

App Files Community

teven committed on Nov 26, 2021

Commit

f924b14

•

1 Parent(s): 21fafec

TVN update

Browse files

Files changed (2) hide show

app.py +53 -81
en_examples_with_stats_no_small_docs.json +3 -0

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import streamlit as st
 import json
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
@@ -15,7 +16,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
     st.title(f"{num_docs} {lang} documents from Oscar with their stats.")
     sentences = [doc["text"].split(" ") for doc in data[:num_docs_for_words]]
-    words = [word for sentence in sentences for word in sentence]
     words_data = [{"len_word": len(word), "word": word} for word in words]
     words_data = pd.DataFrame(words_data)
@@ -24,39 +25,46 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
     columns = list(data)
     keys = []
-    st.header("Parameters of the filtering")
-    if "special_characters_ratio" in columns:
-        cutoff_special_characters_ratio = st.slider(
-            "Max cutoff special characters ratio", 0.0, 1.0, 1.0, step=0.01
-        )
-        keys.append(("special_characters_ratio", cutoff_special_characters_ratio, True))
-    if "stopwords_ratio" in columns:
-        cutoff_stopwords_ratio = st.slider(
-            "Min cutoff stopwords ratio", 0.0, 1.0, 0.0, step=0.01
         )
-        keys.append(("stopwords_ratio", cutoff_stopwords_ratio, False))
-    if "badwords_ratio" in columns:
-        cutoff_badwords_ratio = st.slider(
-            "Max cutoff badwords ratio", 0.0, 1.0, 1.0, step=0.001
         )
-        keys.append(("badwords_ratio", cutoff_badwords_ratio, True))
-    if "lang_id_score" in columns:
-        cutoff_lang_id_score = st.slider(
-            "Min cutoff lang id score", 0.0, 1.0, 0.0, step=0.01
         )
-        keys.append(("lang_id_score", cutoff_lang_id_score, False))
-    if "perplexity_score" in columns:
-        max_pp = int(np.max(data["perplexity_score"])) + 1
-        cutoff_perplexity_score = st.slider(
-            "Perplexity cutoff perplexity score", 0, max_pp, max_pp
         )
-        keys.append(("perplexity_score", cutoff_perplexity_score, True))
     cond = [
         (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
@@ -64,78 +72,42 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
     ]
     cond = np.all(cond, axis=0)
-    data_keep = data.loc[cond]
-    st.header("Data that we keep")
-    st.markdown("Click on a column to sort by it.")
-    st.markdown("Place the cursor on the text to display it.")
-    st.dataframe(data_keep)
     data_not_keep = data.loc[np.invert(cond)]
-    st.header("Data that is thrown away")
-    st.markdown("Click on a column to sort by it.")
-    st.markdown("Place the cursor on the text to display it.")
     st.dataframe(data_not_keep)
     def plot_hist(dataframe, key, num_bins=50):
-        st.header(" ".join(key.split("_")))
         hist_values = dataframe[key].values
         max_range = np.max(hist_values)
         hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
         st.bar_chart(hist_values)
         st.markdown(f"Each bin is of size: {max_range/num_bins}.")
-    for key, _, _ in keys:
-        plot_hist(data, key)
-    st.header("Zipf's Law")
-    def get_frequency_words(data):
-        freq_words = {}
-        for index, row in data.iterrows():
-            for word in row["text"].split(" "):
-                if word in freq_words:
-                    freq_words[word] += 1
-                else:
-                    freq_words[word] = 1
-        freq_words = np.array(list(freq_words.values()))
-        freq_words = -np.sort(-freq_words)
-        return freq_words
-    freq_words_data = get_frequency_words(data)
-    freq_words_data_keep = get_frequency_words(data_keep)
-    freq_words_data_not_keep = get_frequency_words(data_not_keep)
-    fig, ax = plt.subplots()
-    ax.loglog(freq_words_data)
-    ax.loglog(freq_words_data_keep)
-    ax.loglog(freq_words_data_not_keep)
-    ax.set_title("Zipf's Law")
-    ax.set_xlabel("$i$-th most frequent word")
-    ax.set_ylabel("frequency in the documents")
-    ax.legend(["All data", "Data that we keep", "Data that is thrown away"])
-    st.pyplot(fig)
-    st.markdown("If less than three curves are displayed, it means that there are overlaps.")
-    st.header("Parameter of the filtering for words")
     max_len_word = int(np.max(words_data["len_word"])) + 1
-    cutoff_word = st.slider("Max cutoff length word", 0, max_len_word, max_len_word)
     cond_words = words_data["len_word"] <= cutoff_word
     words_keep = words_data.loc[cond_words]
-    st.header(f"Words that we keep (for {num_docs_for_words} documents)")
-    st.markdown("Click on a column to sort by it.")
-    st.markdown("Place the cursor on the text to display it.")
     st.dataframe(words_keep)
     words_not_keep = words_data.loc[np.invert(cond_words)]
-    st.header(f"Words that are thrown away (for {num_docs_for_words} documents)")
-    st.markdown("Click on a column to sort by it.")
-    st.markdown("Place the cursor on the text to display it.")
     st.dataframe(words_not_keep)
-    plot_hist(words_data, "len_word")
     st.header("Download data")
     with open(path_data) as json_file:
@@ -146,7 +118,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
         )
-path_data = "./en_examples_with_stats.json"
 lang = "English"
 num_docs = 5000
 num_docs_for_words = 500

 import streamlit as st
 import json
 import pandas as pd
+import math
 import numpy as np
 import matplotlib.pyplot as plt
     st.title(f"{num_docs} {lang} documents from Oscar with their stats.")
     sentences = [doc["text"].split(" ") for doc in data[:num_docs_for_words]]
+    words = set([word for sentence in sentences for word in sentence])
     words_data = [{"len_word": len(word), "word": word} for word in words]
     words_data = pd.DataFrame(words_data)
     columns = list(data)
     keys = []
+    values = {}
+    st.header("Filtering based on document content")
+    if "special_%" in columns:
+        special_ratio = st.sidebar.slider(
+            "% filtered by special characters ratio", 0.0, 100.0, 0.0, step=1.0
         )
+        cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
+        special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
+        st.sidebar.text(f"Kept text with <{special_cutoff:.1f}% special chars")
+        keys.append(("special_%", special_cutoff, True))
+    if "stop_%" in columns:
+        stop_ratio = st.sidebar.slider(
+            "% filtered by stop word ratio", 0.0, 100.0, 0.0, step=1.0
         )
+        cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
+        stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
+        st.sidebar.text(f"Kept text with >{stop_cutoff:.1f}% stop words")
+        keys.append(("stop_%", stop_cutoff, False))
+    if "bad_%" in columns:
+        bad_ratio = st.sidebar.slider(
+            "% filtered by badwords ratio", 0.0, 100.0, 0.0, step=1.0
         )
+        bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
+        bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
+        st.sidebar.text(f"Kept text with <{bad_cutoff:.1f}% bad words")
+        keys.append(("bad_%", bad_cutoff, True))
+    if "perplexity" in columns:
+        ppl_ratio = st.sidebar.slider(
+            "% filtered by perplexity", 0.0, 100.0, 0.0, step=1.0
         )
+        ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
+        ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
+        st.sidebar.text(f"Kept text with <{ppl_cutoff:.0f} perplexity")
+        keys.append(("perplexity", ppl_cutoff, True))
     cond = [
         (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
     ]
     cond = np.all(cond, axis=0)
     data_not_keep = data.loc[np.invert(cond)]
+    st.subheader("Filtered data")
+    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
     st.dataframe(data_not_keep)
+    data_keep = data.loc[cond]
+    st.subheader("Kept data")
+    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
+    st.dataframe(data_keep)
     def plot_hist(dataframe, key, num_bins=50):
+        st.subheader(" ".join(key.split("_")))
         hist_values = dataframe[key].values
         max_range = np.max(hist_values)
         hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
         st.bar_chart(hist_values)
         st.markdown(f"Each bin is of size: {max_range/num_bins}.")
+    # for key, _, _ in keys:
+    #     plot_hist(data, key)
+    st.header("Filtering links and concatenated words")
     max_len_word = int(np.max(words_data["len_word"])) + 1
+    cutoff_word = st.sidebar.slider("Word length cutoff", 0, max_len_word, max_len_word)
     cond_words = words_data["len_word"] <= cutoff_word
     words_keep = words_data.loc[cond_words]
+    st.subheader(f"Words that we keep (for {num_docs_for_words} documents)")
+    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
     st.dataframe(words_keep)
     words_not_keep = words_data.loc[np.invert(cond_words)]
+    st.subheader(f"Words that are thrown away (for {num_docs_for_words} documents)")
+    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
     st.dataframe(words_not_keep)
     st.header("Download data")
     with open(path_data) as json_file:
         )
+path_data = "./en_examples_with_stats_no_small_docs.json"
 lang = "English"
 num_docs = 5000
 num_docs_for_words = 500

en_examples_with_stats_no_small_docs.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:42de045d52e16b4c96ec03b332c12f406e52b22b442234eea4845f5b5598784c
+size 21200705