Spaces:

huggingface
/

text-data-filtering

Runtime error

App Files Files Community

HugoLaurencon committed on Dec 26, 2021

Commit

611e98e

1 Parent(s): 58d483d

chinese visu

Browse files

Files changed (4) hide show

.gitattributes +2 -0
app.py +117 -71
en_examples_with_stats.json +3 -0
zh_examples_with_stats.json +3 -0

.gitattributes CHANGED Viewed

@@ -27,3 +27,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.jsonl filter=lfs diff=lfs merge=lfs -text
 *.json filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.jsonl filter=lfs diff=lfs merge=lfs -text
 *.json filter=lfs diff=lfs merge=lfs -text
+en_examples_with_stats.json filter=lfs diff=lfs merge=lfs -text
+zh_examples_with_stats.json filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -15,7 +15,13 @@ import matplotlib.pyplot as plt
 class Visualization:
     def __init__(
-        self, path_instructions, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
     ):
         self.path_instructions = path_instructions
         self.path_data = path_data
@@ -25,17 +31,25 @@ class Visualization:
         self.max_len_text_display = max_len_text_display
     def preamble(self):
-        st.markdown("Before diving into this demo, you might want to take a look at how the filtering pipeline of OSCAR looks like in more detail.")
-        def get_binary_file_downloader_html(bin_file, file_label='File'):
-            with open(bin_file, 'rb') as f:
                 data = f.read()
             bin_str = base64.b64encode(data).decode()
             href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
             return href
-        st.markdown(get_binary_file_downloader_html(self.path_instructions, "Download the filtering pipeline of OSCAR as pdf"), unsafe_allow_html=True)
     def open_data(self):
         with open(self.path_data) as json_file:
             data = json.load(json_file)
@@ -43,13 +57,17 @@ class Visualization:
         self.num_docs = min(self.num_docs, len(data))
         self.num_docs_for_words = min(self.num_docs_for_words, len(data))
-        words = [doc["words"] for doc in data[: self.num_docs_for_words]]
-        words = [word for doc in words for word in doc]
-        self.words = pd.DataFrame(words)
         docs = data[: self.num_docs]
         for doc in docs:
-            del doc["words"]
             if len(doc["text"]) > self.max_len_text_display:
                 doc["text"] = (
                     doc["text"][: self.max_len_text_display]
@@ -179,82 +197,103 @@ class Visualization:
                 "Click on a column to sort by it, place the cursor on the text to display it."
             )
             st.dataframe(displayed_docs)
         display_dataset(np.invert(all_conds), "Discarded documents")
-        #st.subheader("Display discarded documents by filter")
-        display_discarded_documents_by_filter = st.checkbox("Display discarded documents by filter")
         if display_discarded_documents_by_filter:
             columns = list(self.docs)
             if "number_words" in columns:
                 cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the number of words")
             if "special_characters_ratio" in columns:
-                cond_filter = np.invert(np.all(conds["special_characters_ratio"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the special characters ratio")
             if "stopwords_ratio" in columns:
                 cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the stop words ratio")
             if "badwords_ratio" in columns:
                 cond_filter = np.invert(np.all(conds["badwords_ratio"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the bad words ratio")
             if "lang_id_score" in columns:
                 cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the language identification confidence score")
             if "perplexity_score" in columns:
                 cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the perplexity score")
         display_dataset(all_conds, "Retained documents")
     def filtering_of_words(self):
-        st.sidebar.subheader("Parameter of the filtering on words")
-        cutoff_def = (
-            "If the length of a word is higher than this number, the word is removed."
-        )
-        max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
-        cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
-        incorrect_substrings = st.sidebar.checkbox(
-            "Remove words with incorrect substrings."
-        )
-        cond_words = self.words["len_word"] <= cutoff_word
-        if incorrect_substrings:
-            cond_words = cond_words & np.invert(self.words["incorrect_substring"])
-        st.header("Filtering on words")
-        st.markdown(
-            f"Since the number of words is way larger than the number of documents, "
-            f"we consider in this section words for the first {self.num_docs_for_words} documents only."
-        )
-        discarded_words = self.words.loc[np.invert(cond_words)]
-        st.subheader(
-            f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
-        )
-        st.markdown(
-            "Click on a column to sort by it, place the cursor on the text to display it."
-        )
-        st.dataframe(discarded_words)
-        retained_words = self.words.loc[cond_words]
-        st.subheader(
-            f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
-        )
-        st.markdown(
-            "Click on a column to sort by it, place the cursor on the text to display it."
-        )
-        st.dataframe(retained_words)
     def plot_distributions_filtering_parameters(self):
         st.header("Distributions of the filtering parameters")
@@ -276,27 +315,29 @@ class Visualization:
             for key in list({el[0]: None for el in self.keys}):
                 plot_hist(self.docs, key)
-            plot_hist(self.words, "len_word")
     def plot_zipf_law(self):
-        st.header("Zipf's Law")
-        display_zipf_law = st.checkbox("Display Zipf's Law")
-        if display_zipf_law:
-            freq_words = {}
-            for _, row in self.words.iterrows():
-                freq_words[row["word"]] = freq_words.get(row["word"], 0) + 1
-            freq_words = np.array(list(freq_words.values()))
-            freq_words = -np.sort(-freq_words)
-            fig, ax = plt.subplots()
-            ax.loglog(freq_words)
-            ax.set_title("Zipf's Law")
-            ax.set_xlabel("$i$-th most frequent word")
-            ax.set_ylabel("frequency in the documents")
-            st.pyplot(fig)
     def download_data(self):
         st.header("Download data")
@@ -320,13 +361,18 @@ class Visualization:
 path_instructions = "./filtering_pipeline_oscar.pdf"
-path_data = "./en_examples_with_stats.json"
-lang = "English"
 num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000
 visualization = Visualization(
-    path_instructions, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
 )
 visualization.visualization()

 class Visualization:
     def __init__(
+        self,
+        path_instructions,
+        path_data,
+        lang,
+        num_docs,
+        num_docs_for_words,
+        max_len_text_display,
     ):
         self.path_instructions = path_instructions
         self.path_data = path_data
         self.max_len_text_display = max_len_text_display
     def preamble(self):
+        st.markdown(
+            "Before diving into this demo, you might want to take a look at how the filtering pipeline of OSCAR looks like in more detail."
+        )
+        def get_binary_file_downloader_html(bin_file, file_label="File"):
+            with open(bin_file, "rb") as f:
                 data = f.read()
             bin_str = base64.b64encode(data).decode()
             href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
             return href
+        st.markdown(
+            get_binary_file_downloader_html(
+                self.path_instructions,
+                "Download the filtering pipeline of OSCAR as pdf",
+            ),
+            unsafe_allow_html=True,
+        )
     def open_data(self):
         with open(self.path_data) as json_file:
             data = json.load(json_file)
         self.num_docs = min(self.num_docs, len(data))
         self.num_docs_for_words = min(self.num_docs_for_words, len(data))
+        if "words" in data[0]:
+            words = [doc["words"] for doc in data[: self.num_docs_for_words]]
+            words = [word for doc in words for word in doc]
+            self.words = pd.DataFrame(words)
+        else:
+            self.words = None
         docs = data[: self.num_docs]
         for doc in docs:
+            if not (self.words is None):
+                del doc["words"]
             if len(doc["text"]) > self.max_len_text_display:
                 doc["text"] = (
                     doc["text"][: self.max_len_text_display]
                 "Click on a column to sort by it, place the cursor on the text to display it."
             )
             st.dataframe(displayed_docs)
         display_dataset(np.invert(all_conds), "Discarded documents")
+        # st.subheader("Display discarded documents by filter")
+        display_discarded_documents_by_filter = st.checkbox(
+            "Display discarded documents by filter"
+        )
         if display_discarded_documents_by_filter:
             columns = list(self.docs)
             if "number_words" in columns:
                 cond_filter = np.invert(np.all(conds["number_words"], axis=0))
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the number of words",
+                )
             if "special_characters_ratio" in columns:
+                cond_filter = np.invert(
+                    np.all(conds["special_characters_ratio"], axis=0)
+                )
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the special characters ratio",
+                )
             if "stopwords_ratio" in columns:
                 cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the stop words ratio",
+                )
             if "badwords_ratio" in columns:
                 cond_filter = np.invert(np.all(conds["badwords_ratio"], axis=0))
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the bad words ratio",
+                )
             if "lang_id_score" in columns:
                 cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the language identification confidence score",
+                )
             if "perplexity_score" in columns:
                 cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the perplexity score",
+                )
         display_dataset(all_conds, "Retained documents")
     def filtering_of_words(self):
+        if not (self.words is None):
+            st.sidebar.subheader("Parameter of the filtering on words")
+            cutoff_def = "If the length of a word is higher than this number, the word is removed."
+            max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
+            cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
+            incorrect_substrings = st.sidebar.checkbox(
+                "Remove words with incorrect substrings."
+            )
+            cond_words = self.words["len_word"] <= cutoff_word
+            if incorrect_substrings:
+                cond_words = cond_words & np.invert(self.words["incorrect_substring"])
+            st.header("Filtering on words")
+            st.markdown(
+                f"Since the number of words is way larger than the number of documents, "
+                f"we consider in this section words for the first {self.num_docs_for_words} documents only."
+            )
+            discarded_words = self.words.loc[np.invert(cond_words)]
+            st.subheader(
+                f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
+            )
+            st.markdown(
+                "Click on a column to sort by it, place the cursor on the text to display it."
+            )
+            st.dataframe(discarded_words)
+            retained_words = self.words.loc[cond_words]
+            st.subheader(
+                f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
+            )
+            st.markdown(
+                "Click on a column to sort by it, place the cursor on the text to display it."
+            )
+            st.dataframe(retained_words)
     def plot_distributions_filtering_parameters(self):
         st.header("Distributions of the filtering parameters")
             for key in list({el[0]: None for el in self.keys}):
                 plot_hist(self.docs, key)
+            if not (self.words is None):
+                plot_hist(self.words, "len_word")
     def plot_zipf_law(self):
+        if not (self.words is None):
+            st.header("Zipf's Law")
+            display_zipf_law = st.checkbox("Display Zipf's Law")
+            if display_zipf_law:
+                freq_words = {}
+                for _, row in self.words.iterrows():
+                    freq_words[row["word"]] = freq_words.get(row["word"], 0) + 1
+                freq_words = np.array(list(freq_words.values()))
+                freq_words = -np.sort(-freq_words)
+                fig, ax = plt.subplots()
+                ax.loglog(freq_words)
+                ax.set_title("Zipf's Law")
+                ax.set_xlabel("$i$-th most frequent word")
+                ax.set_ylabel("frequency in the documents")
+                st.pyplot(fig)
     def download_data(self):
         st.header("Download data")
 path_instructions = "./filtering_pipeline_oscar.pdf"
+path_data = "./zh_examples_with_stats.json"
+lang = "Chinese"
 num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000
 visualization = Visualization(
+    path_instructions,
+    path_data,
+    lang,
+    num_docs,
+    num_docs_for_words,
+    max_len_text_display,
 )
 visualization.visualization()

en_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f2325873414309a7ea67d2753202207a2773319dc40f338c0a0fc7bb703463a6
+size 713107133

zh_examples_with_stats.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:438a5bb757c23581784946f345a99ab11b77c43f57a3cbf18148c197ec4ef741
+size 193517532