HugoLaurencon
committed on
Commit
•
14574d7
1
Parent(s):
6303415
visu with discarded documents by filter
Browse files
app.py
CHANGED
@@ -66,7 +66,7 @@ class Visualization:
|
|
66 |
def set_sliders(docs):
|
67 |
columns = list(docs)
|
68 |
keys = []
|
69 |
-
conds =
|
70 |
|
71 |
def get_cond(key, cutoff, max_cutoff):
|
72 |
if max_cutoff:
|
@@ -87,9 +87,8 @@ class Visualization:
|
|
87 |
)
|
88 |
new_key = ("number_words", cutoff_min_number_words, False)
|
89 |
keys.append(new_key)
|
90 |
-
|
91 |
-
|
92 |
-
print_discared_by_cond(cond)
|
93 |
|
94 |
cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
|
95 |
cutoff_max_number_words = st.sidebar.slider(
|
@@ -97,9 +96,10 @@ class Visualization:
|
|
97 |
)
|
98 |
new_key = ("number_words", cutoff_max_number_words, True)
|
99 |
keys.append(new_key)
|
100 |
-
|
101 |
-
|
102 |
-
|
|
|
103 |
|
104 |
if "special_characters_ratio" in columns:
|
105 |
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
|
@@ -113,8 +113,8 @@ class Visualization:
|
|
113 |
)
|
114 |
keys.append(new_key)
|
115 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
116 |
-
conds.append(cond)
|
117 |
print_discared_by_cond(cond)
|
|
|
118 |
|
119 |
if "stopwords_ratio" in columns:
|
120 |
cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
|
@@ -124,8 +124,8 @@ class Visualization:
|
|
124 |
new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
|
125 |
keys.append(new_key)
|
126 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
127 |
-
conds.append(cond)
|
128 |
print_discared_by_cond(cond)
|
|
|
129 |
|
130 |
if "badwords_ratio" in columns:
|
131 |
cutoff_def = "If the bad words ratio of a document is higher than this number, the document is removed."
|
@@ -135,8 +135,8 @@ class Visualization:
|
|
135 |
new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
|
136 |
keys.append(new_key)
|
137 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
138 |
-
conds.append(cond)
|
139 |
print_discared_by_cond(cond)
|
|
|
140 |
|
141 |
if "lang_id_score" in columns:
|
142 |
cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
|
@@ -146,8 +146,8 @@ class Visualization:
|
|
146 |
new_key = ("lang_id_score", cutoff_lang_id_score, False)
|
147 |
keys.append(new_key)
|
148 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
149 |
-
conds.append(cond)
|
150 |
print_discared_by_cond(cond)
|
|
|
151 |
|
152 |
if "perplexity_score" in columns:
|
153 |
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
|
@@ -158,34 +158,61 @@ class Visualization:
|
|
158 |
new_key = ("perplexity_score", cutoff_perplexity_score, True)
|
159 |
keys.append(new_key)
|
160 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
161 |
-
conds.append(cond)
|
162 |
print_discared_by_cond(cond)
|
|
|
163 |
|
164 |
return keys, conds
|
165 |
|
166 |
self.keys, conds = set_sliders(self.docs)
|
167 |
|
168 |
-
|
|
|
169 |
|
170 |
st.header("Filtering on documents")
|
171 |
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
|
|
|
|
|
|
180 |
|
181 |
-
|
182 |
-
st.
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
def filtering_of_words(self):
|
191 |
st.sidebar.subheader("Parameter of the filtering on words")
|
|
|
66 |
def set_sliders(docs):
|
67 |
columns = list(docs)
|
68 |
keys = []
|
69 |
+
conds = {}
|
70 |
|
71 |
def get_cond(key, cutoff, max_cutoff):
|
72 |
if max_cutoff:
|
|
|
87 |
)
|
88 |
new_key = ("number_words", cutoff_min_number_words, False)
|
89 |
keys.append(new_key)
|
90 |
+
cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
|
91 |
+
print_discared_by_cond(cond_1)
|
|
|
92 |
|
93 |
cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
|
94 |
cutoff_max_number_words = st.sidebar.slider(
|
|
|
96 |
)
|
97 |
new_key = ("number_words", cutoff_max_number_words, True)
|
98 |
keys.append(new_key)
|
99 |
+
cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
|
100 |
+
print_discared_by_cond(cond_2)
|
101 |
+
|
102 |
+
conds["number_words"] = [cond_1, cond_2]
|
103 |
|
104 |
if "special_characters_ratio" in columns:
|
105 |
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
|
|
|
113 |
)
|
114 |
keys.append(new_key)
|
115 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
116 |
print_discared_by_cond(cond)
|
117 |
+
conds["special_characters_ratio"] = [cond]
|
118 |
|
119 |
if "stopwords_ratio" in columns:
|
120 |
cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
|
|
|
124 |
new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
|
125 |
keys.append(new_key)
|
126 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
127 |
print_discared_by_cond(cond)
|
128 |
+
conds["stopwords_ratio"] = [cond]
|
129 |
|
130 |
if "badwords_ratio" in columns:
|
131 |
cutoff_def = "If the bad words ratio of a document is higher than this number, the document is removed."
|
|
|
135 |
new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
|
136 |
keys.append(new_key)
|
137 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
138 |
print_discared_by_cond(cond)
|
139 |
+
conds["badwords_ratio"] = [cond]
|
140 |
|
141 |
if "lang_id_score" in columns:
|
142 |
cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
|
|
|
146 |
new_key = ("lang_id_score", cutoff_lang_id_score, False)
|
147 |
keys.append(new_key)
|
148 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
149 |
print_discared_by_cond(cond)
|
150 |
+
conds["lang_id_score"] = [cond]
|
151 |
|
152 |
if "perplexity_score" in columns:
|
153 |
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
|
|
|
158 |
new_key = ("perplexity_score", cutoff_perplexity_score, True)
|
159 |
keys.append(new_key)
|
160 |
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
|
|
161 |
print_discared_by_cond(cond)
|
162 |
+
conds["perplexity_score"] = [cond]
|
163 |
|
164 |
return keys, conds
|
165 |
|
166 |
self.keys, conds = set_sliders(self.docs)
|
167 |
|
168 |
+
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
|
169 |
+
all_conds = np.all(all_conds, axis=0)
|
170 |
|
171 |
st.header("Filtering on documents")
|
172 |
|
173 |
+
def display_dataset(cond, description):
|
174 |
+
displayed_docs = self.docs.loc[cond]
|
175 |
+
st.subheader(
|
176 |
+
f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
|
177 |
+
)
|
178 |
+
st.markdown(
|
179 |
+
"Click on a column to sort by it, place the cursor on the text to display it."
|
180 |
+
)
|
181 |
+
st.dataframe(displayed_docs)
|
182 |
+
|
183 |
+
display_dataset(np.invert(all_conds), "Discarded documents")
|
184 |
|
185 |
+
#st.subheader("Display discarded documents by filter")
|
186 |
+
display_discarded_documents_by_filter = st.checkbox("Display discarded documents by filter")
|
187 |
+
|
188 |
+
if display_discarded_documents_by_filter:
|
189 |
+
columns = list(self.docs)
|
190 |
+
|
191 |
+
if "number_words" in columns:
|
192 |
+
cond_filter = np.invert(np.all(conds["number_words"], axis=0))
|
193 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the number of words")
|
194 |
+
|
195 |
+
if "special_characters_ratio" in columns:
|
196 |
+
cond_filter = np.invert(np.all(conds["special_characters_ratio"], axis=0))
|
197 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the special characters ratio")
|
198 |
+
|
199 |
+
if "stopwords_ratio" in columns:
|
200 |
+
cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
|
201 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the stop words ratio")
|
202 |
+
|
203 |
+
if "badwords_ratio" in columns:
|
204 |
+
cond_filter = np.invert(np.all(conds["badwords_ratio"], axis=0))
|
205 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the bad words ratio")
|
206 |
+
|
207 |
+
if "lang_id_score" in columns:
|
208 |
+
cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
|
209 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the language identification confidence score")
|
210 |
+
|
211 |
+
if "perplexity_score" in columns:
|
212 |
+
cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
|
213 |
+
display_dataset(cond_filter, "Discarded documents for the filter on the perplexity score")
|
214 |
+
|
215 |
+
display_dataset(all_conds, "Retained documents")
|
216 |
|
217 |
def filtering_of_words(self):
|
218 |
st.sidebar.subheader("Parameter of the filtering on words")
|