HugoLaurencon committed on
Commit 611e98e
1 Parent(s): 58d483d

chinese visu

.gitattributes CHANGED
@@ -27,3 +27,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.jsonl filter=lfs diff=lfs merge=lfs -text
 *.json filter=lfs diff=lfs merge=lfs -text
+en_examples_with_stats.json filter=lfs diff=lfs merge=lfs -text
+zh_examples_with_stats.json filter=lfs diff=lfs merge=lfs -text
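The two added entries look like the output of `git lfs track` on the new stats files; note that the pre-existing `*.json` rule already matches them, so the explicit per-file patterns are redundant but harmless. A minimal sketch (not part of the repo) showing which patterns from this diff match the new files:

```python
# Sketch: which .gitattributes LFS patterns from this commit match the new stats files.
from fnmatch import fnmatch

lfs_patterns = [
    "*.json",                        # already present before this commit
    "en_examples_with_stats.json",   # added in this commit
    "zh_examples_with_stats.json",   # added in this commit
]

for name in ("en_examples_with_stats.json", "zh_examples_with_stats.json"):
    # both files already matched "*.json"; the explicit entries are redundant
    print(name, [p for p in lfs_patterns if fnmatch(name, p)])
```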
app.py CHANGED
@@ -15,7 +15,13 @@ import matplotlib.pyplot as plt
 
 class Visualization:
     def __init__(
-        self, path_instructions, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
+        self,
+        path_instructions,
+        path_data,
+        lang,
+        num_docs,
+        num_docs_for_words,
+        max_len_text_display,
     ):
         self.path_instructions = path_instructions
         self.path_data = path_data
@@ -25,17 +31,25 @@ class Visualization:
         self.max_len_text_display = max_len_text_display
 
     def preamble(self):
-        st.markdown("Before diving into this demo, you might want to take a look at how the filtering pipeline of OSCAR looks like in more detail.")
+        st.markdown(
+            "Before diving into this demo, you might want to take a look at how the filtering pipeline of OSCAR looks like in more detail."
+        )
 
-        def get_binary_file_downloader_html(bin_file, file_label='File'):
-            with open(bin_file, 'rb') as f:
+        def get_binary_file_downloader_html(bin_file, file_label="File"):
+            with open(bin_file, "rb") as f:
                 data = f.read()
             bin_str = base64.b64encode(data).decode()
             href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
             return href
 
-        st.markdown(get_binary_file_downloader_html(self.path_instructions, "Download the filtering pipeline of OSCAR as pdf"), unsafe_allow_html=True)
-
+        st.markdown(
+            get_binary_file_downloader_html(
+                self.path_instructions,
+                "Download the filtering pipeline of OSCAR as pdf",
+            ),
+            unsafe_allow_html=True,
+        )
+
     def open_data(self):
         with open(self.path_data) as json_file:
             data = json.load(json_file)
@@ -43,13 +57,17 @@ class Visualization:
         self.num_docs = min(self.num_docs, len(data))
         self.num_docs_for_words = min(self.num_docs_for_words, len(data))
 
-        words = [doc["words"] for doc in data[: self.num_docs_for_words]]
-        words = [word for doc in words for word in doc]
-        self.words = pd.DataFrame(words)
+        if "words" in data[0]:
+            words = [doc["words"] for doc in data[: self.num_docs_for_words]]
+            words = [word for doc in words for word in doc]
+            self.words = pd.DataFrame(words)
+        else:
+            self.words = None
 
         docs = data[: self.num_docs]
         for doc in docs:
-            del doc["words"]
+            if not (self.words is None):
+                del doc["words"]
             if len(doc["text"]) > self.max_len_text_display:
                 doc["text"] = (
                     doc["text"][: self.max_len_text_display]
@@ -179,82 +197,103 @@ class Visualization:
             "Click on a column to sort by it, place the cursor on the text to display it."
         )
         st.dataframe(displayed_docs)
-
+
         display_dataset(np.invert(all_conds), "Discarded documents")
 
-        #st.subheader("Display discarded documents by filter")
-        display_discarded_documents_by_filter = st.checkbox("Display discarded documents by filter")
+        # st.subheader("Display discarded documents by filter")
+        display_discarded_documents_by_filter = st.checkbox(
+            "Display discarded documents by filter"
+        )
 
         if display_discarded_documents_by_filter:
             columns = list(self.docs)
 
             if "number_words" in columns:
                 cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the number of words")
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the number of words",
+                )
 
             if "special_characters_ratio" in columns:
-                cond_filter = np.invert(np.all(conds["special_characters_ratio"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the special characters ratio")
+                cond_filter = np.invert(
+                    np.all(conds["special_characters_ratio"], axis=0)
+                )
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the special characters ratio",
+                )
 
             if "stopwords_ratio" in columns:
                 cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the stop words ratio")
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the stop words ratio",
+                )
 
             if "badwords_ratio" in columns:
                 cond_filter = np.invert(np.all(conds["badwords_ratio"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the bad words ratio")
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the bad words ratio",
+                )
 
             if "lang_id_score" in columns:
                 cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the language identification confidence score")
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the language identification confidence score",
+                )
 
             if "perplexity_score" in columns:
                 cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                display_dataset(cond_filter, "Discarded documents for the filter on the perplexity score")
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the perplexity score",
+                )
 
         display_dataset(all_conds, "Retained documents")
 
     def filtering_of_words(self):
-        st.sidebar.subheader("Parameter of the filtering on words")
+        if not (self.words is None):
+            st.sidebar.subheader("Parameter of the filtering on words")
 
-        cutoff_def = (
-            "If the length of a word is higher than this number, the word is removed."
-        )
-        max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
-        cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
+            cutoff_def = "If the length of a word is higher than this number, the word is removed."
+            max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
+            cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
 
-        incorrect_substrings = st.sidebar.checkbox(
-            "Remove words with incorrect substrings."
-        )
+            incorrect_substrings = st.sidebar.checkbox(
+                "Remove words with incorrect substrings."
+            )
 
-        cond_words = self.words["len_word"] <= cutoff_word
-        if incorrect_substrings:
-            cond_words = cond_words & np.invert(self.words["incorrect_substring"])
+            cond_words = self.words["len_word"] <= cutoff_word
+            if incorrect_substrings:
+                cond_words = cond_words & np.invert(self.words["incorrect_substring"])
 
-        st.header("Filtering on words")
+            st.header("Filtering on words")
 
-        st.markdown(
-            f"Since the number of words is way larger than the number of documents, "
-            f"we consider in this section words for the first {self.num_docs_for_words} documents only."
-        )
+            st.markdown(
+                f"Since the number of words is way larger than the number of documents, "
+                f"we consider in this section words for the first {self.num_docs_for_words} documents only."
+            )
 
-        discarded_words = self.words.loc[np.invert(cond_words)]
-        st.subheader(
-            f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
-        )
-        st.markdown(
-            "Click on a column to sort by it, place the cursor on the text to display it."
-        )
-        st.dataframe(discarded_words)
+            discarded_words = self.words.loc[np.invert(cond_words)]
+            st.subheader(
+                f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
+            )
+            st.markdown(
+                "Click on a column to sort by it, place the cursor on the text to display it."
+            )
+            st.dataframe(discarded_words)
 
-        retained_words = self.words.loc[cond_words]
-        st.subheader(
-            f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
-        )
-        st.markdown(
-            "Click on a column to sort by it, place the cursor on the text to display it."
-        )
-        st.dataframe(retained_words)
+            retained_words = self.words.loc[cond_words]
+            st.subheader(
+                f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
+            )
+            st.markdown(
+                "Click on a column to sort by it, place the cursor on the text to display it."
+            )
+            st.dataframe(retained_words)
 
     def plot_distributions_filtering_parameters(self):
         st.header("Distributions of the filtering parameters")
@@ -276,27 +315,29 @@ class Visualization:
         for key in list({el[0]: None for el in self.keys}):
             plot_hist(self.docs, key)
 
-        plot_hist(self.words, "len_word")
+        if not (self.words is None):
+            plot_hist(self.words, "len_word")
 
     def plot_zipf_law(self):
-        st.header("Zipf's Law")
+        if not (self.words is None):
+            st.header("Zipf's Law")
 
-        display_zipf_law = st.checkbox("Display Zipf's Law")
+            display_zipf_law = st.checkbox("Display Zipf's Law")
 
-        if display_zipf_law:
+            if display_zipf_law:
 
-            freq_words = {}
-            for _, row in self.words.iterrows():
-                freq_words[row["word"]] = freq_words.get(row["word"], 0) + 1
-            freq_words = np.array(list(freq_words.values()))
-            freq_words = -np.sort(-freq_words)
+                freq_words = {}
+                for _, row in self.words.iterrows():
+                    freq_words[row["word"]] = freq_words.get(row["word"], 0) + 1
+                freq_words = np.array(list(freq_words.values()))
+                freq_words = -np.sort(-freq_words)
 
-            fig, ax = plt.subplots()
-            ax.loglog(freq_words)
-            ax.set_title("Zipf's Law")
-            ax.set_xlabel("$i$-th most frequent word")
-            ax.set_ylabel("frequency in the documents")
-            st.pyplot(fig)
+                fig, ax = plt.subplots()
+                ax.loglog(freq_words)
+                ax.set_title("Zipf's Law")
+                ax.set_xlabel("$i$-th most frequent word")
+                ax.set_ylabel("frequency in the documents")
+                st.pyplot(fig)
 
     def download_data(self):
         st.header("Download data")
@@ -320,13 +361,18 @@ class Visualization:
 
 
 path_instructions = "./filtering_pipeline_oscar.pdf"
-path_data = "./en_examples_with_stats.json"
-lang = "English"
+path_data = "./zh_examples_with_stats.json"
+lang = "Chinese"
 num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000
 
 visualization = Visualization(
-    path_instructions, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
+    path_instructions,
+    path_data,
+    lang,
+    num_docs,
+    num_docs_for_words,
+    max_len_text_display,
 )
 visualization.visualization()
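Beyond repointing the demo at the Chinese data and reformatting with black-style line breaks, the substantive change in app.py is that word-level statistics become optional: when the loaded documents carry no "words" field (as the Chinese stats file apparently does not), self.words is set to None and the word-filtering section, the word-length histogram, and the Zipf's-law plot are skipped. A minimal sketch of that guard, with a toy data shape assumed from the diff (the real per-word field names may differ):

```python
# Sketch of the optional word-level statistics introduced in this commit.
import pandas as pd

docs_en = [{"text": "a short document", "words": [{"word": "short", "len_word": 5}]}]
docs_zh = [{"text": "一个简短的文件"}]  # no per-word statistics in the record

def build_words_dataframe(data, num_docs_for_words=500):
    # Mirrors Visualization.open_data after this commit: only build the
    # word-level DataFrame when the first document carries word statistics,
    # otherwise return None so the word-level UI sections can be skipped.
    if "words" in data[0]:
        words = [doc["words"] for doc in data[:num_docs_for_words]]
        words = [word for doc in words for word in doc]
        return pd.DataFrame(words)
    return None

print(build_words_dataframe(docs_en))  # DataFrame with "word" / "len_word" columns
print(build_words_dataframe(docs_zh))  # None -> word filtering sections are not rendered
```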
en_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2325873414309a7ea67d2753202207a2773319dc40f338c0a0fc7bb703463a6
+size 713107133
zh_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:438a5bb757c23581784946f345a99ab11b77c43f57a3cbf18148c197ec4ef741
+size 193517532
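Both stats files are committed as Git LFS pointers rather than raw JSON: the three key-value lines (version, oid, size) are the entire checked-in content, and the ~713 MB English and ~193 MB Chinese payloads are fetched through git-lfs. A small illustrative sketch, not part of the app, for reading such a pointer file:

```python
# Sketch: parse a Git LFS pointer file like the two added in this commit.
# Each pointer is just "key value" lines: version, oid, size.
def read_lfs_pointer(path):
    with open(path) as f:
        return dict(line.strip().split(" ", 1) for line in f if line.strip())

pointer = read_lfs_pointer("zh_examples_with_stats.json")
print(pointer["oid"])   # sha256:438a5bb757c2... (as in the committed pointer)
print(pointer["size"])  # 193517532 bytes
```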