HugoLaurencon committed
Commit 0610f9d · 1 parent: 0319ee2

visualization: choose between several languages

Files changed (4)
  1. app.py +115 -91
  2. zh.arpa.bin +3 -0
  3. zh.sp.model +3 -0
  4. zh_examples_with_stats.json +3 -0
app.py CHANGED
@@ -16,12 +16,12 @@ import numpy as np
 import matplotlib.pyplot as plt
 
 from filtering import LoadParameters, ModifyingDocuments, Filtering
+from languages_id import langs_id
 
 
-class Visualization:
+class Visualization_for_lang:
     def __init__(
         self,
-        path_instructions,
         path_data,
         lang,
         num_docs,
@@ -32,7 +32,6 @@ class Visualization:
         path_sentencepiece_model,
         path_kenlm_model,
     ):
-        self.path_instructions = path_instructions
        self.path_data = path_data
        self.lang = lang
        self.num_docs = num_docs
@@ -56,32 +55,8 @@ class Visualization:
            lang_dataset_id, path_kenlm_model
        )
 
-    def warning_preamble(self):
-        st.markdown(
-            "This demo can be a little slow, and only allows you to process up to 5000 documents "
-            "for a decent speed. If you want to display up to three times more documents and have "
-            "a faster visualization, we invite you to run this "
-            "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) "
-            "on your computer."
-        )
-
-    def preamble(self):
-        def get_binary_file_downloader_html(bin_file, file_label="File"):
-            with open(bin_file, "rb") as f:
-                data = f.read()
-            bin_str = base64.b64encode(data).decode()
-            href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
-            return href
-
-        st.markdown(
-            "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this "
-            + get_binary_file_downloader_html(
-                self.path_instructions,
-                "pdf",
-            )
-            + ".",
-            unsafe_allow_html=True,
-        )
+    def set_title(self):
+        st.title(f"Filtering visualization for {self.lang}")
 
     def open_data(self):
         with open(self.path_data) as json_file:
@@ -109,9 +84,6 @@ class Visualization:
        self.docs_checkpoint = pd.DataFrame(docs)
        self.docs = self.docs_checkpoint
 
-    def set_title(self):
-        st.title(f"Filtering visualization")
-
    @staticmethod
    def print_discarded_by_cond(cond):
        st.caption(
@@ -169,9 +141,9 @@ class Visualization:
            )
            new_key = ("number_words", cutoff_min_number_words, False)
            keys.append(new_key)
-            Visualization.plot_hist(self.docs, new_key)
+            Visualization_for_lang.plot_hist(self.docs, new_key)
            cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
-            Visualization.print_discarded_by_cond(cond_1)
+            Visualization_for_lang.print_discarded_by_cond(cond_1)
 
            cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
            cutoff_max_number_words = st.slider(
@@ -180,7 +152,7 @@ class Visualization:
            new_key = ("number_words", cutoff_max_number_words, True)
            keys.append(new_key)
            cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
-            Visualization.print_discarded_by_cond(cond_2)
+            Visualization_for_lang.print_discarded_by_cond(cond_2)
 
            conds["number_words"] = [cond_1, cond_2]
 
@@ -226,9 +198,9 @@ class Visualization:
                repetitions_length,
            )
            keys.append(new_key)
-            Visualization.plot_hist(self.docs, new_key)
+            Visualization_for_lang.plot_hist(self.docs, new_key)
            cond = get_cond(new_key[0], new_key[1], new_key[2])
-            Visualization.print_discarded_by_cond(cond)
+            Visualization_for_lang.print_discarded_by_cond(cond)
            conds["repetitions_ratio"] = [cond]
 
        if "special_characters_ratio" in columns:
@@ -243,9 +215,9 @@ class Visualization:
                True,
            )
            keys.append(new_key)
-            Visualization.plot_hist(self.docs, new_key)
+            Visualization_for_lang.plot_hist(self.docs, new_key)
            cond = get_cond(new_key[0], new_key[1], new_key[2])
-            Visualization.print_discarded_by_cond(cond)
+            Visualization_for_lang.print_discarded_by_cond(cond)
            conds["special_characters_ratio"] = [cond]
 
        if "stopwords_ratio" in columns:
@@ -279,9 +251,9 @@ class Visualization:
            )
            new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
            keys.append(new_key)
-            Visualization.plot_hist(self.docs, new_key)
+            Visualization_for_lang.plot_hist(self.docs, new_key)
            cond = get_cond(new_key[0], new_key[1], new_key[2])
-            Visualization.print_discarded_by_cond(cond)
+            Visualization_for_lang.print_discarded_by_cond(cond)
            conds["stopwords_ratio"] = [cond]
 
        if "flagged_words_ratio" in columns:
@@ -316,9 +288,9 @@ class Visualization:
            )
            new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
            keys.append(new_key)
-            Visualization.plot_hist(self.docs, new_key)
+            Visualization_for_lang.plot_hist(self.docs, new_key)
            cond = get_cond(new_key[0], new_key[1], new_key[2])
-            Visualization.print_discarded_by_cond(cond)
+            Visualization_for_lang.print_discarded_by_cond(cond)
            conds["flagged_words_ratio"] = [cond]
 
        if "lang_id_score" in columns:
@@ -329,9 +301,9 @@ class Visualization:
            )
            new_key = ("lang_id_score", cutoff_lang_id_score, False)
            keys.append(new_key)
-            Visualization.plot_hist(self.docs, new_key)
+            Visualization_for_lang.plot_hist(self.docs, new_key)
            cond = get_cond(new_key[0], new_key[1], new_key[2])
-            Visualization.print_discarded_by_cond(cond)
+            Visualization_for_lang.print_discarded_by_cond(cond)
            conds["lang_id_score"] = [cond]
 
        if "perplexity_score" in columns:
@@ -341,9 +313,9 @@ class Visualization:
            cutoff_perplexity_score = st.slider(cutoff_def, 0, max_pp, max_pp)
            new_key = ("perplexity_score", cutoff_perplexity_score, True)
            keys.append(new_key)
-            Visualization.plot_hist(self.docs, new_key)
+            Visualization_for_lang.plot_hist(self.docs, new_key)
            cond = get_cond(new_key[0], new_key[1], new_key[2])
-            Visualization.print_discarded_by_cond(cond)
+            Visualization_for_lang.print_discarded_by_cond(cond)
            conds["perplexity_score"] = [cond]
 
        return keys, conds
@@ -361,7 +333,7 @@ class Visualization:
            f"Filtering on documents, for {self.num_docs} {self.lang} documents"
        )
 
-        Visualization.display_dataset(
+        Visualization_for_lang.display_dataset(
            self.docs, np.invert(all_conds), "Discarded documents", "docs"
        )
 
@@ -375,7 +347,7 @@ class Visualization:
 
        if "number_words" in columns:
            cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                self.docs,
                cond_filter,
                "Discarded documents for the filter on the number of words",
@@ -384,7 +356,7 @@ class Visualization:
 
        if "repetitions_ratio" in columns:
            cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                self.docs,
                cond_filter,
                "Discarded documents for the filter on the repetitions ratio",
@@ -395,7 +367,7 @@ class Visualization:
            cond_filter = np.invert(
                np.all(conds["special_characters_ratio"], axis=0)
            )
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                self.docs,
                cond_filter,
                "Discarded documents for the filter on the special characters ratio",
@@ -404,7 +376,7 @@ class Visualization:
 
        if "stopwords_ratio" in columns:
            cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                self.docs,
                cond_filter,
                "Discarded documents for the filter on the stop words ratio",
@@ -415,7 +387,7 @@ class Visualization:
            cond_filter = np.invert(
                np.all(conds["flagged_words_ratio"], axis=0)
            )
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                self.docs,
                cond_filter,
                "Discarded documents for the filter on the flagged words ratio",
@@ -424,7 +396,7 @@ class Visualization:
 
        if "lang_id_score" in columns:
            cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                self.docs,
                cond_filter,
                "Discarded documents for the filter on the language identification confidence score",
@@ -433,14 +405,14 @@ class Visualization:
 
        if "perplexity_score" in columns:
            cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                self.docs,
                cond_filter,
                "Discarded documents for the filter on the perplexity score",
                "docs",
            )
 
-        Visualization.display_dataset(
+        Visualization_for_lang.display_dataset(
            self.docs, all_conds, "Retained documents", "docs"
        )
 
@@ -468,9 +440,9 @@ class Visualization:
            cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
            new_key = ("len_word", cutoff_word, True)
            self.parameters.append(new_key)
-            Visualization.plot_hist(self.words, new_key)
+            Visualization_for_lang.plot_hist(self.words, new_key)
            cond_len_words = self.words["len_word"] <= cutoff_word
-            Visualization.print_discarded_by_cond(cond_len_words)
+            Visualization_for_lang.print_discarded_by_cond(cond_len_words)
            conds_words["len_word"] = cond_len_words
 
        if "incorrect_substrings" in columns:
@@ -509,7 +481,7 @@ class Visualization:
                    for i in range(len(self.words["incorrect_substrings"]))
                ]
            )
-            Visualization.print_discarded_by_cond(cond_incorrect_substrings)
+            Visualization_for_lang.print_discarded_by_cond(cond_incorrect_substrings)
            conds_words["incorrect_substrings"] = cond_incorrect_substrings
 
        all_conds_words = np.all(list(conds_words.values()), axis=0)
@@ -526,7 +498,7 @@ class Visualization:
            f"we consider in this section words for only {self.num_docs_for_words} documents."
        )
 
-        Visualization.display_dataset(
+        Visualization_for_lang.display_dataset(
            self.words, np.invert(all_conds_words), "Discarded words", "words"
        )
 
@@ -539,7 +511,7 @@ class Visualization:
 
        if "len_word" in columns:
            cond_filter = np.invert(conds_words["len_word"])
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                self.words,
                cond_filter,
                "Discarded words for the filter on length",
@@ -548,14 +520,14 @@ class Visualization:
 
        if "incorrect_substrings" in columns:
            cond_filter = np.invert(conds_words["incorrect_substrings"])
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                self.words,
                cond_filter,
                "Discarded words for the filter on incorrect substrings",
                "words",
            )
 
-        Visualization.display_dataset(
+        Visualization_for_lang.display_dataset(
            self.words, all_conds_words, "Retained words", "words"
        )
 
@@ -709,40 +681,92 @@ class Visualization:
            f"With the current filtering parameters, this document **is {is_discarded}discarded**."
        )
 
-    def visualization(self):
-        self.warning_preamble()
-        self.preamble()
-        self.open_data()
+    def visualization_for_lang(self):
        self.set_title()
+        self.open_data()
        self.filtering_of_docs()
        self.filtering_of_words()
        self.download_parameters()
        self.analyse_personal_doc()
 
 
+class Visualization:
+    def __init__(self, path_instructions, param_visu_langs):
+        self.path_instructions = path_instructions
+        self.param_visu_langs = param_visu_langs
+
+    def preamble(self):
+        def get_binary_file_downloader_html(bin_file, file_label="File"):
+            with open(bin_file, "rb") as f:
+                data = f.read()
+            bin_str = base64.b64encode(data).decode()
+            href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
+            return href
+
+        st.markdown(
+            "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this "
+            + get_binary_file_downloader_html(
+                self.path_instructions,
+                "pdf",
+            )
+            + ".",
+            unsafe_allow_html=True,
+        )
+
+    def warning_preamble(self):
+        st.markdown(
+            "This demo can be a little slow, and only allows you to process up to 5000 documents "
+            "for a decent speed. If you want to display up to three times more documents and have "
+            "a faster visualization, we invite you to run this "
+            "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) "
+            "on your computer."
+        )
+
+    def choose_lang(self):
+        options = [self.param_visu_langs[lang_dataset_id]["lang"] for lang_dataset_id in self.param_visu_langs]
+        index = options.index("English") if ("English" in options) else 0
+        lang_chosen = st.selectbox(
+            label="Select the language for visualization",
+            options=options,
+            index=index,
+        )
+        if lang_chosen != "None":
+            lang_chosen_dataset_id = langs_id.loc[langs_id["lang"] == lang_chosen, "dataset_id"].iloc[0]
+            visualization_for_lang = Visualization_for_lang(
+                path_data=self.param_visu_langs[lang_chosen_dataset_id]["path_data"],
+                lang=self.param_visu_langs[lang_chosen_dataset_id]["lang"],
+                num_docs=self.param_visu_langs[lang_chosen_dataset_id]["num_docs"],
+                num_docs_for_words=self.param_visu_langs[lang_chosen_dataset_id]["num_docs_for_words"],
+                max_len_text_display=self.param_visu_langs[lang_chosen_dataset_id]["max_len_text_display"],
+                lang_dataset_id=self.param_visu_langs[lang_chosen_dataset_id]["lang_dataset_id"],
+                path_fasttext_model=self.param_visu_langs[lang_chosen_dataset_id]["path_fasttext_model"],
+                path_sentencepiece_model=self.param_visu_langs[lang_chosen_dataset_id]["path_sentencepiece_model"],
+                path_kenlm_model=self.param_visu_langs[lang_chosen_dataset_id]["path_kenlm_model"],
+            )
+            visualization_for_lang.visualization_for_lang()
+
+    def visualization(self):
+        self.preamble()
+        self.warning_preamble()
+        self.choose_lang()
+
+
 path_instructions = "./explanation_filtering_pipeline.pdf"
-path_data = "./en_examples_with_stats.json"
-lang = "English"
-num_docs = 5000
-num_docs_for_words = 500
-max_len_text_display = 10000
-
-# Only useful for analyse_personal_doc
-lang_dataset_id = "en"
-path_fasttext_model = "./lid.176.bin"
-path_sentencepiece_model = "./en.sp.model"
-path_kenlm_model = "./en.arpa.bin"
-
-visualization = Visualization(
-    path_instructions,
-    path_data,
-    lang,
-    num_docs,
-    num_docs_for_words,
-    max_len_text_display,
-    lang_dataset_id,
-    path_fasttext_model,
-    path_sentencepiece_model,
-    path_kenlm_model,
-)
+
+param_visu_langs = {
+    lang_dataset_id: {
+        "path_data": f"./{lang_dataset_id}_examples_with_stats.json",
+        "lang": langs_id.loc[langs_id["dataset_id"] == lang_dataset_id, "lang"].iloc[0],
+        "num_docs": 5000,
+        "num_docs_for_words": 500,
+        "max_len_text_display": 10000,
+        "lang_dataset_id": lang_dataset_id,
+        "path_fasttext_model": "./lid.176.bin",
+        "path_sentencepiece_model": f"./{lang_dataset_id}.sp.model",
+        "path_kenlm_model": f"./{lang_dataset_id}.arpa.bin",
    }
+    for lang_dataset_id in ["en", "zh"]
+}
+
+visualization = Visualization(path_instructions, param_visu_langs)
 visualization.visualization()
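
The language-selection flow added above hinges on langs_id from languages_id. Going by the two .loc lookups in the diff, it behaves like a pandas DataFrame with at least "lang" and "dataset_id" columns. The sketch below is a minimal stand-in, not the repo's actual table (the "Chinese" display name is an assumption), showing the two lookups that param_visu_langs and choose_lang rely on.

import pandas as pd

# Minimal stand-in for languages_id.langs_id; the real table lives in the repo.
langs_id = pd.DataFrame(
    [
        {"lang": "English", "dataset_id": "en"},
        {"lang": "Chinese", "dataset_id": "zh"},  # display name assumed for illustration
    ]
)

# dataset id -> display name, as in the param_visu_langs dict comprehension
lang = langs_id.loc[langs_id["dataset_id"] == "zh", "lang"].iloc[0]

# display name -> dataset id, as in choose_lang after st.selectbox
dataset_id = langs_id.loc[langs_id["lang"] == lang, "dataset_id"].iloc[0]
assert dataset_id == "zh"

With this structure, supporting a third language amounts to appending its dataset id to the ["en", "zh"] list and shipping the matching <id>_examples_with_stats.json, <id>.sp.model, and <id>.arpa.bin files, exactly as this commit does for Chinese below.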
zh.arpa.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f157d94cb2828bbb44b5dddf38e7eb7f62a47d317917646a73fe2af50a3dad68
+size 3392018416
zh.sp.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b30b883dfac9927edeb1fba8894ebc8ca4452aa3e26fb4ff3ff0e653ba011db7
+size 1366946
zh_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90ffaf5e5c7b556587c8b2b97ad49c752bea5608d5cc56b7ea03fb0d96a71fd2
+size 62914634
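
The three zh.* files are Git LFS pointers: the commit records only a hash and a byte size, while the binaries themselves (a ~3.4 GB KenLM model, a SentencePiece model, and the examples JSON) sit in LFS storage. A hedged sketch of fetching one of them with huggingface_hub follows; the repo_id is a placeholder for whichever Space hosts this app, not something stated in this commit.

from huggingface_hub import hf_hub_download

# Placeholder repo_id: substitute the Space that actually hosts this app.
path = hf_hub_download(
    repo_id="<namespace>/<space-name>",
    repo_type="space",
    filename="zh.arpa.bin",  # ~3.4 GB per the pointer's size field
)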