Spaces:
Running
Running
Change the backgrounds of collapsible sample text
Browse files
web.py
CHANGED
@@ -297,8 +297,9 @@ def web_data():
|
|
297 |
Summary("Text Extraction Examples"),
|
298 |
DV2("data/sample_wet.json", "data/sample_warc.json", 3),
|
299 |
style="""
|
300 |
-
background-color: #
|
301 |
-
|
|
|
302 |
border-radius: 12px;
|
303 |
""", #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
|
304 |
),
|
@@ -314,6 +315,11 @@ def web_data():
|
|
314 |
Details(
|
315 |
Summary("Non-English Documents"),
|
316 |
DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
|
|
|
|
|
|
|
|
|
|
317 |
),
|
318 |
|
319 |
#DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
@@ -321,6 +327,11 @@ def web_data():
|
|
321 |
Details(
|
322 |
Summary("English Documents Scoring Lower than 0.65"),
|
323 |
DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
|
|
|
|
|
|
|
|
|
|
|
324 |
),
|
325 |
|
326 |
H3("1.3 URL Filtering"),
|
@@ -338,6 +349,11 @@ def web_data():
|
|
338 |
Details(
|
339 |
Summary("24 URL domains with more than 4k matches"),
|
340 |
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
|
|
|
|
|
|
|
|
|
|
341 |
),
|
342 |
|
343 |
P("""
|
@@ -346,6 +362,11 @@ def web_data():
|
|
346 |
Details(
|
347 |
Summary("6 url domains that are removed from the blocklist"),
|
348 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
|
|
|
|
|
|
|
|
|
|
349 |
),
|
350 |
|
351 |
Details(
|
@@ -354,7 +375,12 @@ def web_data():
|
|
354 |
"data/bad_url_doc.jsonl",
|
355 |
3,
|
356 |
"Sample documents whose urls are blocked by the refined url blocklist",
|
357 |
-
),
|
|
|
|
|
|
|
|
|
|
|
358 |
),
|
359 |
|
360 |
H3("1.3.2 Excluded High Quality Sources"),
|
@@ -368,11 +394,21 @@ def web_data():
|
|
368 |
non_web_urls,
|
369 |
"curated url domains that are excluded from our dataset",
|
370 |
),
|
|
|
|
|
|
|
|
|
|
|
371 |
),
|
372 |
|
373 |
Details(
|
374 |
Summary("Sample documents whose urls are in our curated url domain list"),
|
375 |
DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
|
|
|
|
|
|
|
|
|
|
|
376 |
),
|
377 |
|
378 |
|
@@ -401,6 +437,11 @@ def web_data():
|
|
401 |
0,
|
402 |
"Sample documents with lines that are removed by the rule of terminal punctuation",
|
403 |
),
|
|
|
|
|
|
|
|
|
|
|
404 |
),
|
405 |
|
406 |
|
@@ -422,6 +463,11 @@ def web_data():
|
|
422 |
0,
|
423 |
"Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
|
424 |
),
|
|
|
|
|
|
|
|
|
|
|
425 |
),
|
426 |
H3("2.2 Other Rules from RefinedWeb"),
|
427 |
P("""
|
@@ -440,6 +486,11 @@ def web_data():
|
|
440 |
0,
|
441 |
"Sample documents with lines that are removed by the RefinedWeb rules",
|
442 |
),
|
|
|
|
|
|
|
|
|
|
|
443 |
),
|
444 |
H3("2.3 Toxic Lines"),
|
445 |
P("""
|
@@ -455,6 +506,11 @@ def web_data():
|
|
455 |
json.load(open("data/toxic_lines.json")),
|
456 |
"Sample documents with toxic lines",
|
457 |
),
|
|
|
|
|
|
|
|
|
|
|
458 |
),
|
459 |
|
460 |
H2("3. Document-Level Filtering"),
|
@@ -467,6 +523,11 @@ def web_data():
|
|
467 |
json.load(open("data/all_signals.json")),
|
468 |
"Overview of all the quality signals that are used for filtering",
|
469 |
),
|
|
|
|
|
|
|
|
|
|
|
470 |
),
|
471 |
P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
|
472 |
Most quality signals were initially introduced by Gopher [2] and subsequently adopted by later
|
@@ -505,6 +566,11 @@ def web_data():
|
|
505 |
len(line) * count for line, count in line_counts.items() if count > 1
|
506 |
) / max(character_count, 1)
|
507 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
508 |
),
|
509 |
Details(
|
510 |
Summary("Implementations from DataTrove"),
|
@@ -539,6 +605,11 @@ def web_data():
|
|
539 |
if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
|
540 |
return False, "dup_line_char_frac"
|
541 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
542 |
),
|
543 |
P("""
|
544 |
After evaluating the implementations of Dolma and DataTrove (note: RedPajama V2 does not implement these two quality
|
@@ -580,6 +651,11 @@ def web_data():
|
|
580 |
sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
|
581 |
line_counts.items() if count > 1) / character_count
|
582 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
583 |
),
|
584 |
Details(
|
585 |
Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
|
@@ -588,6 +664,11 @@ def web_data():
|
|
588 |
0,
|
589 |
"Sample documents filtered by excessive line repetitions / characters in repeated lines",
|
590 |
),
|
|
|
|
|
|
|
|
|
|
|
591 |
),
|
592 |
H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
|
593 |
P("""
|
@@ -611,6 +692,11 @@ def web_data():
|
|
611 |
value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
|
612 |
attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
|
613 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
614 |
),
|
615 |
Details(
|
616 |
Summary("Implementations from RedPajama-V2"),
|
@@ -649,6 +735,11 @@ def web_data():
|
|
649 |
score = round(score, PRECISION)
|
650 |
return [(0, len(document), score)]
|
651 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
652 |
),
|
653 |
|
654 |
Details(
|
@@ -672,6 +763,11 @@ def web_data():
|
|
672 |
if top_char_length / len(text) > n_frac:
|
673 |
return False, f"top_n_gram"
|
674 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
675 |
),
|
676 |
P("""
|
677 |
There are almost no contradictions between the implementations of fractions of characters in the most common
|
@@ -699,6 +795,11 @@ def web_data():
|
|
699 |
value = count * sum(len(w) for w in most_common_ngram) / character_count
|
700 |
attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
|
701 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
702 |
),
|
703 |
Details(
|
704 |
Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
|
@@ -707,6 +808,11 @@ def web_data():
|
|
707 |
0,
|
708 |
"Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
|
709 |
),
|
|
|
|
|
|
|
|
|
|
|
710 |
),
|
711 |
H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
|
712 |
P("""
|
@@ -733,6 +839,11 @@ def web_data():
|
|
733 |
) / max(ng_char_count, 1)
|
734 |
attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
|
735 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
736 |
),
|
737 |
Details(
|
738 |
Summary("Implementations from RedPajama-V2"),
|
@@ -786,6 +897,11 @@ def web_data():
|
|
786 |
score = round(score, PRECISION)
|
787 |
return [(0, len(document), score)]
|
788 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
789 |
),
|
790 |
|
791 |
Details(
|
@@ -811,6 +927,11 @@ def web_data():
|
|
811 |
if n_duplicates_char / len(text) > n_frac:
|
812 |
return False, f"duplicated_n_grams"
|
813 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
814 |
),
|
815 |
P("""
|
816 |
For the computation of fraction of characters in duplicate n-gram, Dolma uses the number of characters in all
|
@@ -864,6 +985,11 @@ def web_data():
|
|
864 |
score = get_dup_ngram_frac(n, ngram_counts, text)
|
865 |
attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
|
866 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
867 |
),
|
868 |
Details(
|
869 |
Summary("An example to show the difference between above implementations"),
|
@@ -878,6 +1004,11 @@ def web_data():
|
|
878 |
|
879 |
In our implementation, there are 17*6 characters in total with 10*6 characters that are duplicated after excluding the first occurrence. This results in a fraction of 10/17.
|
880 |
"""),
|
|
|
|
|
|
|
|
|
|
|
881 |
),
|
882 |
H5(
|
883 |
"Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
|
@@ -889,6 +1020,11 @@ def web_data():
|
|
889 |
0,
|
890 |
"Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
|
891 |
),
|
|
|
|
|
|
|
|
|
|
|
892 |
),
|
893 |
H3("3.2 Line-wise Heuristics"),
|
894 |
P("""
|
@@ -915,6 +1051,11 @@ def web_data():
|
|
915 |
D_code("""
|
916 |
ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
|
917 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
918 |
),
|
919 |
Details(
|
920 |
Summary("Bullet Point Identification Implementations"),
|
@@ -959,6 +1100,11 @@ def web_data():
|
|
959 |
"*", # * star
|
960 |
)
|
961 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
962 |
),
|
963 |
|
964 |
|
@@ -969,6 +1115,11 @@ def web_data():
|
|
969 |
0,
|
970 |
"Sample documents that are filtered out by line-wise heuristics",
|
971 |
),
|
|
|
|
|
|
|
|
|
|
|
972 |
),
|
973 |
|
974 |
H3("3.3 Statistics-based Heuristics"),
|
@@ -1029,6 +1180,11 @@ def web_data():
|
|
1029 |
text = unicodedata.normalize("NFD", text)
|
1030 |
return text
|
1031 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1032 |
),
|
1033 |
|
1034 |
Details(
|
@@ -1040,6 +1196,11 @@ def web_data():
|
|
1040 |
non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
|
1041 |
n_non_symbol_words_words = len(non_symbol_words)
|
1042 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1043 |
),
|
1044 |
P("""
|
1045 |
Both Dolma and RedPajama V2 split texts into words using white spaces and newline symbols. However,
|
@@ -1084,6 +1245,11 @@ def web_data():
|
|
1084 |
score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
|
1085 |
return [(0, len(document), score)]
|
1086 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1087 |
),
|
1088 |
P("""
|
1089 |
However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
|
@@ -1100,6 +1266,11 @@ def web_data():
|
|
1100 |
...
|
1101 |
attrs.num_of_sentences = count_sentences(text)
|
1102 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1103 |
),
|
1104 |
|
1105 |
H3("Symbol to Word Ratio"),
|
@@ -1116,6 +1287,11 @@ def web_data():
|
|
1116 |
word_count, 1
|
1117 |
)
|
1118 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1119 |
),
|
1120 |
Details(
|
1121 |
Summary("Implementations from RedPajama-V2"),
|
@@ -1142,6 +1318,11 @@ def web_data():
|
|
1142 |
score = round(score, PRECISION)
|
1143 |
return [(0, len(document), score)]
|
1144 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1145 |
),
|
1146 |
|
1147 |
Details(
|
@@ -1152,6 +1333,11 @@ def web_data():
|
|
1152 |
if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
|
1153 |
return False, "gopher_too_many_ellipsis"
|
1154 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1155 |
),
|
1156 |
Details(
|
1157 |
Summary("TxT360 Implementation"),
|
@@ -1162,6 +1348,11 @@ def web_data():
|
|
1162 |
...
|
1163 |
attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
|
1164 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1165 |
),
|
1166 |
|
1167 |
H3("Fraction of Alphabetic Words"),
|
@@ -1172,6 +1363,11 @@ def web_data():
|
|
1172 |
1 for word in words if any(c.isalpha() for c in word)
|
1173 |
) / max(word_count, 1)
|
1174 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1175 |
),
|
1176 |
Details(
|
1177 |
Summary("Implementations from RedPajama-V2"),
|
@@ -1196,6 +1392,11 @@ def web_data():
|
|
1196 |
score = round(score, PRECISION)
|
1197 |
return [(0, len(document), score)]
|
1198 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1199 |
),
|
1200 |
Details(
|
1201 |
Summary("Implementations from DataTrove"),
|
@@ -1207,6 +1408,11 @@ def web_data():
|
|
1207 |
):
|
1208 |
return False, "gopher_below_alpha_threshold"
|
1209 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
1210 |
),
|
1211 |
P("""
|
1212 |
Both Dolma and DataTrove use `char.isalpha()` to detect whether a word contains alphabetic characters while
|
@@ -1233,6 +1439,11 @@ def web_data():
|
|
1233 |
0,
|
1234 |
"Sample documents that are filtered out by statistics-based heuristics",
|
1235 |
),
|
|
|
|
|
|
|
|
|
|
|
1236 |
),
|
1237 |
H3("3.4 Others"),
|
1238 |
P("""
|
@@ -1243,6 +1454,11 @@ def web_data():
|
|
1243 |
Details(
|
1244 |
Summary("Sample documents containing 'lorem ipsum'"),
|
1245 |
DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
|
|
|
|
|
|
|
|
|
|
|
1246 |
),
|
1247 |
H2("4. Deduplication"),
|
1248 |
P("""
|
|
|
297 |
Summary("Text Extraction Examples"),
|
298 |
DV2("data/sample_wet.json", "data/sample_warc.json", 3),
|
299 |
style="""
|
300 |
+
background-color: #F0F8FF; /* Light blue background */
|
301 |
+
padding: 15px;
|
302 |
+
# border: 1px solid #949494; /* Grey border */
|
303 |
border-radius: 12px;
|
304 |
""", #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
|
305 |
),
|
|
|
315 |
Details(
|
316 |
Summary("Non-English Documents"),
|
317 |
DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
318 |
+
style="""
|
319 |
+
background-color: #FFC0CB; /* Light pink background */
|
320 |
+
padding: 15px;
|
321 |
+
border-radius: 12px;
|
322 |
+
""",
|
323 |
),
|
324 |
|
325 |
#DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
|
|
327 |
Details(
|
328 |
Summary("English Documents Scoring Lower than 0.65"),
|
329 |
DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
|
330 |
+
style="""
|
331 |
+
background-color: #EAFFF1; /* Light green background */
|
332 |
+
padding: 15px;
|
333 |
+
border-radius: 12px;
|
334 |
+
""",
|
335 |
),
|
336 |
|
337 |
H3("1.3 URL Filtering"),
|
|
|
349 |
Details(
|
350 |
Summary("24 URL domains with more than 4k matches"),
|
351 |
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
352 |
+
style="""
|
353 |
+
background-color: #FFC0CB; /* Light pink background */
|
354 |
+
padding: 15px;
|
355 |
+
border-radius: 12px;
|
356 |
+
""",
|
357 |
),
|
358 |
|
359 |
P("""
|
|
|
362 |
Details(
|
363 |
Summary("6 url domains that are removed from the blocklist"),
|
364 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
365 |
+
style="""
|
366 |
+
background-color: #FFC0CB; /* Light pink background */
|
367 |
+
padding: 15px;
|
368 |
+
border-radius: 12px;
|
369 |
+
""",
|
370 |
),
|
371 |
|
372 |
Details(
|
|
|
375 |
"data/bad_url_doc.jsonl",
|
376 |
3,
|
377 |
"Sample documents whose urls are blocked by the refined url blocklist",
|
378 |
+
),
|
379 |
+
style="""
|
380 |
+
background-color: #FFC0CB; /* Light pink background */
|
381 |
+
padding: 15px;
|
382 |
+
border-radius: 12px;
|
383 |
+
""",
|
384 |
),
|
385 |
|
386 |
H3("1.3.2 Excluded High Quality Sources"),
|
|
|
394 |
non_web_urls,
|
395 |
"curated url domains that are excluded from our dataset",
|
396 |
),
|
397 |
+
style="""
|
398 |
+
background-color: #FFC0CB; /* Light pink background */
|
399 |
+
padding: 15px;
|
400 |
+
border-radius: 12px;
|
401 |
+
""",
|
402 |
),
|
403 |
|
404 |
Details(
|
405 |
Summary("Sample documents whose urls are in our curated url domain list"),
|
406 |
DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
|
407 |
+
style="""
|
408 |
+
background-color: #EAFFF1; /* Light green background */
|
409 |
+
padding: 15px;
|
410 |
+
border-radius: 12px;
|
411 |
+
""",
|
412 |
),
|
413 |
|
414 |
|
|
|
437 |
0,
|
438 |
"Sample documents with lines that are removed by the rule of terminal punctuation",
|
439 |
),
|
440 |
+
style="""
|
441 |
+
background-color: #FFC0CB; /* Light pink background */
|
442 |
+
padding: 15px;
|
443 |
+
border-radius: 12px;
|
444 |
+
""",
|
445 |
),
|
446 |
|
447 |
|
|
|
463 |
0,
|
464 |
"Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
|
465 |
),
|
466 |
+
style="""
|
467 |
+
background-color: #FFC0CB; /* Light pink background */
|
468 |
+
padding: 15px;
|
469 |
+
border-radius: 12px;
|
470 |
+
""",
|
471 |
),
|
472 |
H3("2.2 Other Rules from RefinedWeb"),
|
473 |
P("""
|
|
|
486 |
0,
|
487 |
"Sample documents with lines that are removed by the RefinedWeb rules",
|
488 |
),
|
489 |
+
style="""
|
490 |
+
background-color: #FFC0CB; /* Light pink background */
|
491 |
+
padding: 15px;
|
492 |
+
border-radius: 12px;
|
493 |
+
""",
|
494 |
),
|
495 |
H3("2.3 Toxic Lines"),
|
496 |
P("""
|
|
|
506 |
json.load(open("data/toxic_lines.json")),
|
507 |
"Sample documents with toxic lines",
|
508 |
),
|
509 |
+
style="""
|
510 |
+
background-color: #FFC0CB; /* Light pink background */
|
511 |
+
padding: 15px;
|
512 |
+
border-radius: 12px;
|
513 |
+
""",
|
514 |
),
|
515 |
|
516 |
H2("3. Document-Level Filtering"),
|
|
|
523 |
json.load(open("data/all_signals.json")),
|
524 |
"Overview of all the quality signals that are used for filtering",
|
525 |
),
|
526 |
+
style="""
|
527 |
+
background-color: #EAFFF1; /* Light green background */
|
528 |
+
padding: 15px;
|
529 |
+
border-radius: 12px;
|
530 |
+
""",
|
531 |
),
|
532 |
P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
|
533 |
Most quality signals were initially introduced by Gopher [2] and subsequently adopted by later
|
|
|
566 |
len(line) * count for line, count in line_counts.items() if count > 1
|
567 |
) / max(character_count, 1)
|
568 |
""", block="block", language="python"),
|
569 |
+
style="""
|
570 |
+
background-color: #FFFAEA; /* Light yellow background */
|
571 |
+
padding: 15px;
|
572 |
+
border-radius: 12px;
|
573 |
+
""",
|
574 |
),
|
575 |
Details(
|
576 |
Summary("Implementations from DataTrove"),
|
|
|
605 |
if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
|
606 |
return False, "dup_line_char_frac"
|
607 |
""", block="block", language="python"),
|
608 |
+
style="""
|
609 |
+
background-color: #FFFAEA; /* Light yellow background */
|
610 |
+
padding: 15px;
|
611 |
+
border-radius: 12px;
|
612 |
+
""",
|
613 |
),
|
614 |
P("""
|
615 |
After evaluating the implementations of Dolma and DataTrove (note: RedPajama V2 does not implement these two quality
|
|
|
651 |
sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
|
652 |
line_counts.items() if count > 1) / character_count
|
653 |
""", block="block", language="python"),
|
654 |
+
style="""
|
655 |
+
background-color: #EAFFF1; /* Light green background */
|
656 |
+
padding: 15px;
|
657 |
+
border-radius: 12px;
|
658 |
+
""",
|
659 |
),
|
660 |
Details(
|
661 |
Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
|
|
|
664 |
0,
|
665 |
"Sample documents filtered by excessive line repetitions / characters in repeated lines",
|
666 |
),
|
667 |
+
style="""
|
668 |
+
background-color: #EAFFF1; /* Light green background */
|
669 |
+
padding: 15px;
|
670 |
+
border-radius: 12px;
|
671 |
+
""",
|
672 |
),
|
673 |
H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
|
674 |
P("""
|
|
|
692 |
value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
|
693 |
attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
|
694 |
""", block="block", language="python"),
|
695 |
+
style="""
|
696 |
+
background-color: #FFFAEA; /* Light yellow background */
|
697 |
+
padding: 15px;
|
698 |
+
border-radius: 12px;
|
699 |
+
""",
|
700 |
),
|
701 |
Details(
|
702 |
Summary("Implementations from RedPajama-V2"),
|
|
|
735 |
score = round(score, PRECISION)
|
736 |
return [(0, len(document), score)]
|
737 |
""", block="block", language="python"),
|
738 |
+
style="""
|
739 |
+
background-color: #FFFAEA; /* Light yellow background */
|
740 |
+
padding: 15px;
|
741 |
+
border-radius: 12px;
|
742 |
+
""",
|
743 |
),
|
744 |
|
745 |
Details(
|
|
|
763 |
if top_char_length / len(text) > n_frac:
|
764 |
return False, f"top_n_gram"
|
765 |
""", block="block", language="python"),
|
766 |
+
style="""
|
767 |
+
background-color: #FFFAEA; /* Light yellow background */
|
768 |
+
padding: 15px;
|
769 |
+
border-radius: 12px;
|
770 |
+
""",
|
771 |
),
|
772 |
P("""
|
773 |
There are almost no contradictions between the implementations of fractions of characters in the most common
|
|
|
795 |
value = count * sum(len(w) for w in most_common_ngram) / character_count
|
796 |
attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
|
797 |
""", block="block", language="python"),
|
798 |
+
style="""
|
799 |
+
background-color: #EAFFF1; /* Light green background */
|
800 |
+
padding: 15px;
|
801 |
+
border-radius: 12px;
|
802 |
+
""",
|
803 |
),
|
804 |
Details(
|
805 |
Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
|
|
|
808 |
0,
|
809 |
"Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
|
810 |
),
|
811 |
+
style="""
|
812 |
+
background-color: #EAFFF1; /* Light green background */
|
813 |
+
padding: 15px;
|
814 |
+
border-radius: 12px;
|
815 |
+
""",
|
816 |
),
|
817 |
H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
|
818 |
P("""
|
|
|
839 |
) / max(ng_char_count, 1)
|
840 |
attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
|
841 |
""", block="block", language="python"),
|
842 |
+
style="""
|
843 |
+
background-color: #FFFAEA; /* Light yellow background */
|
844 |
+
padding: 15px;
|
845 |
+
border-radius: 12px;
|
846 |
+
""",
|
847 |
),
|
848 |
Details(
|
849 |
Summary("Implementations from RedPajama-V2"),
|
|
|
897 |
score = round(score, PRECISION)
|
898 |
return [(0, len(document), score)]
|
899 |
""", block="block", language="python"),
|
900 |
+
style="""
|
901 |
+
background-color: #FFFAEA; /* Light yellow background */
|
902 |
+
padding: 15px;
|
903 |
+
border-radius: 12px;
|
904 |
+
""",
|
905 |
),
|
906 |
|
907 |
Details(
|
|
|
927 |
if n_duplicates_char / len(text) > n_frac:
|
928 |
return False, f"duplicated_n_grams"
|
929 |
""", block="block", language="python"),
|
930 |
+
style="""
|
931 |
+
background-color: #FFFAEA; /* Light yellow background */
|
932 |
+
padding: 15px;
|
933 |
+
border-radius: 12px;
|
934 |
+
""",
|
935 |
),
|
936 |
P("""
|
937 |
For the computation of fraction of characters in duplicate n-gram, Dolma uses the number of characters in all
|
|
|
985 |
score = get_dup_ngram_frac(n, ngram_counts, text)
|
986 |
attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
|
987 |
""", block="block", language="python"),
|
988 |
+
style="""
|
989 |
+
background-color: #EAFFF1; /* Light green background */
|
990 |
+
padding: 15px;
|
991 |
+
border-radius: 12px;
|
992 |
+
""",
|
993 |
),
|
994 |
Details(
|
995 |
Summary("An example to show the difference between above implementations"),
|
|
|
1004 |
|
1005 |
In our implementation, there are 17*6 characters in total with 10*6 characters that are duplicated after excluding the first occurrence. This results in a fraction of 10/17.
|
1006 |
"""),
|
1007 |
+
style="""
|
1008 |
+
background-color: #EAFFF1; /* Light green background */
|
1009 |
+
padding: 15px;
|
1010 |
+
border-radius: 12px;
|
1011 |
+
""",
|
1012 |
),
|
1013 |
H5(
|
1014 |
"Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
|
|
|
1020 |
0,
|
1021 |
"Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
|
1022 |
),
|
1023 |
+
style="""
|
1024 |
+
background-color: #EAFFF1; /* Light green background */
|
1025 |
+
padding: 15px;
|
1026 |
+
border-radius: 12px;
|
1027 |
+
""",
|
1028 |
),
|
1029 |
H3("3.2 Line-wise Heuristics"),
|
1030 |
P("""
|
|
|
1051 |
D_code("""
|
1052 |
ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
|
1053 |
""", block="block", language="python"),
|
1054 |
+
style="""
|
1055 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1056 |
+
padding: 15px;
|
1057 |
+
border-radius: 12px;
|
1058 |
+
""",
|
1059 |
),
|
1060 |
Details(
|
1061 |
Summary("Bullet Point Identification Implementations"),
|
|
|
1100 |
"*", # * star
|
1101 |
)
|
1102 |
""", block="block", language="python"),
|
1103 |
+
style="""
|
1104 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1105 |
+
padding: 15px;
|
1106 |
+
border-radius: 12px;
|
1107 |
+
""",
|
1108 |
),
|
1109 |
|
1110 |
|
|
|
1115 |
0,
|
1116 |
"Sample documents that are filtered out by line-wise heuristics",
|
1117 |
),
|
1118 |
+
style="""
|
1119 |
+
background-color: #EAFFF1; /* Light green background */
|
1120 |
+
padding: 15px;
|
1121 |
+
border-radius: 12px;
|
1122 |
+
""",
|
1123 |
),
|
1124 |
|
1125 |
H3("3.3 Statistics-based Heuristics"),
|
|
|
1180 |
text = unicodedata.normalize("NFD", text)
|
1181 |
return text
|
1182 |
""", block="block", language="python"),
|
1183 |
+
style="""
|
1184 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1185 |
+
padding: 15px;
|
1186 |
+
border-radius: 12px;
|
1187 |
+
""",
|
1188 |
),
|
1189 |
|
1190 |
Details(
|
|
|
1196 |
non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
|
1197 |
n_non_symbol_words_words = len(non_symbol_words)
|
1198 |
""", block="block", language="python"),
|
1199 |
+
style="""
|
1200 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1201 |
+
padding: 15px;
|
1202 |
+
border-radius: 12px;
|
1203 |
+
""",
|
1204 |
),
|
1205 |
P("""
|
1206 |
Both Dolma and RedPajama V2 split texts into words using white spaces and newline symbols. However,
|
|
|
1245 |
score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
|
1246 |
return [(0, len(document), score)]
|
1247 |
""", block="block", language="python"),
|
1248 |
+
style="""
|
1249 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1250 |
+
padding: 15px;
|
1251 |
+
border-radius: 12px;
|
1252 |
+
""",
|
1253 |
),
|
1254 |
P("""
|
1255 |
However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
|
|
|
1266 |
...
|
1267 |
attrs.num_of_sentences = count_sentences(text)
|
1268 |
""", block="block", language="python"),
|
1269 |
+
style="""
|
1270 |
+
background-color: #EAFFF1; /* Light green background */
|
1271 |
+
padding: 15px;
|
1272 |
+
border-radius: 12px;
|
1273 |
+
""",
|
1274 |
),
|
1275 |
|
1276 |
H3("Symbol to Word Ratio"),
|
|
|
1287 |
word_count, 1
|
1288 |
)
|
1289 |
""", block="block", language="python"),
|
1290 |
+
style="""
|
1291 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1292 |
+
padding: 15px;
|
1293 |
+
border-radius: 12px;
|
1294 |
+
""",
|
1295 |
),
|
1296 |
Details(
|
1297 |
Summary("Implementations from RedPajama-V2"),
|
|
|
1318 |
score = round(score, PRECISION)
|
1319 |
return [(0, len(document), score)]
|
1320 |
""", block="block", language="python"),
|
1321 |
+
style="""
|
1322 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1323 |
+
padding: 15px;
|
1324 |
+
border-radius: 12px;
|
1325 |
+
""",
|
1326 |
),
|
1327 |
|
1328 |
Details(
|
|
|
1333 |
if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
|
1334 |
return False, "gopher_too_many_ellipsis"
|
1335 |
""", block="block", language="python"),
|
1336 |
+
style="""
|
1337 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1338 |
+
padding: 15px;
|
1339 |
+
border-radius: 12px;
|
1340 |
+
""",
|
1341 |
),
|
1342 |
Details(
|
1343 |
Summary("TxT360 Implementation"),
|
|
|
1348 |
...
|
1349 |
attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
|
1350 |
""", block="block", language="python"),
|
1351 |
+
style="""
|
1352 |
+
background-color: #EAFFF1; /* Light green background */
|
1353 |
+
padding: 15px;
|
1354 |
+
border-radius: 12px;
|
1355 |
+
""",
|
1356 |
),
|
1357 |
|
1358 |
H3("Fraction of Alphabetic Words"),
|
|
|
1363 |
1 for word in words if any(c.isalpha() for c in word)
|
1364 |
) / max(word_count, 1)
|
1365 |
""", block="block", language="python"),
|
1366 |
+
style="""
|
1367 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1368 |
+
padding: 15px;
|
1369 |
+
border-radius: 12px;
|
1370 |
+
""",
|
1371 |
),
|
1372 |
Details(
|
1373 |
Summary("Implementations from RedPajama-V2"),
|
|
|
1392 |
score = round(score, PRECISION)
|
1393 |
return [(0, len(document), score)]
|
1394 |
""", block="block", language="python"),
|
1395 |
+
style="""
|
1396 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1397 |
+
padding: 15px;
|
1398 |
+
border-radius: 12px;
|
1399 |
+
""",
|
1400 |
),
|
1401 |
Details(
|
1402 |
Summary("Implementations from DataTrove"),
|
|
|
1408 |
):
|
1409 |
return False, "gopher_below_alpha_threshold"
|
1410 |
""", block="block", language="python"),
|
1411 |
+
style="""
|
1412 |
+
background-color: #FFFAEA; /* Light yellow background */
|
1413 |
+
padding: 15px;
|
1414 |
+
border-radius: 12px;
|
1415 |
+
""",
|
1416 |
),
|
1417 |
P("""
|
1418 |
Both Dolma and DataTrove use `char.isalpha()` to detect whether a word contains alphabetic characters while
|
|
|
1439 |
0,
|
1440 |
"Sample documents that are filtered out by statistics-based heuristics",
|
1441 |
),
|
1442 |
+
style="""
|
1443 |
+
background-color: #EAFFF1; /* Light green background */
|
1444 |
+
padding: 15px;
|
1445 |
+
border-radius: 12px;
|
1446 |
+
""",
|
1447 |
),
|
1448 |
H3("3.4 Others"),
|
1449 |
P("""
|
|
|
1454 |
Details(
|
1455 |
Summary("Sample documents containing 'lorem ipsum'"),
|
1456 |
DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
|
1457 |
+
style="""
|
1458 |
+
background-color: #FFC0CB; /* Light pink background */
|
1459 |
+
padding: 15px;
|
1460 |
+
border-radius: 12px;
|
1461 |
+
""",
|
1462 |
),
|
1463 |
H2("4. Deduplication"),
|
1464 |
P("""
|