victormiller committed · Commit 666337a · Parent(s): ff67812
Update curated.py

curated.py CHANGED (+2 -36)
@@ -450,9 +450,6 @@ filtering_process = Div(
     P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The ", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parquet format to the jsonl format."),
     H4("Filtering"),
     P("Manual inspection of the dataset demonstrated high-quality content. Only one filter was used, removing articles with too few words: based on normal sentence constructs, an article was kept if it contained 10 or more words, and any article with fewer than 10 words was removed."),
-    H4("Local Deduplication Process"),
-    Ol(
-        Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
     ),
     table_div_wikipedia,
 ),
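For reference, a minimal sketch of the parquet-to-jsonl conversion and word-count filter described in the hunk above, assuming the English config of the Wikimedia snapshot and an illustrative output path (neither is taken from the pipeline itself):

# Illustrative sketch only; config name, split, and output file are assumptions.
from datasets import load_dataset

# Download the official Wikimedia snapshot from Hugging Face.
wiki = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")

# Keep only articles with 10 or more whitespace-separated words.
wiki = wiki.filter(lambda row: len(row["text"].split()) >= 10)

# Export the parquet-backed dataset as JSON Lines (one article per line).
wiki.to_json("wikipedia_en.jsonl", lines=True)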
@@ -470,10 +467,6 @@ filtering_process = Div(
     Li("Unigram Log Probability Filter: Documents were kept if their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the ", A("1T Web-gram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"), ". Specifically, we use the list made available by ", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"), "."),
     Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
     ),
-    H4("Local Deduplication Process"),
-    Ol(
-        Li("Local dedup was done with all papers combined."),
-    ),
     table_div_arx,
     ),
 ),
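As a rough illustration of the unigram log-probability filter above, a sketch of the computation, assuming the Kaggle word-frequency CSV (columns word, count), natural-log probabilities, and a floor value for unseen words; none of these choices are confirmed by the pipeline:

# Illustrative sketch; file name, column names, log base, and the handling of
# unseen words are assumptions rather than the pipeline's exact choices.
import csv
import math

def load_unigram_logprobs(path="unigram_freq.csv"):
    counts = {}
    with open(path, newline="") as f:
        for row in csv.DictReader(f):          # columns: word, count
            counts[row["word"]] = int(row["count"])
    total = sum(counts.values())
    return {word: math.log(c / total) for word, c in counts.items()}

LOGPROBS = load_unigram_logprobs()

def average_unigram_logprob(text, unseen=-20.0):
    # Words are obtained by splitting the text on whitespace.
    words = text.lower().split()
    if not words:
        return float("-inf")
    return sum(LOGPROBS.get(w, unseen) for w in words) / len(words)

def keep_document(text, threshold=-20.0):
    # Keep documents whose average unigram log probability is higher than -20.
    return average_unigram_logprob(text) > threshold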
@@ -554,10 +547,6 @@ filtering_process = Div(
     Ol(
         Li("Many filters were used to clean the PhilPapers documents, such as removing double whitespace, newlines, etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
     ),
-    H4("Local Deduplication Process"),
-    Ol(
-        Li("Local dedup was done with all papers combined."),
-    ),
     table_div_phil,
     ),
 ),
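A small sketch of the kind of whitespace normalisation mentioned above; the full filter set lives in the linked pdf_filter.py, and the exact patterns here are assumptions:

# Illustrative sketch; the real PhilPapers filters are in the linked pdf_filter.py.
import re

def normalise_whitespace(text):
    text = re.sub(r"[ \t]{2,}", " ", text)    # collapse double (or longer) spaces and tabs
    text = re.sub(r"\n{3,}", "\n\n", text)    # collapse long runs of blank lines
    return text.strip()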
@@ -568,10 +557,6 @@ filtering_process = Div(
     P("The original dataset was downloaded from ", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"), ". The files were converted to jsonl for filtering."),
     H4("Filtering"),
     P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS', which were removed."),
-    H4("Local Deduplication Process"),
-    Ol(
-        Li("Local dedup was done within europarl itself"),
-    ),
     table_div_up,
     ),
 ),
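A minimal sketch of the download-time filtering described above; the 200-character threshold comes from the text, while the tag-stripping regex is an assumption (EuroParl source files carry markup lines such as CHAPTER and SPEAKER tags), not the pipeline's exact rule:

# Illustrative sketch; only the 200-character threshold is taken from the text above.
import re

TAG_RE = re.compile(r"<[^>]+>")

def filter_europarl_document(text, min_chars=200):
    text = TAG_RE.sub("", text).strip()       # remove 'TAGS' (markup lines)
    return text if len(text) >= min_chars else None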
@@ -587,10 +572,6 @@ filtering_process = Div(
     Li("Minimum Word Count Filter: 10"),
     Li("Unigram Log Probability"),
     ),
-    H4("Local Deduplication Process"),
-    Ol(
-        Li("Local dedup was done within hackernews itself"),
-    ),
     table_div_hn,
     ),
 ),
@@ -605,10 +586,6 @@ filtering_process = Div(
     Li("Minimum Word Count Filter: 50"),
     Li("Unigram Log Probability"),
     ),
-    H4("Local Deduplication Process"),
-    Ol(
-        Li("Local dedup was done within USPTO itself"),
-    ),
     table_div_uspto,
     ),
 ),
@@ -660,10 +637,6 @@ filtering_process = Div(
     Ol(
         Li("Minimum Word Count Filter: 10"),
     ),
-    H4("Local Deduplication Process"),
-    Ol(
-        Li("Local dedup was done within stackexchange itself"),
-    ),
     table_div_se,
     ),
 ),
@@ -679,7 +652,8 @@ filtering_process = Div(
 
     def exclude_select_system(x):
         return '\n'.join(line for line in x.split('\n') if not (line.startswith('===')
-
+                         and any(term in line for term in
+                             ['has joined #', 'has left #', 'Topic for #', "Topic (#", "is now known as"]) ))
 
     def clean(x):
         return '\n'.join('* ' + line[4:] if line.startswith('===') else line[8:] for line in x.split('\n'))
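For readability, a worked example of the two helpers shown in the hunk above; the sample log lines are invented, and the example assumes non-system lines begin with an 8-character timestamp prefix such as "[10:02] ":

# Invented sample input; not taken from the dataset.
log = "\n".join([
    "=== alice has joined #ubuntu",
    "[10:02] <bob> try rebooting",
    "=== alice is now known as alice_",
    "[10:03] <alice_> thanks, that worked",
])

kept = exclude_select_system(log)
# Join/rename system lines are dropped, leaving only the two chat lines.

print(clean(kept))
# <bob> try rebooting
# <alice_> thanks, that worked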
@@ -690,10 +664,6 @@ filtering_process = Div(
     Li("Minimum Word Count Filter: 10"),
     Li("Unigram Log Probability"),
     ),
-    H4("Local Deduplication Process"),
-    Ol(
-        Li("Local dedup was done within Ubuntu IRC itself"),
-    ),
     table_div_uirc,
     ),
 ),
@@ -729,10 +699,6 @@ filtering_process = Div(
     Li("Minimum Word Count Filter: 20"),
     Li("Unigram Log Probability"),
     ),
-    H4("Local Deduplication Process"),
-    Ol(
-        Li("Local dedup was done within PG19 itself"),
-    ),
     table_div_pg19,
     ),
 ),