victormiller
committed on
Update curated.py
Browse files- curated.py +25 -1
curated.py
CHANGED
@@ -458,6 +458,7 @@ filtering_process = Div(
|
|
458 |
),
|
459 |
),
|
460 |
Section(
|
|
|
461 |
H3("ArXiv"),
|
462 |
H4("Download and Extraction"),
|
463 |
P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxic/src", href="s3://arxic/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
|
@@ -474,8 +475,10 @@ filtering_process = Div(
|
|
474 |
Li("Local dedup was done with all papers combined."),
|
475 |
),
|
476 |
table_div_arx,
|
|
|
477 |
),
|
478 |
Section(
|
|
|
479 |
H3("S2ORC - NEED TO MAKE S2ORC ABSTRACT AND UPDATE THIS FILTERING SECTION"),
|
480 |
H4("Download and Extraction"),
|
481 |
Ol(
|
@@ -509,8 +512,10 @@ filtering_process = Div(
|
|
509 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
|
510 |
),
|
511 |
table_div_s2o,
|
|
|
512 |
),
|
513 |
Section(
|
|
|
514 |
H3("PubMed - need to update with abstract vs central"),
|
515 |
H4("Download and Extraction"),
|
516 |
Ol(
|
@@ -538,8 +543,10 @@ filtering_process = Div(
|
|
538 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
539 |
),
|
540 |
table_div_med,
|
|
|
541 |
),
|
542 |
Section(
|
|
|
543 |
H3("Phil Papers"),
|
544 |
H4("Download and Extraction"),
|
545 |
P("Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
|
@@ -552,8 +559,10 @@ filtering_process = Div(
|
|
552 |
Li("Local dedup was done with all papers combined."),
|
553 |
),
|
554 |
table_div_phil,
|
|
|
555 |
),
|
556 |
Section(
|
|
|
557 |
H3("Europarl"),
|
558 |
H4("Download and Extraction"),
|
559 |
P("Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
|
@@ -565,7 +574,9 @@ filtering_process = Div(
|
|
565 |
),
|
566 |
table_div_up,
|
567 |
),
|
|
|
568 |
Section(
|
|
|
569 |
H3("HackerNews"),
|
570 |
H4("Download and Extraction"),
|
571 |
P("The dataset was downloaded from the HackerNews repo here:", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time."),
|
@@ -581,8 +592,10 @@ filtering_process = Div(
|
|
581 |
Li("Local dedup was done within hackernews itself"),
|
582 |
),
|
583 |
table_div_hn,
|
|
|
584 |
),
|
585 |
Section(
|
|
|
586 |
H3("USPTO"),
|
587 |
H4("Download and Extraction"),
|
588 |
P("Data was downloaded and extracted using tags from", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),". There were three different formats that needed three different functions to download and extract the data based on year: I(Pre_2002), 2002_to_2004, and post_2004."),
|
@@ -597,8 +610,10 @@ filtering_process = Div(
|
|
597 |
Li("Local dedup was done within USPTO itself"),
|
598 |
),
|
599 |
table_div_uspto,
|
|
|
600 |
),
|
601 |
Section(
|
|
|
602 |
H3("FreeLaw"),
|
603 |
H4("Download and Extraction"),
|
604 |
#P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), )#". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
|
@@ -623,8 +638,10 @@ filtering_process = Div(
|
|
623 |
Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
|
624 |
),
|
625 |
table_div_freelaw,
|
|
|
626 |
),
|
627 |
Section(
|
|
|
628 |
H3("StackExchange"),
|
629 |
H4("Download and Extraction"),
|
630 |
P("The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs including: ", A("math.stackexchange.com", href="math.stackexchange.com"),". Raw data was extracted an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
|
@@ -648,8 +665,10 @@ filtering_process = Div(
|
|
648 |
Li("Local dedup was done within stackexchange itself"),
|
649 |
),
|
650 |
table_div_se,
|
|
|
651 |
),
|
652 |
Section(
|
|
|
653 |
H3("Ubuntu IRC"),
|
654 |
H4("Download and Extraction"),
|
655 |
P("The dataset was downloaded from:", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
|
@@ -675,9 +694,11 @@ filtering_process = Div(
|
|
675 |
Li("Local dedup was done within Ubuntu IRC itself"),
|
676 |
),
|
677 |
table_div_uirc,
|
|
|
678 |
),
|
679 |
Section(
|
680 |
-
|
|
|
681 |
H4("Download and Extraction"),
|
682 |
P("The dataset was downloaded rirectly downloaded from the Huggingface repo:", A("https://huggingface.co/datasets/deepmind/math_dataset",href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to the jsonl format where lines is represented as:"),
|
683 |
D_code("""
|
@@ -692,8 +713,10 @@ filtering_process = Div(
|
|
692 |
Li("None"),
|
693 |
),
|
694 |
table_div_dmm,
|
|
|
695 |
),
|
696 |
Section(
|
|
|
697 |
H3("PG19"),
|
698 |
H4("Download and Extraction"),
|
699 |
Ol(
|
@@ -710,6 +733,7 @@ filtering_process = Div(
|
|
710 |
Li("Local dedup was done within PG19 itself"),
|
711 |
),
|
712 |
table_div_pg19,
|
|
|
713 |
),
|
714 |
)
|
715 |
|
|
|
458 |
),
|
459 |
),
|
460 |
Section(
|
461 |
+
Div(
|
462 |
H3("ArXiv"),
|
463 |
H4("Download and Extraction"),
|
464 |
P("All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxic/src", href="s3://arxic/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
|
|
|
475 |
Li("Local dedup was done with all papers combined."),
|
476 |
),
|
477 |
table_div_arx,
|
478 |
+
),
|
479 |
),
|
480 |
Section(
|
481 |
+
Div(
|
482 |
H3("S2ORC - NEED TO MAKE S2ORC ABSTRACT AND UPDATE THIS FILTERING SECTION"),
|
483 |
H4("Download and Extraction"),
|
484 |
Ol(
|
|
|
512 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
|
513 |
),
|
514 |
table_div_s2o,
|
515 |
+
),
|
516 |
),
|
517 |
Section(
|
518 |
+
Div(
|
519 |
H3("PubMed - need to update with abstract vs central"),
|
520 |
H4("Download and Extraction"),
|
521 |
Ol(
|
|
|
543 |
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
544 |
),
|
545 |
table_div_med,
|
546 |
+
),
|
547 |
),
|
548 |
Section(
|
549 |
+
Div(
|
550 |
H3("Phil Papers"),
|
551 |
H4("Download and Extraction"),
|
552 |
P("Original PDF files download from", A("https://philarchive.org/oai.pl", href="https://philarchive.org/oai.pl"), ". All available PDF's were downloaded. Each PDF was converted to text using java", D_code("-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}", language="java"), ". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library."),
|
|
|
559 |
Li("Local dedup was done with all papers combined."),
|
560 |
),
|
561 |
table_div_phil,
|
562 |
+
),
|
563 |
),
|
564 |
Section(
|
565 |
+
Div(
|
566 |
H3("Europarl"),
|
567 |
H4("Download and Extraction"),
|
568 |
P("Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
|
|
|
574 |
),
|
575 |
table_div_up,
|
576 |
),
|
577 |
+
),
|
578 |
Section(
|
579 |
+
Div(
|
580 |
H3("HackerNews"),
|
581 |
H4("Download and Extraction"),
|
582 |
P("The dataset was downloaded from the HackerNews repo here:", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time."),
|
|
|
592 |
Li("Local dedup was done within hackernews itself"),
|
593 |
),
|
594 |
table_div_hn,
|
595 |
+
),
|
596 |
),
|
597 |
Section(
|
598 |
+
Div(
|
599 |
H3("USPTO"),
|
600 |
H4("Download and Extraction"),
|
601 |
P("Data was downloaded and extracted using tags from", A("https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/", href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),". There were three different formats that needed three different functions to download and extract the data based on year: I(Pre_2002), 2002_to_2004, and post_2004."),
|
|
|
610 |
Li("Local dedup was done within USPTO itself"),
|
611 |
),
|
612 |
table_div_uspto,
|
613 |
+
),
|
614 |
),
|
615 |
Section(
|
616 |
+
Div(
|
617 |
H3("FreeLaw"),
|
618 |
H4("Download and Extraction"),
|
619 |
#P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), )#". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
|
|
|
638 |
Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
|
639 |
),
|
640 |
table_div_freelaw,
|
641 |
+
),
|
642 |
),
|
643 |
Section(
|
644 |
+
Div(
|
645 |
H3("StackExchange"),
|
646 |
H4("Download and Extraction"),
|
647 |
P("The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs including: ", A("math.stackexchange.com", href="math.stackexchange.com"),". Raw data was extracted an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
|
|
|
665 |
Li("Local dedup was done within stackexchange itself"),
|
666 |
),
|
667 |
table_div_se,
|
668 |
+
),
|
669 |
),
|
670 |
Section(
|
671 |
+
Div(
|
672 |
H3("Ubuntu IRC"),
|
673 |
H4("Download and Extraction"),
|
674 |
P("The dataset was downloaded from:", A("https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/", href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/"), " based on the year."),
|
|
|
694 |
Li("Local dedup was done within Ubuntu IRC itself"),
|
695 |
),
|
696 |
table_div_uirc,
|
697 |
+
),
|
698 |
),
|
699 |
Section(
|
700 |
+
Div(
|
701 |
+
H3("DM Math"),
|
702 |
H4("Download and Extraction"),
|
703 |
P("The dataset was downloaded rirectly downloaded from the Huggingface repo:", A("https://huggingface.co/datasets/deepmind/math_dataset",href="https://huggingface.co/datasets/deepmind/math_dataset"), ". The data was converted to the jsonl format where lines is represented as:"),
|
704 |
D_code("""
|
|
|
713 |
Li("None"),
|
714 |
),
|
715 |
table_div_dmm,
|
716 |
+
),
|
717 |
),
|
718 |
Section(
|
719 |
+
Div(
|
720 |
H3("PG19"),
|
721 |
H4("Download and Extraction"),
|
722 |
Ol(
|
|
|
733 |
Li("Local dedup was done within PG19 itself"),
|
734 |
),
|
735 |
table_div_pg19,
|
736 |
+
),
|
737 |
),
|
738 |
)
|
739 |
|