victormiller committed · Commit 5f4285e · Parent(s): e3ed423

Update curated.py

curated.py CHANGED (+39 -53)
@@ -462,7 +462,7 @@ data_preprocessing_div = Div(
     P("The ", B("Unigram Log Probability Filter"), " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but may not capture the semantic meaning of words."),
     H3("Data Processing for S2ORC"),
     P("The formatting of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
-    P("The ", B("Title Abstract Filter")," extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
+    P("The ", B("Title and Abstract Filter"), " extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
     P("The ", B("Majority Language Filter"), " identifies the majority language in the dataset. This step displays the distribution of languages in the dataset to enable language-specific analysis and insights."),
     P("The ", B("Paragraph Count Filter"), " counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents, which can be a useful heuristic for document complexity."),
     P("The ", B("Frequency Filter"), " calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
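Several of these descriptions reduce to simple document statistics. As a point of reference, here is a minimal sketch of the paragraph-count and frequency heuristics; it is illustrative only, and the blank-line paragraph segmentation is an assumption, not taken from curated.py.

```python
import re
from collections import Counter

def document_stats(text: str) -> dict:
    """Illustrative sketch of the Paragraph Count and Frequency heuristics."""
    # Paragraph Count Filter: treat blank-line-separated blocks as paragraphs
    # (an assumption; curated.py may segment differently).
    paragraphs = [p for p in re.split(r"\n\s*\n", text) if p.strip()]
    # Frequency Filter: words are obtained by splitting the text on whitespace.
    words = text.split()
    top_word, top_count = Counter(words).most_common(1)[0] if words else ("", 0)
    return {
        "num_paragraphs": len(paragraphs),
        "num_words": len(words),
        "top_word": top_word,
        "top_word_share": top_count / len(words) if words else 0.0,
    }
```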
@@ -559,7 +559,7 @@ filtering_process = Div(
     Li("Language Filter: any language other than English is discarded"),
     Li("Minimum Word Count Filter: documents with fewer than 500 words (not inclusive) are discarded"),
     Li("Unigram Log Probability Filter: Documents were kept if their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the ", A("1T Web-gram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"), ". Specifically, we use the list created by ", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"), "."),
-    Li("Note:
+    Li("Note: the Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
     ),
     table_div_arx,
     Details(
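The Unigram Log Probability Filter above is concrete enough to sketch. The following assumes the linked frequency list is a CSV of word,count rows, natural-log probabilities, and that out-of-vocabulary words score at the threshold; none of these details are specified in the diff.

```python
import csv
import math

def load_unigram_log_probs(path: str = "unigram_freq.csv") -> dict:
    # Assumed format of the linked frequency list: header "word,count".
    with open(path, newline="", encoding="utf-8") as f:
        counts = {row["word"]: int(row["count"]) for row in csv.DictReader(f)}
    total = sum(counts.values())
    # Natural log assumed; the diff does not state the log base.
    return {word: math.log(count / total) for word, count in counts.items()}

def keep_by_log_prob(text: str, log_probs: dict, threshold: float = -20.0) -> bool:
    """Keep a document if its average unigram log probability exceeds the threshold."""
    words = [w.lower() for w in text.split()]
    if not words:
        return False
    # Out-of-vocabulary words score at the threshold itself (an assumption).
    avg = sum(log_probs.get(w, threshold) for w in words) / len(words)
    return avg > threshold
```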
@@ -579,33 +579,24 @@ filtering_process = Div(
     ),
     Section(
     Div(
-    H3("S2ORC
+    H3("S2ORC"),
     P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, abstracts, and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components: S2ORC abstract and S2ORC full text."),
     H4("Download and Extraction"),
     Ol(
-    Li("This was downloaded directly in zip format using S2ORC api key and
-    Li("There were two kind of datasets that was downloaded S2ORC and S2ORC abstract"),
+    Li("This was downloaded directly in zip format using the S2ORC API key and a get() request: ", D_code("response = urllib.request.urlopen(url)", language="python")),
     ),
-    H4("Filtering
+    H4("Filtering"),
     P("1. Multiple filters are used here after manually verifying the output of all the filters, as suggested by the peS2o dataset"),
     Ol(
-    Li("
-    Li("The paper must be in English. To determine the language of each document, we use the pycld3 library. We run pycld3 on the first 2000 characters of each paragraph in the paper. The language of the paper is the most common language of the paragraphs."),
-    Li("
-    Li("
-    Li("
-    ),
-    H4("Local Deduplication Process"),
-    Ol(
-    Li("Local dedup was done with all papers combined."),
-    ),
-    H4("Global Deduplication Process"),
-    Ol(
-    Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
+    Li("Title and Abstract Filter: must have title and abstract"),
+    Li("Language Filter: The paper must be in English. To determine the language of each document, we use the pycld3 library. We run pycld3 on the first 2000 characters of each paragraph in the paper. The language of the paper is the most common language of the paragraphs."),
+    Li("Word Count Filter: documents with fewer than 500 words (not inclusive) are discarded"),
+    Li("Paragraph Count Filter: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log word probability"),
+    Li("Frequency Filter: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
     ),
     table_div_s2o,
     Details(
-    Summary("
+    Summary("S2ORC Filtering Examples -- need to update"),
     Div(
     P("examples are missing"),
     style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
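The Language Filter in this hunk spells out its procedure: run pycld3 on the first 2000 characters of each paragraph and take the most common prediction as the language of the paper. A minimal sketch, assuming paragraphs are already split:

```python
from collections import Counter

import cld3  # the pycld3 package exposes the cld3 module

def paper_language(paragraphs: list) -> str:
    """Most common per-paragraph language, as described in the hunk above."""
    votes = Counter()
    for para in paragraphs:
        pred = cld3.get_language(para[:2000])  # first 2000 characters only
        if pred is not None:
            votes[pred.language] += 1
    return votes.most_common(1)[0][0] if votes else "unknown"

# A paper passes the filter only if paper_language(paragraphs) == "en".
```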
@@ -621,21 +612,33 @@ filtering_process = Div(
     ),
     Section(
     Div(
-    H3("S2ORC
+    H3("S2ORC Abstract"),
     P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, abstracts, and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components: S2ORC abstract and S2ORC full text."),
     H4("Download and Extraction"),
     Ol(
-    Li("This was downloaded directly in zip format using S2ORC api key and
-    Li("There were two kind of datasets that was downloaded S2ORC and S2ORC abstract"),
+    Li("This was downloaded directly in zip format using the S2ORC API key and a get() request: ", D_code("response = urllib.request.urlopen(url)", language="python")),
     ),
-    H4("Filtering
-    P("
+    H4("Filtering"),
+    P("Multiple filters are used here after manually verifying the output of all the filters, as suggested by the peS2o dataset. The frequency filter was not used, as suggested by peS2o, because manual inspection showed it was removing good samples"),
     Ol(
-    Li("
-    Li("
-    Li("
-    Li("Unigram
-    Li("
+    Li("Title and Abstract Filter: must have title and abstract"),
+    Li("Majority Language Filter: abstract must be in English"),
+    Li("Minimum Word Count Filter: abstracts with fewer than 20 words (not inclusive) are discarded"),
+    Li("Unigram Log Probability Threshold: -20"),
+    Li("Note: Frequency Filter: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
+    ),
+    Details(
+    Summary("S2ORC Abstract Filtering Examples"),
+    Div(
+    P("examples are missing"),
+    style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+    ),
+    style="""
+        background-color: #FFFAEA;  /* Light yellow background */
+        padding: 15px;
+        border-radius: 12px;
+        margin-bottom: 15px
+    """,
     ),
     )
     ),
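The Note item restates the Frequency Filter check used across these sources. A minimal sketch of that check, treating "7.5% of the document" as a share of whitespace-split words (an interpretation, not a quote of curated.py):

```python
from collections import Counter

def passes_frequency_filter(text: str, max_share: float = 0.075) -> bool:
    # Words are obtained by splitting the text on whitespace.
    words = text.split()
    if not words:
        return False
    top_word, top_count = Counter(words).most_common(1)[0]
    # The most frequent word must consist of alpha characters only and
    # appear in less than 7.5% of the document's words.
    return top_word.isalpha() and top_count / len(words) < max_share
```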
@@ -643,32 +646,15 @@ filtering_process = Div(

     Section(
     Div(
-    H3("PubMed
-
-    H4("Download and Extraction"),
-    Ol(
-    Li("First all the urls of PMC and PMA files are parsed and stored as text file from FTP server https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),
-    Li("All the urls are downloaded and the downloaded data is in xml.tar format"),
-    Li("For pubmed central First tar files are opened using tarfile library and then converted to markdown format using pandoc: pandoc -f jats {nxml} -o {pmcid}.md --wrap=none"),
-    Li("All the markdown files are combined to create jsonl files. In jsonl files, 1 line correspond to 1 markdown file."),
-    Li("For pubmed abstract, the XML files are in very simple format and beautiful soup is directly used to extract the abstract, title and pmid and stored in jsonl format"),
-    ),
+    H3("PubMed Central and PubMed Abstract"),
+    P(B("Download and Extraction: "), "All files were downloaded from ", A("https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/", href="https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"), ". PubMed Central (PMC) files are downloaded in an xml.tar format. The tar files are opened and converted to markdown format using pandoc: ", D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"), ". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format."),
     H4("Filtering"),
     P("1. Multiple filters are used here after manually verifying the output of all the filters, as suggested by the peS2o dataset."),
     Ol(
-    Li("
-    Li("Language: any language other than English are discarded"),
-    Li("Frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace. This filter is not used for pubmed abstract"),
-    Li("Unigram
-    Li("need to add the hyperlinks for the section above"),
-    ),
-    H4("Local Deduplication Process"),
-    Ol(
-    Li("Local dedup was done with all papers combined."),
-    ),
-    H4("Global Deduplication Process"),
-    Ol(
-    Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
+    Li("Minimum Word Count Filter: documents with fewer than 100 words (not inclusive) are discarded; fewer than 20 words for PubMed Abstract"),
+    Li("Language Filter: any language other than English is discarded"),
+    Li("Frequency Filter: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace. This filter is not used for PubMed Abstract"),
+    Li("Unigram Log Probability Threshold: -20"),
     ),
     table_div_med,
     Details(
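The removed extraction steps above describe the PMC pipeline that the new combined paragraph summarizes: open each xml.tar archive with the tarfile library, then convert the JATS files to markdown with pandoc. A minimal sketch, with illustrative paths and naming:

```python
import subprocess
import tarfile
from pathlib import Path

def convert_pmc_archive(tar_path: str, out_dir: str = "markdown") -> None:
    """Open one PMC xml.tar archive and convert each JATS file to markdown."""
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    with tarfile.open(tar_path) as tar:
        tar.extractall("extracted")  # assumes a trusted archive
    for nxml in Path("extracted").rglob("*.nxml"):
        pmcid = nxml.stem  # illustrative: assumes files are named <pmcid>.nxml
        # Same invocation as in the diff: pandoc -f jats {nxml} -o {pmcid}.md --wrap=none
        subprocess.run(
            ["pandoc", "-f", "jats", str(nxml),
             "-o", str(Path(out_dir) / f"{pmcid}.md"), "--wrap=none"],
            check=True,
        )
```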
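For the PubMed Abstract side, the paragraph above says BeautifulSoup extracts the abstract, title, and PMID, stored one record per jsonl line. A minimal sketch, assuming standard PubMed XML tag names (PMID, ArticleTitle, AbstractText) and the lxml-backed xml parser; these are assumptions, not quotes of curated.py:

```python
import json

from bs4 import BeautifulSoup  # the "xml" parser requires lxml to be installed

def pubmed_record_to_jsonl_line(xml_text: str) -> str:
    """Extract PMID, title, and abstract from one PubMed XML record."""
    soup = BeautifulSoup(xml_text, "xml")

    def text_of(tag: str) -> str:
        node = soup.find(tag)
        return node.get_text(" ", strip=True) if node else ""

    record = {
        "pmid": text_of("PMID"),
        "title": text_of("ArticleTitle"),
        "abstract": text_of("AbstractText"),
    }
    return json.dumps(record)  # one JSON line per record
```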