victormiller committed
Commit e3ed423 · Parent: 913dc7b
Update curated.py

curated.py CHANGED (+27 -14)
@@ -595,15 +595,6 @@ filtering_process = Div(
             Li("paragraph_count: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log word probability"),
             Li("frequency: The most frequent word in the paper must consist of alphabetic characters only and must appear in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
         ),
-        H4("Filtering - S2ORC Abstract"),
-        P("Multiple filters are used here, after manually verifying the output of all the filters suggested by the peS2o dataset. The frequency filter suggested by peS2o was not used, because manual inspection showed that it was removing good samples."),
-        Ol(
-            Li("title_abstract: must have a title and an abstract"),
-            Li("language: the abstract must be in English"),
-            Li("word_count: abstracts with fewer than 20 words are discarded"),
-            Li("Unigram log probability"),
-            Li("frequency: The most frequent word in the paper must consist of alphabetic characters only and must appear in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
-        ),
         H4("Local Deduplication Process"),
         Ol(
             Li("Local dedup was done with all papers combined."),
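For reference, the "frequency" rule repeated in the hunk above translates into a short check; this is a minimal sketch, and the function name and return convention are illustrative, not code from this commit:

# Minimal sketch of the frequency filter described above; illustrative only.
from collections import Counter

def passes_frequency_filter(text: str, max_ratio: float = 0.075) -> bool:
    """Keep a document only if its most frequent whitespace-split word is
    alphabetic and accounts for less than 7.5% of all words."""
    words = text.split()
    if not words:
        return False
    word, count = Counter(words).most_common(1)[0]
    return word.isalpha() and count / len(words) < max_ratio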
@@ -616,7 +607,7 @@ filtering_process = Div(
         Details(
             Summary("FreeLaw Filtering Examples -- need to update"),
             Div(
-
+                P("examples are missing"),
                 style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
@@ -628,6 +619,28 @@ filtering_process = Div(
             ),
         ),
     ),
+    Section(
+        Div(
+            H3("S2ORC Abstract"),
+            P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata as well as abstract and full-text content for millions of academic papers across various disciplines. The dataset is divided into two components: S2ORC abstract and S2ORC full text."),
+            H4("Download and Extraction"),
+            Ol(
+                Li("The data was downloaded directly in zip format using the S2ORC API key and a plain GET request: response = urllib.request.urlopen(url)"),
+                Li("Two datasets were downloaded: S2ORC and S2ORC abstract."),
+            ),
+            H4("Filtering - S2ORC Abstract"),
+            P("Multiple filters are used here, after manually verifying the output of all the filters suggested by the peS2o dataset. The frequency filter suggested by peS2o was not used, because manual inspection showed that it was removing good samples."),
+            Ol(
+                Li("title_abstract: must have a title and an abstract"),
+                Li("language: the abstract must be in English"),
+                Li("word_count: abstracts with fewer than 20 words are discarded"),
+                Li("Unigram log probability"),
+                Li("frequency: The most frequent word in the paper must consist of alphabetic characters only and must appear in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
+            ),
+        )
+    ),
+
+
     Section(
         Div(
             H3("PubMed - need to update with abstract vs central"),
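The S2ORC download step added above could look roughly like the following, assuming the Semantic Scholar datasets API; the endpoint path, header name, and shard naming are assumptions, not code from this commit:

# Hypothetical sketch: bulk-download the S2ORC abstracts release with an API
# key and plain GET requests. Endpoint and header names are assumptions.
import json
import urllib.request

API_KEY = "..."  # S2ORC / Semantic Scholar API key
URL = "https://api.semanticscholar.org/datasets/v1/release/latest/dataset/abstracts"

request = urllib.request.Request(URL, headers={"x-api-key": API_KEY})
with urllib.request.urlopen(request) as response:
    listing = json.load(response)

# The listing contains pre-signed URLs to the zipped shards of the dataset.
for i, shard_url in enumerate(listing.get("files", [])):
    with urllib.request.urlopen(shard_url) as response, \
            open(f"s2orc_abstracts_{i:03d}.gz", "wb") as out:
        out.write(response.read())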
@@ -797,7 +810,7 @@ filtering_process = Div(
             H3("FreeLaw"),
             P("Legal documents and court cases from various jurisdictions, provided by the US-registered non-profit Free Law Project. We have included data from CourtListener, which includes millions of legal opinions from federal and state courts."),
             H4("Download and Extraction"),
-
+            P("The dataset was downloaded from: ", A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), ". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns, requiring a holistic extraction approach. Text was extracted using the html2text function; the block below shows how each text type was extracted."),
             D_code("""
                 ("html", html2text),
                 ("html_lawbox", html2text),
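A sketch of the per-column extraction described in the added paragraph, using the html2text package; only "html" and "html_lawbox" are named in the commit, so the remaining column names, the fallback, and the file name are assumptions:

# Hypothetical sketch of extracting opinion text from the CourtListener CSVs.
import csv

import html2text  # pip install html2text

csv.field_size_limit(100_000_000)  # opinion rows can be very large
converter = html2text.HTML2Text()

# Ordered (column, extractor) pairs: the first non-empty column wins.
EXTRACTORS = [
    ("html", converter.handle),
    ("html_lawbox", converter.handle),
    ("plain_text", lambda s: s),  # assumed plain-text fallback column
]

def extract_text(row: dict) -> str:
    for column, extract in EXTRACTORS:
        if row.get(column):
            return extract(row[column])
    return ""

with open("opinions.csv", newline="", encoding="utf-8") as f:
    texts = [extract_text(row) for row in csv.DictReader(f)]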
@@ -839,7 +852,7 @@ filtering_process = Div(
         Div(
             H3("StackExchange"),
             P("A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more. This is one of the largest publicly available repositories of question-answer pairs. We have also included comments, to capture the overall discussion on each post."),
-            P(B("Download and Extraction: "), "The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs, including: ", A("math.stackexchange.com", href="math.stackexchange.com"), ". Raw data was extracted in XML format, and only two files, Posts.xml and Comments.xml, were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
+            P(B("Download and Extraction: "), "The archive dataset was used to download all data from StackExchange and its 364 sub URLs, including: ", A("math.stackexchange.com", href="math.stackexchange.com"), ". Raw data was extracted in XML format, and only two files, Posts.xml and Comments.xml, were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments. We will include the full list of sub URLs when the code is released."),
             P("""
             1. Questions:
             2. Comment1:
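The post_id linking described above can be sketched as follows; attribute names follow the public StackExchange data-dump schema, but the assembly logic is illustrative rather than this commit's code:

# Hypothetical sketch: rebuild the question -> answers -> comments hierarchy
# from Posts.xml and Comments.xml.
from collections import defaultdict
import xml.etree.ElementTree as ET

posts = {}                      # post_id -> {"body", "comments", "answers"}
answer_ids = defaultdict(list)  # question post_id -> [answer post_id, ...]

for _, row in ET.iterparse("Posts.xml"):
    if row.tag == "row":
        pid = row.get("Id")
        posts[pid] = {"body": row.get("Body", ""), "comments": [], "answers": []}
        if row.get("PostTypeId") == "2":  # answer: link to its question
            answer_ids[row.get("ParentId")].append(pid)
        row.clear()

for _, row in ET.iterparse("Comments.xml"):
    if row.tag == "row":
        post = posts.get(row.get("PostId"))
        if post is not None:  # comment: link to its question or answer
            post["comments"].append(row.get("Text", ""))
        row.clear()

for qid, aids in answer_ids.items():  # attach answers to their questions
    if qid in posts:
        posts[qid]["answers"] = [posts[a] for a in aids]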
@@ -937,9 +950,9 @@ filtering_process = Div(
             P(B("Download and Extraction: "), "The dataset was downloaded directly from Huggingface: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
             H4("Filtering"),
             Ol(
-                Li("Language Filter:
+                Li("Language Filter: English", style="margin-bottom: 2px"),
                 Li("Minimum Word Count Filter: 20", style="margin-bottom: 2px"),
-                Li("Unigram Log Probability", style="margin-bottom: 2px"),
+                Li("Unigram Log Probability: ", "-20", style="margin-bottom: 2px"),
             ),
             table_div_pg19,
             Details(
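The three PG19 filters fixed in this hunk could look roughly like the sketch below; the langdetect dependency and the UNIGRAM_PROBS table are stand-ins, and only the thresholds (20 words, -20 log probability, English) come from the added lines:

# Hypothetical sketch of the PG19 pipeline: download from Huggingface, then
# apply the three filters listed above.
import math

from datasets import load_dataset  # pip install datasets
from langdetect import detect      # pip install langdetect

UNIGRAM_PROBS: dict = {}  # word -> probability, from some reference corpus (assumed)

def avg_unigram_log_prob(words, floor=1e-11):
    return sum(math.log(UNIGRAM_PROBS.get(w.lower(), floor)) for w in words) / len(words)

def keep(example) -> bool:
    words = example["text"].split()
    if len(words) < 20:                         # Minimum Word Count Filter: 20
        return False
    if detect(example["text"][:2000]) != "en":  # Language Filter: English
        return False
    return avg_unigram_log_prob(words) > -20    # Unigram Log Probability: -20

pg19 = load_dataset("deepmind/pg19", split="train").filter(keep)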