victormiller
committed on
Commit
•
b4e3ff3
1
Parent(s):
583d7c5
Update curated.py
Browse files
- curated.py +3 -3
curated.py
CHANGED
@@ -41,7 +41,7 @@ filtering_process = Div(
|
|
41 |
Li("All markdowns were combined to create jsonl files"),
|
42 |
),
|
43 |
H4("Filtering"),
|
44 |
-
P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset")
|
45 |
Ol(
|
46 |
Li("min_word: less than 500 words (not inclusive) are discarded"),
|
47 |
Li("Language: any language other than English are discarded"),
|
@@ -76,7 +76,7 @@ filtering_process = Div(
|
|
76 |
Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
77 |
),
|
78 |
H4("Filtering - S2ORC Abstract"),
|
79 |
-
P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually")
|
80 |
Ol(
|
81 |
Li("title_abstract: must have title and abstract"),
|
82 |
Li("language: abstract must be in English"),
|
@@ -105,7 +105,7 @@ filtering_process = Div(
|
|
105 |
Li("For pubmed abstract, the XML files are in very simple format and beautiful soup is directly used to extract the abstract, title and pmid and stored in jsonl format"),
|
106 |
),
|
107 |
H4("Filtering"),
|
108 |
-
P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset.")
|
109 |
Ol(
|
110 |
Li("min_word: less than 100 words (not inclusive) are discarded, less than 20 words for pubmed abstract"),
|
111 |
Li("Language: any language other than English are discarded"),
|
|
|
41 |
Li("All markdowns were combined to create jsonl files"),
|
42 |
),
|
43 |
H4("Filtering"),
|
44 |
+
P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
|
45 |
Ol(
|
46 |
Li("min_word: less than 500 words (not inclusive) are discarded"),
|
47 |
Li("Language: any language other than English are discarded"),
|
|
|
76 |
Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
77 |
),
|
78 |
H4("Filtering - S2ORC Abstract"),
|
79 |
+
P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
|
80 |
Ol(
|
81 |
Li("title_abstract: must have title and abstract"),
|
82 |
Li("language: abstract must be in English"),
|
|
|
105 |
Li("For pubmed abstract, the XML files are in very simple format and beautiful soup is directly used to extract the abstract, title and pmid and stored in jsonl format"),
|
106 |
),
|
107 |
H4("Filtering"),
|
108 |
+
P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset."),
|
109 |
Ol(
|
110 |
Li("min_word: less than 100 words (not inclusive) are discarded, less than 20 words for pubmed abstract"),
|
111 |
Li("Language: any language other than English are discarded"),
|