victormiller committed
Commit 6a9172e
1 Parent(s): 90b43c6
Update curated.py
curated.py +19 -16
curated.py
CHANGED
@@ -544,7 +544,7 @@ data_preprocessing_div = Div(
|
|
544 |
P(
|
545 |
"The ",
|
546 |
B("Unigram Log Probability Filter"),
|
547 |
-
" calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words. To calculate the average log word probability, we use word frequencies extracted from the",
|
548 |
A("1T Web-gram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"),
|
549 |
". Specifically, we use the list available created by ",
|
550 |
A(
|
@@ -909,7 +909,7 @@ filtering_process = Div(
|
|
909 |
),
|
910 |
P(
|
911 |
B("Download and Extraction: "),
|
912 |
-
"Original PDF files download from",
|
913 |
A(
|
914 |
"https://philarchive.org/oai.pl",
|
915 |
href="https://philarchive.org/oai.pl",
|
@@ -917,7 +917,7 @@ filtering_process = Div(
|
|
917 |
". All available PDF's were downloaded. Each PDF was converted to text using java",
|
918 |
D_code(
|
919 |
"-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}",
|
920 |
-
language="
|
921 |
),
|
922 |
". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library.",
|
923 |
),
|
@@ -1034,7 +1034,7 @@ filtering_process = Div(
|
|
1034 |
),
|
1035 |
P(
|
1036 |
B("Download and Extraction: "),
|
1037 |
-
"Original dataset was downloaded from",
|
1038 |
A(
|
1039 |
"http://www.statmt.org/europarl/v7/europarl.tgz",
|
1040 |
href="http://www.statmt.org/europarl/v7/europarl.tgz",
|
@@ -1098,11 +1098,11 @@ filtering_process = Div(
|
|
1098 |
Div(
|
1099 |
H3("HackerNews"),
|
1100 |
P(
|
1101 |
-
"
|
1102 |
),
|
1103 |
P(
|
1104 |
B("Download and Extraction: "),
|
1105 |
-
"The dataset was downloaded from the HackerNews repo here:",
|
1106 |
A(
|
1107 |
"https://hacker-news.firebaseio.com/v0/item/",
|
1108 |
href="https://hacker-news.firebaseio.com/v0/item/",
|
@@ -1110,7 +1110,7 @@ filtering_process = Div(
|
|
1110 |
". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time.",
|
1111 |
),
|
1112 |
P(
|
1113 |
-
"The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level).
|
1114 |
),
|
1115 |
P(B("Unique Data Preperation Challenges: ")),
|
1116 |
Ul(
|
@@ -1141,7 +1141,7 @@ filtering_process = Div(
|
|
1141 |
P("Patent documents from the United States Patent and Trademark Office."),
|
1142 |
P(
|
1143 |
B("Download and Extraction: "),
|
1144 |
-
"Data was downloaded and extracted using tags from",
|
1145 |
A(
|
1146 |
"https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/",
|
1147 |
href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/",
|
@@ -1171,7 +1171,7 @@ filtering_process = Div(
|
|
1171 |
),
|
1172 |
P(
|
1173 |
B("Download and Extraction"),
|
1174 |
-
"The dataset was downloaded from:",
|
1175 |
A(
|
1176 |
"https://storage.courtlistener.com/bulk-data/",
|
1177 |
href="https://storage.courtlistener.com/bulk-data/",
|
@@ -1185,7 +1185,7 @@ filtering_process = Div(
|
|
1185 |
("html_with_citations", html2text), ("xml_harvard", html2text),
|
1186 |
plain_text
|
1187 |
""",
|
1188 |
-
language="
|
1189 |
),
|
1190 |
P(
|
1191 |
"All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."
|
@@ -1247,7 +1247,8 @@ filtering_process = Div(
|
|
1247 |
A("math.stackexchange.com", href="math.stackexchange.com"),
|
1248 |
". Raw data was extracted an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments. We will include the full list of sub URLs in when the code is released.",
|
1249 |
),
|
1250 |
-
|
|
|
1251 |
1. Questions:
|
1252 |
2. Comment1:
|
1253 |
3. Comment2:
|
@@ -1256,8 +1257,10 @@ filtering_process = Div(
|
|
1256 |
6. Comment2:
|
1257 |
7. Answer2:
|
1258 |
8. Comment1:
|
1259 |
-
9. Comment2:
|
1260 |
-
|
|
|
|
|
1261 |
P(B("Unique Data Preperation Challenges: ")),
|
1262 |
Ul(
|
1263 |
Li(
|
@@ -1301,7 +1304,7 @@ filtering_process = Div(
|
|
1301 |
),
|
1302 |
P(
|
1303 |
B("Download and Extraction: "),
|
1304 |
-
"The dataset was downloaded from:",
|
1305 |
A(
|
1306 |
"https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/",
|
1307 |
href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/",
|
@@ -1349,7 +1352,7 @@ filtering_process = Div(
|
|
1349 |
),
|
1350 |
P(
|
1351 |
B("Download and Extraction: "),
|
1352 |
-
"The dataset was downloaded rirectly downloaded from the Huggingface repo:",
|
1353 |
A(
|
1354 |
"https://huggingface.co/datasets/deepmind/math_dataset",
|
1355 |
href="https://huggingface.co/datasets/deepmind/math_dataset",
|
@@ -1359,7 +1362,7 @@ filtering_process = Div(
|
|
1359 |
D_code(
|
1360 |
"""
|
1361 |
Question: TEXT
|
1362 |
-
|
1363 |
block="block",
|
1364 |
language="python",
|
1365 |
),
|
|
|
544 |
P(
|
545 |
"The ",
|
546 |
B("Unigram Log Probability Filter"),
|
547 |
+
" calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words. To calculate the average log word probability, we use word frequencies extracted from the ",
|
548 |
A("1T Web-gram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"),
|
549 |
". Specifically, we use the list available created by ",
|
550 |
A(
|
|
|
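A minimal sketch of how such an average log word probability might be computed. The toy frequency table, total token count, and tokenizer below are illustrative stand-ins for the 1T Web-gram resources, not the pipeline's actual code.

import math
import re

# Toy stand-in for the 1T Web-gram unigram counts; the real filter loads a much larger table.
UNIGRAM_COUNTS = {"the": 23135851162, "of": 13151942776, "and": 12997637966}
TOTAL_TOKENS = 1_024_908_267_229  # approximate total token count of the corpus

def average_unigram_log_prob(text: str) -> float:
    # Crude tokenization; the actual pipeline's tokenization may differ.
    words = re.findall(r"[a-z']+", text.lower())
    log_probs = [
        math.log(UNIGRAM_COUNTS[w] / TOTAL_TOKENS)
        for w in words
        if w in UNIGRAM_COUNTS  # unseen words skipped here; a real filter might floor them instead
    ]
    return sum(log_probs) / len(log_probs) if log_probs else float("-inf")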
909 |
),
|
910 |
P(
|
911 |
B("Download and Extraction: "),
|
912 |
+
"Original PDF files download from ",
|
913 |
A(
|
914 |
"https://philarchive.org/oai.pl",
|
915 |
href="https://philarchive.org/oai.pl",
|
|
|
917 |
". All available PDF's were downloaded. Each PDF was converted to text using java",
|
918 |
D_code(
|
919 |
"-jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}",
|
920 |
+
language="python",
|
921 |
),
|
922 |
". After converting to text formatting, a language was detected and added using the langdetect (citation needed) library.",
|
923 |
),
|
|
|
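A sketch of what the PDF-to-text and language-detection step above might look like in practice; paths, function names, and error handling are illustrative only.

import subprocess
from pathlib import Path

from langdetect import detect  # pip install langdetect

PDFBOX_JAR = "../philpapers_resources/src/pdfbox-app-2.0.21.jar"  # path taken from the command above

def convert_and_detect(pdf_path: str, txt_path: str) -> str:
    # Run PDFBox's ExtractText tool, mirroring the java -jar invocation shown above.
    subprocess.run(
        ["java", "-jar", PDFBOX_JAR, "ExtractText", pdf_path, txt_path],
        check=True,
    )
    text = Path(txt_path).read_text(encoding="utf-8", errors="ignore")
    return detect(text)  # ISO 639-1 code such as "en"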
1034 |
),
|
1035 |
P(
|
1036 |
B("Download and Extraction: "),
|
1037 |
+
"Original dataset was downloaded from ",
|
1038 |
A(
|
1039 |
"http://www.statmt.org/europarl/v7/europarl.tgz",
|
1040 |
href="http://www.statmt.org/europarl/v7/europarl.tgz",
|
|
|
1098 |
Div(
|
1099 |
H3("HackerNews"),
|
1100 |
P(
|
1101 |
+
"A dialog-based dataset where user comments on the links as the head post aggregated by Y Combinator."
|
1102 |
),
|
1103 |
P(
|
1104 |
B("Download and Extraction: "),
|
1105 |
+
"The dataset was downloaded from the HackerNews repo here: ",
|
1106 |
A(
|
1107 |
"https://hacker-news.firebaseio.com/v0/item/",
|
1108 |
href="https://hacker-news.firebaseio.com/v0/item/",
|
|
|
1110 |
". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time.",
|
1111 |
),
|
1112 |
P(
|
1113 |
+
"The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level)."
|
1114 |
),
|
1115 |
P(B("Unique Data Preperation Challenges: ")),
|
1116 |
Ul(
|
|
|
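A sketch of the per-ID fetch described above, assuming the standard Firebase endpoint format of appending "{id}.json"; retries and persistence are omitted, and the helper name is hypothetical.

import time
from typing import Optional

import requests

BASE_URL = "https://hacker-news.firebaseio.com/v0/item/"

def fetch_item(item_id: int) -> Optional[dict]:
    # Ping one Story ID; IDs that return an error are dropped, per the description above.
    resp = requests.get(f"{BASE_URL}{item_id}.json", timeout=10)
    time.sleep(2)  # the 2-second wait to account for network time
    if not resp.ok or resp.json() is None:
        return None
    return resp.json()

# Example: a story (1st level) and its direct replies (2nd level)
story = fetch_item(1)
replies = [fetch_item(kid) for kid in (story or {}).get("kids", [])]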
1141 |
P("Patent documents from the United States Patent and Trademark Office."),
|
1142 |
P(
|
1143 |
B("Download and Extraction: "),
|
1144 |
+
"Data was downloaded and extracted using tags from ",
|
1145 |
A(
|
1146 |
"https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/",
|
1147 |
href="https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/",
|
|
|
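A sketch of tag-based extraction from a single grant full-text XML document. The tag names below are assumptions about the Red Book schema, and splitting the concatenated bulk files into individual documents is omitted.

from xml.etree import ElementTree as ET

def extract_patent_text(xml_doc: str) -> str:
    # Collect the text inside a few content-bearing tags of one grant document.
    # The tag list here is illustrative; the pipeline's actual tag set may differ.
    root = ET.fromstring(xml_doc)
    parts = []
    for tag in ("invention-title", "abstract", "description", "claims"):
        for node in root.iter(tag):
            parts.append("".join(node.itertext()).strip())
    return "\n\n".join(p for p in parts if p)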
1171 |
),
|
1172 |
P(
|
1173 |
B("Download and Extraction"),
|
1174 |
+
"The dataset was downloaded from: ",
|
1175 |
A(
|
1176 |
"https://storage.courtlistener.com/bulk-data/",
|
1177 |
href="https://storage.courtlistener.com/bulk-data/",
|
|
|
1185 |
("html_with_citations", html2text), ("xml_harvard", html2text),
|
1186 |
plain_text
|
1187 |
""",
|
1188 |
+
language="python",
|
1189 |
),
|
1190 |
P(
|
1191 |
"All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."
|
|
|
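A sketch of the priority-ordered column selection described above; only the columns visible in this hunk are listed, the html2text configuration is a guess, and the function name is hypothetical.

import html2text  # pip install html2text

_h = html2text.HTML2Text()

# plain_text first, then the remaining columns in reverse order, as described above.
PRIORITY = [
    ("plain_text", None),
    ("xml_harvard", _h.handle),
    ("html_with_citations", _h.handle),
]

def extract_opinion_text(row: dict) -> str:
    # Return the first non-empty field, converting HTML/XML fields to plain text.
    for column, converter in PRIORITY:
        value = row.get(column)
        if value:
            return converter(value) if converter else value
    return ""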
1247 |
A("math.stackexchange.com", href="math.stackexchange.com"),
|
1248 |
". Raw data was extracted an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments. We will include the full list of sub URLs in when the code is released.",
|
1249 |
),
|
1250 |
+
D_code(
|
1251 |
+
"""
|
1252 |
1. Questions:
|
1253 |
2. Comment1:
|
1254 |
3. Comment2:
|
|
|
1257 |
6. Comment2:
|
1258 |
7. Answer2:
|
1259 |
8. Comment1:
|
1260 |
+
9. Comment2:""",
|
1261 |
+
block="block",
|
1262 |
+
language="python",
|
1263 |
+
),
|
1264 |
P(B("Unique Data Preperation Challenges: ")),
|
1265 |
Ul(
|
1266 |
Li(
|
|
|
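A sketch of the post_id-based linking described above, using attribute names from the standard Stack Exchange dump schema (Id, PostTypeId, ParentId, PostId); the raw export's field names may differ.

from collections import defaultdict
from xml.etree import ElementTree as ET

def build_threads(posts_xml: str, comments_xml: str) -> dict:
    questions = {}
    answers = defaultdict(list)
    for row in ET.parse(posts_xml).getroot():
        pid = row.get("Id")
        if row.get("PostTypeId") == "1":      # question
            questions[pid] = {"body": row.get("Body"), "answers": [], "comments": []}
        elif row.get("PostTypeId") == "2":    # answer, linked to its question via ParentId
            answers[row.get("ParentId")].append({"id": pid, "body": row.get("Body"), "comments": []})

    comments = defaultdict(list)
    for row in ET.parse(comments_xml).getroot():
        comments[row.get("PostId")].append(row.get("Text"))

    # Attach comments to their question or answer, then answers to their question.
    for qid, q in questions.items():
        q["comments"] = comments.get(qid, [])
        for ans in answers.get(qid, []):
            ans["comments"] = comments.get(ans["id"], [])
            q["answers"].append(ans)
    return questions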
1304 |
),
|
1305 |
P(
|
1306 |
B("Download and Extraction: "),
|
1307 |
+
"The dataset was downloaded from: ",
|
1308 |
A(
|
1309 |
"https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/",
|
1310 |
href="https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/",
|
|
|
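A sketch of how the dated log URLs above might be built and fetched; the per-channel file name (e.g. "#ubuntu.txt") is an assumption about the log layout.

from datetime import date, timedelta
from urllib.parse import quote

import requests

def irc_log_url(day: date, channel: str = "#ubuntu") -> str:
    # Mirrors the {date.year}/{date.month:02d}/{date.day:02d} pattern shown above.
    return (
        f"https://irclogs.ubuntu.com/{day.year}/{day.month:02d}/{day.day:02d}/"
        f"{quote(channel)}.txt"
    )

# Example: fetch yesterday's log, treating 404s as missing days.
resp = requests.get(irc_log_url(date.today() - timedelta(days=1)), timeout=10)
log_text = resp.text if resp.ok else ""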
1352 |
),
|
1353 |
P(
|
1354 |
B("Download and Extraction: "),
|
1355 |
+
"The dataset was downloaded rirectly downloaded from the Huggingface repo: ",
|
1356 |
A(
|
1357 |
"https://huggingface.co/datasets/deepmind/math_dataset",
|
1358 |
href="https://huggingface.co/datasets/deepmind/math_dataset",
|
|
|
1362 |
D_code(
|
1363 |
"""
|
1364 |
Question: TEXT
|
1365 |
+
Answer: TEXT""",
|
1366 |
block="block",
|
1367 |
language="python",
|
1368 |
),
|
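A sketch of reproducing the Question/Answer formatting shown above with the Hugging Face datasets library; the config name and column names are assumptions about the Hub repo, not taken from the pipeline code.

from datasets import load_dataset  # pip install datasets

# Assumed config and columns; the deepmind/math_dataset repo hosts many module configs.
ds = load_dataset("deepmind/math_dataset", "algebra__linear_1d", split="train", trust_remote_code=True)

def to_text(example: dict) -> dict:
    return {"text": f"Question: {example['question']}\nAnswer: {example['answer']}"}

formatted = ds.map(to_text)
print(formatted[0]["text"])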