victormiller committed
Commit 0e12ce8 · 1 Parent(s): 6a336ca
Update curated.py

curated.py CHANGED (+5 -5)
@@ -544,7 +544,7 @@ data_preprocessing_div = Div(
     P(
         "The ",
         B("Unigram Log Probability Filter"),
-        " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but
+        " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but may not capture the semantic meaning of words. To calculate the average log word probability, we use word frequencies extracted from the ",
         A("1T Web-gram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"),
         ". Specifically, we use the list available created by ",
         A(
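The paragraph added in this hunk describes scoring documents by their average unigram log probability, using word frequencies from the linked 1T Web-gram corpus. A minimal sketch of that calculation, assuming a tab-separated word/count frequency file and a placeholder threshold (neither the file format nor the threshold appears in this commit):

import math

def load_unigram_logprobs(freq_file):
    # Build log10 probabilities from a "word<TAB>count" frequency list,
    # e.g. one derived from the 1T Web-gram corpus linked above.
    counts = {}
    with open(freq_file, encoding="utf-8") as f:
        for line in f:
            word, count = line.rstrip("\n").split("\t")
            counts[word] = int(count)
    total = sum(counts.values())
    return {word: math.log10(c / total) for word, c in counts.items()}

def avg_unigram_logprob(text, logprobs):
    # Average log probability over the words found in the list;
    # documents dominated by rare or garbage tokens score low.
    words = [w for w in text.lower().split() if w in logprobs]
    if not words:
        return float("-inf")
    return sum(logprobs[w] for w in words) / len(words)

# A document passes the filter when its average clears a chosen threshold:
# keep = avg_unigram_logprob(doc_text, logprobs) > THRESHOLD  # THRESHOLD is hypothetical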
@@ -555,7 +555,7 @@ data_preprocessing_div = Div(
     ),
     H3("Data Processing for S2ORC"),
     P(
-        "The
+        "The formatting of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."
     ),
     P(
         "The ",
@@ -637,7 +637,7 @@ filtering_process = Div(
     ),
     plotly2fasthtml(diff2_stacked_bar),
     H3(
-        "This section continues
+        "This section continues below with the specific filtering steps taken for all 14 curated datasets."
     ),
 ),
 Section(
@@ -1188,7 +1188,7 @@ filtering_process = Div(
         language="python",
     ),
     P(
-        "All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile,
+        "All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."
     ),
     P(B("Unique Data Preparation Challenges: ")),
     Ul(
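The sentence added here encodes a column-priority rule borrowed from The Pile: prefer plain_text, then fall back through the table's columns in reverse order. A small sketch of that selection, with hypothetical column names standing in for the table referenced in the text:

# Column names other than plain_text are placeholders; the actual
# column table lives elsewhere in curated.py.
TABLE_COLUMNS = ["plain_text", "column_a", "column_b", "column_c"]
PRIORITY = [TABLE_COLUMNS[0]] + list(reversed(TABLE_COLUMNS[1:]))

def select_text(record):
    # Return the first non-empty field in priority order, else None.
    for column in PRIORITY:
        value = record.get(column)
        if value:
            return value
    return None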
@@ -1352,7 +1352,7 @@ filtering_process = Div(
     ),
     P(
         B("Download and Extraction: "),
-        "The dataset was downloaded
+        "The dataset was downloaded directly from the Huggingface repo: ",
         A(
             "https://huggingface.co/datasets/deepmind/math_dataset",
             href="https://huggingface.co/datasets/deepmind/math_dataset",
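The new text links the Huggingface repo directly. One way to pull it with the `datasets` library, as a sketch: the config name is one of the dataset's per-module subsets and is used here only as an example, since the commit does not show the loading code.

from datasets import load_dataset

# "algebra__linear_1d" is an example config, not necessarily the one used.
ds = load_dataset("deepmind/math_dataset", "algebra__linear_1d", split="train")
print(ds[0])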