Spaces:
Running
Running
victormiller
committed on
Commit
•
103b5cf
1
Parent(s):
c642284
Update curated.py
Browse files- curated.py +3 -1
curated.py
CHANGED
@@ -443,8 +443,10 @@ filtering_process = Div(
|
|
443 |
P("This section contains the specific steps taken to filter all 14 curated source datasets.")
|
444 |
),
|
445 |
Section(
|
|
|
446 |
H3("Wikipedia"),
|
447 |
H4("Download and Extraction"),
|
|
|
448 |
P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parquet format to the jsonl format."),
|
449 |
H4("Filtering"),
|
450 |
P("Manual inspection of the dataset demonstrated high quality content. Only one filter was used to remove articles with few words. Based on normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
|
@@ -453,7 +455,7 @@ filtering_process = Div(
|
|
453 |
Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
|
454 |
),
|
455 |
table_div_wikipedia,
|
456 |
-
|
457 |
),
|
458 |
Section(
|
459 |
H3("ArXiv"),
|
|
|
443 |
P("This section contains the specific steps taken to filter all 14 curated source datasets.")
|
444 |
),
|
445 |
Section(
|
446 |
+
Div(
|
447 |
H3("Wikipedia"),
|
448 |
H4("Download and Extraction"),
|
449 |
+
|
450 |
P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parquet format to the jsonl format."),
|
451 |
H4("Filtering"),
|
452 |
P("Manual inspection of the dataset demonstrated high quality content. Only one filter was used to remove articles with few words. Based on normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
|
|
|
455 |
Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
|
456 |
),
|
457 |
table_div_wikipedia,
|
458 |
+
),
|
459 |
),
|
460 |
Section(
|
461 |
H3("ArXiv"),
|