victormiller
committed on
Update web.py
Browse files
web.py
CHANGED
@@ -252,7 +252,9 @@ def web_data():
|
|
252 |
Li("Document Preparation", style = "margin-bottom: 5px"),
|
253 |
Li("Line-Level Filtering", style = "margin-bottom: 5px"),
|
254 |
Li("Local Deduplication", style = "margin-bottom: 5px"),
|
255 |
-
Li("Each section is complete with code and comparisons to Dolma,
|
|
|
|
|
256 |
),
|
257 |
P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of datasets we reviewed and the comparison of filters we have applied."),
|
258 |
),
|
@@ -370,7 +372,7 @@ def web_data():
|
|
370 |
"""),
|
371 |
|
372 |
P(B("URL Blocklist: "), """
|
373 |
-
Following RefinedWeb
|
374 |
articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
|
375 |
4.6M domain names in the UT1 blocklist. Of note, 24 URLs were detected with more than 4k matches and are shown below.
|
376 |
"""),
|
@@ -466,7 +468,7 @@ def web_data():
|
|
466 |
This ensured that computing quality signals would align with the final kept texts.
|
467 |
"""),
|
468 |
P(B("Terminal Punctuation: "), """
|
469 |
-
The terminal punctuation has been used in C4
|
470 |
punctuation mark (i.e., “.”, “?”, “!”, or “"”). However, we found it could be too aggressive to remove these
|
471 |
lines, especially when the text extraction tool is “trafilatura”.
|
472 |
"""),
|
|
|
252 |
Li("Document Preparation", style = "margin-bottom: 5px"),
|
253 |
Li("Line-Level Filtering", style = "margin-bottom: 5px"),
|
254 |
Li("Local Deduplication", style = "margin-bottom: 5px"),
|
255 |
+
Li("Each section is complete with code and comparisons to Dolma,", D_cite(bibtex_key="soldaini2024dolma"),
|
256 |
+
"DataTrove,", D_cite(bibtex_key="penedo2024datatrove"),
|
257 |
+
"and/or RedPajama-V-2", D_cite(bibtex_key="redpajama-v2"), style = "margin-bottom: 5px"),
|
258 |
),
|
259 |
P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of datasets we reviewed and the comparison of filters we have applied."),
|
260 |
),
|
|
|
372 |
"""),
|
373 |
|
374 |
P(B("URL Blocklist: "), """
|
375 |
+
Following RefinedWeb, """, D_cite(bibtex_key="refinedweb"), """we manually inspected the UT1 blocklist to reduce false positives like news
|
376 |
articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
|
377 |
4.6M domain names in the UT1 blocklist. Of note, 24 URLs were detected with more than 4k matches and are shown below.
|
378 |
"""),
|
|
|
468 |
This ensured that computing quality signals would align with the final kept texts.
|
469 |
"""),
|
470 |
P(B("Terminal Punctuation: "), """
|
471 |
+
The terminal punctuation has been used in C4""", D_cite(bibtex_key="c4"), """and Dolma""", D_cite(bibtex_key="dolma"), """to remove lines that do not end with a terminal
|
472 |
punctuation mark (i.e., “.”, “?”, “!”, or “"”). However, we found it could be too aggressive to remove these
|
473 |
lines, especially when the text extraction tool is “trafilatura”.
|
474 |
"""),
|