victormiller
committed on
Update web.py
Browse files
web.py
CHANGED
@@ -242,6 +242,7 @@ attrs.fraction_of_characters_in_duplicate_lines = sum(
|
|
242 |
|
243 |
def web_data():
|
244 |
return Div(
|
|
|
245 |
Div(
|
246 |
H2("Common Crawl Snapshot Processing"),
|
247 |
H3("What This Section Contains"),
|
@@ -287,6 +288,8 @@ def web_data():
|
|
287 |
margin-bottom: 15px
|
288 |
""",
|
289 |
),
|
|
|
|
|
290 |
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
|
291 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
292 |
table_div_filter_data,
|
@@ -325,8 +328,9 @@ def web_data():
|
|
325 |
|
326 |
# P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
|
327 |
|
328 |
-
|
329 |
-
|
|
|
330 |
|
331 |
|
332 |
P(B("Text Extraction: "), """
|
@@ -486,8 +490,9 @@ def web_data():
|
|
486 |
""",
|
487 |
),
|
488 |
|
489 |
-
|
490 |
-
|
|
|
491 |
P("""
|
492 |
Before filtering low-quality documents, we perform the line-level removal to remove low-quality lines.
|
493 |
This ensured that computing quality signals would align with the final kept texts.
|
@@ -599,8 +604,9 @@ def web_data():
|
|
599 |
margin-bottom: 15px
|
600 |
""",
|
601 |
),
|
602 |
-
|
603 |
-
|
|
|
604 |
P("""
|
605 |
In this section, we introduce each quality signal used to filter out low-quality documents.
|
606 |
"""),
|
@@ -1660,4 +1666,5 @@ def web_data():
|
|
1660 |
margin-bottom: 15px
|
1661 |
""",
|
1662 |
),
|
|
|
1663 |
)
|
|
|
242 |
|
243 |
def web_data():
|
244 |
return Div(
|
245 |
+
Section(
|
246 |
Div(
|
247 |
H2("Common Crawl Snapshot Processing"),
|
248 |
H3("What This Section Contains"),
|
|
|
288 |
margin-bottom: 15px
|
289 |
""",
|
290 |
),
|
291 |
+
id="section1",),
|
292 |
+
Section(
|
293 |
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
|
294 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
295 |
table_div_filter_data,
|
|
|
328 |
|
329 |
# P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
|
330 |
|
331 |
+
id="section2",),
|
332 |
+
Section(
|
333 |
+
H2("Document Preparation"),
|
334 |
|
335 |
|
336 |
P(B("Text Extraction: "), """
|
|
|
490 |
""",
|
491 |
),
|
492 |
|
493 |
+
id="section3",),
|
494 |
+
Section(
|
495 |
+
H2("Line-Level Removal"),
|
496 |
P("""
|
497 |
Before filtering low-quality documents, we perform the line-level removal to remove low-quality lines.
|
498 |
This ensured that computing quality signals would align with the final kept texts.
|
|
|
604 |
margin-bottom: 15px
|
605 |
""",
|
606 |
),
|
607 |
+
id="section4",),
|
608 |
+
Section(
|
609 |
+
H2("Document-Level Filtering"),
|
610 |
P("""
|
611 |
In this section, we introduce each quality signal used to filter out low-quality documents.
|
612 |
"""),
|
|
|
1666 |
margin-bottom: 15px
|
1667 |
""",
|
1668 |
),
|
1669 |
+
id="section5",),
|
1670 |
)
|