victormiller
committed on
Commit
•
ba24833
1
Parent(s):
7c015e6
Update main.py
Browse files
main.py
CHANGED
@@ -788,7 +788,7 @@ def intro():
|
|
788 |
B("We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models.")
|
789 |
),
|
790 |
P(
|
791 |
-
"Building on top of the prior studies on pre-training data,"
|
792 |
D_cite(bibtex_key="refinedweb"), D_cite(bibtex_key="fineweb"), D_cite(bibtex_key="c4"), D_cite(bibtex_key="muennighoff2023scaling"),
|
793 |
"TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."
|
794 |
),
|
|
|
788 |
B("We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models.")
|
789 |
),
|
790 |
P(
|
791 |
+
"Building on top of the prior studies on pre-training data,",
|
792 |
D_cite(bibtex_key="refinedweb"), D_cite(bibtex_key="fineweb"), D_cite(bibtex_key="c4"), D_cite(bibtex_key="muennighoff2023scaling"),
|
793 |
"TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."
|
794 |
),
|