victormiller committed on
Commit • 7c015e6
1 Parent(s): aba6489
Update main.py
main.py CHANGED
@@ -788,10 +788,12 @@ def intro():
         B("We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models.")
     ),
     P(
-        "Building on top of the prior studies on pre-training data,
+        "Building on top of the prior studies on pre-training data,"
+        D_cite(bibtex_key="refinedweb"), D_cite(bibtex_key="fineweb"), D_cite(bibtex_key="c4"), D_cite(bibtex_key="muennighoff2023scaling"),
+        "TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."
     ),
     P(
-        "Metadata is stored to recover the raw distribution for each dataset, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM and RedPajama V2, we present the final deduplicated dataset that is ready to go."
+        "Metadata is stored to recover the raw distribution for each dataset, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM", D_cite(bibtex_key="dclm"), "and RedPajama V2,", D_cite(bibtex_key="redpajama-v2"), "we present the final deduplicated dataset that is ready to go."
     ),
     P(
         "We documented all implementation details in this blog post and are open sourcing the code. Examples of each filter and rationale supporting each decision are included."
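
The revised paragraph above refers to a simple upsampling scheme driven by the per-document duplication counts stored as metadata. A minimal sketch of that idea follows; the function name upsample_by_duplication, the dup_count field, and the log-scaled repeat rule are assumptions for illustration, not TxT360's actual recipe.

import math
import random

def upsample_by_duplication(docs, max_repeat=8, seed=0):
    """Repeat each deduplicated document according to how many copies were
    observed before global deduplication (hypothetical log-scaled, capped rule)."""
    rng = random.Random(seed)
    corpus = []
    for doc in docs:
        # "dup_count" stands in for the duplication metadata the post says is kept.
        count = max(1, doc.get("dup_count", 1))
        repeats = min(max_repeat, 1 + int(math.log2(count)))
        corpus.extend([doc["text"]] * repeats)
    rng.shuffle(corpus)
    return corpus

# Example: a page seen 16 times across snapshots is kept 5 times, a unique page once.
sample = [
    {"text": "unique page", "dup_count": 1},
    {"text": "popular page", "dup_count": 16},
]
print(len(upsample_by_duplication(sample)))  # -> 6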