|
from fasthtml.common import * |
|
from fasthtml.components import * |
|
from fasthtml.components import ( |
|
D_title, |
|
D_article, |
|
D_front_matter, |
|
D_contents, |
|
D_byline, |
|
D_bibliography, |
|
D_appendix, |
|
D_cite, |
|
) |
|
from plotly import graph_objects as go |
|
from fh_plotly import plotly2fasthtml |
|
import pandas as pd |
|
import json |
|
from rich import print |
|
import overview |
|
import curated |
|
import web |
|
import common |
|
import results |
|
from pybtex.database import parse_file |
|
import data_viewer |
|
|
|
|
|
# Elements handed to fast_app() as `hdrs`: charset/viewport metadata, the
# Distill template script, htmx, Plotly, the local stylesheet and MarkdownJS().
# NOTE(review): the htmx ("@next") and Plotly ("latest") URLs are not pinned to
# a fixed release — confirm that is intended before deploying.
_HEAD_ELEMENTS = (
    Meta(charset="UTF-8"),
    Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
    Script(src="https://distill.pub/template.v2.js"),
    Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
    Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
    Link(rel="stylesheet", href="style.css"),
    MarkdownJS(),
)

# `app` is used for the decorator-style routes below, `rt` for the handlers
# that are registered from sibling modules at the end of the file.
app, rt = fast_app(hdrs=_HEAD_ELEMENTS, pico=False, debug=True)
|
|
|
|
|
# Raw <d-front-matter> markup embedded (via NotStr) in the hidden Div built by
# main(). The JSON payload leaves title/description/authors empty and
# registers "$$" as a KaTeX delimiter pair with "display": false.
front_matter = """
<d-front-matter>
    <script id='distill-front-matter' type="text/json">{
    "title": "",
    "description": "",
    "published": "",
    "affiliation": {},
    "authors": [
        {
            "author":"",
            "authorURL":""
        }
    ],
    "katex": {
        "delimiters": [
            {"left": "$$", "right": "$$", "display": false}
        ]
    }
    }
    </script>
</d-front-matter>
"""
|
|
|
|
|
def read_bibs():
    """Return one ``D_cite`` element per entry key in ``bibliography.bib``.

    The file is parsed with pybtex's ``parse_file`` on every call; the
    resulting list keeps the order in which the entry keys are iterated.
    """
    bib_entries = parse_file("bibliography.bib").entries
    return [D_cite(bibtex_key=entry_key) for entry_key in bib_entries.keys()]
|
|
|
|
|
@app.get("/bibliography.bib")
def get():
    """Answer GET /bibliography.bib with the ``bibliography.bib`` file."""
    bib_path = "bibliography.bib"
    return FileResponse(bib_path)
|
|
|
|
|
# Element that every table-of-contents link asks htmx to swap content into.
# NOTE(review): intro() returns a Div that itself carries id="inner-text", so
# a swap that replaces only the target's inner content would nest it inside
# the existing element — confirm against the htmx swap mode in use.
_TOC_SWAP_TARGET = "#inner-text"

# Table of contents, one tuple per top-level entry:
#   (heading label, heading URL, page path, subsection labels)
# Subsection N (1-based) links to "<page path>#sectionN".
_TOC_SECTIONS = (
    (
        "TxT360",
        "/intro#section1",
        "/intro",
        (
            "About TxT360",
            "Motivation Behind Txt360",
            "Generalizable Approach to Data Processing",
        ),
    ),
    (
        "Web Data Processing",
        "/webdata",
        "/webdata",
        (
            "Common Crawl Snapshot Processing",
            "Common Crawl Data Processing Summary",
            "Document Preparation",
            "Line-Level Removal",
            "Document-Level Filtering",
        ),
    ),
    (
        "Curated Sources Processing",
        "/curated",
        "/curated",
        (
            "Curated Sources in TxT360",
            "Filtering Steps and Definitions",
            "Filtering Discussion on All Curated Sources",
        ),
    ),
    (
        "Commonly Applied Processing Steps",
        "/common#section1",
        "/common",
        (
            "Overview",
            "Motivation Behind Global Deduplication",
            "MinHash Generation",
            "Matching Pairs Generation",
            "Finding Duplicate Pairs",
            "Finding Connected Components using MapReduce",
            # Spelling fixed: this label previously read "Identifable".
            "Personally Identifiable Information Removal",
            "Normalization Form C",
        ),
    ),
    (
        "TxT360 Studies",
        "/results",
        "/results",
        (
            "Overview",
            "Upsampling Experiment",
            "Perplexity Analysis",
        ),
    ),
)


def _toc_link(label, url):
    """Return an anchor for `url` that also requests it via htmx into the content pane."""
    return A(label, href=url, hx_get=url, hx_target=_TOC_SWAP_TARGET)


def _toc_entries():
    """Build the Nav children: per section, a heading Div followed by a Div wrapping its Ul of links."""
    entries = []
    for heading, heading_url, page_path, labels in _TOC_SECTIONS:
        entries.append(Div(_toc_link(heading, heading_url)))
        sub_links = (
            Li(_toc_link(label, f"{page_path}#section{number}"))
            for number, label in enumerate(labels, start=1)
        )
        entries.append(Div(Ul(*sub_links)))
    return entries


@app.get("/")
def main():
    """Build the landing page served at ``/``.

    The returned Div contains, in order: the title block (headline and logo),
    a hidden Div holding the byline and the raw ``front_matter`` markup, the
    article (table of contents followed by the ``intro()`` content), the
    bibliography appendix, and a hidden Div with one ``D_cite`` per
    bibliography entry (from ``read_bibs()``).
    """
    title_block = D_title(
        H1(
            "TxT360: a globally deduplicated dataset for LLM pretraining",
            cls="l-body",
            style="text-align: center;",
        ),
        Div(
            Img(src="images/llm360_logo.png"),
            id="title-plot",
            cls="main-plot-container l-page",
        ),
    )
    table_of_contents = D_contents(
        Nav(
            H3("Table of Contents"),
            *_toc_entries(),
            role="navigation",
            cls="l-text figcaption",
        ),
    )
    return Div(
        title_block,
        Div(D_byline(), NotStr(front_matter), style="display: none;"),
        D_article(
            table_of_contents,
            intro(),
        ),
        D_appendix(D_bibliography(src="bibliography.bib")),
        Div(*read_bibs(), style="display: none;"),
    )
|
|
|
|
|
# Coverage matrix shown in the "Motivation" section of intro(): one row per
# data source, one column per pretraining dataset. "-" marks a source that a
# dataset does not include; "**" points at the footnote about code that
# intro() prints next to the table.
new_dataset_comparison1 = pd.DataFrame(
    {
        "Data Source": [
            "CommonCrawl Snapshots",
            "Papers",
            "Wikipedia",
            "FreeLaw",
            "DM Math",
            "USPTO",
            "PG-19",
            "HackerNews",
            "Ubuntu IRC",
            "EuroParl",
            "StackExchange",
            "Code",
        ],
        "TxT360": ["99", "5 Sources", "310+ Languages", *["Included"] * 8, "**"],
        "FineWeb": ["96", *["-"] * 11],
        "RefinedWeb": ["90", *["-"] * 11],
        # Header fixed: it was misspelled "PedPajamaV2".
        "RedPajamaV2": ["84", *["-"] * 11],
        "C4": ["1", *["-"] * 11],
        "Dolma": [
            "24",
            "1 Source",
            "Included",
            "-",
            "-",
            "-",
            "Included",
            "-",
            "-",
            "-",
            "-",
            "Included",
        ],
        "RedPajamaV1": [
            "5",
            "1 Source",
            "Included",
            # FreeLaw / DM Math / USPTO used to be "" / " " / "", which rendered
            # as empty cells; "-" matches every other "not included" cell.
            "-",
            "-",
            "-",
            "Included",
            "-",
            "-",
            "-",
            "Included",
            "Included",
        ],
        "The Pile": ["0.6% of 74", "4 Sources", "English Only", *["Included"] * 9],
    }
)
|
|
|
|
|
|
|
# Style the coverage matrix: row 0 is tinted green (#E1EEDB), the remaining
# rows alternate white (odd positions) and light blue (even positions), and
# the index column is hidden.
styled_table = (
    new_dataset_comparison1.style
    .set_properties(
        subset=pd.IndexSlice[0, :],
        **{"background-color": "#E1EEDB"},
    )
    .apply(
        # Called once per column (axis=0); returns one CSS string per row.
        lambda column: [
            "background-color: #E1EEDB"
            if position == 0
            else "background-color: white"
            if position % 2
            else "background-color: rgb(237, 242, 251)"
            for position, _ in enumerate(column)
        ],
        axis=0,
    )
    .hide(axis="index")
)

# NOTE(review): `_repr_html_` is the notebook display hook; its output is
# assumed to be the rendered HTML string here — confirm for the pandas
# version/options in use.
table_html = styled_table._repr_html_()

# Wrapped as raw HTML so the markup produced by pandas is emitted as-is.
new_table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
|
|
|
|
|
# First half of the per-dataset comparison: one row per pretraining dataset,
# one column per data source. "-" marks a source the dataset does not include.
_COMPARISON1_COLUMNS = [
    "Dataset",
    "CommonCrawl",
    "Papers",
    "Wikipedia",
    "FreeLaw",
    "DM Math",
    "USPTO",
]

dataset_comparison1 = pd.DataFrame(
    [
        ("TxT360", "99 Snapshots", "5 Sources", "310+ Languages", "Included", "Included", "Included"),
        ("FineWeb", "96 Snapshots", "-", "-", "-", "-", "-"),
        ("RefinedWeb", "90 Snapshots", "-", "-", "-", "-", "-"),
        ("RedPajama-v2", "84 Snapshots", "-", "-", "-", "-", "-"),
        # Grammar fixed: was "1 Snapshots".
        ("C4", "1 Snapshot", "-", "-", "-", "-", "-"),
        # The Wikipedia cells of the next two rows held a leftover review note
        # ("what does a check mark mean?"); new_dataset_comparison1 lists
        # Wikipedia as "Included" for both datasets, so the same value is used.
        ("Dolma", "24 Snapshots", "1 Source", "Included", "-", "-", "-"),
        ("RedPajama-v1", "5 Snapshots", "1 Source", "Included", "-", "-", "-"),
        ("The Pile", "0.6% of 74 Snapshots", "4 Sources", "English Only", "Included", "Included", "Included"),
    ],
    columns=_COMPARISON1_COLUMNS,
)
|
|
|
|
|
# Same striping scheme as the other comparison tables: row 0 green (#E1EEDB),
# later rows alternating white (odd positions) / light blue (even positions),
# index column hidden.
styled_table = (
    dataset_comparison1.style
    .set_properties(
        subset=pd.IndexSlice[0, :],
        **{"background-color": "#E1EEDB"},
    )
    .apply(
        # Called once per column (axis=0); returns one CSS string per row.
        lambda column: [
            "background-color: #E1EEDB"
            if position == 0
            else "background-color: white"
            if position % 2
            else "background-color: rgb(237, 242, 251)"
            for position, _ in enumerate(column)
        ],
        axis=0,
    )
    .hide(axis="index")
)

# NOTE(review): relies on the notebook display hook returning the HTML string.
table_html = styled_table._repr_html_()

# Wrapped as raw HTML so the markup produced by pandas is emitted as-is.
table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
|
|
|
# Second half of the per-dataset comparison: one row per pretraining dataset,
# one column per data source. "-" marks a source the dataset does not include.
_COMPARISON2_COLUMNS = [
    "Dataset",
    "PG-19",
    "HackerNews",
    "Ubuntu IRC",
    "EuroParl",
    "StackExchange",
    "Code",
]

dataset_comparison2 = pd.DataFrame(
    [
        # The "Code" cell used to carry a leftover review note
        # ("- what is this?"). intro() states that TxT360 does not include
        # code, so the plain "not included" marker is used.
        ("TxT360", "Included", "Included", "Included", "Included", "Included", "-"),
        ("FineWeb", "-", "-", "-", "-", "-", "-"),
        ("RefinedWeb", "-", "-", "-", "-", "-", "-"),
        ("RedPajama-v2", "-", "-", "-", "-", "-", "-"),
        ("C4", "-", "-", "-", "-", "-", "-"),
        ("Dolma", "Included", "-", "-", "-", "-", "Included"),
        ("RedPajama-v1", "Included", "-", "-", "-", "Included", "Included"),
        ("The Pile", "Included", "Included", "Included", "Included", "Included", "Included"),
    ],
    columns=_COMPARISON2_COLUMNS,
)
|
|
|
# Same striping scheme as the other comparison tables: row 0 green (#E1EEDB),
# later rows alternating white (odd positions) / light blue (even positions),
# index column hidden.
styled_table = (
    dataset_comparison2.style
    .set_properties(
        subset=pd.IndexSlice[0, :],
        **{"background-color": "#E1EEDB"},
    )
    .apply(
        # Called once per column (axis=0); returns one CSS string per row.
        lambda column: [
            "background-color: #E1EEDB"
            if position == 0
            else "background-color: white"
            if position % 2
            else "background-color: rgb(237, 242, 251)"
            for position, _ in enumerate(column)
        ],
        axis=0,
    )
    .hide(axis="index")
)

# NOTE(review): relies on the notebook display hook returning the HTML string.
table_html2 = styled_table._repr_html_()

# Wrapped as raw HTML so the markup produced by pandas is emitted as-is.
table_div_2 = Div(NotStr(table_html2), style="margin: 40px;")
|
|
|
# Per-source summary shown in the "Generalizable Approach" section of intro():
# one row per data source with its raw size, token count and cut-off date.
_SOURCE_COLUMNS = [
    "Data Source",
    "Raw Data Size",
    "Token Count",
    "Information Cut-Off Date",
]

_SOURCE_ROWS = [
    ("CommonCrawl", "11 TB", "5.71T", "2024-30"),
    ("Papers", "712 GB", "154.96B", "Q4 2023"),
    ("Wikipedia", "210 GB", "4.75B", "-"),
    ("Freelaw", "23 GB", "7.34B", "Q1 2024"),
    ("DM Math", "22 GB", "5.23B", "-"),
    ("USPTO", "45 GB", "4.95B", "Q4 2023"),
    ("PG-19", "11 GB", "2.94B", "-"),
    ("HackerNews", "4.1 GB", "1.08B", "Q4 2023"),
    ("Ubuntu IRC", "4.7 GB", "1.54B", "Q4 2023"),
    ("Europarl", "6.1 GB", "1.96B", "-"),
    ("StackExchange", "45 GB", "8.37B", "Q4 2023"),
]

dataset_sources = pd.DataFrame(_SOURCE_ROWS, columns=_SOURCE_COLUMNS)
|
|
|
# Zebra-stripe the source summary (even positions white, odd positions light
# blue) and hide the index column.
styled_table = (
    dataset_sources.style
    .apply(
        # Called once per column (axis=0); returns one CSS string per row.
        lambda column: [
            "background-color: rgb(237, 242, 251)"
            if position % 2
            else "background-color: white"
            for position, _ in enumerate(column)
        ],
        axis=0,
    )
    .hide(axis="index")
)

# NOTE(review): relies on the notebook display hook returning the HTML string.
table_html_data = styled_table._repr_html_()

# Wrapped as raw HTML so the markup produced by pandas is emitted as-is.
table_div_data = Div(NotStr(table_html_data), style="margin: 40px;")
|
|
|
|
|
@app.get("/intro")
def intro():
    """Build the introduction content served at ``/intro`` and embedded by ``main()``.

    Returns a Div with id ``inner-text`` holding three Sections whose ids are
    ``section1`` (about), ``section2`` (motivation, with the coverage table)
    and ``section3`` (processing approach, with the source summary table).
    """
    about_section = Section(
        H2("About TxT360"),
        P(
            B("We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models.")
        ),
        P(
            "Building on top of the prior studies on pre-training data,",
            D_cite(bibtex_key="refinedweb"),
            D_cite(bibtex_key="fineweb"),
            D_cite(bibtex_key="c4"),
            D_cite(bibtex_key="muennighoff2023scaling"),
            "TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps.",
        ),
        P(
            "Metadata is stored to recover the raw distribution for each dataset, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM",
            D_cite(bibtex_key="dclm"),
            "and RedPajama V2,",
            D_cite(bibtex_key="redpajama-v2"),
            "we present the final deduplicated dataset that is ready to go.",
        ),
        P(
            "We documented all implementation details in this blog post and are open sourcing the code. Examples of each filter and rationale supporting each decision are included."
        ),
        id="section1",
    )

    motivation_section = Section(
        H2("Motivation Behind Txt360"),
        H3(
            "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
        ),
        new_table_div_1,
        # TODO(review): the paragraph below still contains a literal "(cite)"
        # placeholder; the intended reference is not identifiable from here.
        P(
            "In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
        ),
        # Footnote for the "**" cell of the coverage table above.
        P("** TxT360 does not include code. This decision was made due to the perceived low duplication count of code. TxT360 can easily be combined with leading code dataset."),
        id="section2",
    )

    approach_section = Section(
        H2("Our Generalizable Approach to Data Processing"),
        P(
            "To produce TxT360, a comprehensive and transparent data processing pipeline was designed to account for the nuances of both web and curated datasets. The pipeline presents a unified framework for processing both data types, making it convenient and easily adaptive for users to revise and fine-tune the pipeline for their own use cases."
        ),
        P(
            "Web datasets are inherently noisy and varied. The TxT360 pipeline implements sophisticated filtering and deduplication techniques to clean and remove redundancies while preserving data integrity."
        ),
        P(
            "Curated datasets are typically structured and consistently formatted. TxT360 filters these sources with selective steps to maintain their integrity while providing seamless integration into the larger dataset. Both data source types are globally deduplicated together resulting in 5.7T tokens of high-quality data. The table below shows the source distribution of TxT360 tokens."
        ),
        table_div_data,
        P(
            "We provide details and context for the choices behind TxT360 in the respective Web Data Processing and Curated Source Processing section. A deep dive describing the deduplication process can be found in the Commonly Applied Processing Steps section."
        ),
        id="section3",
    )

    return Div(
        about_section,
        motivation_section,
        approach_section,
        id="inner-text",
    )
|
|
|
|
|
# Handlers implemented in sibling modules, registered with `rt` in the same
# order as before: (route path, handler).
_MODULE_ROUTES = (
    ("/update/{target}", data_viewer.update),
    ("/curated", curated.curated),
    ("/webdata", web.web_data),
    ("/common", common.common_steps),
    ("/results", results.results),
)

for _route_path, _route_handler in _MODULE_ROUTES:
    rt(_route_path)(_route_handler)

# Kept as a module-level call, exactly where the original invoked it.
serve()
|
|