from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print
import curated
import web
import common
import results


app, rt = fast_app(
    debug=True,
    pico=False,
    hdrs=(
        Meta(charset="UTF-8"),
        Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
        Script(src="https://distill.pub/template.v2.js"),
        Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
        Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
        Link(rel="stylesheet", href="style.css"),
        MarkdownJS(),
        HighlightJS(langs=["python", "javascript", "html", "css"]),
    ),
)
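
# Illustrative sketch, not part of the original page: the `go`/`plotly2fasthtml` imports and the
# plotly.js script registered in `hdrs` above are typically combined like this, turning a Plotly
# figure into a FastHTML component that can be dropped into any page fragment. The figure below
# uses dummy data purely for demonstration.
def example_plotly_component():
    fig = go.Figure(data=go.Scatter(x=[1, 2, 3, 4], y=[2, 1, 3, 2], mode="lines"))
    fig.update_layout(title="Example chart (dummy data)")
    return Div(plotly2fasthtml(fig), cls="main-plot-container l-page")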
@app.get("/")
def main():
    return Div(
        D_front_matter(),
        D_title(
            H1(
                "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
                cls="l-body",
                style="text-align: center;",
            ),
            Div(
                Img(src="images/llm360_logo.png"),
                id="title-plot",
                cls="main-plot-container l-page",
            ),
        ),
        D_article(
            D_contents(
                Nav(
                    H3("Table of Contents"),
                    Div(
                        A("TxT360", href="#_self"),
                        hx_get="/intro",
                        hx_target="#inner-text",
                    ),
                    Div(
                        Ul(
                            Li(
                                A(
                                    "About TxT360",
                                    href="/intro#section1",
                                    hx_get="/intro#section1",
                                    hx_target="#inner-text",
                                )
                            ),
                            Li(
                                A(
                                    "Global Deduplication",
                                    href="/intro#section2",
                                    hx_get="/intro#section2",
                                    hx_target="#inner-text",
                                )
                            ),
                            Li(
                                A(
                                    "Controllable Upweighting",
                                    href="/intro#section3",
                                    hx_get="/intro#section3",
                                    hx_target="#inner-text",
                                )
                            ),
                            Li(
                                A(
                                    "Full Documentation",
                                    href="/intro#section4",
                                    hx_get="/intro#section4",
                                    hx_target="#inner-text",
                                )
                            ),
                        ),
                    ),
                    Div(
                        A("Web Data", href="#inner-text"),
                        hx_get="/webdata",
                        hx_target="#inner-text",
                    ),
                    Div(
                        A("Curated Sources", href="#inner-text"),
                        hx_get="/curated",
                        hx_target="#inner-text",
                    ),
                    Div(
                        A("Common Steps", href="#inner-text"),
                        hx_get="/common",
                        hx_target="#inner-text",
                    ),
                    Div(
                        A("TxT360 Results", href="#inner-text"),
                        hx_get="/results",
                        hx_target="#inner-text",
                    ),
                    role="navigation",
                    cls="l-text figcaption",
                ),
            ),
            intro(),
        ),
    )


intro_text = P(
    """Pretraining performant large language models (LLMs) requires trillions of tokens of high-quality data. Many prior works, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B, have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability."""
)

intro_list = P(
    """We present TxT360, the Trillion eXtracted Text corpus, a 5.7T-token dataset for LLM pretraining that:"""
)

intro_list1 = Ol(
    Li("Curates commonly used pretraining datasets, including all of CommonCrawl"),
    Li("Employs carefully selected filters designed for each data source"),
    Li("Provides only unique data elements via global deduplication across all datasets"),
    Li("Retains all deduplication metadata for custom upweighting"),
    Li("Is production ready! Download here [link to HF repo]"),
)
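
# Hedged sketch, not part of the original pipeline: item 4 above notes that TxT360 retains its
# deduplication metadata so that users can re-weight documents themselves. The metadata key
# `duplicate_count` below is a hypothetical field name used only for illustration.
def upweight_by_duplicates(documents, max_repeats=5):
    """Repeat each document according to how often it occurred before global deduplication."""
    weighted = []
    for doc in documents:
        repeats = min(int(doc.get("duplicate_count", 1)), max_repeats)
        weighted.extend([doc] * max(repeats, 1))
    return weighted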

# The following `previous_*` paragraphs are earlier drafts kept for reference; they are not
# referenced elsewhere in this file.
previous_intro = P(
    """We are excited to introduce TxT360, a large-scale, comprehensive, and fully
transparent dataset designed for Large Language Model (LLM) pre-training. TxT360 is
engineered to strike a balance between the quantity and quality of pre-training data,
pushing the limit on both fronts. This comprehensive dataset encompasses both expansive
web-based data and highly curated data sources, making it one of the most robust LLM
pre-training corpora available today. Our web data component includes 99 snapshots from
Common Crawl, amassing 5.7 trillion tokens and occupying 11 TB of disk space in jsonl.gz
format. On the curated side, TxT360 integrates one of the most extensive collections of
high-quality sources across multiple domains, ensuring diverse and rich content referred
to as curated sources: 14 sources across 10 domains. To maintain the highest quality, we
meticulously pre-processed the web data to filter out low-quality content and conducted
thorough reviews of the curated sources. This process not only unified their formats but
also identified and rectified any anomalies. Not only do we 100% open-source our
processing scripts, but we also release the details of our data reviews, revealing the
decision-making processes behind data selection and quality assurance. This level of
transparency allows researchers and practitioners to fully understand the dataset’s
composition and make informed decisions when using TxT360 for training. Additionally,
TxT360 includes detailed documentation and analysis of the data, covering distribution
statistics, domain coverage, and the processing pipeline, which helps users navigate and
utilize the dataset effectively. Overall, TxT360 represents a significant step forward in
the availability and transparency of large-scale training data for language models,
setting a new standard for dataset quality and openness."""
)

previous_background = P(
    """The quality and size of a pre-training dataset play a crucial role in the
performance of large language models (LLMs). The community has introduced a variety of
datasets for this purpose, including purely web-based datasets like RefinedWeb [1],
RedPajama-Data-V2 [2], DCLM [3], and FineWeb [4], as well as comprehensive datasets
derived from multiple highly-curated data sources such as The Pile [5],
RedPajama-Data-V1 [6], and Dolma [7]. It is commonly known that web-based datasets
provide a vast quantity of data, while highly-curated multi-source datasets consistently
deliver high quality and diversity, both critical for effective LLM pre-training.
However, despite the advancements in both types of data, each type of dataset has its
limitations. For instance, the processing scripts for the web dataset RefinedWeb, known
for its high quality, are not public, and only about 10% of the entire dataset has been
disclosed. Conversely, the web component of existing highly-curated multi-source
datasets is relatively small compared to purely web-based datasets, limiting their
coverage and diversity compared to the scale of information from the internet. By
integrating the extensive reach of web data with the exceptional quality of curated
sources, TxT360 is crafted to meet and surpass the rigorous standards required for
state-of-the-art LLM pre-training."""
)

previous_content = P(
    """The performance of a large language model (LLM) depends heavily on the quality and
size of its pretraining dataset. However, the pretraining datasets for state-of-the-art
open LLMs like Llama 3 and Mixtral are not publicly available and very little is known
about how they were created. Reading time: 45 min. For the best reading experience, we
recommend not using a mobile phone. Recently, we released 🍷 FineWeb, a new, large-scale
(15-trillion tokens, 44TB disk space) dataset for LLM pretraining. FineWeb is derived
from 96 CommonCrawl snapshots and produces better-performing LLMs than other open
pretraining datasets. To bring more clarity in machine learning and advance the open
understanding of how to train good quality large language models, we carefully
documented and ablated all of the design choices used in FineWeb, including in-depth
investigations of deduplication and filtering strategies. The present long-form report
is a deep dive into how to create a large and high-quality web-scale dataset for LLM
pretraining. The dataset itself, 🍷 FineWeb, is available here. We are extremely
thankful to the whole distill.pub team (Christopher Olah, Shan Carter, Ludwig Schubert
in particular) for creating the template on which we based this blog post. Thanks also
for inspiring us with exquisitely crafted articles and blog posts. In this report we
also introduce 📚 FineWeb-Edu, a subset of FineWeb constructed using scalable automated
high-quality annotations for educational value, which outperforms all openly accessible
web datasets on a number of educational benchmarks such as MMLU, ARC, and OpenBookQA.
📚 FineWeb-Edu is available in two sizes/filtering levels: 1.3 trillion (very high
educational content) and 5.4 trillion (high educational content) tokens (all tokens are
measured with the GPT2 tokenizer). You can download it here. Both datasets are released
under the permissive ODC-By 1.0 license. TLDR: This blog covers a discussion on
processing and evaluating data quality at scale, the 🍷 FineWeb recipe (listing and
explaining all of our design choices), and the process followed to create its 📚
FineWeb-Edu subset."""
)

previous_conclusion = P(
    """This is the conclusion section where we summarize the key points discussed in the
blog post and provide final thoughts."""
)

@app.get("/intro")
def intro():
    return Div(
        Section(
            H2("About TxT360"),
            intro_text,
            intro_list,
            intro_list1,
            id="section1",
        ),
        Section(
            H2("Background"),
            id="section2",
        ),
        Section(
            H2("Main Content"),
            id="section3",
        ),
        Section(
            H2("Conclusion"),
            id="section4",
        ),
        id="inner-text",
    )


# Sections other than the intro are rendered by their own modules.
rt("/curated")(curated.curated)
rt("/curated/{target}")(curated.update)

rt("/webdata")(web.web_data)
rt("/webdata/{target}")(web.update)

rt("/common")(common.common_steps)

rt("/results")(results.results)

serve()