victormiller committed
Commit: fb20585 · Parent: 5d3f993
Update main.py
main.py CHANGED
@@ -117,13 +117,20 @@ def main():
     ),
 )

+    intro_text = P(
+        """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior works, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B, have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")

-
-
-
-
-
-
+    intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:""")
+
+    intro_list1 = Ol(
+        Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
+        Li("Employs carefully selected filters designed for each data source"),
+        Li("Provides only unique data elements via global deduplication across all datasets"),
+        Li("Retains all deduplication metadata for custom upweighting"),
+        Li("Is production ready! Download here [link to HF repo]")
+    )
+
+    previous_intro = P("""We are excited to introduce TxT360, a
     large-scale, comprehensive, and fully transparent
     dataset designed for Large Language Model (LLM)
     pre-training. TxT360 is engineered to strike a
@@ -161,12 +168,9 @@ def intro():
     represents a significant step forward in the
     availability and transparency of large-scale
     training data for language models, setting a new
-    standard for dataset quality and openness.""")
-
-
-    Section(
-        H2("Background"),
-        P(
+    standard for dataset quality and openness.""")
+
+    previous_background = P(
     """ The quality and size of a pre-training dataset
     play a crucial role in the performance of large
     language models (LLMs). The community has
@@ -197,11 +201,8 @@ def intro():
     rigorous standards required for state-of-the-art
     LLM pre-training. """
     ),
-
-
-    Section(
-        H2("Main Content"),
-        P("""The performance of a large language model (LLM)
+
+    previous_content = P("""The performance of a large language model (LLM)
     depends heavily on the quality and size of its
     pretraining dataset. However, the pretraining
     datasets for state-of-the-art open LLMs like Llama
@@ -246,13 +247,34 @@ def intro():
     (listing and explaining all of our design choices),
     and the process followed to create its 📚
     FineWeb-Edu subset."""),
+
+    previous_conclusion = P("""This is the conclusion section where we
+    summarize the key points discussed in the blog post
+    and provide final thoughts."""),
+
+@app.get("/intro")
+def intro():
+    return Div(
+        Section(
+            H2("About TxT360"),
+            intro_text,
+            intro_list,
+            intro_list1,
+            id="section1",
+        ),
+        Section(
+            H2("Background"),
+
+            id="section2",
+        ),
+        Section(
+            H2("Main Content"),
+
            id="section3",
        ),
        Section(
            H2("Conclusion"),
-
-            summarize the key points discussed in the blog post
-            and provide final thoughts."""),
+
            id="section4",
        ),
        id="inner-text",
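For context on the refactor above: the commit moves the page copy out of inline Section(...) blocks into named fragments (intro_text, intro_list, intro_list1, and the previous_* variables) and adds a decorated /intro route that assembles them. Below is a minimal, self-contained sketch of that pattern, assuming the Space uses python-fasthtml's standard fast_app()/serve() entry points (not shown in the diff); the fragment text is abbreviated and only two of the four sections are reproduced.

# Minimal sketch of the pattern introduced in this commit.
# Assumptions: python-fasthtml with its usual fast_app()/serve() entry points;
# fragment text is shortened and only two sections are shown.
from fasthtml.common import *

app, rt = fast_app()

# Page fragments extracted into module-level names, as in the diff.
intro_text = P(
    """Pretraining performant large language models (LLMs) requires trillions
    of tokens of high quality data."""
)
intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T
    token dataset for pretraining projects that:""")
intro_list1 = Ol(
    Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
    Li("Employs carefully selected filters designed for each data source"),
)

@app.get("/intro")
def intro():
    # The route composes the named fragments into Sections instead of
    # defining every Section body inline.
    return Div(
        Section(H2("About TxT360"), intro_text, intro_list, intro_list1, id="section1"),
        Section(H2("Background"), id="section2"),
        id="inner-text",
    )

serve()

Note that the previous_* variables in the commit keep the earlier copy of each section's text available, but the new intro() route shown in the diff does not yet render them; the sketch omits them for brevity.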