victormiller
committed on
Commit
•
a89d144
1
Parent(s):
591cd18
Update overview.py
Browse files- overview.py +25 -4
overview.py
CHANGED
@@ -11,7 +11,7 @@ import web
|
|
11 |
import common
|
12 |
import results
|
13 |
|
14 |
-
|
15 |
{
|
16 |
"Dataset": [
|
17 |
"TxT360",
|
@@ -83,6 +83,26 @@ dataset_comparison = pd.DataFrame(
|
|
83 |
"-",
|
84 |
"Included",
|
85 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
"PG-19": [
|
87 |
"Included",
|
88 |
"-",
|
@@ -146,8 +166,8 @@ dataset_comparison = pd.DataFrame(
|
|
146 |
}
|
147 |
)
|
148 |
|
149 |
-
|
150 |
-
|
151 |
|
152 |
dataset_sources = pd.DataFrame(
|
153 |
{
|
@@ -259,7 +279,8 @@ both critical for effective LLM pre-training."""),
|
|
259 |
P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
|
260 |
H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
|
261 |
P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
|
262 |
-
|
|
|
263 |
P("Table 2: Basic TxT360 Statistics."),
|
264 |
table_div1,
|
265 |
),
|
|
|
11 |
import common
|
12 |
import results
|
13 |
|
14 |
+
dataset_comparison1 = pd.DataFrame(
|
15 |
{
|
16 |
"Dataset": [
|
17 |
"TxT360",
|
|
|
83 |
"-",
|
84 |
"Included",
|
85 |
],
|
86 |
+
|
87 |
+
}
|
88 |
+
)
|
89 |
+
|
90 |
+
table_html = dataset_comparison1.to_html(index=False, border=0)
|
91 |
+
table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
|
92 |
+
|
93 |
+
dataset_comparison2 = pd.DataFrame(
|
94 |
+
{
|
95 |
+
"Dataset": [
|
96 |
+
"TxT360",
|
97 |
+
"FineWeb",
|
98 |
+
"RefinedWeb",
|
99 |
+
"RedPajama-v2",
|
100 |
+
"C4",
|
101 |
+
"Dolma",
|
102 |
+
"RedPajama-v1",
|
103 |
+
"The Pile",
|
104 |
+
],
|
105 |
+
|
106 |
"PG-19": [
|
107 |
"Included",
|
108 |
"-",
|
|
|
166 |
}
|
167 |
)
|
168 |
|
169 |
+
table_html2 = dataset_comparison2.to_html(index=False, border=0)
|
170 |
+
table_div2 = Div(NotStr(table_html2), style="margin: 40px;")
|
171 |
|
172 |
dataset_sources = pd.DataFrame(
|
173 |
{
|
|
|
279 |
P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
|
280 |
H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
|
281 |
P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
|
282 |
+
table_div1,
|
283 |
+
table_div2,
|
284 |
P("Table 2: Basic TxT360 Statistics."),
|
285 |
table_div1,
|
286 |
),
|