hunterhector
committed on
Commit
•
d098e08
1
Parent(s):
eb884e6
fix common crawl stats
Browse files - overview.py +3 -3
overview.py
CHANGED
@@ -172,7 +172,7 @@ table_div_2 = Div(NotStr(table_html2), style="margin: 40px;")
|
|
172 |
dataset_sources = pd.DataFrame(
|
173 |
{
|
174 |
"Data Source": [
|
175 |
-
"
|
176 |
"Papers",
|
177 |
"Wikipedia",
|
178 |
"Freelaw",
|
@@ -185,7 +185,7 @@ dataset_sources = pd.DataFrame(
|
|
185 |
"StackExchange",
|
186 |
],
|
187 |
"Raw Data Size": [
|
188 |
-
"
|
189 |
"712 GB",
|
190 |
"210 GB",
|
191 |
"23 GB",
|
@@ -198,7 +198,7 @@ dataset_sources = pd.DataFrame(
|
|
198 |
"45 GB",
|
199 |
],
|
200 |
"Token Count": [
|
201 |
-
"
|
202 |
"154.96B",
|
203 |
"4.75B",
|
204 |
"7.34B",
|
|
|
172 |
dataset_sources = pd.DataFrame(
|
173 |
{
|
174 |
"Data Source": [
|
175 |
+
"Common Crawl",
|
176 |
"Papers",
|
177 |
"Wikipedia",
|
178 |
"Freelaw",
|
|
|
185 |
"StackExchange",
|
186 |
],
|
187 |
"Raw Data Size": [
|
188 |
+
"9.2 TB",
|
189 |
"712 GB",
|
190 |
"210 GB",
|
191 |
"23 GB",
|
|
|
198 |
"45 GB",
|
199 |
],
|
200 |
"Token Count": [
|
201 |
+
"4.83T",
|
202 |
"154.96B",
|
203 |
"4.75B",
|
204 |
"7.34B",
|