TxT360

Sleeping

App Files Files Community

victormiller committed on Sep 25, 2024

Commit

a89d144

verified ·

1 Parent(s): 591cd18

Update overview.py

Browse files

Files changed (1) hide show

overview.py +25 -4

overview.py CHANGED Viewed

@@ -11,7 +11,7 @@ import web
 import common
 import results
-dataset_comparison = pd.DataFrame(
         {
             "Dataset": [
                 "TxT360",
@@ -83,6 +83,26 @@ dataset_comparison = pd.DataFrame(
                 "-",
                 "Included",
             ],
             "PG-19": [
                 "Included",
                 "-",
@@ -146,8 +166,8 @@ dataset_comparison = pd.DataFrame(
         }
     )
-table_html = dataset_comparison.to_html(index=False, border=0)
-table_div = Div(NotStr(table_html), style="margin: 40px;")
 dataset_sources = pd.DataFrame(
         {
@@ -259,7 +279,8 @@ both critical for effective LLM pre-training."""),
             P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
             H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
             P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
-            table_div,
             P("Table 2: Basic TxT360 Statistics."),
             table_div1,
         ),

 import common
 import results
+dataset_comparison1 = pd.DataFrame(
         {
             "Dataset": [
                 "TxT360",
                 "-",
                 "Included",
             ],
+        }
+    )
+table_html = dataset_comparison1.to_html(index=False, border=0)
+table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
+dataset_comparison2 = pd.DataFrame(
+        {
+            "Dataset": [
+                "TxT360",
+                "FineWeb",
+                "RefinedWeb",
+                "RedPajama-v2",
+                "C4",
+                "Dolma",
+                "RedPajama-v1",
+                "The Pile",
+            ],
             "PG-19": [
                 "Included",
                 "-",
         }
     )
+table_html2 = dataset_comparison2.to_html(index=False, border=0)
+table_div2 = Div(NotStr(table_html2), style="margin: 40px;")
 dataset_sources = pd.DataFrame(
         {
             P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
             H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
             P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
+            table_div1,
+            table_div2,
             P("Table 2: Basic TxT360 Statistics."),
             table_div1,
         ),