TxT360

Running

App Files Files Community

victormiller committed on Oct 2

Commit

4254834

•

1 Parent(s): 4a437aa

Update curated.py

Browse files

Files changed (1) hide show

curated.py +15 -23

curated.py CHANGED Viewed

@@ -78,7 +78,7 @@ wikipedia_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -107,7 +107,7 @@ freelaw_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -136,7 +136,7 @@ dmm_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -166,7 +166,7 @@ uspto_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -195,7 +195,7 @@ pg19_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -225,7 +225,7 @@ hn_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -255,7 +255,7 @@ uirc_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -284,7 +284,7 @@ up_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -313,7 +313,7 @@ se_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -342,7 +342,7 @@ arx_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -371,7 +371,7 @@ s2o_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -400,7 +400,7 @@ med_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -429,7 +429,7 @@ phil_filter = pd.DataFrame(
                 "",
             ],
             "Total Percentage Remaining": [
-                "98.14%",
             ],
         }
     )
@@ -445,8 +445,8 @@ filtering_process = Div(
         H3("Wikipedia"),
         H4("Download and Extraction"),
         Ol(
-            Li("Downloaded from Wikimedia official dump of wikipedia on huggingface https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"),
-            Li("Data is originally in parqet format so we use huggingface dataset.to_json function to convert it to the jsonl format"),
         ),
         H4("Filtering"),
         Ol(
@@ -456,10 +456,6 @@ filtering_process = Div(
         Ol(
             Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
         ),
-        H4("Global Deduplication Process"),
-        Ol(
-            Li("After local dedup, remaining wikipedia was deduped again with all the datasets combined"),
-        ),
         table_div_wikipedia,
     ),
@@ -485,10 +481,6 @@ filtering_process = Div(
         Ol(
             Li("Local dedup was done with all papers combined."),
         ),
-        H4("Global Deduplication Process"),
-        Ol(
-            Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
-        ),
         table_div_arx,
     ),
     Section(

                 "",
             ],
             "Total Percentage Remaining": [
+                "",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
                 "",
             ],
             "Total Percentage Remaining": [
+                "%",
             ],
         }
     )
         H3("Wikipedia"),
         H4("Download and Extraction"),
         Ol(
+            Li("The Wikimedia dataset was downloaded from the official snapshot on Huggingface", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main")),
+            Li("Data is originally in parqet format so we used the", D_code("huggingface dataset.to_json"), " function to convert the data to the jsonl format"),
         ),
         H4("Filtering"),
         Ol(
         Ol(
             Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
         ),
         table_div_wikipedia,
     ),
         Ol(
             Li("Local dedup was done with all papers combined."),
         ),
         table_div_arx,
     ),
     Section(