victormiller
committed on
Commit
•
4254834
1
Parent(s):
4a437aa
Update curated.py
Browse files- curated.py +15 -23
curated.py
CHANGED
@@ -78,7 +78,7 @@ wikipedia_filter = pd.DataFrame(
|
|
78 |
"",
|
79 |
],
|
80 |
"Total Percentage Remaining": [
|
81 |
-
"
|
82 |
],
|
83 |
}
|
84 |
)
|
@@ -107,7 +107,7 @@ freelaw_filter = pd.DataFrame(
|
|
107 |
"",
|
108 |
],
|
109 |
"Total Percentage Remaining": [
|
110 |
-
"
|
111 |
],
|
112 |
}
|
113 |
)
|
@@ -136,7 +136,7 @@ dmm_filter = pd.DataFrame(
|
|
136 |
"",
|
137 |
],
|
138 |
"Total Percentage Remaining": [
|
139 |
-
"
|
140 |
],
|
141 |
}
|
142 |
)
|
@@ -166,7 +166,7 @@ uspto_filter = pd.DataFrame(
|
|
166 |
"",
|
167 |
],
|
168 |
"Total Percentage Remaining": [
|
169 |
-
"
|
170 |
],
|
171 |
}
|
172 |
)
|
@@ -195,7 +195,7 @@ pg19_filter = pd.DataFrame(
|
|
195 |
"",
|
196 |
],
|
197 |
"Total Percentage Remaining": [
|
198 |
-
"
|
199 |
],
|
200 |
}
|
201 |
)
|
@@ -225,7 +225,7 @@ hn_filter = pd.DataFrame(
|
|
225 |
"",
|
226 |
],
|
227 |
"Total Percentage Remaining": [
|
228 |
-
"
|
229 |
],
|
230 |
}
|
231 |
)
|
@@ -255,7 +255,7 @@ uirc_filter = pd.DataFrame(
|
|
255 |
"",
|
256 |
],
|
257 |
"Total Percentage Remaining": [
|
258 |
-
"
|
259 |
],
|
260 |
}
|
261 |
)
|
@@ -284,7 +284,7 @@ up_filter = pd.DataFrame(
|
|
284 |
"",
|
285 |
],
|
286 |
"Total Percentage Remaining": [
|
287 |
-
"
|
288 |
],
|
289 |
}
|
290 |
)
|
@@ -313,7 +313,7 @@ se_filter = pd.DataFrame(
|
|
313 |
"",
|
314 |
],
|
315 |
"Total Percentage Remaining": [
|
316 |
-
"
|
317 |
],
|
318 |
}
|
319 |
)
|
@@ -342,7 +342,7 @@ arx_filter = pd.DataFrame(
|
|
342 |
"",
|
343 |
],
|
344 |
"Total Percentage Remaining": [
|
345 |
-
"
|
346 |
],
|
347 |
}
|
348 |
)
|
@@ -371,7 +371,7 @@ s2o_filter = pd.DataFrame(
|
|
371 |
"",
|
372 |
],
|
373 |
"Total Percentage Remaining": [
|
374 |
-
"
|
375 |
],
|
376 |
}
|
377 |
)
|
@@ -400,7 +400,7 @@ med_filter = pd.DataFrame(
|
|
400 |
"",
|
401 |
],
|
402 |
"Total Percentage Remaining": [
|
403 |
-
"
|
404 |
],
|
405 |
}
|
406 |
)
|
@@ -429,7 +429,7 @@ phil_filter = pd.DataFrame(
|
|
429 |
"",
|
430 |
],
|
431 |
"Total Percentage Remaining": [
|
432 |
-
"
|
433 |
],
|
434 |
}
|
435 |
)
|
@@ -445,8 +445,8 @@ filtering_process = Div(
|
|
445 |
H3("Wikipedia"),
|
446 |
H4("Download and Extraction"),
|
447 |
Ol(
|
448 |
-
Li("
|
449 |
-
Li("Data is originally in parqet format so we
|
450 |
),
|
451 |
H4("Filtering"),
|
452 |
Ol(
|
@@ -456,10 +456,6 @@ filtering_process = Div(
|
|
456 |
Ol(
|
457 |
Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
|
458 |
),
|
459 |
-
H4("Global Deduplication Process"),
|
460 |
-
Ol(
|
461 |
-
Li("After local dedup, remaining wikipedia was deduped again with all the datasets combined"),
|
462 |
-
),
|
463 |
table_div_wikipedia,
|
464 |
|
465 |
),
|
@@ -485,10 +481,6 @@ filtering_process = Div(
|
|
485 |
Ol(
|
486 |
Li("Local dedup was done with all papers combined."),
|
487 |
),
|
488 |
-
H4("Global Deduplication Process"),
|
489 |
-
Ol(
|
490 |
-
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
491 |
-
),
|
492 |
table_div_arx,
|
493 |
),
|
494 |
Section(
|
|
|
78 |
"",
|
79 |
],
|
80 |
"Total Percentage Remaining": [
|
81 |
+
"",
|
82 |
],
|
83 |
}
|
84 |
)
|
|
|
107 |
"",
|
108 |
],
|
109 |
"Total Percentage Remaining": [
|
110 |
+
"%",
|
111 |
],
|
112 |
}
|
113 |
)
|
|
|
136 |
"",
|
137 |
],
|
138 |
"Total Percentage Remaining": [
|
139 |
+
"%",
|
140 |
],
|
141 |
}
|
142 |
)
|
|
|
166 |
"",
|
167 |
],
|
168 |
"Total Percentage Remaining": [
|
169 |
+
"%",
|
170 |
],
|
171 |
}
|
172 |
)
|
|
|
195 |
"",
|
196 |
],
|
197 |
"Total Percentage Remaining": [
|
198 |
+
"%",
|
199 |
],
|
200 |
}
|
201 |
)
|
|
|
225 |
"",
|
226 |
],
|
227 |
"Total Percentage Remaining": [
|
228 |
+
"%",
|
229 |
],
|
230 |
}
|
231 |
)
|
|
|
255 |
"",
|
256 |
],
|
257 |
"Total Percentage Remaining": [
|
258 |
+
"%",
|
259 |
],
|
260 |
}
|
261 |
)
|
|
|
284 |
"",
|
285 |
],
|
286 |
"Total Percentage Remaining": [
|
287 |
+
"%",
|
288 |
],
|
289 |
}
|
290 |
)
|
|
|
313 |
"",
|
314 |
],
|
315 |
"Total Percentage Remaining": [
|
316 |
+
"%",
|
317 |
],
|
318 |
}
|
319 |
)
|
|
|
342 |
"",
|
343 |
],
|
344 |
"Total Percentage Remaining": [
|
345 |
+
"%",
|
346 |
],
|
347 |
}
|
348 |
)
|
|
|
371 |
"",
|
372 |
],
|
373 |
"Total Percentage Remaining": [
|
374 |
+
"%",
|
375 |
],
|
376 |
}
|
377 |
)
|
|
|
400 |
"",
|
401 |
],
|
402 |
"Total Percentage Remaining": [
|
403 |
+
"%",
|
404 |
],
|
405 |
}
|
406 |
)
|
|
|
429 |
"",
|
430 |
],
|
431 |
"Total Percentage Remaining": [
|
432 |
+
"%",
|
433 |
],
|
434 |
}
|
435 |
)
|
|
|
445 |
H3("Wikipedia"),
|
446 |
H4("Download and Extraction"),
|
447 |
Ol(
|
448 |
+
Li("The Wikimedia dataset was downloaded from the official snapshot on Huggingface", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main")),
|
449 |
+
Li("Data is originally in parqet format so we used the", D_code("huggingface dataset.to_json"), " function to convert the data to the jsonl format"),
|
450 |
),
|
451 |
H4("Filtering"),
|
452 |
Ol(
|
|
|
456 |
Ol(
|
457 |
Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
|
458 |
),
|
|
|
|
|
|
|
|
|
459 |
table_div_wikipedia,
|
460 |
|
461 |
),
|
|
|
481 |
Ol(
|
482 |
Li("Local dedup was done with all papers combined."),
|
483 |
),
|
|
|
|
|
|
|
|
|
484 |
table_div_arx,
|
485 |
),
|
486 |
Section(
|