victormiller
committed on
Commit
•
dac95a4
1
Parent(s):
5672cf7
Update curated.py
Browse files- curated.py +4 -3
curated.py
CHANGED
@@ -599,8 +599,8 @@ filtering_process = Div(
|
|
599 |
Section(
|
600 |
H3("FreeLaw"),
|
601 |
H4("Download and Extraction"),
|
602 |
-
P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"),". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function.",
|
603 |
-
|
604 |
("html", html2text),
|
605 |
("html_lawbox", html2text),
|
606 |
("html_columbia", html2text),
|
@@ -608,7 +608,8 @@ filtering_process = Div(
|
|
608 |
("html_with_citations", html2text),
|
609 |
("xml_harvard", html2text),
|
610 |
plain_text
|
611 |
-
""", language ="SQL"),
|
|
|
612 |
H4("Filtering"),
|
613 |
Ol(
|
614 |
Li("Language Filter: English"),
|
|
|
599 |
Section(
|
600 |
H3("FreeLaw"),
|
601 |
H4("Download and Extraction"),
|
602 |
+
P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"),". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
|
603 |
+
D_code("""
|
604 |
("html", html2text),
|
605 |
("html_lawbox", html2text),
|
606 |
("html_columbia", html2text),
|
|
|
608 |
("html_with_citations", html2text),
|
609 |
("xml_harvard", html2text),
|
610 |
plain_text
|
611 |
+
""", language ="SQL"),
|
612 |
+
P("All content was downloaded, leading to a high number of documents filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
|
613 |
H4("Filtering"),
|
614 |
Ol(
|
615 |
Li("Language Filter: English"),
|