victormiller
committed on
Commit
•
dc1c900
1
Parent(s):
adfc108
Update curated.py
Browse files - curated.py +4 -4
curated.py
CHANGED
@@ -681,15 +681,15 @@ filtering_process = Div(
|
|
681 |
H4("Filtering"),
|
682 |
Ul(
|
683 |
Li(P(B("Hyphenation Removal:"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python"))),
|
684 |
-
Li(P(B("Newline Filtering:"), D_code("This is/na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")))
|
685 |
Li(P(B("Header/Footer Filtering:"), D_code("(c) 2023 Company Name.", language="python"), " is removed ",))),
|
686 |
-
Li(P(B("Double Whitespace Filtering:"), D_code("This is a test.", language="python"), " becomes ", D_code("This is a test.", language="python")))
|
687 |
Li(P(B("Mean Line Length Check: "), "removes paragraphs with an average line length of < 2.0")),
|
688 |
Li(P(B("CID Percentage Filter: "), "removes LaTex heavy paragraphs that contain over 10% “CID” font artifacts.")),
|
689 |
Li(P(B("Letterness Filter: "), "discards paragraphs with a low proportion of letters")),
|
690 |
-
Li(P(B("Removing Leading/Trailing Numbers: "), "removes numbers at the start or end of paragraphs. ", D_code("1 This is a sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python")))
|
691 |
Li(P(B("Fixing Unicode Issues: "), "fixes Unicode issues.")),
|
692 |
-
Li(P(B("Combining Diacritics Correction: "), D_code("a'", language="python"), " becomes ", D_code("å", language="python")))
|
693 |
Li(P(B("Unigram Log Probability: "), "the document must have higher than -20 average unigram log probability.")),
|
694 |
),
|
695 |
table_div_phil,
|
|
|
681 |
H4("Filtering"),
|
682 |
Ul(
|
683 |
Li(P(B("Hyphenation Removal:"), D_code("end-of", language="python"), " becomes ", D_code("end of", language="python"))),
|
684 |
+
Li(P(B("Newline Filtering:"), D_code("This is/na sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python"))),
|
685 |
Li(P(B("Header/Footer Filtering:"), D_code("(c) 2023 Company Name.", language="python"), " is removed ",))),
|
686 |
+
Li(P(B("Double Whitespace Filtering:"), D_code("This is a test.", language="python"), " becomes ", D_code("This is a test.", language="python"))),
|
687 |
Li(P(B("Mean Line Length Check: "), "removes paragraphs with an average line length of < 2.0")),
|
688 |
Li(P(B("CID Percentage Filter: "), "removes LaTex heavy paragraphs that contain over 10% “CID” font artifacts.")),
|
689 |
Li(P(B("Letterness Filter: "), "discards paragraphs with a low proportion of letters")),
|
690 |
+
Li(P(B("Removing Leading/Trailing Numbers: "), "removes numbers at the start or end of paragraphs. ", D_code("1 This is a sentence.", language="python"), " becomes ", D_code("This is a sentence.", language="python"))),
|
691 |
Li(P(B("Fixing Unicode Issues: "), "fixes Unicode issues.")),
|
692 |
+
Li(P(B("Combining Diacritics Correction: "), D_code("a'", language="python"), " becomes ", D_code("å", language="python"))),
|
693 |
Li(P(B("Unigram Log Probability: "), "the document must have higher than -20 average unigram log probability.")),
|
694 |
),
|
695 |
table_div_phil,
|