omwdataset

Runtime error

App Files Community

victormiller committed on Sep 26

Commit

6263148

•

1 Parent(s): 88c0211

Update web.py

Browse files

Files changed (1) hide show

web.py +32 -16

web.py CHANGED Viewed

@@ -586,9 +586,12 @@ def web_data():
         P("""
         In this section, we introduce all the quality signals that we have used to filter out low-quality documents.
         Overview of all the quality signals that are used for filtering."""),
-        DVS(
-            json.load(open("data/all_signals.json")),
-            "Overview of all the quality signals that are used for filtering",
         ),
         P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
         Most of these quality signals were initially introduced by Gopher [2] and subsequently adopted by later
@@ -636,10 +639,13 @@ def web_data():
         ensures consistency with the overall document character count calculation.
         """),
         H5("Our Implementation"),
-        DV(
-            "data/repeat_line_frac.jsonl",
-            0,
-            "Sample documents filtered by excessive line repetitions / characters in repeated lines",
         ),
         H5("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
         P("""
@@ -663,10 +669,13 @@ def web_data():
         only once — tend to be short.
         """),
         H5("Our Implementations"),
-        DV(
-            "data/sample_top_ngram.json",
-            0,
-            "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
         ),
         H5("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
         P("""
@@ -710,10 +719,13 @@ def web_data():
         works ([2], [3], [6]), we remove the documents if more than 30% of the lines end with an ellipsis or more than
         90% of lines start with a bullet point.
         """),
-        DV(
-            "data/line_info.json",
-            0,
-            "Sample documents that are filtered out by line-wise heuristics",
         ),
         H4("3.3 Statistics-based Heuristics"),
         P("""
@@ -806,7 +818,11 @@ def web_data():
         Following C4, we remove any page where the phrase “lorem ipsum” appeared since some pages had placeholder “lorem ipsum”
         text.
         """),
-        DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum"),
         H3("4. Deduplication"),
         P("..."),  # Add detailed content and images as needed
         H3("5. PII Removal"),

         P("""
         In this section, we introduce all the quality signals that we have used to filter out low-quality documents.
         Overview of all the quality signals that are used for filtering."""),
+        Details(
+            Summary("Overview of all the quality signals that are used for filtering"),
+            DVS(
+                json.load(open("data/all_signals.json")),
+                "Overview of all the quality signals that are used for filtering",
+            ),
         ),
         P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
         Most of these quality signals were initially introduced by Gopher [2] and subsequently adopted by later
         ensures consistency with the overall document character count calculation.
         """),
         H5("Our Implementation"),
+        Details(
+            Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
+            DV(
+                "data/repeat_line_frac.jsonl",
+                0,
+                "Sample documents filtered by excessive line repetitions / characters in repeated lines",
+            ),
         ),
         H5("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
         P("""
         only once — tend to be short.
         """),
         H5("Our Implementations"),
+        Details(
+            Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
+            DV(
+                "data/sample_top_ngram.json",
+                0,
+                "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
+            ),
         ),
         H5("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
         P("""
         works ([2], [3], [6]), we remove the documents if more than 30% of the lines end with an ellipsis or more than
         90% of lines start with a bullet point.
         """),
+        Details(
+            Summary("Sample documents that are filtered out by line-wise heuristics"),
+            DV(
+                "data/line_info.json",
+                0,
+                "Sample documents that are filtered out by line-wise heuristics",
+            ),
         ),
         H4("3.3 Statistics-based Heuristics"),
         P("""
         Following C4, we remove any page where the phrase “lorem ipsum” appeared since some pages had placeholder “lorem ipsum”
         text.
         """),
+        Details(
+            Summary("Sample documents containing 'lorem ipsum'"),
+            DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
+        ),
         H3("4. Deduplication"),
         P("..."),  # Add detailed content and images as needed
         H3("5. PII Removal"),