Spaces:

LLM360
/

TxT360

Running

App Files Files Community

CarisMu committed on Oct 3

Commit

02f8831

•

1 Parent(s): 59cb00c

change the backgrounds of collapsible sample text

Browse files

Files changed (1) hide show

web.py +219 -3

web.py CHANGED Viewed

@@ -297,8 +297,9 @@ def web_data():
                 Summary("Text Extraction Examples"),
                 DV2("data/sample_wet.json", "data/sample_warc.json", 3),
             style="""
-            background-color: #D3D3D3; /* Light grey background */
-            border: 1px solid #949494; /* Grey border */
             border-radius: 12px;
             """, #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
             ),
@@ -314,6 +315,11 @@ def web_data():
         Details(
             Summary("Non-English Documents"),
             DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
         ),
         #DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
@@ -321,6 +327,11 @@ def web_data():
         Details(
             Summary("English Documents Scoring Lower than 0.65"),
             DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
         ),
         H3("1.3 URL Filtering"),
@@ -338,6 +349,11 @@ def web_data():
         Details(
             Summary("24 URL domains with more than 4k matches"),
             DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
         ),
         P("""
@@ -346,6 +362,11 @@ def web_data():
         Details(
             Summary("6 url domains that are removed from the blocklist"),
             DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
         ),
         Details(
@@ -354,7 +375,12 @@ def web_data():
             "data/bad_url_doc.jsonl",
             3,
             "Sample documents whose urls are blocked by the refined url blocklist",
-            ),
         ),
         H3("1.3.2 Excluded High Quality Sources"),
@@ -368,11 +394,21 @@ def web_data():
                 non_web_urls,
                 "curated url domains that are excluded from our dataset",
             ),
         ),
         Details(
             Summary("Sample documents whose urls are in our curated url domain list"),
             DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
         ),
@@ -401,6 +437,11 @@ def web_data():
             0,
             "Sample documents with lines that are removed by the rule of terminal punctuation",
             ),
         ),
@@ -422,6 +463,11 @@ def web_data():
                 0,
                 "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
             ),
         ),
         H3("2.2 Other Rules from RefinedWeb"),
         P("""
@@ -440,6 +486,11 @@ def web_data():
                 0,
                 "Sample documents with lines that are removed by the RefinedWeb rules",
             ),
         ),
         H3("2.3 Toxic Lines"),
         P("""
@@ -455,6 +506,11 @@ def web_data():
                 json.load(open("data/toxic_lines.json")),
                 "Sample documents with toxic lines",
             ),
         ),
         H2("3. Document-Level Filtering"),
@@ -467,6 +523,11 @@ def web_data():
                 json.load(open("data/all_signals.json")),
                 "Overview of all the quality signals that are used for filtering",
             ),
         ),
         P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
         Most quality signals were initially introduced by Gopher [2] and subsequently adopted by later
@@ -505,6 +566,11 @@ def web_data():
                 len(line) * count for line, count in line_counts.items() if count > 1
             ) / max(character_count, 1)
             """, block="block", language="python"),
         ),
         Details(
             Summary("Implementations from DataTrove"),
@@ -539,6 +605,11 @@ def web_data():
             if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
                 return False, "dup_line_char_frac"
             """, block="block", language="python"),
         ),
         P("""
         After evaluating the implementations of Dolma and DataTrove (note: RedPajama V2 does not implement these two quality
@@ -580,6 +651,11 @@ def web_data():
                 sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
                 line_counts.items() if count > 1) / character_count
             """, block="block", language="python"),
         ),
         Details(
             Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
@@ -588,6 +664,11 @@ def web_data():
                 0,
                 "Sample documents filtered by excessive line repetitions / characters in repeated lines",
             ),
         ),
         H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
         P("""
@@ -611,6 +692,11 @@ def web_data():
                     value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
                     attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
             """, block="block", language="python"),
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
@@ -649,6 +735,11 @@ def web_data():
                         score = round(score, PRECISION)
                         return [(0, len(document), score)]
             """, block="block", language="python"),
         ),
         Details(
@@ -672,6 +763,11 @@ def web_data():
                 if top_char_length / len(text) > n_frac:
                     return False, f"top_n_gram"
             """, block="block", language="python"),
         ),
         P("""
         There are almost no contradictions between each implementations of fractions of characters in the most common
@@ -699,6 +795,11 @@ def web_data():
                     value = count * sum(len(w) for w in most_common_ngram) / character_count
                     attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
             """, block="block", language="python"),
         ),
         Details(
             Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
@@ -707,6 +808,11 @@ def web_data():
                 0,
                 "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
             ),
         ),
         H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
         P("""
@@ -733,6 +839,11 @@ def web_data():
                     ) / max(ng_char_count, 1)
                     attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
             """, block="block", language="python"),
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
@@ -786,6 +897,11 @@ def web_data():
                     score = round(score, PRECISION)
                     return [(0, len(document), score)]
             """, block="block", language="python"),
         ),
         Details(
@@ -811,6 +927,11 @@ def web_data():
                 if n_duplicates_char / len(text) > n_frac:
                     return False, f"duplicated_n_grams"
             """, block="block", language="python"),
         ),
         P("""
         For the computation of fraction of characters in duplicate n-gram, Dolma uses the number of characters in all
@@ -864,6 +985,11 @@ def web_data():
                     score = get_dup_ngram_frac(n, ngram_counts, text)
                     attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
             """, block="block", language="python"),
         ),
         Details(
             Summary("An example to show the difference between above implementations"),
@@ -878,6 +1004,11 @@ def web_data():
             In our implementation, there are 17*6 characters in total with 10*6 characters that are duplicated after excluding the first occurence. This results in a fraction of 10/17.
             """),
         ),
         H5(
             "Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
@@ -889,6 +1020,11 @@ def web_data():
                 0,
                 "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
             ),
         ),
         H3("3.2 Line-wise Heuristics"),
         P("""
@@ -915,6 +1051,11 @@ def web_data():
             D_code("""
             ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
             """, block="block", language="python"),
         ),
         Details(
             Summary("Bullet Point Identification Implemetations"),
@@ -959,6 +1100,11 @@ def web_data():
                 "*",  # * star
             )
             """, block="block", language="python"),
         ),
@@ -969,6 +1115,11 @@ def web_data():
                 0,
                 "Sample documents that are filtered out by line-wise heuristics",
             ),
         ),
         H3("3.3 Statistics-based Heuristics"),
@@ -1029,6 +1180,11 @@ def web_data():
                    text = unicodedata.normalize("NFD", text)
                return text
             """, block="block", language="python"),
         ),
         Details(
@@ -1040,6 +1196,11 @@ def web_data():
             non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
             n_non_symbol_words_words = len(non_symbol_words)
             """, block="block", language="python"),
         ),
         P("""
         Both Dolma and RedPajama V2 split texts into words using white spaces and newline symbols. However,
@@ -1084,6 +1245,11 @@ def web_data():
                 score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
                 return [(0, len(document), score)]
             """, block="block", language="python"),
         ),
         P("""
         However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
@@ -1100,6 +1266,11 @@ def web_data():
             ...
             attrs.num_of_sentences = count_sentences(text)
             """, block="block", language="python"),
         ),
         H3("Symbol to Word Ratio"),
@@ -1116,6 +1287,11 @@ def web_data():
                         word_count, 1
                     )
             """, block="block", language="python"),
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
@@ -1142,6 +1318,11 @@ def web_data():
                     score = round(score, PRECISION)
                     return [(0, len(document), score)]
             """, block="block", language="python"),
         ),
         Details(
@@ -1152,6 +1333,11 @@ def web_data():
             if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
                 return False, "gopher_too_many_ellipsis"
             """, block="block", language="python"),
         ),
         Details(
             Summary("TxT360 Implementation"),
@@ -1162,6 +1348,11 @@ def web_data():
             ...
             attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
             """, block="block", language="python"),
         ),
         H3("Fraction of Alphabetic Words"),
@@ -1172,6 +1363,11 @@ def web_data():
             1 for word in words if any(c.isalpha() for c in word)
         ) / max(word_count, 1)
             """, block="block", language="python"),
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
@@ -1196,6 +1392,11 @@ def web_data():
                     score = round(score, PRECISION)
                     return [(0, len(document), score)]
             """, block="block", language="python"),
         ),
         Details(
             Summary("Implementations from DataTrove"),
@@ -1207,6 +1408,11 @@ def web_data():
             ):
                 return False, "gopher_below_alpha_threshold"
             """, block="block", language="python"),
         ),
         P("""
         Both Dolma and DataTrove use `char.isalpha()` to detect whether a word contains alphabetic characters while
@@ -1233,6 +1439,11 @@ def web_data():
                 0,
                 "Sample documents that are filtered out by statistics-based heuristics",
             ),
         ),
         H3("3.4 Others"),
         P("""
@@ -1243,6 +1454,11 @@ def web_data():
         Details(
             Summary("Sample documents containing 'lorem ipsum'"),
             DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
         ),
         H2("4. Deduplication"),
         P("""

                 Summary("Text Extraction Examples"),
                 DV2("data/sample_wet.json", "data/sample_warc.json", 3),
             style="""
+            background-color: #F0F8FF; /* Light blue background */
+            padding: 15px;
+            # border: 1px solid #949494; /* Grey border */
             border-radius: 12px;
             """, #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
             ),
         Details(
             Summary("Non-English Documents"),
             DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         #DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
         Details(
             Summary("English Documents Scoring Lower than 0.65"),
             DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("1.3 URL Filtering"),
         Details(
             Summary("24 URL domains with more than 4k matches"),
             DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         P("""
         Details(
             Summary("6 url domains that are removed from the blocklist"),
             DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             "data/bad_url_doc.jsonl",
             3,
             "Sample documents whose urls are blocked by the refined url blocklist",
+            ),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("1.3.2 Excluded High Quality Sources"),
                 non_web_urls,
                 "curated url domains that are excluded from our dataset",
             ),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Sample documents whose urls are in our curated url domain list"),
             DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
             0,
             "Sample documents with lines that are removed by the rule of terminal punctuation",
             ),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
                 0,
                 "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
             ),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("2.2 Other Rules from RefinedWeb"),
         P("""
                 0,
                 "Sample documents with lines that are removed by the RefinedWeb rules",
             ),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("2.3 Toxic Lines"),
         P("""
                 json.load(open("data/toxic_lines.json")),
                 "Sample documents with toxic lines",
             ),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H2("3. Document-Level Filtering"),
                 json.load(open("data/all_signals.json")),
                 "Overview of all the quality signals that are used for filtering",
             ),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
         Most quality signals were initially introduced by Gopher [2] and subsequently adopted by later
                 len(line) * count for line, count in line_counts.items() if count > 1
             ) / max(character_count, 1)
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Implementations from DataTrove"),
             if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
                 return False, "dup_line_char_frac"
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         P("""
         After evaluating the implementations of Dolma and DataTrove (note: RedPajama V2 does not implement these two quality
                 sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
                 line_counts.items() if count > 1) / character_count
             """, block="block", language="python"),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
                 0,
                 "Sample documents filtered by excessive line repetitions / characters in repeated lines",
             ),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
         P("""
                     value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
                     attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
                         score = round(score, PRECISION)
                         return [(0, len(document), score)]
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
                 if top_char_length / len(text) > n_frac:
                     return False, f"top_n_gram"
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         P("""
         There are almost no contradictions between each implementations of fractions of characters in the most common
                     value = count * sum(len(w) for w in most_common_ngram) / character_count
                     attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
             """, block="block", language="python"),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
                 0,
                 "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
             ),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
         P("""
                     ) / max(ng_char_count, 1)
                     attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
                     score = round(score, PRECISION)
                     return [(0, len(document), score)]
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
                 if n_duplicates_char / len(text) > n_frac:
                     return False, f"duplicated_n_grams"
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         P("""
         For the computation of fraction of characters in duplicate n-gram, Dolma uses the number of characters in all
                     score = get_dup_ngram_frac(n, ngram_counts, text)
                     attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
             """, block="block", language="python"),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("An example to show the difference between above implementations"),
             In our implementation, there are 17*6 characters in total with 10*6 characters that are duplicated after excluding the first occurence. This results in a fraction of 10/17.
             """),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H5(
             "Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
                 0,
                 "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
             ),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("3.2 Line-wise Heuristics"),
         P("""
             D_code("""
             ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Bullet Point Identification Implemetations"),
                 "*",  # * star
             )
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
                 0,
                 "Sample documents that are filtered out by line-wise heuristics",
             ),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("3.3 Statistics-based Heuristics"),
                    text = unicodedata.normalize("NFD", text)
                return text
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
             n_non_symbol_words_words = len(non_symbol_words)
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         P("""
         Both Dolma and RedPajama V2 split texts into words using white spaces and newline symbols. However,
                 score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
                 return [(0, len(document), score)]
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         P("""
         However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
             ...
             attrs.num_of_sentences = count_sentences(text)
             """, block="block", language="python"),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("Symbol to Word Ratio"),
                         word_count, 1
                     )
             """, block="block", language="python"),
+             style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
                     score = round(score, PRECISION)
                     return [(0, len(document), score)]
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
                 return False, "gopher_too_many_ellipsis"
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("TxT360 Implementation"),
             ...
             attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
             """, block="block", language="python"),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("Fraction of Alphabetic Words"),
             1 for word in words if any(c.isalpha() for c in word)
         ) / max(word_count, 1)
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
                     score = round(score, PRECISION)
                     return [(0, len(document), score)]
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         Details(
             Summary("Implementations from DataTrove"),
             ):
                 return False, "gopher_below_alpha_threshold"
             """, block="block", language="python"),
+            style="""
+            background-color: #FFFAEA; /* Light yellow background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         P("""
         Both Dolma and DataTrove use `char.isalpha()` to detect whether a word contains alphabetic characters while
                 0,
                 "Sample documents that are filtered out by statistics-based heuristics",
             ),
+            style="""
+            background-color: #EAFFF1; /* Light green background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H3("3.4 Others"),
         P("""
         Details(
             Summary("Sample documents containing 'lorem ipsum'"),
             DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
+            style="""
+            background-color: #FFC0CB; /* Light pink background */
+            padding: 15px;
+            border-radius: 12px;
+            """,
         ),
         H2("4. Deduplication"),
         P("""