victormiller committed 3d4aecc
Parent(s): 1630e9d
Update curated.py

curated.py CHANGED (+61 -63)
@@ -12,7 +12,7 @@ import plotly.express as px
 from fasthtml.components import D_code

 overview = Div(
-    H2("Curated
+    H2("Curated Sources Processing"),
     H3("What This Section Contains"),
     P("This section provides a complete discussion on the filtering applied to the 14 curated sources that comprise the non-web data section of TxT360. The section is split into the following topic areas: "),
     Ul(
@@ -21,8 +21,12 @@ overview = Div(
     ),
 )

-
-
+curated_sources_intro = Div(
+    H2("Curated Sources in TxT360"),
+    P("Curated sources comprise high-quality, domain-specific datasets.", B(" TxT360 was strongly influenced by The Pile regarding both dataset inclusion and filtering techniques."), " These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below."),
+    P("TxT360 respects the copyright of the data sources and has not included the controversial data used in The Pile, such as YouTube and Opensubtitles, Reddit threads, and books."),
+)
+

 treemap_data = {
     'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
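The treemap_chart rendered at the bottom of the file is built outside the hunks shown in this commit. A minimal construction from treemap_data with plotly.express (imported as px at the top of curated.py) might look like the sketch below; the 'Token Count' value column is an assumed name, since this hunk only shows the 'Source' key of the dict.

# Hypothetical sketch: treemap_data is assumed to carry a numeric column
# (called 'Token Count' here) alongside the 'Source' list shown above.
import pandas as pd
import plotly.express as px

treemap_df = pd.DataFrame(treemap_data)
treemap_chart = px.treemap(
    treemap_df,
    path=['Source'],       # one tile per curated source
    values='Token Count',  # assumed size metric for each tile
    title='Curated Sources in TxT360',
)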
@@ -449,14 +453,12 @@ eu_examples = DV("data/curated_samples/europarl_raw.json", 0, "Europarl")
 ## end filtered examples


+
 data_preprocessing_div = Div(
-    H2("
+    H2("Filtering Steps and Definitions"),
     P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
-
     P("The ", B("Language Filter"), " removes documents in unwanted languages. This step improves data quality by removing irrelevant documents."),
-    H3("Minimum Word Count Filter"),
     P("The ", B("Minimum Word Count Filter"), " sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information, so a proper analysis is important for each data source."),
-
     P("The ", B("Unigram Log Probability Filter"), " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but may not capture the semantic meaning of words."),
     H3("Data Processing for S2ORC"),
     P("The formatting of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
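The three global filters defined above are described only in prose. A minimal sketch of how such document-level predicates could be implemented is given below; the function names, the injected lang_of detector, and the thresholds are illustrative assumptions, not the actual TxT360 pipeline code.

# Illustrative sketch only: names, the lang_of detector, and thresholds
# are assumptions, not the TxT360 pipeline's real implementation.
import math
from collections import Counter

def language_filter(doc: str, lang_of, keep_lang: str = 'en') -> bool:
    # Keep a document only when the detector assigns the wanted language.
    return lang_of(doc) == keep_lang

def min_word_count_filter(doc: str, min_words: int = 100) -> bool:
    # Drop documents shorter than a per-source word-count threshold.
    return len(doc.split()) >= min_words

def unigram_log_prob_filter(doc: str, unigram_counts: Counter,
                            total_tokens: int, threshold: float = -10.0) -> bool:
    # Score a document by the mean log10 probability of its words under a
    # reference unigram distribution; very low scores suggest noisy text.
    words = doc.split()
    if not words:
        return False
    mean_log_prob = sum(
        math.log10((unigram_counts.get(w, 0) + 1) / total_tokens)
        for w in words
    ) / len(words)
    return mean_log_prob >= threshold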
@@ -466,11 +468,61 @@ data_preprocessing_div = Div(
     P("The ", B("Frequency Filter"), " calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
 )

+# Data for the stacked bar chart
+data = {
+    'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
+    'Wikipedia': [61614907, 61614907, 60468491, 60468491],
+    'Freelaw': [75971288, 73690766, 68171834, 68123174],
+    'DM Maths': [112559888, 112559888, 112559888, 112559888],
+    'USPTO': [6880276, 6878964, 6749922, 6749389],
+    'PG19': [28752, 28683, 28682, 28632],
+    'Hackernews': [2064931, 2010802, 2010488, 2003636],
+    'Ubuntu IRC': [37966, 23501, 23468, 23205],
+    'Europarl': [69814, 69814, 69814, 69814],
+    'StackExchange': [23246548, 23246548, 23246352, 23246352],
+    'Arxiv': [1911867, 1869441, 1763840, 1762661],
+    'S2ORC': [12963563, 12963563, 12963563, 12963563],
+    'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
+    'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
+    'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
+    'Phil Papers': [49389, 39175, 39175, 39128]
+}
+
+# Creating a dataframe
+df = pd.DataFrame(data)
+
+# Creating the stacked bar chart
+fig = go.Figure()
+
+# Add a trace for each dataset
+for dataset in df.columns[1:]:
+    fig.add_trace(go.Bar(
+        name=dataset,
+        x=df['Filter'],
+        y=df[dataset]
+    ))
+
+# Update the layout
+fig.update_layout(
+    barmode='stack',
+    title='Document Reduction by Filter for Each Dataset',
+    xaxis_title='Filter',
+    yaxis_title='Number of Lines',
+    legend_title='Dataset',
+    height=600,
+    width=1000
+)
+
+# Store the plot for rendering below
+diff2_stacked_bar = fig


 filtering_process = Div(
     Section(
-
+        H2("Discussion on Filtering All Curated Sources"),
+        P("Below is a detailed account of how each dataset was extracted and filtered. Where specific challenges arose with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
+        plotly2fasthtml(diff2_stacked_bar),
+        H3("This section continues below with the specific filtering steps taken for all 14 curated datasets."),
     ),
     Section(
         Div(
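The new Section above states that ~8% of documents were removed across the three filter steps. If each list in the data dict holds the cumulative line counts surviving each stage, that aggregate figure can be checked directly from the dict; a small sketch under that assumption:

# Sanity-check sketch, assuming each list in `data` holds the cumulative
# line counts surviving each filter stage for that source.
initial = sum(counts[0] for source, counts in data.items() if source != 'Filter')
final = sum(counts[-1] for source, counts in data.items() if source != 'Filter')
print(f"overall reduction: {100 * (initial - final) / initial:.1f}%")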
@@ -1006,55 +1058,6 @@ def update(target: str, request):
     return get_data(
         params.get(f"data_source_{target}"), doc_id, target)

-# Data for the stacked bar chart
-data = {
-    'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
-    'Wikipedia': [61614907, 61614907, 60468491, 60468491],
-    'Freelaw': [75971288, 73690766, 68171834, 68123174],
-    'DM Maths': [112559888, 112559888, 112559888, 112559888],
-    'USPTO': [6880276, 6878964, 6749922, 6749389],
-    'PG19': [28752, 28683, 28682, 28632],
-    'Hackernews': [2064931, 2010802, 2010488, 2003636],
-    'Ubuntu IRC': [37966, 23501, 23468, 23205],
-    'Europarl': [69814, 69814, 69814, 69814],
-    'StackExchange': [23246548, 23246548, 23246352, 23246352],
-    'Arxiv': [1911867, 1869441, 1763840, 1762661],
-    'S2ORC': [12963563, 12963563, 12963563, 12963563],
-    'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
-    'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
-    'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
-    'Phil Papers': [49389, 39175, 39175, 39128]
-}
-
-# Creating a dataframe
-df = pd.DataFrame(data)
-
-# Creating the stacked bar chart
-fig = go.Figure()
-
-# Add trace for each dataset
-for dataset in df.columns[1:]:
-    fig.add_trace(go.Bar(
-        name=dataset,
-        x=df['Filter'],
-        y=df[dataset]
-    ))
-
-# Update the layout
-fig.update_layout(
-    barmode='stack',
-    title='Document Reduction by Filter for Each Dataset',
-    xaxis_title='Filter',
-    yaxis_title='Number of Lines',
-    legend_title='Dataset',
-    height=600,
-    width=1000
-)
-
-# Show the plot
-diff2_stacked_bar = fig
-
-

 def curated(request):

@@ -1118,14 +1121,9 @@ def curated(request):

     return Div(
         overview,
-
-        overview_text,
-        copyright_disclaimer,
+        curated_sources_intro,
         plotly2fasthtml(treemap_chart),
         data_preprocessing_div,
-        H2("Curated Sources Processing"),
-        plotly2fasthtml(diff2_stacked_bar),
-        P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
         filtering_process,
         #data_preparation_div,
         #H2("Local Deduplication"), are these numbers even right?