victormiller committed 3d4aecc
Parent(s): 1630e9d
Update curated.py

curated.py CHANGED (+61 -63)
@@ -12,7 +12,7 @@ import plotly.express as px
 from fasthtml.components import D_code

 overview = Div(
-    H2("Curated
+    H2("Curated Sources Processing"),
     H3("What This Section Contains"),
     P("This section provides a complete discussion on the filtering applied to the 14 curated sources that comprise the non-web data section of TxT360. The section is split into the following topic areas: "),
     Ul(
@@ -21,8 +21,12 @@ overview = Div(
     ),
 )

-
-
+curated_sources_intro = Div(
+    H2("Curated Sources in TxT360"),
+    P("Curated sources comprise high-quality, domain-specific datasets.", B(" TxT360 was strongly influenced by The Pile regarding both dataset inclusion and filtering techniques."), " These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below."),
+    P("TxT360 respects the copyright of the data sources and has not included the controversial data used in The Pile, such as YouTube and Opensubtitles, Reddit threads, and books."),
+)
+

 treemap_data = {
     'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
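The treemap_chart rendered at the bottom of the file is built outside the hunks shown in this commit. A minimal construction from treemap_data with plotly.express (imported as px at the top of curated.py) might look like the sketch below; the 'Token Count' value column is an assumed name, since this hunk only shows the 'Source' key of the dict.

# Hypothetical sketch: treemap_data is assumed to carry a numeric column
# (called 'Token Count' here) alongside the 'Source' list shown above.
import pandas as pd
import plotly.express as px

treemap_df = pd.DataFrame(treemap_data)
treemap_chart = px.treemap(
    treemap_df,
    path=['Source'],       # one tile per curated source
    values='Token Count',  # assumed size metric for each tile
    title='Curated Sources in TxT360',
)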
@@ -449,14 +453,12 @@ eu_examples = DV("data/curated_samples/europarl_raw.json", 0, "Europarl")
 ## end filtered examples


+
 data_preprocessing_div = Div(
-    H2("
+    H2("Filtering Steps and Definitions"),
     P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
-
     P("The ", B("Language Filter"), " removes documents in unwanted languages. This step improves data quality by removing irrelevant documents."),
-    H3("Minimum Word Count Filter"),
     P("The ", B("Minimum Word Count Filter"), " sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information, so a proper analysis is important for each data source."),
-
     P("The ", B("Unigram Log Probability Filter"), " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but may not capture the semantic meaning of words."),
     H3("Data Processing for S2ORC"),
     P("The formatting of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
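The three global filters defined above are described only in prose. A minimal sketch of how such document-level predicates could be implemented is given below; the function names, the injected lang_of detector, and the thresholds are illustrative assumptions, not the actual TxT360 pipeline code.

# Illustrative sketch only: names, the lang_of detector, and thresholds
# are assumptions, not the TxT360 pipeline's real implementation.
import math
from collections import Counter

def language_filter(doc: str, lang_of, keep_lang: str = 'en') -> bool:
    # Keep a document only when the detector assigns the wanted language.
    return lang_of(doc) == keep_lang

def min_word_count_filter(doc: str, min_words: int = 100) -> bool:
    # Drop documents shorter than a per-source word-count threshold.
    return len(doc.split()) >= min_words

def unigram_log_prob_filter(doc: str, unigram_counts: Counter,
                            total_tokens: int, threshold: float = -10.0) -> bool:
    # Score a document by the mean log10 probability of its words under a
    # reference unigram distribution; very low scores suggest noisy text.
    words = doc.split()
    if not words:
        return False
    mean_log_prob = sum(
        math.log10((unigram_counts.get(w, 0) + 1) / total_tokens)
        for w in words
    ) / len(words)
    return mean_log_prob >= threshold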
@@ -466,11 +468,61 @@ data_preprocessing_div = Div(
     P("The ", B("Frequency Filter"), " calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
 )

+# Data for the stacked bar chart
+data = {
+    'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
+    'Wikipedia': [61614907, 61614907, 60468491, 60468491],
+    'Freelaw': [75971288, 73690766, 68171834, 68123174],
+    'DM Maths': [112559888, 112559888, 112559888, 112559888],
+    'USPTO': [6880276, 6878964, 6749922, 6749389],
+    'PG19': [28752, 28683, 28682, 28632],
+    'Hackernews': [2064931, 2010802, 2010488, 2003636],
+    'Ubuntu IRC': [37966, 23501, 23468, 23205],
+    'Europarl': [69814, 69814, 69814, 69814],
+    'StackExchange': [23246548, 23246548, 23246352, 23246352],
+    'Arxiv': [1911867, 1869441, 1763840, 1762661],
+    'S2ORC': [12963563, 12963563, 12963563, 12963563],
+    'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
+    'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
+    'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
+    'Phil Papers': [49389, 39175, 39175, 39128]
+}
+
+# Creating a dataframe
+df = pd.DataFrame(data)
+
+# Creating the stacked bar chart
+fig = go.Figure()
+
+# Add a trace for each dataset
+for dataset in df.columns[1:]:
+    fig.add_trace(go.Bar(
+        name=dataset,
+        x=df['Filter'],
+        y=df[dataset]
+    ))
+
+# Update the layout
+fig.update_layout(
+    barmode='stack',
+    title='Document Reduction by Filter for Each Dataset',
+    xaxis_title='Filter',
+    yaxis_title='Number of Lines',
+    legend_title='Dataset',
+    height=600,
+    width=1000
+)
+
+# Store the plot for rendering below
+diff2_stacked_bar = fig


 filtering_process = Div(
     Section(
-
+        H2("Discussion on Filtering All Curated Sources"),
+        P("Below is a detailed account of how each dataset was extracted and filtered. Where specific challenges arose with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
+        plotly2fasthtml(diff2_stacked_bar),
+        H3("This section continues below with the specific filtering steps taken for all 14 curated datasets."),
     ),
     Section(
         Div(
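The new Section above states that ~8% of documents were removed across the three filter steps. If each list in the data dict holds the cumulative line counts surviving each stage, that aggregate figure can be checked directly from the dict; a small sketch under that assumption:

# Sanity-check sketch, assuming each list in `data` holds the cumulative
# line counts surviving each filter stage for that source.
initial = sum(counts[0] for source, counts in data.items() if source != 'Filter')
final = sum(counts[-1] for source, counts in data.items() if source != 'Filter')
print(f"overall reduction: {100 * (initial - final) / initial:.1f}%")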
@@ -1006,55 +1058,6 @@ def update(target: str, request):
     return get_data(
         params.get(f"data_source_{target}"), doc_id, target)

-# Data for the stacked bar chart
-data = {
-    'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
-    'Wikipedia': [61614907, 61614907, 60468491, 60468491],
-    'Freelaw': [75971288, 73690766, 68171834, 68123174],
-    'DM Maths': [112559888, 112559888, 112559888, 112559888],
-    'USPTO': [6880276, 6878964, 6749922, 6749389],
-    'PG19': [28752, 28683, 28682, 28632],
-    'Hackernews': [2064931, 2010802, 2010488, 2003636],
-    'Ubuntu IRC': [37966, 23501, 23468, 23205],
-    'Europarl': [69814, 69814, 69814, 69814],
-    'StackExchange': [23246548, 23246548, 23246352, 23246352],
-    'Arxiv': [1911867, 1869441, 1763840, 1762661],
-    'S2ORC': [12963563, 12963563, 12963563, 12963563],
-    'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
-    'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
-    'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
-    'Phil Papers': [49389, 39175, 39175, 39128]
-}
-
-# Creating a dataframe
-df = pd.DataFrame(data)
-
-# Creating the stacked bar chart
-fig = go.Figure()
-
-# Add trace for each dataset
-for dataset in df.columns[1:]:
-    fig.add_trace(go.Bar(
-        name=dataset,
-        x=df['Filter'],
-        y=df[dataset]
-    ))
-
-# Update the layout
-fig.update_layout(
-    barmode='stack',
-    title='Document Reduction by Filter for Each Dataset',
-    xaxis_title='Filter',
-    yaxis_title='Number of Lines',
-    legend_title='Dataset',
-    height=600,
-    width=1000
-)
-
-# Show the plot
-diff2_stacked_bar = fig
-
-

 def curated(request):

@@ -1118,14 +1121,9 @@ def curated(request):

     return Div(
         overview,
-
-        overview_text,
-        copyright_disclaimer,
+        curated_sources_intro,
         plotly2fasthtml(treemap_chart),
         data_preprocessing_div,
-        H2("Curated Sources Processing"),
-        plotly2fasthtml(diff2_stacked_bar),
-        P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
         filtering_process,
         #data_preparation_div,
         #H2("Local Deduplication"), are these numbers even right?