victormiller committed on
Commit • ee43c81
1 Parent(s): d82cc95
Update curated.py

curated.py +322 -126

curated.py CHANGED
@@ -484,6 +484,291 @@ freelaw_examples = Div(
     ),
 )
 
+
+def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Wikipedia":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/wiki.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Wikipedia",
+        data_sources="Wikipedia",
+        target=target,
+    )
+
+wiki_examples = Div(
+    Div(
+        get_wiki_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "StackExchange":
+        raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/stackexchange_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="StackExchange",
+        data_sources="StackExchange",
+        target=target,
+    )
+
+se_examples = Div(
+    Div(
+        get_se_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "PhilPapers":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/philpapers_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="PhilPapers",
+        data_sources="PhilPapers",
+        target=target,
+    )
+
+phil_examples = Div(
+    Div(
+        get_phil_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_arx_data(data_source: str = "Arxiv", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Arxiv":
+        raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/arxiv_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Arxiv",
+        data_sources="Arxiv",
+        target=target,
+    )
+
+arx_examples = Div(
+    Div(
+        get_arx_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "S2ORC":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/s2orc_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="S2ORC",
+        data_sources="S2ORC",
+        target=target,
+    )
+
+s2o_examples = Div(
+    Div(
+        get_S2ORC_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "S2ORC Abstract":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/s2orc_abstract_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="S2ORC Abstract",
+        data_sources="S2ORC Abstract",
+        target=target,
+    )
+
+s2oa_examples = Div(
+    Div(
+        get_S2ORCA_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Pubmed":
+        raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/pubmed_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Pubmed",
+        data_sources="Pubmed",
+        target=target,
+    )
+
+pubmed_examples = Div(
+    Div(
+        get_pubmed_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "DM Math":
+        raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
+        extracted_sample_doc = json.load(
+            open("data/curated_samples/dm_maths_extract.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="DM Math",
+        data_sources="DM Math",
+        target=target,
+    )
+
+dmm_examples = Div(
+    Div(
+        get_dmm_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "PG19":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/pg19_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="PG19",
+        data_sources="PG19",
+        target=target,
+    )
+
+pg19_examples = Div(
+    Div(
+        get_pg19_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
+def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
+    doc_id = max(0, min(int(doc_id), 9))
+
+    if data_source == "Europarl":
+        raw_sample_doc = extracted_sample_doc = json.load(
+            open("data/curated_samples/europarl_raw.json")
+        )
+    else:
+        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
+
+    raw_json = raw_sample_doc[doc_id]
+    extracted_json = extracted_sample_doc[doc_id]
+    return view_data(
+        raw_json,
+        extracted_json,
+        doc_id=doc_id,
+        data_source="Europarl",
+        data_sources="Europarl",
+        target=target,
+    )
+
+eu_examples = Div(
+    Div(
+        get_eu_data(target=gen_random_id()),
+        style="border: 1px solid #ccc; padding: 20px;",
+    ),
+)
+
 filtering_process = Div(
     Section(
         H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
@@ -497,6 +782,10 @@ filtering_process = Div(
             H4("Filtering"),
             P("Manual inspection of the dataset demonstrated high quality content. Only one filter was used to remove articles with few words. Based on normal sentence constructs, the article was kept if it contained 10 or more words. Any article with fewer than 10 words was removed."),
             table_div_wikipedia,
+            Details(
+                Summary("Wikipedia Filtering Examples"),
+                wiki_examples,
+            ),
         ),
     ),
     Section(
@@ -514,6 +803,10 @@ filtering_process = Div(
                 Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
             ),
             table_div_arx,
+            Details(
+                Summary("ArXiv Filtering Examples"),
+                arx_examples,
+            ),
         ),
     ),
     Section(
@@ -552,6 +845,10 @@ filtering_process = Div(
                 Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
             ),
             table_div_s2o,
+            Details(
+                Summary("FreeLaw Filtering Examples -- need to update"),
+                freelaw_examples,
+            ),
         ),
     ),
     Section(
@@ -584,6 +881,10 @@ filtering_process = Div(
                 Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
             ),
             table_div_med,
+            Details(
+                Summary("PubMed Filtering Examples"),
+                pubmed_examples,
+            ),
         ),
     ),
     Section(
@@ -597,6 +898,10 @@ filtering_process = Div(
                 Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
             ),
             table_div_phil,
+            Details(
+                Summary("Phil Papers Filtering Examples"),
+                phil_examples,
+            ),
         ),
     ),
     Section(
@@ -608,6 +913,10 @@ filtering_process = Div(
             H4("Filtering"),
             P("EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained 'TAGS' which were removed."),
             table_div_up,
+            Details(
+                Summary("EuroParl Filtering Examples"),
+                eu_examples,
+            ),
         ),
     ),
     Section(
@@ -697,6 +1006,10 @@ filtering_process = Div(
                 Li("Minimum Word Count Filter: 10"),
             ),
             table_div_se,
+            Details(
+                Summary("StackExchange Filtering Examples"),
+                se_examples,
+            ),
         ),
     ),
     Section(
@@ -724,7 +1037,7 @@ filtering_process = Div(
                 Li("Minimum Word Count Filter: 10"),
                 Li("Unigram Log Probability"),
             ),
-                table_div_uirc,
+            table_div_uirc,
         ),
     ),
     Section(
@@ -745,6 +1058,10 @@ filtering_process = Div(
                 Li("None"),
             ),
             table_div_dmm,
+            Details(
+                Summary("DM Math Filtering Examples"),
+                dmm_examples,
+            ),
         ),
     ),
     Section(
@@ -762,6 +1079,10 @@ filtering_process = Div(
                 Li("Unigram Log Probability"),
             ),
             table_div_pg19,
+            Details(
+                Summary("PG-19 Filtering Examples"),
+                pg19_examples,
+            ),
         ),
     ),
 )
@@ -887,78 +1208,6 @@ table_div_data_pipe = Div(NotStr(table_html_data_pipe), style="margin: 40px;")
 
 
 
-
-def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
-    doc_id = max(0, min(int(doc_id), 9))
-
-    if data_source == "Freelaw":
-        raw_sample_doc = json.load(open("data/curated_samples/freelaw_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/freelaw_extract.json")
-        )
-    elif data_source == "Wikipedia":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/wiki.json")
-        )
-    elif data_source == "StackExchange":
-        raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/stackexchange_extract.json")
-        )
-    elif data_source == "PhilPapers":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/philpapers_raw.json")
-        )
-    elif data_source == "Arxiv":
-        raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/arxiv_extract.json")
-        )
-    elif data_source == "S2ORC":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/s2orc_raw.json")
-        )
-    elif data_source == "S2ORC Abstract":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/s2orc_abstract_raw.json")
-        )
-    elif data_source == "Pubmed":
-        raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/pubmed_extract.json")
-        )
-    elif data_source == "DM Maths":
-        raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
-        extracted_sample_doc = json.load(
-            open("data/curated_samples/dm_maths_extract.json")
-        )
-    elif data_source == "PG19":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/pg19_raw.json")
-        )
-    elif data_source == "Europarl":
-        raw_sample_doc = extracted_sample_doc = json.load(
-            open("data/curated_samples/europarl_raw.json")
-        )
-    else:
-        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
-
-    raw_json = raw_sample_doc[doc_id]
-    extracted_json = extracted_sample_doc[doc_id]
-    return view_data(
-        raw_json,
-        extracted_json,
-        doc_id=doc_id,
-        data_source=data_source,
-        data_sources=data_sources,
-        target=target,
-    )
-
-
-
-
-
-
 def update(target: str, request):
     params = request.query_params
     if data_source := params.get(f"data_source_{target}"):
@@ -1082,58 +1331,6 @@ def curated(request):
     )
 
 
-    preprocessing_steps = pd.DataFrame(
-        {
-            "Step": [
-                "Language Filter",
-                "Min Word Count",
-                "Title Abstract",
-                "Majority Language",
-                "Paragraph Count",
-                "Frequency",
-                "Unigram Log Probability",
-            ],
-            "Description": [
-                "Filtering data based on language",
-                "Setting a minimum word count threshold",
-                "Extracting information from the title and abstract",
-                "Identifying the majority language in the dataset",
-                "Counting the number of paragraphs in each document",
-                "Calculating the frequency of each word in the dataset",
-                "Calculating the log probability of each unigram",
-            ],
-            "Need": [
-                "To remove documents in unwanted languages",
-                "To filter out documents with very few words",
-                "To extract relevant information for analysis",
-                "To understand the distribution of languages in the dataset",
-                "To analyze the structure and length of documents",
-                "To identify important words in the dataset",
-                "To measure the significance of individual words",
-            ],
-            "Pros": [
-                "Improves data quality by removing irrelevant documents",
-                "Filters out low-quality or incomplete documents",
-                "Provides additional information for analysis",
-                "Enables language-specific analysis and insights",
-                "Helps understand the complexity and content of documents",
-                "Identifies important terms and topics in the dataset",
-                "Quantifies the importance of individual words",
-            ],
-            "Cons": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-            ],
-        }
-    )
-
-    table_html = preprocessing_steps.to_html(index=False, border=0)
-    table_div = Div(NotStr(table_html), style="margin: 40px;")
     data_preprocessing_div = Div(
         H2("Data Preprocessing"),
         P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
@@ -1166,7 +1363,6 @@ def curated(request):
         plotly2fasthtml(diff2_stacked_bar),
         P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
         filtering_process,
-        freelaw_examples,
         data_preparation_div,
         #H2("Local Deduplication"), are these numbers even right?
         #local_dedup_text,