Spaces:
Running
Running
from fasthtml.common import * | |
from fasthtml.components import * | |
from plotly import graph_objects as go | |
from fh_plotly import plotly2fasthtml | |
import pandas as pd | |
import json | |
from data_viewer import view_data, gen_random_id | |
from rich import print | |
import uuid | |
# Names of the curated data sources; get_data() forwards this list to
# view_data(). The order appears to follow the acquisition-method grouping
# used by the "Source" column of the table built in curated() -- confirm
# before reordering.
data_sources = [
    "Freelaw", "Wikipedia", "PhilPapers", "Arxiv",
    "S2ORC", "S2ORC Abstract", "Pubmed",
    "USPTO", "Hackernews", "Ubuntu IRC",
    "StackExchange",
    "DM Maths",
    "PG19", "Europarl",
]
# Directory holding the curated sample files read by get_data().
_CURATED_SAMPLE_DIR = "data/curated_samples"

# Data source name -> (raw sample file, extracted sample file).
# Sources that show the same documents in both views list one file twice.
# Sources without an entry here (e.g. "USPTO", "Hackernews", "Ubuntu IRC")
# get empty placeholder documents in get_data().
_CURATED_SAMPLE_FILES = {
    "Freelaw": ("freelaw_raw.json", "freelaw_extract.json"),
    "Wikipedia": ("wiki.json", "wiki.json"),
    "StackExchange": ("stackexchange_raw.json", "stackexchange_extract.json"),
    "PhilPapers": ("philpapers_raw.json", "philpapers_raw.json"),
    "Arxiv": ("arxiv_raw.json", "arxiv_extract.json"),
    "S2ORC": ("s2orc_raw.json", "s2orc_raw.json"),
    "S2ORC Abstract": ("s2orc_abstract_raw.json", "s2orc_abstract_raw.json"),
    "Pubmed": ("pubmed_raw.json", "pubmed_extract.json"),
    "DM Maths": ("dm_maths_raw.json", "dm_maths_extract.json"),
    "PG19": ("pg19_raw.json", "pg19_raw.json"),
    "Europarl": ("europarl_raw.json", "europarl_raw.json"),
}


def _load_sample_docs(file_name):
    """Parse one JSON sample file from the curated-samples directory."""
    # The context manager closes the handle even if parsing fails; the
    # previous json.load(open(...)) form never closed it explicitly.
    with open(f"{_CURATED_SAMPLE_DIR}/{file_name}", encoding="utf-8") as handle:
        return json.load(handle)


def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
    """Load one raw/extracted sample pair and hand it to ``view_data``.

    Args:
        data_source: Name of the source to show. Names without a sample-file
            entry (including ``None``) fall back to ten empty documents.
        doc_id: Index of the sample to show. Converted with ``int()`` and
            clamped to the range 0..9.
        target: Passed through unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected documents.

    Raises:
        ValueError: If ``doc_id`` cannot be converted to an integer.
        OSError: If a sample file cannot be opened.

    Note:
        Assumes every sample file is a JSON array with at least ten entries;
        a shorter file would raise IndexError for large ``doc_id`` values --
        TODO confirm against the data files.
    """
    doc_id = max(0, min(int(doc_id), 9))

    sample_files = _CURATED_SAMPLE_FILES.get(data_source)
    if sample_files is None:
        # Unknown source: show empty placeholders instead of failing.
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
    else:
        raw_file, extracted_file = sample_files
        raw_sample_doc = _load_sample_docs(raw_file)
        # Sources backed by a single file reuse the already-parsed object
        # for both views rather than reading the file a second time.
        if extracted_file == raw_file:
            extracted_sample_doc = raw_sample_doc
        else:
            extracted_sample_doc = _load_sample_docs(extracted_file)

    return view_data(
        raw_sample_doc[doc_id],
        extracted_sample_doc[doc_id],
        doc_id=doc_id,
        data_source=data_source,
        data_sources=data_sources,
        target=target,
    )
def get_chart_28168342():
    """Build the horizontal funnel chart with one trace per data source.

    Every trace shares the same y axis (the filter stage names) and carries
    nine hard-coded x values, one per stage.

    Returns:
        A ``go.Figure`` holding one ``go.Funnel`` trace per source, with the
        height set to 500 and a transparent plot background.
    """
    stage_labels = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]
    # Insertion order determines trace order. Named differently from the
    # module-level ``data_sources`` list so the two are not confused.
    values_by_source = {
        "Wikipedia": [100, 90, 80, 70, 60, 50, 40, 30, 20],
        "Freelaw": [100, 90, 80, 70, 60, 50, 40, 20, 20],
        "DM Maths": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "USPTO": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "PG19": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "Hackernews": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "Ubuntu IRC": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "Europarl": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "StackExchange": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "Arxiv": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "S2ORC": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "S2ORC Abstract": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "PubMed Central": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "PubMed Central Abstract": [100, 90, 80, 70, 60, 40, 40, 30, 20],
        "PhilPapers": [100, 90, 80, 70, 60, 40, 40, 30, 20],
    }

    chart = go.Figure()
    for source_name, stage_values in values_by_source.items():
        funnel = go.Funnel(
            name=source_name,
            orientation="h",
            y=stage_labels,
            x=stage_values,
            textinfo="value+percent total",
            textposition="inside",
        )
        chart.add_trace(funnel)

    chart.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return chart
def update(target: str, request):
    """Re-render the data viewer for ``target`` from the request's query string.

    Reads the ``data_source_<target>`` and ``doc_id_<target>`` query
    parameters and delegates to ``get_data``.

    Args:
        target: Suffix that selects which pair of query parameters to read;
            also forwarded to ``get_data``.
        request: Object exposing ``query_params`` with a dict-style ``get``.

    Returns:
        The result of ``get_data`` when either parameter is present and
        non-empty, otherwise ``None``.
    """
    query = request.query_params
    source_key = f"data_source_{target}"
    doc_key = f"doc_id_{target}"

    chosen_source = query.get(source_key)
    if chosen_source:
        # A missing doc id falls back to sample 3.
        return get_data(chosen_source, query.get(doc_key, 3), target)

    chosen_doc = query.get(doc_key)
    if chosen_doc:
        # NOTE(review): chosen_source is missing or empty on this path, so
        # get_data receives that falsy value rather than its own default
        # source -- confirm this is intended.
        return get_data(chosen_source, chosen_doc, target)

    return None
def curated(request):
    """Assemble the "Curated Sources" page section.

    The section contains, in order: a heading, the funnel chart from
    ``get_chart_28168342``, a "Data Preparation" block (intro text, a table
    of acquisition methods and a bordered data viewer from ``get_data``) and
    a "Data Preprocessing" block (intro text and a table of filter steps).

    Args:
        request: Not used by this function.

    Returns:
        A ``Div`` wrapping a ``Section`` with id ``inner-text``.
    """

    def table_block(frame):
        # Render the frame as raw HTML (no index column, no border) and wrap
        # it unescaped in a Div with a margin.
        return Div(
            NotStr(frame.to_html(index=False, border=0)),
            style="margin: 40px;",
        )

    # --- Data preparation ---------------------------------------------
    preparation_rows = [
        (
            "HTTP/FTP dumps",
            "Acquiring data from HTTP/FTP dumps",
            "Freelaw | Wikipedia | PhilPapers | Arxiv | S2ORC | Pubmeds",
        ),
        (
            "Web crawling",
            "Crawling websites to extract data",
            "USPTO | Hackernews | Ubuntu IRC",
        ),
        ("Archive snapshot", "Working with archive dumps", "StackExchange"),
        ("Generated", "Generating synthetic data", "DM Maths"),
        ("Curated", "High quality curated data", "PG19 | Europarl"),
    ]
    preparation_table = table_block(
        pd.DataFrame(
            preparation_rows, columns=["Method", "Description", "Source"]
        )
    )
    preparation_intro = P(
        "This initial stage serves as the foundation for the entire\n"
        "process. Here, we focus on acquiring and extracting the raw data, which can\n"
        "come from various sources such as crawling websites, using HTTP/FTP dumps,\n"
        "or working with archive dumps. For instance, to download and prepare a\n"
        "dataset, we can specific downloaders based on the data source. Each dataset\n"
        "might have its own downloader script which can be updated in real time to\n"
        "handle changes in the data source. Here is a general outline of the data\n"
        "preparation process: It's worth noting that some pipelines might require\n"
        "invoking additional functions or scripts to handle specific data sources or\n"
        "formats. These helper scripts can be located within specific directories\n"
        "or modules dedicated to the dataset."
    )
    # The viewer gets a freshly generated target id on every call.
    viewer_box = Div(
        get_data(target=gen_random_id()),
        style="border: 1px solid #ccc; padding: 20px;",
    )
    data_preparation_div = Div(
        H3("Data Preparation"),
        preparation_intro,
        preparation_table,
        viewer_box,
    )

    # --- Data preprocessing -------------------------------------------
    preprocessing_intro = P(
        "Data preprocessing is a crucial step in the data science\n"
        "pipeline. It involves cleaning and transforming raw data into a format that\n"
        "is suitable for analysis. This process includes handling missing values,\n"
        "normalizing data, encoding categorical variables, and more."
    )
    preprocessing_rows = [
        (
            "Language Filter",
            "Filtering data based on language",
            "To remove documents in unwanted languages",
            "Improves data quality by removing irrelevant documents",
            "May exclude documents in less common languages",
        ),
        (
            "Min Word Count",
            "Setting a minimum word count threshold",
            "To filter out documents with very few words",
            "Filters out low-quality or incomplete documents",
            "May remove documents with valuable information",
        ),
        (
            "Title Abstract",
            "Extracting information from the title and abstract",
            "To extract relevant information for analysis",
            "Provides additional information for analysis",
            "May introduce bias in the analysis",
        ),
        (
            "Majority Language",
            "Identifying the majority language in the dataset",
            "To understand the distribution of languages in the dataset",
            "Enables language-specific analysis and insights",
            "May not accurately represent the language distribution",
        ),
        (
            "Paragraph Count",
            "Counting the number of paragraphs in each document",
            "To analyze the structure and length of documents",
            "Helps understand the complexity and content of documents",
            "May not capture the complexity of document structure",
        ),
        (
            "Frequency",
            "Calculating the frequency of each word in the dataset",
            "To identify important words in the dataset",
            "Identifies important terms and topics in the dataset",
            "May be sensitive to noise and outliers",
        ),
        (
            "Unigram Log Probability",
            "Calculating the log probability of each unigram",
            "To measure the significance of individual words",
            "Quantifies the importance of individual words",
            "May not capture the semantic meaning of words",
        ),
    ]
    preprocessing_table = table_block(
        pd.DataFrame(
            preprocessing_rows,
            columns=["Step", "Description", "Need", "Pros", "Cons"],
        )
    )
    data_preprocessing_div = Div(
        H3("Data Preprocessing"),
        preprocessing_intro,
        preprocessing_table,
    )

    # --- Page section ---------------------------------------------------
    section = Section(
        H2("Curated Sources"),
        plotly2fasthtml(get_chart_28168342()),
        data_preparation_div,
        data_preprocessing_div,
        id="inner-text",
    )
    return Div(section)