Spaces:
Runtime error
Runtime error
import os | |
import random | |
import gradio as gr | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from functools import partial | |
from datasets import load_dataset | |
# Subset names offered in the dataset selector; each one is also used to build
# the path of the metadata file loaded for that subset.
dataset_names = [
    "AI4Code",
    "AMPS",
    "ASFPublicMail",
    "CPDataset",
    "DMMath",
    "Discourse",
    "Enwiki",
    "EuroParliamentProceedings",
    "FreeLaw_Options",
    "GithubDiff",
    "GithubIssues",
    "Gutenberg",
    "LeetCode",
    "PileOfLaw",
    "PubMed",
    "S2ORC",
    "StackExchange",
    "USENET",
    "USPTO",
    "UbuntuIRC",
    "arXiv",
]
# Metric columns that are copied out of every subset into NumPy arrays so the
# UI callbacks can threshold them without touching the Dataset again.
_CRITERIA_COLUMNS = (
    "check_word_number_criteria",
    "check_char_repetition_criteria",
    "check_flagged_words_criteria",
    "check_stop_word_ratio_criteria",
    "check_perplexity_criteria",
    "check_language_criteria",
)

# Maps subset name -> {"ds": Dataset, "<metric column>": np.ndarray, ...}.
# Populated eagerly at import time, one load_dataset call per subset.
dataset_data = {}
for name in dataset_names:
    path = f"data/{name}/data.json"
    ds = load_dataset(
        "CarperAI/pilev2_smol_metadata",
        data_files=path,
        # Indexing os.environ raises KeyError at startup if HF_TOKEN is unset.
        # NOTE(review): newer `datasets` releases reportedly rename this
        # argument to `token` - confirm against the pinned version.
        use_auth_token=os.environ["HF_TOKEN"],
        split="train",
        # download_mode="force_redownload",
    )
    dataset_data[name] = {
        "ds": ds,
        **{column: np.array(ds[column]) for column in _CRITERIA_COLUMNS},
    }
def plt_plot(criteria, dataset, threshold):
    """Plot a histogram of one metric for one subset with a threshold marker.

    Args:
        criteria: Key of the metric array inside ``dataset_data[dataset]``.
        dataset: Subset name, used as the key into ``dataset_data`` and in the
            plot title.
        threshold: Value at which the vertical dashed line is drawn; entries
            strictly greater than it are counted as removed.

    Returns:
        The matplotlib figure containing the histogram.
    """
    # Close figures left open by earlier calls so they do not accumulate.
    plt.close("all")
    values = dataset_data[dataset][criteria]

    # Share of entries strictly above the threshold; an empty array would
    # otherwise divide by zero.
    if len(values):
        removed = np.sum(values > threshold) / len(values)
    else:
        removed = 0.0

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(values, bins=50, color="black")
    ax.axvline(threshold, color="r", linestyle="dashed", linewidth=2)
    ax.set_title(f"{dataset} (removed {removed:.2%})")
    # Label through the axes/figure objects rather than pyplot's implicit
    # "current figure", so the calls cannot land on a different figure.
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    return fig
def check_filtered(criteria, dataset, threshold):
    """Return the text of one random example whose metric exceeds the threshold.

    Args:
        criteria: Name of the metric column to compare.
        dataset: Subset name, used as the key into ``dataset_data``.
        threshold: Only examples whose metric is strictly greater than this
            value are candidates.

    Returns:
        The ``"text"`` field of a randomly chosen matching example, or the
        string ``"No examples found"`` when nothing exceeds the threshold.
    """
    entry = dataset_data[dataset]
    # The metric column is already materialised as an array alongside the
    # Dataset, so the matching row positions can be found with one vectorised
    # comparison instead of a Python-level Dataset.filter pass on every call.
    matching_rows = np.flatnonzero(entry[criteria] > threshold)
    if matching_rows.size == 0:
        return "No examples found"
    # Convert the NumPy integer to a plain int before indexing the Dataset.
    row = int(matching_rows[random.randrange(matching_rows.size)])
    return entry["ds"][row]["text"]
# UI layout: one shared subset selector followed by one tab per metric. Every
# tab has the same widgets and differs only in its title, the metric column it
# inspects, and the upper bound of its threshold slider.
with gr.Blocks() as demo:
    dataset = gr.Radio(dataset_names, label="Dataset", value="arXiv")
    for tab_title, criteria, slider_max in (
        ("Number of Words Criteria", "check_word_number_criteria", 50_000),
        ("Character Repetition Criteria", "check_char_repetition_criteria", 1),
        ("Stop Word Ratio Criteria", "check_stop_word_ratio_criteria", 1),
        ("Flagged Word Criteria", "check_flagged_words_criteria", 1),
        ("Perplexity Criteria", "check_perplexity_criteria", 50_000),
        ("Language Detection Criteria", "check_language_criteria", 1),
    ):
        with gr.Tab(tab_title):
            plot = gr.Plot()
            threshold = gr.Slider(minimum=0, maximum=slider_max, label="Threshold")
            calculate = gr.Button("Calculate")
            check = gr.Button("Check Filtered Data")
            filtered_data = gr.Textbox(lines=5, label="Filtered Data")
            # partial binds the metric name now, so each tab's callbacks keep
            # their own column even though the loop variable is reused.
            plot_fn = partial(plt_plot, criteria)
            calculate.click(plot_fn, [dataset, threshold], plot)
            check_fn = partial(check_filtered, criteria)
            check.click(check_fn, [dataset, threshold], filtered_data)

if __name__ == "__main__":
    demo.launch()