"""Gradio app for exploring filtering thresholds on per-document metadata.

For each dataset listed in ``dataset_names`` the script loads a metadata
split, caches every criteria column as a NumPy array, and exposes one tab per
criterion where a threshold can be chosen, the value histogram plotted, and a
random document whose value lies above the threshold displayed.
"""
import os
import random
from functools import partial

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset

dataset_names = [
    "AI4Code",
    "AMPS",
    "ASFPublicMail",
    "CPDataset",
    "DMMath",
    "Discourse",
    "Enwiki",
    "EuroParliamentProceedings",
    "FreeLaw_Options",
    "GithubDiff",
    "GithubIssues",
    "Gutenberg",
    "LeetCode",
    "PileOfLaw",
    "PubMed",
    "S2ORC",
    "StackExchange",
    "USENET",
    "USPTO",
    "UbuntuIRC",
    "arXiv",
]

# One entry per UI tab: (tab label, metadata column, slider maximum).
# The slider minimum is always 0.
CRITERIA_TABS = (
    ("Number of Words Criteria", "check_word_number_criteria", 50_000),
    ("Character Repetition Criteria", "check_char_repetition_criteria", 1),
    ("Stop Word Ratio Criteria", "check_stop_word_ratio_criteria", 1),
    ("Flagged Word Criteria", "check_flagged_words_criteria", 1),
    ("Perplexity Criteria", "check_perplexity_criteria", 50_000),
    ("Language Detection Criteria", "check_language_criteria", 1),
)

# Maps dataset name -> {"ds": <loaded dataset>, <criteria column>: np.ndarray}.
dataset_data = {}
for name in dataset_names:
    path = f"data/{name}/data.json"
    ds = load_dataset(
        "CarperAI/pilev2_smol_metadata",
        data_files=path,
        # Raises KeyError at import time when HF_TOKEN is not set.
        use_auth_token=os.environ["HF_TOKEN"],
        split="train",
        # download_mode="force_redownload",
    )
    entry = {"ds": ds}
    # Materialise each criteria column once so the UI callbacks can work on
    # in-memory arrays instead of re-reading the dataset.
    for _label, column, _maximum in CRITERIA_TABS:
        entry[column] = np.array(ds[column])
    dataset_data[name] = entry


def plt_plot(criteria: str, dataset: str, threshold: float):
    """Plot the histogram of one criteria column with the threshold marked.

    Args:
        criteria: Metadata column name (a key of ``dataset_data[dataset]``).
        dataset: Dataset name (a key of ``dataset_data``).
        threshold: Cut-off value; values strictly greater than it are counted
            as removed.

    Returns:
        The matplotlib figure, titled with the dataset name and the share of
        values above ``threshold``.
    """
    # Drop figures left over from earlier calls so they do not accumulate.
    plt.close("all")
    values = dataset_data[dataset][criteria]

    # NOTE(review): every criterion treats "value > threshold" as removed;
    # confirm this is the intended direction for ratio/score criteria.
    if len(values):
        removed = np.count_nonzero(values > threshold) / len(values)
    else:
        # An empty column would otherwise divide by zero.
        removed = 0.0

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(values, bins=50, color="black")
    ax.axvline(threshold, color="r", linestyle="dashed", linewidth=2)
    ax.set_title(f"{dataset} (removed {removed:.2%})")
    # Use the Axes/Figure objects directly rather than pyplot's implicit
    # "current axes", so the labels always land on this figure.
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    return fig


def check_filtered(criteria: str, dataset: str, threshold: float) -> str:
    """Return the text of a random document whose value exceeds the threshold.

    Args:
        criteria: Metadata column name (a key of ``dataset_data[dataset]``).
        dataset: Dataset name (a key of ``dataset_data``).
        threshold: Cut-off value; only rows with a strictly greater value are
            candidates.

    Returns:
        The ``"text"`` field of one randomly chosen matching row, or
        ``"No examples found"`` when no row matches.
    """
    entry = dataset_data[dataset]
    # The cached column array yields the matching row indices without running
    # a Python-level filter over the whole dataset on every click.
    matching_rows = np.flatnonzero(entry[criteria] > threshold)
    if matching_rows.size == 0:
        return "No examples found"
    row = matching_rows[random.randrange(matching_rows.size)]
    return entry["ds"][int(row)]["text"]


def _build_criteria_tab(dataset, label, criteria, maximum):
    """Add one tab with plot, threshold slider, buttons and sample textbox.

    Must be called inside an active ``gr.Blocks`` context; ``dataset`` is the
    shared dataset selector component passed as the first callback input.
    """
    with gr.Tab(label):
        plot = gr.Plot()
        threshold = gr.Slider(minimum=0, maximum=maximum, label="Threshold")
        calculate = gr.Button("Calculate")
        check = gr.Button("Check Filtered Data")
        filtered_data = gr.Textbox(lines=5, label="Filtered Data")
        # partial binds the criteria column now, so each tab keeps its own.
        calculate.click(partial(plt_plot, criteria), [dataset, threshold], plot)
        check.click(
            partial(check_filtered, criteria),
            [dataset, threshold],
            filtered_data,
        )


with gr.Blocks() as demo:
    dataset = gr.Radio(dataset_names, label="Dataset", value="arXiv")
    for tab_label, tab_criteria, tab_maximum in CRITERIA_TABS:
        _build_criteria_tab(dataset, tab_label, tab_criteria, tab_maximum)

if __name__ == "__main__":
    demo.launch()