Spaces:

CarperAI
/

pilev2_pipeline

Runtime error

File size: 10,137 Bytes

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np

# ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AI4Code")
# amps_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AMPS")
# apache_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/ASFPublicMail")
# books3_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Books3")
# cp_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/CPDataset")
# dmmath_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/DMMath")
# discourse_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Discourse")
# wiki_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Enwiki")
# euro_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/EuroParliamentProceedings")
# freelaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/FreeLaw_Options")
# ghdiffs_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubDiff")
# ghissues_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubIssues")
# gutenberg_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Gutenberg")
# leet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/LeetCode")
# pileoflaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PileOfLaw")
# pubmed_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PubMed")
# s2orc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/S2ORC")
# se_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/StackExchange")
# usenet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USENET")
# uspto_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USPTO")
# ubuntuirc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/UbuntuIRC")
# arxiv_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/arXiv")

# Dataset names, in the order they are offered in the UI. They mirror the
# `data_dir` sub-folders of CarperAI/pile-v2-small referenced in the
# commented-out `load_dataset` calls earlier in this file.
_DATASET_NAMES = (
    "AI4Code",
    "AMPS",
    "ASFPublicMail",
    "Books3",
    "CPDataset",
    "DMMath",
    "Discourse",
    "Enwiki",
    "EuroParliamentProceedings",
    "FreeLaw_Options",
    "GitHubDiff",
    "GitHubIssues",
    "Gutenberg",
    "LeetCode",
    "PileOfLaw",
    "PubMed",
    "S2ORC",
    "StackExchange",
    "USENET",
    "USPTO",
    "UbuntuIRC",
    "arXiv",
)

# Number of fake per-document values generated for every statistic.
_NUM_FAKE_SAMPLES = 1000


def _fake_dataset_stats(num_samples=_NUM_FAKE_SAMPLES):
    """Return one dataset's worth of randomly generated placeholder statistics.

    Args:
        num_samples: Length of each generated array.

    Returns:
        A dict with the keys ``word_rep_ratios``, ``char_rep_ratios`` and
        ``flagged_word_ratios`` (standard-normal floats) and ``num_words``
        (integers drawn from ``[0, 1000)``), each a 1-D array of
        ``num_samples`` entries.
    """
    # Dict-literal values are evaluated top to bottom, so the order of the
    # RNG draws is the same as in a hand-written per-dataset literal.
    return {
        "word_rep_ratios": np.random.randn(num_samples),
        "char_rep_ratios": np.random.randn(num_samples),
        "flagged_word_ratios": np.random.randn(num_samples),
        "num_words": np.random.randint(0, 1000, num_samples),
    }


# Maps dataset name -> statistic name -> array of fake values.
dataset_data = {name: _fake_dataset_stats() for name in _DATASET_NAMES}

def plt_plot(threshold, x):
    """Draw a 50-bin histogram of ``x`` with a marker line at ``threshold``.

    Args:
        threshold: x-axis position of the red dashed vertical line.
        x: Values to bin; passed unchanged to ``Axes.hist``.

    Returns:
        The matplotlib figure containing the histogram.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(x, bins=50)
    ax.axvline(threshold, color="r", linestyle="dashed", linewidth=2)
    # Label through the Axes object instead of pyplot's implicit "current
    # figure": that global can point at a different figure when several
    # requests are being rendered at the same time.
    ax.set_title("Histogram of random data")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    # Unregister the figure from pyplot so figures do not pile up in memory
    # across calls; the returned Figure object can still be rendered/saved.
    plt.close(fig)
    return fig

def _make_plot_callback(ratio_key):
    """Build a click handler that plots ``ratio_key`` for the chosen dataset.

    Args:
        ratio_key: Key of the statistic inside each ``dataset_data`` entry
            (e.g. ``"char_rep_ratios"``).

    Returns:
        A function ``(threshold_value, dataset_name) -> figure`` suitable for
        ``Button.click``; it returns ``None`` (no plot) when ``dataset_name``
        is not a known dataset.
    """

    def _callback(threshold_value, dataset_name):
        # The lookup has to happen here, at click time: Gradio passes the
        # *current values* of the input components, whereas at UI-build time
        # only the component objects exist.
        if dataset_name not in dataset_data:
            return None
        return plt_plot(threshold_value, dataset_data[dataset_name][ratio_key])

    return _callback


with gr.Blocks() as demo:
    dataset_names = list(dataset_data.keys())
    # Pre-select the first dataset so the callbacks never receive None.
    dataset = gr.Radio(dataset_names, value=dataset_names[0], label="Dataset")

    # (tab title, statistic key in dataset_data, slider maximum)
    # NOTE(review): the character tab's slider goes up to 100 while the other
    # two go up to 1 -- kept as in the original; confirm this is intended.
    for tab_title, ratio_key, slider_max in (
        ("Character Repetition Ratio", "char_rep_ratios", 100),
        ("Word Repetition Ratio", "word_rep_ratios", 1),
        ("Flagged Word Ratio", "flagged_word_ratios", 1),
    ):
        with gr.Tab(tab_title):
            plot = gr.Plot()
            threshold = gr.Slider(minimum=0, maximum=slider_max, label="Threshold")
            calculate = gr.Button("Calculate")
            # Inputs must be components; their values are handed to the
            # callback in this order when the button is clicked.
            calculate.click(_make_plot_callback(ratio_key), [threshold, dataset], plot)

# Start the Gradio server only when this file is executed as a script, not
# when it is imported. share=True additionally requests a public share link.
if __name__ == "__main__":
    demo.launch(share=True)