# pilev2_pipeline/app.py: initial Gradio layout backed by fake data
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
# ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AI4Code")
# amps_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AMPS")
# apache_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/ASFPublicMail")
# books3_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Books3")
# cp_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/CPDataset")
# dmmath_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/DMMath")
# discourse_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Discourse")
# wiki_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Enwiki")
# euro_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/EuroParliamentProceedings")
# freelaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/FreeLaw_Options")
# ghdiffs_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubDiff")
# ghissues_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubIssues")
# gutenberg_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Gutenberg")
# leet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/LeetCode")
# pileoflaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PileOfLaw")
# pubmed_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PubMed")
# s2orc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/S2ORC")
# se_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/StackExchange")
# usenet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USENET")
# uspto_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USPTO")
# ubuntuirc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/UbuntuIRC")
# arxiv_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/arXiv")
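# A minimal sketch (kept commented out, like the loaders above) of how the fake
# arrays below could be replaced with real per-document statistics. The "text"
# column name and the naive repetition heuristic are assumptions for
# illustration, not the pipeline's actual metric definitions.
#
# from datasets import load_dataset
# from collections import Counter
#
# def word_rep_ratio(text):
#     # fraction of word occurrences that belong to a repeated word
#     words = text.split()
#     if not words:
#         return 0.0
#     counts = Counter(words)
#     return sum(c for c in counts.values() if c > 1) / len(words)
#
# def load_real_metrics(subset):
#     ds = load_dataset("CarperAI/pile-v2-small", data_dir=f"data/{subset}")["train"]
#     texts = ds["text"]
#     return {
#         "word_rep_ratios": np.array([word_rep_ratio(t) for t in texts]),
#         "num_words": np.array([len(t.split()) for t in texts]),
#     }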
# names of the pile-v2-small subsets shown in the dashboard
dataset_names = [
    "AI4Code", "AMPS", "ASFPublicMail", "Books3", "CPDataset", "DMMath",
    "Discourse", "Enwiki", "EuroParliamentProceedings", "FreeLaw_Options",
    "GitHubDiff", "GitHubIssues", "Gutenberg", "LeetCode", "PileOfLaw",
    "PubMed", "S2ORC", "StackExchange", "USENET", "USPTO", "UbuntuIRC", "arXiv",
]

# create fake data for the different ratios until the real statistics are computed
dataset_data = {
    name: {
        "word_rep_ratios": np.random.randn(1000),
        "char_rep_ratios": np.random.randn(1000),
        "flagged_word_ratios": np.random.randn(1000),
        "num_words": np.random.randint(0, 1000, 1000),
    }
    for name in dataset_names
}
def plt_plot(threshold, x):
    # histogram of the metric values with a dashed red line marking the threshold
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(x, bins=50)
    ax.axvline(threshold, color="r", linestyle="dashed", linewidth=2)
    ax.set_title("Histogram of metric values")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    return fig
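# Hypothetical helper, not yet wired into the UI: for a given threshold it
# reports what share of documents the filter would flag, which is the number
# the histogram and the dashed threshold line are meant to help tune.
def fraction_above(threshold, x):
    # share of values strictly greater than the threshold
    x = np.asarray(x)
    return float((x > threshold).mean())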
def make_plot_fn(metric):
    # build a click handler that looks up the selected dataset's values for one metric
    def plot_fn(threshold, dataset_name):
        return plt_plot(threshold, dataset_data[dataset_name][metric])
    return plot_fn


with gr.Blocks() as demo:
    # pass the Radio component itself as a click input so Gradio supplies the
    # selected dataset name (indexing dataset_data with the component would fail)
    dataset = gr.Radio(list(dataset_data.keys()), value="AI4Code", label="Dataset")
    with gr.Tab("Character Repetition Ratio"):
        plot = gr.Plot()
        threshold = gr.Slider(minimum=0, maximum=100, label="Threshold")
        calculate = gr.Button("Calculate")
        calculate.click(make_plot_fn("char_rep_ratios"), [threshold, dataset], plot)
    with gr.Tab("Word Repetition Ratio"):
        plot = gr.Plot()
        threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
        calculate = gr.Button("Calculate")
        calculate.click(make_plot_fn("word_rep_ratios"), [threshold, dataset], plot)
    with gr.Tab("Flagged Word Ratio"):
        plot = gr.Plot()
        threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
        calculate = gr.Button("Calculate")
        calculate.click(make_plot_fn("flagged_word_ratios"), [threshold, dataset], plot)

if __name__ == "__main__":
    demo.launch(share=True)