"""Streamlit app for paging through dataset samples and flagging bad ones.

Each sample of the selected dataset is shown in a form with two submit
buttons: GOOD moves on to the next sample, BAD appends the sample (plus the
reviewer's reason) to a local JSONL file, submits it to a Google Form, and
then moves on.
"""
import json
import math
import os
from functools import partial

import streamlit as st
import streamlit.components.v1 as components
from gforms import Form

BAD_EXAMPLES_PATH = "bad_examples"
DATA_PATH = "data"
# Longest text chunk sent in a single form submission; longer texts are split.
MAX_DOC_LENGTH = 30000
# Session-state key of the "what's wrong" text input inside the review form.
ISSUE_INPUT_KEY = "issue_text"


def form_callback(
    element,
    page_index,
    element_index,
    dataset,
    docid,
    text,
    metadata,
    reason,
    person,
    part,
):
    """Return the answer for one form element, selected by the element's name.

    The first three parameters are provided by whoever invokes the callback
    (here it is handed to ``Form.fill``); the remaining ones are bound with
    ``functools.partial`` in ``report_result``.

    Returns:
        The value matching ``element.name``, or ``None`` when the name is not
        one of the known fields.
    """
    answers = {
        "Dataset": dataset,
        "Datapoint ID": docid,
        "Text": text,
        "Metadata": metadata,
        "Flagging Reason": reason,
        "Flagging Person": person,
        "Part": part,
    }
    return answers.get(element.name)


def report_result(dataset, docid, text, metadata, reason, person, part):
    """Load the flagging Google Form, fill it with the given values, submit it."""
    FORM_URL = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"
    form = Form()
    form.load(FORM_URL)
    form.fill(
        partial(
            form_callback,
            dataset=dataset,
            docid=docid,
            text=text,
            metadata=metadata,
            reason=reason,
            person=person,
            part=part,
        ),
    )
    form.submit()


def load_jsonl(file_path):
    """Parse a JSON-lines file into a list with one decoded object per line.

    Blank lines (e.g. a trailing newline at the end of the file) are skipped
    instead of crashing ``json.loads``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


if "idx" not in st.session_state:
    st.session_state.idx = 0


def get_next_item():
    """Advance the review position to the next sample."""
    st.session_state.idx += 1


def _bad_examples_file():
    """Return the path of the bad-examples JSONL file for the selected dataset."""
    return f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"


def save_flag_and_get_next_item(sample, issue):
    """Record ``sample`` as bad, report it to the form, then advance.

    Args:
        sample: The sample dict; must contain a ``"text"`` key.
        issue: The reviewer's description of what is wrong with the sample.

    The sample (with an added ``"issue"`` field) is appended to the local
    bad-examples file. The text is then sent to the form in chunks of at most
    ``MAX_DOC_LENGTH`` characters, each tagged with its part number. The
    passed-in ``sample`` dict is left unmodified.
    """
    with open(_bad_examples_file(), "a") as f:
        f.write(json.dumps({**sample, "issue": issue}) + "\n")

    text = sample["text"]
    # Everything except the text itself (and the issue) is reported as metadata.
    metadata = {k: v for k, v in sample.items() if k not in ("text", "issue")}
    # Prefer an explicit id, fall back to the title, else leave the id empty.
    sample_id = metadata.get("id", metadata.get("title", ""))

    # At least one submission is made, even for an empty text.
    num_parts = max(1, math.ceil(len(text) / MAX_DOC_LENGTH))
    for part in range(num_parts):
        start = part * MAX_DOC_LENGTH
        report_result(
            dataset,
            sample_id,
            text[start : start + MAX_DOC_LENGTH],
            str(metadata),
            issue,
            "",
            str(part),
        )
    get_next_item()


def _flag_displayed_sample(sample):
    """BAD-button callback: flag ``sample`` with the reason typed in the form.

    The reason is read from session state here, at submit time. Passing the
    text input's return value through the button's ``args`` would capture the
    value from when the button was rendered, i.e. before the reviewer typed.
    """
    save_flag_and_get_next_item(sample, st.session_state.get(ISSUE_INPUT_KEY, ""))


def _reset_index():
    """Sidebar callback: jump back to the first sample."""
    st.session_state.idx = 0


def _clear_bad_examples():
    """Sidebar callback: truncate the bad-examples file of the selected dataset."""
    with open(_bad_examples_file(), "w"):
        pass


datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

# Create the bad-examples directory and file if they do not exist yet, so the
# download button below always has something to open.
os.makedirs(BAD_EXAMPLES_PATH, exist_ok=True)
with open(_bad_examples_file(), "a"):
    pass

st.sidebar.button("Reset Index", on_click=_reset_index)

with open(_bad_examples_file(), "r") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )

st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)

if st.session_state.idx >= len(data):
    # Reached after the last sample, or after switching to a shorter dataset.
    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
else:
    with st.form(key="bad_form", clear_on_submit=True):
        sample = data[st.session_state.idx]
        text = sample["text"]
        st.text_area(f"text id: {st.session_state.idx}", text, height=500)
        st.text_input(
            "What's wrong with this example? (leave blank if example is fine)",
            key=ISSUE_INPUT_KEY,
        )
        st.form_submit_button(
            "GOOD",
            on_click=get_next_item,
        )
        st.form_submit_button(
            "BAD",
            on_click=_flag_displayed_sample,
            args=(sample,),
        )