|
import json |
|
import math |
|
from functools import partial |
|
|
|
import streamlit as st |
|
import streamlit.components.v1 as components |
|
from gforms import Form |
|
|
|
BAD_EXAMPLES_PATH = "bad_examples" |
|
DATA_PATH = "data" |
|
MAX_DOC_LENGTH = 30000 |
|
|
|
|
|
def form_callback(
    element,
    page_index,
    element_index,
    dataset,
    docid,
    text,
    metadata,
    reason,
    person,
    part,
):
    """Return the answer for a form element, selected by the element's name.

    ``page_index`` and ``element_index`` are accepted but not used; only
    ``element.name`` decides which of the supplied values is returned.

    Returns:
        The value matching the element's name, or ``None`` when the name is
        not one of the recognised fields.
    """
    # Checked in order, first match wins.
    answers_by_name = (
        ("Dataset", dataset),
        ("Datapoint ID", docid),
        ("Text", text),
        ("Metadata", metadata),
        ("Flagging Reason", reason),
        ("Flagging Person", person),
        ("Part", part),
    )
    for field_name, answer in answers_by_name:
        if element.name == field_name:
            return answer
    return None
|
|
|
|
|
def report_result(dataset, docid, text, metadata, reason, person, part):
    """Fill in and submit the Google Form with one flagged-example report.

    The form is loaded from a fixed URL, each element is answered by
    ``form_callback`` with the given values bound to it, and the filled
    form is submitted.
    """
    form_url = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"

    # Values handed to form_callback as keyword arguments for every element.
    answers = {
        "dataset": dataset,
        "docid": docid,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "person": person,
        "part": part,
    }

    report_form = Form()
    report_form.load(form_url)
    report_form.fill(partial(form_callback, **answers))
    report_form.submit()
|
|
|
|
|
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of the file to read; one JSON document per line.

    Returns:
        A list with the parsed value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. a trailing newline) carry no
        # record and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]
|
|
|
|
|
# Start from the first example when this session has no index stored yet.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0
|
|
|
|
|
def get_next_item():
    """Advance the session's example index by one."""
    st.session_state.idx = st.session_state.idx + 1
|
|
|
|
|
def save_flag_and_get_next_item(sample, issue):
    """Record a flagged example, report it through the form, and advance.

    The example, with ``issue`` attached, is appended as one JSON line to the
    bad-examples file of the module-level ``dataset``. It is then sent with
    ``report_result``: a text longer than ``MAX_DOC_LENGTH`` characters is
    split into consecutive slices of at most that length, one report per
    slice, numbered from 0. Finally the session index is advanced.

    Args:
        sample: Example dict; must contain a ``"text"`` key. It is not
            modified.
        issue: Description of what is wrong with the example.

    Raises:
        KeyError: If ``sample`` has no ``"text"`` key (raised before anything
            is written).
    """
    text = sample["text"]

    # Build the stored record and the reported metadata as copies so the
    # caller's dict keeps its "text" and does not gain an "issue" key.
    flagged = {**sample, "issue": issue}
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(flagged) + "\n")

    metadata = {
        key: value for key, value in sample.items() if key not in ("text", "issue")
    }

    # Prefer an explicit id, fall back to the title, otherwise leave it empty.
    if "id" in sample:
        sample_id = sample["id"]
    else:
        sample_id = sample.get("title", "")

    if len(text) > MAX_DOC_LENGTH:
        num_parts = math.ceil(len(text) / MAX_DOC_LENGTH)
        for i in range(num_parts):
            text_portion = text[i * MAX_DOC_LENGTH : (i + 1) * MAX_DOC_LENGTH]
            report_result(
                dataset, sample_id, text_portion, str(metadata), issue, "", str(i)
            )
    else:
        # Also covers an empty text, which is still reported once as part 0.
        report_result(dataset, sample_id, text, str(metadata), issue, "", str(0))

    get_next_item()
|
|
|
|
|
# Names offered in the sidebar; the selected one determines which examples
# file under DATA_PATH is loaded.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]

dataset = st.sidebar.selectbox("Dataset", datasets)

examples_file = f"{DATA_PATH}/{dataset}_examples_with_stats.json"
data = load_jsonl(examples_file)
|
|
|
|
|
bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

# Opening in append mode creates the file if it is missing and leaves any
# existing content untouched, so the read further down always finds a file.
with open(bad_examples_file, "a"):
    pass


def _reset_index():
    """Drop the stored example index from the session state."""
    del st.session_state["idx"]


def _clear_bad_examples():
    """Truncate the selected dataset's bad-examples file."""
    open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "w").close()


st.sidebar.button("Reset Index", on_click=_reset_index)

with open(bad_examples_file, "r+") as bad_examples:
    st.sidebar.download_button(
        "Download bad example JSON file",
        bad_examples,
        file_name=f"{dataset}_bad_examples.jsonl",
    )

st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)
|
|
|
def _flag_sample(flagged_sample):
    """Submit callback for the BAD button: flag the sample with the typed issue."""
    # Values of widgets inside a form only reach the script when the form is
    # submitted, so the issue text is read from session state here. Passing
    # the text_input's return value through `args` would hand the callback
    # the value captured when the form was rendered, before the user typed.
    save_flag_and_get_next_item(flagged_sample, st.session_state["issue_text"])


# Once the index has moved past the last example there is nothing to show;
# indexing `data` would raise IndexError, so stop with a message instead.
if st.session_state.idx >= len(data):
    st.info("All examples in this dataset have been reviewed.")
    st.stop()

with st.form(key="bad_form", clear_on_submit=True):
    sample = data[st.session_state.idx]
    text = sample["text"]
    st.text_area(f"text id: {st.session_state.idx}", text, height=500)

    # The key makes the submitted text available to the BAD callback.
    issue = st.text_input(
        "What's wrong with this example? (leave blank if example is fine)",
        key="issue_text",
    )

    # GOOD only advances; BAD records and reports the example, then advances.
    good = st.form_submit_button(
        "GOOD",
        on_click=get_next_item,
    )
    bad = st.form_submit_button(
        "BAD",
        on_click=_flag_sample,
        args=(sample,),
    )
|
|