ola13's picture
Add flagging to Google Form
a6e1ff6
raw
history blame
3.93 kB
import json
import math
from functools import partial
import streamlit as st
import streamlit.components.v1 as components
from gforms import Form
# Directory holding the per-dataset "<dataset>_bad_examples.jsonl" files.
BAD_EXAMPLES_PATH = "bad_examples"
# Directory holding the per-dataset "<dataset>_examples_with_stats.json" files.
DATA_PATH = "data"
# Texts longer than this many characters are reported to the form in parts of
# at most this size.
MAX_DOC_LENGTH = 30_000
def form_callback(
    element,
    page_index,
    element_index,
    dataset,
    docid,
    text,
    metadata,
    reason,
    person,
    part,
):
    """Pick the answer for a single form element based on its name.

    ``page_index`` and ``element_index`` are accepted but not used; the
    answer depends only on ``element.name``.

    Returns:
        The value matching the element's name ("Dataset", "Datapoint ID",
        "Text", "Metadata", "Flagging Reason", "Flagging Person" or "Part"),
        or ``None`` when the name is none of these.
    """
    answers_by_name = (
        ("Dataset", dataset),
        ("Datapoint ID", docid),
        ("Text", text),
        ("Metadata", metadata),
        ("Flagging Reason", reason),
        ("Flagging Person", person),
        ("Part", part),
    )
    for field_name, answer in answers_by_name:
        if element.name == field_name:
            return answer
    # Unknown element: no answer is provided.
    return None
def report_result(dataset, docid, text, metadata, reason, person, part):
    """Load the flagging Google Form, fill it with the given values, submit it.

    Each form element is answered by ``form_callback``, which maps the
    element's name to the corresponding argument of this function.
    """
    form_url = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"

    # Bind the values up front so the form only has to supply the element
    # and its page/element indices.
    answer_for_element = partial(
        form_callback,
        dataset=dataset,
        docid=docid,
        text=text,
        metadata=metadata,
        reason=reason,
        person=person,
        part=part,
    )

    form = Form()
    form.load(form_url)
    form.fill(answer_for_element)
    form.submit()
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of a UTF-8 text file with one JSON document per line.

    Returns:
        A list with the decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of failing in json.loads.
        return [json.loads(line) for line in f if line.strip()]
# Position of the example currently shown; start at the first one whenever
# the session has no index stored.
if "idx" not in st.session_state:
    st.session_state["idx"] = 0
def get_next_item():
    """Advance the session's example index by one."""
    st.session_state.idx = st.session_state.idx + 1
def save_flag_and_get_next_item(sample, issue):
    """Record a flagged example locally and in the Google Form, then advance.

    The sample, together with ``issue``, is appended as one JSON line to the
    bad-examples file of the module-level ``dataset``. It is then sent through
    ``report_result``; a text longer than ``MAX_DOC_LENGTH`` characters is
    sent as several consecutive parts numbered from 0. Finally the session
    index is moved to the next example.

    Args:
        sample: Dict describing the example. Must contain a "text" key. Its
            "id" (or, failing that, "title") is used as the datapoint id;
            with neither present the id is an empty string. The dict is not
            modified.
        issue: Description of what is wrong with the example.

    Raises:
        KeyError: If ``sample`` has no "text" key.
    """
    # Work on copies so the caller's dict is left untouched.
    flagged = dict(sample)
    flagged["issue"] = issue
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(flagged) + "\n")

    text = sample["text"]
    # The form gets the text and the issue in their own fields, so the
    # metadata field carries everything else.
    metadata = {
        key: value
        for key, value in sample.items()
        if key not in ("text", "issue")
    }

    if "id" in sample:
        sample_id = sample["id"]
    else:
        sample_id = sample.get("title", "")

    # Always send at least one part, so an empty text is still reported.
    num_parts = max(1, math.ceil(len(text) / MAX_DOC_LENGTH))
    for part in range(num_parts):
        start = part * MAX_DOC_LENGTH
        report_result(
            dataset,
            sample_id,
            text[start : start + MAX_DOC_LENGTH],
            str(metadata),
            issue,
            "",
            str(part),
        )

    get_next_item()
# Names of the datasets offered for review. Each name is used to build the
# path of the examples file loaded below and of the bad-examples file.
datasets = [
"gutenberg_raw",
"stackexchange2",
"bigcode_python_code",
"bigcode_python_github_issues",
"bigcode_python_jupyter_scripts_dedup_filtered",
"books3",
"c4",
"s2orc_raw",
"reddit_threaded",
"cc_filtered_text",
]
# Sidebar dropdown selecting which dataset to review.
dataset = st.sidebar.selectbox("Dataset", datasets)
# Examples of the selected dataset, parsed one JSON object per line.
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")
# File collecting the flagged examples of the selected dataset.
bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

# Create the bad-examples file if it does not exist yet, so that it can be
# opened for the download button below.
with open(bad_examples_file, "a") as f:
    pass

# Dropping "idx" from the session state makes the next run start from the
# first example again.
st.sidebar.button("Reset Index", on_click=lambda: st.session_state.__delitem__("idx"))

with open(bad_examples_file, "r+") as f:
    st.sidebar.download_button(
        "Download bad example JSON file", f, file_name=f"{dataset}_bad_examples.jsonl"
    )

# Opening the file in "w" mode truncates it.
st.sidebar.button(
    "Clear bad examples file",
    on_click=lambda: open(
        f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "w"
    ).close(),
)

# The index can run past the end after the last example was reviewed, or
# after switching to a dataset with fewer examples; stop here instead of
# failing with an IndexError.
if st.session_state.idx >= len(data):
    st.info("No more examples in this dataset. Use 'Reset Index' to start over.")
    st.stop()

# Session-state key under which the form stores the reviewer's comment.
ISSUE_INPUT_KEY = "issue_text"

with st.form(key="bad_form", clear_on_submit=True):
    sample = data[st.session_state.idx]
    text = sample["text"]
    st.text_area(f"text id: {st.session_state.idx}", text, height=500)
    # NOTE: the value returned here is the one from when the form was
    # rendered, not what the reviewer is about to type; the BAD callback
    # therefore reads the submitted value from the session state instead.
    issue = st.text_input(
        "What's wrong with this example? (leave blank if example is fine)",
        key=ISSUE_INPUT_KEY,
    )
    good = st.form_submit_button(
        "GOOD",
        on_click=get_next_item,
    )
    bad = st.form_submit_button(
        "BAD",
        # Evaluated when the button is clicked, i.e. after the form values
        # have been written to the session state.
        on_click=lambda: save_flag_and_get_next_item(
            sample, st.session_state[ISSUE_INPUT_KEY]
        ),
    )