File size: 3,928 Bytes
215f60a a6e1ff6 215f60a 724b1ea a6e1ff6 724b1ea a6e1ff6 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea a6e1ff6 215f60a 724b1ea a6e1ff6 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a 724b1ea 215f60a a6e1ff6 215f60a a6e1ff6 215f60a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import json
import math
from functools import partial
import streamlit as st
import streamlit.components.v1 as components
from gforms import Form
# Directory holding the per-dataset "<dataset>_bad_examples.jsonl" files
# to which flagged samples are appended.
BAD_EXAMPLES_PATH = "bad_examples"
# Directory holding the per-dataset "<dataset>_examples_with_stats.json"
# input files that are loaded for review.
DATA_PATH = "data"
# Maximum number of characters of a sample's text sent in a single form
# report; longer texts are reported in several consecutive parts.
MAX_DOC_LENGTH = 30000
def form_callback(
    element,
    page_index,
    element_index,
    dataset,
    docid,
    text,
    metadata,
    reason,
    person,
    part,
):
    """Return the answer for one form element, selected by the element's name.

    Used (via ``functools.partial``) as the callback handed to ``Form.fill``.

    Args:
        element: Form element being filled; only its ``name`` attribute is read.
        page_index: Unused; accepted because the callback receives it.
        element_index: Unused; accepted because the callback receives it.
        dataset: Answer for the "Dataset" element.
        docid: Answer for the "Datapoint ID" element.
        text: Answer for the "Text" element.
        metadata: Answer for the "Metadata" element.
        reason: Answer for the "Flagging Reason" element.
        person: Answer for the "Flagging Person" element.
        part: Answer for the "Part" element.

    Returns:
        The value matching ``element.name``, or ``None`` when the name is not
        one of the known elements.
    """
    answers_by_name = {
        "Dataset": dataset,
        "Datapoint ID": docid,
        "Text": text,
        "Metadata": metadata,
        "Flagging Reason": reason,
        "Flagging Person": person,
        "Part": part,
    }
    # Unknown element names fall through to None, like an unmatched if-chain.
    return answers_by_name.get(element.name)
def report_result(dataset, docid, text, metadata, reason, person, part):
    """Submit one report through the Google Form at the hard-coded URL.

    The form is loaded, each element is answered by ``form_callback`` with
    the given values bound to it, and the form is then submitted.

    Args:
        dataset: Value for the "Dataset" element.
        docid: Value for the "Datapoint ID" element.
        text: Value for the "Text" element.
        metadata: Value for the "Metadata" element.
        reason: Value for the "Flagging Reason" element.
        person: Value for the "Flagging Person" element.
        part: Value for the "Part" element.
    """
    form_url = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"

    # Bind this report's values so the fill step only has to supply the
    # element and its indices.
    answer_for_element = partial(
        form_callback,
        dataset=dataset,
        docid=docid,
        text=text,
        metadata=metadata,
        reason=reason,
        person=person,
        part=part,
    )

    form = Form()
    form.load(form_url)
    form.fill(answer_for_element)
    form.submit()
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of the file to read. Every non-blank line must hold
            exactly one JSON document.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (typically a trailing empty line) carry no record;
        # skip them instead of failing to parse them.
        return [json.loads(line) for line in f if line.strip()]
# Start at the first example whenever no index is stored in the session
# state (e.g. the key has not been set yet or has been deleted).
if "idx" not in st.session_state:
    st.session_state["idx"] = 0
def get_next_item():
    """Advance the example index stored in the session state by one."""
    st.session_state["idx"] = st.session_state["idx"] + 1
def save_flag_and_get_next_item(sample, issue):
    """Record a flagged sample locally, report it via the form, and advance.

    The sample (with the issue attached) is appended as one JSON line to the
    bad-examples file of the currently selected ``dataset`` (a module-level
    name). The sample's text is then reported through ``report_result`` in
    parts of at most ``MAX_DOC_LENGTH`` characters, with the remaining fields
    sent as stringified metadata. Finally the index moves to the next item.

    Args:
        sample: Mapping describing the example; must contain a "text" key and
            may contain "id" or "title", which are used as the datapoint ID.
        issue: Description of what is wrong with the example.

    Raises:
        KeyError: If ``sample`` has no "text" key (raised before anything is
            written or reported).
    """
    # Read the text up front so a malformed sample fails before any side
    # effect has happened.
    text = sample["text"]

    # Work on copies so the caller's dict is left untouched.
    flagged = dict(sample)
    flagged["issue"] = issue
    with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
        f.write(json.dumps(flagged) + "\n")

    # Everything except the text itself and the issue is sent as metadata.
    metadata = {
        key: value
        for key, value in sample.items()
        if key not in ("text", "issue")
    }

    # Prefer the explicit id, fall back to the title, else an empty ID.
    if "id" in sample:
        sample_id = sample["id"]
    else:
        sample_id = sample.get("title", "")

    # Always send at least one part, so short (or empty) texts are still
    # reported as part "0".
    num_parts = max(1, math.ceil(len(text) / MAX_DOC_LENGTH))
    for part in range(num_parts):
        start = part * MAX_DOC_LENGTH
        report_result(
            dataset,
            sample_id,
            text[start : start + MAX_DOC_LENGTH],
            str(metadata),
            issue,
            "",
            str(part),
        )

    get_next_item()
# Dataset names offered in the sidebar selector; the selected name is used
# to build the data file and bad-examples file names.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
# Sidebar: choose the dataset to review and load its examples.
dataset = st.sidebar.selectbox("Dataset", datasets)
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

# Opening in append mode creates the bad-examples file when it is missing
# and leaves existing content alone.
open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a").close()


def _reset_index():
    """Remove the stored example index from the session state."""
    del st.session_state["idx"]


def _clear_bad_examples():
    """Truncate the bad-examples file of the selected dataset."""
    open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "w").close()


st.sidebar.button("Reset Index", on_click=_reset_index)

with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "r+") as f:
    st.sidebar.download_button(
        "Download bad example JSON file",
        f,
        file_name=f"{dataset}_bad_examples.jsonl",
    )

st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)
# Main panel: show the current example and let the reviewer mark it GOOD or
# BAD (with an optional description of the issue).

# Stop before indexing when there is nothing (left) to show, e.g. after the
# last example was reviewed or when the selected dataset has fewer examples
# than the stored index.
if st.session_state.idx >= len(data):
    st.info(
        "No more examples to review in this dataset. "
        "Use 'Reset Index' in the sidebar to start over."
    )
    st.stop()


def _flag_current_sample(sample):
    """Flag ``sample`` using the issue text submitted with the form."""
    # Callback arguments are bound when the button is rendered, i.e. before
    # the reviewer typed anything, so the submitted text has to be read from
    # the session state under the text input's key.
    save_flag_and_get_next_item(sample, st.session_state.get("issue_text", ""))


with st.form(key="bad_form", clear_on_submit=True):
    sample = data[st.session_state.idx]
    text = sample["text"]
    st.text_area(f"text id: {st.session_state.idx}", text, height=500)
    issue = st.text_input(
        "What's wrong with this example? (leave blank if example is fine)",
        key="issue_text",
    )
    good = st.form_submit_button(
        "GOOD",
        on_click=get_next_item,
    )
    bad = st.form_submit_button(
        "BAD",
        on_click=_flag_current_sample,
        args=(sample,),
    )
|