Add flagging to Google Form
Browse files
- app.py +76 -3
- bad_examples/bigcode_python_code_bad_examples.jsonl +3 -0
- bad_examples/bigcode_python_github_issues_bad_examples.jsonl +3 -0
- bad_examples/bigcode_python_jupyter_scripts_dedup_filtered_bad_examples.jsonl +3 -0
- bad_examples/books3_bad_examples.jsonl +3 -0
- bad_examples/c4_bad_examples.jsonl +2 -2
- bad_examples/gutenberg_raw_bad_examples.jsonl +2 -2
- bad_examples/reddit_threaded_bad_examples.jsonl +3 -0
- bad_examples/s2orc_raw_bad_examples.jsonl +3 -0
- bad_examples/stackexchange2_bad_examples.jsonl +2 -2
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,10 +1,61 @@
|
|
1 |
import json
|
|
|
|
|
2 |
|
3 |
import streamlit as st
|
4 |
import streamlit.components.v1 as components
|
|
|
5 |
|
6 |
BAD_EXAMPLES_PATH = "bad_examples"
|
7 |
DATA_PATH = "data"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
|
10 |
def load_jsonl(file_path):
|
@@ -24,12 +75,33 @@ def get_next_item():
|
|
24 |
st.session_state.idx += 1
|
25 |
|
26 |
|
27 |
-
def
|
28 |
sample["issue"] = issue
|
29 |
|
30 |
with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
|
31 |
f.write(json.dumps(sample) + "\n")
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
get_next_item()
|
34 |
|
35 |
|
@@ -76,10 +148,11 @@ with st.form(key="bad_form", clear_on_submit=True):
|
|
76 |
)
|
77 |
|
78 |
good = st.form_submit_button(
|
79 |
-
"GOOD",
|
|
|
80 |
)
|
81 |
bad = st.form_submit_button(
|
82 |
"BAD",
|
83 |
-
on_click=
|
84 |
args=(sample, issue),
|
85 |
)
|
|
|
1 |
import json
|
2 |
+
import math
|
3 |
+
from functools import partial
|
4 |
|
5 |
import streamlit as st
|
6 |
import streamlit.components.v1 as components
|
7 |
+
from gforms import Form
|
8 |
|
9 |
BAD_EXAMPLES_PATH = "bad_examples"
|
10 |
DATA_PATH = "data"
|
11 |
+
MAX_DOC_LENGTH = 30000
|
12 |
+
|
13 |
+
|
14 |
+
def form_callback(
    element,
    page_index,
    element_index,
    dataset,
    docid,
    text,
    metadata,
    reason,
    person,
    part,
):
    """Pick the answer for a single form element based on the element's name.

    Args:
        element: The form element being filled; only its ``name`` attribute
            is read.
        page_index: Not used by this function (presumably supplied
            positionally by the form-filling library -- confirm).
        element_index: Not used by this function (same assumption).
        dataset: Answer for the "Dataset" element.
        docid: Answer for the "Datapoint ID" element.
        text: Answer for the "Text" element.
        metadata: Answer for the "Metadata" element.
        reason: Answer for the "Flagging Reason" element.
        person: Answer for the "Flagging Person" element.
        part: Answer for the "Part" element.

    Returns:
        The answer whose element name equals ``element.name``, or ``None``
        when the name is not one of the seven listed above.
    """
    # One table instead of a chain of comparisons; an unknown name falls
    # through to ``None`` exactly like a function with no matching branch.
    answers_by_name = {
        "Dataset": dataset,
        "Datapoint ID": docid,
        "Text": text,
        "Metadata": metadata,
        "Flagging Reason": reason,
        "Flagging Person": person,
        "Part": part,
    }
    return answers_by_name.get(element.name)
|
40 |
+
|
41 |
+
|
42 |
+
def report_result(dataset, docid, text, metadata, reason, person, part):
    """Send one flagged record to the Google Form.

    Loads the form at the hard-coded URL, fills it through ``form_callback``
    with the given values bound as keyword arguments, and submits it.

    Args:
        dataset: Value bound to ``form_callback``'s ``dataset``.
        docid: Value bound to ``form_callback``'s ``docid``.
        text: Value bound to ``form_callback``'s ``text``.
        metadata: Value bound to ``form_callback``'s ``metadata``.
        reason: Value bound to ``form_callback``'s ``reason``.
        person: Value bound to ``form_callback``'s ``person``.
        part: Value bound to ``form_callback``'s ``part``.
    """
    form_url = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"

    # Bind every answer by keyword; the remaining leading parameters of
    # form_callback are left for the form library to provide when it calls
    # the callback.
    answers = {
        "dataset": dataset,
        "docid": docid,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "person": person,
        "part": part,
    }
    fill_element = partial(form_callback, **answers)

    form = Form()
    form.load(form_url)
    form.fill(fill_element)
    form.submit()
|
59 |
|
60 |
|
61 |
def load_jsonl(file_path):
|
|
|
75 |
st.session_state.idx += 1
|
76 |
|
77 |
|
78 |
+
def save_flag_and_get_next_item(sample, issue):
    """Record a flagged sample locally and on the Google Form, then advance.

    The sample, with ``issue`` attached, is appended as one JSON line to the
    current dataset's bad-examples file. Its text is then reported through
    ``report_result`` in slices of at most ``MAX_DOC_LENGTH`` characters
    (presumably to stay within a form field size limit -- confirm), and
    finally ``get_next_item`` is called.

    The caller's ``sample`` dict is not modified: the earlier in-place
    ``pop("text")`` / ``pop("issue")`` stripped the text from a record the
    caller may still be holding.

    Args:
        sample: Mapping describing the datapoint. Must contain ``"text"``;
            ``"id"`` or ``"title"`` is used as the identifier when present.
        issue: The flagging reason chosen by the reviewer.

    Raises:
        KeyError: If ``sample`` has no ``"text"`` entry.
    """
    # Build the on-disk record as a copy instead of writing into ``sample``.
    flagged = {**sample, "issue": issue}
    with open(
        f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a", encoding="utf-8"
    ) as f:
        f.write(json.dumps(flagged) + "\n")

    text = sample["text"]

    # Everything except the text and the issue is sent along as metadata.
    metadata = str(
        {key: value for key, value in sample.items() if key not in ("text", "issue")}
    )

    # Prefer "id", fall back to "title", otherwise send an empty identifier.
    sample_id = sample.get("id", sample.get("title", ""))

    # Always report at least one part so short or empty texts still produce
    # a submission; longer texts are split into consecutive slices.
    num_parts = max(1, math.ceil(len(text) / MAX_DOC_LENGTH))
    for part in range(num_parts):
        start = part * MAX_DOC_LENGTH
        report_result(
            dataset,
            sample_id,
            text[start : start + MAX_DOC_LENGTH],
            metadata,
            issue,
            "",
            str(part),
        )

    get_next_item()
|
106 |
|
107 |
|
|
|
148 |
)
|
149 |
|
150 |
good = st.form_submit_button(
|
151 |
+
"GOOD",
|
152 |
+
on_click=get_next_item,
|
153 |
)
|
154 |
bad = st.form_submit_button(
|
155 |
"BAD",
|
156 |
+
on_click=save_flag_and_get_next_item,
|
157 |
args=(sample, issue),
|
158 |
)
|
bad_examples/bigcode_python_code_bad_examples.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:deca29f2463d96422b301c1ca4af444e1f1dad66764a2835db243fd1a7abc3c3
|
3 |
+
size 3250
|
bad_examples/bigcode_python_github_issues_bad_examples.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3bfc92a7f740e92393f314bac702965dd47a8084bd093d63632865fda5bb11b0
|
3 |
+
size 2876
|
bad_examples/bigcode_python_jupyter_scripts_dedup_filtered_bad_examples.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3184586d973ef844d86995c33d1439dfeef8faca7813a0cdd80f0d22ca9d84fa
|
3 |
+
size 7802
|
bad_examples/books3_bad_examples.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d011167cb7679eb46b595af10c2965efa2e36ce8085f2c6fe8a7c5d3a28e54d0
|
3 |
+
size 452432
|
bad_examples/c4_bad_examples.jsonl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2415a6bc59f376c1535f1eb2c6854b9d13a75842675b3d8231e5d81999d865b2
|
3 |
+
size 8618
|
bad_examples/gutenberg_raw_bad_examples.jsonl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d4a0bce4796569315e6af2b9f2313204d5cf108b21b69bb01a27a98b56ff643
|
3 |
+
size 2394572
|
bad_examples/reddit_threaded_bad_examples.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90cccebb836615224b151fe1576ad3667933d425bc16e0e8f231671e151b0dbb
|
3 |
+
size 2971
|
bad_examples/s2orc_raw_bad_examples.jsonl
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0dc4d3ee6ca348b2cd56294e65ff268c73905aec89856e4645bfa4aea108d573
|
3 |
+
size 15219
|
bad_examples/stackexchange2_bad_examples.jsonl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d56866a48e1cd99a0bc80ab9088bf7f28e7a861d91a02630252f8fad676147b
|
3 |
+
size 41965
|
requirements.txt
CHANGED
@@ -1 +1,2 @@
|
|
|
|
1 |
streamlit==1.20.0
|
|
|
1 |
+
gforms
|
2 |
streamlit==1.20.0
|