ola13 committed on
Commit
a6e1ff6
1 Parent(s): b421ee0

Add flagging to Google Form

Browse files
app.py CHANGED
@@ -1,10 +1,61 @@
1
  import json
 
 
2
 
3
  import streamlit as st
4
  import streamlit.components.v1 as components
 
5
 
6
  BAD_EXAMPLES_PATH = "bad_examples"
7
  DATA_PATH = "data"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  def load_jsonl(file_path):
@@ -24,12 +75,33 @@ def get_next_item():
24
  st.session_state.idx += 1
25
 
26
 
27
- def save_and_get_next_item(sample, issue):
28
  sample["issue"] = issue
29
 
30
  with open(f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a") as f:
31
  f.write(json.dumps(sample) + "\n")
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  get_next_item()
34
 
35
 
@@ -76,10 +148,11 @@ with st.form(key="bad_form", clear_on_submit=True):
76
  )
77
 
78
  good = st.form_submit_button(
79
- "GOOD", on_click=get_next_item,
 
80
  )
81
  bad = st.form_submit_button(
82
  "BAD",
83
- on_click=save_and_get_next_item,
84
  args=(sample, issue),
85
  )
 
1
  import json
2
+ import math
3
+ from functools import partial
4
 
5
  import streamlit as st
6
  import streamlit.components.v1 as components
7
+ from gforms import Form
8
 
9
  BAD_EXAMPLES_PATH = "bad_examples"
10
  DATA_PATH = "data"
11
+ MAX_DOC_LENGTH = 30000
12
+
13
+
14
def form_callback(
    element,
    page_index,
    element_index,
    dataset,
    docid,
    text,
    metadata,
    reason,
    person,
    part,
):
    """Return the answer for a single form element, selected by its name.

    Args:
        element: Form element being filled; only its ``name`` attribute is read.
        page_index: Not used by this function (presumably supplied by the
            form-filling caller along with ``element`` -- confirm against gforms).
        element_index: Not used by this function (same caveat as above).
        dataset: Answer for the element named "Dataset".
        docid: Answer for the element named "Datapoint ID".
        text: Answer for the element named "Text".
        metadata: Answer for the element named "Metadata".
        reason: Answer for the element named "Flagging Reason".
        person: Answer for the element named "Flagging Person".
        part: Answer for the element named "Part".

    Returns:
        The value matching ``element.name``, or ``None`` when the name is not
        one of the seven recognised element names.
    """
    answers_by_name = {
        "Dataset": dataset,
        "Datapoint ID": docid,
        "Text": text,
        "Metadata": metadata,
        "Flagging Reason": reason,
        "Flagging Person": person,
        "Part": part,
    }
    # Unrecognised names fall through to None, like an exhausted if-chain.
    return answers_by_name.get(element.name)
40
+
41
+
42
def report_result(dataset, docid, text, metadata, reason, person, part):
    """Submit one flagging report through the Google Form.

    Loads the form from a fixed URL, fills it using ``form_callback`` with the
    given values bound as keyword arguments, and submits it.

    Args:
        dataset: Value bound to ``form_callback``'s ``dataset`` argument.
        docid: Value bound to ``form_callback``'s ``docid`` argument.
        text: Value bound to ``form_callback``'s ``text`` argument.
        metadata: Value bound to ``form_callback``'s ``metadata`` argument.
        reason: Value bound to ``form_callback``'s ``reason`` argument.
        person: Value bound to ``form_callback``'s ``person`` argument.
        part: Value bound to ``form_callback``'s ``part`` argument.
    """
    form_url = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"

    # Bind the report values now; the remaining positional arguments of
    # form_callback are left for whoever invokes the callback during fill().
    answer_for_element = partial(
        form_callback,
        dataset=dataset,
        docid=docid,
        text=text,
        metadata=metadata,
        reason=reason,
        person=person,
        part=part,
    )

    report_form = Form()
    report_form.load(form_url)
    report_form.fill(answer_for_element)
    report_form.submit()
59
 
60
 
61
  def load_jsonl(file_path):
 
75
  st.session_state.idx += 1
76
 
77
 
78
def save_flag_and_get_next_item(sample, issue):
    """Record a flagged sample, report it via ``report_result``, then advance.

    The sample (with ``issue`` added under the "issue" key) is appended as one
    JSON line to ``{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl``. The
    sample's text is then passed to ``report_result`` in slices of at most
    ``MAX_DOC_LENGTH`` characters, each tagged with its zero-based part number
    as a string. Finally ``get_next_item()`` is called.

    ``sample`` itself is never modified; all derived data is built on copies.

    Args:
        sample: Mapping describing the document. Must contain "text"; "id"
            (or, failing that, "title") is used as the reported identifier.
        issue: The flagging reason to store and report.

    Raises:
        KeyError: If ``sample`` has no "text" key. This is raised before
            anything is written or reported.
    """
    # Read the text up front so a malformed sample fails before any side effect.
    text = sample["text"]

    # Copy instead of mutating: the caller's dict must keep its "text" field.
    flagged = {**sample, "issue": issue}
    with open(
        f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a", encoding="utf-8"
    ) as f:
        f.write(json.dumps(flagged) + "\n")

    # Everything except the text and the issue is reported as metadata.
    metadata = str(
        {key: value for key, value in sample.items() if key not in ("text", "issue")}
    )

    # Prefer "id", fall back to "title", else report an empty identifier.
    sample_id = sample.get("id", sample.get("title", ""))

    # max(..., 1) keeps a single part-0 report for empty text.
    for part, start in enumerate(range(0, max(len(text), 1), MAX_DOC_LENGTH)):
        report_result(
            dataset,
            sample_id,
            text[start : start + MAX_DOC_LENGTH],
            metadata,
            issue,
            "",
            str(part),
        )

    get_next_item()
106
 
107
 
 
148
  )
149
 
150
  good = st.form_submit_button(
151
+ "GOOD",
152
+ on_click=get_next_item,
153
  )
154
  bad = st.form_submit_button(
155
  "BAD",
156
+ on_click=save_flag_and_get_next_item,
157
  args=(sample, issue),
158
  )
bad_examples/bigcode_python_code_bad_examples.jsonl CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:deca29f2463d96422b301c1ca4af444e1f1dad66764a2835db243fd1a7abc3c3
3
+ size 3250
bad_examples/bigcode_python_github_issues_bad_examples.jsonl CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bfc92a7f740e92393f314bac702965dd47a8084bd093d63632865fda5bb11b0
3
+ size 2876
bad_examples/bigcode_python_jupyter_scripts_dedup_filtered_bad_examples.jsonl CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3184586d973ef844d86995c33d1439dfeef8faca7813a0cdd80f0d22ca9d84fa
3
+ size 7802
bad_examples/books3_bad_examples.jsonl CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d011167cb7679eb46b595af10c2965efa2e36ce8085f2c6fe8a7c5d3a28e54d0
3
+ size 452432
bad_examples/c4_bad_examples.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1d2500082179deff6c62072e3937f3b432f5615eaea968602f59754eb5cd69d
3
- size 3314
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2415a6bc59f376c1535f1eb2c6854b9d13a75842675b3d8231e5d81999d865b2
3
+ size 8618
bad_examples/gutenberg_raw_bad_examples.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f116395c3f0c07973218d81c31fb2bf59c44b8b4d8f4e8a97a6228656c3a3d93
3
- size 145658
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d4a0bce4796569315e6af2b9f2313204d5cf108b21b69bb01a27a98b56ff643
3
+ size 2394572
bad_examples/reddit_threaded_bad_examples.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90cccebb836615224b151fe1576ad3667933d425bc16e0e8f231671e151b0dbb
3
+ size 2971
bad_examples/s2orc_raw_bad_examples.jsonl CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dc4d3ee6ca348b2cd56294e65ff268c73905aec89856e4645bfa4aea108d573
3
+ size 15219
bad_examples/stackexchange2_bad_examples.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc14cd72933da3cae9553adf26702026ac27d4895bebe994a2df3bd21f612b68
3
- size 40469
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d56866a48e1cd99a0bc80ab9088bf7f28e7a861d91a02630252f8fad676147b
3
+ size 41965
requirements.txt CHANGED
@@ -1 +1,2 @@
 
1
  streamlit==1.20.0
 
1
+ gforms
2
  streamlit==1.20.0