File size: 3,928 Bytes
215f60a
a6e1ff6
 
215f60a
724b1ea
 
a6e1ff6
724b1ea
 
 
a6e1ff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
724b1ea
215f60a
724b1ea
 
215f60a
724b1ea
 
 
 
 
 
215f60a
724b1ea
 
215f60a
724b1ea
 
 
 
a6e1ff6
215f60a
 
 
 
724b1ea
a6e1ff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
724b1ea
 
 
215f60a
 
 
 
 
 
 
 
 
 
 
 
724b1ea
215f60a
724b1ea
 
215f60a
724b1ea
 
215f60a
724b1ea
215f60a
 
 
 
724b1ea
215f60a
 
 
 
 
 
724b1ea
215f60a
724b1ea
 
 
 
215f60a
 
 
 
 
a6e1ff6
 
215f60a
 
 
a6e1ff6
215f60a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import json
import math
from functools import partial

import streamlit as st
import streamlit.components.v1 as components
from gforms import Form

BAD_EXAMPLES_PATH = "bad_examples"
DATA_PATH = "data"
MAX_DOC_LENGTH = 30000


def form_callback(
    element,
    page_index,
    element_index,
    dataset,
    docid,
    text,
    metadata,
    reason,
    person,
    part,
):
    """Return the answer for one form element, selected by the element's name.

    ``page_index`` and ``element_index`` are accepted but not used; the
    lookup is driven solely by ``element.name``.

    Returns:
        The value matching ``element.name`` ("Dataset", "Datapoint ID",
        "Text", "Metadata", "Flagging Reason", "Flagging Person" or "Part"),
        or ``None`` when the name is none of these.
    """
    answers_by_field = {
        "Dataset": dataset,
        "Datapoint ID": docid,
        "Text": text,
        "Metadata": metadata,
        "Flagging Reason": reason,
        "Flagging Person": person,
        "Part": part,
    }
    return answers_by_field.get(element.name)


def report_result(dataset, docid, text, metadata, reason, person, part):
    """Load the flagging Google Form, fill it with the given values and submit it.

    Each argument is handed to ``form_callback``, which picks the value to
    return for a form element based on that element's name.
    """
    form_url = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"

    # Pre-bind the answers so the form only has to supply the element-related
    # positional arguments when it invokes the callback.
    answer_for_element = partial(
        form_callback,
        dataset=dataset,
        docid=docid,
        text=text,
        metadata=metadata,
        reason=reason,
        person=person,
        part=part,
    )

    flag_form = Form()
    flag_form.load(form_url)
    flag_form.fill(answer_for_element)
    flag_form.submit()


def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding. Lines that are empty or contain only whitespace (for example a
    trailing blank line) are skipped instead of raising a decode error.

    Args:
        file_path: Path of the file to read; one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


# Seed the example index the first time it is missing from the session state
# (also the case right after the "Reset Index" button deleted it).
if "idx" not in st.session_state:
    st.session_state["idx"] = 0


def get_next_item():
    """Advance the index stored in the session state by one."""
    st.session_state["idx"] = st.session_state["idx"] + 1


def save_flag_and_get_next_item(sample, issue):
    """Record a flagged sample locally, report it to the form, then advance.

    The sample (plus the ``issue`` text) is appended as one JSON line to the
    bad-examples file of the currently selected ``dataset``. The text is then
    reported via ``report_result`` in parts of at most ``MAX_DOC_LENGTH``
    characters, each tagged with its zero-based part number. Finally the
    session index is moved to the next example.

    Unlike earlier versions, ``sample`` itself is not modified: the function
    works on copies, so the caller's dict keeps its "text" entry.

    Args:
        sample: Dict describing the datapoint; must contain a "text" entry.
            An "id" entry (or, failing that, a "title" entry) is used as the
            datapoint ID; otherwise the ID is an empty string.
        issue: Free-text description of what is wrong with the sample.

    Raises:
        KeyError: If ``sample`` has no "text" entry.
    """
    # Copy instead of mutating the caller's dict; "issue" keeps its position
    # if the key already existed, exactly as in-place assignment would.
    flagged_record = {**sample, "issue": issue}
    with open(
        f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl", "a", encoding="utf-8"
    ) as f:
        f.write(json.dumps(flagged_record) + "\n")

    text = sample["text"]

    # Everything except the text and the issue is reported as metadata.
    metadata = str(
        {key: value for key, value in sample.items() if key not in ("text", "issue")}
    )
    sample_id = sample.get("id", sample.get("title", ""))

    # Always send at least one part so short (or empty) texts are reported
    # as part "0"; longer texts are split into MAX_DOC_LENGTH-sized slices.
    num_parts = max(1, math.ceil(len(text) / MAX_DOC_LENGTH))
    for part_index in range(num_parts):
        start = part_index * MAX_DOC_LENGTH
        report_result(
            dataset,
            sample_id,
            text[start : start + MAX_DOC_LENGTH],
            metadata,
            issue,
            "",
            str(part_index),
        )

    get_next_item()


# Dataset names offered in the sidebar. The selected name is used to build
# both the examples file path under DATA_PATH and the bad-examples file path
# under BAD_EXAMPLES_PATH.
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]
dataset = st.sidebar.selectbox("Dataset", datasets)
data = load_jsonl(f"{DATA_PATH}/{dataset}_examples_with_stats.json")

bad_examples_file = f"{BAD_EXAMPLES_PATH}/{dataset}_bad_examples.jsonl"

# Create the bad-examples file if it does not exist yet; append mode leaves
# any existing content untouched.
open(bad_examples_file, "a").close()


def _reset_index():
    """Delete the stored index from the session state."""
    del st.session_state["idx"]


def _clear_bad_examples():
    """Truncate the bad-examples file of the selected dataset."""
    with open(bad_examples_file, "w"):
        pass


st.sidebar.button("Reset Index", on_click=_reset_index)

with open(bad_examples_file, "r+") as bad_file:
    st.sidebar.download_button(
        "Download bad example JSON file",
        bad_file,
        file_name=f"{dataset}_bad_examples.jsonl",
    )

st.sidebar.button("Clear bad examples file", on_click=_clear_bad_examples)

# Session-state key of the "what's wrong" text input. The BAD callback reads
# the submitted text through this key.
ISSUE_INPUT_KEY = "issue_text"


def _flag_with_submitted_issue(flagged_sample):
    """Flag ``flagged_sample`` using the issue text submitted with the form.

    Values passed through ``args=`` are captured when the form is rendered,
    i.e. before the user has typed anything, so the issue text must be looked
    up in the session state at callback time instead.
    """
    save_flag_and_get_next_item(flagged_sample, st.session_state[ISSUE_INPUT_KEY])


# The index only ever grows and is kept when another dataset is selected, so
# it can point past the end of `data`; stop cleanly instead of raising
# IndexError.
if st.session_state.idx >= len(data):
    st.info(
        "No more examples to review in this dataset. "
        "Use 'Reset Index' to start over."
    )
    st.stop()

with st.form(key="bad_form", clear_on_submit=True):
    sample = data[st.session_state.idx]
    text = sample["text"]
    st.text_area(f"text id: {st.session_state.idx}", text, height=500)

    issue = st.text_input(
        "What's wrong with this example? (leave blank if example is fine)",
        key=ISSUE_INPUT_KEY,
    )

    good = st.form_submit_button(
        "GOOD",
        on_click=get_next_item,
    )
    bad = st.form_submit_button(
        "BAD",
        on_click=_flag_with_submitted_issue,
        # Only the sample is bound at render time; the issue text is read
        # from the session state inside the callback.
        args=(sample,),
    )