import streamlit as st
import os
import pathlib
import beir
from beir import util
from beir.datasets.data_loader import GenericDataLoader
import pytrec_eval
import pandas as pd
from collections import defaultdict
import json
import copy
import plotly.express as px

from constants import ALL_DATASETS, ALL_METRICS
from dataset_loading import get_dataset, load_run, load_local_qrels, load_local_corpus, load_local_queries
from analysis import create_boxplot_1df, create_boxplot_2df, create_boxplot_diff, get_model, prep_func


# force the pure-Python protobuf implementation (a common compatibility workaround)
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
st.set_page_config(layout="wide")


# -1 marks the "Overview" page; any other value indexes into the instance list
if 'cur_instance_num' not in st.session_state:
    st.session_state.cur_instance_num = -1


def update_details(run_details, run_score):
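    """Bucket a per-query metric score into summary counts.

    A score of 0 counts as "none", 1 as "perfect", and anything in between
    as "inbetween". Returns the updated counts dict.
    """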
    if run_score == 0:
        run_details["none"] += 1
    elif run_score == 1:
        run_details["perfect"] += 1
    else:
        run_details["inbetween"] += 1
    return run_details


def check_valid_args(run1_file, run2_file, dataset_name, qrels, queries, corpus):
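    """Return True once enough inputs are loaded to start the analysis.

    A preloaded dataset only needs a run file; the "custom" dataset also
    requires uploaded qrels, queries, and a corpus.
    """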
    if run1_file is not None and dataset_name not in ["", None, "custom"]:
        return True
    elif run1_file is not None and dataset_name == "custom":
        if qrels is not None and queries is not None and corpus is not None:
            return True
    return False


def validate(config_option, file_loaded):
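    """Stop the app with an error if an expansion type was chosen but no file was uploaded for it."""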
    if config_option != "None" and file_loaded is None:
        st.error("Please upload a file for " + config_option)
        st.stop()


def combine(text_og, text_new, combine_type):
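    """Merge the original text with its expansion according to the chosen expansion type."""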
    if combine_type == "None":
        return text_og
    elif combine_type == "Append":
        return text_og + " <APPEND> " + text_new
    elif combine_type == "Prepend":
        return text_new + " <PREPEND> " + text_og
    elif combine_type == "Replace":
        return text_new
    else:
        raise ValueError("Invalid combine type")

with st.sidebar:
    st.title("Options")
    dataset_name = st.selectbox("Select a preloaded dataset or upload your own", tuple(ALL_DATASETS))
    metric_name = st.selectbox("Select a metric", tuple(ALL_METRICS))

    if dataset_name == "custom":
        st.header("Upload corpus")
        corpus_file = st.file_uploader("Choose a file", key="corpus")
        corpus = load_local_corpus(corpus_file)
        st.header("Upload queries")
        queries_file = st.file_uploader("Choose a file", key="queries")
        queries = load_local_queries(queries_file)
        st.header("Upload qrels")
        qrels_file = st.file_uploader("Choose a file", key="qrels")
        qrels = load_local_qrels(qrels_file)
    else:
        qrels = None
        queries = None
        corpus = None

    # sliderbar of how many Top N to choose
    top_n = st.slider("Top N", 1, 100, 3)
    x = st.header('Upload a run file')
    run1_file = st.file_uploader("Choose a file", key="run1")
    y = st.header("Upload a second run file")
    run2_file = st.file_uploader("Choose a file", key="run2")

    z = st.header("Analysis Options")
    incorrect_only = st.checkbox("Show only incorrect instances", value=False)
    one_better_than_two = st.checkbox("Show only instances where run 1 is better than run 2", value=False)
    two_better_than_one = st.checkbox("Show only instances where run 2 is better than run 1", value=False)
    use_model_saliency = st.checkbox("Use model saliency (slow!)", value=False)
    if use_model_saliency:
        # choose from a list of models
        model_name = st.selectbox("Choose from a list of models", ["MonoT5"])
        model, formatter = get_model("MonoT5")
        get_saliency = prep_func(model, formatter)


    advanced_options1 = st.checkbox("Show advanced options for Run 1", value=False)
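    # expansion inputs default to "no expansion" unless the advanced options below change them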
    doc_expansion1 = doc_expansion2 = None
    query_expansion1 = query_expansion2 = None
    run1_uses_query_expansion = "None"
    run1_uses_doc_expansion = "None"
    run2_uses_query_expansion = "None"
    run2_uses_doc_expansion = "None"
    if advanced_options1:
        doc_header = st.header("Upload a Document Expansion file")
        doc_expansion_file = st.file_uploader("Choose a file", key="doc_expansion")
        if doc_expansion_file is not None:
            doc_expansion1 = load_local_corpus(doc_expansion_file)
        query_header = st.header("Upload a Query Expansion file")
        query_expansion_file = st.file_uploader("Choose a file", key="query_expansion")
        if query_expansion_file is not None:
            query_expansion1 = load_local_queries(query_expansion_file)

        run1_uses_query_expansion = st.selectbox("Type of query expansion used in run 1", ("None", "Append", "Prepend", "Replace"))
        run1_uses_doc_expansion = st.selectbox("Type of document expansion used in run 1", ("None", "Append", "Prepend", "Replace"))
        validate(run1_uses_query_expansion, query_expansion_file)
        validate(run1_uses_doc_expansion, doc_expansion_file)

    advanced_options2 = st.checkbox("Show advanced options for Run 2", value=False)
    if advanced_options2:
        doc_header = st.header("Upload a Document Expansion file")
        doc_expansion_file = st.file_uploader("Choose a file", key="doc_expansion2")
        if doc_expansion_file is not None:
            doc_expansion2 = load_local_corpus(doc_expansion_file)
        query_header = st.header("Upload a Query Expansion file")
        query_expansion_file = st.file_uploader("Choose a file", key="query_expansion2")
        if query_expansion_file is not None:
            query_expansion2 = load_local_queries(query_expansion_file)

        run2_uses_query_expansion = st.selectbox("Type of query expansion used in run 2", ("None", "Append", "Prepend", "Replace"))
        run2_uses_doc_expansion = st.selectbox("Type of document expansion used in run 2", ("None", "Append", "Prepend", "Replace"))
        validate(run2_uses_query_expansion, query_expansion_file)
        validate(run2_uses_doc_expansion, doc_expansion_file)


# everything hinges on the run being uploaded, so do that first
# init_title = st.title("Upload Run and Choose Details")

if run1_file is not None:
    run1, run1_pandas = load_run(run1_file)

# do everything, now that we have the run file
if check_valid_args(run1_file, run2_file, dataset_name, qrels, queries, corpus):
    # init_title = st.title("Analysis")
    # don't load these til a run is given
    if dataset_name != "custom":
        corpus, queries, qrels = get_dataset(dataset_name)

    # deep-copy qrels so evaluation cannot alter the dict the per-instance views read later
    evaluator = pytrec_eval.RelevanceEvaluator(
            copy.deepcopy(qrels), pytrec_eval.supported_measures)
    results1 = evaluator.evaluate(run1)  # dict of query id -> metric name -> value
    if len(results1) == 0:
        # alert and stop
        st.error("Run file is empty")
        st.stop()

    if run2_file is not None:
        run2, run2_pandas = load_run(run2_file)
        # NOTE: will fail if run1 is not uploaded
        evaluator2 = pytrec_eval.RelevanceEvaluator(
            copy.deepcopy(qrels), pytrec_eval.supported_measures)
        results2 = evaluator2.evaluate(run2)

    col1, col2 = st.columns([1, 3], gap="large")

    # incorrect = 0
    is_better_run1_count = 0
    is_better_run2_count = 0
    is_same_count = 0
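    # per-run score buckets: "none" = metric score of 0, "perfect" = 1, "inbetween" = everything else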
    run1_details = {"none": 0, "perfect": 0, "inbetween": 0}
    run2_details = {"none": 0, "perfect": 0, "inbetween": 0}
    with col1:
        st.title("Instances")
        if run1_file is not None:
            set_of_cols =  set(run1_pandas.qid.tolist())
            container_for_nav = st.container()
            name_of_columns = sorted([item for item in set_of_cols])
            instances_to_use = []
            # st.divider()
            for idx in range(len(name_of_columns)):
                is_incorrect = False
                is_better_run1 = False
                is_better_run2 = False

                run1_score = results1[str(name_of_columns[idx])][metric_name] if idx else 1
                run1_details = update_details(run1_details, run1_score)
                if run2_file is not None:
                    run2_score = results2[str(name_of_columns[idx])][metric_name] if idx else 1
                    run2_details = update_details(run2_details, run2_score)

                    if run1_score == 0 or run2_score == 0:
                        is_incorrect = True

                    if run1_score > run2_score:
                        is_better_run1_count += 1
                        is_better_run1 = True
                    elif run2_score > run1_score:
                        is_better_run2_count += 1
                        is_better_run2 = True
                    else:
                        is_same_count += 1


                    if not incorrect_only or is_incorrect:
                        if not one_better_than_two or is_better_run1:
                            if not two_better_than_one or is_better_run2:
                                # check = st.checkbox(f"{idx}. " + str(name_of_columns[idx]), key=f"{idx}check")  
                                # st.divider()
                                instances_to_use.append(name_of_columns[idx])
                else:
                    if run1_score == 0:
                        is_incorrect = True

                    if not incorrect_only or is_incorrect:
                        # check = st.checkbox(f"{idx}. " + str(name_of_columns[idx]), key=f"{idx}check")  
                        # st.divider()
                        instances_to_use.append(name_of_columns[idx])


        def sync_from_drop():
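            """Keep the numeric index input in sync when an instance is chosen from the dropdown."""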
            if st.session_state.selectbox_instance == "Overview":
                st.session_state.number_of_col = -1
                st.session_state.cur_instance_num = -1
            else:
                index_of_obj = name_of_columns.index(st.session_state.selectbox_instance)
                # print("Index of obj: ", index_of_obj, type(index_of_obj)) 
                st.session_state.number_of_col = index_of_obj
                st.session_state.cur_instance_num = index_of_obj

        def sync_from_number():
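            """Keep the dropdown in sync when the numeric index input changes."""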
            st.session_state.cur_instance_num = st.session_state.number_of_col
            # print("Session state number of col: ", st.session_state.number_of_col, type(st.session_state.number_of_col))
            if st.session_state.number_of_col == -1:
                st.session_state.selectbox_instance = "Overview"
            else:
                st.session_state.selectbox_instance = name_of_columns[st.session_state.number_of_col]


        number_of_col = container_for_nav.number_input(min_value=-1, step=1, max_value=len(instances_to_use), on_change=sync_from_number, label=f"Select instance by index (out of **{len(instances_to_use)}**)", key="number_of_col")
        selectbox_instance = container_for_nav.selectbox("Select instance by ID", ["Overview"] + name_of_columns, on_change=sync_from_drop, key="selectbox_instance")
        st.divider()  
        # make pie plot showing incorrect vs correct
        st.header("Breakdown")
        if run2_file is None:
            plotly_pie_chart = px.pie(names=["Perfect", "Inbetween", "None"], values=[run1_details["perfect"], run1_details["inbetween"], run1_details["none"]])
            st.write("Run 1 Scores")
            plotly_pie_chart.update_traces(showlegend=False, selector=dict(type='pie'), textposition='inside', textinfo='percent+label')
            st.plotly_chart(plotly_pie_chart, use_container_width=True)
        else:
            if st.checkbox("Show Run 1 vs Run 2", value=True):
                plotly_pie_chart = px.pie(names=["Run 1 Better", "Run 2 Better", "Tied"], values=[is_better_run1_count, is_better_run2_count, is_same_count])
                plotly_pie_chart.update_traces(showlegend=False, selector=dict(type='pie'), textposition='inside', textinfo='percent+label')
                st.plotly_chart(plotly_pie_chart, use_container_width=True)
                
            if st.checkbox("Show Run 1 Breakdown"):
                plotly_pie_chart_run1 = px.pie(names=["Perfect", "Inbetween", "None"], values=[run1_details["perfect"], run1_details["inbetween"], run1_details["none"]])
                plotly_pie_chart_run1.update_traces(showlegend=False, selector=dict(type='pie'), textposition='inside', textinfo='percent+label')
                st.plotly_chart(plotly_pie_chart_run1, use_container_width=True)
            if st.checkbox("Show Run 2 Breakdown"):
                plotly_pie_chart_run2 = px.pie(names=["Perfect", "Inbetween", "None"], values=[run2_details["perfect"], run2_details["inbetween"], run2_details["none"]])
                plotly_pie_chart_run2.update_traces(showlegend=False, selector=dict(type='pie'), textposition='inside', textinfo='percent+label')
                st.plotly_chart(plotly_pie_chart_run2, use_container_width=True)



    with col2:
        # st.title(f"Information ({len(checkboxes) - 1}/{len(name_of_columns) - 1})")
        ### Only one run file
        if run1_file is not None and run2_file is None:

            # get instance number
            inst_index = number_of_col

            if inst_index >= 0:
                inst_num = instances_to_use[inst_index - 1]
                
                st.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Run 1</h1>", unsafe_allow_html=True)

                container = st.container()

                rank_col, score_col, id_col = container.columns([2,1,3])
                id_col.metric("ID", inst_num)
                score_col.metric(metric_name, results1[str(inst_num)][metric_name])

                # st.subheader(f"ID")
                # st.markdown(inst_num)
                st.divider()

                st.subheader(f"Query")
                if run1_uses_query_expansion != "None":
                    show_orig_rel = st.checkbox("Show Original Query", key=f"{inst_index}relorigquery", value=False)

                query_text_og = queries[str(inst_num)]
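                # show the expanded query unless the user asked to see the original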
                if query_expansion1 is not None and run1_uses_query_expansion != "None" and not show_orig_rel:
                    alt_text = query_expansion1[str(inst_num)]
                    query_text = combine(query_text_og, alt_text, run1_uses_query_expansion)
                else:
                    query_text = query_text_og
                st.markdown(query_text)
                st.divider()

                ## Documents
                # relevant
                relevant_docs = list(qrels[str(inst_num)].keys())
                doc_texts = [(doc_id, corpus[doc_id]["title"], corpus[doc_id]["text"]) for doc_id in relevant_docs]
                st.subheader("Relevant Documents")
                if doc_expansion1 is not None and run1_uses_doc_expansion != "None":
                    show_orig_rel = st.checkbox("Show Original Relevant Doc(s)", key=f"{inst_index}relorig", value=False)

                for (docid, title, text) in doc_texts:
                    if doc_expansion1 is not None and run1_uses_doc_expansion != "None" and not show_orig_rel:
                        alt_text = doc_expansion1[docid]["text"]
                        text = combine(text, alt_text, run1_uses_doc_expansion)

                    if use_model_saliency:
                        # key must be unique per document or Streamlit raises a duplicate-widget error
                        if st.checkbox("Show Model Saliency", key=f"{inst_index}model_saliency{docid}relevant", value=False):
                            # pass this document's text, not the full doc_texts list
                            st.markdown(get_saliency(query_text, text), unsafe_allow_html=True)
                        else:
                            st.text_area(f"{docid}:", text)
                    else:
                        st.text_area(f"{docid}:", text)


                pred_doc = run1_pandas[run1_pandas.doc_id.isin(relevant_docs)]
                rank_pred = pred_doc[pred_doc.qid == str(inst_num)]["rank"].tolist()
                # st.subheader("Ranked of Documents")
                # st.markdown(f"Rank: {rank_pred}")
                ranking_str = ",".join([str(item) for item in rank_pred]) if type(rank_pred) == list else str(rank_pred)
                if ranking_str == "":
                    ranking_str = "--"
                rank_col.metric(f"Rank of Relevant Doc(s)", ranking_str)


                st.divider()

                # top ranked

                if st.checkbox('Show top ranked documents', key=f"{inst_index}top-1run"):
                    st.subheader("Top N Ranked Documents")
                    if doc_expansion1 is not None and run1_uses_doc_expansion != "None":
                        show_orig_rel_ranked = st.checkbox("Show Original Ranked Doc(s)", key=f"{inst_index}relorigdocs", value=False)

                    run1_top_n = run1_pandas[run1_pandas.qid == str(inst_num)][:top_n]
                    run1_top_n_docs = [corpus[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]
                    if doc_expansion1 is not None and run1_uses_doc_expansion != "None" and not show_orig_rel_ranked:
                        run1_top_n_docs_alt = [doc_expansion1[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]
                        for d_idx, doc in enumerate(run1_top_n_docs):
                            alt_text = run1_top_n_docs_alt[d_idx]["text"]
                            doc_text = combine(doc["text"], alt_text, run1_uses_doc_expansion)
                            if use_model_saliency:
                                if st.checkbox("Show Model Saliency", key=f"{inst_index}model_saliency{d_idx}ranked", value=False):
                                    st.markdown(get_saliency(query_text, doc_text), unsafe_allow_html=True)
                                else:
                                    st.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc_text, key=f"{inst_num}doc{d_idx}")
                            else:
                                st.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc_text, key=f"{inst_num}doc{d_idx}")
                    else:
                        for d_idx, doc in enumerate(run1_top_n_docs):
                            if use_model_saliency:
                                if st.checkbox("Show Model Saliency", key=f"{inst_index}model_saliency{d_idx}ranked", value=False):
                                    st.markdown(get_saliency(query_text, doc["text"]), unsafe_allow_html=True)
                                else:
                                    st.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc["text"], key=f"{inst_num}doc{d_idx}")
                            else:
                                st.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc["text"], key=f"{inst_num}doc{d_idx}")
                    st.divider()
        
            # none checked
            elif inst_index < 0:
                st.title("Overview")
                st.subheader(f"Scores of {metric_name}")
                plotly_chart = create_boxplot_1df(results1, metric_name)
                st.plotly_chart(plotly_chart)
                                
        ## Both run files available
        elif run1_file is not None and run2_file is not None:
            has_check = False
            container_top = st.container()

            # get instance number
            inst_index = number_of_col
           
            if inst_index >= 0:
                inst_num = instances_to_use[inst_index]
    
                col_run1, col_run2 = container_top.columns([1,1])
                col_run1.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Run 1</h1>", unsafe_allow_html=True)
                col_run2.markdown("<h1 style='text-align: center; color: black;text-decoration: underline;'>Run 2</h1>", unsafe_allow_html=True)

                container_overview = st.container()
                rank_col1, score_col1, rank_col2, score_col2  = container_overview.columns([2,1,2,1])
                # id_col1.metric("", "")
                score_col1.metric("Run 1 " + metric_name, results1[str(inst_num)][metric_name])
                score_col2.metric("Run 2 " + metric_name, results2[str(inst_num)][metric_name])

                st.divider()

                st.subheader(f"Query")
                container_two_query = st.container()
                col_run1, col_run2 = container_two_query.columns(2, gap="medium")

                query_text_og = queries[str(inst_num)]
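                # build the (possibly expanded) query text for each run; query_text1/query_text2
                # are also what the saliency view scores against below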
                if run1_uses_query_expansion != "None" and run2_uses_query_expansion != "None":
                    alt_text1 = query_expansion1[str(inst_num)]
                    alt_text2 = query_expansion2[str(inst_num)]
                    combined_text1 = combine(query_text_og, alt_text1, run1_uses_query_expansion)
                    combined_text2 = combine(query_text_og, alt_text2, run2_uses_query_expansion)
                    col_run1.markdown(combined_text1)
                    col_run2.markdown(combined_text2)
                    query_text1 = combined_text1
                    query_text2 = combined_text2
                elif run1_uses_query_expansion != "None":
                    alt_text = query_expansion1[str(inst_num)]
                    combined_text1 = combine(query_text_og, alt_text, run1_uses_query_expansion)
                    col_run1.markdown(combined_text1)
                    col_run2.markdown(query_text_og)
                    query_text1 = combined_text1
                    query_text2 = query_text_og
                elif run2_uses_query_expansion != "None":
                    alt_text = query_expansion2[str(inst_num)]
                    combined_text2 = combine(query_text_og, alt_text, run2_uses_query_expansion)
                    col_run1.markdown(query_text_og)
                    col_run2.markdown(combined_text2)
                    query_text1 = query_text_og
                    query_text2 = combined_text2
                else:
                    query_text = query_text_og
                    col_run1.markdown(query_text)
                    col_run2.markdown(query_text)
                    query_text1 = query_text
                    query_text2 = query_text

                st.divider()



                ## Documents
                # relevant
                st.subheader("Relevant Documents")
                container_two_docs_rel = st.container()
                col_run1, col_run2 = container_two_docs_rel.columns(2, gap="medium")
                relevant_docs = list(qrels[str(inst_num)].keys())
                doc_texts = [(doc_id, corpus[doc_id]["title"], corpus[doc_id]["text"]) for doc_id in relevant_docs]

                if doc_expansion1 is not None and run1_uses_doc_expansion != "None":
                    show_orig_rel1 = col_run1.checkbox("Show Original Relevant Doc(s)", key=f"{inst_index}relorig_run1", value=False)
                if doc_expansion2 is not None and run2_uses_doc_expansion != "None":
                    show_orig_rel2 = col_run2.checkbox("Show Original Relevant Doc(s)", key=f"{inst_index}relorig_run2", value=False)

                for (docid, title, text) in doc_texts:
                    if doc_expansion1 is not None and run1_uses_doc_expansion != "None" and not show_orig_rel1:
                        alt_text = doc_expansion1[docid]["text"]
                        text = combine(text, alt_text, run1_uses_doc_expansion)
                    
                    if use_model_saliency:
                        if col_run1.checkbox("Show Model Saliency", key=f"{inst_index}model_saliency{docid}relevant", value=False):
                            col_run1.markdown(get_saliency(query_text1, text),unsafe_allow_html=True)
                        else:
                            col_run1.text_area(f"{docid}:", text, key=f"{inst_num}doc{docid}1")
                    else:
                        col_run1.text_area(f"{docid}:", text, key=f"{inst_num}doc{docid}1")

                for (docid, title, text) in doc_texts:
                    if doc_expansion2 is not None and run2_uses_doc_expansion != "None" and not show_orig_rel2:
                        alt_text = doc_expansion2[docid]["text"]
                        text = combine(text, alt_text, run2_uses_doc_expansion)

                    if use_model_saliency:
                        if col_run2.checkbox("Show Model Saliency", key=f"{inst_index}model_saliency{docid}relevant2", value=False):
                            col_run2.markdown(get_saliency(query_text2, text),unsafe_allow_html=True)
                        else:
                            col_run2.text_area(f"{docid}:", text, key=f"{inst_num}doc{docid}2")
                    else:
                        col_run2.text_area(f"{docid}:", text, key=f"{inst_num}doc{docid}2")

                # top ranked
                # NOTE: BEIR calls trec_eval which ranks by score, then doc_id for ties
                # we have to fix that or we don't match the scores
                pred_doc1 = run1_pandas[run1_pandas.qid == inst_num].sort_values(["score", "doc_id"], ascending=[False, False])
                pred_doc1["rank_real"] = list(range(1, len(pred_doc1) + 1))
                rank_pred1 = pred_doc1[pred_doc1.doc_id.isin(relevant_docs)]["rank_real"].tolist()
                
                pred_doc2 = run2_pandas[run2_pandas.qid == inst_num].sort_values(["score", "doc_id"], ascending=[False, False])
                pred_doc2["rank_real"] = list(range(1, len(pred_doc2) + 1))
                rank_pred2 = pred_doc2[pred_doc2.doc_id.isin(relevant_docs)]["rank_real"].tolist()


                # st.subheader("Ranked of Documents")
                # st.markdown(f"Run 1 Rank: {rank_pred1}")
                # st.markdown(f"Run 2 Rank: {rank_pred2}")

                ranking_str = ",".join([str(item) for item in rank_pred1]) if type(rank_pred1) == list else str(rank_pred1)
                if ranking_str == "":
                    ranking_str = "--"
                rank_col1.metric("Run 1 " + f"Rank of Relevant Doc(s)", ranking_str)

                ranking_str2 = ",".join([str(item) for item in rank_pred2]) if type(rank_pred2) == list else str(rank_pred2)
                if ranking_str2 == "":
                    ranking_str2 = "--"
                rank_col2.metric("Run 2 " + f"Rank of Relevant Doc(s)", ranking_str2)


                st.divider()


                container_two_docs_ranked = st.container()
                col_run1, col_run2 = container_two_docs_ranked.columns(2, gap="medium")

                if col_run1.checkbox('Show top ranked documents for Run 1', key=f"{inst_index}top-1run"):
                    col_run1.subheader("Top N Ranked Documents")
                    if doc_expansion1 is not None and run1_uses_doc_expansion != "None":
                        show_orig_rel_ranked1 = col_run1.checkbox("Show Original Ranked Doc(s)", key=f"{inst_index}relorigdocs1", value=False)

                    run1_top_n = run1_pandas[run1_pandas.qid == str(inst_num)].sort_values(["score", "doc_id"], ascending=[False, False])[:top_n]
                    run1_top_n_docs = [corpus[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]

                    if doc_expansion1 is not None and run1_uses_doc_expansion != "None" and not show_orig_rel_ranked1:
                        run1_top_n_docs_alt = [doc_expansion1[str(doc_id)] for doc_id in run1_top_n.doc_id.tolist()]
                        for d_idx, doc in enumerate(run1_top_n_docs):
                            alt_text = run1_top_n_docs_alt[d_idx]["text"]
                            doc_text = combine(doc["text"], alt_text, run1_uses_doc_expansion)
                            if use_model_saliency:
                                if col_run1.checkbox("Show Model Saliency", key=f"{inst_index}model_saliency{d_idx}ranked1", value=False):
                                    col_run1.markdown(get_saliency(query_text1, doc_text),unsafe_allow_html=True)
                                else:
                                    col_run1.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc_text, key=f"{inst_num}doc{d_idx}1")
                            else:
                                col_run1.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc_text, key=f"{inst_num}doc{d_idx}1")
                    else:
                        for d_idx, doc in enumerate(run1_top_n_docs):
                            if use_model_saliency:
                                if col_run1.checkbox("Show Model Saliency", key=f"{inst_index}model_saliency{d_idx}ranked1", value=False):
                                    col_run1.markdown(get_saliency(query_text1, doc["text"]), unsafe_allow_html=True)
                                else:
                                    col_run1.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc["text"], key=f"{inst_num}doc{d_idx}1")
                            else:
                                col_run1.text_area(f"{run1_top_n['doc_id'].iloc[d_idx]}: ", doc["text"], key=f"{inst_num}doc{d_idx}1")
                                
                    
                if col_run2.checkbox('Show top ranked documents for Run 2', key=f"{inst_index}top-2run"):
                    col_run2.subheader("Top N Ranked Documents")
                    if doc_expansion2 is not None and run2_uses_doc_expansion != "None":
                        show_orig_rel_ranked2 = col_run2.checkbox("Show Original Ranked Doc(s)", key=f"{inst_index}relorigdocs2", value=False)        
                    run2_top_n = run2_pandas[run2_pandas.qid == str(inst_num)].sort_values(["score", "doc_id"], ascending=[False, False])[:top_n]
                    run2_top_n_docs = [corpus[str(doc_id)] for doc_id in run2_top_n.doc_id.tolist()]


                    if doc_expansion2 is not None and run2_uses_doc_expansion != "None" and not show_orig_rel_ranked2:
                        run2_top_n_docs_alt = [doc_expansion2[str(doc_id)] for doc_id in run2_top_n.doc_id.tolist()]
                        for d_idx, doc in enumerate(run2_top_n_docs):
                            alt_text = run2_top_n_docs_alt[d_idx]["text"]
                            doc_text = combine(doc["text"], alt_text, run2_uses_doc_expansion)
                            if use_model_saliency:
                                if col_run2.checkbox("Show Model Saliency", key=f"{inst_index}model_saliency{d_idx}ranked2", value=False):
                                    col_run2.markdown(get_saliency(query_text2, doc_text),unsafe_allow_html=True)
                                else:
                                    col_run2.text_area(f"{run2_top_n['doc_id'].iloc[d_idx]}: ", doc_text, key=f"{inst_num}doc{d_idx}2")
                            else:
                                col_run2.text_area(f"{run2_top_n['doc_id'].iloc[d_idx]}: ", doc_text, key=f"{inst_num}doc{d_idx}2")
                    else:
                        for d_idx, doc in enumerate(run2_top_n_docs):
                            if use_model_saliency:
                                if col_run2.checkbox("Show Model Saliency", key=f"{inst_index}model_saliency{d_idx}ranked2", value=False):
                                    col_run2.markdown(get_saliency(query_text2, doc["text"]), unsafe_allow_html=True)
                                else:
                                    col_run2.text_area(f"{run2_top_n['doc_id'].iloc[d_idx]}: ", doc["text"], key=f"{inst_num}doc{d_idx}2")
                            else:
                                col_run2.text_area(f"{run2_top_n['doc_id'].iloc[d_idx]}: ", doc["text"], key=f"{inst_num}doc{d_idx}2")

                st.divider()


            else:
                st.title("Overview")

                st.subheader(f"Scores of {metric_name}")
                fig = create_boxplot_2df(results1, results2, metric_name)
                st.plotly_chart(fig)

                st.subheader(f"Score Difference of {metric_name}")
                fig_comp = create_boxplot_diff(results1, results2, metric_name)
                st.plotly_chart(fig_comp)

else:
    st.warning("Please choose a dataset and upload a run file. If you chose \"custom\" be sure that you uploaded all files (queries, corpus, qrels)")