gaia / app.py
ola13's picture
refactor
9291cc0
raw
history blame
8.59 kB
import json
import os
import pprint
import streamlit as st
import streamlit.components.v1 as components
import requests
# Pretty-printer kept available for ad-hoc debugging output.
pp = pprint.PrettyPrinter(indent=2)

# Search backend endpoint; scisearch() reads it back with os.environ.get("address").
os.environ["address"] = "http://34.79.83.149:8080"

# Page-level Streamlit settings (browser tab title, wide layout).
st.set_page_config(page_title="Gaia Search 🌖🌏", layout="wide")

# Write a Streamlit theme file into ./.streamlit selecting the light base theme.
streamlit_dir = os.path.join(os.getcwd(), ".streamlit")
os.makedirs(streamlit_dir, exist_ok=True)
config_path = os.path.join(streamlit_dir, "config.toml")
with open(config_path, "w") as config_file:
    config_file.write('[theme]\nbase="light"')

# Display name shown in the sidebar -> corpus identifier sent to the backend.
corpus_name_map = {
    "LAION": "laion",
    "The Pile": "pile",
    "C4": "c4",
}
# Sidebar header: centred app title followed by a short description and the
# Common Crawl terms-of-use notice. Raw HTML is used, hence unsafe_allow_html.
st.sidebar.markdown(
    """
<style>
.aligncenter {
text-align: center;
font-weight: bold;
font-size: 36px;
}
</style>
<p class="aligncenter">Gaia Search 🌖🌏</p>
<p>A search engine for large scale textual
corpora. Most of the datasets included in the tool are based on Common
Crawl. By using the tool, you are also bound by the Common Crawl terms
of use in respect of the content contained in the datasets.
</p>
""",
    unsafe_allow_html=True,
)

# Sidebar link row (GitHub | Project Report | Colab).
# NOTE(review): every href below is empty, so the links currently lead nowhere —
# TODO fill in the real targets.
st.sidebar.markdown(
    """
<style>
.aligncenter {
text-align: center;
}
</style>
<p style='text-align: center'>
<a href="" style="color:#7978FF;">GitHub</a> | <a href="" style="color:#7978FF;" >Project Report</a> | <a href="" style="color:#7978FF;" >Colab</a>
</p>
""",
    unsafe_allow_html=True,
)
# <p class="aligncenter">
# <a href="" target="_blank">
# <img src="https://colab.research.google.com/assets/colab-badge.svg"/>
# </a>
# </p>
# Sidebar inputs that drive a search: free-text query, corpus choice and result cap.
query = st.sidebar.text_input("Query", value="")

# The options are the display names; they are mapped to backend ids via corpus_name_map.
corpus = st.sidebar.selectbox(label="Corpus", options=tuple(corpus_name_map), index=0)

max_results = st.sidebar.slider(
    label="Max Results",
    min_value=1,
    max_value=100,
    value=10,
    step=1,
    help="Max Number of Documents to return",
)
# dark_mode_toggle = """
# <script>
# function load_image(id){
# console.log(id)
# var x = document.getElementById(id);
# console.log(x)
# if (x.style.display === "none") {
# x.style.display = "block";
# } else {
# x.style.display = "none";
# }
# };
# function myFunction() {
# var element = document.body;
# element.classList.toggle("dark-mode");
# }
# </script>
# <button onclick="myFunction()">Toggle dark mode</button>
# """
# st.sidebar.markdown(dark_mode_toggle, unsafe_allow_html=True)
# CSS that pins the footer to the bottom edge, full width, centred text.
_FOOTER_STYLE = """
<style>
.footer {
position: fixed;
left: 0;
bottom: 0;
width: 100%;
background-color: white;
color: black;
text-align: center;
}
</style>
"""

# Footer markup crediting the projects the app is built on.
_FOOTER_BODY = """<div class="footer">
<p>Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a></p>
</div>
"""

footer = _FOOTER_STYLE + _FOOTER_BODY
st.sidebar.markdown(footer, unsafe_allow_html=True)
def scisearch(query, corpus, num_results=10):
    """Send a search request to the backend and return its hits.

    The backend URL is read from the ``address`` environment variable and the
    request is a JSON POST carrying ``query``, ``corpus`` and ``k``.

    Args:
        query: Free-text query; surrounding whitespace is ignored.
        corpus: Corpus identifier forwarded to the backend.
        num_results: Number of documents to request (sent as ``k``).

    Returns:
        A ``(results, highlight_terms)`` tuple taken from the JSON response,
        or ``None`` when the query is missing/blank or the request or response
        decoding fails. Callers must check for ``None`` before unpacking.
    """
    print(query, corpus, num_results)

    # Validate before calling strip(): a non-string query (e.g. None) has
    # nothing to search for and must not reach the network call.
    if not isinstance(query, str):
        return None
    query = query.strip()
    if not query:
        return None

    post_data = {"query": query, "corpus": corpus, "k": num_results}
    try:
        output = requests.post(
            os.environ.get("address"),
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )
        # Surface HTTP error statuses explicitly instead of failing later on
        # an unexpected response body.
        output.raise_for_status()
        payload = json.loads(output.text)
        return payload["results"], payload["highlight_terms"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Best-effort: network failures, invalid JSON and responses missing
        # the expected keys are reported on stdout and signalled with None.
        print(e)
        return None
# Marker prefix and the tag names that may follow it in corpus text (e.g. "PI:EMAIL").
PII_PREFIX = "PI:"
PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}


def process_pii(text):
    """Replace every ``PI:<TAG>`` marker in *text* with a highlighted notice.

    Each occurrence of ``PII_PREFIX + tag`` for a tag in ``PII_TAGS`` becomes a
    bold ``<mark>`` element reading ``REDACTED <TAG>``. Text without markers is
    returned unchanged.
    """
    redacted = text
    for label in PII_TAGS:
        marker = PII_PREFIX + label
        notice = """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
            label
        )
        redacted = redacted.replace(marker, notice)
    return redacted
def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Return *paragraph* as HTML with matching terms bolded and PII markers flagged.

    The paragraph is split on whitespace. A token is wrapped in ``<b>`` when it
    equals one of *highlight_terms* exactly (the comparison is case- and
    punctuation-sensitive). Tokens are HTML-escaped before being emitted so
    markup contained in the corpus text is displayed literally rather than
    interpreted. The re-joined string is then passed through ``process_pii``.

    Args:
        paragraph: Raw document text.
        highlight_terms: Terms to emphasise.

    Returns:
        An HTML fragment with tokens separated by single spaces.
    """
    import html  # local import keeps this function self-contained

    # Tokens are str, so only str terms can ever match; building the set once
    # avoids rescanning the list for every token.
    terms = {term for term in highlight_terms if isinstance(term, str)}

    rendered = []
    for token in paragraph.split():
        safe_token = html.escape(token)
        if token in terms:
            rendered.append("<b>{}</b>".format(safe_token))
        else:
            rendered.append(safe_token)
    return process_pii(" ".join(rendered))
def process_results(corpus: str, hits: list, highlight_terms: list) -> str:
    """Render search hits as an HTML fragment.

    Every hit gets a header line with its document id and score rounded to two
    decimals, followed by its text as returned by ``highlight_string`` and an
    ``<hr>`` separator. For the ``"laion"`` corpus the text is labelled as a
    caption and, when ``hit["meta"]["docs"]`` holds entries with a ``"URL"``,
    a list of image links is added.

    The document id and the URLs are HTML-escaped and the ``href`` value is
    quoted, so values coming from the backend cannot break out of the markup.
    No container element is opened or closed here; the caller wraps the result.

    Args:
        corpus: Corpus identifier; ``"laion"`` selects the caption layout.
        hits: Hit dicts with ``docid``, ``score`` and ``text`` keys, and
            optionally ``meta``.
        highlight_terms: Terms forwarded to ``highlight_string``.

    Returns:
        The per-hit fragments joined by a single space; ``""`` when *hits* is empty.
    """
    import html  # local import keeps this function self-contained

    rendered_hits = []
    for hit in hits:
        doc_id = html.escape(str(hit["docid"]))
        score = round(hit["score"], 2)
        body = highlight_string(hit["text"], highlight_terms)

        parts = [
            '<p class="searchresult" style="color: #7978FF;">'
            f"Document ID: {doc_id} | Score: {score}</p>"
        ]

        if corpus == "laion":
            parts.append('<p style="color: #7978FF;">Caption:</p>')
            parts.append(f"<p>{body}</p>")

            # "meta" and "docs" may be absent, None or empty; entries without a
            # URL are skipped instead of raising.
            meta = hit.get("meta") or {}
            links = []
            for doc in meta.get("docs") or []:
                url = doc.get("URL")
                if not url:
                    continue
                safe_url = html.escape(str(url), quote=True)
                links.append(
                    f'<li><a href="{safe_url}" target="_blank" '
                    f'style="color:#ffcdf8; ">{safe_url}</a></li>'
                )
            if links:
                parts.append('<p style="color: #7978FF;"> Image links:</p><ul>')
                parts.extend(links)
                parts.append("</ul>")
        else:
            # No closing </div> here: nothing was opened in this function, and
            # emitting one would close the caller's results container early.
            parts.append(f"<p>{body}</p>")

        parts.append("<hr>")
        rendered_hits.append("\n".join(parts))

    return " ".join(rendered_hits)
# Font link and CSS prepended to the rendered results inside the embedded component.
_RESULTS_HEAD = """
<head>
<link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
</head>
<style>
#searchresultsarea {
font-family: "Source Sans Pro", sans-serif;
}
#searchresultsnumber {
font-size: 0.8rem;
color: gray;
}
.searchresult h2 {
font-size: 19px;
line-height: 18px;
font-weight: normal;
color: rgb(7, 111, 222);
margin-bottom: 0px;
margin-top: 25px;
color: #7978FF;
}
.searchresult a {
font-size: 12px;
line-height: 12px;
color: green;
margin-bottom: 0px;
}
</style>
"""

# Run a search when the sidebar button is pressed and render the hits.
if st.sidebar.button("Search", type="primary"):
    corpus_id = corpus_name_map[corpus]
    search_output = scisearch(query, corpus_id, max_results)

    # scisearch() can hand back None instead of a (hits, terms) pair, so check
    # before unpacking to avoid a TypeError.
    if search_output is None:
        st.warning(
            "No results to show. Enter a query and make sure the search backend is reachable."
        )
    else:
        hits, highlight_terms = search_output
        html_results = process_results(corpus_id, hits, highlight_terms)
        # Report the number of hits actually returned rather than the requested cap.
        rendered_results = f"""
<div id="searchresultsarea">
<br>
<p id="searchresultsnumber">About {len(hits)} results</p>
{html_results}
</div>"""
        components.html(
            _RESULTS_HEAD + rendered_results,
            height=800,
            scrolling=True,
        )