import json
import logging
import os
import pprint
from typing import Union

import requests
import streamlit as st
import streamlit.components.v1 as components

pp = pprint.PrettyPrinter(indent=2)

st.set_page_config(page_title="Gaia Search 🌖🌏", layout="wide")

# Force the light theme by writing a Streamlit config file at startup.
os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
with open(os.path.join(os.getcwd(), ".streamlit", "config.toml"), "w") as file:
    file.write('[theme]\nbase="light"')

# Map the display names shown in the sidebar to the corpus ids the backend expects.
corpus_name_map = {
    "LAION": "laion",
    "ROOTS": "roots",
    "The Pile": "pile",
    "C4": "c4",
}
st.sidebar.markdown(
    """
    <div align="center">
    <h1>Gaia Search 🌖🌏</h1>
    <p>A search engine for large-scale textual corpora. Most of the datasets
    included in the tool are based on Common Crawl. By using the tool, you are
    also bound by the Common Crawl terms of use with respect to the content
    contained in the datasets.</p>
    </div>
    """,
    unsafe_allow_html=True,
)

st.sidebar.markdown(
    """
    GitHub | Project Report | Colab
    """,
    unsafe_allow_html=True,
)
query = st.sidebar.text_input(label="Query", placeholder="Type your query here")
corpus = st.sidebar.selectbox(
    "Corpus",
    tuple(corpus_name_map.keys()),
    index=2,
)
max_results = st.sidebar.slider(
    "Max Results",
    min_value=1,
    max_value=100,
    step=1,
    value=10,
    help="Maximum number of documents to return",
)

footer = """
"""
st.sidebar.markdown(footer, unsafe_allow_html=True)


def scisearch(query, corpus, num_results=10):
    try:
        print(query, corpus, num_results)
        query = (query or "").strip()
        if query == "":
            return
        post_data = {"query": query, "corpus": corpus, "k": num_results, "lang": "all"}
        # The ROOTS corpus is served from a separate backend address.
        address = (
            os.environ.get("address")
            if corpus != "roots"
            else os.environ.get("address_roots")
        )
        logging.warning(os.environ.get("address"))
        output = requests.post(
            address,
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )
        payload = json.loads(output.text)
        return payload["results"], payload["highlight_terms"]
    except Exception as e:
        print(e)


PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"


def process_pii(text):
    # Replace backend PII placeholders such as "PI:EMAIL" with a visible
    # redaction marker before the text is rendered as HTML.
    for tag in PII_TAGS:
        text = text.replace(
            PII_PREFIX + tag,
            "<b>REDACTED {}</b>".format(tag),
        )
    return text


def highlight_string(paragraph: str, highlight_terms: list) -> str:
    # Bold every token that matches a highlight term returned by the backend.
    tokens = paragraph.split()
    tokens_html = []
    for token in tokens:
        if token in highlight_terms:
            tokens_html.append("<b>{}</b>".format(token))
        else:
            tokens_html.append(token)
    tokens_html = " ".join(tokens_html)
    return process_pii(tokens_html)


def extract_lang_from_docid(docid):
    # Document ids follow the "<corpus>_<lang>_<id>" convention.
    return docid.split("_")[1]


def format_result(result, highlight_terms):
    text = result["text"]
    docid = result["docid"]
    tokens_html = highlight_string(text, highlight_terms)
    language = extract_lang_from_docid(docid)
    result_html = """
        <p>Language: {} | Document ID: {}</p>
        <p>{}</p>
    """.format(language, docid, tokens_html)
    return "<div>" + result_html + "</div>"
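
# A sketch of the backend exchange `scisearch` performs above. The request
# fields come from `post_data`, and "results" / "highlight_terms" are the only
# response fields the app reads; the inner structure of each result is
# inferred from how it is consumed below, not from a published schema.
#
#   POST $address  with JSON body:
#     {"query": "climate change", "corpus": "pile", "k": 10, "lang": "all"}
#
#   Expected JSON response:
#     {"results": [{"docid": "pile_en_1234", "score": 12.3, "text": "..."}],
#      "highlight_terms": ["climate", "change"]}
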
" def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list) -> str: hit_list = [] if corpus == "roots": result_page_html = "" for lang, results_for_lang in hits.items(): print("Processing language", lang) if len(results_for_lang) == 0: result_page_html += """Document ID: {hit['docid']} | Score: {round(hit['score'], 2)}
""" if corpus == "laion": res_head += f"""Caption:
{highlight_string(hit['text'], highlight_terms)}
""" if ( "meta" in hit and hit["meta"] is not None and "docs" in hit["meta"] and len(hit["meta"]["docs"]) > 0 ): res_head += """Image links:
{highlight_string(hit['text'], highlight_terms)}
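
# Illustrative hit, showing the shape `process_results` and `format_result`
# consume (the field values here are made up):
#
#   hit = {"docid": "c4_en_1234", "score": 7.51,
#          "text": "Contact PI:EMAIL for details."}
#   process_results("c4", [hit], ["details"])
#
# The "PI:EMAIL" placeholder is rewritten by `process_pii`, "details" is
# bolded by `highlight_string`, and `extract_lang_from_docid` pulls "en"
# out of the docid.
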

if query is None or query == "":
    st.warning("Please provide a non-empty query.")
else:
    response = scisearch(query, corpus_name_map[corpus], max_results)
    if response is not None:
        hits, highlight_terms = response
        html_results = process_results(corpus_name_map[corpus], hits, highlight_terms)
        rendered_results = f"""
            <p>About {max_results} results</p>
            {html_results}
        """
        components.html(rendered_results, height=800, scrolling=True)
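
# To run locally (assuming this file is saved as app.py and the search
# backends are reachable; the URLs below are placeholders):
#
#   export address=http://<backend-host>/search
#   export address_roots=http://<roots-backend-host>/search
#   streamlit run app.py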