arabellastrange committed
Commit 6855b1e · 1 Parent(s): b0b5964

Add application file
app.py ADDED
@@ -0,0 +1,114 @@
1
+ import logging
2
+ from time import asctime
3
+
4
+ import gradio as gr
5
+ from llama_index.core import Document, VectorStoreIndex
6
+ from llama_index.core.evaluation import SemanticSimilarityEvaluator, FaithfulnessEvaluator
7
+
8
+ from generate_response import generate_chat_response_with_history, set_llm, is_search_query, condense_question, \
9
+ generate_chat_response_with_history_rag_return_response, get_llm
10
+ from utils import read_file
11
+ from web_search import search
12
+
13
+ API_KEY_PATH = "../keys/gpt_api_key.txt"
14
+ logger = logging.getLogger("agent_logger")
15
+ sourced = False
16
+ query = False
17
+ rag_similarity = False
18
+
19
+
20
+ def google_search_chat(message, history):
21
+ condensed_question = condense_question(message, history)
22
+ if is_search_query(condensed_question):
23
+ search_results = search(message, condensed_question)
24
+ relevant_content = ""
25
+ sources = ""
26
+ for index, result in enumerate(search_results):
27
+ relevant_content = relevant_content + "\n" + ''.join(result['text'])
28
+ sources = sources + f'\n {index + 1}. ' + result['url'] # python is zero-indexed
29
+
30
+ if relevant_content != "":
31
+ documents = [Document(text=relevant_content)]
32
+ index = VectorStoreIndex.from_documents(documents)
33
+
34
+ response = generate_chat_response_with_history_rag_return_response(index, message, history)
35
+
36
+ similar_str = "not calculated"
37
+ faithfulness_str = "not calculated"
38
+
39
+ if rag_similarity:
40
+ sim_evaluator = SemanticSimilarityEvaluator()
41
+ faith_evaluator = FaithfulnessEvaluator(llm=get_llm())
42
+ # condensed_context = condense_context(relevant_content)
43
+ # logger.info("Calculating similarity...")
44
+ # similar = sim_evaluator.evaluate(response=str(response),
45
+ # reference=condensed_context)
46
+ logger.info("Calculating faithfulness...")
47
+ faithfulness = faith_evaluator.evaluate_response(query=condensed_question, response=response)
48
+ # similar_str = str(round((similar.score * 100), 2)) + "%"
49
+ faithfulness_str = "Yes" if faithfulness.passing else "No"
50
+
51
+ logger.info(f'**Search Query:** {condensed_question} \n **Faithfulness:** {faithfulness_str} \n '
52
+ f'**Similarity:** {similar_str} \n **Sources used:** \n {sources}')
53
+
54
+ response_text = []
55
+ string_output = ""
56
+
57
+ for text in response.response_gen:
58
+ response_text.append(text)
59
+ string_output = ''.join(response_text)
60
+ yield string_output
61
+
62
+ if not sourced:
63
+ pass
64
+ if sourced and not query and not rag_similarity:
65
+ yield string_output + f'\n\n --- \n **Sources used:** \n {sources}'
66
+ if sourced and query and not rag_similarity:
67
+ yield (string_output
68
+ + f'\n\n --- \n **Search Query:** {condensed_question} '
69
+ f'\n **Sources used:** \n {sources}')
70
+ if rag_similarity:
71
+ yield (string_output
72
+ + f'\n\n --- \n **Search Query:** {condensed_question} \n '
73
+ # f'**Similarity of response to the sources [ℹ️]'
74
+ # f'(https://en.wikipedia.org/wiki/Semantic_similarity):** {similar_str} \n'
75
+ f'**Is response in source documents?**: {faithfulness_str}'
76
+ f'\n **Sources used:** \n {sources}')
77
+
78
+ logger.info(f'Assistant Response: {string_output}')
79
+ else:
80
+ logger.info(
81
+ f'Assistant Response: Sorry, no search results found.')
82
+ yield "Sorry, no search results found."
83
+
84
+ else:
85
+ yield from generate_chat_response_with_history(message, history)
86
+
87
+ def run_searchbot():
88
+
89
+ logging.root.setLevel(logging.INFO)
90
+ filehandler = logging.FileHandler(f'../logs/agent_log_{asctime().replace(" ", "").lower().replace(":", "")}.log',
91
+ 'a')
92
+ formatter = logging.Formatter('%(asctime)-15s::%(levelname)s::%(filename)s::%(funcName)s::%(lineno)d::%(message)s')
93
+ filehandler.setFormatter(formatter)
94
+ logger = logging.getLogger("agent_logger")
95
+ for hdlr in logger.handlers[:]: # remove the existing file handlers
96
+ if isinstance(hdlr, logging.FileHandler):
97
+ logger.removeHandler(hdlr)
98
+ logger.addHandler(filehandler) # set the new handler
99
+ logger.setLevel(logging.INFO)
100
+
101
+ api_key = read_file(API_KEY_PATH)
102
+ global sourced
103
+ sourced = False
104
+
105
+ # GPT-4 Turbo: the latest GPT-4 model, intended to reduce cases of “laziness” where the model doesn’t complete
106
+ # a task. Returns a maximum of 4,096 output tokens. Link:
107
+ # https://openai.com/blog/new-embedding-models-and-api-updates
108
+ set_llm(key=api_key, model="gpt-4-0125-preview", temperature=0)
109
+
110
+ logger.info("Launching Gradio ChatInterface for searchbot...")
111
+ demo = gr.ChatInterface(fn=google_search_chat,
112
+ title="Search Assistant", retry_btn=None, undo_btn=None, clear_btn=None,
113
+ theme="soft")
114
+ demo.queue().launch(auth=('convo', 'session2024'), root_path='/convosearch', server_port=7866)
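
The chat handler above streams its answer by repeatedly yielding the text accumulated so far, which gr.ChatInterface renders as a progressively growing message. A minimal, self-contained sketch of that pattern (not part of this commit; token_stream is a hypothetical stand-in for response.response_gen or llm.stream_chat output):

import gradio as gr

def token_stream():
    # stand-in for a streaming LLM response generator
    for tok in ["Searching", " the", " web", " ..."]:
        yield tok

def streaming_chat(message, history):
    parts = []
    for tok in token_stream():
        parts.append(tok)
        # yield the accumulated text so far, exactly as google_search_chat does
        yield "".join(parts)

demo = gr.ChatInterface(fn=streaming_chat, title="Streaming sketch")
# demo.launch()
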
generate_response.py ADDED
@@ -0,0 +1,185 @@
1
+ import logging
2
+
3
+ from llama_index.core import ServiceContext, set_global_service_context, PromptTemplate
4
+ from llama_index.core.base.embeddings.base import BaseEmbedding
5
+ from llama_index.core.base.llms.base import BaseLLM
6
+ from llama_index.core.base.llms.generic_utils import messages_to_history_str
7
+ from llama_index.core.base.llms.types import ChatMessage, MessageRole
8
+ from llama_index.core.chat_engine.types import ChatMode
9
+ from llama_index.embeddings.mistralai import MistralAIEmbedding
10
+ from llama_index.embeddings.openai import OpenAIEmbedding
11
+ from llama_index.llms.mistralai import MistralAI
12
+ from llama_index.llms.openai import OpenAI
13
+
14
+ llm: BaseLLM
15
+ embed_model: BaseEmbedding
16
+ logger = logging.getLogger("agent_logger")
17
+
18
+
19
+ # TODO why is my system prompt being ignored?
20
+ def set_llm(model, key, temperature):
21
+ global llm
22
+ global embed_model
23
+
24
+ logger.info(f'Setting up LLM with {model} and associated embedding model...')
25
+
26
+ if "gpt" in model:
27
+ llm = OpenAI(api_key=key, temperature=temperature, model=model)
28
+ embed_model = OpenAIEmbedding(api_key=key)
29
+ elif "mistral" in model:
30
+ llm = MistralAI(api_key=key, model=model, temperature=temperature, safe_mode=True)
31
+ embed_model = MistralAIEmbedding(api_key=key)
32
+ else:
33
+ llm = OpenAI(api_key=key, model="gpt-3.5-turbo", temperature=0)
34
+ embed_model = OpenAIEmbedding(api_key=key)
35
+
36
+ # NOTE: ServiceContext is deprecated in newer llama_index releases; this should migrate to the Settings-based configuration
37
+ service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
38
+ set_global_service_context(service_context)
39
+
40
+
41
+ def get_llm():
42
+ return llm
43
+
44
+
45
+ def generate_query_response(index, message):
46
+ string_output = ""
47
+
48
+ logger.info("Creating query engine with index...")
49
+ query_engine = index.as_query_engine(streaming=True, chat_mode=ChatMode.CONDENSE_QUESTION)
50
+
51
+ logger.info(f'Input user message: {message}')
52
+ response = query_engine.query(message)
53
+
54
+ response_text = []
55
+ for text in response.response_gen:
56
+ response_text.append(text)
57
+ string_output = ''.join(response_text)
58
+ yield string_output
59
+ logger.info(f'Assistant response: {string_output}')
60
+
61
+
62
+ def generate_chat_response_with_history(message, history):
63
+ string_output = ""
64
+
65
+ messages = collect_history(message, history)
66
+
67
+ response = llm.stream_chat(messages)
68
+ response_text = []
69
+ for text in response:
70
+ response_text.append(text.delta)
71
+ string_output = ''.join(response_text)
72
+ yield string_output
73
+ logger.info(f'Assistant response: {string_output}')
74
+
75
+
76
+ def generate_chat_response_with_history_rag_return_response(index, message, history):
77
+ logger.info("Generating chat response with history and rag...")
78
+
79
+ messages = collect_history(message, history)
80
+
81
+ logger.info("Creating chat engine with index...")
82
+ query_engine = index.as_chat_engine(chat_mode=ChatMode.CONDENSE_QUESTION, streaming=True)
83
+ return query_engine.stream_chat(messages)
84
+
85
+
86
+ def generate_chat_response_with_history_rag_yield_string(index, message, history):
87
+ logger.info("Generating chat response with history and rag...")
88
+ string_output = ""
89
+
90
+ messages = collect_history(message, history)
91
+
92
+ logger.info("Creating chat engine with index...")
93
+ query_engine = index.as_chat_engine(chat_mode=ChatMode.CONDENSE_QUESTION, streaming=True)
94
+
95
+ response = query_engine.stream_chat(messages)
96
+
97
+ response_text = []
98
+ for text in response.response_gen:
99
+ response_text.append(text)
100
+ string_output = ''.join(response_text)
101
+ yield string_output
102
+
103
+ logger.info(f'Assistant response: {string_output}')
104
+
105
+
106
+ def is_greeting(message):
107
+ response = llm.complete(
108
+ f'Is the user message a greeting? Answer True or False only. For example: \n User message: "Hello" \n '
109
+ f'Assistant response: True \n User message "Where do pears grow?" Assistant response: False \n. User message: "{message}"')
110
+ if any(x in response.text.lower() for x in ["true", "yes", "is a greeting"]):
111
+ return True
112
+ return False
113
+
114
+
115
+ def is_closing(message):
116
+ # TODO
117
+ return False
118
+
119
+
120
+ def is_search_query(message):
121
+
122
+ response = llm.complete(
123
+ f'Is the user message a request for factual information? Answer True or False only. For example: \n User '
124
+ f'message: "Where do watermelons grow?" \n Assistant response: True \n User message "Do you like watermelons?" '
125
+ f'Assistant response: False \n. User message: "Hello" \n Assistant response: False \n User message: "My code '
126
+ f'is not working. How do I implement logging correctly in python?" \n Assistant response: True \n User '
127
+ f'message: "{message}"')
128
+ if any(x in response.text.lower() for x in ["true", "yes", "is a request"]):
129
+ logger.info(f'Message: {message} is a request...')
130
+ return True
131
+ return False
132
+
133
+
134
+ def collect_history(message, history):
135
+ logger.info(f'Input user message: {message}')
136
+
137
+ def message_generator():
138
+ messages = []
139
+ logger.info("Fetching message history...")
140
+ for message_pair in history:
141
+ if message_pair[0] is not None:
142
+ messages.append(ChatMessage(role=MessageRole.USER, content=message_pair[0]))
143
+ if message_pair[1] is not None:
144
+ messages.append(ChatMessage(role=MessageRole.ASSISTANT, content=message_pair[1]))
145
+ logger.info(f'{len(messages)} messages in message history...')
146
+ return messages
147
+
148
+ messages = message_generator()
149
+ messages.append(ChatMessage(role=MessageRole.USER, content=message))
150
+
151
+ return messages
152
+
153
+
154
+ def condense_question(message, history):
155
+ DEFAULT_TEMPLATE = """\
156
+ Given a conversation (between Human and Assistant) and a follow up message from Human, \
157
+ rewrite the message to be a standalone question that captures all relevant context \
158
+ from the conversation.
159
+
160
+ <Chat History>
161
+ {chat_history}
162
+
163
+ <Follow Up Message>
164
+ {question}
165
+
166
+ <Standalone question>
167
+ """
168
+ condense_question_prompt = PromptTemplate(DEFAULT_TEMPLATE)
169
+
170
+ messages = collect_history(message, history)
171
+ chat_history_str = messages_to_history_str(messages)
172
+
173
+ question = llm.predict(condense_question_prompt, question=message, chat_history=chat_history_str)
174
+
175
+ return question
176
+
177
+
178
+ def condense_context(context):
179
+ logger.info("Condensing input text with LLM complete...")
180
+
181
+ response = llm.complete(f'Rewrite the input to be a concise summary that captures '
182
+ f'all relevant context from the original text. \n'
183
+ f'Original Text: {context}')
184
+
185
+ return response.text
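
collect_history above turns Gradio's history (a list of [user, assistant] pairs) into llama_index ChatMessage objects and appends the new user message. A small usage sketch with hypothetical conversation content, assuming only that history follows the Gradio pair convention:

from llama_index.core.base.llms.types import ChatMessage, MessageRole

def pairs_to_messages(message, history):
    # mirrors collect_history: one USER/ASSISTANT ChatMessage per non-empty half of each pair
    messages = []
    for user_text, assistant_text in history:
        if user_text is not None:
            messages.append(ChatMessage(role=MessageRole.USER, content=user_text))
        if assistant_text is not None:
            messages.append(ChatMessage(role=MessageRole.ASSISTANT, content=assistant_text))
    messages.append(ChatMessage(role=MessageRole.USER, content=message))
    return messages

msgs = pairs_to_messages("And how large is it?", [["What is the capital of France?", "Paris."]])
# -> USER: "What is the capital of France?", ASSISTANT: "Paris.", USER: "And how large is it?"
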
llmsearch/google_search_concurrent.py ADDED
@@ -0,0 +1,698 @@
1
+ import concurrent.futures
2
+ import copy
3
+ import json
4
+ import logging
5
+ import sys
6
+ import time
7
+ # from PyPDF2 import PdfReader
8
+ import traceback
9
+ import urllib.parse as en
10
+ import warnings
11
+ from datetime import date
12
+ from itertools import zip_longest
13
+
14
+ import nltk
15
+ import requests
16
+ import selenium.common.exceptions
17
+ import wordfreq as wf
18
+ from selenium import webdriver
19
+ from selenium.webdriver.chrome.options import Options
20
+ from unstructured.partition.html import partition_html
21
+
22
+ from llmsearch import site_stats
23
+ from llmsearch import utilityV2 as ut
24
+
25
+ # from llmsearch import site_stats
26
+ # from llmsearch import utilityV2 as ut
27
+
28
+ logger = logging.getLogger("agent_logger")
29
+
30
+ today = " as of " + date.today().strftime("%b-%d-%Y") + "\n\n"
31
+
32
+ suffix = "\nA: "
33
+ client = "\nQ: "
34
+
35
+ QUICK_SEARCH = "quick"
36
+ NORMAL_SEARCH = "moderate"
37
+ DEEP_SEARCH = "deep"
38
+
39
+ # system_prime = {
40
+ # "role": "system",
41
+ # "content": "You analyze Text with respect to Query and list any relevant information found, including direct quotes from the text, and detailed samples or examples in the text.",
42
+ # }
43
+ priming_1 = {"role": "user", "content": "Query:\n"}
44
+
45
+
46
+ # priming_2 = {
47
+ # "role": "user",
48
+ # "content": "List relevant information in the provided text, including direct quotes from the text. If none, respond 'no information'.\nText:\n",
49
+ # }
50
+
51
+ def process_url_mod(query_phrase, url, timeout):
52
+ start_time = time.time()
53
+ site = ut.extract_site(url)
54
+ result = response = ""  # initialise both so the final log line is safe if the page fetch fails
55
+ try:
56
+ with warnings.catch_warnings():
57
+ warnings.simplefilter("ignore")
58
+ options = Options()
59
+ options.page_load_strategy = "eager"
60
+ options.add_argument("--headless")
61
+ result = ""
62
+ with webdriver.Chrome(options=options) as dr:
63
+ logger.info(f"*****setting page load timeout {timeout}")
64
+ dr.set_page_load_timeout(timeout)
65
+ try:
66
+ dr.get(url)
67
+ response = dr.page_source
68
+ result = response_text_extract_mod(url, response)
69
+ except selenium.common.exceptions.TimeoutException:
70
+ return "", url
71
+ except Exception:
72
+ traceback.print_exc()
73
+ logger.info(f"{site} err")
74
+ pass
75
+ logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
76
+ return result, url
77
+
78
+
79
+ # Define a function to make a single URL request and process the response
80
+ def process_url(query_phrase, keywords, keyword_weights, url, timeout):
81
+ start_time = time.time()
82
+ site = ut.extract_site(url)
83
+ result = ""
84
+ try:
85
+ with warnings.catch_warnings():
86
+ warnings.simplefilter("ignore")
87
+ options = Options()
88
+ options.page_load_strategy = "eager"
89
+ options.add_argument("--headless")
90
+ result = ""
91
+ with webdriver.Chrome(options=options) as dr:
92
+ logger.info(f"*****setting page load timeout {timeout}")
93
+ dr.set_page_load_timeout(timeout)
94
+ try:
95
+ dr.get(url)
96
+ response = dr.page_source
97
+ result = response_text_extract(
98
+ query_phrase,
99
+ keywords,
100
+ keyword_weights,
101
+ url,
102
+ response,
103
+ int(time.time() - start_time),
104
+ )
105
+ except selenium.common.exceptions.TimeoutException:
106
+ return "", url
107
+ except Exception:
108
+ traceback.print_exc()
109
+ logger.info(f"{site} err")
110
+ pass
111
+ # logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time()-start_time)*1000)} ms")
112
+ return result, url
113
+
114
+
115
+ def process_urls_mod(query_phrase, urls):
116
+ start_time = time.time()
117
+
118
+ response = []
119
+ logger.info("entering process urls")
120
+ full_text = ""
121
+ used_index = 0
122
+ urls_used = ["" for i in range(30)]
123
+ tried_index = 0
124
+ urls_tried = ["" for i in range(30)]
125
+ in_process = []
126
+ processed = []
127
+ google_futures = []
128
+
129
+ with concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor:
130
+ # initialize scan of google urls
131
+ while True:
132
+ try:
133
+ while len(urls) > 0:
134
+ timeout = 12 - int(time.time() - start_time)
135
+ recommendation = site_stats.get_next(
136
+ urls, sample_unknown=True
137
+ )
138
+ url = recommendation[1]
139
+ future = executor.submit(process_url_mod, query_phrase=query_phrase, url=url, timeout=timeout)
140
+ google_futures.append(future)
141
+ in_process.append(future)
142
+ urls_tried[tried_index] = url
143
+ tried_index += 1
144
+ urls.remove(url)
145
+ logger.info(f"queued {ut.extract_site(url)}, {timeout}")
146
+
147
+ for future in in_process:
148
+ if future.done():
149
+ result, url = future.result()
150
+ processed.append(future)
151
+ in_process.remove(future)
152
+ if len(result) > 0:
153
+ urls_used[used_index] = url
154
+ used_index += 1
155
+ result = result.replace(". .", ".")
156
+ logger.info(
157
+ f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
158
+ )
159
+ response.append(
160
+ {
161
+ "source": ut.extract_domain(url),
162
+ "url": url,
163
+ "text": result,
164
+ }
165
+ )
166
+ if time.time() - start_time > 28:
167
+ executor.shutdown(wait=False)
168
+ logger.info(
169
+ f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
170
+ )
171
+ return response, used_index, urls_used, tried_index, urls_tried
172
+ time.sleep(0.5)
173
+ except:
174
+ traceback.print_exc()
175
+ executor.shutdown(wait=False)
176
+ logger.info(
177
+ f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
178
+ )
179
+ return response, used_index, urls_used, tried_index, urls_tried
180
+
181
+
182
+ def process_urls(query_phrase, keywords, keyword_weights, urls, search_level):
183
+ # Create a ThreadPoolExecutor with 5 worker threads
184
+ response = []
185
+ logger.info("entering process urls")
186
+ start_time = time.time()
187
+ full_text = ""
188
+ used_index = 0
189
+ urls_used = ["" for i in range(30)]
190
+ tried_index = 0
191
+ urls_tried = ["" for i in range(30)]
192
+ start_time = time.time()
193
+ in_process = []
194
+ processed = []
195
+ google_futures = []
196
+ off_whitelist = False
197
+
198
+ with concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor:
199
+ # initialize scan of google urls
200
+ while True:
201
+ try:
202
+ while (
203
+ len(urls) > 0
204
+ # no sense starting if not much time left
205
+ and (
206
+ (
207
+ search_level == DEEP_SEARCH
208
+ and len(full_text) < 9600
209
+ and len(in_process) < 16
210
+ and time.time() - start_time < 14
211
+ )
212
+ or (
213
+ search_level == NORMAL_SEARCH
214
+ and len(full_text) < 6400
215
+ and len(in_process) < 14
216
+ and time.time() - start_time < 12
217
+ )
218
+ or (
219
+ search_level == QUICK_SEARCH
220
+ and len(full_text) < 4800
221
+ and len(in_process) < 10
222
+ and time.time() - start_time < 8
223
+ )
224
+ )
225
+ ):
226
+ recommendation = site_stats.get_next(
227
+ urls, sample_unknown=off_whitelist
228
+ )
229
+ if recommendation is None or len(recommendation) == 0:
230
+ off_whitelist = True
231
+ else:
232
+ # set timeout so we don't wait for a slow site forever
233
+ timeout = 12 - int(time.time() - start_time)
234
+ if search_level == NORMAL_SEARCH:
235
+ timeout = timeout + 4
236
+ url = recommendation[1]
237
+ future = executor.submit(
238
+ process_url,
239
+ query_phrase,
240
+ keywords,
241
+ keyword_weights,
242
+ url,
243
+ timeout,
244
+ )
245
+ # remaining_time = start_time+18-time.time()
246
+ # future.exception(remaining_time)
247
+ google_futures.append(future)
248
+ in_process.append(future)
249
+ urls_tried[tried_index] = url
250
+ tried_index += 1
251
+ urls.remove(url)
252
+ logger.info(f"queued {ut.extract_site(url)}, {timeout}")
253
+ # Process the responses as they arrive
254
+ for future in in_process:
255
+ if future.done():
256
+ result, url = future.result()
257
+ processed.append(future)
258
+ in_process.remove(future)
259
+ if len(result) > 0:
260
+ urls_used[used_index] = url
261
+ used_index += 1
262
+ result = result.replace(". .", ".")
263
+ logger.info(
264
+ f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
265
+ )
266
+ site = ut.extract_site(url)
267
+ domain = ut.extract_domain(url)
268
+ if domain.endswith("gov"):
269
+ credibility = "Official Source"
270
+ elif site in ut.sites.keys():
271
+ if ut.sites[site] > 0:
272
+ credibility = "Whitelisted Source"
273
+ else:  # zero or negative scores are treated as blacklisted
274
+ credibility = "Blacklisted Source"
275
+ else:
276
+ credibility = "Third-Party Source"
277
+
278
+ response.append(
279
+ {
280
+ "source": ut.extract_domain(url),
281
+ "url": url,
282
+ "credibility": credibility,
283
+ "text": result,
284
+ }
285
+ )
286
+
287
+ # openai seems to time out a plugin at about 30 secs, and there is probably 3-4 secs of overhead
288
+ if (
289
+ (len(urls) == 0 and len(in_process) == 0)
290
+ or (
291
+ search_level == DEEP_SEARCH
292
+ and (len(full_text) > 9600)
293
+ or time.time() - start_time > 42
294
+ )
295
+ or (
296
+ search_level == NORMAL_SEARCH
297
+ and (len(full_text) > 6400)
298
+ or time.time() - start_time > 32
299
+ )
300
+ or (
301
+ search_level == QUICK_SEARCH
302
+ and (len(full_text) > 4800)
303
+ or time.time() - start_time > 28
304
+ )
305
+ ):
306
+ executor.shutdown(wait=False)
307
+ logger.info(
308
+ f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
309
+ )
310
+ return response, used_index, urls_used, tried_index, urls_tried
311
+ time.sleep(0.5)
312
+ except:
313
+ traceback.print_exc()
314
+ executor.shutdown(wait=False)
315
+ logger.info(
316
+ f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
317
+ )
318
+ return response, used_index, urls_used, tried_index, urls_tried
319
+
320
+
321
+ def extract_subtext(text, query_phrase, keywords, keyword_weights):
322
+ ### maybe we should score based on paragraphs, not lines?
323
+ sentences = ut.reform(text)
324
+ # logger.info('***** sentences from reform')
325
+ # for sentence in sentences:
326
+ # logger.info(sentence)
327
+ sentence_weights = {}
328
+ final_text = ""
329
+ for sentence in sentences:
330
+ sentence_weights[sentence] = 0
331
+ for keyword in keywords:
332
+ if keyword in sentence or keyword.lower() in sentence:
333
+ if keyword in keyword_weights.keys():
334
+ sentence_weights[sentence] += keyword_weights[keyword]
335
+
336
+ # now pick out sentences starting with those with the most keywords
337
+ max_sentence_weight = 0
338
+ for keyword in keyword_weights.keys():
339
+ max_sentence_weight += keyword_weights[keyword]
340
+ # logger.info(f'******* max sentence weight {max_sentence_weight}')
341
+ for i in range(max_sentence_weight, 1, -1):
342
+ if len(final_text) > 6000 and i < max(
343
+ 1, int(max_sentence_weight / 4)
344
+ ): # make sure we don't miss any super-important text
345
+ return final_text
346
+ for sentence in sentences:
347
+ if len(final_text) + len(sentence) > 6001 and i < max(
348
+ 1, int(max_sentence_weight / 4)
349
+ ):
350
+ continue
351
+ if sentence_weights[sentence] == i:
352
+ final_text += sentence
353
+ # logger.info("relevant text", final_text)
354
+ # logger.info("keyword extract length:",len(final_text)) #, end='.. ')
355
+
356
+ return final_text
357
+
358
+
359
+ def search(query_phrase):
360
+ logger.info(f"***** search {query_phrase}")
361
+ sort = "&sort=date-sdate:d:w"
362
+ if "today" in query_phrase or "latest" in query_phrase:
363
+ sort = "&sort=date-sdate:d:s"
364
+ # logger.info(f"search for: {query_phrase}")
365
+ google_query = en.quote(query_phrase)
366
+ response = []
367
+ try:
368
+ start_wall_time = time.time()
369
+ url = (
370
+ "https://www.googleapis.com/customsearch/v1?key="
371
+ + ut.google_key
372
+ + "&cx="
373
+ + ut.google_cx
374
+ # was 10 results; reduced to 3 to keep search time down
375
+ + "&num=3"
376
+ + sort
377
+ + "&q="
378
+ + google_query
379
+ )
380
+ response = requests.get(url)
381
+ response_json = json.loads(response.text)
382
+ logger.info(f"***** google search {int((time.time() - start_wall_time) * 10) / 10} sec")
383
+ except:
384
+ traceback.print_exc()
385
+ return []
386
+
387
+ # see if we got anything useful from google
388
+ if "items" not in response_json.keys():
389
+ logger.info(f"no return from google ... {response} {response_json.keys()}")
390
+ # logger.info(google_query)
391
+ return []
392
+
393
+ # first try whitelist sites
394
+ urls = []
395
+ for i in range(len(response_json["items"])):
396
+ url = response_json["items"][i]["link"].lstrip().rstrip()
397
+ site = ut.extract_site(url)
398
+ if site not in ut.sites or ut.sites[site] == 1:
399
+ urls.append(url)
400
+ return urls
401
+
402
+
403
+ def log_url_process(site, reason, raw_text, extract_text, gpt_text):
404
+ return
405
+
406
+
407
+ """
408
+ # to record detailed logs of url processing unquote this function
409
+ def log_url_process(site, reason, raw_text, extract_text, gpt_text):
410
+ if len(raw_text) == 0 and len(extract_text)==0 and len(gpt_text) ==0:
411
+ return
412
+ try:
413
+ with open('google_log.txt', 'a') as lg:
414
+ lg.write('\n\n*************'+reason.upper()+'***********\n')
415
+ lg.write('*****************'+site+' RAW*************\n')
416
+ lg.write(raw_text)
417
+ lg.write('\n******************extract****************\n')
418
+ lg.write(extract_text)
419
+ lg.write('\n********************gpt******************\n')
420
+ lg.write(gpt_text)
421
+ except Exception:
422
+ traceback.print_exc()
423
+ """
424
+
425
+
426
+ def response_text_extract_mod(url, response):
427
+ extract_text = ""
428
+ if url.endswith("pdf"):
429
+ pass
430
+ else:
431
+ elements = partition_html(text=response)
432
+ str_elements = []
433
+ for e in elements:
434
+ stre = str(e).replace(" ", " ")
435
+ str_elements.append(stre)
436
+ extract_text = ut.reform(str_elements)
437
+ logger.info(
438
+ f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
439
+ )
440
+ if len(''.join(extract_text).strip()) < 8:
441
+ return ""
442
+ return extract_text
443
+
444
+
445
+ def response_text_extract(
446
+ query_phrase, keywords, keyword_weights, url, response, get_time
447
+ ):
448
+ curr = time.time()
449
+ text = ""
450
+ extract_text = ""
451
+ site = ut.extract_site(url)
452
+
453
+ if url.endswith("pdf"):
454
+ pass
455
+ else:
456
+ elements = partition_html(text=response)
457
+ str_elements = []
458
+ # logger.info('\n***** elements')
459
+ for e in elements:
460
+ stre = str(e).replace(" ", " ")
461
+ str_elements.append(stre)
462
+ extract_text = extract_subtext(
463
+ str_elements, query_phrase, keywords, keyword_weights
464
+ )
465
+ # logger.info('\n************ unstructured **********')
466
+ logger.info(
467
+ f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
468
+ )
469
+ url_text = text # save for final stats
470
+ new_curr = time.time()
471
+ extract_time = int((new_curr - curr) * 1000000)
472
+ if len(extract_text.strip()) < 8:
473
+ return ""
474
+
475
+ # now ask openai to extract answer
476
+ response_text = ""
477
+ curr = new_curr
478
+ extract_text = extract_text[:10000] # make sure we don't run over token limit
479
+ gpt_tldr_message = [
480
+ {
481
+ "role": "user",
482
+ "content": "Given:\n" + extract_text + "\n\nQuery:\n" + query_phrase,
483
+ }
484
+ ]
485
+ start_wall_time = time.time()
486
+ t_out = 12 - get_time
487
+ # logger.info(f'****** spawning page get with timeout {t_out}')
488
+ google_tldr = ut.ask_gpt_with_retries(
489
+ ut.MODEL, gpt_tldr_message, tokens=300, temp=0.3, timeout=t_out, tries=1
490
+ )
491
+ openai_time = int((time.time() - start_wall_time) * 10) / 10
492
+ logger.info(f"\n***** tldr {query_phrase}, {openai_time} sec")
493
+ logger.info(f'***** \n{extract_text}\n***** \n{google_tldr}\n*****\n')
494
+ url_text = url_text.replace("\n", ". ")
495
+ if google_tldr is None:
496
+ google_tldr = ""
497
+ response_text = google_tldr.lstrip()
498
+ prefix_text = response_text[: min(len(response_text), 96)].lower()
499
+ # openai sometimes returns a special format for 'no information'
500
+ if prefix_text.startswith("query:"):
501
+ text_index = response_text.find("Text:")
502
+ if text_index > 0:
503
+ response_text = response_text[text_index + 5:]
504
+ prefix_text = response_text[: min(len(response_text), 96)].lower()
505
+ if (
506
+ "no information" in prefix_text
507
+ or "i cannot provide" in prefix_text
508
+ or "as an ai language model" in prefix_text
509
+ or "does not provide" in prefix_text
510
+ or "it is not possible" in prefix_text
511
+ ):
512
+ # skip this summary, no info
513
+ logger.info(
514
+ "{} {}/{}/{}/{}".format(
515
+ site, len(response), len(url_text), len(extract_text), 0
516
+ )
517
+ )
518
+ # logger.info('************')
519
+ # logger.info(extract_text)
520
+ # logger.info('************')
521
+ sys.stdout.flush()
522
+ log_url_process(site, "no info", url_text, extract_text, "")
523
+ site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
524
+ return ""
525
+
526
+ if (
527
+ prefix_text.startswith("i'm sorry")
528
+ or prefix_text.startswith("there is no ")
529
+ or (
530
+ prefix_text.startswith("the provided text")
531
+ or prefix_text.startswith("i cannot")
532
+ or prefix_text.startswith("unfortunately")
533
+ or prefix_text.startswith("sorry")
534
+ or prefix_text.startswith("the text")
535
+ )
536
+ and (
537
+ "is not relevant" in prefix_text
538
+ or "no information" in prefix_text
539
+ or "does not provide" in prefix_text
540
+ or "does not contain" in prefix_text
541
+ or "no relevant information" in prefix_text
542
+ )
543
+ ):
544
+ # skip this summary, no info
545
+ log_url_process(site, "no info 2", url_text, extract_text, "")
546
+ logger.info(
547
+ "{} {}/{}/{}/{}".format(
548
+ site, len(response), len(url_text), len(extract_text), 0
549
+ )
550
+ )
551
+ ###logger.info('************')
552
+ ###logger.info(extract_text)
553
+ ###logger.info('************')
554
+ site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
555
+ return ""
556
+ else:
557
+ sentences = nltk.sent_tokenize(response_text)
558
+ response_text = ""
559
+ for sentence in sentences:
560
+ if (
561
+ "no inform" in sentence.lower()
562
+ or "no specific inform" in sentence.lower()
563
+ or "is unclear" in sentence.lower()
564
+ or "not mention" in sentence.lower()
565
+ or "not specifically mention" in sentence.lower()
566
+ ):
567
+ pass
568
+ else:
569
+ response_text += "\n \u2022 " + sentence + ". "
570
+ site_stats.update_site_stats(
571
+ site, len(response_text), get_time, extract_time, openai_time
572
+ )
573
+ # logger.info('\n',response_text)
574
+ log_url_process(site, "response", url_text, extract_text, response_text)
575
+ logger.info(
576
+ "{} {}/{}/{}/{}".format(
577
+ site,
578
+ len(response),
579
+ len(url_text),
580
+ len(extract_text),
581
+ len(response_text),
582
+ )
583
+ )
584
+ # logger.info('************')
585
+ # logger.info(google_tldr)
586
+ # logger.info('************ site response ***********')
587
+ # logger.info(response_text)
588
+ # logger.info('************')
589
+ return response_text + "\n"
590
+ site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
591
+ log_url_process(site, "no return", "", "", "")
592
+ logger.info(
593
+ "{} {}/{}/{}/{}".format(
594
+ site, len(response), len(url_text), len(extract_text), 0
595
+ )
596
+ )
597
+ ##logger.info('************')
598
+ ##logger.info(extract_text)
599
+ ##logger.info('************')
600
+ return ""
601
+
602
+
603
+ def extract_items_from_numbered_list(text):
604
+ items = ""
605
+ elements = text.split("\n")
606
+ for candidate in elements:
607
+ candidate = candidate.lstrip(". \t")
608
+ if len(candidate) > 4 and candidate[0].isdigit():
609
+ candidate = candidate[1:].lstrip(". ")
610
+ if (
611
+ len(candidate) > 4 and candidate[0].isdigit()
612
+ ): # strip second digit if more than 10 items
613
+ candidate = candidate[1:].lstrip(". ")
614
+ logger.info("E {}".format(candidate))
615
+ items += candidate + " "
616
+ return items
617
+
618
+
619
+ def search_google_mod(query_phrase):
620
+ full_text = ""
621
+ try:
622
+ gpt_phrase_urls = []
623
+ if len(query_phrase) > 0:
624
+ gpt_phrase_urls = search(query_phrase)
625
+ full_text = process_urls_mod(query_phrase, gpt_phrase_urls)  # NOTE: returns the full (response, used_index, urls_used, tried_index, urls_tried) tuple
626
+ logger.info("return from url processing")
627
+ except:
628
+ traceback.print_exc()
629
+ return full_text
630
+
631
+
632
+ def search_google(original_query, search_level, query_phrase, keywords, chat_history):
633
+ start_time = time.time()
634
+ all_urls = []
635
+ urls_used = []
636
+ urls_tried = []
637
+ index = 0
638
+ tried_index = 0
639
+ full_text = ""
640
+ keyword_weights = {}
641
+ for keyword in keywords:
642
+ zipf = wf.zipf_frequency(keyword, "en")
643
+ weight = max(0, int((8 - zipf)))
644
+ if weight > 0:
645
+ keyword_weights[keyword] = weight
646
+ logger.info(f"keyword {keyword} wf.ziff {zipf} weight {weight}")
647
+ subwds = keyword.split(" ")
648
+ if len(subwds) > 1:
649
+ for subwd in subwds:
650
+ sub_z = wf.zipf_frequency(subwd, "en")
651
+ sub_wgt = max(0, int((8 - sub_z) * 1 / 2))  # use the sub-word's own frequency, not the full phrase's
652
+ if sub_wgt > 0:
653
+ keyword_weights[subwd] = sub_wgt
654
+ logger.info(f"keyword {subwd} weight {sub_wgt}")
655
+
656
+ try: # query google for recent info
657
+ sort = ""
658
+ if "today" in original_query or "latest" in original_query:
659
+ original_query = today.strip("\n") + " " + original_query
660
+ extract_query = ""
661
+ orig_phrase_urls = []
662
+ if len(original_query) > 0:
663
+ orig_phrase_urls = search(original_query[: min(len(original_query), 128)])
664
+ extract_query = original_query[: min(len(original_query), 128)]
665
+ gpt_phrase_urls = []
666
+ if len(query_phrase) > 0:
667
+ gpt_phrase_urls = search(query_phrase)
668
+ extract_query = (
669
+ query_phrase # prefer more succinct query phrase if available
670
+ )
671
+ if len(orig_phrase_urls) == 0 and len(gpt_phrase_urls) == 0:
672
+ return "", [], 0, [""], 0, [""]
673
+
674
+ for url in orig_phrase_urls:
675
+ if url in gpt_phrase_urls:
676
+ gpt_phrase_urls.remove(url)
677
+
678
+ # interleave both lists now that duplicates are removed
679
+ urls = [
680
+ val
681
+ for tup in zip_longest(orig_phrase_urls, gpt_phrase_urls)
682
+ for val in tup
683
+ if val is not None
684
+ ]
685
+ # urls = [val for tup in zip_longest(urls, kwd_phrase_urls) for val in tup if val is not None]
686
+ all_urls = copy.deepcopy(urls)
687
+ # initialize scan of google urls
688
+ # compute keyword weights
689
+ start_wall_time = time.time()
690
+ full_text, index, urls_used, tried_index, urls_tried = process_urls(
691
+ extract_query, keywords, keyword_weights, all_urls, search_level
692
+ )
693
+ site_stats.ckpt()
694
+ logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
695
+ # logger.info("return from url processing")
696
+ except:
697
+ traceback.print_exc()
698
+ return full_text, all_urls, index, urls_used, tried_index, urls_tried
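
search_google above weights keywords by rarity using wordfreq's Zipf scale (roughly 0 for very rare words up to about 8 for the most common), so weight = 8 - zipf boosts distinctive terms and drops stop-words; extract_subtext then scores sentences by the summed weights of the keywords they contain. A small sketch of that weighting, with illustrative inputs:

import wordfreq as wf

def keyword_weights(keywords):
    # rarer words get larger weights; very common words fall to 0 and are dropped
    weights = {}
    for keyword in keywords:
        zipf = wf.zipf_frequency(keyword, "en")
        weight = max(0, int(8 - zipf))
        if weight > 0:
            weights[keyword] = weight
    return weights

print(keyword_weights(["the", "Tesla", "Full Self Driving"]))
# "the" is too common to score; the rarer terms dominate sentence scoring in extract_subtext
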
llmsearch/meta.py ADDED
@@ -0,0 +1,357 @@
1
+ from llmsearch import utilityV2 as ut, google_search_concurrent as gs
2
+ import re
3
+ import time
4
+
5
+ ABORT = False
6
+ CONTINUE = True
7
+ history = []
8
+
9
+
10
+ class history_entry:
11
+ def __init__(self, turn, vector=None):
12
+ self.message = turn.message.lower()
13
+ self.role = turn.role
14
+
15
+ def equal(self, he2):
16
+ return self.message == he2.message and self.role == he2.role
17
+
18
+
19
+ def add(turn):
20
+ he = history_entry(turn)
21
+ history.append(he)
22
+
23
+
24
+ def is_metaCyclic(turn):
25
+ he = history_entry(turn)
26
+ count = 0
27
+ for prior_he in history:
28
+ if he.equal(prior_he):
29
+ count += 1
30
+ return count > 1
31
+
32
+
33
+ def is_cyclic(turn):
34
+ he = history_entry(turn)
35
+ for prior_he in history:
36
+ if he.equal(prior_he):
37
+ return True
38
+ return False
39
+
40
+
41
+ def clear():
42
+ global history
43
+ history = []
44
+ return
45
+
46
+
47
+ def test_history():
48
+ he1 = history_entry(ut.turn(role="assistant", message="who is Noriel Roubini"))
49
+ he2 = history_entry(ut.turn(role="assistant", message="who was Noriel Roubini"))
50
+ he3 = history_entry(ut.turn(role="assistant", message="who was Nsriel Roubini"))
51
+ he4 = history_entry(ut.turn(role="assistant", message="where is the Pinnacles"))
52
+ for hea in (he1, he2, he3, he4):
53
+ for heb in (he1, he2, he3, he4):
54
+ print(cosine(hea, heb))  # NOTE: cosine() is not defined in this module; test_history() will fail as written
55
+
56
+
57
+ def test_parse_decomp():
58
+ test_text = """<Subquery 1>? What is the birthplace of Hugh Jackman?
59
+ <Subquery 2>? What is the Japanese name of the birthplace of Hugh Jackman?
60
+ <Keywords 1>: Hugh Jackman, birthplace
61
+ <Keywords 2>: Japanese name, birthplace, Hugh Jackman"""
62
+
63
+ decomp = parse_decomposition(test_text)
64
+ for subquery in decomp:
65
+ print("Subquery\n", subquery)
66
+
67
+
68
+ def parse_decomposition(text):
69
+ ### expecting:
70
+ ### <Subquery 1>
71
+ ### Birthplace of Hugh Jackman
72
+ ### <Subquery 2>
73
+ ### Japanese name of Birthplace of Hugh Jackman
74
+ ### note that 'Birthplace of Hugh Jackman' operates as both a string google query and a variable in subsequent occurrences
75
+ subquery_indecies = re.finditer(
76
+ "<Subquery", text
77
+ ) # Action: Ask {Google, User} "query"
78
+ subqueries = []
79
+ for index in subquery_indecies:
80
+ hdr_end = text[index.start() :].find(">") + index.start()
81
+ query_start = hdr_end + 1
82
+ query_end = text[query_start:].find("<")
83
+ if query_end < 0:
84
+ query = text[query_start:].strip()
85
+ else:
86
+ query = text[query_start : query_start + query_end].lstrip("?").strip()
87
+ print("Query:", query)
88
+ subqueries.append(query)
89
+ return subqueries
90
+
91
+
92
+ def query_keywords(query):
93
+ start_wall_time = time.time()
94
+ gpt_key_message = [
95
+ {
96
+ "role": "user",
97
+ "content": "Extract keywords and named-entities from the following text.",
98
+ },
99
+ {"role": "user", "content": query},
100
+ ]
101
+ # for item in gpt_key_message:
102
+ # print(item)
103
+ gpt_parse = ut.ask_gpt_with_retries(
104
+ "gpt-3.5-turbo", gpt_key_message, tokens=25, temp=0, timeout=5, tries=2
105
+ )
106
+ # print(f'\n***** keywords and named-entities {gpt_parse}')
107
+ # parse result Keywords: {comma separated list}\n\nNamed-entities: {comma-separated-list}
108
+ keywords = []
109
+ # do named entities first, they might be compounds of keywords
110
+ ne_start = gpt_parse.find("Named-entities")
111
+ print(f"***** keyword extract {int((time.time()-start_wall_time)*10)/10} sec")
112
+ if ne_start > 0:
113
+ nes = gpt_parse[ne_start + len("Named-entities") + 1 :].split(
114
+ ","
115
+ ) # assume string ends with colon or space:].split(',')
116
+ # print(f'Named-entity candidates {nes}')
117
+ for ne in nes:
118
+ ne = ne.strip(" .,;:\n")
119
+ # print(f' appending {ne}')
120
+ if ne != "None":
121
+ keywords.append(ne)
122
+ else:
123
+ ne_start = len(gpt_parse) + 1
124
+ kwd_start = gpt_parse.find("Keywords")
125
+ if kwd_start > -1:
126
+ kwds = gpt_parse[kwd_start + len("Keywords") + 1 : ne_start].split(",")
127
+ # print(f'Keyword candidates {kwds}')
128
+ for kwd in kwds:
129
+ kwd = kwd.strip(" .\n,;:")
130
+ skip = False
131
+ for kwd2 in keywords:
132
+ if kwd in kwd2:
133
+ skip = True
134
+ if not skip:
135
+ # print('appending', kwd)
136
+ keywords.append(kwd)
137
+ # else: print("Keywords index < 0")
138
+ if len(keywords) > 0:
139
+ print(f"***** query_keywords found keywords {keywords}")
140
+ return keywords
141
+ # fallback - just use query words
142
+ candidates = query.split(" ")
143
+ for candidate in candidates:
144
+ candidate = candidate.strip()
145
+ if len(candidate) > 2:
146
+ keywords.append(candidate)
147
+ # print(f'***** query_keywords using default keywords {keywords}')
148
+ return keywords
149
+
150
+
151
+ def substitute(Q1, A1, Q2, debug=False):
152
+ gpt_sub_message = [
153
+ {
154
+ "role": "user",
155
+ "content": "replace '" + Q1 + "' with '" + A1 + "' in '" + Q2 + "'",
156
+ }
157
+ ]
158
+ if debug:
159
+ print("\n\n**************")
160
+ for item in gpt_sub_message:
161
+ print(item)
162
+ google_tldr = ut.ask_gpt_with_retries(
163
+ "gpt-3.5-turbo", gpt_sub_message, tokens=25, temp=0.1, timeout=5, tries=2
164
+ )
165
+ print("\n\n**************")
166
+ if len(google_tldr) == 0 or "no information" in google_tldr:
167
+ print("Returning original Q2")
168
+ return Q2
169
+ print("Substituted", Q2, google_tldr)
170
+ return google_tldr
171
+
172
+
173
+ def meta(query, chat_history, debug=False):
174
+ print("***** entering meta")
175
+ turn = ut.turn(
176
+ role=ut.ASSISTANT, source=ut.ASSISTANT, message='Action: search "' + query + '"'
177
+ )
178
+ if is_metaCyclic(turn):
179
+ return [], ABORT
180
+
181
+ prompt = """Decompose a compound <Query> into two smaller <Subquery>. Use the following format for output:
182
+ <Subquery 1>
183
+ <Subquery 2>"""
184
+ gpt_message = [
185
+ {"role": "user", "content": prompt},
186
+ {"role": "user", "content": "<Query>\n" + query},
187
+ ]
188
+ response_text = ""
189
+ completion = None
190
+ if debug:
191
+ for role in gpt_message:
192
+ print(role)
193
+ print("starting gpt decomp query")
194
+ response_text = ut.ask_gpt_with_retries(
195
+ "gpt-3.5-turbo", gpt_message, tokens=75, temp=0.1, timeout=5, tries=2
196
+ )
197
+ if debug:
198
+ print(f"initial gpt query response:\n{response_text}")
199
+ print("**** executing decomp ****")
200
+ subqueries = parse_decomposition(response_text)
201
+ meta_chat_history = []
202
+ prev_tldr = ""
203
+ google_tldr = ""
204
+ for n, subquery in enumerate(subqueries):
205
+ # do variable substitution into subquery
206
+ # ask google
207
+ # send google results as notes plus subquery to gpt to extract <answer i>
208
+ # return chat history extended with each subquery and its answer
209
+ # (or maybe just all google notes, let next level down do the rest?)
210
+ # bad idea, can exceed token limit!
211
+ if debug:
212
+ print(f'subquery {n}, "{subquery}"')
213
+ if n > 0:
214
+ subquery = substitute(subqueries[n - 1], prev_tldr, subquery)
215
+ keyword_set = query_keywords(subquery)
216
+
217
+ keyword_set = query_keywords(subquery)
218
+ print("*****Executing subquery", subquery, "\n with keywords", keyword_set)
219
+ gpt_initial_message = [
220
+ {
221
+ "role": "user",
222
+ "content": subquery + " If fact is unavailable, respond: 'Unknown'",
223
+ }
224
+ ]
225
+
226
+ # for turn in meta_chat_history:
227
+ # gpt_initial_message.append({"role":"user","content":turn.tldr})
228
+
229
+ initial_gpt_answer = ut.ask_gpt_with_retries(
230
+ "gpt-3.5-turbo",
231
+ gpt_initial_message,
232
+ tokens=25,
233
+ temp=0.0,
234
+ timeout=5,
235
+ tries=2,
236
+ )
237
+ if debug:
238
+ print(f"***** google extract\n {initial_gpt_answer}\n")
239
+ if (
240
+ "unknown" not in initial_gpt_answer.lower()
241
+ and "cannot provide" not in initial_gpt_answer
242
+ and "do not have access" not in initial_gpt_answer
243
+ ):
244
+ meta_chat_history.append(
245
+ ut.turn(
246
+ role="assistant",
247
+ message=subquery,
248
+ source=ut.ASSISTANT,
249
+ tldr=subquery,
250
+ keywords=keyword_set,
251
+ )
252
+ )
253
+ meta_chat_history.append(
254
+ ut.turn(
255
+ role="assistant",
256
+ message="<note>\n" + initial_gpt_answer + "\n<note>",
257
+ source=ut.GOOGLE,
258
+ tldr=initial_gpt_answer,
259
+ keywords=keyword_set,
260
+ )
261
+ )
262
+ prev_tldr = initial_gpt_answer
263
+ print(f"***** Answer to {subquery}: {initial_gpt_answer}\n")
264
+ google_tldr = initial_gpt_answer
265
+ continue
266
+ # ask google
267
+ (
268
+ google_text,
269
+ urls_all,
270
+ index,
271
+ urls_used,
272
+ tried_index,
273
+ urls_tried,
274
+ ) = gs.search_google(
275
+ subquery,
276
+ gs.QUICK_SEARCH,
277
+ "",
278
+ ut.INFORMATION_QUERY,
279
+ keyword_set,
280
+ meta_chat_history,
281
+ )
282
+ if len(google_text) > 0:
283
+ # digest google response into an answer for this subquery
284
+ if debug:
285
+ print(f"***** search result\n{google_text}\n")
286
+ gpt_tldr_message = [
287
+ {
288
+ "role": "user",
289
+ "content": 'Summarize the set of <note> provided, including only the direct answer to <Query>. Do not include any qualifiers or modifiers from the <Query> such as "where x was born".',
290
+ },
291
+ {"role": "user", "content": google_text},
292
+ {"role": "user", "content": "<Query>\n" + subquery},
293
+ ]
294
+ # for turn in meta_chat_history:
295
+ # gpt_tldr_message.append({"role":"user","content":turn.tldr})
296
+
297
+ google_tldr = ut.ask_gpt_with_retries(
298
+ "gpt-3.5-turbo",
299
+ gpt_tldr_message,
300
+ tokens=150,
301
+ temp=0.1,
302
+ timeout=5,
303
+ tries=2,
304
+ )
305
+ # print('\n\n**************')
306
+ # for item in gpt_tldr_message:
307
+ # print(item)
308
+ print(f"***** Answer to {subquery}: {google_tldr}\n")
309
+ meta_chat_history.append(
310
+ ut.turn(
311
+ role="assistant",
312
+ message=subquery,
313
+ source=ut.ASSISTANT,
314
+ tldr=subquery,
315
+ keywords=keyword_set,
316
+ )
317
+ )
318
+ meta_chat_history.append(
319
+ ut.turn(
320
+ role="assistant",
321
+ message="Observation: " + google_tldr,
322
+ source=ut.GOOGLE,
323
+ tldr=google_tldr,
324
+ keywords=keyword_set,
325
+ )
326
+ )
327
+ prev_tldr = google_tldr
328
+ # print(f"\n******meta return: {google_tldr} *****\n")
329
+ return meta_chat_history, CONTINUE
330
+
331
+
332
+ if __name__ == "__main__":
333
+ # test_parse_decomp()
334
+ # meta("what is the Japanese name of the birthplace of Hugh Jackman", [])
335
+ # meta("What is the capital of the birthplace of Levy Mwanawasa?",[])
336
+ # meta("What is the (rounded down) latitude of the birthplace of Ferenc Puskas?",[])
337
+ # meta("What is the (rounded down) longitude of the birthplace of Juliane Koepcke?",[])
338
+ # meta("What is the top-level domain of the birthplace of Norodom Sihamoni?",[])
339
+ # meta("What is the 3166-1 numeric code for the birthplace of Gilgamesh?",[])
340
+ # meta("What is the currency in the birthplace of Joel Campbell?",[])
341
+ # meta("What is the currency abbreviation in the birthplace of Antonio Valencia?",[])
342
+ # meta("What is the currency symbol in the birthplace of Marek Hamšík?",[])
343
+ # meta("What is the Japanese name of the birthplace of Hugh Jackman?",[])
344
+ # meta("What is the Spanish name of the birthplace of Frédéric Chopin?",[])
345
+ # meta("What is the Russian name of the birthplace of Confucius?",[])
346
+ # meta("What is the Estonian name of the birthplace of Kofi Annan?",[])
347
+ # meta("What is the Urdu name of the birthplace of Nicki Minaj?",[])
348
+ # meta("What is the calling code of the birthplace of Milla Jovovich?",[])
349
+ # meta("Who was the champion of the Masters Tournament in the year that Bob Dylan was born?",[])
350
+ # meta("Who won the Nobel Prize in Literature in the year Matt Damon was born?",[])
351
+ # meta("Who was the President of the United States when Sting was born?",[])
352
+ meta(
353
+ "What are the latest reviewer opinions on Tesla Full Self Driving Beta version 11.3.4?",
354
+ [],
355
+ debug=True,
356
+ )
357
+ meta("Michael D'Ambrosio Hound Labs", [], debug=True)
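
parse_decomposition above expects the LLM to answer in the <Subquery n> format shown in test_parse_decomp and collects each subquery body up to the next '<' tag. A compressed, standalone sketch of the same parsing idea (not the module's own implementation):

import re

def parse_subqueries(text):
    # collect the text after each "<Subquery n>" header up to the next "<" tag
    subqueries = []
    for m in re.finditer("<Subquery", text):
        body = text[text.index(">", m.start()) + 1:]
        nxt = body.find("<")
        if nxt >= 0:
            body = body[:nxt]
        subqueries.append(body.lstrip("?").strip())
    return subqueries

sample = """<Subquery 1>? What is the birthplace of Hugh Jackman?
<Subquery 2>? What is the Japanese name of the birthplace of Hugh Jackman?"""
print(parse_subqueries(sample))
# ['What is the birthplace of Hugh Jackman?',
#  'What is the Japanese name of the birthplace of Hugh Jackman?']
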
llmsearch/show_site_stats.py ADDED
@@ -0,0 +1,56 @@
1
+ import json
2
+ import sys
3
+
4
+ site_stats = {}  # initialize dictionary of sites used
5
+
6
+ try:
7
+ with open("site_stats.json", "r") as f:
8
+ site_stats = json.loads(f.read())
9
+ except:
10
+ print("Failed to read site_stats.")
11
+
12
+ sites = {}  # initialize dictionary of sites used
13
+ try:
14
+ with open("sites.json", "r") as f:
15
+ sites = json.loads(f.read())
16
+ except:
17
+ print("Failed to read sites.")
18
+
19
+ site_list = []
20
+ for site in site_stats.keys():
21
+ site_list.append(site_stats[site])
22
+
23
+ # sort sites by quality: characters extracted per unit of processing time
24
+ site_list.sort(
25
+ reverse=True,
26
+ key=lambda item: (
27
+ item["chars"] / (max(1000, item["get"] + item["extract"] + item["openai"]))
28
+ ),
29
+ )
30
+
31
+ for site in site_list:
32
+ total_time = max(1000, site["get"] + site["extract"] + site["openai"])
33
+ if "hits" in site.keys():
34
+ hits = site["hits"]
35
+ else:
36
+ hits = 0
37
+ quality = int((site["chars"] * 1000000) / total_time)
38
+ arg = ""
39
+ if len(sys.argv) > 1:
40
+ arg = sys.argv[1]
41
+ print_site = False
42
+ if "new" in arg:
43
+ if quality > 0 and site["name"] not in sites.keys():
44
+ print_site = True
45
+ elif len(sys.argv) == 1 and quality > 0:
46
+ print_site = True
47
+ elif "all" in arg:
48
+ print_site = True
49
+ elif quality > 0:
50
+ print_site = True
51
+ if print_site:
52
+ print(site["name"], hits, site["chars"], quality, end="")
53
+ if quality > 0 and site["name"] not in sites.keys():
54
+ print("*")
55
+ else:
56
+ print()
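
The ranking above boils down to a single quality score per site: characters of useful text divided by the accumulated processing time (get + extract + openai), with the denominator floored at 1000 so sparsely sampled sites do not blow up. A short sketch of that computation with hypothetical stats:

def site_quality(stats):
    # characters of useful text per unit of accumulated processing time
    total_time = max(1000, stats["get"] + stats["extract"] + stats["openai"])
    return int((stats["chars"] * 1000000) / total_time)

example = {"name": "example", "hits": 4, "chars": 5200, "get": 9, "extract": 1200, "openai": 3}
print(site_quality(example))  # 5200 * 1_000_000 / 1212 -> 4290429
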
llmsearch/site_stats.py ADDED
@@ -0,0 +1,124 @@
1
+ import json
2
+ import random
3
+ import traceback
4
+
5
+ from llmsearch import utilityV2 as ut
6
+
7
+
8
+ def findnth(haystack, needle, n):
9
+ parts = haystack.split(needle, n + 1)
10
+ if len(parts) <= n + 1:
11
+ return -1
12
+ return len(haystack) - len(parts[-1]) - len(needle)
13
+
14
+
15
+ def extract_site(url):
16
+ site = ""
17
+ base = findnth(url, "/", 2)
18
+ if base > 2:
19
+ site = url[:base].split(".")
20
+ if len(site) > 1:
21
+ site = site[-2]
22
+ site = site.replace("https://", "")
23
+ site = site.replace("http://", "")
24
+ return site
25
+
26
+
27
+ site_stats = {}  # initialize dictionary of sites used
28
+ stats_loaded = False
29
+ stats_dirty = False
30
+
31
+
32
+ def open_site_stats():
33
+ global site_stats, stats_loaded, stats_dirty
34
+ if stats_loaded:
35
+ return
36
+ try:
37
+ with open("site_stats.json", "r") as f:
38
+ site_stats = json.loads(f.read())
+ stats_loaded = True  # mark as loaded so later calls don't re-read the file and clobber in-memory updates
39
+ except:
40
+ print("Failed to read site_stats.json")
41
+ traceback.print_exc()
42
+
43
+
44
+ def ckpt():
45
+ global site_stats, stats_dirty
46
+ if not stats_dirty:
47
+ return
48
+ try:
49
+ with open("site_stats.json", "w") as ss:
50
+ ss.write(json.dumps(site_stats))
51
+ stats_dirty = False
52
+ except Exception as e:
53
+ print(f"Failed to write site_stats: {str(e)}")
54
+ traceback.print_exc()
55
+
56
+
57
+ def update_site_stats(site, char_cnt, get_time, extract_time, openai_time):
58
+ global site_stats, stats_dirty
59
+ open_site_stats()
60
+ if site not in site_stats:
61
+ site_stats[site] = {
62
+ "name": site,
63
+ "hits": 0,
64
+ "chars": 0,
65
+ "get": 0,
66
+ "extract": 0,
67
+ "openai": 0,
68
+ }
69
+ if "hits" not in site_stats[site]:
70
+ site_stats[site]["hits"] = 0
71
+ site_stats[site]["hits"] = site_stats[site]["hits"] + 1
72
+ site_stats[site]["chars"] = char_cnt + site_stats[site]["chars"]
73
+ site_stats[site]["get"] = get_time + site_stats[site]["get"]
74
+ site_stats[site]["extract"] = extract_time + site_stats[site]["extract"]
75
+ site_stats[site]["openai"] = openai_time + site_stats[site]["openai"]
76
+ stats_dirty = True
77
+ # print("updated", site_stats[site])
78
+
79
+
80
+ def retrieve(site):
81
+ global site_stats
82
+ if site not in site_stats:
83
+ site_stats[site] = {
84
+ "name": site,
85
+ "hits": 0,
86
+ "chars": 0,
87
+ "get": 0,
88
+ "extract": 0,
89
+ "openai": 0,
90
+ }
91
+ return site_stats[site]
92
+
93
+
94
+ def get_next(urls, sample_unknown=False):
95
+ global site_stats
96
+ # retrieve stats for sites in list
97
+ candidates = []
98
+ for url in urls:
99
+ site = extract_site(url)
100
+ candidate = retrieve(site)
101
+ if sample_unknown or (site in ut.sites and ut.sites[site] != 0):
102
+ candidates.append((candidate, url))
103
+ if len(candidates) == 0:
104
+ return []
105
+ if len(candidates) == 1:
106
+ return candidates[0]
107
+ # random or ordered? if random, pick without sorting
108
+ if random.random() > 0.85:
109
+ pick = int(random.random() * len(candidates))
110
+ return candidates[pick]
111
+
112
+ # otherwise sort by quality and pick with a bias toward the best-scoring sites
113
+ candidates.sort(
114
+ reverse=True,
115
+ key=lambda item: (
116
+ (item[0]["chars"] * 1000000)
117
+ / (max(1000, item[0]["get"] + item[0]["extract"] + item[0]["openai"]))
118
+ ),
119
+ )
120
+
121
+ # now pick top from sort
122
+ p = random.random()
123
+ p2 = p * p * p
124
+ return candidates[int(p2 * len(candidates))]
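
get_next balances exploitation and exploration: about 15% of the time it picks a candidate uniformly at random, otherwise it sorts by the quality score and indexes with a cubed uniform draw, which lands on the best-scoring site roughly 40% of the time while still occasionally sampling the tail. A small standalone sketch of that selection bias:

import random

def skewed_pick(candidates):
    if random.random() > 0.85:
        # ~15%: uniform exploration over all candidates
        return candidates[int(random.random() * len(candidates))]
    p = random.random()
    # cubing the draw pushes most picks toward index 0, the best-scoring candidate
    return candidates[int(p ** 3 * len(candidates))]

random.seed(0)
picks = [skewed_pick(list(range(10))) for _ in range(10000)]
print(sum(1 for i in picks if i == 0) / len(picks))  # roughly 0.4 with 10 candidates
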
llmsearch/utilityV2.py ADDED
@@ -0,0 +1,358 @@
1
+ import json
2
+ import linecache
3
+ import logging
4
+ import re
5
+ import traceback
6
+ import tracemalloc
7
+
8
+ import nltk
9
+ import openai
10
+ # from tenacity import (retry,stop_after_attempt,stop_after_delay, wait_random_exponential)
11
+ from tenacity import *
12
+
13
+ # from agents.utils import read_file
14
+ from utils import read_file
15
+
16
+ logger = logging.getLogger("agent_logger")
17
+ openai.api_key = read_file("../keys/gpt_api_key.txt")
18
+ # paid and ad free
19
+ google_key = read_file("../keys/google_search_api_key.txt")
20
+ # cx: The identifier of the Programmable Search Engine.
21
+ google_cx = read_file("../keys/google_cx_api_key.txt")
22
+ GOOGLE = "google"
23
+ USER = "user"
24
+ ASSISTANT = "assistant"
25
+
26
+ MODEL = "gpt-3.5-turbo"
27
+
28
+ sites = {}  # initialize dictionary of sites used
29
+ new_sites = {}  # initialize dictionary of new sites used
30
+ try:
31
+ with open("sites", "r") as f:
32
+ sites = json.loads(f.read())
33
+ except:
34
+ print("Failed to read sites.")
35
+
36
+
37
+ # for experimenting with Vicuna
38
+
39
+
40
+ def display_top(snapshot, key_type="lineno", limit=10):
41
+ snapshot = snapshot.filter_traces(
42
+ (
43
+ tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
44
+ tracemalloc.Filter(False, "<unknown>"),
45
+ )
46
+ )
47
+ top_stats = snapshot.statistics(key_type)
48
+
49
+ logger.info("Top %s lines" % limit)
50
+ for index, stat in enumerate(top_stats[:limit], 1):
51
+ frame = stat.traceback[0]
52
+ logger.info(
53
+ "#%s: %s:%s: %.1f KiB"
54
+ % (index, frame.filename, frame.lineno, stat.size / 1024)
55
+ )
56
+ line = linecache.getline(frame.filename, frame.lineno).strip()
57
+ if line:
58
+ logger.info(" %s" % line)
59
+
60
+ other = top_stats[limit:]
61
+ if other:
62
+ size = sum(stat.size for stat in other)
63
+ logger.info("%s other: %.1f KiB" % (len(other), size / 1024))
64
+ total = sum(stat.size for stat in top_stats)
65
+ logger.info("Total allocated size: %.1f KiB" % (total / 1024))
66
+
67
+
68
+ class turn:
69
+ def __init__(self, role="assistant", message="", tldr="", source="", keywords=None):
70
+ self.role = role
71
+ self.message = message
72
+ self.tldr = tldr
73
+ self.source = source
74
+ self.keywords = keywords if keywords is not None else []  # avoid shared mutable default
75
+
76
+ def __str__(self):
77
+ s = ""
78
+ if self.role is not None and len(self.role) > 0:
79
+ s = s + "r: " + self.role
80
+ if self.message is not None and len(self.message) > 0:
81
+ s = s + " m: " + self.message
82
+ if self.source is not None and len(self.source) > 0:
83
+ s = s + " s: " + self.source
84
+ if self.tldr is not None and len(self.tldr) > 0:
85
+ s = s + "tldr: " + self.tldr
86
+ return s
87
+
88
+ def is_google_turn(self):
89
+ return self.source is not None and self.source == GOOGLE
90
+
91
+ def is_user_turn(self):
92
+ return self.source is not None and self.source == USER
93
+
94
+ def is_assistant_turn(self):
95
+ return self.source is not None and self.source == ASSISTANT
96
+
97
+
98
+ # @retry(wait=wait_random_exponential(min=1, max=2), stop=(stop_after_delay(15) | stop_after_attempt(2)))
99
+ def chatCompletion_with_backoff(**kwargs):
100
+ return openai.ChatCompletion.create(**kwargs)
101
+
102
+
103
+ def ask_gpt(model, gpt_message, max_tokens, temp, top_p):
104
+ completion = None
105
+ try:
106
+ completion = openai.chat.completions.create(
107
+ model=model,
108
+ messages=gpt_message,
109
+ max_tokens=max_tokens,
110
+ temperature=temp,
111
+ top_p=top_p,
112
+ )
113
+ except:
114
+ traceback.print_exc()
115
+ if completion is not None:
116
+ response = completion.choices[0].message.content.lstrip(" ,:.")
117
+ logger.info(response)
118
+ return response
119
+ else:
120
+ logger.info("no response")
121
+ return None
122
+
123
+
124
+ def ask_gpt_with_retries(model, gpt_message, tokens, temp, timeout, tries):
125
+ retryer = Retrying(stop=(stop_after_delay(timeout) | stop_after_attempt(tries)))
126
+ r = retryer(
127
+ ask_gpt,
128
+ model=model,
129
+ gpt_message=gpt_message,
130
+ max_tokens=tokens,
131
+ temp=temp,
132
+ top_p=1,
133
+ )
134
+ return r
135
+
136
+
137
+ INFORMATION_QUERY = "information query"
138
+ INTENTS = []
139
+
140
+
141
+ def find_intent(response):
142
+ global INTENTS, INFORMATION_QUERY
143
+ for intent in INTENTS:
144
+ if intent in response.lower():
145
+ return intent
146
+ return INFORMATION_QUERY
147
+
148
+
149
+ def find_query(response):
150
+ search_query_phrase = response
151
+ phrase_index = response.lower().find("phrase:")
152
+ quoted_strings = []
153
+ if phrase_index < 0:
154
+ phrase_index = 0
155
+ else:
156
+ phrase_index += len("phrase:")
157
+ quoted_strings = re.findall(r'"([^"]*)"', search_query_phrase[phrase_index:])
158
+ if len(quoted_strings) == 0:
159
+ quoted_strings = re.findall(r"'([^']*)'", search_query_phrase[phrase_index:])
160
+ if len(quoted_strings) > 0:
161
+ # logger.info(quoted_strings)
162
+ phrase = quoted_strings[0]
163
+ return phrase, response[response.find(phrase) + len(phrase) + 1:]
164
+ else:
165
+ logger.info("no quoted text, returning original query string: %s", response)
166
+ # logger.info(response)
167
+ return "", response
168
+
169
+
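find_query assumes the model reply follows the `Phrase: "..."` format requested by keyword_prompt further down in this file; a small illustration (the reply text is made up, not captured output):

    reply = 'Phrase: "forspoken final fantasy vii fans review"\nKeywords: Forspoken, Final Fantasy VII'
    phrase, rest = find_query(reply)
    # phrase -> 'forspoken final fantasy vii fans review'
    # rest   -> '\nKeywords: Forspoken, Final Fantasy VII', which find_keywords below scans for "keyword"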
170
+ def find_keywords(response, query_phrase, orig_phrase):
171
+ # keywords includes those suggested by gpt and any remaining words from query phrase len > 4
172
+ keywords = []
173
+ quoted_strings = re.findall(r'"([^"]*)"', query_phrase)
174
+ quoted_strings2 = re.findall(r'"([^"]*)"', orig_phrase)
175
+ remainder = query_phrase
176
+ k_index = response.lower().find("keyword")
177
+ if k_index > 0:
178
+ keyword_string = response[k_index + len("keyword"):]
179
+ nm_index = keyword_string.find("Named-Entities:")
180
+ if nm_index > 0:
181
+ keyword_string = keyword_string[:nm_index].rstrip()
182
+ # logger.info(keyword_string)
183
+ c_index = keyword_string.find(":")
184
+ keyword_string = keyword_string[c_index + 1:]
185
+ candidates = keyword_string.split(",")
186
+ for keyword in candidates:
187
+ keyword = keyword.strip(":,.\t\n").lstrip(" ")
188
+ if len(keyword) > 3 or keyword[0:1].isupper():
189
+ keywords.append(keyword)
190
+ return keywords
191
+ return []
192
+
193
+
194
+ # not clear why this apparent compile-time error never fails at runtime, but it doesn't; on the other hand, trying to
195
+ # fix this creates an infinite import loop, so don't touch this.
196
+ def split_interaction(interaction):
197
+ qs = interaction.find(prefix)
198
+ rs = interaction.find(suffix)
199
+ if qs >= 0 and rs >= 0:
200
+ query = interaction[len(prefix): rs].lstrip()
201
+ response = interaction[rs + len(suffix):].lstrip()
202
+ return query, response
203
+ else:
204
+ logger.info("can't parse: %s", interaction)
205
+ return "", ""
206
+
207
+
208
+ def findnth(haystack, needle, n):
209
+ parts = haystack.split(needle, n + 1)
210
+ if len(parts) <= n + 1:
211
+ return -1
212
+ return len(haystack) - len(parts[-1]) - len(needle)
213
+
214
+
215
+ def extract_site(url):
216
+ site = ""
217
+ base = findnth(url, "/", 2)
218
+ if base > 2:
219
+ site = url[:base].split(".")
220
+ if len(site) > 1:
221
+ site = site[-2]
222
+ site = site.replace("https://", "")
223
+ site = site.replace("http://", "")
224
+ return site
225
+
226
+
227
+ def extract_domain(url):
228
+ domain = ""
229
+ base = findnth(url, "/", 2)
230
+ if base > 2:
231
+ domain = url[:base].split(".")
232
+ if len(domain) > 1:
233
+ domain = domain[-2] + "." + domain[-1]
234
+ domain = domain.replace("https://", "")
235
+ domain = domain.replace("http://", "")
236
+ return domain
237
+
238
+
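For orientation, what the two URL helpers above return for a typical link (standalone check, not part of the diff):

    print(extract_site("https://www.example.co/some/article"))    # -> "example"
    print(extract_domain("https://www.example.co/some/article"))  # -> "example.co"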
239
+ def part_of_keyword(word, keywords):
240
+ for keyword in keywords:
241
+ if word in keyword:
242
+ return True
243
+ return False
244
+
245
+
246
+ keyword_prompt = 'Perform two tasks on the following text. First, rewrite the <text> as an effective google search phrase. Second, analyze text and list keywords and named-entities found. Return the result as: Phrase: "<google search phrase>"\nKeywords: <list of keywords>\nNamed-Entities: <list of Named-Entities>'
247
+
248
+
249
+ def get_search_phrase_and_keywords(query_string, chat_history):
250
+ gpt_message = [
251
+ {"role": "user", "content": keyword_prompt},
252
+ {"role": "user", "content": "Text\n" + query_string},
253
+ {"role": "assistant", "content": "Phrase:"},
254
+ ]
255
+ response_text = ""
256
+ completion = None
257
+ # for role in gpt_message:
258
+ # logger.info(role)
259
+ # logger.info()
260
+ response_text = ask_gpt_with_retries(
261
+ "gpt-3.5-turbo", gpt_message, tokens=150, temp=0.3, timeout=6, tries=2
262
+ )
263
+ logger.info(response_text)
264
+ # useful function to make search query more optimal, for future explainability studies
265
+ # consider returning query phrase and keywords to user
266
+ query_phrase, remainder = find_query(response_text)
267
+ logger.info("PHRASE:" + query_phrase)
268
+ # logger.info(remainder)
269
+ keywords = find_keywords(remainder, query_phrase, query_string)
270
+ logger.info("KEYWORDS: " + ', '.join(keywords))
271
+ return query_phrase, keywords
272
+
273
+
274
+ def reform(elements):
275
+ # reformulates text extracted from a webpage by unstructured.partition_html into larger keyword-rankable chunks
276
+ texts = (
277
+ []
278
+ ) # a list of text_strings, each of at most *max* chars, separated on '\n' when splitting an element is needed
279
+ paragraphs = []
280
+ total_elem_len = 0
281
+ for element in elements:
282
+ text = str(element)
283
+ total_elem_len += len(text)
284
+ if len(text) < 4:
285
+ continue
286
+ elif len(text) < 500:
287
+ texts.append(text)
288
+ else:
289
+ subtexts = text.split("\n")
290
+ for subtext in subtexts:
291
+ if len(subtext) < 500:
292
+ texts.append(subtext)
293
+ else:
294
+ texts.extend(nltk.sent_tokenize(subtext))
295
+
296
+ # now reassemble shorter texts into chunks
297
+ paragraph = ""
298
+ total_pp_len = 0
299
+ for text in texts:
300
+ if len(text) + len(paragraph) < 500:
301
+ paragraph += " " + text
302
+ else:
303
+ if len(paragraph) > 0: # start a new paragraph
304
+ paragraphs.append(paragraph)
305
+ paragraph = ""
306
+ paragraph += text
307
+ if len(paragraph) > 0:
308
+ paragraphs.append(paragraph + ".\n")
309
+ # logger.info(f'\n***** reform elements in {len(elements)}, paragraphs out {len(paragraphs)}')
310
+ total_pp_len = 0
311
+ for paragraph in paragraphs:
312
+ total_pp_len += len(paragraph)
313
+ if total_pp_len > 1.2 * total_elem_len:
314
+ logger.info(
315
+ f"******** reform out > reform in. out: {total_pp_len}, in: {total_elem_len}"
316
+ )
317
+ return paragraphs
318
+
319
+
320
+ def get_actions(text):
321
+ # look for actions in response
322
+ action_indecies = re.finditer("Action:", text)  # Action: [search, ask] (query)
323
+ actions = []
324
+ editted_response = text
325
+ for action_index in action_indecies:
326
+ action = text[action_index.span()[1]:]
327
+ agent = None
328
+ query = None
329
+ query_start = action.find("(")
330
+ if query_start > 0:
331
+ agent = action[:query_start].strip()
332
+ query_end = action[query_start + 1:].find(")")
333
+ if query_end > 0:
334
+ query = action[query_start + 1: query_start + 1 + query_end]
335
+ action = text[
336
+ action_index.start(): action_index.span()[1]
337
+ + action_index.start()
338
+ + query_start
339
+ + query_end
340
+ + 2
341
+ ]
342
+ if agent is None or query is None:
343
+ logger.info(
344
+ "can't parse action, skipping: %s",
345
+ text[action_index.start(): action_index.start() + 48],
346
+ )
347
+ continue
348
+ actions.append([agent, query, action])
349
+ editted_response = editted_response.replace(action, "")
350
+ return actions
351
+
352
+
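get_actions expects directives of the form `Action: agent(query)` embedded in model output; a hedged illustration with a made-up directive (not part of the committed file):

    acts = get_actions("Action: search(latest Forspoken reviews)")
    # -> [['search', 'latest Forspoken reviews', 'Action: search(latest Forspoken reviews)']]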
353
+ if __name__ == "__main__":
354
+ get_search_phrase_and_keywords(
355
+ "Would I like the video game Forspoken, given that I like Final Fantasy VII?",
356
+ [],
357
+ )
358
+ # logger.info(query_vicuna("what is 5 * 3?"))
read_write_index.py ADDED
@@ -0,0 +1,22 @@
1
+ import logging
2
+ import os
3
+
4
+ from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, load_index_from_storage
5
+
6
+ logger = logging.getLogger(__name__)
7
+ DOCUMENT_PATH = '../data'
8
+
9
+
10
+ # remember to delete stored vectors when new documents are added to the data so the storage is recreated
11
+ def read_write_index(path):
12
+ if not os.path.exists(path):
13
+ documents = SimpleDirectoryReader(DOCUMENT_PATH).load_data()
14
+ logger.info(f'Indexing documents in {DOCUMENT_PATH}...')
15
+ index = VectorStoreIndex.from_documents(documents)
16
+ index.storage_context.persist(persist_dir=path)
17
+ logger.info(f'{len(documents)} documents indexed.')
18
+ else:
19
+ logger.info(f'Loading index from {path}...')
20
+ storage_context = StorageContext.from_defaults(persist_dir=path)
21
+ index = load_index_from_storage(storage_context)
22
+ return index
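Typical usage of read_write_index, sketched here for context; the persist path and the query string are illustrative, not taken from the repo:

    # builds the index from ../data on first run, loads the persisted copy afterwards
    index = read_write_index("../vector_store")
    query_engine = index.as_query_engine()
    print(query_engine.query("What topics do the indexed documents cover?"))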
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ gradio==4.38.1
2
+ llama-index==0.10.29
3
+ llama-index-core==0.10.29
4
+ # previous versions break the ChatMessage class import
5
+ llama-index-llms-mistralai==0.1.6
6
+ llama-index-embeddings-mistralai
7
+ llama-index-embeddings-openai
8
+ llama-index-llms-openai
9
+ # needed for simpledirectoryreader to work
10
+ llama-index-readers-file
11
+ selenium
12
+ unstructured
13
+ requests
utils.py ADDED
@@ -0,0 +1,5 @@
1
+ def read_file(path):
2
+ txt = open(path, "r")
3
+ file = txt.read()
4
+ txt.close()
5
+ return file
web_search.py ADDED
@@ -0,0 +1,290 @@
1
+ import concurrent.futures
2
+ import copy
3
+ import json
4
+ import logging
5
+ import time
6
+ import traceback
7
+ import urllib.parse as en
8
+ import warnings
9
+ from itertools import zip_longest
10
+
11
+ import requests
12
+ import selenium.common.exceptions
13
+ from selenium import webdriver
14
+ from selenium.webdriver.chrome.options import Options
15
+ from unstructured.partition.html import partition_html
16
+
17
+ from llmsearch import meta as mt, site_stats
18
+ # this import style works in pycharm
19
+ from llmsearch import utilityV2 as ut
20
+
21
+ from urllib.request import urlopen
22
+ # this import style works on server + vs code
23
+ # import utils
24
+ # from llmsearch import google_search_concurrent as gs
25
+ # from llmsearch import meta as mt
26
+ # from llmsearch import utilityV2 as ut
27
+
28
+ logger = logging.getLogger("agent_logger")
29
+
30
+
31
+ # todo drop blocked pages > see og llmsearch code
32
+ # todo: use the condense-question chat-mode query instead of the new gpt query
33
+
34
+ def search(msg, query_phrase):
35
+ try:
36
+ # this call extracts keywords from the statement and rewrites it into a better search phrase with gpt3.5
37
+ # query_phrase, keywords = ut.get_search_phrase_and_keywords(msg, [])
38
+ mt.clear()
39
+ google_text = ""
40
+ try:
41
+ logger.info(f"asking google {msg}; rephrased: {query_phrase}")
42
+ google_text, urls_all, index, urls_used, tried_index, urls_tried = search_google(msg, query_phrase)
43
+ except:
44
+ traceback.print_exc()
45
+
46
+ logger.info("\n\nFinal response: ")
47
+
48
+ for item in google_text:
49
+ logger.info(
50
+ f"\n##############################################################################################\nSource: {item['source']}"
51
+ )
52
+ logger.info(f"{item['text']}")
53
+ logger.info(f"URL: {item['url']}")
54
+ return google_text
55
+ except KeyboardInterrupt:
56
+ traceback.print_exc()
57
+ raise KeyboardInterrupt
58
+ except:
59
+ traceback.print_exc()
60
+ return ""
61
+
62
+
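search() returns the list assembled in process_urls below: one dict per successfully scraped page, with "source", "url" and "text" keys. A sketch of the shape (values are made up):

    results = search("Would I like Forspoken?", "forspoken review final fantasy vii fans")
    # results ≈ [
    #     {"source": "example.com", "url": "https://www.example.com/forspoken-review", "text": "…extracted page text…"},
    #     ...
    # ]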
63
+ # Define a function to make a single URL request and process the response
64
+ def process_url(query_phrase, url, timeout):
65
+ start_time = time.time()
66
+ site = ut.extract_site(url)
67
+ result = ""
+ response = ""  # ensure defined for the final length log even if dr.get() fails
68
+ try:
69
+ with warnings.catch_warnings():
70
+ warnings.simplefilter("ignore")
71
+ options = Options()
72
+ options.page_load_strategy = "eager"
73
+ options.add_argument("--headless")
74
+ result = ""
75
+ with webdriver.Chrome(options=options) as dr:
76
+ logger.info(f"*****setting page load timeout {timeout}")
77
+ dr.set_page_load_timeout(timeout)
78
+ try:
79
+ dr.get(url)
80
+ response = dr.page_source
81
+ result = response_text_extract(url=url, response=response)
82
+ except selenium.common.exceptions.TimeoutException:
83
+ return "", url
84
+ except Exception:
85
+ traceback.print_exc()
86
+ logger.info(f"{site} err")
87
+ pass
88
+ logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
89
+ return result, url
90
+
91
+
92
+ def process_urls(query_phrase, urls):
93
+ # Create a ThreadPoolExecutor with 5 worker threads
94
+ response = []
95
+ logger.info("entering process urls")
96
+ full_text = ""
97
+ used_index = 0
98
+ urls_used = ["" for i in range(30)]
99
+ tried_index = 0
100
+ urls_tried = ["" for i in range(30)]
101
+ start_time = time.time()
102
+ in_process = []
103
+ processed = []
104
+ google_futures = []
105
+
106
+ with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
107
+ # initialize scan of google urls
108
+ while True:
109
+ try:
110
+ while (len(urls) > 0
111
+ # no sense starting if not much time left
112
+ and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
113
+ ):
114
+ recommendation = site_stats.get_next(urls, sample_unknown=True)
115
+ # set timeout so we don't wait for a slow site forever
116
+ timeout = 12 - int(time.time() - start_time)
117
+ url = recommendation[1]
118
+ future = executor.submit(process_url, query_phrase, url, timeout)
119
+ google_futures.append(future)
120
+ in_process.append(future)
121
+ urls_tried[tried_index] = url
122
+ tried_index += 1
123
+ urls.remove(url)
124
+ logger.info(f"queued {ut.extract_site(url)}, {timeout}")
125
+ # Process the responses as they arrive
126
+ for future in in_process:
127
+ if future.done():
128
+ result, url = future.result()
129
+ processed.append(future)
130
+ in_process.remove(future)
131
+ if len(result) > 0:
132
+ urls_used[used_index] = url
133
+ used_index += 1
134
+ logger.info(
135
+ f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
136
+ )
137
+ if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
138
+ response.append(
139
+ {
140
+ "source": ut.extract_domain(url),
141
+ "url": url,
142
+ "text": result,
143
+ }
144
+ )
145
+
146
+ if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
147
+ executor.shutdown(wait=False)
148
+ logger.info(
149
+ f"\n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
150
+ )
151
+ return response, used_index, urls_used, tried_index, urls_tried
152
+ time.sleep(0.5)
153
+ except:
154
+ traceback.print_exc()
155
+ executor.shutdown(wait=False)
156
+ logger.info(
157
+ f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
158
+ )
159
+ return response, used_index, urls_used, tried_index, urls_tried
160
+
161
+
162
+ def extract_subtext(text):
163
+ return ut.reform(text)
164
+
165
+
166
+ def request_google(query_phrase):
167
+ logger.info(f"***** search {query_phrase}")
168
+ sort = "&sort=date-sdate:d:w"
169
+ if "today" in query_phrase or "latest" in query_phrase:
170
+ sort = "&sort=date-sdate:d:s"
171
+ # logger.info(f"search for: {query_phrase}")
172
+ google_query = en.quote(query_phrase)
173
+ response = []
174
+ try:
175
+ start_wall_time = time.time()
176
+ url = (
177
+ "https://www.googleapis.com/customsearch/v1?key="
178
+ + ut.google_key
179
+ + "&cx="
180
+ + ut.google_cx
181
+ + "&num=4"
182
+ + sort
183
+ + "&q="
184
+ + google_query
185
+ )
186
+ response = requests.get(url)
187
+ response_json = json.loads(response.text)
188
+ logger.info(f"***** google search {int((time.time() - start_wall_time) * 10) / 10} sec")
189
+ except:
190
+ traceback.print_exc()
191
+ return []
192
+
193
+ # see if we got anything useful from google
194
+ if "items" not in response_json.keys():
195
+ logger.info("no return from google ... %s %s", response, response_json.keys())
196
+ return []
197
+
198
+ urls = []
199
+ for i in range(len(response_json["items"])):
200
+ url = response_json["items"][i]["link"].lstrip().rstrip()
201
+ site = ut.extract_site(url)
202
+ if site not in ut.sites or ut.sites[site] == 1:
203
+ # don't use these sources (reddit because it blocks bots)
204
+ if "reddit" not in url and "youtube" not in url and "facebook" not in url:
205
+ urls.append(url)
206
+ return urls
207
+
208
+
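For reference, the Custom Search request that request_google assembles looks roughly like this for the phrase `best ramen tokyo` (key and cx redacted; `&num=4` caps results at four and the default sort weights recent pages):

    https://www.googleapis.com/customsearch/v1?key=<KEY>&cx=<CX>&num=4&sort=date-sdate:d:w&q=best%20ramen%20tokyo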
209
+ def response_text_extract(url, response):
210
+ extract_text = ""
211
+ if url.endswith("pdf"):
212
+ pass
213
+ else:
214
+ if response is not None:
215
+ elements = partition_html(text=response)
216
+ str_elements = []
217
+ logger.info('\n***** elements')
218
+ for e in elements:
219
+ stre = str(e).replace("\xa0", " ")  # normalise non-breaking spaces
220
+ str_elements.append(stre)
221
+ extract_text = ''.join(extract_subtext(str_elements))
222
+ logger.info(
223
+ f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
224
+ )
225
+
226
+ if len(extract_text.strip()) < 8:
227
+ return ""
228
+ else:
229
+ return extract_text
230
+
231
+
232
+ def extract_items_from_numbered_list(text):
233
+ items = ""
234
+ elements = text.split("\n")
235
+ for candidate in elements:
236
+ candidate = candidate.lstrip(". \t")
237
+ if len(candidate) > 4 and candidate[0].isdigit():
238
+ candidate = candidate[1:].lstrip(". ")
239
+ if (
240
+ len(candidate) > 4 and candidate[0].isdigit()
241
+ ): # strip second digit if more than 10 items
242
+ candidate = candidate[1:].lstrip(". ")
243
+ logger.info("E {}".format(candidate))
244
+ items += candidate + " "
245
+ return items
246
+
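A quick standalone check of the numbered-list flattening above (input is made up):

    print(extract_items_from_numbered_list("1. apples\n2. pears\n10. plums"))
    # -> "apples pears plums "  (leading digits stripped; a second digit is stripped for items past 9)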
247
+
248
+ def search_google(original_query, query_phrase):
249
+ all_urls = []
250
+ urls_used = []
251
+ urls_tried = []
252
+ index = 0
253
+ tried_index = 0
254
+ full_text = ""
255
+
256
+ try: # query google for recent info
257
+ extract_query = ""
258
+ orig_phrase_urls = []
259
+ if len(original_query) > 0:
260
+ orig_phrase_urls = request_google(original_query[: min(len(original_query), 128)])
261
+ extract_query = original_query[: min(len(original_query), 128)]
262
+ gpt_phrase_urls = []
263
+ if len(query_phrase) > 0:
264
+ gpt_phrase_urls = request_google(query_phrase)
265
+ extract_query = (
266
+ query_phrase # prefer more succinct query phrase if available
267
+ )
268
+ if len(orig_phrase_urls) == 0 and len(gpt_phrase_urls) == 0:
269
+ return "", [], 0, [""], 0, [""]
270
+
271
+ for url in orig_phrase_urls:
272
+ if url in gpt_phrase_urls:
273
+ gpt_phrase_urls.remove(url)
274
+
275
+ # interleave both lists now that duplicates are removed
276
+ urls = [
277
+ val
278
+ for tup in zip_longest(orig_phrase_urls, gpt_phrase_urls)
279
+ for val in tup
280
+ if val is not None
281
+ ]
282
+ all_urls = copy.deepcopy(urls)
283
+ # initialize scan of google urls
284
+ start_wall_time = time.time()
285
+ full_text, index, urls_used, tried_index, urls_tried = process_urls(extract_query, all_urls)
286
+ logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
287
+ logger.info("return from url processing")
288
+ except:
289
+ traceback.print_exc()
290
+ return full_text, all_urls, index, urls_used, tried_index, urls_tried