arabellastrange committed
Commit 6855b1e · 1 Parent(s): b0b5964

Add application file
app.py ADDED
@@ -0,0 +1,114 @@
1
+ import logging
2
+ from time import asctime
3
+
4
+ import gradio as gr
5
+ from llama_index.core import Document, VectorStoreIndex
6
+ from llama_index.core.evaluation import SemanticSimilarityEvaluator, FaithfulnessEvaluator
7
+
8
+ from generate_response import generate_chat_response_with_history, set_llm, is_search_query, condense_question, \
9
+ generate_chat_response_with_history_rag_return_response, get_llm
10
+ from utils import read_file
11
+ from web_search import search
12
+
13
+ API_KEY_PATH = "../keys/gpt_api_key.txt"
14
+ logger = logging.getLogger("agent_logger")
15
+ sourced = False
16
+ query = False
17
+ rag_similarity = False
18
+
19
+
20
+ def google_search_chat(message, history):
21
+ condensed_question = condense_question(message, history)
22
+ if is_search_query(condensed_question):
23
+ search_results = search(message, condensed_question)
24
+ relevant_content = ""
25
+ sources = ""
26
+ for index, result in enumerate(search_results):
27
+ relevant_content = relevant_content + "\n" + ''.join(result['text'])
28
+ sources = sources + f'\n {index + 1}. ' + result['url'] # python is zero-indexed
29
+
30
+ if relevant_content != "":
31
+ documents = [Document(text=relevant_content)]
32
+ index = VectorStoreIndex.from_documents(documents)
33
+
34
+ response = generate_chat_response_with_history_rag_return_response(index, message, history)
35
+
36
+ similar_str = "not calculated"
37
+ faithfulness_str = "not calculated"
38
+
39
+ if rag_similarity:
40
+ sim_evaluator = SemanticSimilarityEvaluator()
41
+ faith_evaluator = FaithfulnessEvaluator(llm=get_llm())
42
+ # condensed_context = condense_context(relevant_content)
43
+ # logger.info("Calculating similarity...")
44
+ # similar = sim_evaluator.evaluate(response=str(response),
45
+ # reference=condensed_context)
46
+ logger.info("Calculating faithfulness...")
47
+ faithfulness = faith_evaluator.evaluate_response(query=condensed_question, response=response)
48
+ # similar_str = str(round((similar.score * 100), 2)) + "%"
49
+ faithfulness_str = "Yes" if faithfulness.passing else "No"
50
+
51
+ logger.info(f'**Search Query:** {condensed_question} \n **Faithfulness:** {faithfulness_str} \n '
52
+ f'**Similarity:** {similar_str} \n **Sources used:** \n {sources}')
53
+
54
+ response_text = []
55
+ string_output = ""
56
+
57
+ for text in response.response_gen:
58
+ response_text.append(text)
59
+ string_output = ''.join(response_text)
60
+ yield string_output
61
+
62
+ if not sourced:
63
+ pass
64
+ if sourced and not query and not rag_similarity:
65
+ yield string_output + f'\n\n --- \n **Sources used:** \n {sources}'
66
+ if sourced and query and not rag_similarity:
67
+ yield (string_output
68
+ + f'\n\n --- \n **Search Query:** {condensed_question} '
69
+ f'\n **Sources used:** \n {sources}')
70
+ if rag_similarity:
71
+ yield (string_output
72
+ + f'\n\n --- \n **Search Query:** {condensed_question} \n '
73
+ # f'**Similarity of response to the sources [ℹ️]'
74
+ # f'(https://en.wikipedia.org/wiki/Semantic_similarity):** {similar_str} \n'
75
+ f'**Is response in source documents?**: {faithfulness_str}'
76
+ f'\n **Sources used:** \n {sources}')
77
+
78
+ logger.info(f'Assistant Response: {string_output}')
79
+ else:
80
+ logger.info(
81
+ f'Assistant Response: Sorry, no search results found.')
82
+ yield "Sorry, no search results found."
83
+
84
+ else:
85
+ yield from generate_chat_response_with_history(message, history)
86
+
87
+ def run_searchbot():
88
+
89
+ logging.root.setLevel(logging.INFO)
90
+ filehandler = logging.FileHandler(f'../logs/agent_log_{asctime().replace(" ", "").lower().replace(":", "")}.log',
91
+ 'a')
92
+ formatter = logging.Formatter('%(asctime)-15s::%(levelname)s::%(filename)s::%(funcName)s::%(lineno)d::%(message)s')
93
+ filehandler.setFormatter(formatter)
94
+ logger = logging.getLogger("agent_logger")
95
+ for hdlr in logger.handlers[:]: # remove the existing file handlers
96
+ if isinstance(hdlr, logging.FileHandler):
97
+ logger.removeHandler(hdlr)
98
+ logger.addHandler(filehandler) # set the new handler
99
+ logger.setLevel(logging.INFO)
100
+
101
+ api_key = read_file(API_KEY_PATH)
102
+ global sourced
103
+ sourced = False
104
+
105
+ # GPT-4 Turbo: the latest GPT-4 model, intended to reduce cases of “laziness” where the model doesn’t complete
106
+ # a task. Returns a maximum of 4,096 output tokens. Link:
107
+ # https://openai.com/blog/new-embedding-models-and-api-updates
108
+ set_llm(key=api_key, model="gpt-4-0125-preview", temperature=0)
109
+
110
+ logger.info("Launching Gradio ChatInterface for searchbot...")
111
+ demo = gr.ChatInterface(fn=google_search_chat,
112
+ title="Search Assistant", retry_btn=None, undo_btn=None, clear_btn=None,
113
+ theme="soft")
114
+ demo.queue().launch(auth=('convo', 'session2024'), root_path='/convosearch', server_port=7866)
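
The chat handler above streams its answer by repeatedly yielding the text accumulated so far, which gr.ChatInterface renders as a progressively growing message. A minimal, self-contained sketch of that pattern (not part of this commit; token_stream is a hypothetical stand-in for response.response_gen or llm.stream_chat output):

import gradio as gr

def token_stream():
    # stand-in for a streaming LLM response generator
    for tok in ["Searching", " the", " web", " ..."]:
        yield tok

def streaming_chat(message, history):
    parts = []
    for tok in token_stream():
        parts.append(tok)
        # yield the accumulated text so far, exactly as google_search_chat does
        yield "".join(parts)

demo = gr.ChatInterface(fn=streaming_chat, title="Streaming sketch")
# demo.launch()
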
generate_response.py ADDED
@@ -0,0 +1,185 @@
1
+ import logging
2
+
3
+ from llama_index.core import ServiceContext, set_global_service_context, PromptTemplate
4
+ from llama_index.core.base.embeddings.base import BaseEmbedding
5
+ from llama_index.core.base.llms.base import BaseLLM
6
+ from llama_index.core.base.llms.generic_utils import messages_to_history_str
7
+ from llama_index.core.base.llms.types import ChatMessage, MessageRole
8
+ from llama_index.core.chat_engine.types import ChatMode
9
+ from llama_index.embeddings.mistralai import MistralAIEmbedding
10
+ from llama_index.embeddings.openai import OpenAIEmbedding
11
+ from llama_index.llms.mistralai import MistralAI
12
+ from llama_index.llms.openai import OpenAI
13
+
14
+ llm: BaseLLM
15
+ embed_model: BaseEmbedding
16
+ logger = logging.getLogger("agent_logger")
17
+
18
+
19
+ # TODO why is my system prompt being ignored?
20
+ def set_llm(model, key, temperature):
21
+ global llm
22
+ global embed_model
23
+
24
+ logger.info(f'Setting up LLM with {model} and associated embedding model...')
25
+
26
+ if "gpt" in model:
27
+ llm = OpenAI(api_key=key, temperature=temperature, model=model)
28
+ embed_model = OpenAIEmbedding(api_key=key)
29
+ elif "mistral" in model:
30
+ llm = MistralAI(api_key=key, model=model, temperature=temperature, safe_mode=True)
31
+ embed_model = MistralAIEmbedding(api_key=key)
32
+ else:
33
+ llm = OpenAI(api_key=key, model="gpt-3.5-turbo", temperature=0)
34
+ embed_model = OpenAIEmbedding(api_key=key)
35
+
36
+ # NOTE: ServiceContext is deprecated in newer llama_index releases; this should migrate to the Settings-based configuration
37
+ service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
38
+ set_global_service_context(service_context)
39
+
40
+
41
+ def get_llm():
42
+ return llm
43
+
44
+
45
+ def generate_query_response(index, message):
46
+ string_output = ""
47
+
48
+ logger.info("Creating query engine with index...")
49
+ query_engine = index.as_query_engine(streaming=True, chat_mode=ChatMode.CONDENSE_QUESTION)
50
+
51
+ logger.info(f'Input user message: {message}')
52
+ response = query_engine.query(message)
53
+
54
+ response_text = []
55
+ for text in response.response_gen:
56
+ response_text.append(text)
57
+ string_output = ''.join(response_text)
58
+ yield string_output
59
+ logger.info(f'Assistant response: {string_output}')
60
+
61
+
62
+ def generate_chat_response_with_history(message, history):
63
+ string_output = ""
64
+
65
+ messages = collect_history(message, history)
66
+
67
+ response = llm.stream_chat(messages)
68
+ response_text = []
69
+ for text in response:
70
+ response_text.append(text.delta)
71
+ string_output = ''.join(response_text)
72
+ yield string_output
73
+ logger.info(f'Assistant response: {string_output}')
74
+
75
+
76
+ def generate_chat_response_with_history_rag_return_response(index, message, history):
77
+ logger.info("Generating chat response with history and rag...")
78
+
79
+ messages = collect_history(message, history)
80
+
81
+ logger.info("Creating chat engine with index...")
82
+ query_engine = index.as_chat_engine(chat_mode=ChatMode.CONDENSE_QUESTION, streaming=True)
83
+ return query_engine.stream_chat(messages)
84
+
85
+
86
+ def generate_chat_response_with_history_rag_yield_string(index, message, history):
87
+ logger.info("Generating chat response with history and rag...")
88
+ string_output = ""
89
+
90
+ messages = collect_history(message, history)
91
+
92
+ logger.info("Creating chat engine with index...")
93
+ query_engine = index.as_chat_engine(chat_mode=ChatMode.CONDENSE_QUESTION, streaming=True)
94
+
95
+ response = query_engine.stream_chat(messages)
96
+
97
+ response_text = []
98
+ for text in response.response_gen:
99
+ response_text.append(text)
100
+ string_output = ''.join(response_text)
101
+ yield string_output
102
+
103
+ logger.info(f'Assistant response: {string_output}')
104
+
105
+
106
+ def is_greeting(message):
107
+ response = llm.complete(
108
+ f'Is the user message a greeting? Answer True or False only. For example: \n User message: "Hello" \n '
109
+ f'Assistant response: True \n User message "Where do pears grow?" Assistant response: False \n. User message: "{message}"')
110
+ if any(x in response.text.lower() for x in ["true", "yes", "is a greeting"]):
111
+ return True
112
+ return False
113
+
114
+
115
+ def is_closing(message):
116
+ # TODO
117
+ return False
118
+
119
+
120
+ def is_search_query(message):
121
+
122
+ response = llm.complete(
123
+ f'Is the user message a request for factual information? Answer True or False only. For example: \n User '
124
+ f'message: "Where do watermelons grow?" \n Assistant response: True \n User message "Do you like watermelons?" '
125
+ f'Assistant response: False \n. User message: "Hello" \n Assistant response: False \n User message: "My code '
126
+ f'is not working. How do I implement logging correctly in python?" \n Assistant response: True \n User '
127
+ f'message: "{message}"')
128
+ if any(x in response.text.lower() for x in ["true", "yes", "is a request"]):
129
+ logger.info(f'Message: {message} is a request...')
130
+ return True
131
+ return False
132
+
133
+
134
+ def collect_history(message, history):
135
+ logger.info(f'Input user message: {message}')
136
+
137
+ def message_generator():
138
+ messages = []
139
+ logger.info("Fetching message history...")
140
+ for message_pair in history:
141
+ if message_pair[0] is not None:
142
+ messages.append(ChatMessage(role=MessageRole.USER, content=message_pair[0]))
143
+ if message_pair[1] is not None:
144
+ messages.append(ChatMessage(role=MessageRole.ASSISTANT, content=message_pair[1]))
145
+ logger.info(f'{len(messages)} messages in message history...')
146
+ return messages
147
+
148
+ messages = message_generator()
149
+ messages.append(ChatMessage(role=MessageRole.USER, content=message))
150
+
151
+ return messages
152
+
153
+
154
+ def condense_question(message, history):
155
+ DEFAULT_TEMPLATE = """\
156
+ Given a conversation (between Human and Assistant) and a follow up message from Human, \
157
+ rewrite the message to be a standalone question that captures all relevant context \
158
+ from the conversation.
159
+
160
+ <Chat History>
161
+ {chat_history}
162
+
163
+ <Follow Up Message>
164
+ {question}
165
+
166
+ <Standalone question>
167
+ """
168
+ condense_question_prompt = PromptTemplate(DEFAULT_TEMPLATE)
169
+
170
+ messages = collect_history(message, history)
171
+ chat_history_str = messages_to_history_str(messages)
172
+
173
+ question = llm.predict(condense_question_prompt, question=message, chat_history=chat_history_str)
174
+
175
+ return question
176
+
177
+
178
+ def condense_context(context):
179
+ logger.info("Condensing input text with LLM complete...")
180
+
181
+ response = llm.complete(f'Rewrite the input to be a concise summary that captures '
182
+ f'all relevant context from the original text. \n'
183
+ f'Original Text: {context}')
184
+
185
+ return response.text
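
collect_history above turns Gradio's history (a list of [user, assistant] pairs) into llama_index ChatMessage objects and appends the new user message. A small usage sketch with hypothetical conversation content, assuming only that history follows the Gradio pair convention:

from llama_index.core.base.llms.types import ChatMessage, MessageRole

def pairs_to_messages(message, history):
    # mirrors collect_history: one USER/ASSISTANT ChatMessage per non-empty half of each pair
    messages = []
    for user_text, assistant_text in history:
        if user_text is not None:
            messages.append(ChatMessage(role=MessageRole.USER, content=user_text))
        if assistant_text is not None:
            messages.append(ChatMessage(role=MessageRole.ASSISTANT, content=assistant_text))
    messages.append(ChatMessage(role=MessageRole.USER, content=message))
    return messages

msgs = pairs_to_messages("And how large is it?", [["What is the capital of France?", "Paris."]])
# -> USER: "What is the capital of France?", ASSISTANT: "Paris.", USER: "And how large is it?"
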
llmsearch/google_search_concurrent.py ADDED
@@ -0,0 +1,698 @@
1
+ import concurrent.futures
2
+ import copy
3
+ import json
4
+ import logging
5
+ import sys
6
+ import time
7
+ # from PyPDF2 import PdfReader
8
+ import traceback
9
+ import urllib.parse as en
10
+ import warnings
11
+ from datetime import date
12
+ from itertools import zip_longest
13
+
14
+ import nltk
15
+ import requests
16
+ import selenium.common.exceptions
17
+ import wordfreq as wf
18
+ from selenium import webdriver
19
+ from selenium.webdriver.chrome.options import Options
20
+ from unstructured.partition.html import partition_html
21
+
22
+ from llmsearch import site_stats
23
+ from llmsearch import utilityV2 as ut
24
+
25
+ # from llmsearch import site_stats
26
+ # from llmsearch import utilityV2 as ut
27
+
28
+ logger = logging.getLogger("agent_logger")
29
+
30
+ today = " as of " + date.today().strftime("%b-%d-%Y") + "\n\n"
31
+
32
+ suffix = "\nA: "
33
+ client = "\nQ: "
34
+
35
+ QUICK_SEARCH = "quick"
36
+ NORMAL_SEARCH = "moderate"
37
+ DEEP_SEARCH = "deep"
38
+
39
+ # system_prime = {
40
+ # "role": "system",
41
+ # "content": "You analyze Text with respect to Query and list any relevant information found, including direct quotes from the text, and detailed samples or examples in the text.",
42
+ # }
43
+ priming_1 = {"role": "user", "content": "Query:\n"}
44
+
45
+
46
+ # priming_2 = {
47
+ # "role": "user",
48
+ # "content": "List relevant information in the provided text, including direct quotes from the text. If none, respond 'no information'.\nText:\n",
49
+ # }
50
+
51
+ def process_url_mod(query_phrase, url, timeout):
52
+ start_time = time.time()
53
+ site = ut.extract_site(url)
54
+ result = response = ""  # initialise both so the final log line is safe if the page fetch fails
55
+ try:
56
+ with warnings.catch_warnings():
57
+ warnings.simplefilter("ignore")
58
+ options = Options()
59
+ options.page_load_strategy = "eager"
60
+ options.add_argument("--headless")
61
+ result = ""
62
+ with webdriver.Chrome(options=options) as dr:
63
+ logger.info(f"*****setting page load timeout {timeout}")
64
+ dr.set_page_load_timeout(timeout)
65
+ try:
66
+ dr.get(url)
67
+ response = dr.page_source
68
+ result = response_text_extract_mod(url, response)
69
+ except selenium.common.exceptions.TimeoutException:
70
+ return "", url
71
+ except Exception:
72
+ traceback.print_exc()
73
+ logger.info(f"{site} err")
74
+ pass
75
+ logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
76
+ return result, url
77
+
78
+
79
+ # Define a function to make a single URL request and process the response
80
+ def process_url(query_phrase, keywords, keyword_weights, url, timeout):
81
+ start_time = time.time()
82
+ site = ut.extract_site(url)
83
+ result = ""
84
+ try:
85
+ with warnings.catch_warnings():
86
+ warnings.simplefilter("ignore")
87
+ options = Options()
88
+ options.page_load_strategy = "eager"
89
+ options.add_argument("--headless")
90
+ result = ""
91
+ with webdriver.Chrome(options=options) as dr:
92
+ logger.info(f"*****setting page load timeout {timeout}")
93
+ dr.set_page_load_timeout(timeout)
94
+ try:
95
+ dr.get(url)
96
+ response = dr.page_source
97
+ result = response_text_extract(
98
+ query_phrase,
99
+ keywords,
100
+ keyword_weights,
101
+ url,
102
+ response,
103
+ int(time.time() - start_time),
104
+ )
105
+ except selenium.common.exceptions.TimeoutException:
106
+ return "", url
107
+ except Exception:
108
+ traceback.print_exc()
109
+ logger.info(f"{site} err")
110
+ pass
111
+ # logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time()-start_time)*1000)} ms")
112
+ return result, url
113
+
114
+
115
+ def process_urls_mod(query_phrase, urls):
116
+ start_time = time.time()
117
+
118
+ response = []
119
+ logger.info("entering process urls")
120
+ full_text = ""
121
+ used_index = 0
122
+ urls_used = ["" for i in range(30)]
123
+ tried_index = 0
124
+ urls_tried = ["" for i in range(30)]
125
+ in_process = []
126
+ processed = []
127
+ google_futures = []
128
+
129
+ with concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor:
130
+ # initialize scan of google urls
131
+ while True:
132
+ try:
133
+ while len(urls) > 0:
134
+ timeout = 12 - int(time.time() - start_time)
135
+ recommendation = site_stats.get_next(
136
+ urls, sample_unknown=True
137
+ )
138
+ url = recommendation[1]
139
+ future = executor.submit(process_url_mod, query_phrase=query_phrase, url=url, timeout=timeout)
140
+ google_futures.append(future)
141
+ in_process.append(future)
142
+ urls_tried[tried_index] = url
143
+ tried_index += 1
144
+ urls.remove(url)
145
+ logger.info(f"queued {ut.extract_site(url)}, {timeout}")
146
+
147
+ for future in in_process:
148
+ if future.done():
149
+ result, url = future.result()
150
+ processed.append(future)
151
+ in_process.remove(future)
152
+ if len(result) > 0:
153
+ urls_used[used_index] = url
154
+ used_index += 1
155
+ result = result.replace(". .", ".")
156
+ logger.info(
157
+ f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
158
+ )
159
+ response.append(
160
+ {
161
+ "source": ut.extract_domain(url),
162
+ "url": url,
163
+ "text": result,
164
+ }
165
+ )
166
+ if time.time() - start_time > 28:
167
+ executor.shutdown(wait=False)
168
+ logger.info(
169
+ f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
170
+ )
171
+ return response, used_index, urls_used, tried_index, urls_tried
172
+ time.sleep(0.5)
173
+ except:
174
+ traceback.print_exc()
175
+ executor.shutdown(wait=False)
176
+ logger.info(
177
+ f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
178
+ )
179
+ return response, used_index, urls_used, tried_index, urls_tried
180
+
181
+
182
+ def process_urls(query_phrase, keywords, keyword_weights, urls, search_level):
183
+ # Create a ThreadPoolExecutor with 5 worker threads
184
+ response = []
185
+ logger.info("entering process urls")
186
+ start_time = time.time()
187
+ full_text = ""
188
+ used_index = 0
189
+ urls_used = ["" for i in range(30)]
190
+ tried_index = 0
191
+ urls_tried = ["" for i in range(30)]
192
+ start_time = time.time()
193
+ in_process = []
194
+ processed = []
195
+ google_futures = []
196
+ off_whitelist = False
197
+
198
+ with concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor:
199
+ # initialize scan of google urls
200
+ while True:
201
+ try:
202
+ while (
203
+ len(urls) > 0
204
+ # no sense starting if not much time left
205
+ and (
206
+ (
207
+ search_level == DEEP_SEARCH
208
+ and len(full_text) < 9600
209
+ and len(in_process) < 16
210
+ and time.time() - start_time < 14
211
+ )
212
+ or (
213
+ search_level == NORMAL_SEARCH
214
+ and len(full_text) < 6400
215
+ and len(in_process) < 14
216
+ and time.time() - start_time < 12
217
+ )
218
+ or (
219
+ search_level == QUICK_SEARCH
220
+ and len(full_text) < 4800
221
+ and len(in_process) < 10
222
+ and time.time() - start_time < 8
223
+ )
224
+ )
225
+ ):
226
+ recommendation = site_stats.get_next(
227
+ urls, sample_unknown=off_whitelist
228
+ )
229
+ if recommendation is None or len(recommendation) == 0:
230
+ off_whitelist = True
231
+ else:
232
+ # set timeout so we don't wait for a slow site forever
233
+ timeout = 12 - int(time.time() - start_time)
234
+ if search_level == NORMAL_SEARCH:
235
+ timeout = timeout + 4
236
+ url = recommendation[1]
237
+ future = executor.submit(
238
+ process_url,
239
+ query_phrase,
240
+ keywords,
241
+ keyword_weights,
242
+ url,
243
+ timeout,
244
+ )
245
+ # remaining_time = start_time+18-time.time()
246
+ # future.exception(remaining_time)
247
+ google_futures.append(future)
248
+ in_process.append(future)
249
+ urls_tried[tried_index] = url
250
+ tried_index += 1
251
+ urls.remove(url)
252
+ logger.info(f"queued {ut.extract_site(url)}, {timeout}")
253
+ # Process the responses as they arrive
254
+ for future in in_process:
255
+ if future.done():
256
+ result, url = future.result()
257
+ processed.append(future)
258
+ in_process.remove(future)
259
+ if len(result) > 0:
260
+ urls_used[used_index] = url
261
+ used_index += 1
262
+ result = result.replace(". .", ".")
263
+ logger.info(
264
+ f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
265
+ )
266
+ site = ut.extract_site(url)
267
+ domain = ut.extract_domain(url)
268
+ if domain.endswith("gov"):
269
+ credibility = "Official Source"
270
+ elif site in ut.sites.keys():
271
+ if ut.sites[site] > 0:
272
+ credibility = "Whitelisted Source"
273
+ else:  # zero or negative scores are treated as blacklisted
274
+ credibility = "Blacklisted Source"
275
+ else:
276
+ credibility = "Third-Party Source"
277
+
278
+ response.append(
279
+ {
280
+ "source": ut.extract_domain(url),
281
+ "url": url,
282
+ "credibility": credibility,
283
+ "text": result,
284
+ }
285
+ )
286
+
287
+ # openai seems to time out a plugin at about 30 secs, and there is probably 3-4 secs of overhead
288
+ if (
289
+ (len(urls) == 0 and len(in_process) == 0)
290
+ or (
291
+ search_level == DEEP_SEARCH
292
+ and (len(full_text) > 9600)
293
+ or time.time() - start_time > 42
294
+ )
295
+ or (
296
+ search_level == NORMAL_SEARCH
297
+ and (len(full_text) > 6400)
298
+ or time.time() - start_time > 32
299
+ )
300
+ or (
301
+ search_level == QUICK_SEARCH
302
+ and (len(full_text) > 4800)
303
+ or time.time() - start_time > 28
304
+ )
305
+ ):
306
+ executor.shutdown(wait=False)
307
+ logger.info(
308
+ f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
309
+ )
310
+ return response, used_index, urls_used, tried_index, urls_tried
311
+ time.sleep(0.5)
312
+ except:
313
+ traceback.print_exc()
314
+ executor.shutdown(wait=False)
315
+ logger.info(
316
+ f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
317
+ )
318
+ return response, used_index, urls_used, tried_index, urls_tried
319
+
320
+
321
+ def extract_subtext(text, query_phrase, keywords, keyword_weights):
322
+ ### maybe we should score based on paragraphs, not lines?
323
+ sentences = ut.reform(text)
324
+ # logger.info('***** sentences from reform')
325
+ # for sentence in sentences:
326
+ # logger.info(sentence)
327
+ sentence_weights = {}
328
+ final_text = ""
329
+ for sentence in sentences:
330
+ sentence_weights[sentence] = 0
331
+ for keyword in keywords:
332
+ if keyword in sentence or keyword.lower() in sentence:
333
+ if keyword in keyword_weights.keys():
334
+ sentence_weights[sentence] += keyword_weights[keyword]
335
+
336
+ # now pick out sentences starting with those with the most keywords
337
+ max_sentence_weight = 0
338
+ for keyword in keyword_weights.keys():
339
+ max_sentence_weight += keyword_weights[keyword]
340
+ # logger.info(f'******* max sentence weight {max_sentence_weight}')
341
+ for i in range(max_sentence_weight, 1, -1):
342
+ if len(final_text) > 6000 and i < max(
343
+ 1, int(max_sentence_weight / 4)
344
+ ): # make sure we don't miss any super-important text
345
+ return final_text
346
+ for sentence in sentences:
347
+ if len(final_text) + len(sentence) > 6001 and i < max(
348
+ 1, int(max_sentence_weight / 4)
349
+ ):
350
+ continue
351
+ if sentence_weights[sentence] == i:
352
+ final_text += sentence
353
+ # logger.info("relevant text", final_text)
354
+ # logger.info("keyword extract length:",len(final_text)) #, end='.. ')
355
+
356
+ return final_text
357
+
358
+
359
+ def search(query_phrase):
360
+ logger.info(f"***** search {query_phrase}")
361
+ sort = "&sort=date-sdate:d:w"
362
+ if "today" in query_phrase or "latest" in query_phrase:
363
+ sort = "&sort=date-sdate:d:s"
364
+ # logger.info(f"search for: {query_phrase}")
365
+ google_query = en.quote(query_phrase)
366
+ response = []
367
+ try:
368
+ start_wall_time = time.time()
369
+ url = (
370
+ "https://www.googleapis.com/customsearch/v1?key="
371
+ + ut.google_key
372
+ + "&cx="
373
+ + ut.google_cx
374
+ # was 10 results; reduced to 3 to keep search time down
375
+ + "&num=3"
376
+ + sort
377
+ + "&q="
378
+ + google_query
379
+ )
380
+ response = requests.get(url)
381
+ response_json = json.loads(response.text)
382
+ logger.info(f"***** google search {int((time.time() - start_wall_time) * 10) / 10} sec")
383
+ except:
384
+ traceback.print_exc()
385
+ return []
386
+
387
+ # see if we got anything useful from google
388
+ if "items" not in response_json.keys():
389
+ logger.info(f"no return from google ... {response} {response_json.keys()}")
390
+ # logger.info(google_query)
391
+ return []
392
+
393
+ # first try whitelist sites
394
+ urls = []
395
+ for i in range(len(response_json["items"])):
396
+ url = response_json["items"][i]["link"].lstrip().rstrip()
397
+ site = ut.extract_site(url)
398
+ if site not in ut.sites or ut.sites[site] == 1:
399
+ urls.append(url)
400
+ return urls
401
+
402
+
403
+ def log_url_process(site, reason, raw_text, extract_text, gpt_text):
404
+ return
405
+
406
+
407
+ """
408
+ # to record detailed logs of url processing unquote this function
409
+ def log_url_process(site, reason, raw_text, extract_text, gpt_text):
410
+ if len(raw_text) == 0 and len(extract_text)==0 and len(gpt_text) ==0:
411
+ return
412
+ try:
413
+ with open('google_log.txt', 'a') as lg:
414
+ lg.write('\n\n*************'+reason.upper()+'***********\n')
415
+ lg.write('*****************'+site+' RAW*************\n')
416
+ lg.write(raw_text)
417
+ lg.write('\n******************extract****************\n')
418
+ lg.write(extract_text)
419
+ lg.write('\n********************gpt******************\n')
420
+ lg.write(gpt_text)
421
+ except Exception:
422
+ traceback.print_exc()
423
+ """
424
+
425
+
426
+ def response_text_extract_mod(url, response):
427
+ extract_text = ""
428
+ if url.endswith("pdf"):
429
+ pass
430
+ else:
431
+ elements = partition_html(text=response)
432
+ str_elements = []
433
+ for e in elements:
434
+ stre = str(e).replace(" ", " ")
435
+ str_elements.append(stre)
436
+ extract_text = ut.reform(str_elements)
437
+ logger.info(
438
+ f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
439
+ )
440
+ if len(''.join(extract_text).strip()) < 8:
441
+ return ""
442
+ return extract_text
443
+
444
+
445
+ def response_text_extract(
446
+ query_phrase, keywords, keyword_weights, url, response, get_time
447
+ ):
448
+ curr = time.time()
449
+ text = ""
450
+ extract_text = ""
451
+ site = ut.extract_site(url)
452
+
453
+ if url.endswith("pdf"):
454
+ pass
455
+ else:
456
+ elements = partition_html(text=response)
457
+ str_elements = []
458
+ # logger.info('\n***** elements')
459
+ for e in elements:
460
+ stre = str(e).replace(" ", " ")
461
+ str_elements.append(stre)
462
+ extract_text = extract_subtext(
463
+ str_elements, query_phrase, keywords, keyword_weights
464
+ )
465
+ # logger.info('\n************ unstructured **********')
466
+ logger.info(
467
+ f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
468
+ )
469
+ url_text = text # save for final stats
470
+ new_curr = time.time()
471
+ extract_time = int((new_curr - curr) * 1000000)
472
+ if len(extract_text.strip()) < 8:
473
+ return ""
474
+
475
+ # now ask openai to extract answer
476
+ response_text = ""
477
+ curr = new_curr
478
+ extract_text = extract_text[:10000] # make sure we don't run over token limit
479
+ gpt_tldr_message = [
480
+ {
481
+ "role": "user",
482
+ "content": "Given:\n" + extract_text + "\n\nQuery:\n" + query_phrase,
483
+ }
484
+ ]
485
+ start_wall_time = time.time()
486
+ t_out = 12 - get_time
487
+ # logger.info(f'****** spawning page get with timeout {t_out}')
488
+ google_tldr = ut.ask_gpt_with_retries(
489
+ ut.MODEL, gpt_tldr_message, tokens=300, temp=0.3, timeout=t_out, tries=1
490
+ )
491
+ openai_time = int((time.time() - start_wall_time) * 10) / 10
492
+ logger.info(f"\n***** tldr {query_phrase}, {openai_time} sec")
493
+ logger.info(f'***** \n{extract_text}\n***** \n{google_tldr}\n*****\n')
494
+ url_text = url_text.replace("\n", ". ")
495
+ if google_tldr is None:
496
+ google_tldr = ""
497
+ response_text = google_tldr.lstrip()
498
+ prefix_text = response_text[: min(len(response_text), 96)].lower()
499
+ # openai sometimes returns a special format for 'no information'
500
+ if prefix_text.startswith("query:"):
501
+ text_index = response_text.find("Text:")
502
+ if text_index > 0:
503
+ response_text = response_text[text_index + 5:]
504
+ prefix_text = response_text[: min(len(response_text), 96)].lower()
505
+ if (
506
+ "no information" in prefix_text
507
+ or "i cannot provide" in prefix_text
508
+ or "as an ai language model" in prefix_text
509
+ or "does not provide" in prefix_text
510
+ or "it is not possible" in prefix_text
511
+ ):
512
+ # skip this summary, no info
513
+ logger.info(
514
+ "{} {}/{}/{}/{}".format(
515
+ site, len(response), len(url_text), len(extract_text), 0
516
+ )
517
+ )
518
+ # logger.info('************')
519
+ # logger.info(extract_text)
520
+ # logger.info('************')
521
+ sys.stdout.flush()
522
+ log_url_process(site, "no info", url_text, extract_text, "")
523
+ site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
524
+ return ""
525
+
526
+ if (
527
+ prefix_text.startswith("i'm sorry")
528
+ or prefix_text.startswith("there is no ")
529
+ or (
530
+ prefix_text.startswith("the provided text")
531
+ or prefix_text.startswith("i cannot")
532
+ or prefix_text.startswith("unfortunately")
533
+ or prefix_text.startswith("sorry")
534
+ or prefix_text.startswith("the text")
535
+ )
536
+ and (
537
+ "is not relevant" in prefix_text
538
+ or "no information" in prefix_text
539
+ or "does not provide" in prefix_text
540
+ or "does not contain" in prefix_text
541
+ or "no relevant information" in prefix_text
542
+ )
543
+ ):
544
+ # skip this summary, no info
545
+ log_url_process(site, "no info 2", url_text, extract_text, "")
546
+ logger.info(
547
+ "{} {}/{}/{}/{}".format(
548
+ site, len(response), len(url_text), len(extract_text), 0
549
+ )
550
+ )
551
+ ###logger.info('************')
552
+ ###logger.info(extract_text)
553
+ ###logger.info('************')
554
+ site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
555
+ return ""
556
+ else:
557
+ sentences = nltk.sent_tokenize(response_text)
558
+ response_text = ""
559
+ for sentence in sentences:
560
+ if (
561
+ "no inform" in sentence.lower()
562
+ or "no specific inform" in sentence.lower()
563
+ or "is unclear" in sentence.lower()
564
+ or "not mention" in sentence.lower()
565
+ or "not specifically mention" in sentence.lower()
566
+ ):
567
+ pass
568
+ else:
569
+ response_text += "\n \u2022 " + sentence + ". "
570
+ site_stats.update_site_stats(
571
+ site, len(response_text), get_time, extract_time, openai_time
572
+ )
573
+ # logger.info('\n',response_text)
574
+ log_url_process(site, "response", url_text, extract_text, response_text)
575
+ logger.info(
576
+ "{} {}/{}/{}/{}".format(
577
+ site,
578
+ len(response),
579
+ len(url_text),
580
+ len(extract_text),
581
+ len(response_text),
582
+ )
583
+ )
584
+ # logger.info('************')
585
+ # logger.info(google_tldr)
586
+ # logger.info('************ site response ***********')
587
+ # logger.info(response_text)
588
+ # logger.info('************')
589
+ return response_text + "\n"
590
+ site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
591
+ log_url_process(site, "no return", "", "", "")
592
+ logger.info(
593
+ "{} {}/{}/{}/{}".format(
594
+ site, len(response), len(url_text), len(extract_text), 0
595
+ )
596
+ )
597
+ ##logger.info('************')
598
+ ##logger.info(extract_text)
599
+ ##logger.info('************')
600
+ return ""
601
+
602
+
603
+ def extract_items_from_numbered_list(text):
604
+ items = ""
605
+ elements = text.split("\n")
606
+ for candidate in elements:
607
+ candidate = candidate.lstrip(". \t")
608
+ if len(candidate) > 4 and candidate[0].isdigit():
609
+ candidate = candidate[1:].lstrip(". ")
610
+ if (
611
+ len(candidate) > 4 and candidate[0].isdigit()
612
+ ): # strip second digit if more than 10 items
613
+ candidate = candidate[1:].lstrip(". ")
614
+ logger.info("E {}".format(candidate))
615
+ items += candidate + " "
616
+ return items
617
+
618
+
619
+ def search_google_mod(query_phrase):
620
+ full_text = ""
621
+ try:
622
+ gpt_phrase_urls = []
623
+ if len(query_phrase) > 0:
624
+ gpt_phrase_urls = search(query_phrase)
625
+ full_text = process_urls_mod(query_phrase, gpt_phrase_urls)  # NOTE: returns the full (response, used_index, urls_used, tried_index, urls_tried) tuple
626
+ logger.info("return from url processing")
627
+ except:
628
+ traceback.print_exc()
629
+ return full_text
630
+
631
+
632
+ def search_google(original_query, search_level, query_phrase, keywords, chat_history):
633
+ start_time = time.time()
634
+ all_urls = []
635
+ urls_used = []
636
+ urls_tried = []
637
+ index = 0
638
+ tried_index = 0
639
+ full_text = ""
640
+ keyword_weights = {}
641
+ for keyword in keywords:
642
+ zipf = wf.zipf_frequency(keyword, "en")
643
+ weight = max(0, int((8 - zipf)))
644
+ if weight > 0:
645
+ keyword_weights[keyword] = weight
646
+ logger.info(f"keyword {keyword} wf.ziff {zipf} weight {weight}")
647
+ subwds = keyword.split(" ")
648
+ if len(subwds) > 1:
649
+ for subwd in subwds:
650
+ sub_z = wf.zipf_frequency(subwd, "en")
651
+ sub_wgt = max(0, int((8 - sub_z) * 1 / 2))  # use the sub-word's own frequency, not the full phrase's
652
+ if sub_wgt > 0:
653
+ keyword_weights[subwd] = sub_wgt
654
+ logger.info(f"keyword {subwd} weight {sub_wgt}")
655
+
656
+ try: # query google for recent info
657
+ sort = ""
658
+ if "today" in original_query or "latest" in original_query:
659
+ original_query = today.strip("\n") + " " + original_query
660
+ extract_query = ""
661
+ orig_phrase_urls = []
662
+ if len(original_query) > 0:
663
+ orig_phrase_urls = search(original_query[: min(len(original_query), 128)])
664
+ extract_query = original_query[: min(len(original_query), 128)]
665
+ gpt_phrase_urls = []
666
+ if len(query_phrase) > 0:
667
+ gpt_phrase_urls = search(query_phrase)
668
+ extract_query = (
669
+ query_phrase # prefer more succinct query phrase if available
670
+ )
671
+ if len(orig_phrase_urls) == 0 and len(gpt_phrase_urls) == 0:
672
+ return "", [], 0, [""], 0, [""]
673
+
674
+ for url in orig_phrase_urls:
675
+ if url in gpt_phrase_urls:
676
+ gpt_phrase_urls.remove(url)
677
+
678
+ # interleave both lists now that duplicates are removed
679
+ urls = [
680
+ val
681
+ for tup in zip_longest(orig_phrase_urls, gpt_phrase_urls)
682
+ for val in tup
683
+ if val is not None
684
+ ]
685
+ # urls = [val for tup in zip_longest(urls, kwd_phrase_urls) for val in tup if val is not None]
686
+ all_urls = copy.deepcopy(urls)
687
+ # initialize scan of google urls
688
+ # compute keyword weights
689
+ start_wall_time = time.time()
690
+ full_text, index, urls_used, tried_index, urls_tried = process_urls(
691
+ extract_query, keywords, keyword_weights, all_urls, search_level
692
+ )
693
+ site_stats.ckpt()
694
+ logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
695
+ # logger.info("return from url processing")
696
+ except:
697
+ traceback.print_exc()
698
+ return full_text, all_urls, index, urls_used, tried_index, urls_tried
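
search_google above weights keywords by rarity using wordfreq's Zipf scale (roughly 0 for very rare words up to about 8 for the most common), so weight = 8 - zipf boosts distinctive terms and drops stop-words; extract_subtext then scores sentences by the summed weights of the keywords they contain. A small sketch of that weighting, with illustrative inputs:

import wordfreq as wf

def keyword_weights(keywords):
    # rarer words get larger weights; very common words fall to 0 and are dropped
    weights = {}
    for keyword in keywords:
        zipf = wf.zipf_frequency(keyword, "en")
        weight = max(0, int(8 - zipf))
        if weight > 0:
            weights[keyword] = weight
    return weights

print(keyword_weights(["the", "Tesla", "Full Self Driving"]))
# "the" is too common to score; the rarer terms dominate sentence scoring in extract_subtext
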
llmsearch/meta.py ADDED
@@ -0,0 +1,357 @@
1
+ from llmsearch import utilityV2 as ut, google_search_concurrent as gs
2
+ import re
3
+ import time
4
+
5
+ ABORT = False
6
+ CONTINUE = True
7
+ history = []
8
+
9
+
10
+ class history_entry:
11
+ def __init__(self, turn, vector=None):
12
+ self.message = turn.message.lower()
13
+ self.role = turn.role
14
+
15
+ def equal(self, he2):
16
+ return self.message == he2.message and self.role == he2.role
17
+
18
+
19
+ def add(turn):
20
+ he = history_entry(turn)
21
+ history.append(he)
22
+
23
+
24
+ def is_metaCyclic(turn):
25
+ he = history_entry(turn)
26
+ count = 0
27
+ for prior_he in history:
28
+ if he.equal(prior_he):
29
+ count += 1
30
+ return count > 1
31
+
32
+
33
+ def is_cyclic(turn):
34
+ he = history_entry(turn)
35
+ for prior_he in history:
36
+ if he.equal(prior_he):
37
+ return True
38
+ return False
39
+
40
+
41
+ def clear():
42
+ global history
43
+ history = []
44
+ return
45
+
46
+
47
+ def test_history():
48
+ he1 = history_entry(ut.turn(role="assistant", message="who is Noriel Roubini"))
49
+ he2 = history_entry(ut.turn(role="assistant", message="who was Noriel Roubini"))
50
+ he3 = history_entry(ut.turn(role="assistant", message="who was Nsriel Roubini"))
51
+ he4 = history_entry(ut.turn(role="assistant", message="where is the Pinnacles"))
52
+ for hea in (he1, he2, he3, he4):
53
+ for heb in (he1, he2, he3, he4):
54
+ print(cosine(hea, heb))  # NOTE: cosine() is not defined in this module; test_history() will fail as written
55
+
56
+
57
+ def test_parse_decomp():
58
+ test_text = """<Subquery 1>? What is the birthplace of Hugh Jackman?
59
+ <Subquery 2>? What is the Japanese name of the birthplace of Hugh Jackman?
60
+ <Keywords 1>: Hugh Jackman, birthplace
61
+ <Keywords 2>: Japanese name, birthplace, Hugh Jackman"""
62
+
63
+ decomp = parse_decomposition(test_text)
64
+ for subquery in decomp:
65
+ print("Subquery\n", subquery)
66
+
67
+
68
+ def parse_decomposition(text):
69
+ ### expecting:
70
+ ### <Subquery 1>
71
+ ### Birthplace of Hugh Jackman
72
+ ### <Subquery 2>
73
+ ### Japanese name of Birthplace of Hugh Jackman
74
+ ### note that 'Birthplace of Hugh Jackman' operates as both a string google query and a variable in subsequent occurrences
75
+ subquery_indecies = re.finditer(
76
+ "<Subquery", text
77
+ ) # Action: Ask {Google, User} "query"
78
+ subqueries = []
79
+ for index in subquery_indecies:
80
+ hdr_end = text[index.start() :].find(">") + index.start()
81
+ query_start = hdr_end + 1
82
+ query_end = text[query_start:].find("<")
83
+ if query_end < 0:
84
+ query = text[query_start:].strip()
85
+ else:
86
+ query = text[query_start : query_start + query_end].lstrip("?").strip()
87
+ print("Query:", query)
88
+ subqueries.append(query)
89
+ return subqueries
90
+
91
+
92
+ def query_keywords(query):
93
+ start_wall_time = time.time()
94
+ gpt_key_message = [
95
+ {
96
+ "role": "user",
97
+ "content": "Extract keywords and named-entities from the following text.",
98
+ },
99
+ {"role": "user", "content": query},
100
+ ]
101
+ # for item in gpt_key_message:
102
+ # print(item)
103
+ gpt_parse = ut.ask_gpt_with_retries(
104
+ "gpt-3.5-turbo", gpt_key_message, tokens=25, temp=0, timeout=5, tries=2
105
+ )
106
+ # print(f'\n***** keywords and named-entities {gpt_parse}')
107
+ # parse result Keywords: {comma separated list}\n\nNamed-entities: {comma-separated-list}
108
+ keywords = []
109
+ # do named entities first, they might be compounds of keywords
110
+ ne_start = gpt_parse.find("Named-entities")
111
+ print(f"***** keyword extract {int((time.time()-start_wall_time)*10)/10} sec")
112
+ if ne_start > 0:
113
+ nes = gpt_parse[ne_start + len("Named-entities") + 1 :].split(
114
+ ","
115
+ ) # assume string ends with colon or space:].split(',')
116
+ # print(f'Named-entity candidates {nes}')
117
+ for ne in nes:
118
+ ne = ne.strip(" .,;:\n")
119
+ # print(f' appending {ne}')
120
+ if ne != "None":
121
+ keywords.append(ne)
122
+ else:
123
+ ne_start = len(gpt_parse) + 1
124
+ kwd_start = gpt_parse.find("Keywords")
125
+ if kwd_start > -1:
126
+ kwds = gpt_parse[kwd_start + len("Keywords") + 1 : ne_start].split(",")
127
+ # print(f'Keyword candidates {kwds}')
128
+ for kwd in kwds:
129
+ kwd = kwd.strip(" .\n,;:")
130
+ skip = False
131
+ for kwd2 in keywords:
132
+ if kwd in kwd2:
133
+ skip = True
134
+ if not skip:
135
+ # print('appending', kwd)
136
+ keywords.append(kwd)
137
+ # else: print("Keywords index < 0")
138
+ if len(keywords) > 0:
139
+ print(f"***** query_keywords found keywords {keywords}")
140
+ return keywords
141
+ # fallback - just use query words
142
+ candidates = query.split(" ")
143
+ for candidate in candidates:
144
+ candidate = candidate.strip()
145
+ if len(candidate) > 2:
146
+ keywords.append(candidate)
147
+ # print(f'***** query_keywords using default keywords {keywords}')
148
+ return keywords
149
+
150
+
151
+ def substitute(Q1, A1, Q2, debug=False):
152
+ gpt_sub_message = [
153
+ {
154
+ "role": "user",
155
+ "content": "replace '" + Q1 + "' with '" + A1 + "' in '" + Q2 + "'",
156
+ }
157
+ ]
158
+ if debug:
159
+ print("\n\n**************")
160
+ for item in gpt_sub_message:
161
+ print(item)
162
+ google_tldr = ut.ask_gpt_with_retries(
163
+ "gpt-3.5-turbo", gpt_sub_message, tokens=25, temp=0.1, timeout=5, tries=2
164
+ )
165
+ print("\n\n**************")
166
+ if len(google_tldr) == 0 or "no information" in google_tldr:
167
+ print("Returning original Q2")
168
+ return Q2
169
+ print("Substituted", Q2, google_tldr)
170
+ return google_tldr
171
+
172
+
173
+ def meta(query, chat_history, debug=False):
174
+ print("***** entering meta")
175
+ turn = ut.turn(
176
+ role=ut.ASSISTANT, source=ut.ASSISTANT, message='Action: search "' + query + '"'
177
+ )
178
+ if is_metaCyclic(turn):
179
+ return [], ABORT
180
+
181
+ prompt = """Decompose a compound <Query> into two smaller <Subquery>. Use the following format for output:
182
+ <Subquery 1>
183
+ <Subquery 2>"""
184
+ gpt_message = [
185
+ {"role": "user", "content": prompt},
186
+ {"role": "user", "content": "<Query>\n" + query},
187
+ ]
188
+ response_text = ""
189
+ completion = None
190
+ if debug:
191
+ for role in gpt_message:
192
+ print(role)
193
+ print("starting gpt decomp query")
194
+ response_text = ut.ask_gpt_with_retries(
195
+ "gpt-3.5-turbo", gpt_message, tokens=75, temp=0.1, timeout=5, tries=2
196
+ )
197
+ if debug:
198
+ print(f"initial gpt query response:\n{response_text}")
199
+ print("**** executing decomp ****")
200
+ subqueries = parse_decomposition(response_text)
201
+ meta_chat_history = []
202
+ prev_tldr = ""
203
+ google_tldr = ""
204
+ for n, subquery in enumerate(subqueries):
205
+ # do variable substitution into subquery
206
+ # ask google
207
+ # send google results as notes plus subquery to gpt to extract <answer i>
208
+ # return chat history extended with each subquery and its answer
209
+ # (or maybe just all google notes, let next level down do the rest?)
210
+ # bad idea, can exceed token limit!
211
+ if debug:
212
+ print(f'subquery {n}, "{subquery}"')
213
+ if n > 0:
214
+ subquery = substitute(subqueries[n - 1], prev_tldr, subquery)
215
+ keyword_set = query_keywords(subquery)
216
+
217
+ keyword_set = query_keywords(subquery)
218
+ print("*****Executing subquery", subquery, "\n with keywords", keyword_set)
219
+ gpt_initial_message = [
220
+ {
221
+ "role": "user",
222
+ "content": subquery + " If fact is unavailable, respond: 'Unknown'",
223
+ }
224
+ ]
225
+
226
+ # for turn in meta_chat_history:
227
+ # gpt_initial_message.append({"role":"user","content":turn.tldr})
228
+
229
+ initial_gpt_answer = ut.ask_gpt_with_retries(
230
+ "gpt-3.5-turbo",
231
+ gpt_initial_message,
232
+ tokens=25,
233
+ temp=0.0,
234
+ timeout=5,
235
+ tries=2,
236
+ )
237
+ if debug:
238
+ print(f"***** google extract\n {initial_gpt_answer}\n")
239
+ if (
240
+ "unknown" not in initial_gpt_answer.lower()
241
+ and "cannot provide" not in initial_gpt_answer
242
+ and "do not have access" not in initial_gpt_answer
243
+ ):
244
+ meta_chat_history.append(
245
+ ut.turn(
246
+ role="assistant",
247
+ message=subquery,
248
+ source=ut.ASSISTANT,
249
+ tldr=subquery,
250
+ keywords=keyword_set,
251
+ )
252
+ )
253
+ meta_chat_history.append(
254
+ ut.turn(
255
+ role="assistant",
256
+ message="<note>\n" + initial_gpt_answer + "\n<note>",
257
+ source=ut.GOOGLE,
258
+ tldr=initial_gpt_answer,
259
+ keywords=keyword_set,
260
+ )
261
+ )
262
+ prev_tldr = initial_gpt_answer
263
+ print(f"***** Answer to {subquery}: {initial_gpt_answer}\n")
264
+ google_tldr = initial_gpt_answer
265
+ continue
266
+ # ask google
267
+ (
268
+ google_text,
269
+ urls_all,
270
+ index,
271
+ urls_used,
272
+ tried_index,
273
+ urls_tried,
274
+ ) = gs.search_google(
275
+ subquery,
276
+ gs.QUICK_SEARCH,
277
+ "",
278
+ ut.INFORMATION_QUERY,
279
+ keyword_set,
280
+ meta_chat_history,
281
+ )
282
+ if len(google_text) > 0:
283
+ # digest google response into an answer for this subquery
284
+ if debug:
285
+ print(f"***** search result\n{google_text}\n")
286
+ gpt_tldr_message = [
287
+ {
288
+ "role": "user",
289
+ "content": 'Summarize the set of <note> provided, including only the direct answer to <Query>. Do not include any qualifiers or modifiers from the <Query> such as "where x was born".',
290
+ },
291
+ {"role": "user", "content": google_text},
292
+ {"role": "user", "content": "<Query>\n" + subquery},
293
+ ]
294
+ # for turn in meta_chat_history:
295
+ # gpt_tldr_message.append({"role":"user","content":turn.tldr})
296
+
297
+ google_tldr = ut.ask_gpt_with_retries(
298
+ "gpt-3.5-turbo",
299
+ gpt_tldr_message,
300
+ tokens=150,
301
+ temp=0.1,
302
+ timeout=5,
303
+ tries=2,
304
+ )
305
+ # print('\n\n**************')
306
+ # for item in gpt_tldr_message:
307
+ # print(item)
308
+ print(f"***** Answer to {subquery}: {google_tldr}\n")
309
+ meta_chat_history.append(
310
+ ut.turn(
311
+ role="assistant",
312
+ message=subquery,
313
+ source=ut.ASSISTANT,
314
+ tldr=subquery,
315
+ keywords=keyword_set,
316
+ )
317
+ )
318
+ meta_chat_history.append(
319
+ ut.turn(
320
+ role="assistant",
321
+ message="Observation: " + google_tldr,
322
+ source=ut.GOOGLE,
323
+ tldr=google_tldr,
324
+ keywords=keyword_set,
325
+ )
326
+ )
327
+ prev_tldr = google_tldr
328
+ # print(f"\n******meta return: {google_tldr} *****\n")
329
+ return meta_chat_history, CONTINUE
330
+
331
+
332
+ if __name__ == "__main__":
333
+ # test_parse_decomp()
334
+ # meta("what is the Japanese name of the birthplace of Hugh Jackman", [])
335
+ # meta("What is the capital of the birthplace of Levy Mwanawasa?",[])
336
+ # meta("What is the (rounded down) latitude of the birthplace of Ferenc Puskas?",[])
337
+ # meta("What is the (rounded down) longitude of the birthplace of Juliane Koepcke?",[])
338
+ # meta("What is the top-level domain of the birthplace of Norodom Sihamoni?",[])
339
+ # meta("What is the 3166-1 numeric code for the birthplace of Gilgamesh?",[])
340
+ # meta("What is the currency in the birthplace of Joel Campbell?",[])
341
+ # meta("What is the currency abbreviation in the birthplace of Antonio Valencia?",[])
342
+ # meta("What is the currency symbol in the birthplace of Marek Hamšík?",[])
343
+ # meta("What is the Japanese name of the birthplace of Hugh Jackman?",[])
344
+ # meta("What is the Spanish name of the birthplace of Frédéric Chopin?",[])
345
+ # meta("What is the Russian name of the birthplace of Confucius?",[])
346
+ # meta("What is the Estonian name of the birthplace of Kofi Annan?",[])
347
+ # meta("What is the Urdu name of the birthplace of Nicki Minaj?",[])
348
+ # meta("What is the calling code of the birthplace of Milla Jovovich?",[])
349
+ # meta("Who was the champion of the Masters Tournament in the year that Bob Dylan was born?",[])
350
+ # meta("Who won the Nobel Prize in Literature in the year Matt Damon was born?",[])
351
+ # meta("Who was the President of the United States when Sting was born?",[])
352
+ meta(
353
+ "What are the latest reviewer opinions on Tesla Full Self Driving Beta version 11.3.4?",
354
+ [],
355
+ debug=True,
356
+ )
357
+ meta("Michael D'Ambrosio Hound Labs", [], debug=True)
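
parse_decomposition above expects the LLM to answer in the <Subquery n> format shown in test_parse_decomp and collects each subquery body up to the next '<' tag. A compressed, standalone sketch of the same parsing idea (not the module's own implementation):

import re

def parse_subqueries(text):
    # collect the text after each "<Subquery n>" header up to the next "<" tag
    subqueries = []
    for m in re.finditer("<Subquery", text):
        body = text[text.index(">", m.start()) + 1:]
        nxt = body.find("<")
        if nxt >= 0:
            body = body[:nxt]
        subqueries.append(body.lstrip("?").strip())
    return subqueries

sample = """<Subquery 1>? What is the birthplace of Hugh Jackman?
<Subquery 2>? What is the Japanese name of the birthplace of Hugh Jackman?"""
print(parse_subqueries(sample))
# ['What is the birthplace of Hugh Jackman?',
#  'What is the Japanese name of the birthplace of Hugh Jackman?']
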
llmsearch/show_site_stats.py ADDED
@@ -0,0 +1,56 @@
1
+ import json
2
+ import sys
3
+
4
+ site_stats = {}  # initialize dictionary of sites used
5
+
6
+ try:
7
+ with open("site_stats.json", "r") as f:
8
+ site_stats = json.loads(f.read())
9
+ except:
10
+ print("Failed to read site_stats.")
11
+
12
+ sites = {}  # initialize dictionary of sites used
13
+ try:
14
+ with open("sites.json", "r") as f:
15
+ sites = json.loads(f.read())
16
+ except:
17
+ print("Failed to read sites.")
18
+
19
+ site_list = []
20
+ for site in site_stats.keys():
21
+ site_list.append(site_stats[site])
22
+
23
+ # sort sites by quality: characters extracted per unit of processing time
24
+ site_list.sort(
25
+ reverse=True,
26
+ key=lambda item: (
27
+ item["chars"] / (max(1000, item["get"] + item["extract"] + item["openai"]))
28
+ ),
29
+ )
30
+
31
+ for site in site_list:
32
+ total_time = max(1000, site["get"] + site["extract"] + site["openai"])
33
+ if "hits" in site.keys():
34
+ hits = site["hits"]
35
+ else:
36
+ hits = 0
37
+ quality = int((site["chars"] * 1000000) / total_time)
38
+ arg = ""
39
+ if len(sys.argv) > 1:
40
+ arg = sys.argv[1]
41
+ print_site = False
42
+ if "new" in arg:
43
+ if quality > 0 and site["name"] not in sites.keys():
44
+ print_site = True
45
+ elif len(sys.argv) == 1 and quality > 0:
46
+ print_site = True
47
+ elif "all" in arg:
48
+ print_site = True
49
+ elif quality > 0:
50
+ print_site = True
51
+ if print_site:
52
+ print(site["name"], hits, site["chars"], quality, end="")
53
+ if quality > 0 and site["name"] not in sites.keys():
54
+ print("*")
55
+ else:
56
+ print()
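
The ranking above boils down to a single quality score per site: characters of useful text divided by the accumulated processing time (get + extract + openai), with the denominator floored at 1000 so sparsely sampled sites do not blow up. A short sketch of that computation with hypothetical stats:

def site_quality(stats):
    # characters of useful text per unit of accumulated processing time
    total_time = max(1000, stats["get"] + stats["extract"] + stats["openai"])
    return int((stats["chars"] * 1000000) / total_time)

example = {"name": "example", "hits": 4, "chars": 5200, "get": 9, "extract": 1200, "openai": 3}
print(site_quality(example))  # 5200 * 1_000_000 / 1212 -> 4290429
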
llmsearch/site_stats.py ADDED
@@ -0,0 +1,124 @@
1
+ import json
2
+ import random
3
+ import traceback
4
+
5
+ from llmsearch import utilityV2 as ut
6
+
7
+
8
+ def findnth(haystack, needle, n):
9
+ parts = haystack.split(needle, n + 1)
10
+ if len(parts) <= n + 1:
11
+ return -1
12
+ return len(haystack) - len(parts[-1]) - len(needle)
13
+
14
+
15
+ def extract_site(url):
16
+ site = ""
17
+ base = findnth(url, "/", 2)
18
+ if base > 2:
19
+ site = url[:base].split(".")
20
+ if len(site) > 1:
21
+ site = site[-2]
22
+ site = site.replace("https://", "")
23
+ site = site.replace("http://", "")
24
+ return site
25
+
26
+
27
+ site_stats = {}  # initialize dictionary of sites used
28
+ stats_loaded = False
29
+ stats_dirty = False
30
+
31
+
32
+ def open_site_stats():
33
+ global site_stats, stats_loaded, stats_dirty
34
+ if stats_loaded:
35
+ return
36
+ try:
37
+ with open("site_stats.json", "r") as f:
38
+ site_stats = json.loads(f.read())
+ stats_loaded = True  # mark as loaded so later calls don't re-read the file and clobber in-memory updates
39
+ except:
40
+ print("Failed to read site_stats.json")
41
+ traceback.print_exc()
42
+
43
+
44
+ def ckpt():
45
+ global site_stats, stats_dirty
46
+ if not stats_dirty:
47
+ return
48
+ try:
49
+ with open("site_stats.json", "w") as ss:
50
+ ss.write(json.dumps(site_stats))
51
+ stats_dirty = False
52
+ except Exception as e:
53
+ print(f"Failed to write site_stats: {str(e)}")
54
+ traceback.print_exc()
55
+
56
+
57
+ def update_site_stats(site, char_cnt, get_time, extract_time, openai_time):
58
+ global site_stats, stats_dirty
59
+ open_site_stats()
60
+ if site not in site_stats:
61
+ site_stats[site] = {
62
+ "name": site,
63
+ "hits": 0,
64
+ "chars": 0,
65
+ "get": 0,
66
+ "extract": 0,
67
+ "openai": 0,
68
+ }
69
+ if "hits" not in site_stats[site]:
70
+ site_stats[site]["hits"] = 0
71
+ site_stats[site]["hits"] = site_stats[site]["hits"] + 1
72
+ site_stats[site]["chars"] = char_cnt + site_stats[site]["chars"]
73
+ site_stats[site]["get"] = get_time + site_stats[site]["get"]
74
+ site_stats[site]["extract"] = extract_time + site_stats[site]["extract"]
75
+ site_stats[site]["openai"] = openai_time + site_stats[site]["openai"]
76
+ stats_dirty = True
77
+ # print("updated", site_stats[site])
78
+
79
+
80
+ def retrieve(site):
81
+ global site_stats
82
+ if site not in site_stats:
83
+ site_stats[site] = {
84
+ "name": site,
85
+ "hits": 0,
86
+ "chars": 0,
87
+ "get": 0,
88
+ "extract": 0,
89
+ "openai": 0,
90
+ }
91
+ return site_stats[site]
92
+
93
+
94
+ def get_next(urls, sample_unknown=False):
95
+ global site_stats
96
+ # retrieve stats for sites in list
97
+ candidates = []
98
+ for url in urls:
99
+ site = extract_site(url)
100
+ candidate = retrieve(site)
101
+ if sample_unknown or (site in ut.sites and ut.sites[site] != 0):
102
+ candidates.append((candidate, url))
103
+ if len(candidates) == 0:
104
+ return []
105
+ if len(candidates) == 1:
106
+ return candidates[0]
107
+ # random or ordered? if random, pick without sorting
108
+ if random.random() > 0.85:
109
+ pick = int(random.random() * len(candidates))
110
+ return candidates[pick]
111
+
112
+ # otherwise sort by quality and pick with a bias toward the best-scoring sites
113
+ candidates.sort(
114
+ reverse=True,
115
+ key=lambda item: (
116
+ (item[0]["chars"] * 1000000)
117
+ / (max(1000, item[0]["get"] + item[0]["extract"] + item[0]["openai"]))
118
+ ),
119
+ )
120
+
121
+ # now pick top from sort
122
+ p = random.random()
123
+ p2 = p * p * p
124
+ return candidates[int(p2 * len(candidates))]
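
get_next balances exploitation and exploration: about 15% of the time it picks a candidate uniformly at random, otherwise it sorts by the quality score and indexes with a cubed uniform draw, which lands on the best-scoring site roughly 40% of the time while still occasionally sampling the tail. A small standalone sketch of that selection bias:

import random

def skewed_pick(candidates):
    if random.random() > 0.85:
        # ~15%: uniform exploration over all candidates
        return candidates[int(random.random() * len(candidates))]
    p = random.random()
    # cubing the draw pushes most picks toward index 0, the best-scoring candidate
    return candidates[int(p ** 3 * len(candidates))]

random.seed(0)
picks = [skewed_pick(list(range(10))) for _ in range(10000)]
print(sum(1 for i in picks if i == 0) / len(picks))  # roughly 0.4 with 10 candidates
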
llmsearch/utilityV2.py ADDED
@@ -0,0 +1,358 @@
1
+ import json
2
+ import linecache
3
+ import logging
4
+ import re
5
+ import traceback
6
+ import tracemalloc
7
+
8
+ import nltk
9
+ import openai
10
+ # from tenacity import (retry,stop_after_attempt,stop_after_delay, wait_random_exponential)
11
+ from tenacity import *
12
+
13
+ # from agents.utils import read_file
14
+ from utils import read_file
15
+
16
+ logger = logging.getLogger("agent_logger")
17
+ openai.api_key = read_file("../keys/gpt_api_key.txt")
18
+ # paid and ad free
19
+ google_key = read_file("../keys/google_search_api_key.txt")
20
+ # cx: The identifier of the Programmable Search Engine.
21
+ google_cx = read_file("../keys/google_cx_api_key.txt")
22
+ GOOGLE = "google"
23
+ USER = "user"
24
+ ASSISTANT = "assistant"
25
+
26
+ MODEL = "gpt-3.5-turbo"
27
+
28
+ sites = {}  # initialize dictionary of sites used
29
+ new_sites = {}  # initialize dictionary of new sites used
30
+ try:
31
+ with open("sites", "r") as f:
32
+ sites = json.loads(f.read())
33
+ except:
34
+ print("Failed to read sites.")
35
+
36
+
37
+ # for experimenting with Vicuna
38
+
39
+
40
+ def display_top(snapshot, key_type="lineno", limit=10):
41
+ snapshot = snapshot.filter_traces(
42
+ (
43
+ tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
44
+ tracemalloc.Filter(False, "<unknown>"),
45
+ )
46
+ )
47
+ top_stats = snapshot.statistics(key_type)
48
+
49
+ logger.info("Top %s lines" % limit)
50
+ for index, stat in enumerate(top_stats[:limit], 1):
51
+ frame = stat.traceback[0]
52
+ logger.info(
53
+ "#%s: %s:%s: %.1f KiB"
54
+ % (index, frame.filename, frame.lineno, stat.size / 1024)
55
+ )
56
+ line = linecache.getline(frame.filename, frame.lineno).strip()
57
+ if line:
58
+ logger.info(" %s" % line)
59
+
60
+ other = top_stats[limit:]
61
+ if other:
62
+ size = sum(stat.size for stat in other)
63
+ logger.info("%s other: %.1f KiB" % (len(other), size / 1024))
64
+ total = sum(stat.size for stat in top_stats)
65
+ logger.info("Total allocated size: %.1f KiB" % (total / 1024))
66
+
67
+
68
+ class turn:
69
+ def __init__(self, role="assistant", message="", tldr="", source="", keywords=None):
70
+ self.role = role
71
+ self.message = message
72
+ self.tldr = tldr
73
+ self.source = source
74
+ self.keywords = keywords if keywords is not None else []  # avoid shared mutable default
75
+
76
+ def __str__(self):
77
+ s = ""
78
+ if self.role is not None and len(self.role) > 0:
79
+ s = s + "r: " + self.role
80
+ if self.message is not None and len(self.message) > 0:
81
+ s = s + " m: " + self.message
82
+ if self.source is not None and len(self.source) > 0:
83
+ s = s + " s: " + self.source
84
+ if self.tldr is not None and len(self.tldr) > 0:
85
+ s = s + "tldr: " + self.tldr
86
+ return s
87
+
88
+ def is_google_turn(self):
89
+ return self.source is not None and self.source == GOOGLE
90
+
91
+ def is_user_turn(self):
92
+ return self.source is not None and self.source == USER
93
+
94
+ def is_assistant_turn(self):
95
+ return self.source is not None and self.source == ASSISTANT
96
+
97
+
98
+ # @retry(wait=wait_random_exponential(min=1, max=2), stop=(stop_after_delay(15) | stop_after_attempt(2)))
99
+ def chatCompletion_with_backoff(**kwargs):
100
+ return openai.ChatCompletion.create(**kwargs)
101
+
102
+
103
+ def ask_gpt(model, gpt_message, max_tokens, temp, top_p):
104
+ completion = None
105
+ try:
106
+ completion = openai.chat.completions.create(
107
+ model=model,
108
+ messages=gpt_message,
109
+ max_tokens=max_tokens,
110
+ temperature=temp,
111
+ top_p=top_p,
112
+ )
113
+ except:
114
+ traceback.print_exc()
115
+ if completion is not None:
116
+ response = completion.choices[0].message.content.lstrip(" ,:.")
117
+ logger.info(response)
118
+ return response
119
+ else:
120
+ logger.info("no response")
121
+ return None
122
+
123
+
124
+ def ask_gpt_with_retries(model, gpt_message, tokens, temp, timeout, tries):
125
+ retryer = Retrying(stop=(stop_after_delay(timeout) | stop_after_attempt(tries)))
126
+ r = retryer(
127
+ ask_gpt,
128
+ model=model,
129
+ gpt_message=gpt_message,
130
+ max_tokens=tokens,
131
+ temp=temp,
132
+ top_p=1,
133
+ )
134
+ return r
135
+
136
+
137
+ INFORMATION_QUERY = "information query"
138
+ INTENTS = []
139
+
140
+
141
+ def find_intent(response):
142
+ global INTENTS, INFORMATION_QUERY
143
+ for intent in INTENTS:
144
+ if intent in response.lower():
145
+ return intent
146
+ return INFORMATION_QUERY
147
+
148
+
149
+ def find_query(response):
150
+ search_query_phrase = response
151
+ phrase_index = response.lower().find("phrase:")
152
+ quoted_strings = []
153
+ if phrase_index < 0:
154
+ phrase_index = 0
155
+ else:
156
+ phrase_index += len("phrase:")
157
+ quoted_strings = re.findall(r'"([^"]*)"', search_query_phrase[phrase_index:])
158
+ if len(quoted_strings) == 0:
159
+ quoted_strings = re.findall(r"'([^']*)'", search_query_phrase[phrase_index:])
160
+ if len(quoted_strings) > 0:
161
+ # logger.info(quoted_strings)
162
+ phrase = quoted_strings[0]
163
+ return phrase, response[response.find(phrase) + len(phrase) + 1:]
164
+ else:
165
+ logger.info("no quoted text, returning original query string: %s", response)
166
+ # logger.info(response)
167
+ return "", response
168
+
169
+
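find_query assumes the model reply follows the `Phrase: "..."` format requested by keyword_prompt further down in this file; a small illustration (the reply text is made up, not captured output):

    reply = 'Phrase: "forspoken final fantasy vii fans review"\nKeywords: Forspoken, Final Fantasy VII'
    phrase, rest = find_query(reply)
    # phrase -> 'forspoken final fantasy vii fans review'
    # rest   -> '\nKeywords: Forspoken, Final Fantasy VII', which find_keywords below scans for "keyword"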
170
+ def find_keywords(response, query_phrase, orig_phrase):
171
+ # keywords includes those suggested by gpt and any remaining words from query phrase len > 4
172
+ keywords = []
173
+ quoted_strings = re.findall(r'"([^"]*)"', query_phrase)
174
+ quoted_strings2 = re.findall(r'"([^"]*)"', orig_phrase)
175
+ remainder = query_phrase
176
+ k_index = response.lower().find("keyword")
177
+ if k_index > 0:
178
+ keyword_string = response[k_index + len("keyword"):]
179
+ nm_index = keyword_string.find("Named-Entities:")
180
+ if nm_index > 0:
181
+ keyword_string = keyword_string[:nm_index].rstrip()
182
+ # logger.info(keyword_string)
183
+ c_index = keyword_string.find(":")
184
+ keyword_string = keyword_string[c_index + 1:]
185
+ candidates = keyword_string.split(",")
186
+ for keyword in candidates:
187
+ keyword = keyword.strip(":,.\t\n").lstrip(" ")
188
+ if len(keyword) > 3 or keyword[0:1].isupper():
189
+ keywords.append(keyword)
190
+ return keywords
191
+ return []
192
+
193
+
194
+ # not clear why this apparent compile-time error never fails at runtime, but it doesn't; on the other hand, trying to
195
+ # fix this creates an infinite import loop, so don't touch this.
196
+ def split_interaction(interaction):
197
+ qs = interaction.find(prefix)
198
+ rs = interaction.find(suffix)
199
+ if qs >= 0 and rs >= 0:
200
+ query = interaction[len(prefix): rs].lstrip()
201
+ response = interaction[rs + len(suffix):].lstrip()
202
+ return query, response
203
+ else:
204
+ logger.info("can't parse: %s", interaction)
205
+ return "", ""
206
+
207
+
208
+ def findnth(haystack, needle, n):
209
+ parts = haystack.split(needle, n + 1)
210
+ if len(parts) <= n + 1:
211
+ return -1
212
+ return len(haystack) - len(parts[-1]) - len(needle)
213
+
214
+
215
+ def extract_site(url):
216
+ site = ""
217
+ base = findnth(url, "/", 2)
218
+ if base > 2:
219
+ site = url[:base].split(".")
220
+ if len(site) > 1:
221
+ site = site[-2]
222
+ site = site.replace("https://", "")
223
+ site = site.replace("http://", "")
224
+ return site
225
+
226
+
227
+ def extract_domain(url):
228
+ domain = ""
229
+ base = findnth(url, "/", 2)
230
+ if base > 2:
231
+ domain = url[:base].split(".")
232
+ if len(domain) > 1:
233
+ domain = domain[-2] + "." + domain[-1]
234
+ domain = domain.replace("https://", "")
235
+ domain = domain.replace("http://", "")
236
+ return domain
237
+
238
+
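For orientation, what the two URL helpers above return for a typical link (standalone check, not part of the diff):

    print(extract_site("https://www.example.co/some/article"))    # -> "example"
    print(extract_domain("https://www.example.co/some/article"))  # -> "example.co"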
239
+ def part_of_keyword(word, keywords):
240
+ for keyword in keywords:
241
+ if word in keyword:
242
+ return True
243
+ return False
244
+
245
+
246
+ keyword_prompt = 'Perform two tasks on the following text. First, rewrite the <text> as an effective google search phrase. Second, analyze text and list keywords and named-entities found. Return the result as: Phrase: "<google search phrase>"\nKeywords: <list of keywords>\nNamed-Entities: <list of Named-Entities>'
247
+
248
+
249
+ def get_search_phrase_and_keywords(query_string, chat_history):
250
+ gpt_message = [
251
+ {"role": "user", "content": keyword_prompt},
252
+ {"role": "user", "content": "Text\n" + query_string},
253
+ {"role": "assistant", "content": "Phrase:"},
254
+ ]
255
+ response_text = ""
256
+ completion = None
257
+ # for role in gpt_message:
258
+ # logger.info(role)
259
+ # logger.info()
260
+ response_text = ask_gpt_with_retries(
261
+ "gpt-3.5-turbo", gpt_message, tokens=150, temp=0.3, timeout=6, tries=2
262
+ )
263
+ logger.info(response_text)
264
+ # useful function to make search query more optimal, for future explainability studies
265
+ # consider returning query phrase and keywords to user
266
+ query_phrase, remainder = find_query(response_text)
267
+ logger.info("PHRASE:" + query_phrase)
268
+ # logger.info(remainder)
269
+ keywords = find_keywords(remainder, query_phrase, query_string)
270
+ logger.info("KEYWORDS: " + ', '.join(keywords))
271
+ return query_phrase, keywords
272
+
273
+
274
+ def reform(elements):
275
+ # reformulates text extracted from a webpage by unstructured.partition_html into larger keyword-rankable chunks
276
+ texts = (
277
+ []
278
+ ) # a list of text_strings, each of at most *max* chars, separated on '\n' when splitting an element is needed
279
+ paragraphs = []
280
+ total_elem_len = 0
281
+ for element in elements:
282
+ text = str(element)
283
+ total_elem_len += len(text)
284
+ if len(text) < 4:
285
+ continue
286
+ elif len(text) < 500:
287
+ texts.append(text)
288
+ else:
289
+ subtexts = text.split("\n")
290
+ for subtext in subtexts:
291
+ if len(subtext) < 500:
292
+ texts.append(subtext)
293
+ else:
294
+ texts.extend(nltk.sent_tokenize(subtext))
295
+
296
+ # now reassemble shorter texts into chunks
297
+ paragraph = ""
298
+ total_pp_len = 0
299
+ for text in texts:
300
+ if len(text) + len(paragraph) < 500:
301
+ paragraph += " " + text
302
+ else:
303
+ if len(paragraph) > 0: # start a new paragraph
304
+ paragraphs.append(paragraph)
305
+ paragraph = ""
306
+ paragraph += text
307
+ if len(paragraph) > 0:
308
+ paragraphs.append(paragraph + ".\n")
309
+ # logger.info(f'\n***** reform elements in {len(elements)}, paragraphs out {len(paragraphs)}')
310
+ total_pp_len = 0
311
+ for paragraph in paragraphs:
312
+ total_pp_len += len(paragraph)
313
+ if total_pp_len > 1.2 * total_elem_len:
314
+ logger.info(
315
+ f"******** reform out > reform in. out: {total_pp_len}, in: {total_elem_len}"
316
+ )
317
+ return paragraphs
318
+
319
+
320
+ def get_actions(text):
321
+ # look for actions in response
322
+ action_indecies = re.finditer("Action:", text)  # Action: [search, ask] (query)
323
+ actions = []
324
+ editted_response = text
325
+ for action_index in action_indecies:
326
+ action = text[action_index.span()[1]:]
327
+ agent = None
328
+ query = None
329
+ query_start = action.find("(")
330
+ if query_start > 0:
331
+ agent = action[:query_start].strip()
332
+ query_end = action[query_start + 1:].find(")")
333
+ if query_end > 0:
334
+ query = action[query_start + 1: query_start + 1 + query_end]
335
+ action = text[
336
+ action_index.start(): action_index.span()[1]
337
+ + action_index.start()
338
+ + query_start
339
+ + query_end
340
+ + 2
341
+ ]
342
+ if agent is None or query is None:
343
+ logger.info(
344
+ "can't parse action, skipping: %s",
345
+ text[action_index.start(): action_index.start() + 48],
346
+ )
347
+ continue
348
+ actions.append([agent, query, action])
349
+ editted_response = editted_response.replace(action, "")
350
+ return actions
351
+
352
+
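get_actions expects directives of the form `Action: agent(query)` embedded in model output; a hedged illustration with a made-up directive (not part of the committed file):

    acts = get_actions("Action: search(latest Forspoken reviews)")
    # -> [['search', 'latest Forspoken reviews', 'Action: search(latest Forspoken reviews)']]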
353
+ if __name__ == "__main__":
354
+ get_search_phrase_and_keywords(
355
+ "Would I like the video game Forspoken, given that I like Final Fantasy VII?",
356
+ [],
357
+ )
358
+ # logger.info(query_vicuna("what is 5 * 3?"))
read_write_index.py ADDED
@@ -0,0 +1,22 @@
1
+ import logging
2
+ import os
3
+
4
+ from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, load_index_from_storage
5
+
6
+ logger = logging.getLogger(__name__)
7
+ DOCUMENT_PATH = '../data'
8
+
9
+
10
+ # remember to delete stored vectors when new documents are added to the data so the storage is recreated
11
+ def read_write_index(path):
12
+ if not os.path.exists(path):
13
+ documents = SimpleDirectoryReader(DOCUMENT_PATH).load_data()
14
+ logger.info(f'Indexing documents in {DOCUMENT_PATH}...')
15
+ index = VectorStoreIndex.from_documents(documents)
16
+ index.storage_context.persist(persist_dir=path)
17
+ logger.info(f'{len(documents)} documents indexed.')
18
+ else:
19
+ logger.info(f'Loading index from {path}...')
20
+ storage_context = StorageContext.from_defaults(persist_dir=path)
21
+ index = load_index_from_storage(storage_context)
22
+ return index
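Typical usage of read_write_index, sketched here for context; the persist path and the query string are illustrative, not taken from the repo:

    # builds the index from ../data on first run, loads the persisted copy afterwards
    index = read_write_index("../vector_store")
    query_engine = index.as_query_engine()
    print(query_engine.query("What topics do the indexed documents cover?"))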
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ gradio==4.38.1
2
+ llama-index==0.10.29
3
+ llama-index-core==0.10.29
4
+ # previous versions break the ChatMessage class import
5
+ llama-index-llms-mistralai==0.1.6
6
+ llama-index-embeddings-mistralai
7
+ llama-index-embeddings-openai
8
+ llama-index-llms-openai
9
+ # needed for simpledirectoryreader to work
10
+ llama-index-readers-file
11
+ selenium
12
+ unstructured
13
+ requests
utils.py ADDED
@@ -0,0 +1,5 @@
1
+ def read_file(path):
2
+ txt = open(path, "r")
3
+ file = txt.read()
4
+ txt.close()
5
+ return file
web_search.py ADDED
@@ -0,0 +1,290 @@
1
+ import concurrent.futures
2
+ import copy
3
+ import json
4
+ import logging
5
+ import time
6
+ import traceback
7
+ import urllib.parse as en
8
+ import warnings
9
+ from itertools import zip_longest
10
+
11
+ import requests
12
+ import selenium.common.exceptions
13
+ from selenium import webdriver
14
+ from selenium.webdriver.chrome.options import Options
15
+ from unstructured.partition.html import partition_html
16
+
17
+ from llmsearch import meta as mt, site_stats
18
+ # this import style works in pycharm
19
+ from llmsearch import utilityV2 as ut
20
+
21
+ from urllib.request import urlopen
22
+ # this import style works on server + vs code
23
+ # import utils
24
+ # from llmsearch import google_search_concurrent as gs
25
+ # from llmsearch import meta as mt
26
+ # from llmsearch import utilityV2 as ut
27
+
28
+ logger = logging.getLogger("agent_logger")
29
+
30
+
31
+ # todo drop blocked pages > see og llmsearch code
32
+ # todo: use the condense-question chat-mode query instead of the new gpt query
33
+
34
+ def search(msg, query_phrase):
35
+ try:
36
+ # this call extracts keywords from the statement and rewrites it into a better search phrase with gpt3.5
37
+ # query_phrase, keywords = ut.get_search_phrase_and_keywords(msg, [])
38
+ mt.clear()
39
+ google_text = ""
40
+ try:
41
+ logger.info(f"asking google {msg}; rephrased: {query_phrase}")
42
+ google_text, urls_all, index, urls_used, tried_index, urls_tried = search_google(msg, query_phrase)
43
+ except:
44
+ traceback.print_exc()
45
+
46
+ logger.info("\n\nFinal response: ")
47
+
48
+ for item in google_text:
49
+ logger.info(
50
+ f"\n##############################################################################################\nSource: {item['source']}"
51
+ )
52
+ logger.info(f"{item['text']}")
53
+ logger.info(f"URL: {item['url']}")
54
+ return google_text
55
+ except KeyboardInterrupt:
56
+ traceback.print_exc()
57
+ raise KeyboardInterrupt
58
+ except:
59
+ traceback.print_exc()
60
+ return ""
61
+
62
+
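search() returns the list assembled in process_urls below: one dict per successfully scraped page, with "source", "url" and "text" keys. A sketch of the shape (values are made up):

    results = search("Would I like Forspoken?", "forspoken review final fantasy vii fans")
    # results ≈ [
    #     {"source": "example.com", "url": "https://www.example.com/forspoken-review", "text": "…extracted page text…"},
    #     ...
    # ]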
63
+ # Define a function to make a single URL request and process the response
64
+ def process_url(query_phrase, url, timeout):
65
+ start_time = time.time()
66
+ site = ut.extract_site(url)
67
+ result = ""
+ response = ""  # ensure defined for the final length log even if dr.get() fails
68
+ try:
69
+ with warnings.catch_warnings():
70
+ warnings.simplefilter("ignore")
71
+ options = Options()
72
+ options.page_load_strategy = "eager"
73
+ options.add_argument("--headless")
74
+ result = ""
75
+ with webdriver.Chrome(options=options) as dr:
76
+ logger.info(f"*****setting page load timeout {timeout}")
77
+ dr.set_page_load_timeout(timeout)
78
+ try:
79
+ dr.get(url)
80
+ response = dr.page_source
81
+ result = response_text_extract(url=url, response=response)
82
+ except selenium.common.exceptions.TimeoutException:
83
+ return "", url
84
+ except Exception:
85
+ traceback.print_exc()
86
+ logger.info(f"{site} err")
87
+ pass
88
+ logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
89
+ return result, url
90
+
91
+
92
+ def process_urls(query_phrase, urls):
93
+ # Create a ThreadPoolExecutor with 5 worker threads
94
+ response = []
95
+ logger.info("entering process urls")
96
+ full_text = ""
97
+ used_index = 0
98
+ urls_used = ["" for i in range(30)]
99
+ tried_index = 0
100
+ urls_tried = ["" for i in range(30)]
101
+ start_time = time.time()
102
+ in_process = []
103
+ processed = []
104
+ google_futures = []
105
+
106
+ with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
107
+ # initialize scan of google urls
108
+ while True:
109
+ try:
110
+ while (len(urls) > 0
111
+ # no sense starting if not much time left
112
+ and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
113
+ ):
114
+ recommendation = site_stats.get_next(urls, sample_unknown=True)
115
+ # set timeout so we don't wait for a slow site forever
116
+ timeout = 12 - int(time.time() - start_time)
117
+ url = recommendation[1]
118
+ future = executor.submit(process_url, query_phrase, url, timeout)
119
+ google_futures.append(future)
120
+ in_process.append(future)
121
+ urls_tried[tried_index] = url
122
+ tried_index += 1
123
+ urls.remove(url)
124
+ logger.info(f"queued {ut.extract_site(url)}, {timeout}")
125
+ # Process the responses as they arrive
126
+ for future in in_process:
127
+ if future.done():
128
+ result, url = future.result()
129
+ processed.append(future)
130
+ in_process.remove(future)
131
+ if len(result) > 0:
132
+ urls_used[used_index] = url
133
+ used_index += 1
134
+ logger.info(
135
+ f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
136
+ )
137
+ if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
138
+ response.append(
139
+ {
140
+ "source": ut.extract_domain(url),
141
+ "url": url,
142
+ "text": result,
143
+ }
144
+ )
145
+
146
+ if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
147
+ executor.shutdown(wait=False)
148
+ logger.info(
149
+ f"\n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
150
+ )
151
+ return response, used_index, urls_used, tried_index, urls_tried
152
+ time.sleep(0.5)
153
+ except:
154
+ traceback.print_exc()
155
+ executor.shutdown(wait=False)
156
+ logger.info(
157
+ f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
158
+ )
159
+ return response, used_index, urls_used, tried_index, urls_tried
160
+
161
+
162
+ def extract_subtext(text):
163
+ return ut.reform(text)
164
+
165
+
166
+ def request_google(query_phrase):
167
+ logger.info(f"***** search {query_phrase}")
168
+ sort = "&sort=date-sdate:d:w"
169
+ if "today" in query_phrase or "latest" in query_phrase:
170
+ sort = "&sort=date-sdate:d:s"
171
+ # logger.info(f"search for: {query_phrase}")
172
+ google_query = en.quote(query_phrase)
173
+ response = []
174
+ try:
175
+ start_wall_time = time.time()
176
+ url = (
177
+ "https://www.googleapis.com/customsearch/v1?key="
178
+ + ut.google_key
179
+ + "&cx="
180
+ + ut.google_cx
181
+ + "&num=4"
182
+ + sort
183
+ + "&q="
184
+ + google_query
185
+ )
186
+ response = requests.get(url)
187
+ response_json = json.loads(response.text)
188
+ logger.info(f"***** google search {int((time.time() - start_wall_time) * 10) / 10} sec")
189
+ except:
190
+ traceback.print_exc()
191
+ return []
192
+
193
+ # see if we got anything useful from google
194
+ if "items" not in response_json.keys():
195
+ logger.info("no return from google ... %s %s", response, response_json.keys())
196
+ return []
197
+
198
+ urls = []
199
+ for i in range(len(response_json["items"])):
200
+ url = response_json["items"][i]["link"].lstrip().rstrip()
201
+ site = ut.extract_site(url)
202
+ if site not in ut.sites or ut.sites[site] == 1:
203
+ # don't use these sources (reddit because it blocks bots)
204
+ if "reddit" not in url and "youtube" not in url and "facebook" not in url:
205
+ urls.append(url)
206
+ return urls
207
+
208
+
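For reference, the Custom Search request that request_google assembles looks roughly like this for the phrase `best ramen tokyo` (key and cx redacted; `&num=4` caps results at four and the default sort weights recent pages):

    https://www.googleapis.com/customsearch/v1?key=<KEY>&cx=<CX>&num=4&sort=date-sdate:d:w&q=best%20ramen%20tokyo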
209
+ def response_text_extract(url, response):
210
+ extract_text = ""
211
+ if url.endswith("pdf"):
212
+ pass
213
+ else:
214
+ if response is not None:
215
+ elements = partition_html(text=response)
216
+ str_elements = []
217
+ logger.info('\n***** elements')
218
+ for e in elements:
219
+ stre = str(e).replace("\xa0", " ")  # normalise non-breaking spaces
220
+ str_elements.append(stre)
221
+ extract_text = ''.join(extract_subtext(str_elements))
222
+ logger.info(
223
+ f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
224
+ )
225
+
226
+ if len(extract_text.strip()) < 8:
227
+ return ""
228
+ else:
229
+ return extract_text
230
+
231
+
232
+ def extract_items_from_numbered_list(text):
233
+ items = ""
234
+ elements = text.split("\n")
235
+ for candidate in elements:
236
+ candidate = candidate.lstrip(". \t")
237
+ if len(candidate) > 4 and candidate[0].isdigit():
238
+ candidate = candidate[1:].lstrip(". ")
239
+ if (
240
+ len(candidate) > 4 and candidate[0].isdigit()
241
+ ): # strip second digit if more than 10 items
242
+ candidate = candidate[1:].lstrip(". ")
243
+ logger.info("E {}".format(candidate))
244
+ items += candidate + " "
245
+ return items
246
+
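A quick standalone check of the numbered-list flattening above (input is made up):

    print(extract_items_from_numbered_list("1. apples\n2. pears\n10. plums"))
    # -> "apples pears plums "  (leading digits stripped; a second digit is stripped for items past 9)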
247
+
248
+ def search_google(original_query, query_phrase):
249
+ all_urls = []
250
+ urls_used = []
251
+ urls_tried = []
252
+ index = 0
253
+ tried_index = 0
254
+ full_text = ""
255
+
256
+ try: # query google for recent info
257
+ extract_query = ""
258
+ orig_phrase_urls = []
259
+ if len(original_query) > 0:
260
+ orig_phrase_urls = request_google(original_query[: min(len(original_query), 128)])
261
+ extract_query = original_query[: min(len(original_query), 128)]
262
+ gpt_phrase_urls = []
263
+ if len(query_phrase) > 0:
264
+ gpt_phrase_urls = request_google(query_phrase)
265
+ extract_query = (
266
+ query_phrase # prefer more succinct query phrase if available
267
+ )
268
+ if len(orig_phrase_urls) == 0 and len(gpt_phrase_urls) == 0:
269
+ return "", [], 0, [""], 0, [""]
270
+
271
+ for url in orig_phrase_urls:
272
+ if url in gpt_phrase_urls:
273
+ gpt_phrase_urls.remove(url)
274
+
275
+ # interleave both lists now that duplicates are removed
276
+ urls = [
277
+ val
278
+ for tup in zip_longest(orig_phrase_urls, gpt_phrase_urls)
279
+ for val in tup
280
+ if val is not None
281
+ ]
282
+ all_urls = copy.deepcopy(urls)
283
+ # initialize scan of google urls
284
+ start_wall_time = time.time()
285
+ full_text, index, urls_used, tried_index, urls_tried = process_urls(extract_query, all_urls)
286
+ logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
287
+ logger.info("return from url processing")
288
+ except:
289
+ traceback.print_exc()
290
+ return full_text, all_urls, index, urls_used, tried_index, urls_tried