arabellastrange committed · Commit e750b39 · 1 parent: 96166e4

tidying files

Files changed:
- llmsearch/google_search_concurrent.py +0 -698
- llmsearch/meta.py +0 -357
- web_search.py +1 -2
llmsearch/google_search_concurrent.py
DELETED
@@ -1,698 +0,0 @@
import concurrent.futures
import copy
import json
import logging
import sys
import time
# from PyPDF2 import PdfReader
import traceback
import urllib.parse as en
import warnings
from datetime import date
from itertools import zip_longest

import nltk
import requests
import selenium.common.exceptions
import wordfreq as wf
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from unstructured.partition.html import partition_html

from llmsearch import site_stats
from llmsearch import utilityV2 as ut

# from llmsearch import site_stats
# from llmsearch import utilityV2 as ut

logger = logging.getLogger("agent_logger")

today = " as of " + date.today().strftime("%b-%d-%Y") + "\n\n"

suffix = "\nA: "
client = "\nQ: "

QUICK_SEARCH = "quick"
NORMAL_SEARCH = "moderate"
DEEP_SEARCH = "deep"

# system_prime = {
#     "role": "system",
#     "content": "You analyze Text with respect to Query and list any relevant information found, including direct quotes from the text, and detailed samples or examples in the text.",
# }
priming_1 = {"role": "user", "content": "Query:\n"}


# priming_2 = {
#     "role": "user",
#     "content": "List relevant information in the provided text, including direct quotes from the text. If none, respond 'no information'.\nText:\n",
# }

def process_url_mod(query_phrase, url, timeout):
    start_time = time.time()
    site = ut.extract_site(url)
    result = ""
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            options = Options()
            options.page_load_strategy = "eager"
            options.add_argument("--headless")
            result = ""
            with webdriver.Chrome(options=options) as dr:
                logger.info(f"*****setting page load timeout {timeout}")
                dr.set_page_load_timeout(timeout)
                try:
                    dr.get(url)
                    response = dr.page_source
                    result = response_text_extract_mod(url, response)
                except selenium.common.exceptions.TimeoutException:
                    return "", url
    except Exception:
        traceback.print_exc()
        logger.info(f"{site} err")
        pass
    logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
    return result, url


# Define a function to make a single URL request and process the response
def process_url(query_phrase, keywords, keyword_weights, url, timeout):
    start_time = time.time()
    site = ut.extract_site(url)
    result = ""
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            options = Options()
            options.page_load_strategy = "eager"
            options.add_argument("--headless")
            result = ""
            with webdriver.Chrome(options=options) as dr:
                logger.info(f"*****setting page load timeout {timeout}")
                dr.set_page_load_timeout(timeout)
                try:
                    dr.get(url)
                    response = dr.page_source
                    result = response_text_extract(
                        query_phrase,
                        keywords,
                        keyword_weights,
                        url,
                        response,
                        int(time.time() - start_time),
                    )
                except selenium.common.exceptions.TimeoutException:
                    return "", url
    except Exception:
        traceback.print_exc()
        logger.info(f"{site} err")
        pass
    # logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time()-start_time)*1000)} ms")
    return result, url


def process_urls_mod(query_phrase, urls):
    start_time = time.time()

    response = []
    logger.info("entering process urls")
    full_text = ""
    used_index = 0
    urls_used = ["" for i in range(30)]
    tried_index = 0
    urls_tried = ["" for i in range(30)]
    in_process = []
    processed = []
    google_futures = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor:
        # initialize scan of google urls
        while True:
            try:
                while len(urls) > 0:
                    timeout = 12 - int(time.time() - start_time)
                    recommendation = site_stats.get_next(
                        urls, sample_unknown=True
                    )
                    url = recommendation[1]
                    future = executor.submit(process_url_mod, query_phrase=query_phrase, url=url, timeout=timeout)
                    google_futures.append(future)
                    in_process.append(future)
                    urls_tried[tried_index] = url
                    tried_index += 1
                    urls.remove(url)
                    logger.info(f"queued {ut.extract_site(url)}, {timeout}")

                for future in in_process:
                    if future.done():
                        result, url = future.result()
                        processed.append(future)
                        in_process.remove(future)
                        if len(result) > 0:
                            urls_used[used_index] = url
                            used_index += 1
                            result = result.replace(". .", ".")
                            logger.info(
                                f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
                            )
                            response.append(
                                {
                                    "source": ut.extract_domain(url),
                                    "url": url,
                                    "text": result,
                                }
                            )
                if time.time() - start_time > 28:
                    executor.shutdown(wait=False)
                    logger.info(
                        f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
                    )
                    return response, used_index, urls_used, tried_index, urls_tried
                time.sleep(0.5)
            except:
                traceback.print_exc()
    executor.shutdown(wait=False)
    logger.info(
        f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
    )
    return response, index, urls_used, tried_index, urls_tried


def process_urls(query_phrase, keywords, keyword_weights, urls, search_level):
    # Create a ThreadPoolExecutor with 5 worker threads
    response = []
    logger.info("entering process urls")
    start_time = time.time()
    full_text = ""
    used_index = 0
    urls_used = ["" for i in range(30)]
    tried_index = 0
    urls_tried = ["" for i in range(30)]
    start_time = time.time()
    in_process = []
    processed = []
    google_futures = []
    off_whitelist = False

    with concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor:
        # initialize scan of google urls
        while True:
            try:
                while (
                    len(urls) > 0
                    # no sense starting if not much time left
                    and (
                        (
                            search_level == DEEP_SEARCH
                            and len(full_text) < 9600
                            and len(in_process) < 16
                            and time.time() - start_time < 14
                        )
                        or (
                            search_level == NORMAL_SEARCH
                            and len(full_text) < 6400
                            and len(in_process) < 14
                            and time.time() - start_time < 12
                        )
                        or (
                            search_level == QUICK_SEARCH
                            and len(full_text) < 4800
                            and len(in_process) < 10
                            and time.time() - start_time < 8
                        )
                    )
                ):
                    recommendation = site_stats.get_next(
                        urls, sample_unknown=off_whitelist
                    )
                    if recommendation is None or len(recommendation) == 0:
                        off_whitelist = True
                    else:
                        # set timeout so we don't wait for a slow site forever
                        timeout = 12 - int(time.time() - start_time)
                        if search_level == NORMAL_SEARCH:
                            timeout = timeout + 4
                        url = recommendation[1]
                        future = executor.submit(
                            process_url,
                            query_phrase,
                            keywords,
                            keyword_weights,
                            url,
                            timeout,
                        )
                        # remaining_time = start_time+18-time.time()
                        # future.exception(remaining_time)
                        google_futures.append(future)
                        in_process.append(future)
                        urls_tried[tried_index] = url
                        tried_index += 1
                        urls.remove(url)
                        logger.info(f"queued {ut.extract_site(url)}, {timeout}")
                # Process the responses as they arrive
                for future in in_process:
                    if future.done():
                        result, url = future.result()
                        processed.append(future)
                        in_process.remove(future)
                        if len(result) > 0:
                            urls_used[used_index] = url
                            used_index += 1
                            result = result.replace(". .", ".")
                            logger.info(
                                f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
                            )
                            site = ut.extract_site(url)
                            domain = ut.extract_domain(url)
                            if domain.endswith("gov"):
                                credibility = "Official Source"
                            elif site in ut.sites.keys():
                                if ut.sites[site] > 0:
                                    credibility = "Whitelisted Source"
                                elif ut.sites[site] == 0:
                                    credibility = "Blacklisted Source"
                            else:
                                credibility = "Third-Party Source"

                            response.append(
                                {
                                    "source": ut.extract_domain(url),
                                    "url": url,
                                    "credibility": credibility,
                                    "text": result,
                                }
                            )

                # openai seems to timeout a plugin at about 30 secs, and there is pbly 3-4 sec overhead
                if (
                    (len(urls) == 0 and len(in_process) == 0)
                    or (
                        search_level == DEEP_SEARCH
                        and (len(full_text) > 9600)
                        or time.time() - start_time > 42
                    )
                    or (
                        search_level == NORMAL_SEARCH
                        and (len(full_text) > 6400)
                        or time.time() - start_time > 32
                    )
                    or (
                        search_level == QUICK_SEARCH
                        and (len(full_text) > 4800)
                        or time.time() - start_time > 28
                    )
                ):
                    executor.shutdown(wait=False)
                    logger.info(
                        f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
                    )
                    return response, used_index, urls_used, tried_index, urls_tried
                time.sleep(0.5)
            except:
                traceback.print_exc()
    executor.shutdown(wait=False)
    logger.info(
        f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
    )
    return response, index, urls_used, tried_index, urls_tried


def extract_subtext(text, query_phrase, keywords, keyword_weights):
    ### maybe we should score based on paragraphs, not lines?
    sentences = ut.reform(text)
    # logger.info('***** sentences from reform')
    # for sentence in sentences:
    #     logger.info(sentence)
    sentence_weights = {}
    final_text = ""
    for sentence in sentences:
        sentence_weights[sentence] = 0
        for keyword in keywords:
            if keyword in sentence or keyword.lower() in sentence:
                if keyword in keyword_weights.keys():
                    sentence_weights[sentence] += keyword_weights[keyword]

    # now pick out sentences starting with those with the most keywords
    max_sentence_weight = 0
    for keyword in keyword_weights.keys():
        max_sentence_weight += keyword_weights[keyword]
    # logger.info(f'******* max sentence weight {max_sentence_weight}')
    for i in range(max_sentence_weight, 1, -1):
        if len(final_text) > 6000 and i < max(
            1, int(max_sentence_weight / 4)
        ):  # make sure we don't miss any super-important text
            return final_text
        for sentence in sentences:
            if len(final_text) + len(sentence) > 6001 and i < max(
                1, int(max_sentence_weight / 4)
            ):
                continue
            if sentence_weights[sentence] == i:
                final_text += sentence
    # logger.info("relevant text", final_text)
    # logger.info("keyword extract length:", len(final_text))  # , end='.. ')

    return final_text


def search(query_phrase):
    logger.info(f"***** search {query_phrase}")
    sort = "&sort=date-sdate:d:w"
    if "today" in query_phrase or "latest" in query_phrase:
        sort = "&sort=date-sdate:d:s"
    # logger.info(f"search for: {query_phrase}")
    google_query = en.quote(query_phrase)
    response = []
    try:
        start_wall_time = time.time()
        url = (
            "https://www.googleapis.com/customsearch/v1?key="
            + ut.google_key
            + "&cx="
            + ut.google_cx
            # was ten but want to reduce search time
            + "&num=3"
            + sort
            + "&q="
            + google_query
        )
        response = requests.get(url)
        response_json = json.loads(response.text)
        logger.info(f"***** google search {int((time.time() - start_wall_time) * 10) / 10} sec")
    except:
        traceback.print_exc()
        return []

    # see if we got anything useful from google
    if "items" not in response_json.keys():
        logger.info("no return from google ...", response, response_json.keys())
        # logger.info(google_query)
        return []

    # first try whitelist sites
    urls = []
    for i in range(len(response_json["items"])):
        url = response_json["items"][i]["link"].lstrip().rstrip()
        site = ut.extract_site(url)
        if site not in ut.sites or ut.sites[site] == 1:
            urls.append(url)
    return urls


def log_url_process(site, reason, raw_text, extract_text, gpt_text):
    return


"""
# to record detailed logs of url processing unquote this function
def log_url_process(site, reason, raw_text, extract_text, gpt_text):
    if len(raw_text) == 0 and len(extract_text)==0 and len(gpt_text) ==0:
        return
    try:
        with open('google_log.txt', 'a') as lg:
            lg.write('\n\n*************'+reason.upper()+'***********\n')
            lg.write('*****************'+site+' RAW*************\n')
            lg.write(raw_text)
            lg.write('\n******************extract****************\n')
            lg.write(extract_text)
            lg.write('\n********************gpt******************\n')
            lg.write(gpt_text)
    except Exception:
        traceback.print_exc()
"""


def response_text_extract_mod(url, response):
    extract_text = ""
    if url.endswith("pdf"):
        pass
    else:
        elements = partition_html(text=response)
        str_elements = []
        for e in elements:
            stre = str(e).replace(" ", " ")
            str_elements.append(stre)
        extract_text = ut.reform(str_elements)
        logger.info(
            f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
        )
    if len(''.join(extract_text).strip()) < 8:
        return ""
    return extract_text


def response_text_extract(
    query_phrase, keywords, keyword_weights, url, response, get_time
):
    curr = time.time()
    text = ""
    extract_text = ""
    site = ut.extract_site(url)

    if url.endswith("pdf"):
        pass
    else:
        elements = partition_html(text=response)
        str_elements = []
        # logger.info('\n***** elements')
        for e in elements:
            stre = str(e).replace(" ", " ")
            str_elements.append(stre)
        extract_text = extract_subtext(
            str_elements, query_phrase, keywords, keyword_weights
        )
    # logger.info('\n************ unstructured **********')
    logger.info(
        f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
    )
    url_text = text  # save for final stats
    new_curr = time.time()
    extract_time = int((new_curr - curr) * 1000000)
    if len(extract_text.strip()) < 8:
        return ""

    # now ask openai to extract answer
    response_text = ""
    curr = new_curr
    extract_text = extract_text[:10000]  # make sure we don't run over token limit
    gpt_tldr_message = [
        {
            "role": "user",
            "content": "Given:\n" + extract_text + "\n\nQuery:\n" + query_phrase,
        }
    ]
    start_wall_time = time.time()
    t_out = 12 - get_time
    # logger.info(f'****** spawning page get with timeout {t_out}')
    google_tldr = ut.ask_gpt_with_retries(
        ut.MODEL, gpt_tldr_message, tokens=300, temp=0.3, timeout=t_out, tries=1
    )
    openai_time = int((time.time() - start_wall_time) * 10) / 10
    logger.info(f"\n***** tldr {query_phrase}, {openai_time} sec")
    logger.info(f'***** \n{extract_text}\n***** \n{google_tldr}\n*****\n')
    url_text = url_text.replace("\n", ". ")
    if google_tldr is None:
        google_tldr = ""
    response_text = google_tldr.lstrip()
    prefix_text = response_text[: min(len(response_text), 96)].lower()
    # openai sometimes returns a special format for 'no imformation'
    if prefix_text.startswith("query:"):
        text_index = response_text.find("Text:")
        if text_index > 0:
            response_text = response_text[text_index + 5:]
            prefix_text = response_text[: min(len(response_text), 96)].lower()
    if (
        "no information" in prefix_text
        or "i cannot provide" in prefix_text
        or "as an ai language model" in prefix_text
        or "does not provide" in prefix_text
        or "it is not possible" in prefix_text
    ):
        # skip this summary, no info
        logger.info(
            "{} {}/{}/{}/{}".format(
                site, len(response), len(url_text), len(extract_text), 0
            )
        )
        # logger.info('************')
        # logger.info(extract_text)
        # logger.info('************')
        sys.stdout.flush()
        log_url_process(site, "no info", url_text, extract_text, "")
        site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
        return ""

    if (
        prefix_text.startswith("i'm sorry")
        or prefix_text.startswith("there is no ")
        or (
            prefix_text.startswith("the provided text")
            or prefix_text.startswith("i cannot")
            or prefix_text.startswith("unfortunately")
            or prefix_text.startswith("sorry")
            or prefix_text.startswith("the text")
        )
        and (
            "is not relevant" in prefix_text
            or "no information" in prefix_text
            or "does not provide" in prefix_text
            or "does not contain" in prefix_text
            or "no relevant information" in prefix_text
        )
    ):
        # skip this summary, no info
        log_url_process(site, "no info 2", url_text, extract_text, "")
        logger.info(
            "{} {}/{}/{}/{}".format(
                site, len(response), len(url_text), len(extract_text), 0
            )
        )
        ###logger.info('************')
        ###logger.info(extract_text)
        ###logger.info('************')
        site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
        return ""
    else:
        sentences = nltk.sent_tokenize(response_text)
        response_text = ""
        for sentence in sentences:
            if (
                "no inform" in sentence.lower()
                or "no specific inform" in sentence.lower()
                or "is unclear" in sentence.lower()
                or "not mention" in sentence.lower()
                or "not specifically mention" in sentence.lower()
            ):
                pass
            else:
                response_text += "\n \u2022 " + sentence + ". "
        site_stats.update_site_stats(
            site, len(response_text), get_time, extract_time, openai_time
        )
        # logger.info('\n',response_text)
        log_url_process(site, "response", url_text, extract_text, response_text)
        logger.info(
            "{} {}/{}/{}/{}".format(
                site,
                len(response),
                len(url_text),
                len(extract_text),
                len(response_text),
            )
        )
        # logger.info('************')
        # logger.info(google_tldr)
        # logger.info('************ site response ***********')
        # logger.info(response_text)
        # logger.info('************')
        return response_text + "\n"
    site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
    log_url_process(site, "no return", "", "", "")
    logger.info(
        "{} {}/{}/{}/{}".format(
            site, len(response), len(url_text), len(extract_text), 0
        )
    )
    ##logger.info('************')
    ##logger.info(extract_text)
    ##logger.info('************')
    return ""


def extract_items_from_numbered_list(text):
    items = ""
    elements = text.split("\n")
    for candidate in elements:
        candidate = candidate.lstrip(". \t")
        if len(candidate) > 4 and candidate[0].isdigit():
            candidate = candidate[1:].lstrip(". ")
            if (
                len(candidate) > 4 and candidate[0].isdigit()
            ):  # strip second digit if more than 10 items
                candidate = candidate[1:].lstrip(". ")
            logger.info("E {}".format(candidate))
            items += candidate + " "
    return items


def search_google_mod(query_phrase):
    full_text = ""
    try:
        gpt_phrase_urls = []
        if len(query_phrase) > 0:
            gpt_phrase_urls = search(query_phrase)
        full_text = process_urls_mod(query_phrase, gpt_phrase_urls)
        logger.info("return from url processing")
    except:
        traceback.print_exc()
    return full_text


def search_google(original_query, search_level, query_phrase, keywords, chat_history):
    start_time = time.time()
    all_urls = []
    urls_used = []
    urls_tried = []
    index = 0
    tried_index = 0
    full_text = ""
    keyword_weights = {}
    for keyword in keywords:
        zipf = wf.zipf_frequency(keyword, "en")
        weight = max(0, int((8 - zipf)))
        if weight > 0:
            keyword_weights[keyword] = weight
            logger.info(f"keyword {keyword} wf.ziff {zipf} weight {weight}")
        subwds = keyword.split(" ")
        if len(subwds) > 1:
            for subwd in subwds:
                sub_z = wf.zipf_frequency(subwd, "en")
                sub_wgt = max(0, int((8 - zipf) * 1 / 2))
                if sub_wgt > 0:
                    keyword_weights[subwd] = sub_wgt
                    logger.info(f"keyword {subwd} weight {sub_wgt}")

    try:  # query google for recent info
        sort = ""
        if "today" in original_query or "latest" in original_query:
            original_query = today.strip("\n") + " " + original_query
        extract_query = ""
        orig_phrase_urls = []
        if len(original_query) > 0:
            orig_phrase_urls = search(original_query[: min(len(original_query), 128)])
            extract_query = original_query[: min(len(original_query), 128)]
        gpt_phrase_urls = []
        if len(query_phrase) > 0:
            gpt_phrase_urls = search(query_phrase)
            extract_query = (
                query_phrase  # prefer more succinct query phrase if available
            )
        if len(orig_phrase_urls) == 0 and len(gpt_phrase_urls) == 0:
            return "", [], 0, [""], 0, [""]

        for url in orig_phrase_urls:
            if url in gpt_phrase_urls:
                gpt_phrase_urls.remove(url)

        # interleave both lists now that duplicates are removed
        urls = [
            val
            for tup in zip_longest(orig_phrase_urls, gpt_phrase_urls)
            for val in tup
            if val is not None
        ]
        # urls = [val for tup in zip_longest(urls, kwd_phrase_urls) for val in tup if val is not None]
        all_urls = copy.deepcopy(urls)
        # initialize scan of google urls
        # compute keyword weights
        start_wall_time = time.time()
        full_text, index, urls_used, tried_index, urls_tried = process_urls(
            extract_query, keywords, keyword_weights, all_urls, search_level
        )
        site_stats.ckpt()
        logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
        # logger.info("return from url processsing")
    except:
        traceback.print_exc()
    return full_text, all_urls, index, urls_used, tried_index, urls_tried
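For orientation, the public entry point of the deleted module was search_google(original_query, search_level, query_phrase, keywords, chat_history). The following is a minimal sketch of how a caller might have invoked it, assuming the llmsearch package layout used elsewhere in this repo; the query strings and keywords are illustrative only, not taken from the source:

from llmsearch import google_search_concurrent as gs

# illustrative arguments; search_level comes from the constants defined in the deleted module
full_text, all_urls, index, urls_used, tried_index, urls_tried = gs.search_google(
    "latest reviews of Tesla FSD Beta",   # original_query (hypothetical)
    gs.QUICK_SEARCH,                      # search_level
    "Tesla FSD Beta reviews",             # query_phrase (hypothetical)
    ["Tesla", "FSD", "Beta"],             # keywords (hypothetical)
    [],                                   # chat_history
)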
llmsearch/meta.py
DELETED
@@ -1,357 +0,0 @@
from llmsearch import utilityV2 as ut, google_search_concurrent as gs
import re
import time

ABORT = False
CONTINUE = True
history = []


class history_entry:
    def __init__(self, turn, vector=None):
        self.message = turn.message.lower()
        self.role = turn.role

    def equal(self, he2):
        return self.message == he2.message and self.role == turn.role


def add(turn):
    he = history_entry(turn)
    history.append(he)


def is_metaCyclic(turn):
    he = history_entry(turn)
    count = 0
    for prior_he in history:
        if he.equal(prior_he):
            count += 1
    return count > 1


def is_cyclic(turn):
    he = history_entry(turn)
    for prior_he in history:
        if he.equal(prior_he):
            return True
    return False


def clear():
    global history
    history = []
    return


def test_history():
    he1 = history_entry(ut.turn(role="assistant", message="who is Noriel Roubini"))
    he2 = history_entry(ut.turn(role="assistant", message="who was Noriel Roubini"))
    he3 = history_entry(ut.turn(role="assistant", message="who was Nsriel Roubini"))
    he4 = history_entry(ut.turn(role="assistant", message="where is the Pinnacles"))
    for hea in (he1, he2, he3, he4):
        for heb in (he1, he2, he3, he4):
            print(cosine(hea, heb))


def test_parse_decomp():
    test_text = """<Subquery 1>? What is the birthplace of Hugh Jackman?
<Subquery 2>? What is the Japanese name of the birthplace of Hugh Jackman?
<Keywords 1>: Hugh Jackman, birthplace
<Keywords 2>: Japanese name, birthplace, Hugh Jackman"""

    decomp = parse_decomposition(test_text)
    for subquery in decomp:
        print("Subquery\n", subquery)


def parse_decomposition(text):
    ### expecting:
    ### <Subquery 1>
    ### Birthplace of Hugh Jackman
    ### <Subquery 2>
    ### Japanese name of Birthplace of Hugh Jackman
    ### note that 'Birthplace of Hugh Jackson' operates as both a strinq google query and a variable in subsequent occurences
    subquery_indecies = re.finditer(
        "<Subquery", text
    )  # Action: Ask {Google, User} "query"
    subqueries = []
    for index in subquery_indecies:
        hdr_end = text[index.start() :].find(">") + index.start()
        query_start = hdr_end + 1
        query_end = text[query_start:].find("<")
        if query_end < 0:
            query = text[query_start:].strip()
        else:
            query = text[query_start : query_start + query_end].lstrip("?").strip()
        print("Query:", query)
        subqueries.append(query)
    return subqueries


def query_keywords(query):
    start_wall_time = time.time()
    gpt_key_message = [
        {
            "role": "user",
            "content": "Extract keywords and named-entities from the following text.",
        },
        {"role": "user", "content": query},
    ]
    # for item in gpt_key_message:
    #     print(item)
    gpt_parse = ut.ask_gpt_with_retries(
        "gpt-3.5-turbo", gpt_key_message, tokens=25, temp=0, timeout=5, tries=2
    )
    # print(f'\n***** keywords and named-entities {gpt_parse}')
    # parse result Keywords: {comma separated list}\n\nNamed-entities: {comma-separated-list}
    keywords = []
    # do named entities first, they might be compounds of keywords
    ne_start = gpt_parse.find("Named-entities")
    print(f"***** keyword extract {int((time.time()-start_wall_time)*10)/10} sec")
    if ne_start > 0:
        nes = gpt_parse[ne_start + len("Named-entities") + 1 :].split(
            ","
        )  # assume string ends with colon or space:].split(',')
        # print(f'Named-entity candidates {nes}')
        for ne in nes:
            ne = ne.strip(" .,;:\n")
            # print(f' appending {ne}')
            if ne != "None":
                keywords.append(ne)
    else:
        ne_start = len(gpt_parse) + 1
    kwd_start = gpt_parse.find("Keywords")
    if kwd_start > -1:
        kwds = gpt_parse[kwd_start + len("Keywords") + 1 : ne_start].split(",")
        # print(f'Keyword candidates {kwds}')
        for kwd in kwds:
            kwd = kwd.strip(" .\n,;:")
            skip = False
            for kwd2 in keywords:
                if kwd in kwd2:
                    skip = True
            if not skip:
                # print('appending', kwd)
                keywords.append(kwd)
    # else: print("Keywords index < 0")
    if len(keywords) > 0:
        print(f"***** query_keywords found keywords {keywords}")
        return keywords
    # fallback - just use query words
    candidates = query.split(" ")
    for candidate in candidates:
        candidate = candidate.strip()
        if len(candidate) > 2:
            keywords.append(candidate)
    # print(f'***** query_keywords using default keywords {keywords}')
    return keywords


def substitute(Q1, A1, Q2, debug=False):
    gpt_sub_message = [
        {
            "role": "user",
            "content": "replace '" + Q1 + "' with '" + A1 + "' in '" + Q2 + "'",
        }
    ]
    if debug:
        print("\n\n**************")
        for item in gpt_sub_message:
            print(item)
    google_tldr = ut.ask_gpt_with_retries(
        "gpt-3.5-turbo", gpt_sub_message, tokens=25, temp=0.1, timeout=5, tries=2
    )
    print("\n\n**************")
    if len(google_tldr) == 0 or "no information" in google_tldr:
        print("Returning original Q2")
        return Q2
    print("Substituted", Q2, google_tldr)
    return google_tldr


def meta(query, chat_history, debug=False):
    print("***** entering meta")
    turn = ut.turn(
        role=ut.ASSISTANT, source=ut.ASSISTANT, message='Action: search "' + query + '"'
    )
    if is_metaCyclic(turn):
        return [], ABORT

    prompt = """Decompose a compound <Query> into two smaller <Subquery>. Use the following format for output:
<Subquery 1>
<Subquery 2>"""
    gpt_message = [
        {"role": "user", "content": prompt},
        {"role": "user", "content": "<Query>\n" + query},
    ]
    response_text = ""
    completion = None
    if debug:
        for role in gpt_message:
            print(role)
        print("starting gpt decomp query")
    response_text = ut.ask_gpt_with_retries(
        "gpt-3.5-turbo", gpt_message, tokens=75, temp=0.1, timeout=5, tries=2
    )
    if debug:
        print(f"initial gpt query response:\n{response_text}")
        print("**** executing decomp ****")
    subqueries = parse_decomposition(response_text)
    meta_chat_history = []
    prev_tldr = ""
    google_tldr = ""
    for n, subquery in enumerate(subqueries):
        # do variable substituion into subquery
        # ask google
        # send google results as notes plus subquery to gpt to extract <answer i>
        # return chat history extended with each subquery and its answer
        # (or maybe just all google notes, let next level down do the rest?)
        # bad idea, can exceed token limit!
        if debug:
            print(f'subquery {n}, "{subquery}"')
        if n > 0:
            subquery = substitute(subqueries[n - 1], prev_tldr, subquery)
            keyword_set = query_keywords(subquery)

        keyword_set = query_keywords(subquery)
        print("*****Executing subquery", subquery, "\n with keywords", keyword_set)
        gpt_initial_message = [
            {
                "role": "user",
                "content": subquery + " If fact is unavailable, respond: 'Unknown'",
            }
        ]

        # for turn in meta_chat_history:
        #     gpt_initial_message.append({"role":"user","content":turn.tldr})

        initial_gpt_answer = ut.ask_gpt_with_retries(
            "gpt-3.5-turbo",
            gpt_initial_message,
            tokens=25,
            temp=0.0,
            timeout=5,
            tries=2,
        )
        if debug:
            print(f"***** google extract\n {initial_gpt_answer}\n")
        if (
            "unknown" not in initial_gpt_answer.lower()
            and "cannot provide" not in initial_gpt_answer
            and "do not have access" not in initial_gpt_answer
        ):
            meta_chat_history.append(
                ut.turn(
                    role="assistant",
                    message=subquery,
                    source=ut.ASSISTANT,
                    tldr=subquery,
                    keywords=keyword_set,
                )
            )
            meta_chat_history.append(
                ut.turn(
                    role="assistant",
                    message="<note>\n" + initial_gpt_answer + "\n<note>",
                    source=ut.GOOGLE,
                    tldr=initial_gpt_answer,
                    keywords=keyword_set,
                )
            )
            prev_tldr = initial_gpt_answer
            print(f"***** Answer to {subquery}: {initial_gpt_answer}\n")
            google_tldr = initial_gpt_answer
            continue
        # ask google
        (
            google_text,
            urls_all,
            index,
            urls_used,
            tried_index,
            urls_tried,
        ) = gs.search_google(
            subquery,
            gs.QUICK_SEARCH,
            "",
            ut.INFORMATION_QUERY,
            keyword_set,
            meta_chat_history,
        )
        if len(google_text) > 0:
            # digest google response into an answer for this subquery
            if debug:
                print(f"***** search result\n{google_text}\n")
            gpt_tldr_message = [
                {
                    "role": "user",
                    "content": 'Summarize the set of <note> provided. Including only the direct answer to <Query>. Do not include any qualifiers or modifiers from the <Query> such as "where x was born".',
                },
                {"role": "user", "content": google_text},
                {"role": "user", "content": "<Query>\n" + subquery},
            ]
            # for turn in meta_chat_history:
            #     gpt_tldr_message.append({"role":"user","content":turn.tldr})

            google_tldr = ut.ask_gpt_with_retries(
                "gpt-3.5-turbo",
                gpt_tldr_message,
                tokens=150,
                temp=0.1,
                timeout=5,
                tries=2,
            )
            # print('\n\n**************')
            # for item in gpt_tldr_message:
            #     print(item)
            print(f"***** Answer to {subquery}: {google_tldr}\n")
        meta_chat_history.append(
            ut.turn(
                role="assistant",
                message=subquery,
                source=ut.ASSISTANT,
                tldr=subquery,
                keywords=keyword_set,
            )
        )
        meta_chat_history.append(
            ut.turn(
                role="assistant",
                message="Observation: " + google_tldr,
                source=ut.GOOGLE,
                tldr=google_tldr,
                keywords=keyword_set,
            )
        )
        prev_tldr = google_tldr
    # print(f"\n******meta return: {google_tldr} *****\n")
    return meta_chat_history, CONTINUE


if __name__ == "__main__":
    # test_parse_decomp()
    # meta("what is the Japanese name of the birthplace of Hugh Jackman", [])
    # meta("What is the capital of the birthplace of Levy Mwanawasa?",[])
    # meta("What is the (rounded down) latitude of the birthplace of Ferenc Puskas?",[])
    # meta("What is the (rounded down) longitude of the birthplace of Juliane Koepcke?",[])
    # meta("What is the top-level domain of the birthplace of Norodom Sihamoni?",[])
    # meta("What is the 3166-1 numeric code for the birthplace of Gilgamesh?",[])
    # meta("What is the currency in the birthplace of Joel Campbell?",[])
    # meta("What is the currency abbreviation in the birthplace of Antonio Valencia?",[])
    # meta("What is the currency symbol in the birthplace of Marek Hamsˇ´ık?",[])
    # meta("What is the Japanese name of the birthplace of Hugh Jackman?",[])
    # meta("What is the Spanish name of the birthplace of Fred´ eric Chopin? ",[])
    # meta("What is the Russian name of the birthplace of Confucius?",[])
    # meta("What is the Estonian name of the birthplace of Kofi Annan?",[])
    # meta("What is the Urdu name of the birthplace of Nicki Minaj?",[])
    # meta("What is the calling code of the birthplace of Milla Jovovich?",[])
    # meta("Who was the champion of the Masters Tournament in the year that Bob Dylan was born?",[])
    # meta("Who won the Nobel Prize in Literature in the year Matt Damon was born?",[])
    # meta("Who was the President of the United States when Sting was born?",[])
    meta(
        "What are the latest reviewer opinions on Tesla Full Self Driving Beta version 11.3.4?",
        [],
        debug=True,
    )
    meta("Michael D'Ambrosio Hound Labs", [], debug=True)
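For reference, parse_decomposition in the deleted meta.py expects the model's decomposition in the tagged format exercised by test_parse_decomp above. A minimal usage sketch, assuming the module is imported as mt and using the same sample text as the test:

from llmsearch import meta as mt  # assumed import path, matching the package layout above

text = """<Subquery 1>? What is the birthplace of Hugh Jackman?
<Subquery 2>? What is the Japanese name of the birthplace of Hugh Jackman?
<Keywords 1>: Hugh Jackman, birthplace
<Keywords 2>: Japanese name, birthplace, Hugh Jackman"""

subqueries = mt.parse_decomposition(text)
# returns the two subquery strings, stripped of the <Subquery n>? headers:
# ["What is the birthplace of Hugh Jackman?",
#  "What is the Japanese name of the birthplace of Hugh Jackman?"]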
web_search.py
CHANGED
@@ -14,7 +14,7 @@ from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from unstructured.partition.html import partition_html
 
-from llmsearch import
+from llmsearch import site_stats
 # this import style works in pycharm
 from llmsearch import utilityV2 as ut
 
@@ -35,7 +35,6 @@ def search(msg, query_phrase):
     try:
         # this call extracts keywords from the statement and rewrites it into a better search phrase with gpt3.5
         # query_phrase, keywords = ut.get_search_phrase_and_keywords(msg, [])
-        mt.clear()
         google_text = ""
         try:
             logger.info(f"asking google {msg}; rephrased: {query_phrase}")
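Taken together, the two hunks above mean web_search.py now imports site_stats instead of the previously dangling "from llmsearch import", and no longer calls mt.clear() from the deleted meta module. A minimal sketch of how the affected import block reads after this commit, assuming only the lines shown in the diff (surrounding code is unchanged):

from selenium.webdriver.chrome.options import Options
from unstructured.partition.html import partition_html

from llmsearch import site_stats
# this import style works in pycharm
from llmsearch import utilityV2 as ut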