arabellastrange committed on
Commit e750b39 · 1 Parent(s): 96166e4

tidying files

llmsearch/google_search_concurrent.py DELETED
@@ -1,698 +0,0 @@
1
- import concurrent.futures
2
- import copy
3
- import json
4
- import logging
5
- import sys
6
- import time
7
- # from PyPDF2 import PdfReader
8
- import traceback
9
- import urllib.parse as en
10
- import warnings
11
- from datetime import date
12
- from itertools import zip_longest
13
-
14
- import nltk
15
- import requests
16
- import selenium.common.exceptions
17
- import wordfreq as wf
18
- from selenium import webdriver
19
- from selenium.webdriver.chrome.options import Options
20
- from unstructured.partition.html import partition_html
21
-
22
- from llmsearch import site_stats
23
- from llmsearch import utilityV2 as ut
24
-
25
- # from llmsearch import site_stats
26
- # from llmsearch import utilityV2 as ut
27
-
28
- logger = logging.getLogger("agent_logger")
29
-
30
- today = " as of " + date.today().strftime("%b-%d-%Y") + "\n\n"
31
-
32
- suffix = "\nA: "
33
- client = "\nQ: "
34
-
35
- QUICK_SEARCH = "quick"
36
- NORMAL_SEARCH = "moderate"
37
- DEEP_SEARCH = "deep"
38
-
39
- # system_prime = {
40
- # "role": "system",
41
- # "content": "You analyze Text with respect to Query and list any relevant information found, including direct quotes from the text, and detailed samples or examples in the text.",
42
- # }
43
- priming_1 = {"role": "user", "content": "Query:\n"}
44
-
45
-
46
- # priming_2 = {
47
- # "role": "user",
48
- # "content": "List relevant information in the provided text, including direct quotes from the text. If none, respond 'no information'.\nText:\n",
49
- # }
50
-
51
- def process_url_mod(query_phrase, url, timeout):
52
- start_time = time.time()
53
- site = ut.extract_site(url)
54
- result = ""
55
- try:
56
- with warnings.catch_warnings():
57
- warnings.simplefilter("ignore")
58
- options = Options()
59
- options.page_load_strategy = "eager"
60
- options.add_argument("--headless")
61
- result = ""
62
- with webdriver.Chrome(options=options) as dr:
63
- logger.info(f"*****setting page load timeout {timeout}")
64
- dr.set_page_load_timeout(timeout)
65
- try:
66
- dr.get(url)
67
- response = dr.page_source
68
- result = response_text_extract_mod(url, response)
69
- except selenium.common.exceptions.TimeoutException:
70
- return "", url
71
- except Exception:
72
- traceback.print_exc()
73
- logger.info(f"{site} err")
74
- pass
75
- logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
76
- return result, url
77
-
78
-
79
- # Define a function to make a single URL request and process the response
80
- def process_url(query_phrase, keywords, keyword_weights, url, timeout):
81
- start_time = time.time()
82
- site = ut.extract_site(url)
83
- result = ""
84
- try:
85
- with warnings.catch_warnings():
86
- warnings.simplefilter("ignore")
87
- options = Options()
88
- options.page_load_strategy = "eager"
89
- options.add_argument("--headless")
90
- result = ""
91
- with webdriver.Chrome(options=options) as dr:
92
- logger.info(f"*****setting page load timeout {timeout}")
93
- dr.set_page_load_timeout(timeout)
94
- try:
95
- dr.get(url)
96
- response = dr.page_source
97
- result = response_text_extract(
98
- query_phrase,
99
- keywords,
100
- keyword_weights,
101
- url,
102
- response,
103
- int(time.time() - start_time),
104
- )
105
- except selenium.common.exceptions.TimeoutException:
106
- return "", url
107
- except Exception:
108
- traceback.print_exc()
109
- logger.info(f"{site} err")
110
- pass
111
- # logger.info(f"Processed {site}: {len(response)} / {len(result)} {int((time.time()-start_time)*1000)} ms")
112
- return result, url
113
-
114
-
115
- def process_urls_mod(query_phrase, urls):
116
- start_time = time.time()
117
-
118
- response = []
119
- logger.info("entering process urls")
120
- full_text = ""
121
- used_index = 0
122
- urls_used = ["" for i in range(30)]
123
- tried_index = 0
124
- urls_tried = ["" for i in range(30)]
125
- in_process = []
126
- processed = []
127
- google_futures = []
128
-
129
- with concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor:
130
- # initialize scan of google urls
131
- while True:
132
- try:
133
- while len(urls) > 0:
134
- timeout = 12 - int(time.time() - start_time)
135
- recommendation = site_stats.get_next(
136
- urls, sample_unknown=True
137
- )
138
- url = recommendation[1]
139
- future = executor.submit(process_url_mod, query_phrase=query_phrase, url=url, timeout=timeout)
140
- google_futures.append(future)
141
- in_process.append(future)
142
- urls_tried[tried_index] = url
143
- tried_index += 1
144
- urls.remove(url)
145
- logger.info(f"queued {ut.extract_site(url)}, {timeout}")
146
-
147
- for future in in_process:
148
- if future.done():
149
- result, url = future.result()
150
- processed.append(future)
151
- in_process.remove(future)
152
- if len(result) > 0:
153
- urls_used[used_index] = url
154
- used_index += 1
155
- result = result.replace(". .", ".")
156
- logger.info(
157
- f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
158
- )
159
- response.append(
160
- {
161
- "source": ut.extract_domain(url),
162
- "url": url,
163
- "text": result,
164
- }
165
- )
166
- if time.time() - start_time > 28:
167
- executor.shutdown(wait=False)
168
- logger.info(
169
- f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
170
- )
171
- return response, used_index, urls_used, tried_index, urls_tried
172
- time.sleep(0.5)
173
- except:
174
- traceback.print_exc()
175
- executor.shutdown(wait=False)
176
- logger.info(
177
- f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
178
- )
179
- return response, used_index, urls_used, tried_index, urls_tried
180
-
181
-
182
- def process_urls(query_phrase, keywords, keyword_weights, urls, search_level):
183
- # Create a ThreadPoolExecutor with 5 worker threads
184
- response = []
185
- logger.info("entering process urls")
186
- start_time = time.time()
187
- full_text = ""
188
- used_index = 0
189
- urls_used = ["" for i in range(30)]
190
- tried_index = 0
191
- urls_tried = ["" for i in range(30)]
192
- start_time = time.time()
193
- in_process = []
194
- processed = []
195
- google_futures = []
196
- off_whitelist = False
197
-
198
- with concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor:
199
- # initialize scan of google urls
200
- while True:
201
- try:
202
- while (
203
- len(urls) > 0
204
- # no sense starting if not much time left
205
- and (
206
- (
207
- search_level == DEEP_SEARCH
208
- and len(full_text) < 9600
209
- and len(in_process) < 16
210
- and time.time() - start_time < 14
211
- )
212
- or (
213
- search_level == NORMAL_SEARCH
214
- and len(full_text) < 6400
215
- and len(in_process) < 14
216
- and time.time() - start_time < 12
217
- )
218
- or (
219
- search_level == QUICK_SEARCH
220
- and len(full_text) < 4800
221
- and len(in_process) < 10
222
- and time.time() - start_time < 8
223
- )
224
- )
225
- ):
226
- recommendation = site_stats.get_next(
227
- urls, sample_unknown=off_whitelist
228
- )
229
- if recommendation is None or len(recommendation) == 0:
230
- off_whitelist = True
231
- else:
232
- # set timeout so we don't wait for a slow site forever
233
- timeout = 12 - int(time.time() - start_time)
234
- if search_level == NORMAL_SEARCH:
235
- timeout = timeout + 4
236
- url = recommendation[1]
237
- future = executor.submit(
238
- process_url,
239
- query_phrase,
240
- keywords,
241
- keyword_weights,
242
- url,
243
- timeout,
244
- )
245
- # remaining_time = start_time+18-time.time()
246
- # future.exception(remaining_time)
247
- google_futures.append(future)
248
- in_process.append(future)
249
- urls_tried[tried_index] = url
250
- tried_index += 1
251
- urls.remove(url)
252
- logger.info(f"queued {ut.extract_site(url)}, {timeout}")
253
- # Process the responses as they arrive
254
- for future in in_process:
255
- if future.done():
256
- result, url = future.result()
257
- processed.append(future)
258
- in_process.remove(future)
259
- if len(result) > 0:
260
- urls_used[used_index] = url
261
- used_index += 1
262
- result = result.replace(". .", ".")
263
- logger.info(
264
- f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
265
- )
266
- site = ut.extract_site(url)
267
- domain = ut.extract_domain(url)
268
- if domain.endswith("gov"):
269
- credibility = "Official Source"
270
- elif site in ut.sites.keys():
271
- if ut.sites[site] > 0:
272
- credibility = "Whitelisted Source"
273
- elif ut.sites[site] == 0:
274
- credibility = "Blacklisted Source"
275
- else:
276
- credibility = "Third-Party Source"
277
-
278
- response.append(
279
- {
280
- "source": ut.extract_domain(url),
281
- "url": url,
282
- "credibility": credibility,
283
- "text": result,
284
- }
285
- )
286
-
287
- # openai seems to timeout a plugin at about 30 secs, and there is probably 3-4 sec overhead
288
- if (
289
- (len(urls) == 0 and len(in_process) == 0)
290
- or (
291
- search_level == DEEP_SEARCH
292
- and (len(full_text) > 9600)
293
- or time.time() - start_time > 42
294
- )
295
- or (
296
- search_level == NORMAL_SEARCH
297
- and (len(full_text) > 6400)
298
- or time.time() - start_time > 32
299
- )
300
- or (
301
- search_level == QUICK_SEARCH
302
- and (len(full_text) > 4800)
303
- or time.time() - start_time > 28
304
- )
305
- ):
306
- executor.shutdown(wait=False)
307
- logger.info(
308
- f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
309
- )
310
- return response, used_index, urls_used, tried_index, urls_tried
311
- time.sleep(0.5)
312
- except:
313
- traceback.print_exc()
314
- executor.shutdown(wait=False)
315
- logger.info(
316
- f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
317
- )
318
- return response, used_index, urls_used, tried_index, urls_tried
319
-
320
-
321
- def extract_subtext(text, query_phrase, keywords, keyword_weights):
322
- ### maybe we should score based on paragraphs, not lines?
323
- sentences = ut.reform(text)
324
- # logger.info('***** sentences from reform')
325
- # for sentence in sentences:
326
- # logger.info(sentence)
327
- sentence_weights = {}
328
- final_text = ""
329
- for sentence in sentences:
330
- sentence_weights[sentence] = 0
331
- for keyword in keywords:
332
- if keyword in sentence or keyword.lower() in sentence:
333
- if keyword in keyword_weights.keys():
334
- sentence_weights[sentence] += keyword_weights[keyword]
335
-
336
- # now pick out sentences starting with those with the most keywords
337
- max_sentence_weight = 0
338
- for keyword in keyword_weights.keys():
339
- max_sentence_weight += keyword_weights[keyword]
340
- # logger.info(f'******* max sentence weight {max_sentence_weight}')
341
- for i in range(max_sentence_weight, 1, -1):
342
- if len(final_text) > 6000 and i < max(
343
- 1, int(max_sentence_weight / 4)
344
- ): # make sure we don't miss any super-important text
345
- return final_text
346
- for sentence in sentences:
347
- if len(final_text) + len(sentence) > 6001 and i < max(
348
- 1, int(max_sentence_weight / 4)
349
- ):
350
- continue
351
- if sentence_weights[sentence] == i:
352
- final_text += sentence
353
- # logger.info("relevant text", final_text)
354
- # logger.info("keyword extract length:",len(final_text)) #, end='.. ')
355
-
356
- return final_text
357
-
358
-
359
- def search(query_phrase):
360
- logger.info(f"***** search {query_phrase}")
361
- sort = "&sort=date-sdate:d:w"
362
- if "today" in query_phrase or "latest" in query_phrase:
363
- sort = "&sort=date-sdate:d:s"
364
- # logger.info(f"search for: {query_phrase}")
365
- google_query = en.quote(query_phrase)
366
- response = []
367
- try:
368
- start_wall_time = time.time()
369
- url = (
370
- "https://www.googleapis.com/customsearch/v1?key="
371
- + ut.google_key
372
- + "&cx="
373
- + ut.google_cx
374
- # was ten but want to reduce search time
375
- + "&num=3"
376
- + sort
377
- + "&q="
378
- + google_query
379
- )
380
- response = requests.get(url)
381
- response_json = json.loads(response.text)
382
- logger.info(f"***** google search {int((time.time() - start_wall_time) * 10) / 10} sec")
383
- except:
384
- traceback.print_exc()
385
- return []
386
-
387
- # see if we got anything useful from google
388
- if "items" not in response_json.keys():
389
- logger.info("no return from google ...", response, response_json.keys())
390
- # logger.info(google_query)
391
- return []
392
-
393
- # first try whitelist sites
394
- urls = []
395
- for i in range(len(response_json["items"])):
396
- url = response_json["items"][i]["link"].lstrip().rstrip()
397
- site = ut.extract_site(url)
398
- if site not in ut.sites or ut.sites[site] == 1:
399
- urls.append(url)
400
- return urls
401
-
402
-
403
- def log_url_process(site, reason, raw_text, extract_text, gpt_text):
404
- return
405
-
406
-
407
- """
408
- # to record detailed logs of url processing unquote this function
409
- def log_url_process(site, reason, raw_text, extract_text, gpt_text):
410
- if len(raw_text) == 0 and len(extract_text)==0 and len(gpt_text) ==0:
411
- return
412
- try:
413
- with open('google_log.txt', 'a') as lg:
414
- lg.write('\n\n*************'+reason.upper()+'***********\n')
415
- lg.write('*****************'+site+' RAW*************\n')
416
- lg.write(raw_text)
417
- lg.write('\n******************extract****************\n')
418
- lg.write(extract_text)
419
- lg.write('\n********************gpt******************\n')
420
- lg.write(gpt_text)
421
- except Exception:
422
- traceback.print_exc()
423
- """
424
-
425
-
426
- def response_text_extract_mod(url, response):
427
- extract_text = ""
428
- if url.endswith("pdf"):
429
- pass
430
- else:
431
- elements = partition_html(text=response)
432
- str_elements = []
433
- for e in elements:
434
- stre = str(e).replace(" ", " ")
435
- str_elements.append(stre)
436
- extract_text = ut.reform(str_elements)
437
- logger.info(
438
- f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
439
- )
440
- if len(''.join(extract_text).strip()) < 8:
441
- return ""
442
- return extract_text
443
-
444
-
445
- def response_text_extract(
446
- query_phrase, keywords, keyword_weights, url, response, get_time
447
- ):
448
- curr = time.time()
449
- text = ""
450
- extract_text = ""
451
- site = ut.extract_site(url)
452
-
453
- if url.endswith("pdf"):
454
- pass
455
- else:
456
- elements = partition_html(text=response)
457
- str_elements = []
458
- # logger.info('\n***** elements')
459
- for e in elements:
460
- stre = str(e).replace(" ", " ")
461
- str_elements.append(stre)
462
- extract_text = extract_subtext(
463
- str_elements, query_phrase, keywords, keyword_weights
464
- )
465
- # logger.info('\n************ unstructured **********')
466
- logger.info(
467
- f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
468
- )
469
- url_text = text # save for final stats
470
- new_curr = time.time()
471
- extract_time = int((new_curr - curr) * 1000000)
472
- if len(extract_text.strip()) < 8:
473
- return ""
474
-
475
- # now ask openai to extract answer
476
- response_text = ""
477
- curr = new_curr
478
- extract_text = extract_text[:10000] # make sure we don't run over token limit
479
- gpt_tldr_message = [
480
- {
481
- "role": "user",
482
- "content": "Given:\n" + extract_text + "\n\nQuery:\n" + query_phrase,
483
- }
484
- ]
485
- start_wall_time = time.time()
486
- t_out = 12 - get_time
487
- # logger.info(f'****** spawning page get with timeout {t_out}')
488
- google_tldr = ut.ask_gpt_with_retries(
489
- ut.MODEL, gpt_tldr_message, tokens=300, temp=0.3, timeout=t_out, tries=1
490
- )
491
- openai_time = int((time.time() - start_wall_time) * 10) / 10
492
- logger.info(f"\n***** tldr {query_phrase}, {openai_time} sec")
493
- logger.info(f'***** \n{extract_text}\n***** \n{google_tldr}\n*****\n')
494
- url_text = url_text.replace("\n", ". ")
495
- if google_tldr is None:
496
- google_tldr = ""
497
- response_text = google_tldr.lstrip()
498
- prefix_text = response_text[: min(len(response_text), 96)].lower()
499
- # openai sometimes returns a special format for 'no information'
500
- if prefix_text.startswith("query:"):
501
- text_index = response_text.find("Text:")
502
- if text_index > 0:
503
- response_text = response_text[text_index + 5:]
504
- prefix_text = response_text[: min(len(response_text), 96)].lower()
505
- if (
506
- "no information" in prefix_text
507
- or "i cannot provide" in prefix_text
508
- or "as an ai language model" in prefix_text
509
- or "does not provide" in prefix_text
510
- or "it is not possible" in prefix_text
511
- ):
512
- # skip this summary, no info
513
- logger.info(
514
- "{} {}/{}/{}/{}".format(
515
- site, len(response), len(url_text), len(extract_text), 0
516
- )
517
- )
518
- # logger.info('************')
519
- # logger.info(extract_text)
520
- # logger.info('************')
521
- sys.stdout.flush()
522
- log_url_process(site, "no info", url_text, extract_text, "")
523
- site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
524
- return ""
525
-
526
- if (
527
- prefix_text.startswith("i'm sorry")
528
- or prefix_text.startswith("there is no ")
529
- or (
530
- prefix_text.startswith("the provided text")
531
- or prefix_text.startswith("i cannot")
532
- or prefix_text.startswith("unfortunately")
533
- or prefix_text.startswith("sorry")
534
- or prefix_text.startswith("the text")
535
- )
536
- and (
537
- "is not relevant" in prefix_text
538
- or "no information" in prefix_text
539
- or "does not provide" in prefix_text
540
- or "does not contain" in prefix_text
541
- or "no relevant information" in prefix_text
542
- )
543
- ):
544
- # skip this summary, no info
545
- log_url_process(site, "no info 2", url_text, extract_text, "")
546
- logger.info(
547
- "{} {}/{}/{}/{}".format(
548
- site, len(response), len(url_text), len(extract_text), 0
549
- )
550
- )
551
- ###logger.info('************')
552
- ###logger.info(extract_text)
553
- ###logger.info('************')
554
- site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
555
- return ""
556
- else:
557
- sentences = nltk.sent_tokenize(response_text)
558
- response_text = ""
559
- for sentence in sentences:
560
- if (
561
- "no inform" in sentence.lower()
562
- or "no specific inform" in sentence.lower()
563
- or "is unclear" in sentence.lower()
564
- or "not mention" in sentence.lower()
565
- or "not specifically mention" in sentence.lower()
566
- ):
567
- pass
568
- else:
569
- response_text += "\n \u2022 " + sentence + ". "
570
- site_stats.update_site_stats(
571
- site, len(response_text), get_time, extract_time, openai_time
572
- )
573
- # logger.info('\n',response_text)
574
- log_url_process(site, "response", url_text, extract_text, response_text)
575
- logger.info(
576
- "{} {}/{}/{}/{}".format(
577
- site,
578
- len(response),
579
- len(url_text),
580
- len(extract_text),
581
- len(response_text),
582
- )
583
- )
584
- # logger.info('************')
585
- # logger.info(google_tldr)
586
- # logger.info('************ site response ***********')
587
- # logger.info(response_text)
588
- # logger.info('************')
589
- return response_text + "\n"
590
- site_stats.update_site_stats(site, 0, get_time, extract_time, openai_time)
591
- log_url_process(site, "no return", "", "", "")
592
- logger.info(
593
- "{} {}/{}/{}/{}".format(
594
- site, len(response), len(url_text), len(extract_text), 0
595
- )
596
- )
597
- ##logger.info('************')
598
- ##logger.info(extract_text)
599
- ##logger.info('************')
600
- return ""
601
-
602
-
603
- def extract_items_from_numbered_list(text):
604
- items = ""
605
- elements = text.split("\n")
606
- for candidate in elements:
607
- candidate = candidate.lstrip(". \t")
608
- if len(candidate) > 4 and candidate[0].isdigit():
609
- candidate = candidate[1:].lstrip(". ")
610
- if (
611
- len(candidate) > 4 and candidate[0].isdigit()
612
- ): # strip second digit if more than 10 items
613
- candidate = candidate[1:].lstrip(". ")
614
- logger.info("E {}".format(candidate))
615
- items += candidate + " "
616
- return items
617
-
618
-
619
- def search_google_mod(query_phrase):
620
- full_text = ""
621
- try:
622
- gpt_phrase_urls = []
623
- if len(query_phrase) > 0:
624
- gpt_phrase_urls = search(query_phrase)
625
- full_text = process_urls_mod(query_phrase, gpt_phrase_urls)
626
- logger.info("return from url processing")
627
- except:
628
- traceback.print_exc()
629
- return full_text
630
-
631
-
632
- def search_google(original_query, search_level, query_phrase, keywords, chat_history):
633
- start_time = time.time()
634
- all_urls = []
635
- urls_used = []
636
- urls_tried = []
637
- index = 0
638
- tried_index = 0
639
- full_text = ""
640
- keyword_weights = {}
641
- for keyword in keywords:
642
- zipf = wf.zipf_frequency(keyword, "en")
643
- weight = max(0, int((8 - zipf)))
644
- if weight > 0:
645
- keyword_weights[keyword] = weight
646
- logger.info(f"keyword {keyword} wf.ziff {zipf} weight {weight}")
647
- subwds = keyword.split(" ")
648
- if len(subwds) > 1:
649
- for subwd in subwds:
650
- sub_z = wf.zipf_frequency(subwd, "en")
651
- sub_wgt = max(0, int((8 - sub_z) * 1 / 2))
652
- if sub_wgt > 0:
653
- keyword_weights[subwd] = sub_wgt
654
- logger.info(f"keyword {subwd} weight {sub_wgt}")
655
-
656
- try: # query google for recent info
657
- sort = ""
658
- if "today" in original_query or "latest" in original_query:
659
- original_query = today.strip("\n") + " " + original_query
660
- extract_query = ""
661
- orig_phrase_urls = []
662
- if len(original_query) > 0:
663
- orig_phrase_urls = search(original_query[: min(len(original_query), 128)])
664
- extract_query = original_query[: min(len(original_query), 128)]
665
- gpt_phrase_urls = []
666
- if len(query_phrase) > 0:
667
- gpt_phrase_urls = search(query_phrase)
668
- extract_query = (
669
- query_phrase # prefer more succinct query phrase if available
670
- )
671
- if len(orig_phrase_urls) == 0 and len(gpt_phrase_urls) == 0:
672
- return "", [], 0, [""], 0, [""]
673
-
674
- for url in orig_phrase_urls:
675
- if url in gpt_phrase_urls:
676
- gpt_phrase_urls.remove(url)
677
-
678
- # interleave both lists now that duplicates are removed
679
- urls = [
680
- val
681
- for tup in zip_longest(orig_phrase_urls, gpt_phrase_urls)
682
- for val in tup
683
- if val is not None
684
- ]
685
- # urls = [val for tup in zip_longest(urls, kwd_phrase_urls) for val in tup if val is not None]
686
- all_urls = copy.deepcopy(urls)
687
- # initialize scan of google urls
688
- # compute keyword weights
689
- start_wall_time = time.time()
690
- full_text, index, urls_used, tried_index, urls_tried = process_urls(
691
- extract_query, keywords, keyword_weights, all_urls, search_level
692
- )
693
- site_stats.ckpt()
694
- logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
695
- # logger.info("return from url processsing")
696
- except:
697
- traceback.print_exc()
698
- return full_text, all_urls, index, urls_used, tried_index, urls_tried
llmsearch/meta.py DELETED
@@ -1,357 +0,0 @@
1
- from llmsearch import utilityV2 as ut, google_search_concurrent as gs
2
- import re
3
- import time
4
-
5
- ABORT = False
6
- CONTINUE = True
7
- history = []
8
-
9
-
10
- class history_entry:
11
- def __init__(self, turn, vector=None):
12
- self.message = turn.message.lower()
13
- self.role = turn.role
14
-
15
- def equal(self, he2):
16
- return self.message == he2.message and self.role == he2.role
17
-
18
-
19
- def add(turn):
20
- he = history_entry(turn)
21
- history.append(he)
22
-
23
-
24
- def is_metaCyclic(turn):
25
- he = history_entry(turn)
26
- count = 0
27
- for prior_he in history:
28
- if he.equal(prior_he):
29
- count += 1
30
- return count > 1
31
-
32
-
33
- def is_cyclic(turn):
34
- he = history_entry(turn)
35
- for prior_he in history:
36
- if he.equal(prior_he):
37
- return True
38
- return False
39
-
40
-
41
- def clear():
42
- global history
43
- history = []
44
- return
45
-
46
-
47
- def test_history():
48
- he1 = history_entry(ut.turn(role="assistant", message="who is Noriel Roubini"))
49
- he2 = history_entry(ut.turn(role="assistant", message="who was Noriel Roubini"))
50
- he3 = history_entry(ut.turn(role="assistant", message="who was Nsriel Roubini"))
51
- he4 = history_entry(ut.turn(role="assistant", message="where is the Pinnacles"))
52
- for hea in (he1, he2, he3, he4):
53
- for heb in (he1, he2, he3, he4):
54
- print(cosine(hea, heb))
55
-
56
-
57
- def test_parse_decomp():
58
- test_text = """<Subquery 1>? What is the birthplace of Hugh Jackman?
59
- <Subquery 2>? What is the Japanese name of the birthplace of Hugh Jackman?
60
- <Keywords 1>: Hugh Jackman, birthplace
61
- <Keywords 2>: Japanese name, birthplace, Hugh Jackman"""
62
-
63
- decomp = parse_decomposition(test_text)
64
- for subquery in decomp:
65
- print("Subquery\n", subquery)
66
-
67
-
68
- def parse_decomposition(text):
69
- ### expecting:
70
- ### <Subquery 1>
71
- ### Birthplace of Hugh Jackman
72
- ### <Subquery 2>
73
- ### Japanese name of Birthplace of Hugh Jackman
74
- ### note that 'Birthplace of Hugh Jackman' operates as both a string google query and a variable in subsequent occurrences
75
- subquery_indecies = re.finditer(
76
- "<Subquery", text
77
- ) # Action: Ask {Google, User} "query"
78
- subqueries = []
79
- for index in subquery_indecies:
80
- hdr_end = text[index.start() :].find(">") + index.start()
81
- query_start = hdr_end + 1
82
- query_end = text[query_start:].find("<")
83
- if query_end < 0:
84
- query = text[query_start:].strip()
85
- else:
86
- query = text[query_start : query_start + query_end].lstrip("?").strip()
87
- print("Query:", query)
88
- subqueries.append(query)
89
- return subqueries
90
-
91
-
92
- def query_keywords(query):
93
- start_wall_time = time.time()
94
- gpt_key_message = [
95
- {
96
- "role": "user",
97
- "content": "Extract keywords and named-entities from the following text.",
98
- },
99
- {"role": "user", "content": query},
100
- ]
101
- # for item in gpt_key_message:
102
- # print(item)
103
- gpt_parse = ut.ask_gpt_with_retries(
104
- "gpt-3.5-turbo", gpt_key_message, tokens=25, temp=0, timeout=5, tries=2
105
- )
106
- # print(f'\n***** keywords and named-entities {gpt_parse}')
107
- # parse result Keywords: {comma separated list}\n\nNamed-entities: {comma-separated-list}
108
- keywords = []
109
- # do named entities first, they might be compounds of keywords
110
- ne_start = gpt_parse.find("Named-entities")
111
- print(f"***** keyword extract {int((time.time()-start_wall_time)*10)/10} sec")
112
- if ne_start > 0:
113
- nes = gpt_parse[ne_start + len("Named-entities") + 1 :].split(
114
- ","
115
- ) # assume string ends with colon or space:].split(',')
116
- # print(f'Named-entity candidates {nes}')
117
- for ne in nes:
118
- ne = ne.strip(" .,;:\n")
119
- # print(f' appending {ne}')
120
- if ne != "None":
121
- keywords.append(ne)
122
- else:
123
- ne_start = len(gpt_parse) + 1
124
- kwd_start = gpt_parse.find("Keywords")
125
- if kwd_start > -1:
126
- kwds = gpt_parse[kwd_start + len("Keywords") + 1 : ne_start].split(",")
127
- # print(f'Keyword candidates {kwds}')
128
- for kwd in kwds:
129
- kwd = kwd.strip(" .\n,;:")
130
- skip = False
131
- for kwd2 in keywords:
132
- if kwd in kwd2:
133
- skip = True
134
- if not skip:
135
- # print('appending', kwd)
136
- keywords.append(kwd)
137
- # else: print("Keywords index < 0")
138
- if len(keywords) > 0:
139
- print(f"***** query_keywords found keywords {keywords}")
140
- return keywords
141
- # fallback - just use query words
142
- candidates = query.split(" ")
143
- for candidate in candidates:
144
- candidate = candidate.strip()
145
- if len(candidate) > 2:
146
- keywords.append(candidate)
147
- # print(f'***** query_keywords using default keywords {keywords}')
148
- return keywords
149
-
150
-
151
- def substitute(Q1, A1, Q2, debug=False):
152
- gpt_sub_message = [
153
- {
154
- "role": "user",
155
- "content": "replace '" + Q1 + "' with '" + A1 + "' in '" + Q2 + "'",
156
- }
157
- ]
158
- if debug:
159
- print("\n\n**************")
160
- for item in gpt_sub_message:
161
- print(item)
162
- google_tldr = ut.ask_gpt_with_retries(
163
- "gpt-3.5-turbo", gpt_sub_message, tokens=25, temp=0.1, timeout=5, tries=2
164
- )
165
- print("\n\n**************")
166
- if len(google_tldr) == 0 or "no information" in google_tldr:
167
- print("Returning original Q2")
168
- return Q2
169
- print("Substituted", Q2, google_tldr)
170
- return google_tldr
171
-
172
-
173
- def meta(query, chat_history, debug=False):
174
- print("***** entering meta")
175
- turn = ut.turn(
176
- role=ut.ASSISTANT, source=ut.ASSISTANT, message='Action: search "' + query + '"'
177
- )
178
- if is_metaCyclic(turn):
179
- return [], ABORT
180
-
181
- prompt = """Decompose a compound <Query> into two smaller <Subquery>. Use the following format for output:
182
- <Subquery 1>
183
- <Subquery 2>"""
184
- gpt_message = [
185
- {"role": "user", "content": prompt},
186
- {"role": "user", "content": "<Query>\n" + query},
187
- ]
188
- response_text = ""
189
- completion = None
190
- if debug:
191
- for role in gpt_message:
192
- print(role)
193
- print("starting gpt decomp query")
194
- response_text = ut.ask_gpt_with_retries(
195
- "gpt-3.5-turbo", gpt_message, tokens=75, temp=0.1, timeout=5, tries=2
196
- )
197
- if debug:
198
- print(f"initial gpt query response:\n{response_text}")
199
- print("**** executing decomp ****")
200
- subqueries = parse_decomposition(response_text)
201
- meta_chat_history = []
202
- prev_tldr = ""
203
- google_tldr = ""
204
- for n, subquery in enumerate(subqueries):
205
- # do variable substituion into subquery
206
- # ask google
207
- # send google results as notes plus subquery to gpt to extract <answer i>
208
- # return chat history extended with each subquery and its answer
209
- # (or maybe just all google notes, let next level down do the rest?)
210
- # bad idea, can exceed token limit!
211
- if debug:
212
- print(f'subquery {n}, "{subquery}"')
213
- if n > 0:
214
- subquery = substitute(subqueries[n - 1], prev_tldr, subquery)
215
- keyword_set = query_keywords(subquery)
216
-
217
- keyword_set = query_keywords(subquery)
218
- print("*****Executing subquery", subquery, "\n with keywords", keyword_set)
219
- gpt_initial_message = [
220
- {
221
- "role": "user",
222
- "content": subquery + " If fact is unavailable, respond: 'Unknown'",
223
- }
224
- ]
225
-
226
- # for turn in meta_chat_history:
227
- # gpt_initial_message.append({"role":"user","content":turn.tldr})
228
-
229
- initial_gpt_answer = ut.ask_gpt_with_retries(
230
- "gpt-3.5-turbo",
231
- gpt_initial_message,
232
- tokens=25,
233
- temp=0.0,
234
- timeout=5,
235
- tries=2,
236
- )
237
- if debug:
238
- print(f"***** google extract\n {initial_gpt_answer}\n")
239
- if (
240
- "unknown" not in initial_gpt_answer.lower()
241
- and "cannot provide" not in initial_gpt_answer
242
- and "do not have access" not in initial_gpt_answer
243
- ):
244
- meta_chat_history.append(
245
- ut.turn(
246
- role="assistant",
247
- message=subquery,
248
- source=ut.ASSISTANT,
249
- tldr=subquery,
250
- keywords=keyword_set,
251
- )
252
- )
253
- meta_chat_history.append(
254
- ut.turn(
255
- role="assistant",
256
- message="<note>\n" + initial_gpt_answer + "\n<note>",
257
- source=ut.GOOGLE,
258
- tldr=initial_gpt_answer,
259
- keywords=keyword_set,
260
- )
261
- )
262
- prev_tldr = initial_gpt_answer
263
- print(f"***** Answer to {subquery}: {initial_gpt_answer}\n")
264
- google_tldr = initial_gpt_answer
265
- continue
266
- # ask google
267
- (
268
- google_text,
269
- urls_all,
270
- index,
271
- urls_used,
272
- tried_index,
273
- urls_tried,
274
- ) = gs.search_google(
275
- subquery,
276
- gs.QUICK_SEARCH,
277
- "",
278
- ut.INFORMATION_QUERY,
279
- keyword_set,
280
- meta_chat_history,
281
- )
282
- if len(google_text) > 0:
283
- # digest google response into an answer for this subquery
284
- if debug:
285
- print(f"***** search result\n{google_text}\n")
286
- gpt_tldr_message = [
287
- {
288
- "role": "user",
289
- "content": 'Summarize the set of <note> provided. Including only the direct answer to <Query>. Do not include any qualifiers or modifiers from the <Query> such as "where x was born".',
290
- },
291
- {"role": "user", "content": google_text},
292
- {"role": "user", "content": "<Query>\n" + subquery},
293
- ]
294
- # for turn in meta_chat_history:
295
- # gpt_tldr_message.append({"role":"user","content":turn.tldr})
296
-
297
- google_tldr = ut.ask_gpt_with_retries(
298
- "gpt-3.5-turbo",
299
- gpt_tldr_message,
300
- tokens=150,
301
- temp=0.1,
302
- timeout=5,
303
- tries=2,
304
- )
305
- # print('\n\n**************')
306
- # for item in gpt_tldr_message:
307
- # print(item)
308
- print(f"***** Answer to {subquery}: {google_tldr}\n")
309
- meta_chat_history.append(
310
- ut.turn(
311
- role="assistant",
312
- message=subquery,
313
- source=ut.ASSISTANT,
314
- tldr=subquery,
315
- keywords=keyword_set,
316
- )
317
- )
318
- meta_chat_history.append(
319
- ut.turn(
320
- role="assistant",
321
- message="Observation: " + google_tldr,
322
- source=ut.GOOGLE,
323
- tldr=google_tldr,
324
- keywords=keyword_set,
325
- )
326
- )
327
- prev_tldr = google_tldr
328
- # print(f"\n******meta return: {google_tldr} *****\n")
329
- return meta_chat_history, CONTINUE
330
-
331
-
332
- if __name__ == "__main__":
333
- # test_parse_decomp()
334
- # meta("what is the Japanese name of the birthplace of Hugh Jackman", [])
335
- # meta("What is the capital of the birthplace of Levy Mwanawasa?",[])
336
- # meta("What is the (rounded down) latitude of the birthplace of Ferenc Puskas?",[])
337
- # meta("What is the (rounded down) longitude of the birthplace of Juliane Koepcke?",[])
338
- # meta("What is the top-level domain of the birthplace of Norodom Sihamoni?",[])
339
- # meta("What is the 3166-1 numeric code for the birthplace of Gilgamesh?",[])
340
- # meta("What is the currency in the birthplace of Joel Campbell?",[])
341
- # meta("What is the currency abbreviation in the birthplace of Antonio Valencia?",[])
342
- # meta("What is the currency symbol in the birthplace of Marek Hamsˇ´ık?",[])
343
- # meta("What is the Japanese name of the birthplace of Hugh Jackman?",[])
344
- # meta("What is the Spanish name of the birthplace of Fred´ eric Chopin? ",[])
345
- # meta("What is the Russian name of the birthplace of Confucius?",[])
346
- # meta("What is the Estonian name of the birthplace of Kofi Annan?",[])
347
- # meta("What is the Urdu name of the birthplace of Nicki Minaj?",[])
348
- # meta("What is the calling code of the birthplace of Milla Jovovich?",[])
349
- # meta("Who was the champion of the Masters Tournament in the year that Bob Dylan was born?",[])
350
- # meta("Who won the Nobel Prize in Literature in the year Matt Damon was born?",[])
351
- # meta("Who was the President of the United States when Sting was born?",[])
352
- meta(
353
- "What are the latest reviewer opinions on Tesla Full Self Driving Beta version 11.3.4?",
354
- [],
355
- debug=True,
356
- )
357
- meta("Michael D'Ambrosio Hound Labs", [], debug=True)
web_search.py CHANGED
@@ -14,7 +14,7 @@ from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from unstructured.partition.html import partition_html

- from llmsearch import meta as mt, site_stats
+ from llmsearch import site_stats
# this import style works in pycharm
from llmsearch import utilityV2 as ut

@@ -35,7 +35,6 @@ def search(msg, query_phrase):
try:
# this call extracts keywords from the statement and rewrites it into a better search phrase with gpt3.5
# query_phrase, keywords = ut.get_search_phrase_and_keywords(msg, [])
- mt.clear()
google_text = ""
try:
logger.info(f"asking google {msg}; rephrased: {query_phrase}")