arabellastrange committed · Commit 200cc44 · 1 Parent(s): f096f8f

different url processing loop

web_search.py: +11 -82

web_search.py CHANGED
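For orientation, this commit drops the site_stats scheduling logic and has process_urls walk the Google result URLs in a plain sequential loop, with search() reaching it through search_google(). The sketch below is a reading aid, not code from this repo; fetch_page is a hypothetical stand-in for the scraping done in process_url():

def fetch_page(url):
    # hypothetical placeholder for the ZenRows/requests scraping in process_url()
    return ""

def process_urls(urls):
    # sequential loop introduced by this commit: one URL at a time, no scheduling
    response = []
    for url in urls:
        result = fetch_page(url)
        if result:
            response.append({"source": url, "text": result})
    return response

def search_google(original_query, query_phrase):
    urls = []  # would be filled from the Google query in the real module
    return process_urls(urls)

def search(msg, query_phrase):
    # after this commit, search() assigns the result of search_google() directly
    return search_google(msg, query_phrase)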
@@ -9,10 +9,8 @@ import warnings
 from itertools import zip_longest
 
 import requests
-from unstructured.partition.html import partition_html
 from zenrows import ZenRowsClient
 
-from llmsearch import site_stats
 # this import style works in pycharm
 from llmsearch import utilityV2 as ut
 
@@ -35,7 +33,7 @@ def search(msg, query_phrase):
     google_text = ""
     try:
         print(f"asking google {msg}; rephrased: {query_phrase}")
-        google_text
+        google_text = search_google(msg, query_phrase)
     except:
         traceback.print_exc()
 
@@ -77,43 +75,23 @@ def process_url(url):
         traceback.print_exc()
         print(f"{site} err")
         pass
-    print(f"Processed {site}: {len(
+    print(f"Processed {site}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
     return result, url
 
 
 def process_urls(urls):
-    # Create a ThreadPoolExecutor with 5 worker threads
     response = []
-    print("entering process urls")
-    full_text = ""
-    used_index = 0
-    urls_used = ["" for i in range(30)]
-    tried_index = 0
-    urls_tried = ["" for i in range(30)]
+    print(f"entering process urls: {len(urls)} found. {urls}")
     start_time = time.time()
-    in_process = []
 
     try:
-
-        # no sense starting if not much time left
-            and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
-        ):
-            recommendation = site_stats.get_next(urls, sample_unknown=True)
-            # set timeout so we don't wait for a slow site forever
-            timeout = 12 - int(time.time() - start_time)
-            url = recommendation[1]
+        for url in urls:
             result, url = process_url(url)
-            urls_tried[tried_index] = url
-            tried_index += 1
-            urls.remove(url)
-            print(f"queued {ut.extract_site(url)}, {timeout}")
             if len(result) > 0:
-                urls_used[used_index] = url
-                used_index += 1
-                print(
-                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
-                )
                 if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    print(
+                        f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
+                    )
                     response.append(
                         {
                             "source": ut.extract_domain(url),
@@ -121,18 +99,13 @@ def process_urls(urls):
                             "text": result,
                         }
                     )
-
-            if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
-                print(
-                    f"\n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
-                )
-                return response, used_index, urls_used, tried_index, urls_tried
     except:
         traceback.print_exc()
+
     print(
         f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
     )
-    return response
+    return response
 
 
 def extract_subtext(text):
@@ -182,50 +155,7 @@ def request_google(query_phrase):
     return urls
 
 
-# def response_text_extract(url, response):
-#     extract_text = ""
-#     if url.endswith("pdf"):
-#         pass
-#     else:
-#         if response is not None:
-#             elements = partition_html(text=response)
-#             str_elements = []
-#             logger.info('\n***** elements')
-#             for e in elements:
-#                 stre = str(e).replace(" ", " ")
-#                 str_elements.append(stre)
-#         extract_text = ''.join(extract_subtext(str_elements))
-#         logger.info(
-#             f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
-#         )
-#
-#     if len(extract_text.strip()) < 8:
-#         return ""
-#     else:
-#         return extract_text
-
-
-# def extract_items_from_numbered_list(text):
-#     items = ""
-#     elements = text.split("\n")
-#     for candidate in elements:
-#         candidate = candidate.lstrip(". \t")
-#         if len(candidate) > 4 and candidate[0].isdigit():
-#             candidate = candidate[1:].lstrip(". ")
-#             if (
-#                 len(candidate) > 4 and candidate[0].isdigit()
-#             ):  # strip second digit if more than 10 items
-#                 candidate = candidate[1:].lstrip(". ")
-#             logger.info("E {}".format(candidate))
-#         items += candidate + " "
-#     return items
-
-
 def search_google(original_query, query_phrase):
-    all_urls = []
-    urls_used = []
-    urls_tried = []
-    tried_index = 0
     full_text = ""
 
     try:  # query google for recent info
@@ -252,9 +182,8 @@ def search_google(original_query, query_phrase):
         all_urls = copy.deepcopy(urls)
         # initialize scan of Google urls
         start_wall_time = time.time()
-        full_text
+        full_text = process_urls(all_urls)
         print(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
-        print("return from url processsing")
     except:
         traceback.print_exc()
-    return full_text
+    return full_text
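The condition kept in the new process_urls is effectively a phrase blocklist run against the scraped page text before a result is stored. Below is a small self-contained sketch of that idea with the phrase list copied from the diff; the plain requests.get fetch and the 10-second timeout are assumptions for illustration, not the repo's ZenRows-based process_url. (Note the diff compares the uppercase string "403 ERROR" against lowercased text; the sketch lowercases the phrase so the comparison can actually match.)

import requests

ERROR_PHRASES = [
    "an error has occurred",
    "permission to view this page",
    "403 error",
    "have been blocked",
    "too many requests",
]

def looks_like_error_page(text):
    # reject the page if any known error phrase appears in its lowercased text
    lowered = text.lower()
    return any(phrase in lowered for phrase in ERROR_PHRASES)

def fetch_and_filter(urls):
    # sequentially fetch each URL and keep only non-empty, non-error bodies
    kept = []
    for url in urls:
        try:
            body = requests.get(url, timeout=10).text  # assumed plain GET for the sketch
        except requests.RequestException:
            continue
        if body and not looks_like_error_page(body):
            kept.append({"source": url, "text": body})
    return kept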