arabellastrange committed
Commit · 1aca16d
1 Parent(s): d1ac8cf
returned concurrency
Files changed: web_search.py (+54 -80)
web_search.py CHANGED
@@ -1,3 +1,4 @@
+import concurrent.futures
 import copy
 import json
 import logging
@@ -9,7 +10,6 @@ import warnings
 from itertools import zip_longest
 
 import requests
-from unstructured.partition.html import partition_html
 from zenrows import ZenRowsClient
 
 from llmsearch import site_stats
@@ -58,48 +58,22 @@ def search(msg, query_phrase):
 
 
 # Define a function to make a single URL request and process the response
-def process_url(url
+def process_url(url):
     start_time = time.time()
     site = ut.extract_site(url)
     result = ""
     try:
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
-            # options = Options()
-            # options.page_load_strategy = "eager"
-            # options.add_argument("--headless")
-            # options.add_argument("--no-sandbox")
-            # options.add_argument("--disable-dev-shm-usage")
-            #
-            # options.add_argument("start-maximized")
-            # options.add_argument("disable-infobars")
-            # options.add_argument("--disable-extensions")
-            # options.add_argument("--disable-gpu")
-            # options.add_argument("--disable-dev-shm-usage")
             result = ""
-            # make driver exec
-            # os.chmod('chromedriver-linux64/chromedriver', stat.S_IEXEC)
             try:
-                # driver = webdriver.Chrome(service=ChromeService(executable_path='chromedriver-linux64/chromedriver'),
-                #                           options=options)
-                # logger.info(f"*****setting page load timeout {timeout}")
-                # driver.set_page_load_timeout(timeout)
-                # driver.get(url)
-                # response = driver.page_source
                 client = ZenRowsClient(os.getenv('zenrows_api_key'))
                 response = client.get(url)
                 print(f'got response, status: {response.status_code}')
-                # result = response_text_extract(url=url, response=response)
                 result = response.text
             except Exception:
                 traceback.print_exc()
                 return "", url
-            # except selenium.common.exceptions.TimeoutException:
-            #     return "", url
-            # except selenium.common.exceptions.WebDriverException:
-            #     traceback.print_exc()
-            #     logger.info(f"webdriver failed to load")
-            #     return "", url
     except Exception:
         traceback.print_exc()
         print(f"{site} err")
@@ -119,59 +93,59 @@ def process_urls(urls):
     urls_tried = ["" for i in range(30)]
     start_time = time.time()
     in_process = []
-    [… 2 removed lines not preserved in the extracted page …]
+    processed = []
+    google_futures = []
 
-    [… 50 removed lines not preserved in the extracted page …]
+    with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
+        # initialize scan of Google urls
+        while True:
+            try:
+                while (len(urls) > 0
+                       # no sense starting if not much time left
+                       and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
+                ):
+                    recommendation = site_stats.get_next(urls, sample_unknown=True)
+                    # set timeout so we don't wait for a slow site forever
+                    timeout = 12 - int(time.time() - start_time)
+                    url = recommendation[1]
+                    future = executor.submit(process_url, url)
+                    # result, url = process_url(url)
+                    google_futures.append(future)
+                    in_process.append(future)
+                    urls_tried[tried_index] = url
+                    tried_index += 1
+                    urls.remove(url)
+                    print(f"queued {ut.extract_site(url)}, {timeout}")
+                # Process the responses as they arrive
+                for future in in_process:
+                    if future.done():
+                        result, url = future.result()
+                        processed.append(future)
+                        in_process.remove(future)
+                        if len(result) > 0:
+                            urls_used[used_index] = url
+                            used_index += 1
+                            print(
+                                f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
+                            )
+                            if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                                response.append(
+                                    {
+                                        "source": ut.extract_domain(url),
+                                        "url": url,
+                                        "text": result,
+                                    }
+                                )
+                if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
+                    executor.shutdown(wait=False)
+                    print(
+                        f"\n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
+                    )
+                    return response, used_index, urls_used, tried_index, urls_tried
+                time.sleep(0.5)
+            except:
+                traceback.print_exc()
+                executor.shutdown(wait=False)
     print(
         f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
    )
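For context, process_url fetches pages through the ZenRows proxy client rather than the commented-out Selenium driver. Below is a minimal standalone sketch of that fetch path, assuming the zenrows SDK and a zenrows_api_key environment variable as in the diff; fetch_page is a hypothetical name and the status-code guard is an illustrative addition, not part of the commit:

import os
import traceback

from zenrows import ZenRowsClient


def fetch_page(url):
    # Fetch a page through the ZenRows proxy, mirroring process_url's
    # convention of returning ("", url) on any failure.
    try:
        client = ZenRowsClient(os.getenv("zenrows_api_key"))
        response = client.get(url)  # blocking HTTP GET routed through ZenRows
        if response.status_code != 200:  # illustrative guard; the commit keeps the text regardless
            return "", url
        return response.text, url
    except Exception:
        traceback.print_exc()
        return "", url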
|