Spaces:

arabellastrange
/

search-assistant

Sleeping

App Files Files

arabellastrange committed on Jul 26, 2024

Commit

f096f8f

1 Parent(s): 1aca16d

removed concurrency

Browse files

Files changed (1) hide show

web_search.py +37 -54

web_search.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import concurrent.futures
 import copy
 import json
 import logging
@@ -10,6 +9,7 @@ import warnings
 from itertools import zip_longest
 import requests
 from zenrows import ZenRowsClient
 from llmsearch import site_stats
@@ -23,7 +23,6 @@ from llmsearch import utilityV2 as ut
 # from llmsearch import utilityV2 as ut
 logger = logging.getLogger("agent_logger")
-logger = logging.getLogger("agent_logger")
 # todo drop blocked pages > see og llmsearch code
@@ -93,59 +92,43 @@ def process_urls(urls):
     urls_tried = ["" for i in range(30)]
     start_time = time.time()
     in_process = []
-    processed = []
-    google_futures = []
-    with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
-        # initialize scan of Google urls
-        while True:
-            try:
-                while (len(urls) > 0
-                       # no sense starting if not much time left
-                       and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
-                ):
-                    recommendation = site_stats.get_next(urls, sample_unknown=True)
-                    # set timeout so we don't wait for a slow site forever
-                    timeout = 12 - int(time.time() - start_time)
-                    url = recommendation[1]
-                    future = executor.submit(process_url, url)
-                    # result, url = process_url(url)
-                    google_futures.append(future)
-                    in_process.append(future)
-                    urls_tried[tried_index] = url
-                    tried_index += 1
-                    urls.remove(url)
-                    print(f"queued {ut.extract_site(url)}, {timeout}")
-                    # Process the responses as they arrive
-                    for future in in_process:
-                        if future.done():
-                            result, url = future.result()
-                            processed.append(future)
-                            in_process.remove(future)
-                            if len(result) > 0:
-                                urls_used[used_index] = url
-                                used_index += 1
-                                print(
-                                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
-                                )
-                                if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
-                                    response.append(
-                                        {
-                                            "source": ut.extract_domain(url),
-                                            "url": url,
-                                            "text": result,
-                                        }
-                                    )
-                    if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
-                        executor.shutdown(wait=False)
-                        print(
-                            f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
-                        )
-                        return response, used_index, urls_used, tried_index, urls_tried
-                time.sleep(0.5)
-            except:
-                traceback.print_exc()
-        executor.shutdown(wait=False)
     print(
         f"\n*****processed all urls {len(response)}  {int(time.time() - start_time)} secs"
     )

 import copy
 import json
 import logging
 from itertools import zip_longest
 import requests
+from unstructured.partition.html import partition_html
 from zenrows import ZenRowsClient
 from llmsearch import site_stats
 # from llmsearch import utilityV2 as ut
 logger = logging.getLogger("agent_logger")
 # todo drop blocked pages > see og llmsearch code
     urls_tried = ["" for i in range(30)]
     start_time = time.time()
     in_process = []
+    try:
+        while (len(urls) > 0
+               # no sense starting if not much time left
+               and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
+        ):
+            recommendation = site_stats.get_next(urls, sample_unknown=True)
+            # set timeout so we don't wait for a slow site forever
+            timeout = 12 - int(time.time() - start_time)
+            url = recommendation[1]
+            result, url = process_url(url)
+            urls_tried[tried_index] = url
+            tried_index += 1
+            urls.remove(url)
+            print(f"queued {ut.extract_site(url)}, {timeout}")
+            if len(result) > 0:
+                urls_used[used_index] = url
+                used_index += 1
+                print(
+                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
+                )
+                if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    response.append(
+                        {
+                            "source": ut.extract_domain(url),
+                            "url": url,
+                            "text": result,
+                        }
+                    )
+        if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
+            print(
+                f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
+            )
+            return response, used_index, urls_used, tried_index, urls_tried
+    except:
+        traceback.print_exc()
     print(
         f"\n*****processed all urls {len(response)}  {int(time.time() - start_time)} secs"
     )