arabellastrange committed
Commit · 906b1c0
1 Parent(s): e0cdb15
experiment with no concurrency

Files changed:
- app.py (+1, -1)
- web_search.py (+57, -64)
app.py CHANGED
@@ -108,4 +108,4 @@ if __name__ == '__main__':
     demo = gr.ChatInterface(fn=google_search_chat,
                             title="Search Assistant", retry_btn=None, undo_btn=None, clear_btn=None,
                             theme="soft")
-    [removed line not captured in this view]
+    demo.launch(auth=('convo', 'session2024'))
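For context, gr.ChatInterface wraps a chat callback in a ready-made UI, and passing an auth tuple to launch() puts the whole Space behind a username/password prompt. A minimal sketch with an echo callback standing in for google_search_chat and placeholder credentials (the retry_btn/undo_btn/clear_btn keyword arguments used above are accepted only by some Gradio versions, so they are omitted here):

import gradio as gr

def echo_chat(message, history):
    # Placeholder for google_search_chat(message, history).
    return f"you said: {message}"

if __name__ == '__main__':
    demo = gr.ChatInterface(fn=echo_chat, title="Search Assistant", theme="soft")
    # auth=(username, password) gates the UI behind a login prompt;
    # in practice, read real credentials from configuration rather than hard-coding them.
    demo.launch(auth=('user', 'change-me'))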
web_search.py CHANGED
@@ -1,4 +1,3 @@
-import concurrent.futures
 import copy
 import json
 import logging

@@ -37,7 +36,7 @@ def search(msg, query_phrase):
     google_text = ""
     try:
         logger.info(f"asking google {msg}; rephrased: {query_phrase}")
-        google_text, urls_all, […]
+        google_text, urls_all, urls_used, tried_index, urls_tried = search_google(msg, query_phrase)
     except:
         traceback.print_exc()
 
@@ -59,7 +58,7 @@ def search(msg, query_phrase):
 
 
 # Define a function to make a single URL request and process the response
-def process_url(query_phrase, url, timeout):
+def process_url(url, timeout):
     start_time = time.time()
     site = ut.extract_site(url)
     result = ""

@@ -87,7 +86,7 @@ def process_url(query_phrase, url, timeout):
     return result, url
 
 
-def process_urls(query_phrase, urls):
+def process_urls(urls):
     # Create a ThreadPoolExecutor with 5 worker threads
     response = []
     logger.info("entering process urls")

@@ -98,63 +97,63 @@ def process_urls(query_phrase, urls):
     urls_tried = ["" for i in range(30)]
     start_time = time.time()
     in_process = []
-    processed = []
-    google_futures = []
+    # processed = []
+    # google_futures = []
 
-    with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
-        [executor-based loop body not captured in this view]
-        )
-
-        if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
-            executor.shutdown(wait=False)
-            logger.info(
-                f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
-            )
-        [further removed lines not captured in this view]
+    # with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
+    # initialize scan of google urls
+    try:
+        while (len(urls) > 0
+               # no sense starting if not much time left
+               and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
+               ):
+            recommendation = site_stats.get_next(urls, sample_unknown=True)
+            # set timeout so we don't wait for a slow site forever
+            timeout = 12 - int(time.time() - start_time)
+            url = recommendation[1]
+            # future = executor.submit(process_url, query_phrase, url, timeout)
+            result, url = process_url(url, timeout)
+            # google_futures.append(future)
+            # in_process.append(future)
+            urls_tried[tried_index] = url
+            tried_index += 1
+            urls.remove(url)
+            logger.info(f"queued {ut.extract_site(url)}, {timeout}")
+            # Process the responses as they arrive
+            # for future in in_process:
+            #     if future.done():
+            #         result, url = future.result()
+            #         processed.append(future)
+            #         in_process.remove(future)
+            if len(result) > 0:
+                urls_used[used_index] = url
+                used_index += 1
+                logger.info(
+                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
+                )
+                if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    response.append(
+                        {
+                            "source": ut.extract_domain(url),
+                            "url": url,
+                            "text": result,
+                        }
+                    )
+
+            if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
+                # executor.shutdown(wait=False)
+                logger.info(
+                    f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
+                )
+                return response, used_index, urls_used, tried_index, urls_tried
+            time.sleep(0.5)
+    except:
+        traceback.print_exc()
+        # executor.shutdown(wait=False)
     logger.info(
         f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
     )
-    return response, […]
+    return response, urls_used, tried_index, urls_tried
 
 
 def extract_subtext(text):

@@ -247,22 +246,16 @@ def search_google(original_query, query_phrase):
     all_urls = []
     urls_used = []
     urls_tried = []
-    index = 0
     tried_index = 0
     full_text = ""
 
     try:  # query google for recent info
-        extract_query = ""
         orig_phrase_urls = []
         if len(original_query) > 0:
             orig_phrase_urls = request_google(original_query[: min(len(original_query), 128)])
-            extract_query = original_query[: min(len(original_query), 128)]
         gpt_phrase_urls = []
         if len(query_phrase) > 0:
             gpt_phrase_urls = request_google(query_phrase)
-            extract_query = (
-                query_phrase  # prefer more succinct query phrase if available
-            )
         if len(orig_phrase_urls) == 0 and len(gpt_phrase_urls) == 0:
             return "", [], 0, [""], 0, [""]
 
@@ -280,9 +273,9 @@ def search_google(original_query, query_phrase):
         all_urls = copy.deepcopy(urls)
         # initialize scan of google urls
         start_wall_time = time.time()
-        full_text, […]
+        full_text, urls_used, tried_index, urls_tried = process_urls(all_urls)
         logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
         logger.info("return from url processsing")
     except:
         traceback.print_exc()
-    return full_text, all_urls, […]
+    return full_text, all_urls, urls_used, tried_index, urls_tried
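The commented-out lines left in the new process_urls preserve the shape of the thread-pool version this commit switches off: URLs were submitted to a concurrent.futures.ThreadPoolExecutor and the resulting futures were polled until a wall-clock budget expired, whereas the new code calls process_url one URL at a time inside the same time budget. A minimal, self-contained sketch of that older fan-out pattern, assuming a fetch_url stub in place of process_url and illustrative worker counts and budgets (not values taken from the Space):

import concurrent.futures
import time

def fetch_url(url, timeout):
    # Stand-in for web_search.py's process_url(url, timeout):
    # fetch the page and return (extracted_text, url).
    time.sleep(0.1)
    return f"text from {url}", url

def process_urls_concurrent(urls, max_workers=5, budget_secs=10):
    # Fan fetch_url out over a thread pool and keep whatever finishes within the budget.
    start = time.time()
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    futures = [executor.submit(fetch_url, url, budget_secs) for url in urls]
    response = []
    # Wait only as long as the overall wall-clock budget allows.
    done, not_done = concurrent.futures.wait(futures, timeout=budget_secs - (time.time() - start))
    for future in done:
        text, url = future.result()
        if text:
            response.append({"url": url, "text": text})
    # Return without waiting for slow fetches; their threads finish in the background,
    # mirroring the executor.shutdown(wait=False) call in the removed code.
    executor.shutdown(wait=False)
    return response

if __name__ == "__main__":
    print(process_urls_concurrent(["https://example.com/a", "https://example.com/b"]))

The sequential replacement trades that parallelism for simpler control flow: each fetch now blocks the loop, so the 8-second scan window and the per-request timeout of 12 seconds minus elapsed time bound total latency directly rather than through future polling.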