arabellastrange committed
Commit · 906b1c0
1 Parent(s): e0cdb15
experiment with no concurrency

Files changed:
- app.py (+1, -1)
- web_search.py (+57, -64)
app.py CHANGED
@@ -108,4 +108,4 @@ if __name__ == '__main__':
     demo = gr.ChatInterface(fn=google_search_chat,
                             title="Search Assistant", retry_btn=None, undo_btn=None, clear_btn=None,
                             theme="soft")
-    [removed line not captured in this view]
+    demo.launch(auth=('convo', 'session2024'))
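For context, gr.ChatInterface wraps a chat callback in a ready-made UI, and passing an auth tuple to launch() puts the whole Space behind a username/password prompt. A minimal sketch with an echo callback standing in for google_search_chat and placeholder credentials (the retry_btn/undo_btn/clear_btn keyword arguments used above are accepted only by some Gradio versions, so they are omitted here):

import gradio as gr

def echo_chat(message, history):
    # Placeholder for google_search_chat(message, history).
    return f"you said: {message}"

if __name__ == '__main__':
    demo = gr.ChatInterface(fn=echo_chat, title="Search Assistant", theme="soft")
    # auth=(username, password) gates the UI behind a login prompt;
    # in practice, read real credentials from configuration rather than hard-coding them.
    demo.launch(auth=('user', 'change-me'))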
web_search.py CHANGED
@@ -1,4 +1,3 @@
-import concurrent.futures
 import copy
 import json
 import logging

@@ -37,7 +36,7 @@ def search(msg, query_phrase):
     google_text = ""
     try:
         logger.info(f"asking google {msg}; rephrased: {query_phrase}")
-        google_text, urls_all, […]
+        google_text, urls_all, urls_used, tried_index, urls_tried = search_google(msg, query_phrase)
     except:
         traceback.print_exc()
 
@@ -59,7 +58,7 @@ def search(msg, query_phrase):
 
 
 # Define a function to make a single URL request and process the response
-def process_url(query_phrase, url, timeout):
+def process_url(url, timeout):
     start_time = time.time()
     site = ut.extract_site(url)
     result = ""

@@ -87,7 +86,7 @@ def process_url(query_phrase, url, timeout):
     return result, url
 
 
-def process_urls(query_phrase, urls):
+def process_urls(urls):
     # Create a ThreadPoolExecutor with 5 worker threads
     response = []
     logger.info("entering process urls")

@@ -98,63 +97,63 @@ def process_urls(query_phrase, urls):
     urls_tried = ["" for i in range(30)]
     start_time = time.time()
     in_process = []
-    processed = []
-    google_futures = []
+    # processed = []
+    # google_futures = []
 
-    with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
-        [executor-based loop body not captured in this view]
-        )
-
-        if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
-            executor.shutdown(wait=False)
-            logger.info(
-                f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
-            )
-        [further removed lines not captured in this view]
+    # with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
+    # initialize scan of google urls
+    try:
+        while (len(urls) > 0
+               # no sense starting if not much time left
+               and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
+               ):
+            recommendation = site_stats.get_next(urls, sample_unknown=True)
+            # set timeout so we don't wait for a slow site forever
+            timeout = 12 - int(time.time() - start_time)
+            url = recommendation[1]
+            # future = executor.submit(process_url, query_phrase, url, timeout)
+            result, url = process_url(url, timeout)
+            # google_futures.append(future)
+            # in_process.append(future)
+            urls_tried[tried_index] = url
+            tried_index += 1
+            urls.remove(url)
+            logger.info(f"queued {ut.extract_site(url)}, {timeout}")
+            # Process the responses as they arrive
+            # for future in in_process:
+            #     if future.done():
+            #         result, url = future.result()
+            #         processed.append(future)
+            #         in_process.remove(future)
+            if len(result) > 0:
+                urls_used[used_index] = url
+                used_index += 1
+                logger.info(
+                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
+                )
+                if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    response.append(
+                        {
+                            "source": ut.extract_domain(url),
+                            "url": url,
+                            "text": result,
+                        }
+                    )
+
+            if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
+                # executor.shutdown(wait=False)
+                logger.info(
+                    f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
+                )
+                return response, used_index, urls_used, tried_index, urls_tried
+            time.sleep(0.5)
+    except:
+        traceback.print_exc()
+        # executor.shutdown(wait=False)
     logger.info(
         f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
     )
-    return response, […]
+    return response, urls_used, tried_index, urls_tried
 
 
 def extract_subtext(text):

@@ -247,22 +246,16 @@ def search_google(original_query, query_phrase):
     all_urls = []
     urls_used = []
     urls_tried = []
-    index = 0
     tried_index = 0
     full_text = ""
 
     try:  # query google for recent info
-        extract_query = ""
         orig_phrase_urls = []
         if len(original_query) > 0:
             orig_phrase_urls = request_google(original_query[: min(len(original_query), 128)])
-            extract_query = original_query[: min(len(original_query), 128)]
         gpt_phrase_urls = []
         if len(query_phrase) > 0:
             gpt_phrase_urls = request_google(query_phrase)
-            extract_query = (
-                query_phrase  # prefer more succinct query phrase if available
-            )
         if len(orig_phrase_urls) == 0 and len(gpt_phrase_urls) == 0:
             return "", [], 0, [""], 0, [""]
 
@@ -280,9 +273,9 @@ def search_google(original_query, query_phrase):
         all_urls = copy.deepcopy(urls)
         # initialize scan of google urls
         start_wall_time = time.time()
-        full_text, […]
+        full_text, urls_used, tried_index, urls_tried = process_urls(all_urls)
         logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
         logger.info("return from url processsing")
     except:
         traceback.print_exc()
-    return full_text, all_urls, […]
+    return full_text, all_urls, urls_used, tried_index, urls_tried
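The commented-out lines left in the new process_urls preserve the shape of the thread-pool version this commit switches off: URLs were submitted to a concurrent.futures.ThreadPoolExecutor and the resulting futures were polled until a wall-clock budget expired, whereas the new code calls process_url one URL at a time inside the same time budget. A minimal, self-contained sketch of that older fan-out pattern, assuming a fetch_url stub in place of process_url and illustrative worker counts and budgets (not values taken from the Space):

import concurrent.futures
import time

def fetch_url(url, timeout):
    # Stand-in for web_search.py's process_url(url, timeout):
    # fetch the page and return (extracted_text, url).
    time.sleep(0.1)
    return f"text from {url}", url

def process_urls_concurrent(urls, max_workers=5, budget_secs=10):
    # Fan fetch_url out over a thread pool and keep whatever finishes within the budget.
    start = time.time()
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    futures = [executor.submit(fetch_url, url, budget_secs) for url in urls]
    response = []
    # Wait only as long as the overall wall-clock budget allows.
    done, not_done = concurrent.futures.wait(futures, timeout=budget_secs - (time.time() - start))
    for future in done:
        text, url = future.result()
        if text:
            response.append({"url": url, "text": text})
    # Return without waiting for slow fetches; their threads finish in the background,
    # mirroring the executor.shutdown(wait=False) call in the removed code.
    executor.shutdown(wait=False)
    return response

if __name__ == "__main__":
    print(process_urls_concurrent(["https://example.com/a", "https://example.com/b"]))

The sequential replacement trades that parallelism for simpler control flow: each fetch now blocks the loop, so the 8-second scan window and the per-request timeout of 12 seconds minus elapsed time bound total latency directly rather than through future polling.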