arabellastrange committed on
Commit 906b1c0 · 1 Parent(s): e0cdb15

experiment with no concurrency

Files changed (2)
  1. app.py +1 -1
  2. web_search.py +57 -64
app.py CHANGED
@@ -108,4 +108,4 @@ if __name__ == '__main__':
     demo = gr.ChatInterface(fn=google_search_chat,
                             title="Search Assistant", retry_btn=None, undo_btn=None, clear_btn=None,
                             theme="soft")
-    # demo.launch(auth=('convo', 'session2024'))
+    demo.launch(auth=('convo', 'session2024'))
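For context, a minimal standalone sketch of what the now-uncommented launch call does, with a hypothetical stub in place of the real google_search_chat handler; passing a (username, password) tuple to launch() simply gates the Gradio UI behind a login prompt:

import gradio as gr

def google_search_chat(message, history):
    # hypothetical stub; the real handler runs the Google-backed search pipeline
    return f"searched for: {message}"

demo = gr.ChatInterface(fn=google_search_chat,
                        title="Search Assistant", retry_btn=None, undo_btn=None, clear_btn=None,
                        theme="soft")
# launch(auth=(user, password)) asks for these credentials before serving the chat UI
demo.launch(auth=('convo', 'session2024'))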
web_search.py CHANGED
@@ -1,4 +1,3 @@
-import concurrent.futures
 import copy
 import json
 import logging
@@ -37,7 +36,7 @@ def search(msg, query_phrase):
     google_text = ""
     try:
         logger.info(f"asking google {msg}; rephrased: {query_phrase}")
-        google_text, urls_all, index, urls_used, tried_index, urls_tried = search_google(msg, query_phrase)
+        google_text, urls_all, urls_used, tried_index, urls_tried = search_google(msg, query_phrase)
     except:
         traceback.print_exc()
 
@@ -59,7 +58,7 @@ def search(msg, query_phrase):
 
 
 # Define a function to make a single URL request and process the response
-def process_url(query_phrase, url, timeout):
+def process_url(url, timeout):
     start_time = time.time()
     site = ut.extract_site(url)
     result = ""
@@ -87,7 +86,7 @@ def process_url(query_phrase, url, timeout):
     return result, url
 
 
-def process_urls(query_phrase, urls):
+def process_urls(urls):
     # Create a ThreadPoolExecutor with 5 worker threads
     response = []
     logger.info("entering process urls")
@@ -98,63 +97,63 @@ def process_urls(query_phrase, urls):
     urls_tried = ["" for i in range(30)]
     start_time = time.time()
     in_process = []
-    processed = []
-    google_futures = []
-
-    with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
-        # initialize scan of google urls
-        while True:
-            try:
-                while (len(urls) > 0
-                       # no sense starting if not much time left
-                       and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
-                       ):
-                    recommendation = site_stats.get_next(urls, sample_unknown=True)
-                    # set timeout so we don't wait for a slow site forever
-                    timeout = 12 - int(time.time() - start_time)
-                    url = recommendation[1]
-                    future = executor.submit(process_url, query_phrase, url, timeout)
-                    google_futures.append(future)
-                    in_process.append(future)
-                    urls_tried[tried_index] = url
-                    tried_index += 1
-                    urls.remove(url)
-                    logger.info(f"queued {ut.extract_site(url)}, {timeout}")
-                # Process the responses as they arrive
-                for future in in_process:
-                    if future.done():
-                        result, url = future.result()
-                        processed.append(future)
-                        in_process.remove(future)
-                        if len(result) > 0:
-                            urls_used[used_index] = url
-                            used_index += 1
-                            logger.info(
-                                f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
-                            )
-                            if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
-                                response.append(
-                                    {
-                                        "source": ut.extract_domain(url),
-                                        "url": url,
-                                        "text": result,
-                                    }
-                                )
-
-                if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
-                    executor.shutdown(wait=False)
-                    logger.info(
-                        f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
-                    )
-                    return response, used_index, urls_used, tried_index, urls_tried
-                time.sleep(0.5)
-            except:
-                traceback.print_exc()
-                executor.shutdown(wait=False)
+    # processed = []
+    # google_futures = []
+
+    # with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
+    # initialize scan of google urls
+    try:
+        while (len(urls) > 0
+               # no sense starting if not much time left
+               and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
+               ):
+            recommendation = site_stats.get_next(urls, sample_unknown=True)
+            # set timeout so we don't wait for a slow site forever
+            timeout = 12 - int(time.time() - start_time)
+            url = recommendation[1]
+            # future = executor.submit(process_url, query_phrase, url, timeout)
+            result, url = process_url(url, timeout)
+            # google_futures.append(future)
+            # in_process.append(future)
+            urls_tried[tried_index] = url
+            tried_index += 1
+            urls.remove(url)
+            logger.info(f"queued {ut.extract_site(url)}, {timeout}")
+            # Process the responses as they arrive
+            # for future in in_process:
+            #     if future.done():
+            #         result, url = future.result()
+            #         processed.append(future)
+            #         in_process.remove(future)
+            if len(result) > 0:
+                urls_used[used_index] = url
+                used_index += 1
+                logger.info(
+                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
+                )
+                if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    response.append(
+                        {
+                            "source": ut.extract_domain(url),
+                            "url": url,
+                            "text": result,
+                        }
+                    )
+
+            if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
+                # executor.shutdown(wait=False)
+                logger.info(
+                    f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
+                )
+                return response, used_index, urls_used, tried_index, urls_tried
+        time.sleep(0.5)
+    except:
+        traceback.print_exc()
+        # executor.shutdown(wait=False)
     logger.info(
         f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
     )
-    return response, index, urls_used, tried_index, urls_tried
+    return response, urls_used, tried_index, urls_tried
 
 
 def extract_subtext(text):
@@ -247,22 +246,16 @@ def search_google(original_query, query_phrase):
     all_urls = []
     urls_used = []
     urls_tried = []
-    index = 0
     tried_index = 0
     full_text = ""
 
     try:  # query google for recent info
-        extract_query = ""
         orig_phrase_urls = []
         if len(original_query) > 0:
             orig_phrase_urls = request_google(original_query[: min(len(original_query), 128)])
-            extract_query = original_query[: min(len(original_query), 128)]
         gpt_phrase_urls = []
         if len(query_phrase) > 0:
             gpt_phrase_urls = request_google(query_phrase)
-            extract_query = (
-                query_phrase  # prefer more succinct query phrase if available
-            )
         if len(orig_phrase_urls) == 0 and len(gpt_phrase_urls) == 0:
             return "", [], 0, [""], 0, [""]
 
@@ -280,9 +273,9 @@ def search_google(original_query, query_phrase):
         all_urls = copy.deepcopy(urls)
         # initialize scan of google urls
         start_wall_time = time.time()
-        full_text, index, urls_used, tried_index, urls_tried = process_urls(extract_query, all_urls)
+        full_text, urls_used, tried_index, urls_tried = process_urls(all_urls)
         logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
         logger.info("return from url processsing")
     except:
         traceback.print_exc()
-    return full_text, all_urls, index, urls_used, tried_index, urls_tried
+    return full_text, all_urls, urls_used, tried_index, urls_tried
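For reference, a self-contained sketch of the sequential fetch pattern that process_urls switches to in this commit, with hypothetical pick_next and fetch_page callables standing in for site_stats.get_next and process_url; each URL gets whatever remains of a per-request budget, and the loop stops when the URL list or the overall wall-clock budget is exhausted:

import time

def fetch_sequentially(urls, fetch_page, pick_next, overall_budget=28, per_url_budget=12):
    """Fetch pages one at a time, shrinking each request's timeout as time runs out."""
    results = []
    start = time.time()
    while urls and time.time() - start < overall_budget:
        url = pick_next(urls)                  # e.g. best-ranked remaining URL
        urls.remove(url)
        timeout = per_url_budget - int(time.time() - start)
        if timeout <= 0:
            break                              # no time left for another request
        text = fetch_page(url, timeout)        # blocking call; no executor involved
        if text:
            results.append({"url": url, "text": text})
    return results

Compared with the ThreadPoolExecutor version removed above, this trades overlapping slow requests for simpler control flow, which matches the commit's "experiment with no concurrency" intent.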