arabellastrange committed on
Commit f096f8f · 1 Parent(s): 1aca16d

removed concurrency

Files changed (1)
  1. web_search.py +37 -54
web_search.py CHANGED
@@ -1,4 +1,3 @@
- import concurrent.futures
  import copy
  import json
  import logging
@@ -10,6 +9,7 @@ import warnings
  from itertools import zip_longest
 
  import requests
+ from unstructured.partition.html import partition_html
  from zenrows import ZenRowsClient
 
  from llmsearch import site_stats
@@ -23,7 +23,6 @@ from llmsearch import utilityV2 as ut
  # from llmsearch import utilityV2 as ut
 
  logger = logging.getLogger("agent_logger")
- logger = logging.getLogger("agent_logger")
 
 
  # todo drop blocked pages > see og llmsearch code
@@ -93,59 +92,43 @@ def process_urls(urls):
      urls_tried = ["" for i in range(30)]
      start_time = time.time()
      in_process = []
-     processed = []
-     google_futures = []
 
-     with (concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor):
-         # initialize scan of Google urls
-         while True:
-             try:
-                 while (len(urls) > 0
-                         # no sense starting if not much time left
-                         and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
-                 ):
-                     recommendation = site_stats.get_next(urls, sample_unknown=True)
-                     # set timeout so we don't wait for a slow site forever
-                     timeout = 12 - int(time.time() - start_time)
-                     url = recommendation[1]
-                     future = executor.submit(process_url, url)
-                     # result, url = process_url(url)
-                     google_futures.append(future)
-                     in_process.append(future)
-                     urls_tried[tried_index] = url
-                     tried_index += 1
-                     urls.remove(url)
-                     print(f"queued {ut.extract_site(url)}, {timeout}")
-                 # Process the responses as they arrive
-                 for future in in_process:
-                     if future.done():
-                         result, url = future.result()
-                         processed.append(future)
-                         in_process.remove(future)
-                         if len(result) > 0:
-                             urls_used[used_index] = url
-                             used_index += 1
-                             print(
-                                 f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
-                             )
-                             if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
-                                 response.append(
-                                     {
-                                         "source": ut.extract_domain(url),
-                                         "url": url,
-                                         "text": result,
-                                     }
-                                 )
-                 if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
-                     executor.shutdown(wait=False)
-                     print(
-                         f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
-                     )
-                     return response, used_index, urls_used, tried_index, urls_tried
-                 time.sleep(0.5)
-             except:
-                 traceback.print_exc()
-                 executor.shutdown(wait=False)
+     try:
+         while (len(urls) > 0
+                 # no sense starting if not much time left
+                 and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
+         ):
+             recommendation = site_stats.get_next(urls, sample_unknown=True)
+             # set timeout so we don't wait for a slow site forever
+             timeout = 12 - int(time.time() - start_time)
+             url = recommendation[1]
+             result, url = process_url(url)
+             urls_tried[tried_index] = url
+             tried_index += 1
+             urls.remove(url)
+             print(f"queued {ut.extract_site(url)}, {timeout}")
+             if len(result) > 0:
+                 urls_used[used_index] = url
+                 used_index += 1
+                 print(
+                     f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
+                 )
+                 if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                     response.append(
+                         {
+                             "source": ut.extract_domain(url),
+                             "url": url,
+                             "text": result,
+                         }
+                     )
+
+             if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
+                 print(
+                     f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
+                 )
+                 return response, used_index, urls_used, tried_index, urls_tried
+     except:
+         traceback.print_exc()
      print(
          f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
      )
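
Note: the new import, from unstructured.partition.html import partition_html, is not referenced in the hunks shown above, so it presumably supports HTML-to-text extraction inside process_url or a related helper that this diff does not touch. As a rough sketch of how that library call is typically used (the html_to_text helper below is illustrative only, not code from this repository):

from unstructured.partition.html import partition_html

def html_to_text(html: str) -> str:
    # partition_html splits raw HTML into document elements (titles,
    # narrative text, list items, ...); joining their string forms
    # yields plain text suitable for the downstream response payload.
    elements = partition_html(text=html)
    return "\n".join(str(el) for el in elements)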