Spaces:
Sleeping
Sleeping
arabellastrange
committed on
Commit
·
f096f8f
1
Parent(s):
1aca16d
removed concurrency
Browse files - web_search.py +37 -54
web_search.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
import concurrent.futures
|
2 |
import copy
|
3 |
import json
|
4 |
import logging
|
@@ -10,6 +9,7 @@ import warnings
|
|
10 |
from itertools import zip_longest
|
11 |
|
12 |
import requests
|
|
|
13 |
from zenrows import ZenRowsClient
|
14 |
|
15 |
from llmsearch import site_stats
|
@@ -23,7 +23,6 @@ from llmsearch import utilityV2 as ut
|
|
23 |
# from llmsearch import utilityV2 as ut
|
24 |
|
25 |
logger = logging.getLogger("agent_logger")
|
26 |
-
logger = logging.getLogger("agent_logger")
|
27 |
|
28 |
|
29 |
# todo drop blocked pages > see og llmsearch code
|
@@ -93,59 +92,43 @@ def process_urls(urls):
|
|
93 |
urls_tried = ["" for i in range(30)]
|
94 |
start_time = time.time()
|
95 |
in_process = []
|
96 |
-
processed = []
|
97 |
-
google_futures = []
|
98 |
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
"url": url,
|
136 |
-
"text": result,
|
137 |
-
}
|
138 |
-
)
|
139 |
-
if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
|
140 |
-
executor.shutdown(wait=False)
|
141 |
-
print(
|
142 |
-
f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
|
143 |
-
)
|
144 |
-
return response, used_index, urls_used, tried_index, urls_tried
|
145 |
-
time.sleep(0.5)
|
146 |
-
except:
|
147 |
-
traceback.print_exc()
|
148 |
-
executor.shutdown(wait=False)
|
149 |
print(
|
150 |
f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
|
151 |
)
|
|
|
|
|
1 |
import copy
|
2 |
import json
|
3 |
import logging
|
|
|
9 |
from itertools import zip_longest
|
10 |
|
11 |
import requests
|
12 |
+
from unstructured.partition.html import partition_html
|
13 |
from zenrows import ZenRowsClient
|
14 |
|
15 |
from llmsearch import site_stats
|
|
|
23 |
# from llmsearch import utilityV2 as ut
|
24 |
|
25 |
logger = logging.getLogger("agent_logger")
|
|
|
26 |
|
27 |
|
28 |
# todo drop blocked pages > see og llmsearch code
|
|
|
92 |
urls_tried = ["" for i in range(30)]
|
93 |
start_time = time.time()
|
94 |
in_process = []
|
|
|
|
|
95 |
|
96 |
+
try:
|
97 |
+
while (len(urls) > 0
|
98 |
+
# no sense starting if not much time left
|
99 |
+
and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
|
100 |
+
):
|
101 |
+
recommendation = site_stats.get_next(urls, sample_unknown=True)
|
102 |
+
# set timeout so we don't wait for a slow site forever
|
103 |
+
timeout = 12 - int(time.time() - start_time)
|
104 |
+
url = recommendation[1]
|
105 |
+
result, url = process_url(url)
|
106 |
+
urls_tried[tried_index] = url
|
107 |
+
tried_index += 1
|
108 |
+
urls.remove(url)
|
109 |
+
print(f"queued {ut.extract_site(url)}, {timeout}")
|
110 |
+
if len(result) > 0:
|
111 |
+
urls_used[used_index] = url
|
112 |
+
used_index += 1
|
113 |
+
print(
|
114 |
+
f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
|
115 |
+
)
|
116 |
+
if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
|
117 |
+
response.append(
|
118 |
+
{
|
119 |
+
"source": ut.extract_domain(url),
|
120 |
+
"url": url,
|
121 |
+
"text": result,
|
122 |
+
}
|
123 |
+
)
|
124 |
+
|
125 |
+
if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
|
126 |
+
print(
|
127 |
+
f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
|
128 |
+
)
|
129 |
+
return response, used_index, urls_used, tried_index, urls_tried
|
130 |
+
except:
|
131 |
+
traceback.print_exc()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
print(
|
133 |
f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
|
134 |
)
|