arabellastrange committed · Commit 200cc44 · 1 Parent(s): f096f8f

different url processing loop

web_search.py: +11 -82

web_search.py CHANGED
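For orientation, this commit drops the site_stats scheduling logic and has process_urls walk the Google result URLs in a plain sequential loop, with search() reaching it through search_google(). The sketch below is a reading aid, not code from this repo; fetch_page is a hypothetical stand-in for the scraping done in process_url():

def fetch_page(url):
    # hypothetical placeholder for the ZenRows/requests scraping in process_url()
    return ""

def process_urls(urls):
    # sequential loop introduced by this commit: one URL at a time, no scheduling
    response = []
    for url in urls:
        result = fetch_page(url)
        if result:
            response.append({"source": url, "text": result})
    return response

def search_google(original_query, query_phrase):
    urls = []  # would be filled from the Google query in the real module
    return process_urls(urls)

def search(msg, query_phrase):
    # after this commit, search() assigns the result of search_google() directly
    return search_google(msg, query_phrase)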
@@ -9,10 +9,8 @@ import warnings
 from itertools import zip_longest
 
 import requests
-from unstructured.partition.html import partition_html
 from zenrows import ZenRowsClient
 
-from llmsearch import site_stats
 # this import style works in pycharm
 from llmsearch import utilityV2 as ut
 
@@ -35,7 +33,7 @@ def search(msg, query_phrase):
     google_text = ""
     try:
         print(f"asking google {msg}; rephrased: {query_phrase}")
-        google_text
+        google_text = search_google(msg, query_phrase)
     except:
         traceback.print_exc()
 
@@ -77,43 +75,23 @@ def process_url(url):
         traceback.print_exc()
         print(f"{site} err")
         pass
-    print(f"Processed {site}: {len(
+    print(f"Processed {site}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
     return result, url
 
 
 def process_urls(urls):
-    # Create a ThreadPoolExecutor with 5 worker threads
     response = []
-    print("entering process urls")
-    full_text = ""
-    used_index = 0
-    urls_used = ["" for i in range(30)]
-    tried_index = 0
-    urls_tried = ["" for i in range(30)]
+    print(f"entering process urls: {len(urls)} found. {urls}")
     start_time = time.time()
-    in_process = []
 
     try:
-
-        # no sense starting if not much time left
-            and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
-        ):
-            recommendation = site_stats.get_next(urls, sample_unknown=True)
-            # set timeout so we don't wait for a slow site forever
-            timeout = 12 - int(time.time() - start_time)
-            url = recommendation[1]
+        for url in urls:
             result, url = process_url(url)
-            urls_tried[tried_index] = url
-            tried_index += 1
-            urls.remove(url)
-            print(f"queued {ut.extract_site(url)}, {timeout}")
             if len(result) > 0:
-                urls_used[used_index] = url
-                used_index += 1
-                print(
-                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
-                )
                 if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    print(
+                        f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
+                    )
                     response.append(
                         {
                             "source": ut.extract_domain(url),
@@ -121,18 +99,13 @@ def process_urls(urls):
                             "text": result,
                         }
                     )
-
-            if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
-                print(
-                    f"\n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
-                )
-                return response, used_index, urls_used, tried_index, urls_tried
     except:
         traceback.print_exc()
+
     print(
         f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
     )
-    return response
+    return response
 
 
 def extract_subtext(text):
@@ -182,50 +155,7 @@ def request_google(query_phrase):
     return urls
 
 
-# def response_text_extract(url, response):
-#     extract_text = ""
-#     if url.endswith("pdf"):
-#         pass
-#     else:
-#         if response is not None:
-#             elements = partition_html(text=response)
-#             str_elements = []
-#             logger.info('\n***** elements')
-#             for e in elements:
-#                 stre = str(e).replace(" ", " ")
-#                 str_elements.append(stre)
-#         extract_text = ''.join(extract_subtext(str_elements))
-#         logger.info(
-#             f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
-#         )
-#
-#     if len(extract_text.strip()) < 8:
-#         return ""
-#     else:
-#         return extract_text
-
-
-# def extract_items_from_numbered_list(text):
-#     items = ""
-#     elements = text.split("\n")
-#     for candidate in elements:
-#         candidate = candidate.lstrip(". \t")
-#         if len(candidate) > 4 and candidate[0].isdigit():
-#             candidate = candidate[1:].lstrip(". ")
-#             if (
-#                 len(candidate) > 4 and candidate[0].isdigit()
-#             ):  # strip second digit if more than 10 items
-#                 candidate = candidate[1:].lstrip(". ")
-#             logger.info("E {}".format(candidate))
-#         items += candidate + " "
-#     return items
-
-
 def search_google(original_query, query_phrase):
-    all_urls = []
-    urls_used = []
-    urls_tried = []
-    tried_index = 0
     full_text = ""
 
     try:  # query google for recent info
@@ -252,9 +182,8 @@ def search_google(original_query, query_phrase):
         all_urls = copy.deepcopy(urls)
         # initialize scan of Google urls
         start_wall_time = time.time()
-        full_text
+        full_text = process_urls(all_urls)
         print(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
-        print("return from url processsing")
     except:
         traceback.print_exc()
-    return full_text
+    return full_text
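The condition kept in the new process_urls is effectively a phrase blocklist run against the scraped page text before a result is stored. Below is a small self-contained sketch of that idea with the phrase list copied from the diff; the plain requests.get fetch and the 10-second timeout are assumptions for illustration, not the repo's ZenRows-based process_url. (Note the diff compares the uppercase string "403 ERROR" against lowercased text; the sketch lowercases the phrase so the comparison can actually match.)

import requests

ERROR_PHRASES = [
    "an error has occurred",
    "permission to view this page",
    "403 error",
    "have been blocked",
    "too many requests",
]

def looks_like_error_page(text):
    # reject the page if any known error phrase appears in its lowercased text
    lowered = text.lower()
    return any(phrase in lowered for phrase in ERROR_PHRASES)

def fetch_and_filter(urls):
    # sequentially fetch each URL and keep only non-empty, non-error bodies
    kept = []
    for url in urls:
        try:
            body = requests.get(url, timeout=10).text  # assumed plain GET for the sketch
        except requests.RequestException:
            continue
        if body and not looks_like_error_page(body):
            kept.append({"source": url, "text": body})
    return kept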