arabellastrange committed on
Commit 200cc44 · 1 Parent(s): f096f8f

different url processing loop

Files changed (1)
  1. web_search.py +11 -82
web_search.py CHANGED
@@ -9,10 +9,8 @@ import warnings
 from itertools import zip_longest
 
 import requests
-from unstructured.partition.html import partition_html
 from zenrows import ZenRowsClient
 
-from llmsearch import site_stats
 # this import style works in pycharm
 from llmsearch import utilityV2 as ut
 
@@ -35,7 +33,7 @@ def search(msg, query_phrase):
     google_text = ""
     try:
         print(f"asking google {msg}; rephrased: {query_phrase}")
-        google_text, urls_all, urls_used, tried_index, urls_tried = search_google(msg, query_phrase)
+        google_text = search_google(msg, query_phrase)
     except:
         traceback.print_exc()
 
@@ -77,43 +75,23 @@ def process_url(url):
         traceback.print_exc()
         print(f"{site} err")
         pass
-    print(f"Processed {site}: {len(response.text)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
+    print(f"Processed {site}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
     return result, url
 
 
 def process_urls(urls):
-    # Create a ThreadPoolExecutor with 5 worker threads
     response = []
-    print("entering process urls")
-    full_text = ""
-    used_index = 0
-    urls_used = ["" for i in range(30)]
-    tried_index = 0
-    urls_tried = ["" for i in range(30)]
+    print(f"entering process urls: {len(urls)} found. {urls}")
     start_time = time.time()
-    in_process = []
 
     try:
-        while (len(urls) > 0
-               # no sense starting if not much time left
-               and (len(full_text) < 4800 and len(in_process) < 10 and time.time() - start_time < 8)
-               ):
-            recommendation = site_stats.get_next(urls, sample_unknown=True)
-            # set timeout so we don't wait for a slow site forever
-            timeout = 12 - int(time.time() - start_time)
-            url = recommendation[1]
+        for url in urls:
             result, url = process_url(url)
-            urls_tried[tried_index] = url
-            tried_index += 1
-            urls.remove(url)
-            print(f"queued {ut.extract_site(url)}, {timeout}")
             if len(result) > 0:
-                urls_used[used_index] = url
-                used_index += 1
-                print(
-                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
-                )
                 if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    print(
+                        f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
+                    )
                     response.append(
                         {
                             "source": ut.extract_domain(url),
@@ -121,18 +99,13 @@ def process_urls(urls):
                             "text": result,
                         }
                     )
-
-            if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
-                print(
-                    f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
-                )
-                return response, used_index, urls_used, tried_index, urls_tried
     except:
         traceback.print_exc()
+
     print(
         f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
     )
-    return response, urls_used, tried_index, urls_tried
+    return response
 
 
 def extract_subtext(text):
@@ -182,50 +155,7 @@ def request_google(query_phrase):
     return urls
 
 
-# def response_text_extract(url, response):
-#     extract_text = ""
-#     if url.endswith("pdf"):
-#         pass
-#     else:
-#         if response is not None:
-#             elements = partition_html(text=response)
-#             str_elements = []
-#             logger.info('\n***** elements')
-#             for e in elements:
-#                 stre = str(e).replace(" ", " ")
-#                 str_elements.append(stre)
-#             extract_text = ''.join(extract_subtext(str_elements))
-#             logger.info(
-#                 f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
-#             )
-#
-#     if len(extract_text.strip()) < 8:
-#         return ""
-#     else:
-#         return extract_text
-
-
-# def extract_items_from_numbered_list(text):
-#     items = ""
-#     elements = text.split("\n")
-#     for candidate in elements:
-#         candidate = candidate.lstrip(". \t")
-#         if len(candidate) > 4 and candidate[0].isdigit():
-#             candidate = candidate[1:].lstrip(". ")
-#             if (
-#                 len(candidate) > 4 and candidate[0].isdigit()
-#             ): # strip second digit if more than 10 items
-#                 candidate = candidate[1:].lstrip(". ")
-#             logger.info("E {}".format(candidate))
-#             items += candidate + " "
-#     return items
-
-
 def search_google(original_query, query_phrase):
-    all_urls = []
-    urls_used = []
-    urls_tried = []
-    tried_index = 0
     full_text = ""
 
     try: # query google for recent info
@@ -252,9 +182,8 @@ def search_google(original_query, query_phrase):
         all_urls = copy.deepcopy(urls)
         # initialize scan of Google urls
         start_wall_time = time.time()
-        full_text, urls_used, tried_index, urls_tried = process_urls(all_urls)
+        full_text = process_urls(all_urls)
         print(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
-        print("return from url processsing")
     except:
         traceback.print_exc()
-    return full_text, all_urls, urls_used, tried_index, urls_tried
+    return full_text
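For readers skimming the diff, here is a minimal, self-contained sketch of the URL-processing flow this commit ends up with: process_urls now just walks the URL list in order, keeps any non-empty result that does not look like an error or block page, and returns the collected records. The process_url and extract_domain definitions below are hypothetical stand-ins invented for this sketch (the real module fetches pages via ZenRows and uses helpers from llmsearch.utilityV2), the record fields are abbreviated to the ones visible in the diff, and the error-marker test is folded into a single any() over lowercase phrases rather than the chained "not in" comparisons in the committed code.

import time
import traceback
from urllib.parse import urlparse

# Phrases that mark a fetched page as an error/block page rather than content
# (mirrors the chained "not in" checks in process_urls above).
ERROR_MARKERS = (
    "an error has occurred",
    "permission to view this page",
    "403 error",
    "have been blocked",
    "too many requests",
)


def process_url(url):
    # Hypothetical stand-in for web_search.process_url, which in the real module
    # fetches the page and returns (extracted_text, url).
    return f"placeholder text for {url}", url


def extract_domain(url):
    # Hypothetical stand-in for ut.extract_domain from llmsearch.utilityV2.
    return urlparse(url).netloc


def process_urls(urls):
    # Sequentially process each URL; keep results that are non-empty and match
    # no known error marker. This mirrors the loop introduced by the commit.
    response = []
    start_time = time.time()
    try:
        for url in urls:
            result, url = process_url(url)
            if result and not any(marker in result.lower() for marker in ERROR_MARKERS):
                response.append({"source": extract_domain(url), "text": result})
    except Exception:
        traceback.print_exc()
    print(f"processed all urls: {len(response)} results in {int(time.time() - start_time)} secs")
    return response


if __name__ == "__main__":
    print(process_urls(["https://example.com/a", "https://example.org/b"]))

Two side notes on the committed version: because the diff compares the uppercase literal "403 ERROR" against result.lower(), that particular marker can never match (the sketch lowercases all markers instead), and since process_urls now returns only the list of records, search_google and search no longer need the urls_used / urls_tried bookkeeping, which is why their return values shrink to a single value in the hunks above.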