arabellastrange committed on
Commit
2417a86
·
1 Parent(s): 6ce8a36

concurrent pool for process_url

Browse files
Files changed (1) hide show
  1. web_search.py +8 -12
web_search.py CHANGED
@@ -50,14 +50,11 @@ def search(msg, query_phrase):
50
 
51
  # Define a function to make a single URL request and process the response
52
  def process_url(url):
53
- processed_page = []
54
  start_time = time.time()
55
- site = ut.extract_site(url)
56
- result = ""
57
  try:
58
  with warnings.catch_warnings():
59
  warnings.simplefilter("ignore")
60
- result = ""
61
  try:
62
  client = ZenRowsClient(os.getenv('zenrows_api_key'))
63
  response = client.get(url)
@@ -65,22 +62,19 @@ def process_url(url):
65
  result = response.text
66
  if len(result) > 0:
67
  if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
68
- processed_page.append(
69
- {
70
  "source": ut.extract_domain(url),
71
  "url": url,
72
  "text": result,
73
  }
74
- )
 
75
  except Exception:
76
  traceback.print_exc()
77
  return processed_page
78
  except Exception:
79
  traceback.print_exc()
80
- print(f"{site} err")
81
- pass
82
- print(f"Processed {site}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
83
- return processed_page
84
 
85
 
86
  def process_urls(urls):
@@ -90,7 +84,9 @@ def process_urls(urls):
90
 
91
  try:
92
  with ThreadPoolExecutor(max_workers=len(urls)) as pool:
93
- results = pool.map(process_url, urls)
 
 
94
  except:
95
  traceback.print_exc()
96
 
 
50
 
51
  # Define a function to make a single URL request and process the response
52
  def process_url(url):
53
+ processed_page = {}
54
  start_time = time.time()
 
 
55
  try:
56
  with warnings.catch_warnings():
57
  warnings.simplefilter("ignore")
 
58
  try:
59
  client = ZenRowsClient(os.getenv('zenrows_api_key'))
60
  response = client.get(url)
 
62
  result = response.text
63
  if len(result) > 0:
64
  if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
65
+ processed_page = {
 
66
  "source": ut.extract_domain(url),
67
  "url": url,
68
  "text": result,
69
  }
70
+ print(f"Processed {url}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
71
+ return processed_page
72
  except Exception:
73
  traceback.print_exc()
74
  return processed_page
75
  except Exception:
76
  traceback.print_exc()
77
+ return processed_page
 
 
 
78
 
79
 
80
  def process_urls(urls):
 
84
 
85
  try:
86
  with ThreadPoolExecutor(max_workers=len(urls)) as pool:
87
+ for result in pool.map(process_url, urls):
88
+ print(f'returned {result}')
89
+ results.append(result)
90
  except:
91
  traceback.print_exc()
92