Spaces:
Sleeping
Sleeping
arabellastrange
committed on
Commit
·
2417a86
1
Parent(s):
6ce8a36
concurrent pool for process_url
Browse files
- web_search.py +8 -12
web_search.py
CHANGED
@@ -50,14 +50,11 @@ def search(msg, query_phrase):
|
|
50 |
|
51 |
# Define a function to make a single URL request and process the response
|
52 |
def process_url(url):
|
53 |
-
processed_page =
|
54 |
start_time = time.time()
|
55 |
-
site = ut.extract_site(url)
|
56 |
-
result = ""
|
57 |
try:
|
58 |
with warnings.catch_warnings():
|
59 |
warnings.simplefilter("ignore")
|
60 |
-
result = ""
|
61 |
try:
|
62 |
client = ZenRowsClient(os.getenv('zenrows_api_key'))
|
63 |
response = client.get(url)
|
@@ -65,22 +62,19 @@ def process_url(url):
|
|
65 |
result = response.text
|
66 |
if len(result) > 0:
|
67 |
if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
|
68 |
-
processed_page
|
69 |
-
{
|
70 |
"source": ut.extract_domain(url),
|
71 |
"url": url,
|
72 |
"text": result,
|
73 |
}
|
74 |
-
|
|
|
75 |
except Exception:
|
76 |
traceback.print_exc()
|
77 |
return processed_page
|
78 |
except Exception:
|
79 |
traceback.print_exc()
|
80 |
-
|
81 |
-
pass
|
82 |
-
print(f"Processed {site}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
|
83 |
-
return processed_page
|
84 |
|
85 |
|
86 |
def process_urls(urls):
|
@@ -90,7 +84,9 @@ def process_urls(urls):
|
|
90 |
|
91 |
try:
|
92 |
with ThreadPoolExecutor(max_workers=len(urls)) as pool:
|
93 |
-
|
|
|
|
|
94 |
except:
|
95 |
traceback.print_exc()
|
96 |
|
|
|
50 |
|
51 |
# Define a function to make a single URL request and process the response
|
52 |
def process_url(url):
|
53 |
+
processed_page = {}
|
54 |
start_time = time.time()
|
|
|
|
|
55 |
try:
|
56 |
with warnings.catch_warnings():
|
57 |
warnings.simplefilter("ignore")
|
|
|
58 |
try:
|
59 |
client = ZenRowsClient(os.getenv('zenrows_api_key'))
|
60 |
response = client.get(url)
|
|
|
62 |
result = response.text
|
63 |
if len(result) > 0:
|
64 |
if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
|
65 |
+
processed_page = {
|
|
|
66 |
"source": ut.extract_domain(url),
|
67 |
"url": url,
|
68 |
"text": result,
|
69 |
}
|
70 |
+
print(f"Processed {url}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
|
71 |
+
return processed_page
|
72 |
except Exception:
|
73 |
traceback.print_exc()
|
74 |
return processed_page
|
75 |
except Exception:
|
76 |
traceback.print_exc()
|
77 |
+
return processed_page
|
|
|
|
|
|
|
78 |
|
79 |
|
80 |
def process_urls(urls):
|
|
|
84 |
|
85 |
try:
|
86 |
with ThreadPoolExecutor(max_workers=len(urls)) as pool:
|
87 |
+
for result in pool.map(process_url, urls):
|
88 |
+
print(f'returned {result}')
|
89 |
+
results.append(result)
|
90 |
except:
|
91 |
traceback.print_exc()
|
92 |
|