Spaces:
initial commit
Browse files
google_search.py  +23 -19
google_search.py CHANGED
@@ -16,6 +16,7 @@ CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 # Number of pages to scrape
 NUM_PAGES = 20
 
+
 def build_results_beautifulsoup(url_list):
     print("Starting to scrape URLs...")
     start_time = time.perf_counter()
@@ -37,7 +38,7 @@ def build_results_beautifulsoup(url_list):
 
     if soup:
         print(f"Processing URL: {url}")
-
+
         text = extract(
             soup,
             include_tables=False,
@@ -143,8 +144,8 @@ async def extract_pdf_text(content):
         </body>
         </html>
         """
-        html_bytes = html_content.encode(
-        return html_bytes
+        html_bytes = html_content.encode("utf-8")
+        return html_bytes  # Return in such a format that is parsable by trafilatura
     except Exception as e:
         print(f"Error extracting PDF text: {str(e)}")
         return None
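The hunk above wraps the extracted PDF text in a minimal HTML document and returns it as UTF-8 bytes, so the downstream trafilatura extract call can treat a PDF the same way as a scraped web page. A rough sketch of that round trip is below; the wrap_pdf_text_as_html helper and the pdf_text argument are illustrative assumptions, not names from this commit, and the Space's own PDF parsing is not shown.

import trafilatura  # pip install trafilatura

def wrap_pdf_text_as_html(pdf_text: str) -> bytes:
    # Wrap plain PDF text in a minimal HTML shell and encode it,
    # so it can be fed to trafilatura like ordinary page content.
    html_content = f"""<html>
    <body>
    <p>{pdf_text}</p>
    </body>
    </html>
    """
    return html_content.encode("utf-8")

html_bytes = wrap_pdf_text_as_html("Example text pulled from a PDF.")
# Decoded back to a string here before extraction for safety.
text = trafilatura.extract(html_bytes.decode("utf-8"), include_tables=False)
print(text)  # may be None for very short documents; real PDF text is longer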
@@ -183,39 +184,41 @@ def google_search_urls(
 ):
     if skip_urls is None:
         skip_urls = []  # Initialize as empty list if not provided
-
+
     service = build("customsearch", "v1", developerKey=api_key)
     url_list = []
     start_index = 1  # Initial index for the search results
     while len(url_list) < total_results:
         # Fetch a page of results
-        results =
-
-
-
-
-
-
-
-
+        results = (
+            service.cse()
+            .list(
+                q=text,
+                cx=cse_id,
+                sort=sorted_date,
+                start=start_index,
+                num=min(num_results, total_results - len(url_list)),
+                **kwargs,
+            )
+            .execute()
+        )
+
         if "items" in results and len(results["items"]) > 0:
             for count, link in enumerate(results["items"]):
                 url = link["link"]
                 # Skip if the URL is in the skip_urls list or doesn't match the domain filter
                 if url in skip_urls:
                     continue
-                if (domains_to_include is None) or any(
-                    ("." + domain) in url for domain in domains_to_include
-                ):
+                if (domains_to_include is None) or any(("." + domain) in url for domain in domains_to_include):
                     if url not in url_list:
                         url_list.append(url)
         else:
             # No more results
             break
-
+
         # Move to the next page of results
         start_index += num_results
-
+
     return url_list[:total_results]
 
 
@@ -240,7 +243,8 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     result_content = build_results_beautifulsoup(url_list)
     return result_content
 
+
 if __name__ == "__main__":
     res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
     print(res.keys())
-    print(len(res))
+    print(len(res))
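The largest change in this commit, in the hunk for google_search_urls, switches to a paginated fetch against the Google Custom Search JSON API via google-api-python-client, requesting up to num results per call and advancing start_index until total_results links are collected. The following is a minimal, self-contained sketch of that pattern; the fetch_urls wrapper, the GOOGLE_SEARCH_API_KEY variable name, the example query, and the trafilatura.fetch_url download step are assumptions made for the sketch, not part of the commit (the Space downloads pages in build_results_beautifulsoup and passes them to trafilatura's extract).

import os

import trafilatura                             # pip install trafilatura
from googleapiclient.discovery import build   # pip install google-api-python-client

API_KEY = os.environ["GOOGLE_SEARCH_API_KEY"]  # assumed name for the API key variable
CSE_ID = os.environ["GOOGLE_SEARCH_CSE_ID"]    # same variable the Space reads


def fetch_urls(query, total_results=20, num_results=10, sorted_date=None):
    """Collect up to total_results links by paging through the Custom Search API."""
    service = build("customsearch", "v1", developerKey=API_KEY)
    url_list = []
    start_index = 1                            # CSE result indices are 1-based
    while len(url_list) < total_results:
        params = dict(
            q=query,
            cx=CSE_ID,
            start=start_index,
            num=min(num_results, total_results - len(url_list)),  # CSE caps num at 10
        )
        if sorted_date:
            params["sort"] = sorted_date       # e.g. "date:r:20240101:20241231"
        results = service.cse().list(**params).execute()

        items = results.get("items", [])
        if not items:
            break                              # no more results
        for item in items:
            url = item["link"]
            if url not in url_list:
                url_list.append(url)
        start_index += num_results             # move to the next page of results
    return url_list[:total_results]


if __name__ == "__main__":
    for url in fetch_urls("low resource languages", total_results=5,
                          sorted_date="date:r:20240101:20241231"):
        downloaded = trafilatura.fetch_url(url)   # returns None on failure
        text = trafilatura.extract(downloaded, include_tables=False) if downloaded else None
        print(url, (text or "")[:80])

The committed version additionally filters each link against skip_urls and domains_to_include before appending it, as shown in the hunk above.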