minko186 committed
Commit 9a9aac4 · 1 parent: a6fbfb6

initial commit

Files changed (1)
  1. google_search.py +23 -19
google_search.py CHANGED
@@ -16,6 +16,7 @@ CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 # Number of pages to scrape
 NUM_PAGES = 20
 
+
 def build_results_beautifulsoup(url_list):
     print("Starting to scrape URLs...")
     start_time = time.perf_counter()
@@ -37,7 +38,7 @@ def build_results_beautifulsoup(url_list):
 
         if soup:
             print(f"Processing URL: {url}")
-
+
             text = extract(
                 soup,
                 include_tables=False,
@@ -143,8 +144,8 @@ async def extract_pdf_text(content):
         </body>
         </html>
         """
-        html_bytes = html_content.encode('utf-8')
-        return html_bytes # Return in such a format that is parsable by trafilatura
+        html_bytes = html_content.encode("utf-8")
+        return html_bytes  # Return in such a format that is parsable by trafilatura
     except Exception as e:
         print(f"Error extracting PDF text: {str(e)}")
         return None
@@ -183,39 +184,41 @@ def google_search_urls(
 ):
     if skip_urls is None:
         skip_urls = []  # Initialize as empty list if not provided
-
+
     service = build("customsearch", "v1", developerKey=api_key)
     url_list = []
     start_index = 1  # Initial index for the search results
     while len(url_list) < total_results:
         # Fetch a page of results
-        results = service.cse().list(
-            q=text,
-            cx=cse_id,
-            sort=sorted_date,
-            start=start_index,
-            num=min(num_results, total_results - len(url_list)),
-            **kwargs
-        ).execute()
-
+        results = (
+            service.cse()
+            .list(
+                q=text,
+                cx=cse_id,
+                sort=sorted_date,
+                start=start_index,
+                num=min(num_results, total_results - len(url_list)),
+                **kwargs,
+            )
+            .execute()
+        )
+
         if "items" in results and len(results["items"]) > 0:
             for count, link in enumerate(results["items"]):
                 url = link["link"]
                 # Skip if the URL is in the skip_urls list or doesn't match the domain filter
                 if url in skip_urls:
                     continue
-                if (domains_to_include is None) or any(
-                    ("." + domain) in url for domain in domains_to_include
-                ):
+                if (domains_to_include is None) or any(("." + domain) in url for domain in domains_to_include):
                     if url not in url_list:
                         url_list.append(url)
         else:
             # No more results
             break
-
+
         # Move to the next page of results
         start_index += num_results
-
+
     return url_list[:total_results]
 
 
@@ -240,7 +243,8 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     result_content = build_results_beautifulsoup(url_list)
     return result_content
 
+
 if __name__ == "__main__":
     res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
     print(res.keys())
-    print(len(res))
+    print(len(res))
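
For context on the reformatted extract_pdf_text lines: the function wraps text pulled from a PDF in a bare HTML shell and returns UTF-8 bytes so trafilatura can parse them. A minimal sketch of that idea, with a hypothetical placeholder string standing in for real PDF text (this is not the commit's code):

from trafilatura import extract

pdf_text = "Placeholder paragraph standing in for text extracted from a PDF."  # hypothetical input

# Wrap the plain text in a minimal HTML document, as extract_pdf_text does.
html_content = f"""<html>
<body>
<p>{pdf_text}</p>
</body>
</html>
"""

html_bytes = html_content.encode("utf-8")

# trafilatura's HTML loader accepts bytes as well as strings; very short
# documents may still come back as None because of its length heuristics.
print(extract(html_bytes))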
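
The largest hunk only reformats the Custom Search call, but the pagination pattern it wraps is worth spelling out: each service.cse().list(...).execute() call returns at most one page of "items", and start_index += num_results requests the next page. A rough, self-contained sketch of the same loop, assuming GOOGLE_SEARCH_CSE_ID is set as in the file (the API-key variable name below is a guess, not taken from the source):

import os

from googleapiclient.discovery import build

API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY")  # assumed name for illustration
CSE_ID = os.environ.get("GOOGLE_SEARCH_CSE_ID")

service = build("customsearch", "v1", developerKey=API_KEY)

total_results, num_results = 20, 10  # Custom Search returns at most 10 items per call
url_list, start_index = [], 1

while len(url_list) < total_results:
    # Fetch one page of results, asking only for as many items as are still needed.
    results = (
        service.cse()
        .list(
            q="low resource languages",
            cx=CSE_ID,
            start=start_index,
            num=min(num_results, total_results - len(url_list)),
        )
        .execute()
    )
    items = results.get("items", [])
    if not items:
        break  # no more results
    url_list.extend(item["link"] for item in items if item["link"] not in url_list)
    start_index += num_results  # move to the next page

print(url_list[:total_results])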
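
The collapsed one-line condition in google_search_urls is a substring check, not a hostname parse: a URL passes when no allow-list is given, or when any "." + domain fragment occurs anywhere in it. A small standalone sketch of that behaviour (the helper name is made up for illustration):

def passes_domain_filter(url, domains_to_include):
    # Mirrors the reformatted condition: no allow-list means everything passes;
    # otherwise any ".<domain>" substring anywhere in the URL is enough.
    return (domains_to_include is None) or any(("." + domain) in url for domain in domains_to_include)


print(passes_domain_filter("https://arxiv.org/abs/2401.00001", ["org", "edu"]))  # True  (".org" matches)
print(passes_domain_filter("https://example.com/post", ["org", "edu"]))          # False (no ".org" or ".edu")
print(passes_domain_filter("https://example.com/post", None))                    # True  (no filter given)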