minko186 committed
Commit b51be98 · 1 Parent(s): 9a9aac4

add semantic scholar

Files changed (2)
  1. app.py +7 -4
  2. google_search.py +74 -11
app.py CHANGED

@@ -22,6 +22,7 @@ from google.cloud import storage
 
 if gr.NO_RELOAD:
     from humanize import humanize_text, device
+
     # humanize_text = None
     # device = None
     from utils import remove_special_characters, split_text_allow_complete_sentences_nltk

@@ -805,7 +806,7 @@ scholar_urls = [
     "springer.com",
     # "sciencedirect.com", # 400
     # "onlinelibrary.wiley.com", # 400
-    "jstor.org", # 400
+    "jstor.org", # 400
     "semanticscholar.org",
     "biorxiv.org",
     "medrxiv.org",

@@ -814,6 +815,7 @@ scholar_urls = [
     "cochranelibrary.com",
 ]
 
+
 def generate_and_format(
     input_role,
     topic,

@@ -858,9 +860,10 @@ def generate_and_format(
         input_role, topic, context, model="OpenAI GPT 4o", task_type="internet", temperature=0.7
     )
     if scholar_mode_check:
-        scholar_site_queries = [f"site:{site.strip()}" for site in scholar_urls]
-        final_query += " " + " OR ".join(scholar_site_queries)
-    else:
+        # scholar_site_queries = [f"site:{site.strip()}" for site in scholar_urls]
+        # final_query += " " + " OR ".join(scholar_site_queries)
+        pass
+    else:
         if include_sites:
             site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
             final_query += " " + " OR ".join(site_queries)
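For context, the non-scholar branch above still narrows the Google query by appending a "site:" clause for each entry in include_sites; a minimal sketch of what final_query ends up looking like (the topic and site values here are made up for illustration):

    # Illustrative only: the OR-joined "site:" clause appended to final_query.
    final_query = "low resource machine translation"   # hypothetical query text
    include_sites = "arxiv.org, aclanthology.org"      # hypothetical user input
    site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
    final_query += " " + " OR ".join(site_queries)
    print(final_query)
    # low resource machine translation site:arxiv.org OR site:aclanthology.org

The equivalent clause built from scholar_urls is now commented out in scholar mode, since scholarly filtering moves to the Semantic Scholar API in google_search.py.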
google_search.py CHANGED

@@ -7,6 +7,7 @@ from dotenv import load_dotenv
 import requests
 import fitz
 from trafilatura import extract
+from bs4 import BeautifulSoup
 
 load_dotenv()
 

@@ -17,7 +18,7 @@ CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 NUM_PAGES = 20
 
 
-def build_results_beautifulsoup(url_list):
+def build_results_beautifulsoup(url_list, scholar_abstracts: list[str] = None):
     print("Starting to scrape URLs...")
     start_time = time.perf_counter()
 

@@ -54,6 +55,9 @@ def build_results_beautifulsoup(url_list):
                 count += 1
             else:
                 print(f"Skipped URL: {url}, content too short (length: {len(text)})")
+        elif scholar_abstracts and scholar_abstracts.get(url):
+            print(f"Skipped URL: {url}, no soup content available. Returning scholar abstract instead.")
+            result_content[url] = scholar_abstracts.get(url)
         else:
             print(f"Skipped URL: {url}, no soup content available.")
 

@@ -222,29 +226,88 @@ def google_search_urls(
     return url_list[:total_results]
 
 
+def scrape_abstract(url, title):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, "html.parser")
+    abstract_section = soup.find("div", class_="tldr-abstract-replacement paper-detail-page__tldr-abstract")
+    abstract = abstract_section.get_text().strip() if abstract_section else ""
+    return title + "\n" + abstract if abstract != "" else None
+
+
+def semantic_scholar_urls(
+    text,
+    sorted_date,
+    total_results=30,  # Total number of results to fetch
+    skip_urls=None,  # List of URLs to skip
+    **kwargs,
+):
+    ss_api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
+    semantic_scholar_endpoint = "http://api.semanticscholar.org/graph/v1/paper/search/"
+
+    date_from, date_to = sorted_date.split(":r:")[1].split(":")
+    year_from = date_from[:4]
+    year_to = date_to[:4]
+    success_count = 0
+
+    print(f"Dates: {year_from}-{year_to}")
+    query_params = {
+        "query": text,
+        "fields": "title,abstract,url,publicationTypes,publicationDate,openAccessPdf,fieldsOfStudy",
+        "year": f"{year_from}-{year_to}",
+        "limit": 3 * total_results,
+    }
+    headers = {"x-api-key": ss_api_key}
+    response = requests.get(semantic_scholar_endpoint, params=query_params, headers=headers).json()
+    url_list = []
+    scholar_abstracts = {}
+    for row in response.get("data", []):
+        if success_count >= total_results:
+            break
+        url = row.get("url")
+        if isinstance(url, dict) and url.get("url"):
+            url = url.get("url")
+        url_list.append(url)
+        abstract = row.get("abstract")
+        if abstract:
+            scholar_abstracts[url] = abstract
+            success_count += 1
+        if row.get("openAccessPdf") and row.get("url"):
+            url_list.append(row.get("openAccessPdf").get("url"))
+            success_count += 1
+    return url_list, scholar_abstracts
+
+
 def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
     cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
     start_time = time.perf_counter()
     # if scholar_mode_check:
     #     topic += " -filetype:pdf"
-    url_list = google_search_urls(
-        topic,
-        sorted_date,
-        domains_to_include,
-        api_key,
-        cse_id,
-    )
+    scholar_abstracts = None
+    if not scholar_mode_check:
+        url_list = google_search_urls(
+            topic,
+            sorted_date,
+            domains_to_include,
+            api_key,
+            cse_id,
+        )
+    else:
+        url_list, scholar_abstracts = semantic_scholar_urls(topic, sorted_date)
     print("---")
     print(len(url_list))
     print(url_list)
     print("---")
-    print("Google Search processing time: ", time.perf_counter() - start_time)
-    result_content = build_results_beautifulsoup(url_list)
+    if scholar_mode_check:
+        print("Semantic Scholar processing time: ", time.perf_counter() - start_time)
+    else:
+        print("Google Search processing time: ", time.perf_counter() - start_time)
+    result_content = build_results_beautifulsoup(url_list, scholar_abstracts)
     return result_content
 
 
 if __name__ == "__main__":
-    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
+    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, True)
     print(res.keys())
     print(len(res))
+    print(res)
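As a rough illustration of the new Semantic Scholar path, the year filter is derived from the same sorted_date string the Google path already uses; a small worked example with the value from the __main__ block:

    # Sketch: how sorted_date maps onto the Semantic Scholar "year" query parameter.
    sorted_date = "date:r:20240101:20241231"
    date_from, date_to = sorted_date.split(":r:")[1].split(":")   # "20240101", "20241231"
    year_from, year_to = date_from[:4], date_to[:4]
    print(f"{year_from}-{year_to}")   # 2024-2024, sent as query_params["year"]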
 
 
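build_results_beautifulsoup consumes scholar_abstracts with .get(url), and semantic_scholar_urls fills it as a dict keyed by result URL, so a mapping of URL to abstract is what actually flows through. A minimal sketch of the fallback when a page cannot be scraped (the URL and abstract text are invented):

    # Sketch of the fallback path: if no soup content is available for a URL but a
    # Semantic Scholar abstract was collected, the abstract becomes the result content.
    url = "https://www.semanticscholar.org/paper/example"   # hypothetical URL
    scholar_abstracts = {url: "Hypothetical abstract text."}
    result_content = {}
    soup = None   # pretend scraping returned nothing for this URL
    if soup:
        pass   # normal trafilatura extraction path
    elif scholar_abstracts and scholar_abstracts.get(url):
        result_content[url] = scholar_abstracts.get(url)
    print(result_content)
    # {'https://www.semanticscholar.org/paper/example': 'Hypothetical abstract text.'}

With scholar_mode_check=True, the updated __main__ call exercises this path end to end: semantic_scholar_urls supplies both the URL list and the abstract map, and build_results_beautifulsoup falls back to the abstracts for pages it cannot fetch.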