add semantic scholar
Changed files:
- app.py +7 -4
- google_search.py +74 -11
app.py
CHANGED
@@ -22,6 +22,7 @@ from google.cloud import storage
 
 if gr.NO_RELOAD:
     from humanize import humanize_text, device
+
     # humanize_text = None
     # device = None
 from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
@@ -805,7 +806,7 @@ scholar_urls = [
     "springer.com",
     # "sciencedirect.com", # 400
     # "onlinelibrary.wiley.com", # 400
-    "jstor.org",
+    "jstor.org", # 400
     "semanticscholar.org",
     "biorxiv.org",
     "medrxiv.org",
@@ -814,6 +815,7 @@ scholar_urls = [
     "cochranelibrary.com",
 ]
 
+
 def generate_and_format(
     input_role,
     topic,
@@ -858,9 +860,10 @@ def generate_and_format(
         input_role, topic, context, model="OpenAI GPT 4o", task_type="internet", temperature=0.7
     )
     if scholar_mode_check:
-        scholar_site_queries = [f"site:{site.strip()}" for site in scholar_urls]
-        final_query += " " + " OR ".join(scholar_site_queries)
-
+        # scholar_site_queries = [f"site:{site.strip()}" for site in scholar_urls]
+        # final_query += " " + " OR ".join(scholar_site_queries)
+        pass
+    else:
         if include_sites:
             site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
             final_query += " " + " OR ".join(site_queries)
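Note on this last hunk: before this commit, scholar mode appended a "site:" filter for every entry in scholar_urls to the Google query; after it, the query is left untouched and the scholar path is handled inside google_search.py (see the next diff). A minimal sketch of the query construction, using illustrative values for scholar_urls and include_sites rather than the full lists from app.py:

# Sketch only; the values below are examples, not the real configuration.
scholar_urls = ["semanticscholar.org", "biorxiv.org", "medrxiv.org"]
include_sites = "arxiv.org, nature.com"
final_query = "low resource machine translation"
scholar_mode_check = False

if scholar_mode_check:
    # Old behaviour, now commented out in the hunk above:
    # final_query += " " + " OR ".join(f"site:{site.strip()}" for site in scholar_urls)
    pass
else:
    if include_sites:
        site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
        final_query += " " + " OR ".join(site_queries)

print(final_query)
# low resource machine translation site:arxiv.org OR site:nature.com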
google_search.py
CHANGED
@@ -7,6 +7,7 @@ from dotenv import load_dotenv
 import requests
 import fitz
 from trafilatura import extract
+from bs4 import BeautifulSoup
 
 load_dotenv()
 
@@ -17,7 +18,7 @@ CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 NUM_PAGES = 20
 
 
-def build_results_beautifulsoup(url_list):
+def build_results_beautifulsoup(url_list, scholar_abstracts: list[str] = None):
     print("Starting to scrape URLs...")
     start_time = time.perf_counter()
 
@@ -54,6 +55,9 @@ def build_results_beautifulsoup(url_list):
                 count += 1
             else:
                 print(f"Skipped URL: {url}, content too short (length: {len(text)})")
+        elif scholar_abstracts and scholar_abstracts.get(url):
+            print(f"Skipped URL: {url}, no soup content available. Returning scholar abstract instead.")
+            result_content[url] = scholar_abstracts.get(url)
         else:
             print(f"Skipped URL: {url}, no soup content available.")
 
@@ -222,29 +226,88 @@ def google_search_urls(
     return url_list[:total_results]
 
 
+def scrape_abstract(url, title):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, "html.parser")
+    abstract_section = soup.find("div", class_="tldr-abstract-replacement paper-detail-page__tldr-abstract")
+    abstract = abstract_section.get_text().strip() if abstract_section else ""
+    return title + "\n" + abstract if abstract != "" else None
+
+
+def semantic_scholar_urls(
+    text,
+    sorted_date,
+    total_results=30,  # Total number of results to fetch
+    skip_urls=None,  # List of URLs to skip
+    **kwargs,
+):
+    ss_api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
+    semantic_scholar_endpoint = "http://api.semanticscholar.org/graph/v1/paper/search/"
+
+    date_from, date_to = sorted_date.split(":r:")[1].split(":")
+    year_from = date_from[:4]
+    year_to = date_to[:4]
+    success_count = 0
+
+    print(f"Dates: {year_from}-{year_to}")
+    query_params = {
+        "query": text,
+        "fields": "title,abstract,url,publicationTypes,publicationDate,openAccessPdf,fieldsOfStudy",
+        "year": f"{year_from}-{year_to}",
+        "limit": 3 * total_results,
+    }
+    headers = {"x-api-key": ss_api_key}
+    response = requests.get(semantic_scholar_endpoint, params=query_params, headers=headers).json()
+    url_list = []
+    scholar_abstracts = {}
+    for row in response.get("data", []):
+        if success_count >= total_results:
+            break
+        url = row.get("url")
+        if isinstance(url, dict) and url.get("url"):
+            url = url.get("url")
+        url_list.append(url)
+        abstract = row.get("abstract")
+        if abstract:
+            scholar_abstracts[url] = abstract
+            success_count += 1
+        if row.get("openAccessPdf") and row.get("url"):
+            url_list.append(row.get("openAccessPdf").get("url"))
+            success_count += 1
+    return url_list, scholar_abstracts
+
+
 def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
     cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
     start_time = time.perf_counter()
     # if scholar_mode_check:
     #     topic += " -filetype:pdf"
-    url_list = google_search_urls(
-        topic,
-        sorted_date,
-        domains_to_include,
-        api_key,
-        cse_id,
-    )
+    scholar_abstracts = None
+    if not scholar_mode_check:
+        url_list = google_search_urls(
+            topic,
+            sorted_date,
+            domains_to_include,
+            api_key,
+            cse_id,
+        )
+    else:
+        url_list, scholar_abstracts = semantic_scholar_urls(topic, sorted_date)
     print("---")
     print(len(url_list))
     print(url_list)
     print("---")
-    print("Google Search processing time: ", time.perf_counter() - start_time)
-    result_content = build_results_beautifulsoup(url_list)
+    if scholar_mode_check:
+        print("Semantic Scholar processing time: ", time.perf_counter() - start_time)
+    else:
+        print("Google Search processing time: ", time.perf_counter() - start_time)
+    result_content = build_results_beautifulsoup(url_list, scholar_abstracts)
     return result_content
 
 
 if __name__ == "__main__":
-    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list,
+    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, True)
     print(res.keys())
     print(len(res))
+    print(res)
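For reference, a minimal standalone sketch of the Semantic Scholar Graph API request that semantic_scholar_urls() builds, including the sorted_date parsing used above; the endpoint and field list come from the diff, while the query text and date range reuse the illustrative values from the __main__ block, and SEMANTIC_SCHOLAR_API_KEY is read from the environment:

# Standalone sketch mirroring semantic_scholar_urls(); endpoint, fields and
# date handling come from the diff above, the query values are examples.
import os
import requests

sorted_date = "date:r:20240101:20241231"
date_from, date_to = sorted_date.split(":r:")[1].split(":")
year_from, year_to = date_from[:4], date_to[:4]  # "2024", "2024"

params = {
    "query": "Low Resource",
    "fields": "title,abstract,url,publicationTypes,publicationDate,openAccessPdf,fieldsOfStudy",
    "year": f"{year_from}-{year_to}",
    "limit": 30,
}
headers = {"x-api-key": os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "")}
response = requests.get(
    "http://api.semanticscholar.org/graph/v1/paper/search/",
    params=params,
    headers=headers,
    timeout=30,
).json()

for row in response.get("data", []):
    print(row.get("title"), "->", row.get("url"))

URLs collected this way are then scraped by build_results_beautifulsoup(); when a page yields no soup content, the abstract returned by the API is used as a fallback, as added in the earlier hunk.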