add semantic scholar
Changed files:
- app.py +7 -4
- google_search.py +74 -11
app.py
CHANGED
@@ -22,6 +22,7 @@ from google.cloud import storage
 
 if gr.NO_RELOAD:
     from humanize import humanize_text, device
+
     # humanize_text = None
     # device = None
 from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
@@ -805,7 +806,7 @@ scholar_urls = [
     "springer.com",
     # "sciencedirect.com", # 400
     # "onlinelibrary.wiley.com", # 400
-    "jstor.org",
+    "jstor.org", # 400
     "semanticscholar.org",
     "biorxiv.org",
     "medrxiv.org",
@@ -814,6 +815,7 @@ scholar_urls = [
     "cochranelibrary.com",
 ]
 
+
 def generate_and_format(
     input_role,
     topic,
@@ -858,9 +860,10 @@ def generate_and_format(
         input_role, topic, context, model="OpenAI GPT 4o", task_type="internet", temperature=0.7
     )
     if scholar_mode_check:
-        scholar_site_queries = [f"site:{site.strip()}" for site in scholar_urls]
-        final_query += " " + " OR ".join(scholar_site_queries)
-
+        # scholar_site_queries = [f"site:{site.strip()}" for site in scholar_urls]
+        # final_query += " " + " OR ".join(scholar_site_queries)
+        pass
+    else:
         if include_sites:
             site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
             final_query += " " + " OR ".join(site_queries)
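Note on this last hunk: before this commit, scholar mode appended a "site:" filter for every entry in scholar_urls to the Google query; after it, the query is left untouched and the scholar path is handled inside google_search.py (see the next diff). A minimal sketch of the query construction, using illustrative values for scholar_urls and include_sites rather than the full lists from app.py:

# Sketch only; the values below are examples, not the real configuration.
scholar_urls = ["semanticscholar.org", "biorxiv.org", "medrxiv.org"]
include_sites = "arxiv.org, nature.com"
final_query = "low resource machine translation"
scholar_mode_check = False

if scholar_mode_check:
    # Old behaviour, now commented out in the hunk above:
    # final_query += " " + " OR ".join(f"site:{site.strip()}" for site in scholar_urls)
    pass
else:
    if include_sites:
        site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
        final_query += " " + " OR ".join(site_queries)

print(final_query)
# low resource machine translation site:arxiv.org OR site:nature.com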
google_search.py
CHANGED
@@ -7,6 +7,7 @@ from dotenv import load_dotenv
 import requests
 import fitz
 from trafilatura import extract
+from bs4 import BeautifulSoup
 
 load_dotenv()
 
@@ -17,7 +18,7 @@ CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 NUM_PAGES = 20
 
 
-def build_results_beautifulsoup(url_list):
+def build_results_beautifulsoup(url_list, scholar_abstracts: list[str] = None):
     print("Starting to scrape URLs...")
     start_time = time.perf_counter()
 
@@ -54,6 +55,9 @@ def build_results_beautifulsoup(url_list):
                 count += 1
             else:
                 print(f"Skipped URL: {url}, content too short (length: {len(text)})")
+        elif scholar_abstracts and scholar_abstracts.get(url):
+            print(f"Skipped URL: {url}, no soup content available. Returning scholar abstract instead.")
+            result_content[url] = scholar_abstracts.get(url)
         else:
             print(f"Skipped URL: {url}, no soup content available.")
 
@@ -222,29 +226,88 @@ def google_search_urls(
     return url_list[:total_results]
 
 
+def scrape_abstract(url, title):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, "html.parser")
+    abstract_section = soup.find("div", class_="tldr-abstract-replacement paper-detail-page__tldr-abstract")
+    abstract = abstract_section.get_text().strip() if abstract_section else ""
+    return title + "\n" + abstract if abstract != "" else None
+
+
+def semantic_scholar_urls(
+    text,
+    sorted_date,
+    total_results=30,  # Total number of results to fetch
+    skip_urls=None,  # List of URLs to skip
+    **kwargs,
+):
+    ss_api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
+    semantic_scholar_endpoint = "http://api.semanticscholar.org/graph/v1/paper/search/"
+
+    date_from, date_to = sorted_date.split(":r:")[1].split(":")
+    year_from = date_from[:4]
+    year_to = date_to[:4]
+    success_count = 0
+
+    print(f"Dates: {year_from}-{year_to}")
+    query_params = {
+        "query": text,
+        "fields": "title,abstract,url,publicationTypes,publicationDate,openAccessPdf,fieldsOfStudy",
+        "year": f"{year_from}-{year_to}",
+        "limit": 3 * total_results,
+    }
+    headers = {"x-api-key": ss_api_key}
+    response = requests.get(semantic_scholar_endpoint, params=query_params, headers=headers).json()
+    url_list = []
+    scholar_abstracts = {}
+    for row in response.get("data", []):
+        if success_count >= total_results:
+            break
+        url = row.get("url")
+        if isinstance(url, dict) and url.get("url"):
+            url = url.get("url")
+        url_list.append(url)
+        abstract = row.get("abstract")
+        if abstract:
+            scholar_abstracts[url] = abstract
+            success_count += 1
+        if row.get("openAccessPdf") and row.get("url"):
+            url_list.append(row.get("openAccessPdf").get("url"))
+            success_count += 1
+    return url_list, scholar_abstracts
+
+
 def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     api_key = os.environ.get("GOOGLE_SEARCH_API_KEY")
     cse_id = os.environ.get("GOOGLE_SEARCH_CSE_ID")
     start_time = time.perf_counter()
     # if scholar_mode_check:
     #     topic += " -filetype:pdf"
-    url_list = google_search_urls(
-        topic,
-        sorted_date,
-        domains_to_include,
-        api_key,
-        cse_id,
-    )
+    scholar_abstracts = None
+    if not scholar_mode_check:
+        url_list = google_search_urls(
+            topic,
+            sorted_date,
+            domains_to_include,
+            api_key,
+            cse_id,
+        )
+    else:
+        url_list, scholar_abstracts = semantic_scholar_urls(topic, sorted_date)
     print("---")
     print(len(url_list))
     print(url_list)
     print("---")
-    print("Google Search processing time: ", time.perf_counter() - start_time)
-    result_content = build_results_beautifulsoup(url_list)
+    if scholar_mode_check:
+        print("Semantic Scholar processing time: ", time.perf_counter() - start_time)
+    else:
+        print("Google Search processing time: ", time.perf_counter() - start_time)
+    result_content = build_results_beautifulsoup(url_list, scholar_abstracts)
     return result_content
 
 
 if __name__ == "__main__":
-    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list,
+    res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, True)
     print(res.keys())
     print(len(res))
+    print(res)
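For reference, a minimal standalone sketch of the Semantic Scholar Graph API request that semantic_scholar_urls() builds, including the sorted_date parsing used above; the endpoint and field list come from the diff, while the query text and date range reuse the illustrative values from the __main__ block, and SEMANTIC_SCHOLAR_API_KEY is read from the environment:

# Standalone sketch mirroring semantic_scholar_urls(); endpoint, fields and
# date handling come from the diff above, the query values are examples.
import os
import requests

sorted_date = "date:r:20240101:20241231"
date_from, date_to = sorted_date.split(":r:")[1].split(":")
year_from, year_to = date_from[:4], date_to[:4]  # "2024", "2024"

params = {
    "query": "Low Resource",
    "fields": "title,abstract,url,publicationTypes,publicationDate,openAccessPdf,fieldsOfStudy",
    "year": f"{year_from}-{year_to}",
    "limit": 30,
}
headers = {"x-api-key": os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "")}
response = requests.get(
    "http://api.semanticscholar.org/graph/v1/paper/search/",
    params=params,
    headers=headers,
    timeout=30,
).json()

for row in response.get("data", []):
    print(row.get("title"), "->", row.get("url"))

URLs collected this way are then scraped by build_results_beautifulsoup(); when a page yields no soup content, the abstract returned by the API is used as a fallback, as added in the earlier hunk.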