raannakasturi committed
Commit a4b6d0b · 1 Parent(s): 8a1664b

Refactor data extraction functions: rename extract_pmc_data to extract_sd_data, add extract_phys_data, and update file handling for new data sources

Files changed (4)
  1. arvix.py +4 -4
  2. fetch_data.py +16 -4
  3. phys.py +98 -0
  4. pmc.py +7 -7
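
For orientation, a minimal sketch (not part of the commit) of how the renamed and new entry points are consumed after this change. Each extractor returns a JSON string mapping a topic to {"ids": [...], "count": N}, mirroring the fetch_data.py wiring below:

# Usage sketch; assumes the repo modules are importable as in fetch_data.py.
import json
from pmc import extract_sd_data
from phys import extract_phys_data

sd_topics = json.loads(extract_sd_data())      # IDs tracked in sd.txt
phys_topics = json.loads(extract_phys_data())  # IDs tracked in phys.txt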
arvix.py CHANGED
@@ -55,8 +55,8 @@ def extract_data(category):
     return list(all_ids)
 
 def extract_arxiv_data():
-    if not utils.download_datafile('arxiv.txt'):
-        raise Exception("Failed to download datafile")
+    # if not utils.download_datafile('arxiv.txt'):
+    #     raise Exception("Failed to download datafile")
     categories = {
         "Astrophysics": ["astro-ph"],
         "Condensed Matter": ["cond-mat"],
@@ -93,8 +93,8 @@ def extract_arxiv_data():
         while len(category_ids) < 2:
             category_ids.add(random.choice(list(used_ids)))
         data[category] = {"ids": list(category_ids), "count": len(category_ids)}
-    if not utils.upload_datafile('arxiv.txt'):
-        raise Exception("Failed to upload datafile")
+    # if not utils.upload_datafile('arxiv.txt'):
+    #     raise Exception("Failed to upload datafile")
     return json.dumps(data, indent=4, ensure_ascii=False)
 
 if __name__ == '__main__':
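
Note that this hunk disables the arxiv.txt download/upload guards by commenting them out, so previously served arXiv IDs are no longer synced between runs. If the goal is to toggle the sync rather than drop it, a hedged alternative (SKIP_DATAFILE_SYNC is a hypothetical variable, not in the repo) would keep the error handling:

# Sketch only; SKIP_DATAFILE_SYNC is a hypothetical toggle, the commit
# itself simply comments the guards out.
import os

if os.getenv("SKIP_DATAFILE_SYNC") != "1":
    if not utils.download_datafile('arxiv.txt'):
        raise Exception("Failed to download datafile")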
fetch_data.py CHANGED
@@ -1,5 +1,6 @@
 from arvix import extract_arxiv_data
-from pmc import extract_pmc_data
+from pmc import extract_sd_data
+from phys import extract_phys_data
 import json
 import dotenv
 import os
@@ -11,8 +12,11 @@ ACCESS_KEY = os.getenv("ACCESS_KEY")
 def fetch_arxiv_data():
     return json.loads(extract_arxiv_data())
 
-def fetch_pmc_data():
-    return json.loads(extract_pmc_data())
+def fetch_sd_data():
+    return json.loads(extract_sd_data())
+
+def fetch_phys_data():
+    return json.loads(extract_phys_data())
 
 def fetch_data(user_access_key):
     if user_access_key != ACCESS_KEY:
@@ -23,10 +27,12 @@ def fetch_data(user_access_key):
         papers_data['status'] = 'success'
         papers_data['data'] = {}
         with ThreadPoolExecutor() as executor:
-            pmc_future = executor.submit(fetch_pmc_data)
+            pmc_future = executor.submit(fetch_sd_data)
             arxiv_future = executor.submit(fetch_arxiv_data)
+            phys_future = executor.submit(fetch_phys_data)
             pmc_data = pmc_future.result()
             arxiv_data = arxiv_future.result()
+            phys_data = phys_future.result()
         for topic, topic_data in pmc_data.items():
             if topic_data['count'] == 0:
                 continue
@@ -39,6 +45,12 @@ def fetch_data(user_access_key):
             else:
                 papers_data['data'][topic] = {}
                 papers_data['data'][topic]['ids'] = topic_data['ids']
+        for topic, topic_data in phys_data.items():
+            if topic_data['count'] == 0:
+                continue
+            else:
+                papers_data['data'][topic] = {}
+                papers_data['data'][topic]['ids'] = topic_data['ids']
     except Exception as e:
         print(str(e))
         papers_data['status'] = 'error'
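
One behavior worth flagging in the new merge code: if a topic name appears in both pmc_data and phys_data, the second loop reassigns papers_data['data'][topic] = {} and discards the IDs collected from the first source. Assuming both sources should contribute, a non-destructive merge could look like this sketch (same data shapes as in the diff):

# Sketch of a non-destructive merge; the committed loops overwrite instead.
for source in (pmc_data, phys_data):
    for topic, topic_data in source.items():
        if topic_data['count'] == 0:
            continue
        bucket = papers_data['data'].setdefault(topic, {'ids': []})
        for paper_id in topic_data['ids']:
            if paper_id not in bucket['ids']:
                bucket['ids'].append(paper_id)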
phys.py ADDED
@@ -0,0 +1,98 @@
+import json
+import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup
+import requests
+import threading
+import utils
+
+def fetch_links(link):
+    links = []
+    xml_data = utils.fetch_page(link)
+    items = ET.fromstring(xml_data).findall('channel/item')
+    for item in items:
+        link = item.find('link').text
+        links.append(link)
+    return links
+
+def fetch_all_links():
+    category_link_data = {
+        "Earth": "https://phys.org/rss-feed/breaking/earth-news/",
+        "Science": "https://phys.org/rss-feed/breaking/science-news/",
+        "Nano-technology": "https://phys.org/rss-feed/breaking/nanotech-news/",
+        "Physics": "https://phys.org/rss-feed/breaking/physics-news/",
+        "Astronomy & Space": "https://phys.org/rss-feed/breaking/space-news/",
+        "Biology": "https://phys.org/rss-feed/breaking/biology-news/",
+        "Chemistry": "https://phys.org/rss-feed/breaking/chemistry-news/",
+    }
+    sd_links_data = {}
+    for category, link in category_link_data.items():
+        links = fetch_links(link)
+        sd_links_data[category] = links
+    return json.dumps(sd_links_data, indent=4, ensure_ascii=False)
+
+def fetch_dois():
+    doi_data = {}
+    data = json.loads(fetch_all_links())
+    for topic, links in data.items():
+        doi_list = []
+        for link in links:
+            page_content = utils.fetch_page(link)
+            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", class_="article-main__more")
+            for page_data in page_datas:
+                doi_link = page_data.find("a", attrs={"data-doi": "1"})
+                if doi_link:
+                    doi = doi_link.text.split('DOI: ')[-1]
+                    if doi.startswith('10.'):
+                        doi_list.append(doi)
+        doi_data[topic] = doi_list
+    return json.dumps(doi_data, indent=4, ensure_ascii=False)
+
+def fetch_doi_data():
+    result = []
+    def fetch_and_store():
+        result.append(fetch_dois())
+    thread = threading.Thread(target=fetch_and_store)
+    thread.start()
+    thread.join()
+    return result[0] if result else {}
+
+def doi_to_pmc():
+    data = json.loads(fetch_doi_data())
+    pmc_data = {}
+    for topic, dois in data.items():
+        if not dois:
+            continue
+        doi_list = ",".join(dois)
+        try:
+            url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
+            doi_pmc_data = requests.get(url).json()
+
+            if doi_pmc_data['status'] == 'ok':
+                pmc_list = [record['pmcid'] for record in doi_pmc_data['records'] if 'pmcid' in record and record.get('live', True)]
+                pmc_data[topic] = pmc_list[:2]
+        except Exception as e:
+            print(f"Error: {str(e)}")
+    return json.dumps(pmc_data, indent=4, ensure_ascii=False)
+
+def extract_phys_data():
+    if not utils.download_datafile('phys.txt'):
+        raise Exception("Failed to download datafile")
+    pmc_data = {}
+    pmcid_data = json.loads(doi_to_pmc())
+    for topic, pmcids in pmcid_data.items():
+        pmc_ids = []
+        for pmcid in pmcids:
+            if len(pmc_ids) >= 2:
+                break
+            if not utils.check_data_in_file(pmcid, 'phys.txt'):
+                utils.write_data_to_file(pmcid, 'phys.txt')
+                pmc_ids.append(pmcid)
+        pmc_data[topic] = {"ids": pmc_ids, "count": len(pmc_ids)}
+    if not utils.upload_datafile('phys.txt'):
+        raise Exception("Failed to upload datafile")
+    return json.dumps(pmc_data, indent=4, ensure_ascii=False)
+
+if __name__ == "__main__":
+    data = extract_phys_data()
+    with open('phys_data.json', 'w') as f:
+        f.write(data)
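
Two small observations on the new module. First, fetch_doi_data starts a worker thread and joins it immediately, so it is functionally equivalent to calling fetch_dois() directly:

# Equivalent direct call; the thread adds no concurrency because it is
# joined right after starting.
def fetch_doi_data():
    return fetch_dois()

Second, doi_to_pmc reads only the status, records, pmcid, and live fields of the NCBI ID Converter response; abridged, the JSON it expects looks roughly like this (shape inferred from the fields the code reads):

# {
#   "status": "ok",
#   "records": [
#     {"doi": "10.xxxx/...", "pmcid": "PMC1234567", "live": true},
#     {"doi": "10.yyyy/...", "status": "error"}  # no pmcid, filtered out
#   ]
# }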
pmc.py CHANGED
@@ -66,8 +66,8 @@ def doi_to_pmc():
         print(f"Error: {str(e)}")
     return json.dumps(pmc_data, indent=4, ensure_ascii=False)
 
-def extract_pmc_data():
-    if not utils.download_datafile('pmc.txt'):
+def extract_sd_data():
+    if not utils.download_datafile('sd.txt'):
         raise Exception("Failed to download datafile")
     pmc_data = {}
     pmcid_data = json.loads(doi_to_pmc())
@@ -76,15 +76,15 @@ def extract_pmc_data():
         for pmcid in pmcids:
             if len(pmc_ids) >= 2:
                 break
-            if not utils.check_data_in_file(pmcid, 'pmc.txt'):
-                utils.write_data_to_file(pmcid, 'pmc.txt')
+            if not utils.check_data_in_file(pmcid, 'sd.txt'):
+                utils.write_data_to_file(pmcid, 'sd.txt')
                 pmc_ids.append(pmcid)
         pmc_data[topic] = {"ids": pmc_ids, "count": len(pmc_ids)}
-    if not utils.upload_datafile('pmc.txt'):
+    if not utils.upload_datafile('sd.txt'):
         raise Exception("Failed to upload datafile")
     return json.dumps(pmc_data, indent=4, ensure_ascii=False)
 
 if __name__ == "__main__":
-    data = extract_pmc_data()
-    with open('pmc_data.json', 'w') as f:
+    data = extract_sd_data()
+    with open('sd_data.json', 'w') as f:
         f.write(data)
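
Since only the function name and the tracking file change here (pmc.txt to sd.txt), callers migrate one-for-one. A minimal smoke test for the rename might look like this hypothetical snippet (assumes network access and configured utils datafile helpers):

# Hypothetical smoke test; not part of the commit.
import json
from pmc import extract_sd_data

data = json.loads(extract_sd_data())
for topic, info in data.items():
    assert info["count"] == len(info["ids"])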