raannakasturi committed
Commit c680313
1 Parent(s): c440916

Add initial project structure with core functionality and dependencies

Files changed (7)
  1. .gitignore +8 -0
  2. app.py +28 -0
  3. arvix.py +113 -0
  4. fetch_data.py +45 -0
  5. pmc.py +108 -0
  6. requirements.txt +5 -0
  7. tools.py +74 -0
.gitignore ADDED
@@ -0,0 +1,8 @@
+ /.cache
+ /__pycache__
+ .env
+ .env.local
+ .env.development.local
+ .env.test.local
+ .env.production.local
+ .DS_Store
app.py ADDED
@@ -0,0 +1,28 @@
+ import gradio as gr
+ from fetch_data import fetch_data
+ from tools import reset_datafiles
+
+ theme = gr.themes.Soft(
+     primary_hue="purple",
+     secondary_hue="cyan",
+     neutral_hue="slate",
+     font=[
+         gr.themes.GoogleFont('Syne'),
+         gr.themes.GoogleFont('Poppins'),
+         gr.themes.GoogleFont('Poppins'),
+         gr.themes.GoogleFont('Poppins')
+     ],
+ )
+
+ with gr.Blocks(theme=theme, title="fetch Research Paper IDS") as app:
+     with gr.Row():
+         with gr.Column():
+             user_access_key = gr.Textbox(label="Access Key", placeholder="Enter your access key", type="password")
+             with gr.Row():
+                 fetch_data_btn = gr.Button(value="Fetch Data")
+                 reset_files = gr.Button(value="Reset Files")
+     raw_data = gr.Textbox(lines=15, label="Raw IDs Data", interactive=False, placeholder="IDs starting with PMC are PMC IDs and rest all are Arxiv IDs", show_copy_button=True)
+     fetch_data_btn.click(fn=fetch_data, inputs=[user_access_key], outputs=[raw_data], api_name="fetch_paper_ids")
+     reset_files.click(fn=reset_datafiles, inputs=[user_access_key], outputs=[raw_data], show_api=False)
+
+ app.queue(default_concurrency_limit=25).launch(max_threads=5000)
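
The click handler above exposes fetch_data as an API endpoint named fetch_paper_ids. A minimal client-side sketch of calling it with gradio_client, assuming the app is deployed as a Hugging Face Space (the Space id and key below are placeholders, not part of this commit):

# Hypothetical client; replace the Space id and access key with real values.
from gradio_client import Client

client = Client("raannakasturi/your-space-name")  # placeholder Space id
raw_ids = client.predict("your-access-key", api_name="/fetch_paper_ids")
print(raw_ids)  # JSON string mapping each topic to its paper IDs
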
arvix.py ADDED
@@ -0,0 +1,113 @@
+ import json
+ import random
+ import tools
+ from bs4 import BeautifulSoup
+
+ def fetch_new_page(category):
+     url = f'https://arxiv.org/list/{category}/new'
+     return tools.fetch_page(url)
+
+ def fetch_recent_page(category):
+     url = f'https://arxiv.org/list/{category}/recent'
+     return tools.fetch_page(url)
+
+ def extract_new_data(category):
+     # Scrape paper IDs from the "new submissions" listing for a category.
+     paper_ids = []
+     page_content = fetch_new_page(category)
+     lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
+     for list in lists:
+         papers = list.find_all('dt')
+         paper_contents = list.find_all('dd')
+         titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip() for paper_content in paper_contents]
+         for paper, title in zip(papers, titles):
+             if not tools.verify_simple_title(title):
+                 continue
+             else:
+                 paper_link = paper.find('a', href=True)
+                 if paper_link:
+                     paper_id = paper_link.text.strip().split(':')[1]
+                     paper_ids.append(paper_id)
+                 else:
+                     continue
+     return paper_ids
+
+ def extract_recent_data(category):
+     # Scrape paper IDs from the "recent" listing for a category.
+     paper_ids = []
+     page_content = fetch_recent_page(category)
+     lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
+     for list in lists:
+         papers = list.find_all('dt')
+         for paper in papers:
+             paper_link = paper.find('a', href=True)
+             if paper_link:
+                 paper_id = paper_link.text.strip().split(':')[1]
+                 paper_ids.append(paper_id)
+             else:
+                 continue
+     return paper_ids
+
+ def extract_data(category):
+     # Merge new + recent IDs, skip ones already recorded in arxiv.txt, keep at most 12.
+     sanitized_data = []
+     new_data = extract_new_data(category)
+     recent_data = extract_recent_data(category)
+     data = list(set(new_data + recent_data))
+     for id in data:
+         if len(sanitized_data) >= 12:
+             break
+         if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
+             id = id[:3]
+         if tools.check_data_in_file(id, 'arxiv.txt'):
+             continue
+         else:
+             tools.write_data_to_file(id, 'arxiv.txt')
+             sanitized_data.append(id)
+     random.shuffle(sanitized_data)
+     return sanitized_data[:12]
+
+ def extract_arxiv_data():
+     if not tools.download_datafile('arxiv.txt'):
+         raise Exception("Failed to download datafile")
+     categories = {
+         "Astrophysics": ["astro-ph"],
+         "Condensed Matter": ["cond-mat"],
+         "General Relativity and Quantum Cosmology": ["gr-qc"],
+         "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
+         "Mathematical Physics": ["math-ph"],
+         "Nonlinear Sciences": ["nlin"],
+         "Nuclear Experiment": ["nucl-ex"],
+         "Nuclear Theory": ["nucl-th"],
+         "Physics": ["physics"],
+         "Quantum Physics": ["quant-ph"],
+         "Mathematics": ["math"],
+         "Computer Science": ["cs"],
+         "Quantitative Biology": ["q-bio"],
+         "Quantitative Finance": ["q-fin"],
+         "Statistics": ["stat"],
+         "Electrical Engineering and Systems Science": ["eess"],
+         "Economics": ["econ"]
+     }
+     data = {}
+     for category, subcategories in categories.items():
+         category_data = {}
+         all_ids = []
+         for subcategory in subcategories:
+             ids = extract_data(subcategory)
+             for id in ids:
+                 all_ids.append(id)
+         if len(all_ids) > 12:
+             print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
+             random.shuffle(all_ids)
+             all_ids = all_ids[:12]
+         category_data['count'] = len(all_ids)
+         category_data['ids'] = all_ids
+         data[category] = category_data
+     data = json.dumps(data, indent=4, ensure_ascii=False)
+     if not tools.upload_datafile('arxiv.txt'):
+         raise Exception("Failed to upload datafile")
+     return data
+
+
+ if __name__ == '__main__':
+     data = extract_arxiv_data()
+     with open('arxiv_data.json', 'w') as f:
+         f.write(data)
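
extract_arxiv_data() returns a JSON string keyed by category, each entry carrying a count and a list of ids. A small consumer sketch, assuming HF_API_TOKEN is configured so the arxiv.txt datafile can be downloaded:

import json
from arvix import extract_arxiv_data

arxiv_json = json.loads(extract_arxiv_data())
for category, info in arxiv_json.items():
    print(category, info['count'], info['ids'])  # up to 12 IDs per category
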
fetch_data.py ADDED
@@ -0,0 +1,45 @@
+ from arvix import extract_arxiv_data
+ from pmc import extract_pmc_data
+ import json
+ import dotenv
+ import os
+ from concurrent.futures import ThreadPoolExecutor
+
+ dotenv.load_dotenv()
+ ACCESS_KEY = os.getenv("ACCESS_KEY")
+
+ def fetch_arxiv_data():
+     return json.loads(extract_arxiv_data())
+
+ def fetch_pmc_data():
+     return json.loads(extract_pmc_data())
+
+ def fetch_data(user_access_key):
+     if user_access_key != ACCESS_KEY:
+         papers_data = {"error": "Invalid access key"}
+     else:
+         papers_data = {}
+         # Scrape the arXiv and PMC sources concurrently.
+         with ThreadPoolExecutor() as executor:
+             arxiv_future = executor.submit(fetch_arxiv_data)
+             pmc_future = executor.submit(fetch_pmc_data)
+             arxiv_data = arxiv_future.result()
+             pmc_data = pmc_future.result()
+         for topic, topic_data in arxiv_data.items():
+             if topic_data['count'] == 0:
+                 continue
+             else:
+                 papers_data[topic] = {}
+                 papers_data[topic]['ids'] = topic_data['ids']
+         for topic, topic_data in pmc_data.items():
+             if topic_data['count'] == 0:
+                 continue
+             else:
+                 papers_data[topic] = {}
+                 papers_data[topic]['ids'] = topic_data['ids']
+     data = json.dumps(papers_data, indent=4, ensure_ascii=False)
+     return data
+
+ if __name__ == '__main__':
+     data = fetch_data(ACCESS_KEY)  # fetch_data() requires the access key
+     with open('data.json', 'w') as f:
+         f.write(data)
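
A quick local check of the merged output, assuming a valid ACCESS_KEY is present in .env (a sketch, not part of the commit):

import json
import os
import dotenv
from fetch_data import fetch_data

dotenv.load_dotenv()
merged = json.loads(fetch_data(os.getenv("ACCESS_KEY")))
for topic, payload in merged.items():
    print(topic, len(payload['ids']))  # arXiv and PMC topics in one dict
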
pmc.py ADDED
@@ -0,0 +1,108 @@
+ import json
+ import xml.etree.ElementTree as ET
+ from bs4 import BeautifulSoup
+ import requests
+ import tools
+ import threading
+
+ def fetch_links(category):
+     # Pull article links from the ScienceDaily RSS feed for a category.
+     links = []
+     xml_data = tools.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
+     items = ET.fromstring(xml_data).findall('channel/item')
+     for item in items:
+         link = item.find('link').text
+         links.append(link)
+     return links
+
+ def fetch_all_links():
+     categories = ["Science", "Health", "Environment", "Technology", "Society"]
+     sd_links_data = {}
+     for category in categories:
+         links = fetch_links(category)
+         sd_links_data[category] = links
+     data = json.dumps(sd_links_data, indent=4, ensure_ascii=False)
+     return data
+
+ def fetch_dois():
+     # Extract DOIs from the journal-references block of each linked article.
+     doi_data = {}
+     data = json.loads(fetch_all_links())
+     for topic, links in data.items():
+         doi_list = []
+         for link in links:
+             page_content = tools.fetch_page(link)
+             page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
+             for page_data in page_datas:
+                 doi = page_data.find("a", href=True).text
+                 if doi.startswith('10.'):
+                     doi_list.append(doi)
+                 else:
+                     continue
+         doi_data[topic] = doi_list
+     data = json.dumps(doi_data, indent=4, ensure_ascii=False)
+     return data
+
+ def fetch_doi_data():
+     result = []
+     def fetch_and_store():
+         result.append(fetch_dois())
+     thread = threading.Thread(target=fetch_and_store)
+     thread.start()
+     thread.join()
+     return result[0]
+
+ def doi_to_pmc():
+     # Convert DOIs to PMC IDs via the NCBI ID Converter API.
+     data = json.loads(fetch_doi_data())
+     pmc_data = {}
+     for topic, dois in data.items():
+         if len(dois) > 0:
+             doi_list = ""
+             for doi in dois:
+                 doi_list += doi + ","
+             doi_list = doi_list.rstrip(',')
+             try:
+                 url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
+                 doi_pmc_data = requests.get(url).json()
+             except Exception as e:
+                 print(f"Error: {str(e)}")
+                 continue  # skip this topic if the request failed
+             if doi_pmc_data['status'] == 'ok':
+                 pmc_list = []
+                 for record in doi_pmc_data['records']:
+                     if 'pmcid' in record:
+                         if 'live' in record and record['live'] == False:
+                             continue
+                         pmc_list.append(record['pmcid'])
+                     else:
+                         continue
+                 pmc_data[topic] = pmc_list
+             else:
+                 continue
+         else:
+             continue
+     data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
+     return data
+
+ def extract_pmc_data():
+     # Deduplicate against pmc.txt (stored in the HF dataset) and build the final payload.
+     if not tools.download_datafile('pmc.txt'):
+         raise Exception("Failed to download datafile")
+     pmc_data = {}
+     pmcid_data = json.loads(doi_to_pmc())
+     for topic, pmcids in pmcid_data.items():
+         pmc_ids = []
+         for id in pmcids:
+             if tools.check_data_in_file(id, 'pmc.txt'):
+                 continue
+             else:
+                 tools.write_data_to_file(id, 'pmc.txt')
+                 pmc_ids.append(id)
+         pmc_data[topic] = {}
+         pmc_data[topic]['count'] = len(pmc_ids)
+         pmc_data[topic]['ids'] = pmc_ids
+     data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
+     if not tools.upload_datafile('pmc.txt'):
+         raise Exception("Failed to upload datafile")
+     return data
+
+ if __name__ == "__main__":
+     data = extract_pmc_data()
+     with open('pmc_data.json', 'w') as f:
+         f.write(data)
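
doi_to_pmc() batches DOIs through the NCBI ID Converter API at the URL shown above. A standalone sketch of that lookup with a single placeholder DOI (10.1000/xyz123 is not a real article):

import requests

url = ("https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
       "?email=raannakasturi@gmail.com&ids=10.1000/xyz123&format=json")
response = requests.get(url).json()
if response['status'] == 'ok':
    for record in response['records']:
        print(record.get('pmcid', 'no PMCID for this DOI'))
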
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio==5.8.0
+ python-dotenv==1.0.1
+ beautifulsoup4==4.12.3
+ requests==2.32.3
+ huggingface-hub==0.27.0
tools.py ADDED
@@ -0,0 +1,74 @@
+ import requests
+ import re
+ import os
+ import dotenv
+ from huggingface_hub import HfApi
+
+ dotenv.load_dotenv()
+ hf_token = os.getenv("HF_API_TOKEN")
+ access_key = os.getenv("ACCESS_KEY")
+ api = HfApi(token=hf_token)
+
+ def fetch_page(url):
+     HEADERS = {
+         'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
+     }
+     page_content = requests.get(url, headers=HEADERS).content
+     return page_content
+
+ def check_data_in_file(data, file):
+     with open(file, 'r') as f:
+         existing_data = f.read().splitlines()
+     if data in existing_data:
+         return True
+     else:
+         return False
+
+ def write_data_to_file(data, file):
+     with open(file, 'a') as f:
+         f.write(data + '\n')
+     return True
+
+ def verify_simple_title(title):
+     pattern = re.compile(r'^[a-zA-Z0-9\s\.\-\+\*/=\(\)\[\]\{\},:;"\'?\>\<\@\#\%\^\*\|\_\~\`]+$')
+     if pattern.match(title):
+         return True
+     else:
+         return False
+
+ def download_datafile(filename):
+     try:
+         api.hf_hub_download(repo_id="raannakasturi/ReXploreData", filename=filename, repo_type="dataset", local_dir='.', cache_dir='.', force_download=True)
+         return True
+     except Exception as e:
+         print(str(e))
+         return False
+
+ def upload_datafile(filename):
+     try:
+         api.upload_file(path_or_fileobj=filename, path_in_repo=filename, repo_id="raannakasturi/ReXploreData", repo_type="dataset")
+         os.remove(filename)
+         return True
+     except Exception as e:
+         print(str(e))
+         return False
+
+ def reset_datafiles(user_access_key):
+     if user_access_key != access_key:
+         return "Invalid access key"
+     else:
+         files = ['arxiv.txt', 'pmc.txt']
+         try:
+             for filename in files:
+                 try:
+                     download_datafile(filename)
+                     with open(filename, 'w') as f:
+                         f.write('')
+                     upload_datafile(filename)
+                 except Exception as e:
+                     print(str(e))
+                     continue
+             return True
+         except Exception as e:
+             print(str(e))
+             return False
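
tools.py reads two secrets through python-dotenv: HF_API_TOKEN for the ReXploreData dataset and ACCESS_KEY for gating the Gradio endpoints. A minimal sketch for verifying a local .env before launching the app (the variable names come from this commit; everything else is illustrative):

import os
import dotenv

dotenv.load_dotenv()  # expects .env with HF_API_TOKEN=... and ACCESS_KEY=...
for var in ("HF_API_TOKEN", "ACCESS_KEY"):
    print(var, "is set" if os.getenv(var) else "is MISSING")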