raannakasturi committed
Commit c680313 · Parent(s): c440916

Add initial project structure with core functionality and dependencies

Files changed:
- .gitignore +8 -0
- app.py +28 -0
- arvix.py +113 -0
- fetch_data.py +45 -0
- pmc.py +108 -0
- requirements.txt +5 -0
- tools.py +74 -0
.gitignore ADDED
@@ -0,0 +1,8 @@
/.cache
/__pycache__
.env
.env.local
.env.development.local
.env.test.local
.env.production.local
.DS_Store
app.py ADDED
@@ -0,0 +1,28 @@
import gradio as gr
from fetch_data import fetch_data
from tools import reset_datafiles

# Soft theme with custom hues and Google fonts for the UI.
theme = gr.themes.Soft(
    primary_hue="purple",
    secondary_hue="cyan",
    neutral_hue="slate",
    font=[
        gr.themes.GoogleFont('Syne'),
        gr.themes.GoogleFont('Poppins'),
        gr.themes.GoogleFont('Poppins'),
        gr.themes.GoogleFont('Poppins')
    ],
)

with gr.Blocks(theme=theme, title="Fetch Research Paper IDs") as app:
    with gr.Row():
        with gr.Column():
            user_access_key = gr.Textbox(label="Access Key", placeholder="Enter your access key", type="password")
            with gr.Row():
                fetch_data_btn = gr.Button(value="Fetch Data")
                reset_files = gr.Button(value="Reset Files")
            raw_data = gr.Textbox(lines=15, label="Raw IDs Data", interactive=False, placeholder="IDs starting with PMC are PMC IDs; all others are arXiv IDs", show_copy_button=True)
    # Wire the buttons: fetching is exposed as an API endpoint, resetting is not.
    fetch_data_btn.click(fn=fetch_data, inputs=[user_access_key], outputs=[raw_data], api_name="fetch_paper_ids")
    reset_files.click(fn=reset_datafiles, inputs=[user_access_key], outputs=[raw_data], show_api=False)

app.queue(default_concurrency_limit=25).launch(max_threads=5000)
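Since the Fetch Data button is exposed as an API endpoint (api_name="fetch_paper_ids"), the app can also be called programmatically. A minimal sketch using the gradio_client package (an extra dependency, not listed in requirements.txt), assuming a hypothetical local deployment URL and a valid access key:

# Minimal sketch, assuming the app runs at the hypothetical URL below
# and "your-access-key" is replaced with a valid ACCESS_KEY value.
from gradio_client import Client

client = Client("http://127.0.0.1:7860")  # hypothetical local deployment
raw_ids = client.predict("your-access-key", api_name="/fetch_paper_ids")
print(raw_ids)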
arvix.py ADDED
@@ -0,0 +1,113 @@
import json
import random
import tools
from bs4 import BeautifulSoup

def fetch_new_page(category):
    url = f'https://arxiv.org/list/{category}/new'
    return tools.fetch_page(url)

def fetch_recent_page(category):
    url = f'https://arxiv.org/list/{category}/recent'
    return tools.fetch_page(url)

def extract_new_data(category):
    # Parse the "new submissions" listing page and collect arXiv IDs.
    paper_ids = []
    page_content = fetch_new_page(category)
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        papers = listing.find_all('dt')
        paper_contents = listing.find_all('dd')
        titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip() for paper_content in paper_contents]
        for paper, title in zip(papers, titles):
            if not tools.verify_simple_title(title):
                continue
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_recent_data(category):
    # Parse the "recent" listing page and collect arXiv IDs.
    paper_ids = []
    page_content = fetch_recent_page(category)
    listings = BeautifulSoup(page_content, 'html.parser').find_all('dl')
    for listing in listings:
        papers = listing.find_all('dt')
        for paper in papers:
            paper_link = paper.find('a', href=True)
            if paper_link:
                paper_id = paper_link.text.strip().split(':')[1]
                paper_ids.append(paper_id)
    return paper_ids

def extract_data(category):
    # Merge new and recent IDs, skip ones already recorded in arxiv.txt,
    # and return at most 12 previously unseen IDs.
    sanitized_data = []
    new_data = extract_new_data(category)
    recent_data = extract_recent_data(category)
    data = list(set(new_data + recent_data))
    for paper_id in data:
        if len(sanitized_data) >= 12:
            break
        if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
            paper_id = paper_id[:3]
        if tools.check_data_in_file(paper_id, 'arxiv.txt'):
            continue
        tools.write_data_to_file(paper_id, 'arxiv.txt')
        sanitized_data.append(paper_id)
    random.shuffle(sanitized_data)
    return sanitized_data[:12]

def extract_arxiv_data():
    if not tools.download_datafile('arxiv.txt'):
        raise Exception("Failed to download datafile")
    categories = {
        "Astrophysics": ["astro-ph"],
        "Condensed Matter": ["cond-mat"],
        "General Relativity and Quantum Cosmology": ["gr-qc"],
        "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
        "Mathematical Physics": ["math-ph"],
        "Nonlinear Sciences": ["nlin"],
        "Nuclear Experiment": ["nucl-ex"],
        "Nuclear Theory": ["nucl-th"],
        "Physics": ["physics"],
        "Quantum Physics": ["quant-ph"],
        "Mathematics": ["math"],
        "Computer Science": ["cs"],
        "Quantitative Biology": ["q-bio"],
        "Quantitative Finance": ["q-fin"],
        "Statistics": ["stat"],
        "Electrical Engineering and Systems Science": ["eess"],
        "Economics": ["econ"]
    }
    data = {}
    for category, subcategories in categories.items():
        category_data = {}
        all_ids = []
        for subcategory in subcategories:
            ids = extract_data(subcategory)
            all_ids.extend(ids)
        if len(all_ids) > 12:
            print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
            random.shuffle(all_ids)
            all_ids = all_ids[:12]
        category_data['count'] = len(all_ids)
        category_data['ids'] = all_ids
        data[category] = category_data
    data = json.dumps(data, indent=4, ensure_ascii=False)
    if not tools.upload_datafile('arxiv.txt'):
        raise Exception("Failed to upload datafile")
    return data

if __name__ == '__main__':
    data = extract_arxiv_data()
    with open('arxiv_data.json', 'w') as f:
        f.write(data)
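For reference, extract_arxiv_data() returns a JSON string keyed by category, each entry holding a count and a list of IDs. A small sketch that inspects the arxiv_data.json written by the __main__ block above:

# Minimal sketch: inspect arxiv_data.json produced by running `python arvix.py`.
import json

with open('arxiv_data.json') as f:
    data = json.load(f)

for category, info in data.items():
    # Each entry looks like {"count": <int>, "ids": [<arXiv IDs>]}
    print(f"{category}: {info['count']} ids, e.g. {info['ids'][:2]}")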
fetch_data.py ADDED
@@ -0,0 +1,45 @@
from arvix import extract_arxiv_data
from pmc import extract_pmc_data
import json
import dotenv
import os
from concurrent.futures import ThreadPoolExecutor

dotenv.load_dotenv()
ACCESS_KEY = os.getenv("ACCESS_KEY")

def fetch_arxiv_data():
    return json.loads(extract_arxiv_data())

def fetch_pmc_data():
    return json.loads(extract_pmc_data())

def fetch_data(user_access_key):
    if user_access_key != ACCESS_KEY:
        papers_data = {"error": "Invalid access key"}
    else:
        papers_data = {}
        # Fetch arXiv and PMC IDs concurrently.
        with ThreadPoolExecutor() as executor:
            arxiv_future = executor.submit(fetch_arxiv_data)
            pmc_future = executor.submit(fetch_pmc_data)
            arxiv_data = arxiv_future.result()
            pmc_data = pmc_future.result()
        for topic, topic_data in arxiv_data.items():
            if topic_data['count'] == 0:
                continue
            papers_data[topic] = {}
            papers_data[topic]['ids'] = topic_data['ids']
        for topic, topic_data in pmc_data.items():
            if topic_data['count'] == 0:
                continue
            papers_data[topic] = {}
            papers_data[topic]['ids'] = topic_data['ids']
    data = json.dumps(papers_data, indent=4, ensure_ascii=False)
    return data

if __name__ == '__main__':
    data = fetch_data(ACCESS_KEY)
    with open('data.json', 'w') as f:
        f.write(data)
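The merged output keeps only the ID lists per topic. Since the UI note in app.py states that IDs starting with PMC are PMC IDs and the rest are arXiv IDs, the result can be split back by prefix; a minimal sketch against the data.json written above:

# Minimal sketch: split data.json back into PMC and arXiv IDs by prefix.
import json

with open('data.json') as f:
    papers = json.load(f)

for topic, topic_data in papers.items():
    pmc_ids = [i for i in topic_data['ids'] if i.startswith('PMC')]
    arxiv_ids = [i for i in topic_data['ids'] if not i.startswith('PMC')]
    print(f"{topic}: {len(pmc_ids)} PMC, {len(arxiv_ids)} arXiv")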
pmc.py ADDED
@@ -0,0 +1,108 @@
import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import requests
import tools
import threading

def fetch_links(category):
    # Collect article links from the ScienceDaily RSS feed for a category.
    links = []
    xml_data = tools.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
    items = ET.fromstring(xml_data).findall('channel/item')
    for item in items:
        link = item.find('link').text
        links.append(link)
    return links

def fetch_all_links():
    categories = ["Science", "Health", "Environment", "Technology", "Society"]
    sd_links_data = {}
    for category in categories:
        links = fetch_links(category)
        sd_links_data[category] = links
    data = json.dumps(sd_links_data, indent=4, ensure_ascii=False)
    return data

def fetch_dois():
    # Scrape the journal reference DOI from each ScienceDaily article page.
    doi_data = {}
    data = json.loads(fetch_all_links())
    for topic, links in data.items():
        doi_list = []
        for link in links:
            page_content = tools.fetch_page(link)
            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
            for page_data in page_datas:
                doi = page_data.find("a", href=True).text
                if doi.startswith('10.'):
                    doi_list.append(doi)
        doi_data[topic] = doi_list
    data = json.dumps(doi_data, indent=4, ensure_ascii=False)
    return data

def fetch_doi_data():
    # Run fetch_dois in a worker thread and wait for its result.
    result = []
    def fetch_and_store():
        result.append(fetch_dois())
    thread = threading.Thread(target=fetch_and_store)
    thread.start()
    thread.join()
    return result[0]

def doi_to_pmc():
    # Convert the collected DOIs to PMC IDs with the NCBI ID Converter API.
    data = json.loads(fetch_doi_data())
    pmc_data = {}
    for topic, dois in data.items():
        if len(dois) == 0:
            continue
        doi_list = ",".join(dois)
        try:
            url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
            doi_pmc_data = requests.get(url).json()
        except Exception as e:
            print(f"Error: {str(e)}")
            continue
        if doi_pmc_data['status'] == 'ok':
            pmc_list = []
            for record in doi_pmc_data['records']:
                if 'pmcid' in record:
                    if 'live' in record and record['live'] == False:
                        continue
                    pmc_list.append(record['pmcid'])
            pmc_data[topic] = pmc_list
    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
    return data

def extract_pmc_data():
    if not tools.download_datafile('pmc.txt'):
        raise Exception("Failed to download datafile")
    pmc_data = {}
    pmcid_data = json.loads(doi_to_pmc())
    for topic, pmcids in pmcid_data.items():
        pmc_ids = []
        for pmc_id in pmcids:
            if tools.check_data_in_file(pmc_id, 'pmc.txt'):
                continue
            tools.write_data_to_file(pmc_id, 'pmc.txt')
            pmc_ids.append(pmc_id)
        pmc_data[topic] = {}
        pmc_data[topic]['count'] = len(pmc_ids)
        pmc_data[topic]['ids'] = pmc_ids
    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
    if not tools.upload_datafile('pmc.txt'):
        raise Exception("Failed to upload datafile")
    return data

if __name__ == "__main__":
    data = extract_pmc_data()
    with open('pmc_data.json', 'w') as f:
        f.write(data)
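doi_to_pmc() relies on the NCBI ID Converter API. A standalone sketch of that call with a single arbitrary example DOI, showing where the pmcid field sits in the response:

# Minimal sketch of the NCBI ID Converter call used in doi_to_pmc().
# The DOI below is only an example; any DOI string can be substituted.
import requests

doi = "10.1038/s41586-020-2649-2"  # arbitrary example DOI
url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={doi}&format=json"
resp = requests.get(url).json()
if resp.get('status') == 'ok':
    for record in resp.get('records', []):
        # 'pmcid' is present only when a PMC mapping exists
        print(record.get('doi'), '->', record.get('pmcid'))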
requirements.txt ADDED
@@ -0,0 +1,5 @@
gradio==5.8.0
python-dotenv==1.0.1
beautifulsoup4==4.12.3
requests==2.32.3
huggingface-hub==0.27.0
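A quick sanity check that the pinned packages are available in the running environment (importlib.metadata is in the Python standard library):

# Minimal sketch: report installed versions of the pinned dependencies.
from importlib.metadata import version, PackageNotFoundError

for pkg in ("gradio", "python-dotenv", "beautifulsoup4", "requests", "huggingface-hub"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "not installed")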
tools.py ADDED
@@ -0,0 +1,74 @@
import requests
import re
import os
import dotenv
from huggingface_hub import HfApi

dotenv.load_dotenv()
hf_token = os.getenv("HF_API_TOKEN")
access_key = os.getenv("ACCESS_KEY")
api = HfApi(token=hf_token)

def fetch_page(url):
    # Fetch a page with a browser-like User-Agent and return the raw bytes.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
    }
    page_content = requests.get(url, headers=HEADERS).content
    return page_content

def check_data_in_file(data, file):
    # Return True if the given ID is already recorded in the data file.
    with open(file, 'r') as f:
        existing_data = f.read().splitlines()
    return data in existing_data

def write_data_to_file(data, file):
    # Append the ID to the data file, one entry per line.
    with open(file, 'a') as f:
        f.write(data + '\n')
    return True

def verify_simple_title(title):
    # Accept titles made only of ASCII letters, digits, and common punctuation.
    pattern = re.compile(r'^[a-zA-Z0-9\s\.\-\+\*/=\(\)\[\]\{\},:;"\'?\>\<\@\#\%\^\*\|\_\~\`]+$')
    return bool(pattern.match(title))

def download_datafile(filename):
    # Pull the tracking file from the Hugging Face dataset repo.
    try:
        api.hf_hub_download(repo_id="raannakasturi/ReXploreData", filename=filename, repo_type="dataset", local_dir='.', cache_dir='.', force_download=True)
        return True
    except Exception as e:
        print(str(e))
        return False

def upload_datafile(filename):
    # Push the updated tracking file back to the dataset repo, then remove the local copy.
    try:
        api.upload_file(path_or_fileobj=filename, path_in_repo=filename, repo_id="raannakasturi/ReXploreData", repo_type="dataset")
        os.remove(filename)
        return True
    except Exception as e:
        print(str(e))
        return False

def reset_datafiles(user_access_key):
    # Empty both tracking files in the dataset repo (requires the access key).
    if user_access_key != access_key:
        return "Invalid access key"
    files = ['arxiv.txt', 'pmc.txt']
    try:
        for filename in files:
            try:
                download_datafile(filename)
                with open(filename, 'w') as f:
                    f.write('')
                upload_datafile(filename)
            except Exception as e:
                print(str(e))
                continue
        return True
    except Exception as e:
        print(str(e))
        return False
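tools.py and fetch_data.py expect HF_API_TOKEN and ACCESS_KEY to come from the environment (typically a local .env file, which .gitignore excludes from the repo). A minimal sketch to verify the configuration before launching app.py; the variable names match the code above, the values themselves are whatever the deployer sets:

# Minimal sketch: confirm the environment variables used by tools.py and
# fetch_data.py are set. Values are read from .env via python-dotenv.
import os
import dotenv

dotenv.load_dotenv()
for var in ("HF_API_TOKEN", "ACCESS_KEY"):
    print(var, "set" if os.getenv(var) else "MISSING")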