raannakasturi
committed on
Commit · a4b6d0b
1 Parent(s): 8a1664b
Refactor data extraction functions: rename extract_pmc_data to extract_sd_data, add extract_phys_data, and update file handling for new data sources
arvix.py
CHANGED
@@ -55,8 +55,8 @@ def extract_data(category):
     return list(all_ids)
 
 def extract_arxiv_data():
-    if not utils.download_datafile('arxiv.txt'):
-        raise Exception("Failed to download datafile")
+    # if not utils.download_datafile('arxiv.txt'):
+    # raise Exception("Failed to download datafile")
     categories = {
         "Astrophysics": ["astro-ph"],
         "Condensed Matter": ["cond-mat"],
@@ -93,8 +93,8 @@ def extract_arxiv_data():
         while len(category_ids) < 2:
             category_ids.add(random.choice(list(used_ids)))
         data[category] = {"ids": list(category_ids), "count": len(category_ids)}
-    if not utils.upload_datafile('arxiv.txt'):
-        raise Exception("Failed to upload datafile")
+    # if not utils.upload_datafile('arxiv.txt'):
+    # raise Exception("Failed to upload datafile")
     return json.dumps(data, indent=4, ensure_ascii=False)
 
 if __name__ == '__main__':
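
With the download/upload guards commented out, extract_arxiv_data() no longer syncs the arxiv.txt history file and simply returns the category-to-IDs JSON. A minimal usage sketch (the printed IDs are illustrative; the two-per-category count follows the padding loop above):

import json
from arvix import extract_arxiv_data

# The extractor returns a JSON string; each category carries exactly
# two arXiv IDs thanks to the `while len(category_ids) < 2` padding.
data = json.loads(extract_arxiv_data())
print(data["Astrophysics"])  # e.g. {"ids": ["2401.01234", "2401.04321"], "count": 2}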
fetch_data.py
CHANGED
@@ -1,5 +1,6 @@
 from arvix import extract_arxiv_data
-from pmc import extract_pmc_data
+from pmc import extract_sd_data
+from phys import extract_phys_data
 import json
 import dotenv
 import os
@@ -11,8 +12,11 @@ ACCESS_KEY = os.getenv("ACCESS_KEY")
 def fetch_arxiv_data():
     return json.loads(extract_arxiv_data())
 
-def fetch_pmc_data():
-    return json.loads(extract_pmc_data())
+def fetch_sd_data():
+    return json.loads(extract_sd_data())
+
+def fetch_phys_data():
+    return json.loads(extract_phys_data())
 
 def fetch_data(user_access_key):
     if user_access_key != ACCESS_KEY:
@@ -23,10 +27,12 @@ def fetch_data(user_access_key):
         papers_data['status'] = 'success'
         papers_data['data'] = {}
         with ThreadPoolExecutor() as executor:
-            pmc_future = executor.submit(fetch_pmc_data)
+            pmc_future = executor.submit(fetch_sd_data)
             arxiv_future = executor.submit(fetch_arxiv_data)
+            phys_future = executor.submit(fetch_phys_data)
             pmc_data = pmc_future.result()
             arxiv_data = arxiv_future.result()
+            phys_data = phys_future.result()
         for topic, topic_data in pmc_data.items():
             if topic_data['count'] == 0:
                 continue
@@ -39,6 +45,12 @@ def fetch_data(user_access_key):
             else:
                 papers_data['data'][topic] = {}
                 papers_data['data'][topic]['ids'] = topic_data['ids']
+        for topic, topic_data in phys_data.items():
+            if topic_data['count'] == 0:
+                continue
+            else:
+                papers_data['data'][topic] = {}
+                papers_data['data'][topic]['ids'] = topic_data['ids']
     except Exception as e:
         print(str(e))
         papers_data['status'] = 'error'
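
fetch_data() now fans out to all three extractors on one ThreadPoolExecutor and merges the sd, arXiv, and phys.org results into papers_data['data'] keyed by topic. A usage sketch, assuming fetch_data() returns the papers_data dict the hunks populate and a .env file provides ACCESS_KEY as in the module:

import os
import dotenv
from fetch_data import fetch_data

dotenv.load_dotenv()

# The key must match the module-level ACCESS_KEY guard in fetch_data().
papers = fetch_data(os.getenv("ACCESS_KEY"))
print(papers["status"])                # 'success' or 'error'
print(sorted(papers.get("data", {})))  # merged topics from all sources

Note that each merge loop reassigns papers_data['data'][topic] to a fresh dict, so a phys.org topic that shares a name with an earlier sd topic silently overwrites it.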
phys.py
ADDED
@@ -0,0 +1,98 @@
+import json
+import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup
+import requests
+import threading
+import utils
+
+def fetch_links(link):
+    links = []
+    xml_data = utils.fetch_page(link)
+    items = ET.fromstring(xml_data).findall('channel/item')
+    for item in items:
+        link = item.find('link').text
+        links.append(link)
+    return links
+
+def fetch_all_links():
+    category_link_data = {
+        "Earth": "https://phys.org/rss-feed/breaking/earth-news/",
+        "Science": "https://phys.org/rss-feed/breaking/science-news/",
+        "Nano-technology": "https://phys.org/rss-feed/breaking/nanotech-news/",
+        "Physics": "https://phys.org/rss-feed/breaking/physics-news/",
+        "Astronomy & Space": "https://phys.org/rss-feed/breaking/space-news/",
+        "Biology": "https://phys.org/rss-feed/breaking/biology-news/",
+        "Chemistry": "https://phys.org/rss-feed/breaking/chemistry-news/",
+    }
+    sd_links_data = {}
+    for category, link in category_link_data.items():
+        links = fetch_links(link)
+        sd_links_data[category] = links
+    return json.dumps(sd_links_data, indent=4, ensure_ascii=False)
+
+def fetch_dois():
+    doi_data = {}
+    data = json.loads(fetch_all_links())
+    for topic, links in data.items():
+        doi_list = []
+        for link in links:
+            page_content = utils.fetch_page(link)
+            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", class_="article-main__more")
+            for page_data in page_datas:
+                doi_link = page_data.find("a", attrs={"data-doi": "1"})
+                if doi_link:
+                    doi = doi_link.text.split('DOI: ')[-1]
+                    if doi.startswith('10.'):
+                        doi_list.append(doi)
+        doi_data[topic] = doi_list
+    return json.dumps(doi_data, indent=4, ensure_ascii=False)
+
+def fetch_doi_data():
+    result = []
+    def fetch_and_store():
+        result.append(fetch_dois())
+    thread = threading.Thread(target=fetch_and_store)
+    thread.start()
+    thread.join()
+    return result[0] if result else {}
+
+def doi_to_pmc():
+    data = json.loads(fetch_doi_data())
+    pmc_data = {}
+    for topic, dois in data.items():
+        if not dois:
+            continue
+        doi_list = ",".join(dois)
+        try:
+            url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
+            doi_pmc_data = requests.get(url).json()
+
+            if doi_pmc_data['status'] == 'ok':
+                pmc_list = [record['pmcid'] for record in doi_pmc_data['records'] if 'pmcid' in record and record.get('live', True)]
+                pmc_data[topic] = pmc_list[:2]
+        except Exception as e:
+            print(f"Error: {str(e)}")
+    return json.dumps(pmc_data, indent=4, ensure_ascii=False)
+
+def extract_phys_data():
+    if not utils.download_datafile('phys.txt'):
+        raise Exception("Failed to download datafile")
+    pmc_data = {}
+    pmcid_data = json.loads(doi_to_pmc())
+    for topic, pmcids in pmcid_data.items():
+        pmc_ids = []
+        for pmcid in pmcids:
+            if len(pmc_ids) >= 2:
+                break
+            if not utils.check_data_in_file(pmcid, 'phys.txt'):
+                utils.write_data_to_file(pmcid, 'phys.txt')
+                pmc_ids.append(pmcid)
+        pmc_data[topic] = {"ids": pmc_ids, "count": len(pmc_ids)}
+    if not utils.upload_datafile('phys.txt'):
+        raise Exception("Failed to upload datafile")
+    return json.dumps(pmc_data, indent=4, ensure_ascii=False)
+
+if __name__ == "__main__":
+    data = extract_phys_data()
+    with open('phys_data.json', 'w') as f:
+        f.write(data)
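
The new module chains phys.org RSS feeds to article pages to DOIs to PMCIDs, using NCBI's ID Converter service for the last hop. A self-contained sketch of just that conversion step, assuming the same idconv endpoint and response shape doi_to_pmc() relies on above (the email placeholder and example DOI are illustrative):

import requests

def dois_to_pmcids(dois, email="you@example.com"):
    # NCBI ID Converter: maps comma-separated DOIs to PMCIDs, as in
    # phys.py's doi_to_pmc(). 'email' identifies the caller to NCBI.
    url = (
        "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
        f"?email={email}&ids={','.join(dois)}&format=json"
    )
    payload = requests.get(url, timeout=30).json()
    if payload.get("status") != "ok":
        return []
    # Keep only records that actually carry a PMCID and are live,
    # mirroring the filter in doi_to_pmc().
    return [r["pmcid"] for r in payload.get("records", [])
            if "pmcid" in r and r.get("live", True)]

# Illustrative DOI; any PMC-indexed DOI would do.
print(dois_to_pmcids(["10.1038/s41586-020-2649-2"]))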
pmc.py
CHANGED
@@ -66,8 +66,8 @@ def doi_to_pmc():
         print(f"Error: {str(e)}")
     return json.dumps(pmc_data, indent=4, ensure_ascii=False)
 
-def extract_pmc_data():
-    if not utils.download_datafile('pmc.txt'):
+def extract_sd_data():
+    if not utils.download_datafile('sd.txt'):
         raise Exception("Failed to download datafile")
     pmc_data = {}
     pmcid_data = json.loads(doi_to_pmc())
@@ -76,15 +76,15 @@ def extract_pmc_data():
         for pmcid in pmcids:
             if len(pmc_ids) >= 2:
                 break
-            if not utils.check_data_in_file(pmcid, 'pmc.txt'):
-                utils.write_data_to_file(pmcid, 'pmc.txt')
+            if not utils.check_data_in_file(pmcid, 'sd.txt'):
+                utils.write_data_to_file(pmcid, 'sd.txt')
                 pmc_ids.append(pmcid)
         pmc_data[topic] = {"ids": pmc_ids, "count": len(pmc_ids)}
-    if not utils.upload_datafile('pmc.txt'):
+    if not utils.upload_datafile('sd.txt'):
         raise Exception("Failed to upload datafile")
     return json.dumps(pmc_data, indent=4, ensure_ascii=False)
 
 if __name__ == "__main__":
-    data = extract_pmc_data()
-    with open('pmc_data.json', 'w') as f:
+    data = extract_sd_data()
+    with open('sd_data.json', 'w') as f:
         f.write(data)
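
After the rename, extract_sd_data() in pmc.py and extract_phys_data() in phys.py differ only in their datafile names. A hypothetical consolidation (extract_with_datafile and its parameters are invented here, not part of the commit) would capture the shared download/dedupe/upload pattern:

import json
import utils

def extract_with_datafile(pmcid_data, datafile):
    # Hypothetical helper: sync the dedupe file, keep up to two unseen
    # PMCIDs per topic, record them, then sync the file back.
    if not utils.download_datafile(datafile):
        raise Exception("Failed to download datafile")
    data = {}
    for topic, pmcids in pmcid_data.items():
        ids = []
        for pmcid in pmcids:
            if len(ids) >= 2:
                break
            if not utils.check_data_in_file(pmcid, datafile):
                utils.write_data_to_file(pmcid, datafile)
                ids.append(pmcid)
        data[topic] = {"ids": ids, "count": len(ids)}
    if not utils.upload_datafile(datafile):
        raise Exception("Failed to upload datafile")
    return json.dumps(data, indent=4, ensure_ascii=False)

# extract_sd_data() would then reduce to:
#     extract_with_datafile(json.loads(doi_to_pmc()), 'sd.txt')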