raannakasturi
committed on
Update pmc.py
Browse files
pmc.py
CHANGED
@@ -2,12 +2,16 @@ import json
|
|
2 |
import xml.etree.ElementTree as ET
|
3 |
from bs4 import BeautifulSoup
|
4 |
import requests
|
5 |
-
import
|
|
|
6 |
import threading
|
7 |
|
|
|
|
|
|
|
8 |
def fetch_links(category):
|
9 |
links = []
|
10 |
-
xml_data =
|
11 |
items = ET.fromstring(xml_data).findall('channel/item')
|
12 |
for item in items:
|
13 |
link = item.find('link').text
|
@@ -20,8 +24,7 @@ def fetch_all_links():
|
|
20 |
for category in categories:
|
21 |
links = fetch_links(category)
|
22 |
sd_links_data[category] = links
|
23 |
-
|
24 |
-
return data
|
25 |
|
26 |
def fetch_dois():
|
27 |
doi_data = {}
|
@@ -29,20 +32,16 @@ def fetch_dois():
|
|
29 |
for topic, links in data.items():
|
30 |
doi_list = []
|
31 |
for link in links:
|
32 |
-
page_content =
|
33 |
page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
|
34 |
for page_data in page_datas:
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
doi = page_data.find("a", href=True).text
|
39 |
if doi.startswith('10.'):
|
40 |
doi_list.append(doi)
|
41 |
-
else:
|
42 |
-
continue
|
43 |
doi_data[topic] = doi_list
|
44 |
-
|
45 |
-
return data
|
46 |
|
47 |
def fetch_doi_data():
|
48 |
result = []
|
@@ -51,65 +50,45 @@ def fetch_doi_data():
|
|
51 |
thread = threading.Thread(target=fetch_and_store)
|
52 |
thread.start()
|
53 |
thread.join()
|
54 |
-
|
55 |
-
return []
|
56 |
-
return result[0]
|
57 |
|
58 |
def doi_to_pmc():
|
59 |
data = json.loads(fetch_doi_data())
|
60 |
pmc_data = {}
|
61 |
for topic, dois in data.items():
|
62 |
-
if
|
63 |
-
doi_list = ""
|
64 |
-
for doi in dois:
|
65 |
-
doi_list += doi + ","
|
66 |
-
doi_list = doi_list.rstrip(',')
|
67 |
-
try:
|
68 |
-
url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
|
69 |
-
doi_pmc_data = requests.get(url).json()
|
70 |
-
except Exception as e:
|
71 |
-
print(f"Error: {str(e)}")
|
72 |
-
if doi_pmc_data['status'] == 'ok':
|
73 |
-
pmc_list = []
|
74 |
-
for record in doi_pmc_data['records']:
|
75 |
-
if 'pmcid' in record:
|
76 |
-
if 'live' in record and record['live'] == False:
|
77 |
-
continue
|
78 |
-
pmc_list.append(record['pmcid'])
|
79 |
-
else:
|
80 |
-
continue
|
81 |
-
pmc_data[topic] = pmc_list
|
82 |
-
else:
|
83 |
-
continue
|
84 |
-
else:
|
85 |
continue
|
86 |
-
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
def extract_pmc_data():
|
90 |
-
if not
|
91 |
raise Exception("Failed to download datafile")
|
92 |
-
pmc_data ={}
|
93 |
pmcid_data = json.loads(doi_to_pmc())
|
94 |
for topic, pmcids in pmcid_data.items():
|
95 |
pmc_ids = []
|
96 |
-
for
|
97 |
-
if len(pmc_ids) >=
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
pmc_data[topic] = {}
|
105 |
-
pmc_data[topic]['count'] = len(pmc_ids)
|
106 |
-
pmc_data[topic]['ids'] = pmc_ids
|
107 |
-
data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
|
108 |
-
if not tools.upload_datafile('pmc.txt'):
|
109 |
raise Exception("Failed to upload datafile")
|
110 |
-
return
|
111 |
|
112 |
if __name__ == "__main__":
|
113 |
data = extract_pmc_data()
|
114 |
with open('pmc_data.json', 'w') as f:
|
115 |
-
f.write(data)
|
|
|
2 |
import xml.etree.ElementTree as ET
|
3 |
from bs4 import BeautifulSoup
|
4 |
import requests
|
5 |
+
import os
|
6 |
+
import sys
|
7 |
import threading
|
8 |
|
9 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
|
10 |
+
import utils
|
11 |
+
|
12 |
def fetch_links(category):
|
13 |
links = []
|
14 |
+
xml_data = utils.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
|
15 |
items = ET.fromstring(xml_data).findall('channel/item')
|
16 |
for item in items:
|
17 |
link = item.find('link').text
|
|
|
24 |
for category in categories:
|
25 |
links = fetch_links(category)
|
26 |
sd_links_data[category] = links
|
27 |
+
return json.dumps(sd_links_data, indent=4, ensure_ascii=False)
|
|
|
28 |
|
29 |
def fetch_dois():
|
30 |
doi_data = {}
|
|
|
32 |
for topic, links in data.items():
|
33 |
doi_list = []
|
34 |
for link in links:
|
35 |
+
page_content = utils.fetch_page(link)
|
36 |
page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
|
37 |
for page_data in page_datas:
|
38 |
+
doi_link = page_data.find("a", href=True)
|
39 |
+
if doi_link:
|
40 |
+
doi = doi_link.text
|
|
|
41 |
if doi.startswith('10.'):
|
42 |
doi_list.append(doi)
|
|
|
|
|
43 |
doi_data[topic] = doi_list
|
44 |
+
return json.dumps(doi_data, indent=4, ensure_ascii=False)
|
|
|
45 |
|
46 |
def fetch_doi_data():
|
47 |
result = []
|
|
|
50 |
thread = threading.Thread(target=fetch_and_store)
|
51 |
thread.start()
|
52 |
thread.join()
|
53 |
+
return result[0] if result else {}
|
|
|
|
|
54 |
|
55 |
def doi_to_pmc():
    """Map each topic's DOIs to PMC IDs via the NCBI PMC ID converter.

    Reads the per-topic DOI lists produced by ``fetch_doi_data()``, sends
    each non-empty list to the ``idconv`` endpoint in one request, and keeps
    at most four ``pmcid`` values per topic, skipping records whose ``live``
    flag is false.

    Lookups are best-effort: a topic whose request fails, returns invalid
    JSON, or does not report ``status == 'ok'`` is left out of the result
    (request/decoding errors are printed).

    Returns:
        A JSON string (indent=4, non-ASCII preserved) mapping topic -> list
        of PMC IDs.
    """
    raw = fetch_doi_data()
    # fetch_doi_data() falls back to an empty dict when it has no result;
    # json.loads() would raise TypeError on that, so only decode text.
    data = json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw
    pmc_data = {}
    for topic, dois in data.items():
        if not dois:
            continue
        try:
            # Let requests build the query string so characters that are
            # legal in DOIs but not in URLs get percent-encoded, and bound
            # the wait so one stalled connection cannot hang the whole run.
            response = requests.get(
                "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/",
                params={
                    "email": "raannakasturi@gmail.com",
                    "ids": ",".join(dois),
                    "format": "json",
                },
                timeout=30,
            )
            doi_pmc_data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Error: {str(e)}")
            continue
        if not isinstance(doi_pmc_data, dict) or doi_pmc_data.get('status') != 'ok':
            continue
        pmc_list = [
            record['pmcid']
            for record in doi_pmc_data.get('records', [])
            if 'pmcid' in record and record.get('live', True)
        ]
        pmc_data[topic] = pmc_list[:4]
    return json.dumps(pmc_data, indent=4, ensure_ascii=False)
|
72 |
|
73 |
def extract_pmc_data():
|
74 |
+
if not utils.download_datafile('pmc.txt'):
|
75 |
raise Exception("Failed to download datafile")
|
76 |
+
pmc_data = {}
|
77 |
pmcid_data = json.loads(doi_to_pmc())
|
78 |
for topic, pmcids in pmcid_data.items():
|
79 |
pmc_ids = []
|
80 |
+
for pmcid in pmcids:
|
81 |
+
if len(pmc_ids) >= 4:
|
82 |
+
break
|
83 |
+
if not utils.check_data_in_file(pmcid, 'pmc.txt'):
|
84 |
+
utils.write_data_to_file(pmcid, 'pmc.txt')
|
85 |
+
pmc_ids.append(pmcid)
|
86 |
+
pmc_data[topic] = {"ids": pmc_ids}
|
87 |
+
if not utils.upload_datafile('pmc.txt'):
|
|
|
|
|
|
|
|
|
|
|
88 |
raise Exception("Failed to upload datafile")
|
89 |
+
return json.dumps(pmc_data, indent=4, ensure_ascii=False)
|
90 |
|
91 |
if __name__ == "__main__":
    # Run the whole pipeline and persist its JSON result next to the script's
    # working directory.
    data = extract_pmc_data()
    # The payload is serialized with ensure_ascii=False, so it may contain
    # non-ASCII characters; pin UTF-8 instead of relying on the platform's
    # default encoding, which could fail or produce invalid JSON.
    with open('pmc_data.json', 'w', encoding='utf-8') as f:
        f.write(data)
|