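"""Collect ScienceDaily article links by category, extract the DOIs they cite,
convert those DOIs to PMC IDs via the NCBI ID Converter, and record newly seen
PMC IDs per topic in sd.txt.

Relies on a local `utils` helper module for page fetching and datafile handling
(fetch_page, download_datafile, check_data_in_file, write_data_to_file,
upload_datafile).
"""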
import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import requests
import threading
import utils
def fetch_links(category):
    """Return the article links listed in a ScienceDaily top-news RSS feed."""
    links = []
    xml_data = utils.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
    items = ET.fromstring(xml_data).findall('channel/item')
    for item in items:
        link = item.find('link').text
        links.append(link)
    return links
def fetch_all_links():
    categories = ["Science", "Health", "Environment", "Technology", "Society"]
    sd_links_data = {}
    for category in categories:
        links = fetch_links(category)
        sd_links_data[category] = links
    return json.dumps(sd_links_data, indent=4, ensure_ascii=False)
def fetch_dois():
    """Scrape each article page and collect the DOIs from its journal references."""
    doi_data = {}
    data = json.loads(fetch_all_links())
    for topic, links in data.items():
        doi_list = []
        for link in links:
            page_content = utils.fetch_page(link)
            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
            for page_data in page_datas:
                doi_link = page_data.find("a", href=True)
                if doi_link:
                    doi = doi_link.text
                    # Keep only values that look like DOIs (they start with the "10." prefix).
                    if doi.startswith('10.'):
                        doi_list.append(doi)
        doi_data[topic] = doi_list
    return json.dumps(doi_data, indent=4, ensure_ascii=False)
def fetch_doi_data():
    """Run fetch_dois on a worker thread and return its JSON result."""
    result = []
    def fetch_and_store():
        result.append(fetch_dois())
    thread = threading.Thread(target=fetch_and_store)
    thread.start()
    thread.join()
    # Return a JSON string in both branches so callers can always json.loads() the result.
    return result[0] if result else "{}"
def doi_to_pmc():
    """Map the collected DOIs to PMC IDs via the NCBI ID Converter API."""
    data = json.loads(fetch_doi_data())
    pmc_data = {}
    for topic, dois in data.items():
        if not dois:
            continue
        doi_list = ",".join(dois)
        try:
            url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
            doi_pmc_data = requests.get(url, timeout=30).json()
            if doi_pmc_data['status'] == 'ok':
                # Keep records that include a PMC ID; records without a 'live' flag are treated as live.
                pmc_list = [record['pmcid'] for record in doi_pmc_data['records'] if 'pmcid' in record and record.get('live', True)]
                pmc_data[topic] = pmc_list[:2]
        except Exception as e:
            print(f"Error: {str(e)}")
    return json.dumps(pmc_data, indent=4, ensure_ascii=False)
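# Note: doi_to_pmc assumes an ID Converter JSON response shaped roughly like
#   {"status": "ok", "records": [{"doi": "10.1000/example", "pmcid": "PMC1234567", ...}]}
# (illustrative sketch; only the "status", "records", "pmcid" and "live" fields are read above).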
def extract_sd_data():
    """Record up to two previously unseen PMC IDs per topic and return them as JSON."""
    if not utils.download_datafile('sd.txt'):
        raise Exception("Failed to download datafile")
    pmc_data = {}
    pmcid_data = json.loads(doi_to_pmc())
    for topic, pmcids in pmcid_data.items():
        pmc_ids = []
        for pmcid in pmcids:
            if len(pmc_ids) >= 2:
                break
            # Skip IDs that were already recorded in a previous run.
            if not utils.check_data_in_file(pmcid, 'sd.txt'):
                utils.write_data_to_file(pmcid, 'sd.txt')
                pmc_ids.append(pmcid)
        pmc_data[topic] = {"ids": pmc_ids, "count": len(pmc_ids)}
    if not utils.upload_datafile('sd.txt'):
        raise Exception("Failed to upload datafile")
    return json.dumps(pmc_data, indent=4, ensure_ascii=False)
if __name__ == "__main__":
    data = extract_sd_data()
    # Write as UTF-8 because the JSON is dumped with ensure_ascii=False.
    with open('sd_data.json', 'w', encoding='utf-8') as f:
        f.write(data)