raannakasturi committed (verified)
Commit 351d53e · Parent: 82ed732

Update pmc.py

Files changed (1):
  1. pmc.py (+37 -58)
pmc.py CHANGED
@@ -2,12 +2,16 @@ import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import requests
-import tools
+import os
+import sys
import threading

+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+import utils
+
def fetch_links(category):
    links = []
-    xml_data = tools.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
+    xml_data = utils.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
    items = ET.fromstring(xml_data).findall('channel/item')
    for item in items:
        link = item.find('link').text
@@ -20,8 +24,7 @@ def fetch_all_links():
    for category in categories:
        links = fetch_links(category)
        sd_links_data[category] = links
-    data = json.dumps(sd_links_data, indent=4, ensure_ascii=False)
-    return data
+    return json.dumps(sd_links_data, indent=4, ensure_ascii=False)

def fetch_dois():
    doi_data = {}
@@ -29,20 +32,16 @@ def fetch_dois():
    for topic, links in data.items():
        doi_list = []
        for link in links:
-            page_content = tools.fetch_page(link)
+            page_content = utils.fetch_page(link)
            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
            for page_data in page_datas:
-                if not page_data.find("a", href=True):
-                    continue
-                else:
-                    doi = page_data.find("a", href=True).text
+                doi_link = page_data.find("a", href=True)
+                if doi_link:
+                    doi = doi_link.text
                    if doi.startswith('10.'):
                        doi_list.append(doi)
-                    else:
-                        continue
        doi_data[topic] = doi_list
-    data = json.dumps(doi_data, indent=4, ensure_ascii=False)
-    return data
+    return json.dumps(doi_data, indent=4, ensure_ascii=False)

def fetch_doi_data():
    result = []
@@ -51,65 +50,45 @@ def fetch_doi_data():
    thread = threading.Thread(target=fetch_and_store)
    thread.start()
    thread.join()
-    if len(result) == 0 or not result or result[0] == None:
-        return []
-    return result[0]
+    return result[0] if result else {}

def doi_to_pmc():
    data = json.loads(fetch_doi_data())
    pmc_data = {}
    for topic, dois in data.items():
-        if len(dois) > 0:
-            doi_list = ""
-            for doi in dois:
-                doi_list += doi + ","
-            doi_list = doi_list.rstrip(',')
-            try:
-                url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
-                doi_pmc_data = requests.get(url).json()
-            except Exception as e:
-                print(f"Error: {str(e)}")
-            if doi_pmc_data['status'] == 'ok':
-                pmc_list = []
-                for record in doi_pmc_data['records']:
-                    if 'pmcid' in record:
-                        if 'live' in record and record['live'] == False:
-                            continue
-                        pmc_list.append(record['pmcid'])
-                    else:
-                        continue
-                pmc_data[topic] = pmc_list
-            else:
-                continue
-        else:
+        if not dois:
            continue
-    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
-    return data
+        doi_list = ",".join(dois)
+        try:
+            url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
+            doi_pmc_data = requests.get(url).json()
+
+            if doi_pmc_data['status'] == 'ok':
+                pmc_list = [record['pmcid'] for record in doi_pmc_data['records'] if 'pmcid' in record and record.get('live', True)]
+                pmc_data[topic] = pmc_list[:4]
+        except Exception as e:
+            print(f"Error: {str(e)}")
+    return json.dumps(pmc_data, indent=4, ensure_ascii=False)

def extract_pmc_data():
-    if not tools.download_datafile('pmc.txt'):
+    if not utils.download_datafile('pmc.txt'):
        raise Exception("Failed to download datafile")
-    pmc_data ={}
+    pmc_data = {}
    pmcid_data = json.loads(doi_to_pmc())
    for topic, pmcids in pmcid_data.items():
        pmc_ids = []
-        for id in pmcids:
-            if len(pmc_ids) >= 3:
-                continue
-            elif tools.check_data_in_file(id, 'pmc.txt'):
-                continue
-            else:
-                tools.write_data_to_file(id, 'pmc.txt')
-                pmc_ids.append(id)
-        pmc_data[topic] = {}
-        pmc_data[topic]['count'] = len(pmc_ids)
-        pmc_data[topic]['ids'] = pmc_ids
-    data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
-    if not tools.upload_datafile('pmc.txt'):
+        for pmcid in pmcids:
+            if len(pmc_ids) >= 4:
+                break
+            if not utils.check_data_in_file(pmcid, 'pmc.txt'):
+                utils.write_data_to_file(pmcid, 'pmc.txt')
+                pmc_ids.append(pmcid)
+        pmc_data[topic] = {"ids": pmc_ids}
+    if not utils.upload_datafile('pmc.txt'):
        raise Exception("Failed to upload datafile")
-    return data
+    return json.dumps(pmc_data, indent=4, ensure_ascii=False)

if __name__ == "__main__":
    data = extract_pmc_data()
    with open('pmc_data.json', 'w') as f:
-        f.write(data)
+        f.write(data)
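
Note: the updated module now imports a sibling utils package (replacing the old tools import), but that file is not part of this commit. For running pmc.py standalone, a minimal sketch of the interface it assumes is shown below; only the function names and call shapes (fetch_page, download_datafile, check_data_in_file, write_data_to_file, upload_datafile) are taken from the call sites above, and the bodies are illustrative local-file stand-ins, not the repository's actual implementation.

# utils.py -- hypothetical stand-in for the shared helpers pmc.py imports.
# Signatures are inferred from the call sites in pmc.py; bodies are placeholders.
import os
import requests

def fetch_page(url: str) -> str:
    # pmc.py feeds the result to ET.fromstring / BeautifulSoup, so return text.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text

def download_datafile(filename: str) -> bool:
    # The real helper presumably fetches the tracking file from remote storage;
    # here we only make sure a local file exists.
    open(filename, 'a').close()
    return os.path.exists(filename)

def check_data_in_file(data: str, filename: str) -> bool:
    # True if the ID was already recorded (pmc.py skips those).
    with open(filename) as f:
        return data in {line.strip() for line in f}

def write_data_to_file(data: str, filename: str) -> None:
    # Append one ID per line, mirroring how check_data_in_file reads it back.
    with open(filename, 'a') as f:
        f.write(data + "\n")

def upload_datafile(filename: str) -> bool:
    # Placeholder for pushing the updated tracking file back to remote storage.
    return os.path.exists(filename)

With such a module one directory above pmc.py (matching the sys.path.append(... + "/../") line), running python pmc.py writes pmc_data.json: a JSON object keyed by ScienceDaily category, where each value holds the up-to-four newly seen PMC IDs collected for that topic ({"ids": [...]}).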