raannakasturi committed
Commit a4b6d0b · 1 Parent(s): 8a1664b

Refactor data extraction functions: rename extract_pmc_data to extract_sd_data, add extract_phys_data, and update file handling for new data sources

Files changed (4)
  1. arvix.py +4 -4
  2. fetch_data.py +16 -4
  3. phys.py +98 -0
  4. pmc.py +7 -7
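
For orientation, a minimal sketch (not part of the commit) of how the renamed and new entry points are consumed after this change. Each extractor returns a JSON string mapping a topic to {"ids": [...], "count": N}, mirroring the fetch_data.py wiring below:

# Usage sketch; assumes the repo modules are importable as in fetch_data.py.
import json
from pmc import extract_sd_data
from phys import extract_phys_data

sd_topics = json.loads(extract_sd_data())      # IDs tracked in sd.txt
phys_topics = json.loads(extract_phys_data())  # IDs tracked in phys.txt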
arvix.py CHANGED
@@ -55,8 +55,8 @@ def extract_data(category):
     return list(all_ids)
 
 def extract_arxiv_data():
-    if not utils.download_datafile('arxiv.txt'):
-        raise Exception("Failed to download datafile")
+    # if not utils.download_datafile('arxiv.txt'):
+    #     raise Exception("Failed to download datafile")
     categories = {
         "Astrophysics": ["astro-ph"],
         "Condensed Matter": ["cond-mat"],
@@ -93,8 +93,8 @@ def extract_arxiv_data():
         while len(category_ids) < 2:
             category_ids.add(random.choice(list(used_ids)))
         data[category] = {"ids": list(category_ids), "count": len(category_ids)}
-    if not utils.upload_datafile('arxiv.txt'):
-        raise Exception("Failed to upload datafile")
+    # if not utils.upload_datafile('arxiv.txt'):
+    #     raise Exception("Failed to upload datafile")
     return json.dumps(data, indent=4, ensure_ascii=False)
 
 if __name__ == '__main__':
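
Note that this hunk disables the arxiv.txt download/upload guards by commenting them out, so previously served arXiv IDs are no longer synced between runs. If the goal is to toggle the sync rather than drop it, a hedged alternative (SKIP_DATAFILE_SYNC is a hypothetical variable, not in the repo) would keep the error handling:

# Sketch only; SKIP_DATAFILE_SYNC is a hypothetical toggle, the commit
# itself simply comments the guards out.
import os

if os.getenv("SKIP_DATAFILE_SYNC") != "1":
    if not utils.download_datafile('arxiv.txt'):
        raise Exception("Failed to download datafile")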
fetch_data.py CHANGED
@@ -1,5 +1,6 @@
 from arvix import extract_arxiv_data
-from pmc import extract_pmc_data
+from pmc import extract_sd_data
+from phys import extract_phys_data
 import json
 import dotenv
 import os
@@ -11,8 +12,11 @@ ACCESS_KEY = os.getenv("ACCESS_KEY")
 def fetch_arxiv_data():
     return json.loads(extract_arxiv_data())
 
-def fetch_pmc_data():
-    return json.loads(extract_pmc_data())
+def fetch_sd_data():
+    return json.loads(extract_sd_data())
+
+def fetch_phys_data():
+    return json.loads(extract_phys_data())
 
 def fetch_data(user_access_key):
     if user_access_key != ACCESS_KEY:
@@ -23,10 +27,12 @@ def fetch_data(user_access_key):
         papers_data['status'] = 'success'
         papers_data['data'] = {}
         with ThreadPoolExecutor() as executor:
-            pmc_future = executor.submit(fetch_pmc_data)
+            pmc_future = executor.submit(fetch_sd_data)
             arxiv_future = executor.submit(fetch_arxiv_data)
+            phys_future = executor.submit(fetch_phys_data)
             pmc_data = pmc_future.result()
             arxiv_data = arxiv_future.result()
+            phys_data = phys_future.result()
         for topic, topic_data in pmc_data.items():
             if topic_data['count'] == 0:
                 continue
@@ -39,6 +45,12 @@ def fetch_data(user_access_key):
             else:
                 papers_data['data'][topic] = {}
                 papers_data['data'][topic]['ids'] = topic_data['ids']
+        for topic, topic_data in phys_data.items():
+            if topic_data['count'] == 0:
+                continue
+            else:
+                papers_data['data'][topic] = {}
+                papers_data['data'][topic]['ids'] = topic_data['ids']
     except Exception as e:
         print(str(e))
         papers_data['status'] = 'error'
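
One behavior worth flagging in the new merge code: if a topic name appears in both pmc_data and phys_data, the second loop reassigns papers_data['data'][topic] = {} and discards the IDs collected from the first source. Assuming both sources should contribute, a non-destructive merge could look like this sketch (same data shapes as in the diff):

# Sketch of a non-destructive merge; the committed loops overwrite instead.
for source in (pmc_data, phys_data):
    for topic, topic_data in source.items():
        if topic_data['count'] == 0:
            continue
        bucket = papers_data['data'].setdefault(topic, {'ids': []})
        for paper_id in topic_data['ids']:
            if paper_id not in bucket['ids']:
                bucket['ids'].append(paper_id)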
phys.py ADDED
@@ -0,0 +1,98 @@
+import json
+import xml.etree.ElementTree as ET
+from bs4 import BeautifulSoup
+import requests
+import threading
+import utils
+
+def fetch_links(link):
+    links = []
+    xml_data = utils.fetch_page(link)
+    items = ET.fromstring(xml_data).findall('channel/item')
+    for item in items:
+        link = item.find('link').text
+        links.append(link)
+    return links
+
+def fetch_all_links():
+    category_link_data = {
+        "Earth": "https://phys.org/rss-feed/breaking/earth-news/",
+        "Science": "https://phys.org/rss-feed/breaking/science-news/",
+        "Nano-technology": "https://phys.org/rss-feed/breaking/nanotech-news/",
+        "Physics": "https://phys.org/rss-feed/breaking/physics-news/",
+        "Astronomy & Space": "https://phys.org/rss-feed/breaking/space-news/",
+        "Biology": "https://phys.org/rss-feed/breaking/biology-news/",
+        "Chemistry": "https://phys.org/rss-feed/breaking/chemistry-news/",
+    }
+    sd_links_data = {}
+    for category, link in category_link_data.items():
+        links = fetch_links(link)
+        sd_links_data[category] = links
+    return json.dumps(sd_links_data, indent=4, ensure_ascii=False)
+
+def fetch_dois():
+    doi_data = {}
+    data = json.loads(fetch_all_links())
+    for topic, links in data.items():
+        doi_list = []
+        for link in links:
+            page_content = utils.fetch_page(link)
+            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", class_="article-main__more")
+            for page_data in page_datas:
+                doi_link = page_data.find("a", attrs={"data-doi": "1"})
+                if doi_link:
+                    doi = doi_link.text.split('DOI: ')[-1]
+                    if doi.startswith('10.'):
+                        doi_list.append(doi)
+        doi_data[topic] = doi_list
+    return json.dumps(doi_data, indent=4, ensure_ascii=False)
+
+def fetch_doi_data():
+    result = []
+    def fetch_and_store():
+        result.append(fetch_dois())
+    thread = threading.Thread(target=fetch_and_store)
+    thread.start()
+    thread.join()
+    return result[0] if result else {}
+
+def doi_to_pmc():
+    data = json.loads(fetch_doi_data())
+    pmc_data = {}
+    for topic, dois in data.items():
+        if not dois:
+            continue
+        doi_list = ",".join(dois)
+        try:
+            url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
+            doi_pmc_data = requests.get(url).json()
+
+            if doi_pmc_data['status'] == 'ok':
+                pmc_list = [record['pmcid'] for record in doi_pmc_data['records'] if 'pmcid' in record and record.get('live', True)]
+                pmc_data[topic] = pmc_list[:2]
+        except Exception as e:
+            print(f"Error: {str(e)}")
+    return json.dumps(pmc_data, indent=4, ensure_ascii=False)
+
+def extract_phys_data():
+    if not utils.download_datafile('phys.txt'):
+        raise Exception("Failed to download datafile")
+    pmc_data = {}
+    pmcid_data = json.loads(doi_to_pmc())
+    for topic, pmcids in pmcid_data.items():
+        pmc_ids = []
+        for pmcid in pmcids:
+            if len(pmc_ids) >= 2:
+                break
+            if not utils.check_data_in_file(pmcid, 'phys.txt'):
+                utils.write_data_to_file(pmcid, 'phys.txt')
+                pmc_ids.append(pmcid)
+        pmc_data[topic] = {"ids": pmc_ids, "count": len(pmc_ids)}
+    if not utils.upload_datafile('phys.txt'):
+        raise Exception("Failed to upload datafile")
+    return json.dumps(pmc_data, indent=4, ensure_ascii=False)
+
+if __name__ == "__main__":
+    data = extract_phys_data()
+    with open('phys_data.json', 'w') as f:
+        f.write(data)
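
Two small observations on the new module. First, fetch_doi_data starts a worker thread and joins it immediately, so it is functionally equivalent to calling fetch_dois() directly:

# Equivalent direct call; the thread adds no concurrency because it is
# joined right after starting.
def fetch_doi_data():
    return fetch_dois()

Second, doi_to_pmc reads only the status, records, pmcid, and live fields of the NCBI ID Converter response; abridged, the JSON it expects looks roughly like this (shape inferred from the fields the code reads):

# {
#   "status": "ok",
#   "records": [
#     {"doi": "10.xxxx/...", "pmcid": "PMC1234567", "live": true},
#     {"doi": "10.yyyy/...", "status": "error"}  # no pmcid, filtered out
#   ]
# }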
pmc.py CHANGED
@@ -66,8 +66,8 @@ def doi_to_pmc():
         print(f"Error: {str(e)}")
     return json.dumps(pmc_data, indent=4, ensure_ascii=False)
 
-def extract_pmc_data():
-    if not utils.download_datafile('pmc.txt'):
+def extract_sd_data():
+    if not utils.download_datafile('sd.txt'):
         raise Exception("Failed to download datafile")
     pmc_data = {}
     pmcid_data = json.loads(doi_to_pmc())
@@ -76,15 +76,15 @@ def extract_pmc_data():
         for pmcid in pmcids:
             if len(pmc_ids) >= 2:
                 break
-            if not utils.check_data_in_file(pmcid, 'pmc.txt'):
-                utils.write_data_to_file(pmcid, 'pmc.txt')
+            if not utils.check_data_in_file(pmcid, 'sd.txt'):
+                utils.write_data_to_file(pmcid, 'sd.txt')
                 pmc_ids.append(pmcid)
         pmc_data[topic] = {"ids": pmc_ids, "count": len(pmc_ids)}
-    if not utils.upload_datafile('pmc.txt'):
+    if not utils.upload_datafile('sd.txt'):
         raise Exception("Failed to upload datafile")
     return json.dumps(pmc_data, indent=4, ensure_ascii=False)
 
 if __name__ == "__main__":
-    data = extract_pmc_data()
-    with open('pmc_data.json', 'w') as f:
+    data = extract_sd_data()
+    with open('sd_data.json', 'w') as f:
         f.write(data)
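
Since only the function name and the tracking file change here (pmc.txt to sd.txt), callers migrate one-for-one. A minimal smoke test for the rename might look like this hypothetical snippet (assumes network access and configured utils datafile helpers):

# Hypothetical smoke test; not part of the commit.
import json
from pmc import extract_sd_data

data = json.loads(extract_sd_data())
for topic, info in data.items():
    assert info["count"] == len(info["ids"])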