raannakasturi committed
Commit c680313
1 Parent(s): c440916

Add initial project structure with core functionality and dependencies

Files changed (7)
  1. .gitignore +8 -0
  2. app.py +28 -0
  3. arvix.py +113 -0
  4. fetch_data.py +45 -0
  5. pmc.py +108 -0
  6. requirements.txt +5 -0
  7. tools.py +74 -0
.gitignore ADDED
@@ -0,0 +1,8 @@
+ /.cache
+ /__pycache__
+ .env
+ .env.local
+ .env.development.local
+ .env.test.local
+ .env.production.local
+ .DS_Store
app.py ADDED
@@ -0,0 +1,28 @@
+ import gradio as gr
+ from fetch_data import fetch_data
+ from tools import reset_datafiles
+
+ theme = gr.themes.Soft(
+     primary_hue="purple",
+     secondary_hue="cyan",
+     neutral_hue="slate",
+     font=[
+         gr.themes.GoogleFont('Syne'),
+         gr.themes.GoogleFont('Poppins'),
+         gr.themes.GoogleFont('Poppins'),
+         gr.themes.GoogleFont('Poppins')
+     ],
+ )
+
+ with gr.Blocks(theme=theme, title="fetch Research Paper IDS") as app:
+     with gr.Row():
+         with gr.Column():
+             user_access_key = gr.Textbox(label="Access Key", placeholder="Enter your access key", type="password")
+             with gr.Row():
+                 fetch_data_btn = gr.Button(value="Fetch Data")
+                 reset_files = gr.Button(value="Reset Files")
+     raw_data = gr.Textbox(lines=15, label="Raw IDs Data", interactive=False, placeholder="IDs starting with PMC are PMC IDs and rest all are Arxiv IDs", show_copy_button=True)
+     fetch_data_btn.click(fn=fetch_data, inputs=[user_access_key], outputs=[raw_data], api_name="fetch_paper_ids")
+     reset_files.click(fn=reset_datafiles, inputs=[user_access_key], outputs=[raw_data], show_api=False)
+
+ app.queue(default_concurrency_limit=25).launch(max_threads=5000)
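
The click handler above exposes fetch_data as an API endpoint named fetch_paper_ids. A minimal client-side sketch of calling it with gradio_client, assuming the app is deployed as a Hugging Face Space (the Space id and key below are placeholders, not part of this commit):

# Hypothetical client; replace the Space id and access key with real values.
from gradio_client import Client

client = Client("raannakasturi/your-space-name")  # placeholder Space id
raw_ids = client.predict("your-access-key", api_name="/fetch_paper_ids")
print(raw_ids)  # JSON string mapping each topic to its paper IDs
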
arvix.py ADDED
@@ -0,0 +1,113 @@
+ import json
+ import random
+ import tools
+ from bs4 import BeautifulSoup
+
+ def fetch_new_page(category):
+     url = f'https://arxiv.org/list/{category}/new'
+     return tools.fetch_page(url)
+
+ def fetch_recent_page(category):
+     url = f'https://arxiv.org/list/{category}/recent'
+     return tools.fetch_page(url)
+
+ def extract_new_data(category):
+     # Scrape paper IDs from the "new submissions" listing for a category.
+     paper_ids = []
+     page_content = fetch_new_page(category)
+     lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
+     for list in lists:
+         papers = list.find_all('dt')
+         paper_contents = list.find_all('dd')
+         titles = [paper_content.find('div', class_='list-title').text.strip().split('Title:')[-1].strip() for paper_content in paper_contents]
+         for paper, title in zip(papers, titles):
+             if not tools.verify_simple_title(title):
+                 continue
+             else:
+                 paper_link = paper.find('a', href=True)
+                 if paper_link:
+                     paper_id = paper_link.text.strip().split(':')[1]
+                     paper_ids.append(paper_id)
+                 else:
+                     continue
+     return paper_ids
+
+ def extract_recent_data(category):
+     # Scrape paper IDs from the "recent" listing for a category.
+     paper_ids = []
+     page_content = fetch_recent_page(category)
+     lists = BeautifulSoup(page_content, 'html.parser').find_all('dl')
+     for list in lists:
+         papers = list.find_all('dt')
+         for paper in papers:
+             paper_link = paper.find('a', href=True)
+             if paper_link:
+                 paper_id = paper_link.text.strip().split(':')[1]
+                 paper_ids.append(paper_id)
+             else:
+                 continue
+     return paper_ids
+
+ def extract_data(category):
+     # Merge new + recent IDs, skip ones already recorded in arxiv.txt, keep at most 12.
+     sanitized_data = []
+     new_data = extract_new_data(category)
+     recent_data = extract_recent_data(category)
+     data = list(set(new_data + recent_data))
+     for id in data:
+         if len(sanitized_data) >= 12:
+             break
+         if category in ["hep-ex", "hep-lat", "hep-ph", "hep-th"]:
+             id = id[:3]
+         if tools.check_data_in_file(id, 'arxiv.txt'):
+             continue
+         else:
+             tools.write_data_to_file(id, 'arxiv.txt')
+             sanitized_data.append(id)
+     random.shuffle(sanitized_data)
+     return sanitized_data[:12]
+
+ def extract_arxiv_data():
+     if not tools.download_datafile('arxiv.txt'):
+         raise Exception("Failed to download datafile")
+     categories = {
+         "Astrophysics": ["astro-ph"],
+         "Condensed Matter": ["cond-mat"],
+         "General Relativity and Quantum Cosmology": ["gr-qc"],
+         "High Energy Physics": ["hep-ex", "hep-lat", "hep-ph", "hep-th"],
+         "Mathematical Physics": ["math-ph"],
+         "Nonlinear Sciences": ["nlin"],
+         "Nuclear Experiment": ["nucl-ex"],
+         "Nuclear Theory": ["nucl-th"],
+         "Physics": ["physics"],
+         "Quantum Physics": ["quant-ph"],
+         "Mathematics": ["math"],
+         "Computer Science": ["cs"],
+         "Quantitative Biology": ["q-bio"],
+         "Quantitative Finance": ["q-fin"],
+         "Statistics": ["stat"],
+         "Electrical Engineering and Systems Science": ["eess"],
+         "Economics": ["econ"]
+     }
+     data = {}
+     for category, subcategories in categories.items():
+         category_data = {}
+         all_ids = []
+         for subcategory in subcategories:
+             ids = extract_data(subcategory)
+             for id in ids:
+                 all_ids.append(id)
+         if len(all_ids) > 12:
+             print(f"Found more than 12 papers for {category}. Randomly selecting 12 papers.")
+             random.shuffle(all_ids)
+             all_ids = all_ids[:12]
+         category_data['count'] = len(all_ids)
+         category_data['ids'] = all_ids
+         data[category] = category_data
+     data = json.dumps(data, indent=4, ensure_ascii=False)
+     if not tools.upload_datafile('arxiv.txt'):
+         raise Exception("Failed to upload datafile")
+     return data
+
+
+ if __name__ == '__main__':
+     data = extract_arxiv_data()
+     with open('arxiv_data.json', 'w') as f:
+         f.write(data)
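
extract_arxiv_data() returns a JSON string keyed by category, each entry carrying a count and a list of ids. A small consumer sketch, assuming HF_API_TOKEN is configured so the arxiv.txt datafile can be downloaded:

import json
from arvix import extract_arxiv_data

arxiv_json = json.loads(extract_arxiv_data())
for category, info in arxiv_json.items():
    print(category, info['count'], info['ids'])  # up to 12 IDs per category
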
fetch_data.py ADDED
@@ -0,0 +1,45 @@
+ from arvix import extract_arxiv_data
+ from pmc import extract_pmc_data
+ import json
+ import dotenv
+ import os
+ from concurrent.futures import ThreadPoolExecutor
+
+ dotenv.load_dotenv()
+ ACCESS_KEY = os.getenv("ACCESS_KEY")
+
+ def fetch_arxiv_data():
+     return json.loads(extract_arxiv_data())
+
+ def fetch_pmc_data():
+     return json.loads(extract_pmc_data())
+
+ def fetch_data(user_access_key):
+     if user_access_key != ACCESS_KEY:
+         papers_data = {"error": "Invalid access key"}
+     else:
+         papers_data = {}
+         # Scrape the arXiv and PMC sources concurrently.
+         with ThreadPoolExecutor() as executor:
+             arxiv_future = executor.submit(fetch_arxiv_data)
+             pmc_future = executor.submit(fetch_pmc_data)
+             arxiv_data = arxiv_future.result()
+             pmc_data = pmc_future.result()
+         for topic, topic_data in arxiv_data.items():
+             if topic_data['count'] == 0:
+                 continue
+             else:
+                 papers_data[topic] = {}
+                 papers_data[topic]['ids'] = topic_data['ids']
+         for topic, topic_data in pmc_data.items():
+             if topic_data['count'] == 0:
+                 continue
+             else:
+                 papers_data[topic] = {}
+                 papers_data[topic]['ids'] = topic_data['ids']
+     data = json.dumps(papers_data, indent=4, ensure_ascii=False)
+     return data
+
+ if __name__ == '__main__':
+     data = fetch_data(ACCESS_KEY)  # fetch_data() requires the access key
+     with open('data.json', 'w') as f:
+         f.write(data)
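
A quick local check of the merged output, assuming a valid ACCESS_KEY is present in .env (a sketch, not part of the commit):

import json
import os
import dotenv
from fetch_data import fetch_data

dotenv.load_dotenv()
merged = json.loads(fetch_data(os.getenv("ACCESS_KEY")))
for topic, payload in merged.items():
    print(topic, len(payload['ids']))  # arXiv and PMC topics in one dict
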
pmc.py ADDED
@@ -0,0 +1,108 @@
+ import json
+ import xml.etree.ElementTree as ET
+ from bs4 import BeautifulSoup
+ import requests
+ import tools
+ import threading
+
+ def fetch_links(category):
+     # Pull article links from the ScienceDaily RSS feed for a category.
+     links = []
+     xml_data = tools.fetch_page(f"https://www.sciencedaily.com/rss/top/{category.lower()}.xml")
+     items = ET.fromstring(xml_data).findall('channel/item')
+     for item in items:
+         link = item.find('link').text
+         links.append(link)
+     return links
+
+ def fetch_all_links():
+     categories = ["Science", "Health", "Environment", "Technology", "Society"]
+     sd_links_data = {}
+     for category in categories:
+         links = fetch_links(category)
+         sd_links_data[category] = links
+     data = json.dumps(sd_links_data, indent=4, ensure_ascii=False)
+     return data
+
+ def fetch_dois():
+     # Extract DOIs from the journal-references block of each linked article.
+     doi_data = {}
+     data = json.loads(fetch_all_links())
+     for topic, links in data.items():
+         doi_list = []
+         for link in links:
+             page_content = tools.fetch_page(link)
+             page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", id="journal_references")
+             for page_data in page_datas:
+                 doi = page_data.find("a", href=True).text
+                 if doi.startswith('10.'):
+                     doi_list.append(doi)
+                 else:
+                     continue
+         doi_data[topic] = doi_list
+     data = json.dumps(doi_data, indent=4, ensure_ascii=False)
+     return data
+
+ def fetch_doi_data():
+     result = []
+     def fetch_and_store():
+         result.append(fetch_dois())
+     thread = threading.Thread(target=fetch_and_store)
+     thread.start()
+     thread.join()
+     return result[0]
+
+ def doi_to_pmc():
+     # Convert DOIs to PMC IDs via the NCBI ID Converter API.
+     data = json.loads(fetch_doi_data())
+     pmc_data = {}
+     for topic, dois in data.items():
+         if len(dois) > 0:
+             doi_list = ""
+             for doi in dois:
+                 doi_list += doi + ","
+             doi_list = doi_list.rstrip(',')
+             try:
+                 url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={doi_list}&format=json"
+                 doi_pmc_data = requests.get(url).json()
+             except Exception as e:
+                 print(f"Error: {str(e)}")
+                 continue  # skip this topic if the request failed
+             if doi_pmc_data['status'] == 'ok':
+                 pmc_list = []
+                 for record in doi_pmc_data['records']:
+                     if 'pmcid' in record:
+                         if 'live' in record and record['live'] == False:
+                             continue
+                         pmc_list.append(record['pmcid'])
+                     else:
+                         continue
+                 pmc_data[topic] = pmc_list
+             else:
+                 continue
+         else:
+             continue
+     data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
+     return data
+
+ def extract_pmc_data():
+     # Deduplicate against pmc.txt (stored in the HF dataset) and build the final payload.
+     if not tools.download_datafile('pmc.txt'):
+         raise Exception("Failed to download datafile")
+     pmc_data = {}
+     pmcid_data = json.loads(doi_to_pmc())
+     for topic, pmcids in pmcid_data.items():
+         pmc_ids = []
+         for id in pmcids:
+             if tools.check_data_in_file(id, 'pmc.txt'):
+                 continue
+             else:
+                 tools.write_data_to_file(id, 'pmc.txt')
+                 pmc_ids.append(id)
+         pmc_data[topic] = {}
+         pmc_data[topic]['count'] = len(pmc_ids)
+         pmc_data[topic]['ids'] = pmc_ids
+     data = json.dumps(pmc_data, indent=4, ensure_ascii=False)
+     if not tools.upload_datafile('pmc.txt'):
+         raise Exception("Failed to upload datafile")
+     return data
+
+ if __name__ == "__main__":
+     data = extract_pmc_data()
+     with open('pmc_data.json', 'w') as f:
+         f.write(data)
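
doi_to_pmc() batches DOIs through the NCBI ID Converter API at the URL shown above. A standalone sketch of that lookup with a single placeholder DOI (10.1000/xyz123 is not a real article):

import requests

url = ("https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
       "?email=raannakasturi@gmail.com&ids=10.1000/xyz123&format=json")
response = requests.get(url).json()
if response['status'] == 'ok':
    for record in response['records']:
        print(record.get('pmcid', 'no PMCID for this DOI'))
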
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio==5.8.0
+ python-dotenv==1.0.1
+ beautifulsoup4==4.12.3
+ requests==2.32.3
+ huggingface-hub==0.27.0
tools.py ADDED
@@ -0,0 +1,74 @@
+ import requests
+ import re
+ import os
+ import dotenv
+ from huggingface_hub import HfApi
+
+ dotenv.load_dotenv()
+ hf_token = os.getenv("HF_API_TOKEN")
+ access_key = os.getenv("ACCESS_KEY")
+ api = HfApi(token=hf_token)
+
+ def fetch_page(url):
+     HEADERS = {
+         'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
+     }
+     page_content = requests.get(url, headers=HEADERS).content
+     return page_content
+
+ def check_data_in_file(data, file):
+     with open(file, 'r') as f:
+         existing_data = f.read().splitlines()
+     if data in existing_data:
+         return True
+     else:
+         return False
+
+ def write_data_to_file(data, file):
+     with open(file, 'a') as f:
+         f.write(data + '\n')
+     return True
+
+ def verify_simple_title(title):
+     pattern = re.compile(r'^[a-zA-Z0-9\s\.\-\+\*/=\(\)\[\]\{\},:;"\'?\>\<\@\#\%\^\*\|\_\~\`]+$')
+     if pattern.match(title):
+         return True
+     else:
+         return False
+
+ def download_datafile(filename):
+     try:
+         api.hf_hub_download(repo_id="raannakasturi/ReXploreData", filename=filename, repo_type="dataset", local_dir='.', cache_dir='.', force_download=True)
+         return True
+     except Exception as e:
+         print(str(e))
+         return False
+
+ def upload_datafile(filename):
+     try:
+         api.upload_file(path_or_fileobj=filename, path_in_repo=filename, repo_id="raannakasturi/ReXploreData", repo_type="dataset")
+         os.remove(filename)
+         return True
+     except Exception as e:
+         print(str(e))
+         return False
+
+ def reset_datafiles(user_access_key):
+     if user_access_key != access_key:
+         return "Invalid access key"
+     else:
+         files = ['arxiv.txt', 'pmc.txt']
+         try:
+             for filename in files:
+                 try:
+                     download_datafile(filename)
+                     with open(filename, 'w') as f:
+                         f.write('')
+                     upload_datafile(filename)
+                 except Exception as e:
+                     print(str(e))
+                     continue
+             return True
+         except Exception as e:
+             print(str(e))
+             return False
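
tools.py reads two secrets through python-dotenv: HF_API_TOKEN for the ReXploreData dataset and ACCESS_KEY for gating the Gradio endpoints. A minimal sketch for verifying a local .env before launching the app (the variable names come from this commit; everything else is illustrative):

import os
import dotenv

dotenv.load_dotenv()  # expects .env with HF_API_TOKEN=... and ACCESS_KEY=...
for var in ("HF_API_TOKEN", "ACCESS_KEY"):
    print(var, "is set" if os.getenv(var) else "is MISSING")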