C2MV committed
Commit 574ae04
1 parent: 9986610

Create App.py

Files changed (1)
  1. App.py +392 -0
App.py ADDED
@@ -0,0 +1,392 @@
import os
import re
import time
import logging
import zipfile
import requests
import bibtexparser
from tqdm import tqdm
from urllib.parse import quote, urlencode
import gradio as gr
from bs4 import BeautifulSoup
import io

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

class PaperDownloader:
    def __init__(self, output_dir='papers'):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        # Updated download sources
        self.download_sources = [
            'https://sci-hub.ee/',
            'https://sci-hub.st/',
            'https://sci-hub.ru/',
            'https://sci-hub.ren/',
            'https://sci-hub.mksa.top/',
            'https://sci-hub.se/',
            'https://libgen.rs/scimag/'
        ]

        # Request headers
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
        }

    def clean_doi(self, doi):
        """Clean and encode DOI for URL"""
        return quote(doi.strip()) if doi else None

    def download_paper_scihub(self, doi):
        """Improved method to download paper from Sci-Hub"""
        if not doi:
            logger.warning("DOI not provided")
            return None

        for base_url in self.download_sources:
            try:
                scihub_url = f"{base_url}{self.clean_doi(doi)}"

                # Request with more tolerance
                response = requests.get(scihub_url,
                                        headers=self.headers,
                                        allow_redirects=True,
                                        timeout=15)

                # Search for multiple PDF URL patterns
                pdf_patterns = [
                    r'(https?://[^\s<>"]+?\.pdf)',
                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
                ]

                pdf_urls = []
                for pattern in pdf_patterns:
                    pdf_urls.extend(re.findall(pattern, response.text))

                # Try downloading from found URLs
                for pdf_url in pdf_urls:
                    try:
                        pdf_response = requests.get(pdf_url,
                                                    headers=self.headers,
                                                    timeout=10)

                        # Verify it's a PDF
                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                            logger.debug(f"Found PDF from: {pdf_url}")
                            return pdf_response.content
                    except Exception as e:
                        logger.debug(f"Error downloading PDF from {pdf_url}: {e}")

            except Exception as e:
                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")

        return None

    def download_paper_libgen(self, doi):
        """Download from Libgen; handles the query and the redirection"""
        if not doi:
            return None

        base_url = 'https://libgen.rs/scimag/'
        try:
            search_url = f"{base_url}?q={self.clean_doi(doi)}"
            response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
            response.raise_for_status()

            if "No results" in response.text:
                logger.debug(f"No results for DOI: {doi} on libgen")
                return None

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the link using a specific selector
            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')

            if links:
                link = links[0]
                pdf_url = link['href']
                pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                    logger.debug(f"Found PDF from: {pdf_url}")
                    return pdf_response.content

        except Exception as e:
            logger.debug(f"Error trying to download {doi} from libgen: {e}")
        return None

    def download_paper_google_scholar(self, doi):
        """Search Google Scholar for an article with the given DOI and try to get the PDF"""
        if not doi:
            return None

        try:
            query = f'doi:"{doi}"'
            params = {'q': query}
            url = f'https://scholar.google.com/scholar?{urlencode(params)}'

            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find any links with [PDF]
            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))

            if links:
                pdf_url = links[0]['href']
                pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                    logger.debug(f"Found PDF from: {pdf_url}")
                    return pdf_response.content
        except Exception as e:
            logger.debug(f"Google Scholar error for {doi}: {e}")

        return None

    def download_paper_crossref(self, doi):
        """Alternative search method using Crossref"""
        if not doi:
            return None

        try:
            # Search for an open access link
            url = f"https://api.crossref.org/works/{doi}"
            response = requests.get(url, headers=self.headers, timeout=10)

            if response.status_code == 200:
                data = response.json()
                work = data.get('message', {})

                # Search for open access links
                links = work.get('link', [])
                for link in links:
                    if link.get('content-type') == 'application/pdf':
                        pdf_url = link.get('URL')
                        if pdf_url:
                            pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
                            if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                                logger.debug(f"Found PDF from: {pdf_url}")
                                return pdf_response.content

        except Exception as e:
            logger.debug(f"Crossref error for {doi}: {e}")

        return None

    def download_with_retry(self, doi, max_retries=3, initial_delay=2):
        """Downloads a paper using multiple strategies with exponential backoff"""
        pdf_content = None
        retries = 0
        delay = initial_delay

        while retries < max_retries and not pdf_content:
            try:
                pdf_content = (
                    self.download_paper_scihub(doi) or
                    self.download_paper_libgen(doi) or
                    self.download_paper_google_scholar(doi) or
                    self.download_paper_crossref(doi)
                )

                if pdf_content:
                    return pdf_content
            except Exception as e:
                logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")

            if not pdf_content:
                retries += 1
                logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
                time.sleep(delay)
                delay *= 2  # Exponential backoff

        return None

    def download_single_doi(self, doi):
        """Downloads a single paper using a DOI"""
        if not doi:
            return None, "Error: DOI not provided", "Error: DOI not provided"

        try:
            pdf_content = self.download_with_retry(doi)

            if pdf_content:
                filename = f"{doi.replace('/', '_').replace('.', '_')}.pdf"
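                # e.g. DOI 10.1038/nature12373 is saved as 10_1038_nature12373.pdf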
                filepath = os.path.join(self.output_dir, filename)
                with open(filepath, 'wb') as f:
                    f.write(pdf_content)
                logger.info(f"Successfully downloaded: {filename}")
                return filepath, "Successfully downloaded", ""
            else:
                logger.warning(f"Could not download: {doi}")
                return None, f"Could not download {doi}", f"Could not download {doi}"

        except Exception as e:
            logger.error(f"Error processing {doi}: {e}")
            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"

    def download_multiple_dois(self, dois_text):
        """Downloads multiple papers from a list of DOIs"""
        if not dois_text:
            return None, "Error: No DOIs provided", "Error: No DOIs provided"

        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
        if not dois:
            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"

        downloaded_files = []
        failed_dois = []
        for doi in tqdm(dois, desc="Downloading papers"):
            filepath, success_message, fail_message = self.download_single_doi(doi)
            if filepath:
                downloaded_files.append(filepath)
            else:
                failed_dois.append(doi)

        if downloaded_files:
            zip_filename = 'papers.zip'
            with zipfile.ZipFile(zip_filename, 'w') as zipf:
                for file_path in downloaded_files:
                    zipf.write(file_path, arcname=os.path.basename(file_path))
            logger.info(f"ZIP file created: {zip_filename}")

        return zip_filename if downloaded_files else None, "\n".join(downloaded_files), "\n".join(failed_dois)

    def process_bibtex(self, bib_file):
        """Process BibTeX file and download papers with multiple strategies"""
        # Read BibTeX file content from the uploaded object
        try:
            with open(bib_file.name, 'r', encoding='utf-8') as f:
                bib_content = f.read()
        except Exception as e:
            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}"

        # Parse BibTeX data
        try:
            bib_database = bibtexparser.loads(bib_content)
        except Exception as e:
            logger.error(f"Error parsing BibTeX data: {e}")
            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}"

        # Extract DOIs
        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
        logger.info(f"Found {len(dois)} DOIs to download")

        # Result lists
        downloaded_files = []
        failed_dois = []

        # Download PDFs
        for doi in tqdm(dois, desc="Downloading papers"):
            try:
                # Try to download with multiple methods, with retries
                pdf_content = self.download_with_retry(doi)

                # Save PDF
                if pdf_content:
                    filename = f"{doi.replace('/', '_').replace('.', '_')}.pdf"
                    filepath = os.path.join(self.output_dir, filename)

                    with open(filepath, 'wb') as f:
                        f.write(pdf_content)

                    downloaded_files.append(filepath)
                    logger.info(f"Successfully downloaded: {filename}")
                else:
                    failed_dois.append(doi)
                    logger.warning(f"Could not download: {doi}")

            except Exception as e:
                failed_dois.append(doi)
                logger.error(f"Error processing {doi}: {e}")

        # Create ZIP of downloaded papers
        if downloaded_files:
            zip_filename = 'papers.zip'
            with zipfile.ZipFile(zip_filename, 'w') as zipf:
                for file_path in downloaded_files:
                    zipf.write(file_path, arcname=os.path.basename(file_path))
            logger.info(f"ZIP file created: {zip_filename}")

        return zip_filename if downloaded_files else None, "\n".join(downloaded_files), "\n".join(failed_dois)


def create_gradio_interface():
    """Create Gradio interface for Paper Downloader"""
    downloader = PaperDownloader()

    def download_papers(bib_file, doi_input, dois_input):
        if bib_file:
            # Check file type
            if not bib_file.name.lower().endswith('.bib'):
                return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None

            zip_path, downloaded_dois, failed_dois = downloader.process_bibtex(bib_file)
            return zip_path, downloaded_dois, failed_dois, None
        elif doi_input:
            filepath, message, failed_doi = downloader.download_single_doi(doi_input)
            return None, message, failed_doi, filepath
        elif dois_input:
            zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
            return zip_path, downloaded_dois, failed_dois, None
        else:
            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None

    # Gradio Interface
    interface = gr.Interface(
        fn=download_papers,
        inputs=[
            gr.File(file_types=['.bib'], label="Upload BibTeX File"),
            gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
            gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
        ],
        outputs=[
            gr.File(label="Download Papers (ZIP) or Single PDF"),
            gr.Textbox(label="Downloaded DOIs/Message"),
            gr.Textbox(label="Failed DOIs"),
            gr.File(label="Downloaded Single PDF")
        ],
        title="🔬 Academic Paper Batch Downloader",
        description="Upload a BibTeX file or enter DOIs to download PDFs. The app attempts to fetch each PDF from multiple sources: Sci-Hub, Libgen, Google Scholar, and Crossref. Any one of the three inputs can be used at a time.",
        theme="soft",
        examples=[
            ["example.bib", None, None],  # BibTeX file
            [None, "10.1038/nature12373", None],  # Single DOI
            [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"],  # Multiple DOIs
        ],
        css="""
        .gradio-container {
            background-color: #f4f4f4;
        }
        .gr-interface {
            max-width: 800px;
            margin: 0 auto;
        }
        .gr-box {
            background-color: white;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        }
        """
    )

    return interface

def main():
    interface = create_gradio_interface()
    interface.launch(share=True)

if __name__ == "__main__":
    main()
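
The Gradio UI is optional: the PaperDownloader class can also be driven directly from Python. A minimal sketch, assuming the file above is saved as App.py on the import path (the DOIs are taken from the interface examples):

    from App import PaperDownloader

    downloader = PaperDownloader(output_dir='papers')

    # Single DOI: returns (filepath, success_message, failure_message)
    filepath, ok, fail = downloader.download_single_doi("10.1038/nature12373")
    print(filepath or fail)

    # Several DOIs, newline-separated: returns (zip_path_or_None,
    # newline-joined downloaded paths, newline-joined failed DOIs)
    zip_path, downloaded, failed = downloader.download_multiple_dois(
        "10.1109/5.771073\n10.3390/horticulturae8080677"
    )
    print(zip_path, failed or "no failures")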