C2MV committed
Commit a14a3e4
1 Parent(s): bf73a1e

Update app.py

Files changed (1)
app.py +741 -111
app.py CHANGED
@@ -1,112 +1,742 @@
-    async def download_papers(bib_file, doi_input, dois_input, progress=gr.Progress()):
-        if bib_file:
-            # Check file type
-            if not bib_file.name.lower().endswith('.bib'):
-                return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
-
-            zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file, progress)
-            return zip_path, downloaded_dois, failed_dois, None
-        elif doi_input:
-            filepath, message, failed_doi = downloader.download_single_doi(doi_input, progress)
-            return None, message, failed_doi, filepath
-        elif dois_input:
-            zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input, progress)
-            return zip_path, downloaded_dois, failed_dois, None
-        else:
-            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
-
-    # Gradio Interface
-    interface = gr.Interface(
-        fn=download_papers,
-        inputs=[
-            gr.File(file_types=['.bib'], label="Upload BibTeX File"),
-            gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
-            gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
-        ],
-        outputs=[
-            gr.File(label="Download Papers (ZIP) or Single PDF"),
-            gr.HTML(label="""
-                <div style='padding-bottom: 5px; font-weight: bold;'>
-                    Found DOIs
-                </div>
-                <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
-                    <div id="downloaded-dois"></div>
-                </div>
-            """),
-            gr.HTML(label="""
-                <div style='padding-bottom: 5px; font-weight: bold;'>
-                    Missed DOIs
-                </div>
-                <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
-                    <div id="failed-dois"></div>
-                </div>
-            """),
-            gr.File(label="Downloaded Single PDF")
-        ],
-        title="🔬 Academic Paper Batch Downloader",
-        description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
-        theme="Hev832/Applio",
-        examples=[
-            ["example.bib", None, None],  # Bibtex File
-            [None, "10.1038/nature12373", None],  # Single DOI
-            [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"],  # Multiple DOIs
-        ],
-        css="""
-        .gradio-container {
-            background-color: black;
-        }
-        .gr-interface {
-            max-width: 800px;
-            margin: 0 auto;
-        }
-        .gr-box {
-            background-color: black;
-            border-radius: 10px;
-            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-        }
-        .output-text a {
-            color: #007bff; /* Blue color for hyperlinks */
         }
-        """,
-        cache_examples=False,
-    )
-
-    # Add Javascript to update HTML
-    interface.load = """
-    function(downloaded_dois, failed_dois) {
-        let downloaded_html = '';
-        downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
-            downloaded_html += doi + '<br>';
-        });
-        document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
-
-        let failed_html = '';
-        failed_dois.split('\\n').filter(Boolean).forEach(doi => {
-            failed_html += doi + '<br>';
-        });
-        document.querySelector("#failed-dois").innerHTML = failed_html;
-        return [downloaded_html, failed_html];
-    }
-    """
-
-    interface.head = """
-    <script>
-    function copyLink(button) {
-        const linkElement = button.previousElementSibling;
-        const link = linkElement.href;
-        navigator.clipboard.writeText(link)
-            .then(() => {
-                button.innerText = '✓ Copied';
-                button.style.color = 'green';
-                setTimeout(() => {
-                    button.innerText = 'Copy';
-                    button.style.color = '';
-                }, 2000);
-            })
-            .catch(err => {
-                console.error('Failed to copy link: ', err);
-            });
-    }
-    </script>
-    """
-    return interface
+import os
+import re
+import time
+import logging
+import zipfile
+import requests
+import bibtexparser
+from tqdm import tqdm
+from urllib.parse import quote, urlencode
+import gradio as gr
+from bs4 import BeautifulSoup
+import io
+import asyncio
+import aiohttp
+
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+
+class PaperDownloader:
+    def __init__(self, output_dir='papers'):
+        self.output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Updated download sources
+        self.download_sources = [
+            'https://sci-hub.ee/',
+            'https://sci-hub.st/',
+            'https://sci-hub.ru/',
+            'https://sci-hub.ren/',
+            'https://sci-hub.mksa.top/',
+            'https://sci-hub.se/',
+            'https://libgen.rs/scimag/'
+        ]
+
+        # Request headers
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
+        }
+
+    def clean_doi(self, doi):
+        """Clean and encode DOI for URL"""
+        if not isinstance(doi, str):
+            return None
+        return quote(doi.strip()) if doi else None
+
+    async def fetch_with_headers(self, session, url, timeout=10):
+        """Utility method to fetch a URL with headers and a timeout"""
+        try:
+            async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
+                response.raise_for_status()
+                return await response.text(), response.headers
+        except Exception as e:
+            logger.debug(f"Error fetching {url}: {e}")
+            return None, None
+
+
+    async def download_paper_direct_doi_async(self, session, doi):
+        """Attempt to download the PDF from the landing page of the DOI"""
+        if not doi:
+            return None
+
+        try:
+            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
+            if not text:
+                return None
+
+            pdf_patterns = [
+                r'(https?://[^\s<>"]+?\.pdf)',
+                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+            ]
+
+            pdf_urls = []
+            for pattern in pdf_patterns:
+                pdf_urls.extend(re.findall(pattern, text))
+
+            for pdf_url in pdf_urls:
+                try:
+                    pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                    if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                        logger.debug(f"Found PDF from: {pdf_url}")
+                        return await pdf_response.read()
+                except Exception as e:
+                    logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+
+        except Exception as e:
+            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+
+        return None
+
+    async def download_paper_scihub_async(self, session, doi):
+        """Improved method to download a paper from Sci-Hub using async requests"""
+        if not doi:
+            logger.warning("DOI not provided")
+            return None
+
+        for base_url in self.download_sources:
+            try:
+                scihub_url = f"{base_url}{self.clean_doi(doi)}"
+                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
+                if not text:
+                    continue
+
+                # Search for multiple PDF URL patterns
+                pdf_patterns = [
+                    r'(https?://[^\s<>"]+?\.pdf)',
+                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                ]
+
+                pdf_urls = []
+                for pattern in pdf_patterns:
+                    pdf_urls.extend(re.findall(pattern, text))
+
+                # Try downloading from found URLs
+                for pdf_url in pdf_urls:
+                    try:
+                        pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                        # Verify if it's a PDF
+                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                            logger.debug(f"Found PDF from: {pdf_url}")
+                            return await pdf_response.read()
+                    except Exception as e:
+                        logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+            except Exception as e:
+                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
+        return None
+
+    async def download_paper_libgen_async(self, session, doi):
+        """Download from Libgen, handling the query and the redirection"""
+        if not doi:
+            return None
+
+        base_url = 'https://libgen.rs/scimag/'
+        try:
+            search_url = f"{base_url}?q={self.clean_doi(doi)}"
+            text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
+
+            if not text or "No results" in text:
+                logger.debug(f"No results for DOI: {doi} on libgen")
+                return None
+
+            soup = BeautifulSoup(text, 'html.parser')
+
+            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
+
+            if links:
+                link = links[0]
+                pdf_url = link['href']
+                pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return await pdf_response.read()
+        except Exception as e:
+            logger.debug(f"Error trying to download {doi} from libgen: {e}")
+        return None
+
+    async def download_paper_google_scholar_async(self, session, doi):
+        """Search Google Scholar for an article with the given DOI and try to get the PDF"""
+        if not doi:
+            return None
+
+        try:
+            query = f'doi:"{doi}"'
+            params = {'q': query}
+            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
+
+            text, headers = await self.fetch_with_headers(session, url, timeout=10)
+            if not text:
+                return None
+
+            soup = BeautifulSoup(text, 'html.parser')
+
+            # Find any links with [PDF]
+            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+
+            if links:
+                pdf_url = links[0]['href']
+                pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return await pdf_response.read()
+        except Exception as e:
+            logger.debug(f"Google Scholar error for {doi}: {e}")
+
+        return None
+
+    async def download_paper_crossref_async(self, session, doi):
+        """Alternative search method using Crossref"""
+        if not doi:
+            return None
+
+        try:
+            # Search for open access link
+            url = f"https://api.crossref.org/works/{doi}"
+            response = await session.get(url, headers=self.headers, timeout=10)
+
+            if response.status == 200:
+                data = await response.json()
+                work = data.get('message', {})
+
+                # Search for open access links
+                links = work.get('link', [])
+                for link in links:
+                    if link.get('content-type') == 'application/pdf':
+                        pdf_url = link.get('URL')
+                        if pdf_url:
+                            pdf_response = await session.get(pdf_url, headers=self.headers)
+                            if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                                logger.debug(f"Found PDF from: {pdf_url}")
+                                return await pdf_response.read()
+
+        except Exception as e:
+            logger.debug(f"Crossref error for {doi}: {e}")
+
+        return None
+
+    async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
+        """Downloads a paper using multiple strategies with exponential backoff and async requests"""
+        pdf_content = None
+        retries = 0
+        delay = initial_delay
+
+        async with aiohttp.ClientSession() as session:
+            while retries < max_retries and not pdf_content:
+                try:
+                    pdf_content = (
+                        await self.download_paper_direct_doi_async(session, doi) or
+                        await self.download_paper_scihub_async(session, doi) or
+                        await self.download_paper_libgen_async(session, doi) or
+                        await self.download_paper_google_scholar_async(session, doi) or
+                        await self.download_paper_crossref_async(session, doi)
+
+                    )
+                    if pdf_content:
+                        return pdf_content
+                except Exception as e:
+                    logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+
+                if not pdf_content:
+                    retries += 1
+                    logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
+                    await asyncio.sleep(delay)
+                    delay *= 2  # Exponential backoff
+
+        return None
+
+    def download_paper_scihub(self, doi):
+        """Improved method to download a paper from Sci-Hub"""
+        if not doi:
+            logger.warning("DOI not provided")
+            return None
+
+        for base_url in self.download_sources:
+            try:
+                scihub_url = f"{base_url}{self.clean_doi(doi)}"
+
+                # Request with more tolerance
+                response = requests.get(scihub_url,
+                                        headers=self.headers,
+                                        allow_redirects=True,
+                                        timeout=15)
+
+                # Search for multiple PDF URL patterns
+                pdf_patterns = [
+                    r'(https?://[^\s<>"]+?\.pdf)',
+                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                ]
+
+                pdf_urls = []
+                for pattern in pdf_patterns:
+                    pdf_urls.extend(re.findall(pattern, response.text))
+
+                # Try downloading from found URLs
+                for pdf_url in pdf_urls:
+                    try:
+                        pdf_response = requests.get(pdf_url,
+                                                    headers=self.headers,
+                                                    timeout=10)
+
+                        # Verify if it's a PDF
+                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                            logger.debug(f"Found PDF from: {pdf_url}")
+                            return pdf_response.content
+                    except Exception as e:
+                        logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+            except Exception as e:
+                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
+        return None
+
+    def download_paper_libgen(self, doi):
+        """Download from Libgen, handling the query and the redirection"""
+        if not doi:
+            return None
+
+        base_url = 'https://libgen.rs/scimag/'
+        try:
+            search_url = f"{base_url}?q={self.clean_doi(doi)}"
+            response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
+            response.raise_for_status()
+
+            if "No results" in response.text:
+                logger.debug(f"No results for DOI: {doi} on libgen")
+                return None
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Find the link using a specific selector
+            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
+
+            if links:
+                link = links[0]
+                pdf_url = link['href']
+                pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_response.content
+
+        except Exception as e:
+            logger.debug(f"Error trying to download {doi} from libgen: {e}")
+        return None
+
+    def download_paper_google_scholar(self, doi):
+        """Search Google Scholar for an article with the given DOI and try to get the PDF"""
+        if not doi:
+            return None
+
+        try:
+            query = f'doi:"{doi}"'
+            params = {'q': query}
+            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
+
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Find any links with [PDF]
+            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+
+            if links:
+                pdf_url = links[0]['href']
+                pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return pdf_response.content
+        except Exception as e:
+            logger.debug(f"Google Scholar error for {doi}: {e}")
+
+        return None
+
+    def download_paper_crossref(self, doi):
+        """Alternative search method using Crossref"""
+        if not doi:
+            return None
+
+        try:
+            # Search for open access link
+            url = f"https://api.crossref.org/works/{doi}"
+            response = requests.get(url, headers=self.headers, timeout=10)
+
+            if response.status_code == 200:
+                data = response.json()
+                work = data.get('message', {})
+
+                # Search for open access links
+                links = work.get('link', [])
+                for link in links:
+                    if link.get('content-type') == 'application/pdf':
+                        pdf_url = link.get('URL')
+                        if pdf_url:
+                            pdf_response = requests.get(pdf_url, headers=self.headers)
+                            if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                                logger.debug(f"Found PDF from: {pdf_url}")
+                                return pdf_response.content
+
+        except Exception as e:
+            logger.debug(f"Crossref error for {doi}: {e}")
+
+        return None
+
+    def download_with_retry(self, doi, max_retries=3, initial_delay=2):
+        """Downloads a paper using multiple strategies with exponential backoff"""
+        pdf_content = None
+        retries = 0
+        delay = initial_delay
+
+        while retries < max_retries and not pdf_content:
+            try:
+                pdf_content = (
+                    self.download_paper_scihub(doi) or
+                    self.download_paper_libgen(doi) or
+                    self.download_paper_google_scholar(doi) or
+                    self.download_paper_crossref(doi)
+
+                )
+
+                if pdf_content:
+                    return pdf_content
+            except Exception as e:
+                logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+
+            if not pdf_content:
+                retries += 1
+                logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
+                time.sleep(delay)
+                delay *= 2  # Exponential backoff
+
+        return None
+
+    def download_single_doi(self, doi, progress=gr.Progress()):
+        """Downloads a single paper using a DOI, with a progress bar"""
+        if not doi:
+            return None, "Error: DOI not provided", "Error: DOI not provided"
+
+        try:
+            pdf_content = self.download_with_retry(doi)
+
+            if pdf_content:
+                if doi is None:
+                    return None, "Error: DOI not provided", "Error: DOI not provided"
+                filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                filepath = os.path.join(self.output_dir, filename)
+                with open(filepath, 'wb') as f:
+                    f.write(pdf_content)
+                logger.info(f"Successfully downloaded: {filename}")
+                progress(1)
+                return filepath, f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>', ""
+            else:
+                logger.warning(f"Could not download: {doi}")
+                progress(1)
+                return None, f"Could not download {doi}", f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>'
+
+        except Exception as e:
+            logger.error(f"Error processing {doi}: {e}")
+            progress(1)
+            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
+
+    def download_multiple_dois(self, dois_text, progress=gr.Progress()):
+        """Downloads multiple papers from a list of DOIs, with a progress bar"""
+        if not dois_text:
+            return None, "Error: No DOIs provided", "Error: No DOIs provided"
+
+        dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
+        if not dois:
+            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
+
+        total_dois = len(dois)
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        for i, doi in enumerate(dois):
+            filepath, success_message, fail_message = self.download_single_doi(doi, progress=progress)
+            if filepath:
+                # Unique filename for zip
+                filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+                filepath_unique = os.path.join(self.output_dir, filename)
+                os.rename(filepath, filepath_unique)
+                downloaded_files.append(filepath_unique)
+                downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+            else:
+                failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+            progress((i + 1) / total_dois)
+
+
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file_path in downloaded_files:
+                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
+
+    def process_bibtex(self, bib_file, progress=gr.Progress()):
+        """Process a BibTeX file and download papers using multiple strategies, with a progress bar"""
+        # Read BibTeX file content from the uploaded object
+        try:
+            with open(bib_file.name, 'r', encoding='utf-8') as f:
+                bib_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
+
+        # Parse BibTeX data
+        try:
+            bib_database = bibtexparser.loads(bib_content)
+        except Exception as e:
+            logger.error(f"Error parsing BibTeX data: {e}")
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
+
+        # Extract DOIs
+        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+        logger.info(f"Found {len(dois)} DOIs to download")
+
+        # Result lists
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        total_dois = len(dois)
+        # Download PDFs
+        for i, doi in enumerate(dois):
+            try:
+                # Try to download with multiple methods with retries
+                pdf_content = self.download_with_retry(doi)
+
+                # Save PDF
+                if pdf_content:
+                    if doi is None:
+                        return None, "Error: DOI not provided", "Error: DOI not provided", None
+                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                    filepath = os.path.join(self.output_dir, filename)
+
+                    with open(filepath, 'wb') as f:
+                        f.write(pdf_content)
+
+                    downloaded_files.append(filepath)
+                    downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                    logger.info(f"Successfully downloaded: {filename}")
+                else:
+                    failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+            except Exception as e:
+                failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                logger.error(f"Error processing {doi}: {e}")
+            progress((i + 1) / total_dois)
+
+        # Create ZIP of downloaded papers
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file_path in downloaded_files:
+                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
+
+    async def process_bibtex_async(self, bib_file, progress=gr.Progress()):
+        """Process a BibTeX file and download papers using multiple strategies"""
+        # Read BibTeX file content from the uploaded object
+        try:
+            with open(bib_file.name, 'r', encoding='utf-8') as f:
+                bib_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
+
+        # Parse BibTeX data
+        try:
+            bib_database = bibtexparser.loads(bib_content)
+        except Exception as e:
+            logger.error(f"Error parsing BibTeX data: {e}")
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
+
+        # Extract DOIs
+        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+        logger.info(f"Found {len(dois)} DOIs to download")
+
+        # Result lists
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        total_dois = len(dois)
+        # Download PDFs
+        for i, doi in enumerate(dois):
+            try:
+                # Try to download with multiple methods with retries
+                pdf_content = await self.download_with_retry_async(doi)
+
+                # Save PDF
+                if pdf_content:
+                    if doi is None:
+                        return None, "Error: DOI not provided", "Error: DOI not provided", None
+                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                    filepath = os.path.join(self.output_dir, filename)
+
+                    with open(filepath, 'wb') as f:
+                        f.write(pdf_content)
+
+                    downloaded_files.append(filepath)
+                    downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                    logger.info(f"Successfully downloaded: {filename}")
+                else:
+                    failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+            except Exception as e:
+                failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                logger.error(f"Error processing {doi}: {e}")
+            progress((i + 1) / total_dois)
+
+
+        # Create ZIP of downloaded papers
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file_path in downloaded_files:
+                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
+
+def create_gradio_interface():
+    """Create Gradio interface for Paper Downloader"""
+    downloader = PaperDownloader()
+
+    async def download_papers(bib_file, doi_input, dois_input, progress=gr.Progress()):
+        if bib_file:
+            # Check file type
+            if not bib_file.name.lower().endswith('.bib'):
+                return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
+
+            zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file, progress)
+            return zip_path, downloaded_dois, failed_dois, None
+        elif doi_input:
+            filepath, message, failed_doi = downloader.download_single_doi(doi_input, progress)
+            return None, message, failed_doi, filepath
+        elif dois_input:
+            zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input, progress)
+            return zip_path, downloaded_dois, failed_dois, None
+        else:
+            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
+
+    # Gradio Interface
+    interface = gr.Interface(
+        fn=download_papers,
+        inputs=[
+            gr.File(file_types=['.bib'], label="Upload BibTeX File"),
+            gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
+            gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
+        ],
+        outputs=[
+            gr.File(label="Download Papers (ZIP) or Single PDF"),
+            gr.HTML(label="""
+                <div style='padding-bottom: 5px; font-weight: bold;'>
+                    Found DOIs
+                </div>
+                <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
+                    <div id="downloaded-dois"></div>
+                </div>
+            """),
+            gr.HTML(label="""
+                <div style='padding-bottom: 5px; font-weight: bold;'>
+                    Missed DOIs
+                </div>
+                <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
+                    <div id="failed-dois"></div>
+                </div>
+            """),
+            gr.File(label="Downloaded Single PDF")
+        ],
+        title="🔬 Academic Paper Batch Downloader",
+        description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
+        theme="Hev832/Applio",
+        examples=[
+            ["example.bib", None, None],  # BibTeX file
+            [None, "10.1038/nature12373", None],  # Single DOI
+            [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"],  # Multiple DOIs
+        ],
+        css="""
+        .gradio-container {
+            background-color: black;
+        }
+        .gr-interface {
+            max-width: 800px;
+            margin: 0 auto;
+        }
+        .gr-box {
+            background-color: black;
+            border-radius: 10px;
+            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
         }
+        .output-text a {
+            color: #007bff; /* Blue color for hyperlinks */
+        }
+        """,
+        cache_examples=False,
+    )
+
+    # Add JavaScript to update the HTML
+    interface.load = """
+    function(downloaded_dois, failed_dois) {
+        let downloaded_html = '';
+        downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
+            downloaded_html += doi + '<br>';
+        });
+        document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
+
+        let failed_html = '';
+        failed_dois.split('\\n').filter(Boolean).forEach(doi => {
+            failed_html += doi + '<br>';
+        });
+        document.querySelector("#failed-dois").innerHTML = failed_html;
+        return [downloaded_html, failed_html];
+    }
+    """
+
+    interface.head = """
+    <script>
+    function copyLink(button) {
+        const linkElement = button.previousElementSibling;
+        const link = linkElement.href;
+        navigator.clipboard.writeText(link)
+            .then(() => {
+                button.innerText = '✓ Copied';
+                button.style.color = 'green';
+                setTimeout(() => {
+                    button.innerText = 'Copy';
+                    button.style.color = '';
+                }, 2000);
+            })
+            .catch(err => {
+                console.error('Failed to copy link: ', err);
+            });
+    }
+    </script>
+    """
+    return interface
+
+
+def main():
+    interface = create_gradio_interface()
+    interface.launch(share=True)
+
+
+if __name__ == "__main__":
+    main()