C2MV committed
Commit bf73a1e
1 Parent(s): 4601f20

Update app.py

Files changed (1)
  1. app.py +111 -732
app.py CHANGED
@@ -1,733 +1,112 @@
1
- import os
2
- import re
3
- import time
4
- import logging
5
- import zipfile
6
- import requests
7
- import bibtexparser
8
- from tqdm import tqdm
9
- from urllib.parse import quote, urlencode
10
- import gradio as gr
11
- from bs4 import BeautifulSoup
12
- import io
13
- import asyncio
14
- import aiohttp
15
-
16
- # Configure logging
17
- logging.basicConfig(level=logging.INFO,
18
- format='%(asctime)s - %(levelname)s: %(message)s')
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- class PaperDownloader:
23
- def __init__(self, output_dir='papers'):
24
- self.output_dir = output_dir
25
- os.makedirs(output_dir, exist_ok=True)
26
-
27
- # Updated download sources
28
- self.download_sources = [
29
- 'https://sci-hub.ee/',
30
- 'https://sci-hub.st/',
31
- 'https://sci-hub.ru/',
32
- 'https://sci-hub.ren/',
33
- 'https://sci-hub.mksa.top/',
34
- 'https://sci-hub.se/',
35
- 'https://libgen.rs/scimag/'
36
- ]
37
-
38
- # Request headers
39
- self.headers = {
40
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
41
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
42
- 'Accept-Language': 'en-US,en;q=0.9',
43
  }
44
-
45
- def clean_doi(self, doi):
46
- """Clean and encode DOI for URL"""
47
- if not isinstance(doi, str):
48
- return None
49
- return quote(doi.strip()) if doi else None
50
-
51
- async def fetch_with_headers(self, session, url, timeout=10):
52
- """Utility method to fetch an URL with headers and timeout"""
53
- try:
54
- async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
55
- response.raise_for_status()
56
- return await response.text(), response.headers
57
- except Exception as e:
58
- logger.debug(f"Error fetching {url}: {e}")
59
- return None, None
60
-
61
-
62
- async def download_paper_direct_doi_async(self, session, doi):
63
- """Attempt to download the pdf from the landing page of the doi"""
64
- if not doi:
65
- return None
66
-
67
- try:
68
- doi_url = f"https://doi.org/{self.clean_doi(doi)}"
69
- text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
70
- if not text:
71
- return None
72
-
73
- pdf_patterns = [
74
- r'(https?://[^\s<>"]+?\.pdf)',
75
- r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
76
- r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
77
- ]
78
-
79
- pdf_urls = []
80
- for pattern in pdf_patterns:
81
- pdf_urls.extend(re.findall(pattern, text))
82
-
83
- for pdf_url in pdf_urls:
84
- try:
85
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
86
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
87
- logger.debug(f"Found PDF from: {pdf_url}")
88
- return await pdf_response.read()
89
- except Exception as e:
90
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
91
-
92
-
93
- except Exception as e:
94
- logger.debug(f"Error trying to get the PDF from {doi}: {e}")
95
-
96
- return None
97
-
98
- async def download_paper_scihub_async(self, session, doi):
99
- """Improved method to download paper from Sci-Hub using async requests"""
100
- if not doi:
101
- logger.warning("DOI not provided")
102
- return None
103
-
104
- for base_url in self.download_sources:
105
- try:
106
- scihub_url = f"{base_url}{self.clean_doi(doi)}"
107
- text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
108
- if not text:
109
- continue
110
-
111
- # Search for multiple PDF URL patterns
112
- pdf_patterns = [
113
- r'(https?://[^\s<>"]+?\.pdf)',
114
- r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
115
- r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
116
- ]
117
-
118
- pdf_urls = []
119
- for pattern in pdf_patterns:
120
- pdf_urls.extend(re.findall(pattern, text))
121
-
122
- # Try downloading from found URLs
123
- for pdf_url in pdf_urls:
124
- try:
125
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
126
- # Verify if it's a PDF
127
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
128
- logger.debug(f"Found PDF from: {pdf_url}")
129
- return await pdf_response.read()
130
- except Exception as e:
131
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
132
-
133
- except Exception as e:
134
- logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
135
-
136
- return None
137
-
138
- async def download_paper_libgen_async(self, session, doi):
139
- """Download from Libgen, handles the query and the redirection"""
140
- if not doi:
141
- return None
142
-
143
- base_url = 'https://libgen.rs/scimag/'
144
- try:
145
- search_url = f"{base_url}?q={self.clean_doi(doi)}"
146
- text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
147
-
148
- if not text or "No results" in text:
149
- logger.debug(f"No results for DOI: {doi} on libgen")
150
- return None
151
-
152
- soup = BeautifulSoup(text, 'html.parser')
153
-
154
- links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
155
-
156
- if links:
157
- link = links[0]
158
- pdf_url = link['href']
159
- pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
160
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
161
- logger.debug(f"Found PDF from: {pdf_url}")
162
- return await pdf_response.read()
163
- except Exception as e:
164
- logger.debug(f"Error trying to download {doi} from libgen: {e}")
165
- return None
166
-
167
- async def download_paper_google_scholar_async(self, session, doi):
168
- """Search google scholar to find an article with the given doi, try to get the pdf"""
169
- if not doi:
170
- return None
171
-
172
- try:
173
- query = f'doi:"{doi}"'
174
- params = {'q': query}
175
- url = f'https://scholar.google.com/scholar?{urlencode(params)}'
176
-
177
- text, headers = await self.fetch_with_headers(session, url, timeout=10)
178
- if not text:
179
- return None
180
-
181
- soup = BeautifulSoup(text, 'html.parser')
182
-
183
- # Find any links with [PDF]
184
- links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
185
-
186
- if links:
187
- pdf_url = links[0]['href']
188
- pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
189
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
190
- logger.debug(f"Found PDF from: {pdf_url}")
191
- return await pdf_response.read()
192
- except Exception as e:
193
- logger.debug(f"Google Scholar error for {doi}: {e}")
194
-
195
- return None
196
-
197
- async def download_paper_crossref_async(self, session, doi):
198
- """Alternative search method using Crossref"""
199
- if not doi:
200
- return None
201
-
202
- try:
203
- # Search for open access link
204
- url = f"https://api.crossref.org/works/{doi}"
205
- response = await session.get(url, headers=self.headers, timeout=10)
206
-
207
- if response.status == 200:
208
- data = await response.json()
209
- work = data.get('message', {})
210
-
211
- # Search for open access links
212
- links = work.get('link', [])
213
- for link in links:
214
- if link.get('content-type') == 'application/pdf':
215
- pdf_url = link.get('URL')
216
- if pdf_url:
217
- pdf_response = await session.get(pdf_url, headers=self.headers)
218
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
219
- logger.debug(f"Found PDF from: {pdf_url}")
220
- return await pdf_response.read()
221
-
222
- except Exception as e:
223
- logger.debug(f"Crossref error for {doi}: {e}")
224
-
225
- return None
226
-
227
- async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
228
- """Downloads a paper using multiple strategies with exponential backoff and async requests"""
229
- pdf_content = None
230
- retries = 0
231
- delay = initial_delay
232
-
233
- async with aiohttp.ClientSession() as session:
234
- while retries < max_retries and not pdf_content:
235
- try:
236
- pdf_content = (
237
- await self.download_paper_direct_doi_async(session, doi) or
238
- await self.download_paper_scihub_async(session, doi) or
239
- await self.download_paper_libgen_async(session, doi) or
240
- await self.download_paper_google_scholar_async(session, doi) or
241
- await self.download_paper_crossref_async(session, doi)
242
-
243
- )
244
- if pdf_content:
245
- return pdf_content
246
- except Exception as e:
247
- logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
248
-
249
- if not pdf_content:
250
- retries += 1
251
- logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
252
- await asyncio.sleep(delay)
253
- delay *= 2 # Exponential backoff
254
-
255
- return None
256
-
257
- def download_paper_scihub(self, doi):
258
- """Improved method to download paper from Sci-Hub"""
259
- if not doi:
260
- logger.warning("DOI not provided")
261
- return None
262
-
263
- for base_url in self.download_sources:
264
- try:
265
- scihub_url = f"{base_url}{self.clean_doi(doi)}"
266
-
267
- # Request with more tolerance
268
- response = requests.get(scihub_url,
269
- headers=self.headers,
270
- allow_redirects=True,
271
- timeout=15)
272
-
273
- # Search for multiple PDF URL patterns
274
- pdf_patterns = [
275
- r'(https?://[^\s<>"]+?\.pdf)',
276
- r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
277
- r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
278
- ]
279
-
280
- pdf_urls = []
281
- for pattern in pdf_patterns:
282
- pdf_urls.extend(re.findall(pattern, response.text))
283
-
284
- # Try downloading from found URLs
285
- for pdf_url in pdf_urls:
286
- try:
287
- pdf_response = requests.get(pdf_url,
288
- headers=self.headers,
289
- timeout=10)
290
-
291
- # Verify if it's a PDF
292
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
293
- logger.debug(f"Found PDF from: {pdf_url}")
294
- return pdf_response.content
295
- except Exception as e:
296
- logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
297
-
298
- except Exception as e:
299
- logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
300
-
301
- return None
302
-
303
- def download_paper_libgen(self, doi):
304
- """Download from Libgen, handles the query and the redirection"""
305
- if not doi:
306
- return None
307
-
308
- base_url = 'https://libgen.rs/scimag/'
309
- try:
310
- search_url = f"{base_url}?q={self.clean_doi(doi)}"
311
- response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
312
- response.raise_for_status()
313
-
314
- if "No results" in response.text:
315
- logger.debug(f"No results for DOI: {doi} on libgen")
316
- return None
317
-
318
- soup = BeautifulSoup(response.text, 'html.parser')
319
-
320
- # Find the link using a specific selector
321
- links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
322
-
323
- if links:
324
- link = links[0]
325
- pdf_url = link['href']
326
- pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
327
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
328
- logger.debug(f"Found PDF from: {pdf_url}")
329
- return pdf_response.content
330
-
331
- except Exception as e:
332
- logger.debug(f"Error trying to download {doi} from libgen: {e}")
333
- return None
334
-
335
- def download_paper_google_scholar(self, doi):
336
- """Search google scholar to find an article with the given doi, try to get the pdf"""
337
- if not doi:
338
- return None
339
-
340
- try:
341
- query = f'doi:"{doi}"'
342
- params = {'q': query}
343
- url = f'https://scholar.google.com/scholar?{urlencode(params)}'
344
-
345
- response = requests.get(url, headers=self.headers, timeout=10)
346
- response.raise_for_status()
347
-
348
- soup = BeautifulSoup(response.text, 'html.parser')
349
-
350
- # Find any links with [PDF]
351
- links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
352
-
353
- if links:
354
- pdf_url = links[0]['href']
355
- pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
356
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
357
- logger.debug(f"Found PDF from: {pdf_url}")
358
- return pdf_response.content
359
- except Exception as e:
360
- logger.debug(f"Google Scholar error for {doi}: {e}")
361
-
362
- return None
363
-
364
- def download_paper_crossref(self, doi):
365
- """Alternative search method using Crossref"""
366
- if not doi:
367
- return None
368
-
369
- try:
370
- # Search for open access link
371
- url = f"https://api.crossref.org/works/{doi}"
372
- response = requests.get(url, headers=self.headers, timeout=10)
373
-
374
- if response.status_code == 200:
375
- data = response.json()
376
- work = data.get('message', {})
377
-
378
- # Search for open access links
379
- links = work.get('link', [])
380
- for link in links:
381
- if link.get('content-type') == 'application/pdf':
382
- pdf_url = link.get('URL')
383
- if pdf_url:
384
- pdf_response = requests.get(pdf_url, headers=self.headers)
385
- if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
386
- logger.debug(f"Found PDF from: {pdf_url}")
387
- return pdf_response.content
388
-
389
- except Exception as e:
390
- logger.debug(f"Crossref error for {doi}: {e}")
391
-
392
- return None
393
-
394
- def download_with_retry(self, doi, max_retries=3, initial_delay=2):
395
- """Downloads a paper using multiple strategies with exponential backoff"""
396
- pdf_content = None
397
- retries = 0
398
- delay = initial_delay
399
-
400
- while retries < max_retries and not pdf_content:
401
- try:
402
- pdf_content = (
403
- self.download_paper_scihub(doi) or
404
- self.download_paper_libgen(doi) or
405
- self.download_paper_google_scholar(doi) or
406
- self.download_paper_crossref(doi)
407
-
408
- )
409
-
410
- if pdf_content:
411
- return pdf_content
412
- except Exception as e:
413
- logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
414
-
415
- if not pdf_content:
416
- retries += 1
417
- logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
418
- time.sleep(delay)
419
- delay *= 2 # Exponential backoff
420
-
421
- return None
422
-
423
- def download_single_doi(self, doi, progress=gr.Progress()):
424
- """Downloads a single paper using a DOI with progress bar"""
425
- if not doi:
426
- return None, "Error: DOI not provided", "Error: DOI not provided"
427
-
428
- try:
429
- pdf_content = self.download_with_retry(doi)
430
-
431
- if pdf_content:
432
- if doi is None:
433
- return None, "Error: DOI not provided", "Error: DOI not provided"
434
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
435
- filepath = os.path.join(self.output_dir, filename)
436
- with open(filepath, 'wb') as f:
437
- f.write(pdf_content)
438
- logger.info(f"Successfully downloaded: {filename}")
439
- progress(1, desc=f"Downloaded {doi}") #update progress
440
- return filepath, f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>', ""
441
- else:
442
- logger.warning(f"Could not download: {doi}")
443
- progress(1, desc=f"Failed {doi}") #update progress
444
- return None, f"Could not download {doi}", f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>'
445
-
446
- except Exception as e:
447
- logger.error(f"Error processing {doi}: {e}")
448
- progress(1, desc=f"Error {doi}") #update progress
449
- return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
450
-
451
- def download_multiple_dois(self, dois_text, progress=gr.Progress()):
452
- """Downloads multiple papers from a list of DOIs with progress bar"""
453
- if not dois_text:
454
- return None, "Error: No DOIs provided", "Error: No DOIs provided"
455
-
456
- dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
457
- if not dois:
458
- return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
459
-
460
- downloaded_files = []
461
- failed_dois = []
462
- downloaded_links = []
463
- for i, doi in enumerate(progress(dois, desc="Downloading papers")):
464
- filepath, success_message, fail_message = self.download_single_doi(doi, progress=progress)
465
- if filepath:
466
- # Unique filename for zip
467
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
468
- filepath_unique = os.path.join(self.output_dir, filename)
469
- os.rename(filepath, filepath_unique)
470
- downloaded_files.append(filepath_unique)
471
- downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
472
-
473
- else:
474
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
475
-
476
- if downloaded_files:
477
- zip_filename = 'papers.zip'
478
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
479
- for file_path in downloaded_files:
480
- zipf.write(file_path, arcname=os.path.basename(file_path))
481
- logger.info(f"ZIP file created: {zip_filename}")
482
-
483
- return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
484
-
485
- def process_bibtex(self, bib_file, progress=gr.Progress()):
486
- """Process BibTeX file and download papers with multiple strategies with progress bar"""
487
- # Read BibTeX file content from the uploaded object
488
- try:
489
- with open(bib_file.name, 'r', encoding='utf-8') as f:
490
- bib_content = f.read()
491
- except Exception as e:
492
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
493
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
494
-
495
- # Parse BibTeX data
496
- try:
497
- bib_database = bibtexparser.loads(bib_content)
498
- except Exception as e:
499
- logger.error(f"Error parsing BibTeX data: {e}")
500
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
501
-
502
- # Extract DOIs
503
- dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
504
- logger.info(f"Found {len(dois)} DOIs to download")
505
-
506
- # Result lists
507
- downloaded_files = []
508
- failed_dois = []
509
- downloaded_links = []
510
-
511
- # Download PDFs
512
- for doi in progress(dois, desc="Downloading papers"):
513
- try:
514
- # Try to download with multiple methods with retries
515
- pdf_content = self.download_with_retry(doi)
516
-
517
- # Save PDF
518
- if pdf_content:
519
- if doi is None:
520
- return None, "Error: DOI not provided", "Error: DOI not provided", None
521
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
522
- filepath = os.path.join(self.output_dir, filename)
523
-
524
- with open(filepath, 'wb') as f:
525
- f.write(pdf_content)
526
-
527
- downloaded_files.append(filepath)
528
- downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
529
- logger.info(f"Successfully downloaded: {filename}")
530
- else:
531
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
532
-
533
- except Exception as e:
534
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
535
- logger.error(f"Error processing {doi}: {e}")
536
-
537
- # Create ZIP of downloaded papers
538
- if downloaded_files:
539
- zip_filename = 'papers.zip'
540
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
541
- for file_path in downloaded_files:
542
- zipf.write(file_path, arcname=os.path.basename(file_path))
543
- logger.info(f"ZIP file created: {zip_filename}")
544
-
545
- return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
546
-
547
- async def process_bibtex_async(self, bib_file, progress=gr.Progress()):
548
- """Process BibTeX file and download papers with multiple strategies"""
549
- # Read BibTeX file content from the uploaded object
550
- try:
551
- with open(bib_file.name, 'r', encoding='utf-8') as f:
552
- bib_content = f.read()
553
- except Exception as e:
554
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
555
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
556
-
557
- # Parse BibTeX data
558
- try:
559
- bib_database = bibtexparser.loads(bib_content)
560
- except Exception as e:
561
- logger.error(f"Error parsing BibTeX data: {e}")
562
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
563
-
564
- # Extract DOIs
565
- dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
566
- logger.info(f"Found {len(dois)} DOIs to download")
567
-
568
- # Result lists
569
- downloaded_files = []
570
- failed_dois = []
571
- downloaded_links = []
572
-
573
- # Download PDFs
574
- for doi in progress(dois, desc="Downloading papers"):
575
- try:
576
- # Try to download with multiple methods with retries
577
- pdf_content = await self.download_with_retry_async(doi)
578
-
579
- # Save PDF
580
- if pdf_content:
581
- if doi is None:
582
- return None, "Error: DOI not provided", "Error: DOI not provided", None
583
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
584
- filepath = os.path.join(self.output_dir, filename)
585
-
586
- with open(filepath, 'wb') as f:
587
- f.write(pdf_content)
588
-
589
- downloaded_files.append(filepath)
590
- downloaded_links.append(f'<div style="display: flex; align-items: center;">✓ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
591
- logger.info(f"Successfully downloaded: {filename}")
592
- else:
593
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
594
-
595
- except Exception as e:
596
- failed_dois.append(f'<div style="display: flex; align-items: center;">❌ <a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
597
- logger.error(f"Error processing {doi}: {e}")
598
-
599
- # Create ZIP of downloaded papers
600
- if downloaded_files:
601
- zip_filename = 'papers.zip'
602
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
603
- for file_path in downloaded_files:
604
- zipf.write(file_path, arcname=os.path.basename(file_path))
605
- logger.info(f"ZIP file created: {zip_filename}")
606
-
607
- return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
608
-
609
- def create_gradio_interface():
610
- """Create Gradio interface for Paper Downloader"""
611
- downloader = PaperDownloader()
612
-
613
- async def download_papers(bib_file, doi_input, dois_input, progress=gr.Progress()):
614
- if bib_file:
615
- # Check file type
616
- if not bib_file.name.lower().endswith('.bib'):
617
- return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
618
-
619
- zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file, progress)
620
- return zip_path, downloaded_dois, failed_dois, None
621
- elif doi_input:
622
- filepath, message, failed_doi = downloader.download_single_doi(doi_input, progress)
623
- return None, message, failed_doi, filepath
624
- elif dois_input:
625
- zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input, progress)
626
- return zip_path, downloaded_dois, failed_dois, None
627
- else:
628
- return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
629
-
630
- # Gradio Interface
631
- interface = gr.Interface(
632
- fn=download_papers,
633
- inputs=[
634
- gr.File(file_types=['.bib'], label="Upload BibTeX File"),
635
- gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
636
- gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
637
- ],
638
- outputs=[
639
- gr.File(label="Download Papers (ZIP) or Single PDF"),
640
- gr.HTML(label="""
641
- <div style='padding-bottom: 5px; font-weight: bold;'>
642
- Found DOIs
643
- </div>
644
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
645
- <div id="downloaded-dois"></div>
646
- </div>
647
- """),
648
- gr.HTML(label="""
649
- <div style='padding-bottom: 5px; font-weight: bold;'>
650
- Missed DOIs
651
- </div>
652
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
653
- <div id="failed-dois"></div>
654
- </div>
655
- """),
656
- gr.File(label="Downloaded Single PDF")
657
- ],
658
- title="🔬 Academic Paper Batch Downloader",
659
- description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
660
- theme="Hev832/Applio",
661
- examples=[
662
- ["example.bib", None, None], # Bibtex File
663
- [None, "10.1038/nature12373", None], # Single DOI
664
- [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
665
- ],
666
- css="""
667
- .gradio-container {
668
- background-color: black;
669
- }
670
- .gr-interface {
671
- max-width: 800px;
672
- margin: 0 auto;
673
- }
674
- .gr-box {
675
- background-color: black;
676
- border-radius: 10px;
677
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
678
- }
679
- .output-text a {
680
- color: #007bff; /* Blue color for hyperlinks */
681
- }
682
- """,
683
- cache_examples=False,
684
- )
685
-
686
- # Add Javascript to update HTML
687
- interface.load = """
688
- function(downloaded_dois, failed_dois) {
689
- let downloaded_html = '';
690
- downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
691
- downloaded_html += doi + '<br>';
692
- });
693
- document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
694
-
695
- let failed_html = '';
696
- failed_dois.split('\\n').filter(Boolean).forEach(doi => {
697
- failed_html += doi + '<br>';
698
- });
699
- document.querySelector("#failed-dois").innerHTML = failed_html;
700
- return [downloaded_html, failed_html];
701
- }
702
- """
703
-
704
- interface.head = """
705
- <script>
706
- function copyLink(button) {
707
- const linkElement = button.previousElementSibling;
708
- const link = linkElement.href;
709
- navigator.clipboard.writeText(link)
710
- .then(() => {
711
- button.innerText = '✓ Copied';
712
- button.style.color = 'green';
713
- setTimeout(() => {
714
- button.innerText = 'Copy';
715
- button.style.color = '';
716
- }, 2000);
717
- })
718
- .catch(err => {
719
- console.error('Failed to copy link: ', err);
720
- });
721
- }
722
- </script>
723
- """
724
- return interface
725
-
726
-
727
- def main():
728
- interface = create_gradio_interface()
729
- interface.launch(share=True)
730
-
731
-
732
- if __name__ == "__main__":
733
- main()
 
1
+ async def download_papers(bib_file, doi_input, dois_input, progress=gr.Progress()):
2
+ if bib_file:
3
+ # Check file type
4
+ if not bib_file.name.lower().endswith('.bib'):
5
+ return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
6
+
7
+ zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file, progress)
8
+ return zip_path, downloaded_dois, failed_dois, None
9
+ elif doi_input:
10
+ filepath, message, failed_doi = downloader.download_single_doi(doi_input, progress)
11
+ return None, message, failed_doi, filepath
12
+ elif dois_input:
13
+ zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input, progress)
14
+ return zip_path, downloaded_dois, failed_dois, None
15
+ else:
16
+ return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
17
+
18
+ # Gradio Interface
19
+ interface = gr.Interface(
20
+ fn=download_papers,
21
+ inputs=[
22
+ gr.File(file_types=['.bib'], label="Upload BibTeX File"),
23
+ gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
24
+ gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
25
+ ],
26
+ outputs=[
27
+ gr.File(label="Download Papers (ZIP) or Single PDF"),
28
+ gr.HTML(label="""
29
+ <div style='padding-bottom: 5px; font-weight: bold;'>
30
+ Found DOIs
31
+ </div>
32
+ <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
33
+ <div id="downloaded-dois"></div>
34
+ </div>
35
+ """),
36
+ gr.HTML(label="""
37
+ <div style='padding-bottom: 5px; font-weight: bold;'>
38
+ Missed DOIs
39
+ </div>
40
+ <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
41
+ <div id="failed-dois"></div>
42
+ </div>
43
+ """),
44
+ gr.File(label="Downloaded Single PDF")
45
+ ],
46
+ title="🔬 Academic Paper Batch Downloader",
47
+ description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
48
+ theme="Hev832/Applio",
49
+ examples=[
50
+ ["example.bib", None, None], # Bibtex File
51
+ [None, "10.1038/nature12373", None], # Single DOI
52
+ [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
53
+ ],
54
+ css="""
55
+ .gradio-container {
56
+ background-color: black;
57
+ }
58
+ .gr-interface {
59
+ max-width: 800px;
60
+ margin: 0 auto;
61
+ }
62
+ .gr-box {
63
+ background-color: black;
64
+ border-radius: 10px;
65
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
66
+ }
67
+ .output-text a {
68
+ color: #007bff; /* Blue color for hyperlinks */
69
  }
70
+ """,
71
+ cache_examples=False,
72
+ )
73
+
74
+ # Add Javascript to update HTML
75
+ interface.load = """
76
+ function(downloaded_dois, failed_dois) {
77
+ let downloaded_html = '';
78
+ downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
79
+ downloaded_html += doi + '<br>';
80
+ });
81
+ document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
82
+
83
+ let failed_html = '';
84
+ failed_dois.split('\\n').filter(Boolean).forEach(doi => {
85
+ failed_html += doi + '<br>';
86
+ });
87
+ document.querySelector("#failed-dois").innerHTML = failed_html;
88
+ return [downloaded_html, failed_html];
89
+ }
90
+ """
91
+
92
+ interface.head = """
93
+ <script>
94
+ function copyLink(button) {
95
+ const linkElement = button.previousElementSibling;
96
+ const link = linkElement.href;
97
+ navigator.clipboard.writeText(link)
98
+ .then(() => {
99
+ button.innerText = '✓ Copied';
100
+ button.style.color = 'green';
101
+ setTimeout(() => {
102
+ button.innerText = 'Copy';
103
+ button.style.color = '';
104
+ }, 2000);
105
+ })
106
+ .catch(err => {
107
+ console.error('Failed to copy link: ', err);
108
+ });
109
+ }
110
+ </script>
111
+ """
112
+ return interface