C2MV committed on
Commit 0c2df7b · verified · 1 Parent(s): f732808

Update app.py

Files changed (1):
  app.py +457 -327
app.py CHANGED
@@ -12,13 +12,13 @@ from bs4 import BeautifulSoup
  import io
  import asyncio
  import aiohttp
- from concurrent.futures import ThreadPoolExecutor, CancelledError
 
  # Configure logging
  logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s - %(levelname)s: %(message)s')
  logger = logging.getLogger(__name__)
 
 
  class PaperDownloader:
      def __init__(self, output_dir='papers'):
          self.output_dir = output_dir
@@ -41,121 +41,67 @@ class PaperDownloader:
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
              'Accept-Language': 'en-US,en;q=0.9',
          }
-         self.executor = ThreadPoolExecutor(max_workers=4)
-         self.download_task = None  # Added attribute
-         self.results_dict = {}
 
      def clean_doi(self, doi):
          """Clean and encode DOI for URL"""
          if not isinstance(doi, str):
              return None
          return quote(doi.strip()) if doi else None
-
-     async def fetch_with_headers(self, session, url, timeout=10):
-         """Utility method to fetch a URL with headers and a timeout"""
-         try:
-             async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
-                 response.raise_for_status()
-                 return await response.text(), response.headers
-         except Exception as e:
-             logger.debug(f"Error fetching {url}: {e}")
-             return None, None
-
-     async def fetch_pdf_content(self, session, url, max_redirects=5, max_retries=2, retry_delay=1):
-         """Fetch content and validate that the response is a PDF, following up to max_redirects redirects with retries."""
-         current_url = url
-         redirect_count = 0
-         retry_count = 0
-
-         while redirect_count <= max_redirects:
-             try:
-                 while retry_count <= max_retries:
-                     try:
-                         logger.debug(f"Fetching PDF from {current_url} - Retry {retry_count + 1}")
-                         async with session.get(current_url, headers=self.headers, timeout=10, allow_redirects=False) as response:
-                             if response.status in [301, 302, 307, 308]:
-                                 current_url = response.headers['Location']
-                                 redirect_count += 1
-                                 logger.debug(f"Following redirect from {url} to {current_url}")
-                                 break  # Break out of the retry loop for a redirect
-
-                             response.raise_for_status()
-
-                             if 'application/pdf' in response.headers.get('Content-Type', ''):
-                                 logger.debug(f"Successfully fetched PDF from {current_url}")
-                                 return await response.read()
-                             else:
-                                 logger.debug(f"Content type not PDF for {current_url}: {response.headers.get('Content-Type', '')}")
-                                 return None
-                     except Exception as e:
-                         logger.debug(f"Error getting PDF, retrying ({retry_count}/{max_retries}) from {current_url}: {e}")
-                         retry_count += 1
-                         await asyncio.sleep(retry_delay)
-
-                 retry_count = 0  # Reset the retry count in case there is another redirect to follow
-             except CancelledError:
-                 logger.info(f"Fetch PDF cancelled from: {url}")
-                 return None
-             except Exception as e:
-                 logger.debug(f"Error getting PDF from {current_url}: {e}")
-                 return None
-
-         logger.debug(f"Too many redirects or retries for {url}, not following this link further")
-         return None
-
-     async def download_paper_direct_doi_async(self, session, doi):
-         """Attempt to download the PDF from the DOI landing page"""
-         if not doi:
-             return None
-
-         try:
-             doi_url = f"https://doi.org/{self.clean_doi(doi)}"
-
-             # First, try the URL directly in case it already resolves to the PDF.
-             pdf_content = await self.fetch_pdf_content(session, doi_url)
-             if pdf_content:
-                 logger.debug(f"Direct DOI resolved to PDF from {doi_url}")
-                 return pdf_content
-
-             # If the direct DOI link was not a PDF, fetch the landing page and extract links
-             text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
-             if not text:
-                 return None
-
-             pdf_patterns = [
-                 r'(https?://[^\s<>"]+?\.pdf)',
-                 r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                 r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-             ]
-
-             pdf_urls = []
-             for pattern in pdf_patterns:
-                 pdf_urls.extend(re.findall(pattern, text))
-
-             # Attempt each PDF URL and stop at the first valid PDF content.
-             for pdf_url in pdf_urls:
-                 pdf_content = await self.fetch_pdf_content(session, pdf_url)
-                 if pdf_content:
-                     logger.debug(f"Found PDF from: {pdf_url}")
-                     return pdf_content
-
-         except Exception as e:
-             logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-         return None
 
      async def download_paper_scihub_async(self, session, doi):
          """Improved method to download paper from Sci-Hub using async requests"""
          if not doi:
              logger.warning("DOI not provided")
              return None
-
-
          for base_url in self.download_sources:
-
              try:
                  scihub_url = f"{base_url}{self.clean_doi(doi)}"
                  text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
@@ -307,223 +253,414 @@ class PaperDownloader:
                  delay *= 2  # Exponential backoff
 
          return None
-     async def _download_single_doi(self, doi):
-         """Download a single DOI with progress feedback"""
          if not doi:
-             return None, "Error: DOI no proporcionado", "Error: DOI no proporcionado"
-         logger.info(f"Starting download process for DOI: {doi}")
-
          try:
-             pdf_content = await self.download_with_retry_async(doi)
              if pdf_content:
-                 logger.info(f"Downloaded PDF for DOI: {doi}")
-                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
-                 filepath = os.path.join(self.output_dir, filename)
-                 with open(filepath, 'wb') as f:
-                     f.write(pdf_content)
-                 logger.info(f"Saved PDF to file: {filepath}")
-                 logger.info(f"Descarga exitosa: {filename}")
-                 return filepath, f"Descargado exitosamente: <a href='https://doi.org/{doi}'>{doi}</a>", ""
-
              else:
-                 logger.warning(f"No se pudo descargar: {doi}")
-                 return None, f"No se pudo descargar {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
-
-         except CancelledError:
-             logger.info(f"Download Cancelled DOI: {doi}")
-             return None, f"Download cancelled {doi}", "Download Cancelled"
-
          except Exception as e:
              logger.error(f"Error processing {doi}: {e}")
              return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
-
-     async def download_multiple_dois(self, dois_text, cancel_event):
-         """Download multiple DOIs"""
-         if not dois_text:
-             return None, "Error: No DOIs provided", "Error: No DOIs provided", ""
-
-         # Sanitize and filter DOIs:
-         # drop empty lines, strip whitespace, and remove duplicate DOIs
-         dois = list(set([doi.strip() for doi in dois_text.split('\n') if doi.strip()]))
-
-         # Validate the DOI list
-         if not dois:
-             return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided", ""
-
-         # Lists to track results
-         downloaded_files = []   # Paths of downloaded files
-         failed_dois = []        # DOIs that could not be downloaded
-         downloaded_links = []   # Links of downloaded DOIs
-
-         for i, doi in enumerate(dois):
-             result = await self._download_single_doi(doi)
-
-             if cancel_event.is_set():
-                 logger.info("Downloads cancelled on multiple DOIs download")
-                 return None, "Downloads cancelled", "Downloads cancelled", ""
-             if result is None:
-                 continue
-             if isinstance(result, Exception):
-                 # Unexpected exception
-                 error_msg = f"Unexpected error: {str(result)}"
-                 logger.error(f"Error downloading {doi}: {error_msg}")
-                 failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
-
-             elif result[0] is None:
-                 # Failed download (result of download_single_doi_async)
-                 error_msg = result[1]
-                 logger.warning(f"Failed to download {doi}: {error_msg}")
-                 failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
-
-             else:
-                 # Successful download
-                 filepath = result[0]
-
-                 # Generate a unique filename
-                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
-                 filepath_unique = os.path.join(self.output_dir, filename)
-
-                 try:
-                     # Rename the file
-                     os.rename(filepath, filepath_unique)
-
-                     # Add to the list of downloaded files
-                     downloaded_files.append(filepath_unique)
-                     downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
-
-                 except Exception as rename_error:
-                     logger.error(f"Error renaming file for {doi}: {rename_error}")
-                     failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - Error saving file')
-
-         # Create a ZIP file if anything was downloaded
-         zip_filename = None
-         if downloaded_files:
-             zip_filename = 'papers.zip'
-             loop = asyncio.get_running_loop()
-
-             # Run ZIP creation in an executor so the event loop is not blocked
-             loop.run_in_executor(
-                 self.executor,
-                 lambda: self.create_zip(zip_filename, downloaded_files)
-             )
-             logger.info(f"ZIP file created: {zip_filename}")
-
-         return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois), ""
 
-     async def process_bibtex(self, bib_file, cancel_event):
-         """Process a BibTeX file and download papers with multiple strategies, reporting UI updates via a callback"""
          # Read BibTeX file content from the uploaded object
          try:
              with open(bib_file.name, 'r', encoding='utf-8') as f:
                  bib_content = f.read()
          except Exception as e:
              logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
-             return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", ""
 
          # Parse BibTeX data
          try:
              bib_database = bibtexparser.loads(bib_content)
          except Exception as e:
              logger.error(f"Error parsing BibTeX data: {e}")
-             return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", ""
 
          # Extract DOIs
          dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
          logger.info(f"Found {len(dois)} DOIs to download")
 
-         # Result lists
          downloaded_files = []
          failed_dois = []
          downloaded_links = []
-
-         for i, doi in enumerate(dois):
-             result = await self._download_single_doi(doi, cancel_event)  # now async directly here
-
-             if cancel_event.is_set():
-                 logger.info("Download Cancelled in bibtex mode")
-                 return None, "Download Cancelled", "Download Cancelled", ""
-
-             if result is None:
-                 continue
-
-             if isinstance(result, Exception):
-                 # Unexpected exception
-                 error_msg = f"Unexpected error: {str(result)}"
-                 logger.error(f"Error downloading {doi}: {error_msg}")
-                 failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
-
-             elif result[0] is None:
-                 # Failed download (result of download_single_doi_async)
-                 error_msg = result[1]
-                 logger.warning(f"Failed to download {doi}: {error_msg}")
-                 failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a> - {error_msg}')
-
              else:
-                 # Successful download
-                 filepath = result[0]
-
-                 # Unique filename for zip
-                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
-                 filepath_unique = os.path.join(self.output_dir, filename)
-                 os.rename(filepath, filepath_unique)
-                 downloaded_files.append(filepath_unique)
-                 downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
 
          if downloaded_files:
              zip_filename = 'papers.zip'
-             loop = asyncio.get_running_loop()
-             loop.run_in_executor(self.executor, lambda: self.create_zip(zip_filename, downloaded_files))
              logger.info(f"ZIP file created: {zip_filename}")
-
-         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), ""
-
-     def create_zip(self, zip_filename, files):
-         """Create a zip archive with the downloaded PDFs"""
-         with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
-             for file in files:
-                 zf.write(file, os.path.basename(file))
-
-     def cancel_download(self):
-         if self.download_task:
-             self.cancel_event.set()
-             # Cancel the download task if it exists and is cancellable
-             self.download_task.cancel()
 
  def create_gradio_interface():
      """Create Gradio interface for Paper Downloader"""
      downloader = PaperDownloader()
 
-     def update_progress(message="", logs=""):
-         return gr.Textbox.update(value=f"{message}"), gr.Textbox.update(value=f"<pre>{logs}</pre>")
-
      async def download_papers(bib_file, doi_input, dois_input):
-         cancel_event = asyncio.Event()  # Create a cancellation event for every submission.
-         downloader.cancel_event = cancel_event  # Store the event so it is available to stop the process
-
-         if bib_file:
-             # Check file type
-             if not bib_file.name.lower().endswith('.bib'):
-                 return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", "", None
-             zip_file, downloaded_dois, failed_dois, logs_text = await downloader.process_bibtex(bib_file, cancel_event)
-             return zip_file, downloaded_dois, failed_dois, logs_text, None  # All outputs returned together.
-
-         elif doi_input:
-             filepath, message, error = await downloader._download_single_doi(doi_input, cancel_event)
-             return None, message, error, "", filepath
-
-         elif dois_input:
-             zip_file, downloaded_dois, failed_dois, logs_text = await downloader.download_multiple_dois(dois_input, cancel_event)
-             return zip_file, downloaded_dois, failed_dois, logs_text, None
-         else:
-             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", "", None
-
-     with gr.Blocks(theme="Hev832/Applio", css="""
      .gradio-container {
          background-color: black;
      }
@@ -539,60 +676,53 @@ def create_gradio_interface():
      .output-text a {
          color: #007bff; /* Blue color for hyperlinks */
      }
-     .logs_box {
-
-     }
-     """) as interface:
-         with gr.Row():
-             with gr.Column():
-                 bib_file = gr.File(file_types=['.bib'], label="Upload BibTeX File")
-                 doi_input = gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx")
-                 dois_input = gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
-                 with gr.Row():
-                     clear_button = gr.ClearButton(value="Clear")  # added a clear button
-                     submit_button = gr.Button(value="Submit")
-                 examples = gr.Examples([
-                     ["example.bib", None, None],  # Bibtex File
-                     [None, "10.1038/nature12373", None],  # Single DOI
-                     [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"],  # Multiple DOIs
-                 ],
-                     inputs=[bib_file, doi_input, dois_input]
-                 )
-
-             with gr.Column():
-                 output_file = gr.File(label="Download Papers (ZIP) or Single PDF")
-                 downloaded_dois_textbox = gr.HTML(label="""
-                     Found DOIs
-                     """,)
-                 failed_dois_textbox = gr.HTML(label="""
-                     Missed DOIs
-                     """,)
-                 logs = gr.Textbox(label="""
-                     Logs
-                     """, lines=10)
-
-                 single_file = gr.File(label="Downloaded Single PDF")
-
-                 with gr.Row():
-                     stop_button = gr.Button(value="Stop Downloads")
-                     stop_button.click(lambda: downloader.cancel_download(), outputs=None)  # added function in object downloader
-
-         submit_button.click(
-             download_papers,
-             inputs=[bib_file, doi_input, dois_input],
-             outputs=[output_file, downloaded_dois_textbox, failed_dois_textbox, logs, single_file],  # the new output should be a tuple and we output logs too for debugging.
-         )
-
-         interface.title = "🔬 Academic Paper Batch Downloader"
-         interface.description = "Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment."
 
      return interface
 
 
  def main():
      interface = create_gradio_interface()
      interface.launch(share=True)
 
 
  if __name__ == "__main__":
      main()
 
  import io
  import asyncio
  import aiohttp
 
  # Configure logging
  logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s - %(levelname)s: %(message)s')
  logger = logging.getLogger(__name__)
 
+
  class PaperDownloader:
      def __init__(self, output_dir='papers'):
          self.output_dir = output_dir
 
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
              'Accept-Language': 'en-US,en;q=0.9',
          }
+
      def clean_doi(self, doi):
          """Clean and encode DOI for URL"""
          if not isinstance(doi, str):
              return None
          return quote(doi.strip()) if doi else None
 
+     async def fetch_with_headers(self, session, url, timeout=10):
+         """Utility method to fetch a URL with headers and a timeout"""
+         try:
+             async with session.get(url, headers=self.headers, timeout=timeout, allow_redirects=True) as response:
+                 response.raise_for_status()
+                 return await response.text(), response.headers
+         except Exception as e:
+             logger.debug(f"Error fetching {url}: {e}")
+             return None, None
 
+     async def download_paper_direct_doi_async(self, session, doi):
+         """Attempt to download the PDF from the DOI landing page"""
+         if not doi:
+             return None
+
+         try:
+             doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+             text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
+             if not text:
+                 return None
+
+             pdf_patterns = [
+                 r'(https?://[^\s<>"]+?\.pdf)',
+                 r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                 r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+             ]
+
+             pdf_urls = []
+             for pattern in pdf_patterns:
+                 pdf_urls.extend(re.findall(pattern, text))
+
+             for pdf_url in pdf_urls:
+                 try:
+                     pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                     if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                         logger.debug(f"Found PDF from: {pdf_url}")
+                         return await pdf_response.read()
+                 except Exception as e:
+                     logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+         except Exception as e:
+             logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+
+         return None
+
      async def download_paper_scihub_async(self, session, doi):
          """Improved method to download paper from Sci-Hub using async requests"""
          if not doi:
              logger.warning("DOI not provided")
              return None
+
          for base_url in self.download_sources:
              try:
                  scihub_url = f"{base_url}{self.clean_doi(doi)}"
                  text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
 
                  delay *= 2  # Exponential backoff
 
          return None
+
+     def download_paper_scihub(self, doi):
+         """Improved method to download paper from Sci-Hub"""
+         if not doi:
+             logger.warning("DOI not provided")
+             return None
+
+         for base_url in self.download_sources:
+             try:
+                 scihub_url = f"{base_url}{self.clean_doi(doi)}"
+
+                 # Request with more tolerance
+                 response = requests.get(scihub_url,
+                                         headers=self.headers,
+                                         allow_redirects=True,
+                                         timeout=15)
+
+                 # Search for multiple PDF URL patterns
+                 pdf_patterns = [
+                     r'(https?://[^\s<>"]+?\.pdf)',
+                     r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                     r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                 ]
+
+                 pdf_urls = []
+                 for pattern in pdf_patterns:
+                     pdf_urls.extend(re.findall(pattern, response.text))
+
+                 # Try downloading from found URLs
+                 for pdf_url in pdf_urls:
+                     try:
+                         pdf_response = requests.get(pdf_url,
+                                                     headers=self.headers,
+                                                     timeout=10)
+
+                         # Verify if it's a PDF
+                         if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                             logger.debug(f"Found PDF from: {pdf_url}")
+                             return pdf_response.content
+                     except Exception as e:
+                         logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+             except Exception as e:
+                 logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
+         return None
+
+     def download_paper_libgen(self, doi):
+         """Download from Libgen, handles the query and the redirection"""
+         if not doi:
+             return None
+
+         base_url = 'https://libgen.rs/scimag/'
+         try:
+             search_url = f"{base_url}?q={self.clean_doi(doi)}"
+             response = requests.get(search_url, headers=self.headers, allow_redirects=True, timeout=10)
+             response.raise_for_status()
+
+             if "No results" in response.text:
+                 logger.debug(f"No results for DOI: {doi} on libgen")
+                 return None
+
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Find the link using a specific selector
+             links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
+
+             if links:
+                 link = links[0]
+                 pdf_url = link['href']
+                 pdf_response = requests.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
+                 if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                     logger.debug(f"Found PDF from: {pdf_url}")
+                     return pdf_response.content
+
+         except Exception as e:
+             logger.debug(f"Error trying to download {doi} from libgen: {e}")
+         return None
+
+     def download_paper_google_scholar(self, doi):
+         """Search Google Scholar for an article with the given DOI and try to get the PDF"""
+         if not doi:
+             return None
+
+         try:
+             query = f'doi:"{doi}"'
+             params = {'q': query}
+             url = f'https://scholar.google.com/scholar?{urlencode(params)}'
+
+             response = requests.get(url, headers=self.headers, timeout=10)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Find any links with [PDF]
+             links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+
+             if links:
+                 pdf_url = links[0]['href']
+                 pdf_response = requests.get(pdf_url, headers=self.headers, timeout=10)
+                 if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                     logger.debug(f"Found PDF from: {pdf_url}")
+                     return pdf_response.content
+         except Exception as e:
+             logger.debug(f"Google Scholar error for {doi}: {e}")
+
+         return None
+
+     def download_paper_crossref(self, doi):
+         """Alternative search method using Crossref"""
+         if not doi:
+             return None
+
+         try:
+             # Search for open access link
+             url = f"https://api.crossref.org/works/{doi}"
+             response = requests.get(url, headers=self.headers, timeout=10)
+
+             if response.status_code == 200:
+                 data = response.json()
+                 work = data.get('message', {})
+
+                 # Search for open access links
+                 links = work.get('link', [])
+                 for link in links:
+                     if link.get('content-type') == 'application/pdf':
+                         pdf_url = link.get('URL')
+                         if pdf_url:
+                             pdf_response = requests.get(pdf_url, headers=self.headers)
+                             if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                                 logger.debug(f"Found PDF from: {pdf_url}")
+                                 return pdf_response.content
+
+         except Exception as e:
+             logger.debug(f"Crossref error for {doi}: {e}")
+
+         return None
+
+     def download_with_retry(self, doi, max_retries=3, initial_delay=2):
+         """Downloads a paper using multiple strategies with exponential backoff"""
+         pdf_content = None
+         retries = 0
+         delay = initial_delay
+
+         while retries < max_retries and not pdf_content:
+             try:
+                 pdf_content = (
+                     self.download_paper_scihub(doi) or
+                     self.download_paper_libgen(doi) or
+                     self.download_paper_google_scholar(doi) or
+                     self.download_paper_crossref(doi)
+                 )
+
+                 if pdf_content:
+                     return pdf_content
+             except Exception as e:
+                 logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+
+             if not pdf_content:
+                 retries += 1
+                 logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
+                 time.sleep(delay)
+                 delay *= 2  # Exponential backoff
+
+         return None
+
+     def download_single_doi(self, doi):
+         """Downloads a single paper using a DOI"""
          if not doi:
+             return None, "Error: DOI not provided", "Error: DOI not provided"
+
          try:
+             pdf_content = self.download_with_retry(doi)
+
              if pdf_content:
+                 if doi is None:
+                     return None, "Error: DOI not provided", "Error: DOI not provided"
+                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                 filepath = os.path.join(self.output_dir, filename)
+                 with open(filepath, 'wb') as f:
+                     f.write(pdf_content)
+                 logger.info(f"Successfully downloaded: {filename}")
+                 return filepath, f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>', ""
              else:
+                 logger.warning(f"Could not download: {doi}")
+                 return None, f"Could not download {doi}", f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>'
+
          except Exception as e:
              logger.error(f"Error processing {doi}: {e}")
              return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
 
+     def download_multiple_dois(self, dois_text):
+         """Downloads multiple papers from a list of DOIs"""
+         if not dois_text:
+             return None, "Error: No DOIs provided", "Error: No DOIs provided"
+
+         dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
+         if not dois:
+             return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
+
+         downloaded_files = []
+         failed_dois = []
+         downloaded_links = []
+         for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
+             filepath, success_message, fail_message = self.download_single_doi(doi)
+             if filepath:
+                 # Unique filename for zip
+                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
+                 filepath_unique = os.path.join(self.output_dir, filename)
+                 os.rename(filepath, filepath_unique)
+                 downloaded_files.append(filepath_unique)
+                 downloaded_links.append(f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+             else:
+                 failed_dois.append(f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
 
+         if downloaded_files:
+             zip_filename = 'papers.zip'
+             with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                 for file_path in downloaded_files:
+                     zipf.write(file_path, arcname=os.path.basename(file_path))
+             logger.info(f"ZIP file created: {zip_filename}")
+
+         return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
+
+     def process_bibtex(self, bib_file):
+         """Process BibTeX file and download papers with multiple strategies"""
          # Read BibTeX file content from the uploaded object
          try:
              with open(bib_file.name, 'r', encoding='utf-8') as f:
                  bib_content = f.read()
          except Exception as e:
              logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+             return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
 
          # Parse BibTeX data
          try:
              bib_database = bibtexparser.loads(bib_content)
          except Exception as e:
              logger.error(f"Error parsing BibTeX data: {e}")
+             return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
 
          # Extract DOIs
          dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
          logger.info(f"Found {len(dois)} DOIs to download")
 
+         # Result lists
          downloaded_files = []
          failed_dois = []
          downloaded_links = []
+
+         # Download PDFs
+         for doi in tqdm(dois, desc="Downloading papers"):
+             try:
+                 # Try to download with multiple methods with retries
+                 pdf_content = self.download_with_retry(doi)
+
+                 # Save PDF
+                 if pdf_content:
+                     if doi is None:
+                         return None, "Error: DOI not provided", "Error: DOI not provided", None
+                     filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                     filepath = os.path.join(self.output_dir, filename)
+
+                     with open(filepath, 'wb') as f:
+                         f.write(pdf_content)
+
+                     downloaded_files.append(filepath)
+                     downloaded_links.append(f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                     logger.info(f"Successfully downloaded: {filename}")
                  else:
+                     failed_dois.append(f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+             except Exception as e:
+                 failed_dois.append(f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                 logger.error(f"Error processing {doi}: {e}")
 
+         # Create ZIP of downloaded papers
          if downloaded_files:
              zip_filename = 'papers.zip'
+             with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                 for file_path in downloaded_files:
+                     zipf.write(file_path, arcname=os.path.basename(file_path))
              logger.info(f"ZIP file created: {zip_filename}")
+
+         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
+
+     async def process_bibtex_async(self, bib_file):
+         """Process BibTeX file and download papers with multiple strategies"""
+         # Read BibTeX file content from the uploaded object
+         try:
+             with open(bib_file.name, 'r', encoding='utf-8') as f:
+                 bib_content = f.read()
+         except Exception as e:
+             logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+             return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
+
+         # Parse BibTeX data
+         try:
+             bib_database = bibtexparser.loads(bib_content)
+         except Exception as e:
+             logger.error(f"Error parsing BibTeX data: {e}")
+             return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
+
+         # Extract DOIs
+         dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+         logger.info(f"Found {len(dois)} DOIs to download")
+
+         # Result lists
+         downloaded_files = []
+         failed_dois = []
+         downloaded_links = []
+
+         # Download PDFs
+         for doi in tqdm(dois, desc="Downloading papers"):
+             try:
+                 # Try to download with multiple methods with retries
+                 pdf_content = await self.download_with_retry_async(doi)
+
+                 # Save PDF
+                 if pdf_content:
+                     if doi is None:
+                         return None, "Error: DOI not provided", "Error: DOI not provided", None
+                     filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                     filepath = os.path.join(self.output_dir, filename)
+
+                     with open(filepath, 'wb') as f:
+                         f.write(pdf_content)
+
+                     downloaded_files.append(filepath)
+                     downloaded_links.append(f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                     logger.info(f"Successfully downloaded: {filename}")
+                 else:
+                     failed_dois.append(f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+
+             except Exception as e:
+                 failed_dois.append(f'<div style="display: flex; align-items: center;"><a href="https://doi.org/{doi}">{doi}</a> <button onclick="copyLink(this)">Copy</button></div>')
+                 logger.error(f"Error processing {doi}: {e}")
+
+         # Create ZIP of downloaded papers
+         if downloaded_files:
+             zip_filename = 'papers.zip'
+             with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                 for file_path in downloaded_files:
+                     zipf.write(file_path, arcname=os.path.basename(file_path))
+             logger.info(f"ZIP file created: {zip_filename}")
+
+         return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
 
  def create_gradio_interface():
      """Create Gradio interface for Paper Downloader"""
      downloader = PaperDownloader()
 
      async def download_papers(bib_file, doi_input, dois_input):
+         if bib_file:
+             # Check file type
+             if not bib_file.name.lower().endswith('.bib'):
+                 return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
+
+             zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file)
+             return zip_path, downloaded_dois, failed_dois, None
+         elif doi_input:
+             filepath, message, failed_doi = downloader.download_single_doi(doi_input)
+             return None, message, failed_doi, filepath
+         elif dois_input:
+             zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
+             return zip_path, downloaded_dois, failed_dois, None
+         else:
+             return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
+
+     # Gradio Interface
+     interface = gr.Interface(
+         fn=download_papers,
+         inputs=[
+             gr.File(file_types=['.bib'], label="Upload BibTeX File"),
+             gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
+             gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
+         ],
+         outputs=[
+             gr.File(label="Download Papers (ZIP) or Single PDF"),
+             gr.HTML(label="""
+                 <div style='padding-bottom: 5px; font-weight: bold;'>
+                     Found DOIs
+                 </div>
+                 <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
+                     <div id="downloaded-dois"></div>
+                 </div>
+                 """),
+             gr.HTML(label="""
+                 <div style='padding-bottom: 5px; font-weight: bold;'>
+                     Missed DOIs
+                 </div>
+                 <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
+                     <div id="failed-dois"></div>
+                 </div>
+                 """),
+             gr.File(label="Downloaded Single PDF")
+         ],
+         title="🔬 Academic Paper Batch Downloader",
+         description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
+         theme="Hev832/Applio",
+         examples=[
+             ["example.bib", None, None],  # Bibtex File
+             [None, "10.1038/nature12373", None],  # Single DOI
+             [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"],  # Multiple DOIs
+         ],
+         css="""
      .gradio-container {
          background-color: black;
      }
      .output-text a {
          color: #007bff; /* Blue color for hyperlinks */
      }
+         """,
+         cache_examples=False,
+     )
+
+     # Add Javascript to update HTML
+     interface.load = """
+     function(downloaded_dois, failed_dois) {
+         let downloaded_html = '';
+         downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
+             downloaded_html += doi + '<br>';
+         });
+         document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
+
+         let failed_html = '';
+         failed_dois.split('\\n').filter(Boolean).forEach(doi => {
+             failed_html += doi + '<br>';
+         });
+         document.querySelector("#failed-dois").innerHTML = failed_html;
+         return [downloaded_html, failed_html];
+     }
+     """
+
+     interface.head = """
+     <script>
+     function copyLink(button) {
+         const linkElement = button.previousElementSibling;
+         const link = linkElement.href;
+         navigator.clipboard.writeText(link)
+             .then(() => {
+                 button.innerText = 'Copied!';
+                 setTimeout(() => {
+                     button.innerText = 'Copy';
+                 }, 2000);
+             })
+             .catch(err => {
+                 console.error('Failed to copy link: ', err);
+             });
+     }
+     </script>
+     """
 
      return interface
 
+
  def main():
      interface = create_gradio_interface()
      interface.launch(share=True)
 
+
  if __name__ == "__main__":
      main()
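
A minimal usage sketch (not part of this commit): the synchronous helpers added in this version can also be driven from a plain Python script instead of the Gradio UI. The import path and the example DOI below are assumptions for illustration only.

# Hypothetical standalone use of the PaperDownloader class defined in app.py.
from app import PaperDownloader  # assumption: app.py is importable as a module

downloader = PaperDownloader(output_dir='papers')

# download_single_doi returns (filepath, success_html, failure_html)
filepath, success_html, failure_html = downloader.download_single_doi("10.1038/nature12373")
if filepath:
    print(f"Saved PDF to {filepath}")
else:
    print(f"Download failed: {failure_html}")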