C2MV committed on
Commit 401d3db
1 Parent(s): bf37274

Update app.py

Files changed (1)
  1. app.py +76 -139
app.py CHANGED
@@ -10,10 +10,6 @@ from urllib.parse import quote, urlencode
 import gradio as gr
 from bs4 import BeautifulSoup
 import io
-from docx import Document
-from docx.shared import Inches
-from docx.enum.text import WD_ALIGN_PARAGRAPH
-
 
 # Configure logging
 logging.basicConfig(level=logging.INFO,
@@ -219,42 +215,42 @@ class PaperDownloader:
     def download_single_doi(self, doi):
         """Downloads a single paper using a DOI"""
         if not doi:
-            return None, "Error: DOI not provided", "Error: DOI not provided", None, None
+            return None, "Error: DOI not provided", "Error: DOI not provided"
 
         try:
             pdf_content = self.download_with_retry(doi)
 
             if pdf_content:
                 if doi is None:
-                    return None, "Error: DOI not provided", "", None, None
+                    return None, "Error: DOI not provided", "Error: DOI not provided"
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
                 filepath = os.path.join(self.output_dir, filename)
                 with open(filepath, 'wb') as f:
                     f.write(pdf_content)
                 logger.info(f"Successfully downloaded: {filename}")
-                return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', "", None
+                return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
             else:
                 logger.warning(f"Could not download: {doi}")
-                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>', None, None
+                return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
 
         except Exception as e:
             logger.error(f"Error processing {doi}: {e}")
-            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}", None, None
+            return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
 
     def download_multiple_dois(self, dois_text):
         """Downloads multiple papers from a list of DOIs"""
         if not dois_text:
-            return None, "Error: No DOIs provided", "Error: No DOIs provided", None, None
+            return None, "Error: No DOIs provided", "Error: No DOIs provided"
 
         dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
         if not dois:
-            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided", None, None
+            return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
 
         downloaded_files = []
         failed_dois = []
         downloaded_links = []
         for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
-            filepath, success_message, fail_message, _ = self.download_single_doi(doi)
+            filepath, success_message, fail_message = self.download_single_doi(doi)
             if filepath:
                 # Unique filename for zip
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
@@ -273,7 +269,7 @@ class PaperDownloader:
                 zipf.write(file_path, arcname=os.path.basename(file_path))
         logger.info(f"ZIP file created: {zip_filename}")
 
-        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois), None, None
+        return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
 
 
     def process_bibtex(self, bib_file):
@@ -284,14 +280,14 @@ class PaperDownloader:
                 bib_content = f.read()
         except Exception as e:
             logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
-            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None, None
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
 
         # Parse BibTeX data
         try:
             bib_database = bibtexparser.loads(bib_content)
         except Exception as e:
             logger.error(f"Error parsing BibTeX data: {e}")
-            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None, None
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
 
         # Extract DOIs
         dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
@@ -311,7 +307,7 @@ class PaperDownloader:
             # Save PDF
             if pdf_content:
                 if doi is None:
-                    return None, "Error: DOI not provided", "", None, None
+                    return None, "Error: DOI not provided", "Error: DOI not provided", None
                 filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
                 filepath = os.path.join(self.output_dir, filename)
 
@@ -336,41 +332,7 @@ class PaperDownloader:
                 zipf.write(file_path, arcname=os.path.basename(file_path))
         logger.info(f"ZIP file created: {zip_filename}")
 
-        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None, None
-
-    def create_report_docx(self, downloaded_dois, failed_dois):
-        """Creates a Word document report of downloaded and failed DOIs."""
-        document = Document()
-
-        # Add a title
-        title_paragraph = document.add_paragraph()
-        title_run = title_paragraph.add_run("DOI Download Report")
-        title_run.bold = True
-        title_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
-
-        # Add downloaded DOIs
-        document.add_heading('Downloaded DOIs', level=2)
-        if downloaded_dois:
-            for doi in downloaded_dois.split('\n'):
-                if doi:
-                    document.add_paragraph(doi)
-        else:
-            document.add_paragraph("No DOIs were successfully downloaded.")
-
-
-        # Add failed DOIs
-        document.add_heading('Failed DOIs', level=2)
-        if failed_dois:
-            for doi in failed_dois.split('\n'):
-                if doi:
-                    document.add_paragraph(doi)
-        else:
-            document.add_paragraph("No DOIs failed to download.")
-
-
-        report_path = "doi_report.docx"
-        document.save(report_path)
-        return report_path
+        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
 
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
@@ -380,42 +342,31 @@ def create_gradio_interface():
         if bib_file:
             # Check file type
             if not bib_file.name.lower().endswith('.bib'):
-                return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None, None
+                return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
 
-            zip_path, downloaded_dois, failed_dois, _ , _= downloader.process_bibtex(bib_file)
-            return zip_path, downloaded_dois, failed_dois, None, None
+            zip_path, downloaded_dois, failed_dois, _ = downloader.process_bibtex(bib_file)
+            return zip_path, downloaded_dois, failed_dois, None
         elif doi_input:
-            filepath, message, failed_doi, _ = downloader.download_single_doi(doi_input)
-            return None, message, failed_doi, filepath, None
+            filepath, message, failed_doi = downloader.download_single_doi(doi_input)
+            return None, message, failed_doi, filepath
         elif dois_input:
-            zip_path, downloaded_dois, failed_dois, _ , _= downloader.download_multiple_dois(dois_input)
-            return zip_path, downloaded_dois, failed_dois, None, None
+            zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
+            return zip_path, downloaded_dois, failed_dois, None
         else:
-            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None, None
-
-    def create_report(downloaded_dois_html, failed_dois_html):
-        """Creates a report and returns the report path."""
-        downloaded_dois = ""
-        failed_dois = ""
-
-        # Extract text content from HTML elements
-        if downloaded_dois_html:
-            soup = BeautifulSoup(downloaded_dois_html, 'html.parser')
-            downloaded_dois_items = [li.text for li in soup.find_all('li')]
-            downloaded_dois = "\n".join(downloaded_dois_items)
-
-        if failed_dois_html:
-            soup = BeautifulSoup(failed_dois_html, 'html.parser')
-            failed_dois_items = [li.text for li in soup.find_all('li')]
-            failed_dois = "\n".join(failed_dois_items)
-
-        if downloaded_dois or failed_dois:
-            report_path = downloader.create_report_docx(downloaded_dois, failed_dois)
-            return report_path
-        return None
-
-
-    downloaded_dois_html = gr.HTML(label="""
+            return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
+
+
+    # Gradio Interface
+    interface = gr.Interface(
+        fn=download_papers,
+        inputs=[
+            gr.File(file_types=['.bib'], label="Upload BibTeX File"),
+            gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
+            gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
+        ],
+        outputs=[
+            gr.File(label="Download Papers (ZIP) or Single PDF"),
+            gr.HTML(label="""
             <div style='padding-bottom: 5px; font-weight: bold;'>
                 Enter Single DOI
             </div>
@@ -425,76 +376,62 @@ def create_gradio_interface():
             </div>
             <div id="downloaded-dois"></div>
         </div>
-    """)
-    failed_dois_html = gr.HTML(label="""
+            """),
+            gr.HTML(label="""
         <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
             <div style='padding-bottom: 5px; font-weight: bold;'>
                 Failed DOIs
             </div>
             <div id="failed-dois"></div>
         </div>
-    """)
-
-    with gr.Blocks(theme="Hev832/Applio") as interface:
-        gr.Markdown("""# 🔬 Academic Paper Batch Downloader""")
-        gr.Markdown("Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.")
-        with gr.Row():
-            with gr.Column():
-                bib_file_input = gr.File(file_types=['.bib'], label="Upload BibTeX File")
-                doi_input = gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx")
-                dois_input = gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n...")
-
-                with gr.Row():
-                    clear_button = gr.Button("Clear")
-                    submit_button = gr.Button("Submit")
-
-                with gr.Accordion("Examples"):
-                    gr.Examples(
-                        examples=[
-                            ["example.bib", None, None], # Bibtex File
-                            [None, "10.1038/nature12373", None], # Single DOI
-                            [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
-                        ],
-                        inputs=[bib_file_input, doi_input, dois_input]
-                    )
-
-            with gr.Column():
-                file_output = gr.File(label="Download Papers (ZIP) or Single PDF")
-                downloaded_dois_html
-                failed_dois_html
-                single_pdf_output = gr.File(label="Downloaded Single PDF")
-                with gr.Row():
-                    report_button = gr.Button("Create Report")
-                    report_output = gr.File(label="Download Report")
-
-    interface.load = """
+            """),
+            gr.File(label="Downloaded Single PDF")
+        ],
+        title="🔬 Academic Paper Batch Downloader",
+        description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
+        theme="Hev832/Applio",
+        examples=[
+            ["example.bib", None, None], # Bibtex File
+            [None, "10.1038/nature12373", None], # Single DOI
+            [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
+        ],
+        css="""
+        .gradio-container {
+            background-color: black;
+        }
+        .gr-interface {
+            max-width: 800px;
+            margin: 0 auto;
+        }
+        .gr-box {
+            background-color: black;
+            border-radius: 10px;
+            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+        }
+        .output-text a {
+            color: #007bff; /* Blue color for hyperlinks */
+        }
+        """,
+        cache_examples = False,
+    )
+
+    # Add Javascript to update HTML
+    interface.load = """
     function(downloaded_dois, failed_dois){
-        let downloaded_html = '<ul>';
+        let downloaded_html = '';
         downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
-            downloaded_html += '<li>' + doi + '</li>';
+            downloaded_html += doi + '<br>';
         });
-        downloaded_html += '</ul>';
         document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
-
-        let failed_html = '<ul>';
+
+        let failed_html = '';
         failed_dois.split('\\n').filter(Boolean).forEach(doi => {
-            failed_html += '<li>' + doi + '</li>';
+            failed_html += doi + '<br>';
         });
-        failed_html += '</ul>';
         document.querySelector("#failed-dois").innerHTML = failed_html;
         return [downloaded_html, failed_html];
-
     }
-    """
-
-    submit_button.click(
-        download_papers,
-        inputs=[bib_file_input, doi_input, dois_input],
-        outputs=[file_output, downloaded_dois_html, failed_dois_html, single_pdf_output],
-    )
-
-    report_button.click(create_report, inputs = [downloaded_dois_html,failed_dois_html], outputs=report_output)
-
+    """
     return interface
 
 def main():
@@ -502,4 +439,4 @@ def main():
     interface.launch(share=True)
 
 if __name__ == "__main__":
-    main()
+    main()
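
For reference, the rewiring above relies on a single handler whose returned tuple maps positionally onto the outputs list of gr.Interface; that is why the commit also trims the helper return tuples (download_single_doi now returns three values, and download_papers returns exactly four in every branch). Below is a minimal, self-contained sketch of that pattern, not part of the commit: the handler body is placeholder logic and the labels are copied from the diff for illustration only.

import gradio as gr

def download_papers(bib_file, doi_input, dois_input):
    # Placeholder logic (assumption, not the committed implementation).
    # The real handler returns (zip_path, downloaded_dois_html, failed_dois_html, single_pdf_path).
    if doi_input:
        return None, f'<a href="https://doi.org/{doi_input}">{doi_input}</a>', "", None
    return None, "", "No input provided", None

demo = gr.Interface(
    fn=download_papers,
    inputs=[
        gr.File(label="Upload BibTeX File"),
        gr.Textbox(label="Enter Single DOI"),
        gr.Textbox(label="Enter Multiple DOIs (one per line)"),
    ],
    outputs=[
        gr.File(label="Download Papers (ZIP) or Single PDF"),  # 1st returned value
        gr.HTML(label="Downloaded DOIs"),                      # 2nd returned value
        gr.HTML(label="Failed DOIs"),                          # 3rd returned value
        gr.File(label="Downloaded Single PDF"),                # 4th returned value
    ],
)

if __name__ == "__main__":
    demo.launch()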