C2MV committed on
Commit
449d754
1 Parent(s): 2d4634b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -215
app.py CHANGED
@@ -10,6 +10,10 @@ from urllib.parse import quote, urlencode
10
  import gradio as gr
11
  from bs4 import BeautifulSoup
12
  import io
 
 
 
 
13
 
14
  # Configure logging
15
  logging.basicConfig(level=logging.INFO,
@@ -215,241 +219,51 @@ class PaperDownloader:
215
  def download_single_doi(self, doi):
216
  """Downloads a single paper using a DOI"""
217
  if not doi:
218
- return None, "Error: DOI not provided", "Error: DOI not provided"
219
 
220
  try:
221
  pdf_content = self.download_with_retry(doi)
222
 
223
  if pdf_content:
224
  if doi is None:
225
- return None, "Error: DOI not provided", "Error: DOI not provided"
226
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
227
  filepath = os.path.join(self.output_dir, filename)
228
  with open(filepath, 'wb') as f:
229
  f.write(pdf_content)
230
  logger.info(f"Successfully downloaded: {filename}")
231
- return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', ""
232
  else:
233
  logger.warning(f"Could not download: {doi}")
234
- return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>'
235
 
236
  except Exception as e:
237
  logger.error(f"Error processing {doi}: {e}")
238
- return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}"
239
 
240
- def download_multiple_dois(self, dois_text):
241
- """Downloads multiple papers from a list of DOIs"""
242
- if not dois_text:
243
- return None, "Error: No DOIs provided", "Error: No DOIs provided"
244
-
245
- dois = [doi.strip() for doi in dois_text.split('\n') if doi.strip()]
246
- if not dois:
247
- return None, "Error: No valid DOIs provided", "Error: No valid DOIs provided"
248
-
249
- downloaded_files = []
250
- failed_dois = []
251
- downloaded_links = []
252
- for i, doi in enumerate(tqdm(dois, desc="Downloading papers")):
253
- filepath, success_message, fail_message = self.download_single_doi(doi)
254
- if filepath:
255
- # Unique filename for zip
256
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}_{i}.pdf"
257
- filepath_unique = os.path.join(self.output_dir, filename)
258
- os.rename(filepath,filepath_unique)
259
- downloaded_files.append(filepath_unique)
260
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
261
-
262
- else:
263
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
264
 
265
- if downloaded_files:
266
- zip_filename = 'papers.zip'
267
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
268
- for file_path in downloaded_files:
269
- zipf.write(file_path, arcname=os.path.basename(file_path))
270
- logger.info(f"ZIP file created: {zip_filename}")
271
 
272
- return zip_filename if downloaded_files else None, "\n".join(downloaded_links), "\n".join(failed_dois)
 
273
 
 
 
 
 
 
 
274
 
275
- def process_bibtex(self, bib_file):
276
- """Process BibTeX file and download papers with multiple strategies"""
277
- # Read BibTeX file content from the uploaded object
278
- try:
279
- with open(bib_file.name, 'r', encoding='utf-8') as f:
280
- bib_content = f.read()
281
- except Exception as e:
282
- logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
283
- return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
284
-
285
- # Parse BibTeX data
286
- try:
287
- bib_database = bibtexparser.loads(bib_content)
288
- except Exception as e:
289
- logger.error(f"Error parsing BibTeX data: {e}")
290
- return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
291
-
292
- # Extract DOIs
293
- dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
294
- logger.info(f"Found {len(dois)} DOIs to download")
295
-
296
- # Result lists
297
- downloaded_files = []
298
- failed_dois = []
299
- downloaded_links = []
300
-
301
- # Download PDFs
302
- for doi in tqdm(dois, desc="Downloading papers"):
303
- try:
304
- # Try to download with multiple methods with retries
305
- pdf_content = self.download_with_retry(doi)
306
-
307
- # Save PDF
308
- if pdf_content:
309
- if doi is None:
310
- return None, "Error: DOI not provided", "Error: DOI not provided", None
311
- filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
312
- filepath = os.path.join(self.output_dir, filename)
313
-
314
- with open(filepath, 'wb') as f:
315
- f.write(pdf_content)
316
-
317
- downloaded_files.append(filepath)
318
- downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
319
- logger.info(f"Successfully downloaded: {filename}")
320
- else:
321
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
322
-
323
- except Exception as e:
324
- failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
325
- logger.error(f"Error processing {doi}: {e}")
326
-
327
- # Create ZIP of downloaded papers
328
- if downloaded_files:
329
- zip_filename = 'papers.zip'
330
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
331
- for file_path in downloaded_files:
332
- zipf.write(file_path, arcname=os.path.basename(file_path))
333
- logger.info(f"ZIP file created: {zip_filename}")
334
-
335
- return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
336
-
337
- def create_gradio_interface():
338
- """Create Gradio interface for Paper Downloader"""
339
- downloader = PaperDownloader()
340
-
341
- def download_papers(bib_file, doi_input, dois_input, theme_mode):
342
- if theme_mode == "dark":
343
- theme = gr.themes.Monochrome(
344
- primary_hue="indigo",
345
- secondary_hue="neutral",
346
- )
347
-
348
- else:
349
- theme = "gstaff/sketch"
350
-
351
- if bib_file:
352
- # Check file type
353
- if not bib_file.name.lower().endswith('.bib'):
354
- return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
355
-
356
- zip_path, downloaded_dois, failed_dois, _ = downloader.process_bibtex(bib_file)
357
- return zip_path, downloaded_dois, failed_dois, None
358
- elif doi_input:
359
- filepath, message, failed_doi = downloader.download_single_doi(doi_input)
360
- return None, message, failed_doi, filepath
361
- elif dois_input:
362
- zip_path, downloaded_dois, failed_dois = downloader.download_multiple_dois(dois_input)
363
- return zip_path, downloaded_dois, failed_dois, None
364
- else:
365
- return None, "Please provide a .bib file, a single DOI, or a list of DOIs", "Please provide a .bib file, a single DOI, or a list of DOIs", None
366
-
367
-
368
- # Gradio Interface
369
- interface = gr.Interface(
370
- fn=download_papers,
371
- inputs=[
372
- gr.File(file_types=['.bib'], label="Upload BibTeX File"),
373
- gr.Textbox(label="Enter Single DOI", placeholder="10.xxxx/xxxx"),
374
- gr.Textbox(label="Enter Multiple DOIs (one per line)", placeholder="10.xxxx/xxxx\n10.yyyy/yyyy\n..."),
375
- gr.Dropdown(choices=["light", "dark"], value="light", label="Theme mode"),
376
- ],
377
- outputs=[
378
- gr.File(label="Download Papers (ZIP) or Single PDF"),
379
- gr.HTML(label="""
380
- <div style='padding-bottom: 5px; font-weight: bold;'>
381
- Enter Single DOI
382
- </div>
383
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
384
- <div style='padding-bottom: 5px; font-weight: bold;'>
385
- Downloaded DOIs
386
- </div>
387
- <div id="downloaded-dois"></div>
388
- </div>
389
- """),
390
- gr.HTML(label="""
391
- <div style='border: 1px solid #ddd; padding: 5px; border-radius: 5px;'>
392
- <div style='padding-bottom: 5px; font-weight: bold;'>
393
- Failed DOIs
394
- </div>
395
- <div id="failed-dois"></div>
396
- </div>
397
- """),
398
- gr.File(label="Downloaded Single PDF")
399
- ],
400
- title="🔬 Academic Paper Batch Downloader",
401
- description="Upload a BibTeX file or enter DOIs to download PDFs. We'll attempt to fetch PDFs from multiple sources like Sci-Hub, Libgen, Google Scholar and Crossref. You can use any of the three inputs at any moment.",
402
- theme="gstaff/sketch",
403
- examples=[
404
- ["example.bib", None, None], # Bibtex File
405
- [None, "10.1038/nature12373", None], # Single DOI
406
- [None, None, "10.1109/5.771073\n10.3390/horticulturae8080677"], # Multiple DOIs
407
- ],
408
- css="""
409
- .gradio-container {
410
- background-color: #222222;
411
- }
412
- .gr-interface {
413
- max-width: 800px;
414
- margin: 0 auto;
415
- }
416
- .gr-box {
417
- background-color: white;
418
- border-radius: 10px;
419
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
420
- }
421
- .output-text a {
422
- color: #007bff; /* Blue color for hyperlinks */
423
- }
424
- """,
425
- cache_examples = False,
426
- )
427
-
428
- # Add Javascript to update HTML
429
- interface.load = """
430
- function(downloaded_dois, failed_dois){
431
- let downloaded_html = '<ul>';
432
- downloaded_dois.split('\\n').filter(Boolean).forEach(doi => {
433
- downloaded_html += '<li>' + doi + '</li>';
434
- });
435
- downloaded_html += '</ul>';
436
- document.querySelector("#downloaded-dois").innerHTML = downloaded_html;
437
-
438
- let failed_html = '<ul>';
439
- failed_dois.split('\\n').filter(Boolean).forEach(doi => {
440
- failed_html += '<li>' + doi + '</li>';
441
- });
442
- failed_html += '</ul>';
443
- document.querySelector("#failed-dois").innerHTML = failed_html;
444
- return [downloaded_html, failed_html];
445
-
446
- }
447
- """
448
- return interface
449
 
450
- def main():
451
- interface = create_gradio_interface()
452
- interface.launch(share=True)
 
 
 
 
 
 
 
 
453
 
454
- if __name__ == "__main__":
455
- main()
 
10
  import gradio as gr
11
  from bs4 import BeautifulSoup
12
  import io
13
+ from docx import Document
14
+ from docx.shared import Inches
15
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
16
+
17
 
18
  # Configure logging
19
  logging.basicConfig(level=logging.INFO,
 
219
  def download_single_doi(self, doi):
220
  """Downloads a single paper using a DOI"""
221
  if not doi:
222
+ return None, "Error: DOI not provided", "Error: DOI not provided", None, None
223
 
224
  try:
225
  pdf_content = self.download_with_retry(doi)
226
 
227
  if pdf_content:
228
  if doi is None:
229
+ return None, "Error: DOI not provided", "", None, None
230
  filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
231
  filepath = os.path.join(self.output_dir, filename)
232
  with open(filepath, 'wb') as f:
233
  f.write(pdf_content)
234
  logger.info(f"Successfully downloaded: {filename}")
235
+ return filepath, f'<a href="https://doi.org/{doi}">{doi}</a>', "", None
236
  else:
237
  logger.warning(f"Could not download: {doi}")
238
+ return None, f"Could not download {doi}", f'<a href="https://doi.org/{doi}">{doi}</a>', None, None
239
 
240
  except Exception as e:
241
  logger.error(f"Error processing {doi}: {e}")
242
+ return None, f"Error processing {doi}: {e}", f"Error processing {doi}: {e}", None, None
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
 
 
 
 
 
 
245
 
246
+ # Create PaperDownloader instance
247
+ downloader = PaperDownloader()
248
 
249
+ def download_doi_callback(doi):
250
+ filepath, success_message, fail_message, _, _ = downloader.download_single_doi(doi)
251
+ if filepath:
252
+ return filepath, success_message, fail_message
253
+ else:
254
+ return None, fail_message, success_message
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
+ # Test or connect to a Gradio interface
258
+ iface = gr.Interface(
259
+ fn=download_doi_callback,
260
+ inputs=gr.Textbox(label="Enter DOI", placeholder="Enter DOI to download"),
261
+ outputs=[
262
+ gr.File(label="Download Paper"),
263
+ gr.HTML(label="Success Message"),
264
+ gr.HTML(label="Failure Message")
265
+ ],
266
+ live=True,
267
+ )
268
 
269
+ iface.launch()