C2MV committed
Commit 63912a7 · verified · 1 Parent(s): 62e450c

Update app.py

Files changed (1)
  1. app.py +241 -6
app.py CHANGED
@@ -10,6 +10,8 @@ from urllib.parse import quote, urlencode
 import gradio as gr
 from bs4 import BeautifulSoup
 import io
+import asyncio
+import aiohttp
 
 # Configure logging
 logging.basicConfig(level=logging.INFO,
@@ -35,15 +37,186 @@ class PaperDownloader:
         # Request headers
         self.headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
         }
-
+
     def clean_doi(self, doi):
         """Clean and encode DOI for URL"""
         if not isinstance(doi, str):
             return None
         return quote(doi.strip()) if doi else None
+
+    async def fetch_with_headers(self, session, url, timeout=10):
+        """Utility method to fetch an URL with headers and timeout"""
+        try:
+            async with session.get(url, headers=self.headers, timeout=timeout) as response:
+                response.raise_for_status()
+                return await response.text(), response.headers
+        except Exception as e:
+            logger.debug(f"Error fetching {url}: {e}")
+            return None, None
+
+    async def download_paper_scihub_async(self, session, doi):
+        """Improved method to download paper from Sci-Hub using async requests"""
+        if not doi:
+            logger.warning("DOI not provided")
+            return None
+
+        for base_url in self.download_sources:
+            try:
+                scihub_url = f"{base_url}{self.clean_doi(doi)}"
+                text, headers = await self.fetch_with_headers(session, scihub_url, timeout=15)
+                if not text:
+                    continue
+
+                # Search for multiple PDF URL patterns
+                pdf_patterns = [
+                    r'(https?://[^\s<>"]+?\.pdf)',
+                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                ]
+
+                pdf_urls = []
+                for pattern in pdf_patterns:
+                    pdf_urls.extend(re.findall(pattern, text))
+
+                # Try downloading from found URLs
+                for pdf_url in pdf_urls:
+                    try:
+                        pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                        # Verify if it's a PDF
+                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                            logger.debug(f"Found PDF from: {pdf_url}")
+                            return await pdf_response.read()
+                    except Exception as e:
+                        logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+
+            except Exception as e:
+                logger.debug(f"Error trying to download {doi} from {base_url}: {e}")
+
+        return None
+
+    async def download_paper_libgen_async(self, session, doi):
+        """Download from Libgen, handles the query and the redirection"""
+        if not doi:
+            return None
+
+        base_url = 'https://libgen.rs/scimag/'
+        try:
+            search_url = f"{base_url}?q={self.clean_doi(doi)}"
+            text, headers = await self.fetch_with_headers(session, search_url, timeout=10)
+
+            if not text or "No results" in text:
+                logger.debug(f"No results for DOI: {doi} on libgen")
+                return None
+
+            soup = BeautifulSoup(text, 'html.parser')
+
+            links = soup.select('table.c > tbody > tr:nth-child(2) > td:nth-child(1) > a')
+
+            if links:
+                link = links[0]
+                pdf_url = link['href']
+                pdf_response = await session.get(pdf_url, headers=self.headers, allow_redirects=True, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return await pdf_response.read()
+        except Exception as e:
+            logger.debug(f"Error trying to download {doi} from libgen: {e}")
+        return None
+
+    async def download_paper_google_scholar_async(self, session, doi):
+        """Search google scholar to find an article with the given doi, try to get the pdf"""
+        if not doi:
+            return None
+
+        try:
+
+            query = f'doi:"{doi}"'
+            params = {'q': query}
+            url = f'https://scholar.google.com/scholar?{urlencode(params)}'
+
+            text, headers = await self.fetch_with_headers(session, url, timeout = 10)
+            if not text:
+                return None
+
+            soup = BeautifulSoup(text, 'html.parser')
+
+            # Find any links with [PDF]
+            links = soup.find_all('a', string=re.compile(r'\[PDF\]', re.IGNORECASE))
+
+            if links:
+                pdf_url = links[0]['href']
+                pdf_response = await session.get(pdf_url, headers = self.headers, timeout=10)
+                if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                    logger.debug(f"Found PDF from: {pdf_url}")
+                    return await pdf_response.read()
+        except Exception as e:
+            logger.debug(f"Google Scholar error for {doi}: {e}")
+
+        return None
+
+    async def download_paper_crossref_async(self, session, doi):
+        """Alternative search method using Crossref"""
+        if not doi:
+            return None
+
+        try:
+            # Search for open access link
+            url = f"https://api.crossref.org/works/{doi}"
+            response = await session.get(url, headers=self.headers, timeout=10)
+
+            if response.status == 200:
+                data = await response.json()
+                work = data.get('message', {})
+
+                # Search for open access links
+                links = work.get('link', [])
+                for link in links:
+                    if link.get('content-type') == 'application/pdf':
+                        pdf_url = link.get('URL')
+                        if pdf_url:
+                            pdf_response = await session.get(pdf_url, headers = self.headers)
+                            if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                                logger.debug(f"Found PDF from: {pdf_url}")
+                                return await pdf_response.read()
+
+        except Exception as e:
+            logger.debug(f"Crossref error for {doi}: {e}")
+
+        return None
+
+
+    async def download_with_retry_async(self, doi, max_retries=3, initial_delay=2):
+        """Downloads a paper using multiple strategies with exponential backoff and async requests"""
+        pdf_content = None
+        retries = 0
+        delay = initial_delay
+
+        async with aiohttp.ClientSession() as session:
+            while retries < max_retries and not pdf_content:
+                try:
+                    pdf_content = (
+                        await self.download_paper_scihub_async(session, doi) or
+                        await self.download_paper_libgen_async(session, doi) or
+                        await self.download_paper_google_scholar_async(session, doi) or
+                        await self.download_paper_crossref_async(session, doi)
+
+                    )
+                    if pdf_content:
+                        return pdf_content
+                except Exception as e:
+                    logger.error(f"Error in download attempt {retries + 1} for DOI {doi}: {e}")
+
+                if not pdf_content:
+                    retries += 1
+                    logger.warning(f"Retry attempt {retries} for DOI: {doi} after {delay} seconds")
+                    await asyncio.sleep(delay)
+                    delay *= 2 # Exponential backoff
+
+        return None
 
     def download_paper_scihub(self, doi):
         """Improved method to download paper from Sci-Hub"""
         if not doi:
@@ -299,7 +472,7 @@ class PaperDownloader:
         downloaded_links = []
 
         # Download PDFs
-        for doi in tqdm(dois, desc="Downloading papers"):
+        for doi in tqdm(dois, desc="Downloading papers")):
            try:
                # Try to download with multiple methods with retries
                pdf_content = self.download_with_retry(doi)
@@ -334,17 +507,79 @@ class PaperDownloader:
 
        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
 
+    async def process_bibtex_async(self, bib_file):
+        """Process BibTeX file and download papers with multiple strategies"""
+        # Read BibTeX file content from the uploaded object
+        try:
+            with open(bib_file.name, 'r', encoding='utf-8') as f:
+                bib_content = f.read()
+        except Exception as e:
+            logger.error(f"Error reading uploaded file {bib_file.name}: {e}")
+            return None, f"Error reading uploaded file {bib_file.name}: {e}", f"Error reading uploaded file {bib_file.name}: {e}", None
+
+        # Parse BibTeX data
+        try:
+            bib_database = bibtexparser.loads(bib_content)
+        except Exception as e:
+            logger.error(f"Error parsing BibTeX data: {e}")
+            return None, f"Error parsing BibTeX data: {e}", f"Error parsing BibTeX data: {e}", None
+
+        # Extract DOIs
+        dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
+        logger.info(f"Found {len(dois)} DOIs to download")
+
+        # Result lists
+        downloaded_files = []
+        failed_dois = []
+        downloaded_links = []
+
+        # Download PDFs
+        for doi in tqdm(dois, desc="Downloading papers"):
+            try:
+                # Try to download with multiple methods with retries
+                pdf_content = await self.download_with_retry_async(doi)
+
+                # Save PDF
+                if pdf_content:
+                    if doi is None:
+                        return None, "Error: DOI not provided", "Error: DOI not provided", None
+                    filename = f"{str(doi).replace('/', '_').replace('.', '_')}.pdf"
+                    filepath = os.path.join(self.output_dir, filename)
+
+                    with open(filepath, 'wb') as f:
+                        f.write(pdf_content)
+
+                    downloaded_files.append(filepath)
+                    downloaded_links.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+                    logger.info(f"Successfully downloaded: {filename}")
+                else:
+                    failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+
+            except Exception as e:
+                failed_dois.append(f'<a href="https://doi.org/{doi}">{doi}</a>')
+                logger.error(f"Error processing {doi}: {e}")
+
+        # Create ZIP of downloaded papers
+        if downloaded_files:
+            zip_filename = 'papers.zip'
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file_path in downloaded_files:
+                    zipf.write(file_path, arcname=os.path.basename(file_path))
+            logger.info(f"ZIP file created: {zip_filename}")
+
+        return zip_filename, "\n".join(downloaded_links), "\n".join(failed_dois), None
+
 def create_gradio_interface():
     """Create Gradio interface for Paper Downloader"""
     downloader = PaperDownloader()
 
-    def download_papers(bib_file, doi_input, dois_input):
+    async def download_papers(bib_file, doi_input, dois_input):
        if bib_file:
            # Check file type
            if not bib_file.name.lower().endswith('.bib'):
                return None, "Error: Please upload a .bib file", "Error: Please upload a .bib file", None
 
-            zip_path, downloaded_dois, failed_dois, _ = downloader.process_bibtex(bib_file)
+            zip_path, downloaded_dois, failed_dois, _ = await downloader.process_bibtex_async(bib_file)
            return zip_path, downloaded_dois, failed_dois, None
        elif doi_input:
            filepath, message, failed_doi = downloader.download_single_doi(doi_input)
@@ -436,4 +671,4 @@ def main():
     interface.launch(share=True)
 
 if __name__ == "__main__":
-    main()
+    main()
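
For context on how the new pieces fit together: download_with_retry_async opens a single aiohttp.ClientSession, tries the Sci-Hub, Libgen, Google Scholar and Crossref strategies in order, and sleeps with an exponentially growing delay between rounds. Below is a minimal, self-contained sketch of that same pattern, not code from app.py; the fetch_pdf helper and the candidate URL are invented for illustration.

# Standalone sketch of the retry-with-backoff pattern introduced in this commit.
# fetch_pdf and the candidate URL are hypothetical stand-ins, not app.py code.
import asyncio
import aiohttp

async def fetch_pdf(session, url):
    """Return the response body if the server reports a PDF, otherwise None."""
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            if 'application/pdf' in resp.headers.get('Content-Type', ''):
                return await resp.read()
    except Exception:
        pass
    return None

async def download_with_retry(urls, max_retries=3, initial_delay=2):
    """Try each candidate URL per round, backing off exponentially between rounds."""
    delay = initial_delay
    async with aiohttp.ClientSession() as session:
        for _ in range(max_retries):
            for url in urls:
                pdf = await fetch_pdf(session, url)
                if pdf:
                    return pdf
            await asyncio.sleep(delay)
            delay *= 2  # exponential backoff, as in download_with_retry_async
    return None

if __name__ == "__main__":
    candidates = ["https://example.org/sample.pdf"]  # hypothetical link for illustration
    pdf_bytes = asyncio.run(download_with_retry(candidates))
    print(f"Got {len(pdf_bytes)} bytes" if pdf_bytes else "No PDF found")

Like the committed method, the sketch keeps one session open across all attempts, presumably so connection pooling is reused rather than re-established on every retry.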
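
The new process_bibtex_async relies on bibtexparser to pull DOIs out of the uploaded .bib file before handing them to the downloader. A small sketch of that extraction step, with an invented BibTeX entry:

# Sketch of the DOI-extraction step used by process_bibtex_async.
# The BibTeX entry below is made up for illustration.
import bibtexparser

sample_bib = """
@article{example2024,
  title = {An Example Paper},
  doi = {10.1000/example.doi}
}
"""

bib_database = bibtexparser.loads(sample_bib)
dois = [entry.get('doi') for entry in bib_database.entries if entry.get('doi')]
print(dois)  # ['10.1000/example.doi']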