FreeBibTec2

Sleeping

App Files Files Community

C2MV committed on Dec 14, 2024

Commit

8ea1432

verified ·

1 Parent(s): 9bba764

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -31

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ from bs4 import BeautifulSoup
 import io
 import asyncio
 import aiohttp
 # Configure logging
 logging.basicConfig(level=logging.INFO,
@@ -57,43 +59,55 @@ class PaperDownloader:
         except Exception as e:
             logger.debug(f"Error fetching {url}: {e}")
             return None, None
     async def download_paper_direct_doi_async(self, session, doi):
-      """Attempt to download the pdf from the landing page of the doi"""
-      if not doi:
-         return None
-      try:
-         doi_url = f"https://doi.org/{self.clean_doi(doi)}"
-         text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
-         if not text:
-            return None
-         pdf_patterns = [
-           r'(https?://[^\s<>"]+?\.pdf)',
-           r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-           r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-         ]
-         pdf_urls = []
-         for pattern in pdf_patterns:
-            pdf_urls.extend(re.findall(pattern, text))
-         for pdf_url in pdf_urls:
-           try:
-               pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
-               if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                    logger.debug(f"Found PDF from: {pdf_url}")
-                    return await pdf_response.read()
-           except Exception as e:
-               logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
-      except Exception as e:
            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-      return None
     async def download_paper_scihub_async(self, session, doi):
         """Improved method to download paper from Sci-Hub using async requests"""

 import io
 import asyncio
 import aiohttp
+from playwright.async_api import async_playwright
 # Configure logging
 logging.basicConfig(level=logging.INFO,
         except Exception as e:
             logger.debug(f"Error fetching {url}: {e}")
             return None, None
     async def download_paper_direct_doi_async(self, session, doi):
+        """Attempt to download the pdf from the landing page of the doi, now with javascript rendering"""
+        if not doi:
+           return None
+        try:
+             doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+             # Use Playwright to render JavaScript content
+             async with async_playwright() as p:
+                browser = await p.chromium.launch() # You can use different browsers
+                page = await browser.new_page()
+                try:
+                     await page.goto(doi_url, timeout=30000)
+                     html_content = await page.content()
+                except Exception as e:
+                      logger.debug(f"Error trying to navigate {doi}: {e}")
+                      await browser.close()
+                      return None
+                soup = BeautifulSoup(html_content, 'html.parser')
+                await browser.close()
+             pdf_patterns = [
+              r'(https?://[^\s<>"]+?\.pdf)',
+              r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+              r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+             ]
+             pdf_urls = []
+             for pattern in pdf_patterns:
+               pdf_urls.extend(re.findall(pattern, html_content))
+             for pdf_url in pdf_urls:
+                 try:
+                    pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                    if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                       logger.debug(f"Found PDF from: {pdf_url}")
+                       return await pdf_response.read()
+                 except Exception as e:
+                    logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+        except Exception as e:
            logger.debug(f"Error trying to get the PDF from {doi}: {e}")
+        return None
     async def download_paper_scihub_async(self, session, doi):
         """Improved method to download paper from Sci-Hub using async requests"""