C2MV committed
Commit 8ea1432
1 Parent(s): 9bba764

Update app.py

Files changed (1)
  1. app.py +45 -31
app.py CHANGED
@@ -12,6 +12,8 @@ from bs4 import BeautifulSoup
 import io
 import asyncio
 import aiohttp
+from playwright.async_api import async_playwright
+
 
 # Configure logging
 logging.basicConfig(level=logging.INFO,
@@ -57,43 +59,55 @@ class PaperDownloader:
         except Exception as e:
             logger.debug(f"Error fetching {url}: {e}")
             return None, None
-
-
+
     async def download_paper_direct_doi_async(self, session, doi):
-        """Attempt to download the pdf from the landing page of the doi"""
-        if not doi:
-            return None
-
-        try:
-            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
-            text, headers = await self.fetch_with_headers(session, doi_url, timeout=15)
-            if not text:
-                return None
+        """Attempt to download the pdf from the landing page of the doi, now with javascript rendering"""
+        if not doi:
+            return None
 
-            pdf_patterns = [
-                r'(https?://[^\s<>"]+?\.pdf)',
-                r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
-                r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
-            ]
+        try:
+            doi_url = f"https://doi.org/{self.clean_doi(doi)}"
+
+            # Use Playwright to render JavaScript content
+            async with async_playwright() as p:
+                browser = await p.chromium.launch()  # You can use different browsers
+                page = await browser.new_page()
+
+                try:
+                    await page.goto(doi_url, timeout=30000)
+                    html_content = await page.content()
+                except Exception as e:
+                    logger.debug(f"Error trying to navigate {doi}: {e}")
+                    await browser.close()
+                    return None
+
+                soup = BeautifulSoup(html_content, 'html.parser')
+                await browser.close()
+
+                pdf_patterns = [
+                    r'(https?://[^\s<>"]+?\.pdf)',
+                    r'(https?://[^\s<>"]+?download/[^\s<>"]+)',
+                    r'(https?://[^\s<>"]+?\/pdf\/[^\s<>"]+)',
+                ]
 
-            pdf_urls = []
-            for pattern in pdf_patterns:
-                pdf_urls.extend(re.findall(pattern, text))
-
-            for pdf_url in pdf_urls:
-                try:
-                    pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
-                    if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
-                        logger.debug(f"Found PDF from: {pdf_url}")
-                        return await pdf_response.read()
-                except Exception as e:
-                    logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
+                pdf_urls = []
+                for pattern in pdf_patterns:
+                    pdf_urls.extend(re.findall(pattern, html_content))
+
 
+                for pdf_url in pdf_urls:
+                    try:
+                        pdf_response = await session.get(pdf_url, headers=self.headers, timeout=10)
+                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
+                            logger.debug(f"Found PDF from: {pdf_url}")
+                            return await pdf_response.read()
+                    except Exception as e:
+                        logger.debug(f"Error downloading PDF from {pdf_url}: {e}")
 
-        except Exception as e:
+
+        except Exception as e:
             logger.debug(f"Error trying to get the PDF from {doi}: {e}")
-
-        return None
+        return None
 
     async def download_paper_scihub_async(self, session, doi):
         """Improved method to download paper from Sci-Hub using async requests"""