gamingflexer committed on
Commit
2f67f06
·
1 Parent(s): d1e24cf

Debugged & exceptions added

Browse files
Files changed (2) hide show
  1. src/scrapper/arxiv.py +19 -10
  2. src/scrapper/main.py +13 -11
src/scrapper/arxiv.py CHANGED
@@ -10,6 +10,7 @@ import PyPDF2
10
  import requests
11
  from tqdm.auto import tqdm
12
  from decouple import config
 
13
 
14
  """
15
  Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
@@ -91,7 +92,7 @@ class Arxiv:
91
  # initialize the requests session
92
  self.session = requests.Session()
93
 
94
- def load(self, save: bool = False):
95
  """Load the paper from the ArXiv API or from a local file
96
  if it already exists. Stores the paper's text content and
97
  meta data in self.content and other attributes.
@@ -101,6 +102,7 @@ class Arxiv:
101
  :type save: bool, optional
102
  """
103
  # check if pdf already exists
 
104
  if os.path.exists(f'papers/{self.id}.json'):
105
  print(f'Loading papers/{self.id}.json from file')
106
  with open(f'papers/{self.id}.json', 'r') as fp:
@@ -108,15 +110,22 @@ class Arxiv:
108
  for key, value in attributes.items():
109
  setattr(self, key, value)
110
  else:
111
- res = self.session.get(self.url)
112
- with open(f'temp.pdf', 'wb') as fp:
113
- fp.write(res.content)
114
- # extract text content
115
- self._convert_pdf_to_text()
116
- # get meta for PDF
117
- self._download_meta()
118
- if save:
119
- self.save()
 
 
 
 
 
 
 
120
 
121
  def get_refs(self, extractor, text_splitter):
122
  """Get the references for the paper.
 
10
  import requests
11
  from tqdm.auto import tqdm
12
  from decouple import config
13
+ import uuid
14
 
15
  """
16
  Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
 
92
  # initialize the requests session
93
  self.session = requests.Session()
94
 
95
+ def load(self, path_author : str ,save: bool = False):
96
  """Load the paper from the ArXiv API or from a local file
97
  if it already exists. Stores the paper's text content and
98
  meta data in self.content and other attributes.
 
102
  :type save: bool, optional
103
  """
104
  # check if pdf already exists
105
+ # to_save_path = os.path.join(path_author, str(self.id)+".json")
106
  if os.path.exists(f'papers/{self.id}.json'):
107
  print(f'Loading papers/{self.id}.json from file')
108
  with open(f'papers/{self.id}.json', 'r') as fp:
 
110
  for key, value in attributes.items():
111
  setattr(self, key, value)
112
  else:
113
+ try:
114
+ res = self.session.get(self.url)
115
+ print(f'Downloading {self.url}')
116
+ # uuid_small = str(uuid.uuid4())[:8]
117
+ temp_pdf_path = f'./temp.pdf'
118
+ with open(temp_pdf_path, 'wb') as fp:
119
+ fp.write(res.content)
120
+ # extract text content
121
+ self._convert_pdf_to_text()
122
+ # get meta for PDF
123
+ self._download_meta()
124
+ if save:
125
+ self.save()
126
+ except Exception as e:
127
+ print(f"Error while downloading paper {self.id}: {e}")
128
+ raise e
129
 
130
  def get_refs(self, extractor, text_splitter):
131
  """Get the references for the paper.
src/scrapper/main.py CHANGED
@@ -11,8 +11,8 @@ class ArxivPaper:
11
  self.author_name = author_name
12
  self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)
13
 
14
- def get_results_google(self, number_of_results: int = 25):
15
- result_dict = get_google_scrape(self.author_name +" research papers arxiv.org",num=number_of_results)
16
  paper_links = []
17
  for i in result_dict['organic_results']:
18
  if "arxiv.org" in i['link']:
@@ -36,12 +36,14 @@ class ArxivPaper:
36
  path_author = os.path.join(path, self.author_name.replace(" ", "_"))
37
  data = {}
38
  for i in tqdm(paper_ids):
39
- paper = Arxiv(i)
40
- paper.load()
41
- paper.get_meta()
42
- refs = paper.get_refs(
43
- extractor=self.extractor,
44
- text_splitter=self.text_splitter,)
45
- paper.chunker()
46
- paper.save_chunks(include_metadata=True, path=path_author)
47
-
 
 
 
11
  self.author_name = author_name
12
  self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)
13
 
14
+ def get_results_google(self, number_of_results = 25):
15
+ result_dict = get_google_scrape(str(self.author_name)+" research papers arxiv.org",num=number_of_results)
16
  paper_links = []
17
  for i in result_dict['organic_results']:
18
  if "arxiv.org" in i['link']:
 
36
  path_author = os.path.join(path, self.author_name.replace(" ", "_"))
37
  data = {}
38
  for i in tqdm(paper_ids):
39
+ try:
40
+ paper = Arxiv(i)
41
+ paper.load(path_author)
42
+ paper.get_meta()
43
+ refs = paper.get_refs(
44
+ extractor=self.extractor,
45
+ text_splitter=self.text_splitter,)
46
+ paper.chunker()
47
+ paper.save_chunks(include_metadata=True, path=path_author)
48
+ except Exception as e:
49
+ print(f"Error processing paper {i}: {e}")