Spaces:
Runtime error
Runtime error
gamingflexer
committed on
Commit
·
2f67f06
1
Parent(s):
d1e24cf
Debugged & exceptions added
Browse files
- src/scrapper/arxiv.py +19 -10
- src/scrapper/main.py +13 -11
src/scrapper/arxiv.py
CHANGED
@@ -10,6 +10,7 @@ import PyPDF2
|
|
10 |
import requests
|
11 |
from tqdm.auto import tqdm
|
12 |
from decouple import config
|
|
|
13 |
|
14 |
"""
|
15 |
Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
|
@@ -91,7 +92,7 @@ class Arxiv:
|
|
91 |
# initialize the requests session
|
92 |
self.session = requests.Session()
|
93 |
|
94 |
-
def load(self, save: bool = False):
|
95 |
"""Load the paper from the ArXiv API or from a local file
|
96 |
if it already exists. Stores the paper's text content and
|
97 |
meta data in self.content and other attributes.
|
@@ -101,6 +102,7 @@ class Arxiv:
|
|
101 |
:type save: bool, optional
|
102 |
"""
|
103 |
# check if pdf already exists
|
|
|
104 |
if os.path.exists(f'papers/{self.id}.json'):
|
105 |
print(f'Loading papers/{self.id}.json from file')
|
106 |
with open(f'papers/{self.id}.json', 'r') as fp:
|
@@ -108,15 +110,22 @@ class Arxiv:
|
|
108 |
for key, value in attributes.items():
|
109 |
setattr(self, key, value)
|
110 |
else:
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
def get_refs(self, extractor, text_splitter):
|
122 |
"""Get the references for the paper.
|
|
|
10 |
import requests
|
11 |
from tqdm.auto import tqdm
|
12 |
from decouple import config
|
13 |
+
import uuid
|
14 |
|
15 |
"""
|
16 |
Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
|
|
|
92 |
# initialize the requests session
|
93 |
self.session = requests.Session()
|
94 |
|
95 |
+
def load(self, path_author : str ,save: bool = False):
|
96 |
"""Load the paper from the ArXiv API or from a local file
|
97 |
if it already exists. Stores the paper's text content and
|
98 |
meta data in self.content and other attributes.
|
|
|
102 |
:type save: bool, optional
|
103 |
"""
|
104 |
# check if pdf already exists
|
105 |
+
# to_save_path = os.path.join(path_author, str(self.id)+".json")
|
106 |
if os.path.exists(f'papers/{self.id}.json'):
|
107 |
print(f'Loading papers/{self.id}.json from file')
|
108 |
with open(f'papers/{self.id}.json', 'r') as fp:
|
|
|
110 |
for key, value in attributes.items():
|
111 |
setattr(self, key, value)
|
112 |
else:
|
113 |
+
try:
|
114 |
+
res = self.session.get(self.url)
|
115 |
+
print(f'Downloading {self.url}')
|
116 |
+
# uuid_small = str(uuid.uuid4())[:8]
|
117 |
+
temp_pdf_path = f'./temp.pdf'
|
118 |
+
with open(temp_pdf_path, 'wb') as fp:
|
119 |
+
fp.write(res.content)
|
120 |
+
# extract text content
|
121 |
+
self._convert_pdf_to_text()
|
122 |
+
# get meta for PDF
|
123 |
+
self._download_meta()
|
124 |
+
if save:
|
125 |
+
self.save()
|
126 |
+
except Exception as e:
|
127 |
+
print(f"Error while downloading paper {self.id}: {e}")
|
128 |
+
raise e
|
129 |
|
130 |
def get_refs(self, extractor, text_splitter):
|
131 |
"""Get the references for the paper.
|
src/scrapper/main.py
CHANGED
@@ -11,8 +11,8 @@ class ArxivPaper:
|
|
11 |
self.author_name = author_name
|
12 |
self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)
|
13 |
|
14 |
-
def get_results_google(self, number_of_results
|
15 |
-
result_dict = get_google_scrape(self.author_name
|
16 |
paper_links = []
|
17 |
for i in result_dict['organic_results']:
|
18 |
if "arxiv.org" in i['link']:
|
@@ -36,12 +36,14 @@ class ArxivPaper:
|
|
36 |
path_author = os.path.join(path, self.author_name.replace(" ", "_"))
|
37 |
data = {}
|
38 |
for i in tqdm(paper_ids):
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
11 |
self.author_name = author_name
|
12 |
self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)
|
13 |
|
14 |
+
def get_results_google(self, number_of_results = 25):
|
15 |
+
result_dict = get_google_scrape(str(self.author_name)+" research papers arxiv.org",num=number_of_results)
|
16 |
paper_links = []
|
17 |
for i in result_dict['organic_results']:
|
18 |
if "arxiv.org" in i['link']:
|
|
|
36 |
path_author = os.path.join(path, self.author_name.replace(" ", "_"))
|
37 |
data = {}
|
38 |
for i in tqdm(paper_ids):
|
39 |
+
try:
|
40 |
+
paper = Arxiv(i)
|
41 |
+
paper.load(path_author)
|
42 |
+
paper.get_meta()
|
43 |
+
refs = paper.get_refs(
|
44 |
+
extractor=self.extractor,
|
45 |
+
text_splitter=self.text_splitter,)
|
46 |
+
paper.chunker()
|
47 |
+
paper.save_chunks(include_metadata=True, path=path_author)
|
48 |
+
except Exception as e:
|
49 |
+
print(f"Error processing paper {i}: {e}")
|