Spaces:

asach
/

arxiv-plagiarism-checker-Ilm

Runtime error

App Files Files Community

gamingflexer committed on Jan 17, 2024

Commit

2f67f06

1 Parent(s): d1e24cf

Debugged & exceptions added

Browse files

Files changed (2) hide show

src/scrapper/arxiv.py +19 -10
src/scrapper/main.py +13 -11

src/scrapper/arxiv.py CHANGED Viewed

@@ -10,6 +10,7 @@ import PyPDF2
 import requests
 from tqdm.auto import tqdm
 from decouple import config
 """
 Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
@@ -91,7 +92,7 @@ class Arxiv:
         # initialize the requests session
         self.session = requests.Session()
-    def load(self, save: bool = False):
         """Load the paper from the ArXiv API or from a local file
         if it already exists. Stores the paper's text content and
         meta data in self.content and other attributes.
@@ -101,6 +102,7 @@ class Arxiv:
         :type save: bool, optional
         """
         # check if pdf already exists
         if os.path.exists(f'papers/{self.id}.json'):
             print(f'Loading papers/{self.id}.json from file')
             with open(f'papers/{self.id}.json', 'r') as fp:
@@ -108,15 +110,22 @@ class Arxiv:
             for key, value in attributes.items():
                 setattr(self, key, value)
         else:
-            res = self.session.get(self.url)
-            with open(f'temp.pdf', 'wb') as fp:
-                fp.write(res.content)
-            # extract text content
-            self._convert_pdf_to_text()
-            # get meta for PDF
-            self._download_meta()
-            if save:
-                self.save()
     def get_refs(self, extractor, text_splitter):
         """Get the references for the paper.

 import requests
 from tqdm.auto import tqdm
 from decouple import config
+import uuid
 """
 Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
         # initialize the requests session
         self.session = requests.Session()
+    def load(self, path_author : str ,save: bool = False):
         """Load the paper from the ArXiv API or from a local file
         if it already exists. Stores the paper's text content and
         meta data in self.content and other attributes.
         :type save: bool, optional
         """
         # check if pdf already exists
+        # to_save_path = os.path.join(path_author, str(self.id)+".json")
         if os.path.exists(f'papers/{self.id}.json'):
             print(f'Loading papers/{self.id}.json from file')
             with open(f'papers/{self.id}.json', 'r') as fp:
             for key, value in attributes.items():
                 setattr(self, key, value)
         else:
+            try:
+                res = self.session.get(self.url)
+                print(f'Downloading {self.url}')
+                # uuid_small = str(uuid.uuid4())[:8]
+                temp_pdf_path = f'./temp.pdf'
+                with open(temp_pdf_path, 'wb') as fp:
+                    fp.write(res.content)
+                # extract text content
+                self._convert_pdf_to_text()
+                # get meta for PDF
+                self._download_meta()
+                if save:
+                    self.save()
+            except Exception as e:
+                print(f"Error while downloading paper {self.id}: {e}")
+                raise e
     def get_refs(self, extractor, text_splitter):
         """Get the references for the paper.

src/scrapper/main.py CHANGED Viewed

@@ -11,8 +11,8 @@ class ArxivPaper:
         self.author_name = author_name
         self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)
-    def get_results_google(self, number_of_results: int = 25):
-        result_dict = get_google_scrape(self.author_name +" research papers arxiv.org",num=number_of_results)
         paper_links = []
         for i in result_dict['organic_results']:
             if "arxiv.org" in i['link']:
@@ -36,12 +36,14 @@ class ArxivPaper:
         path_author = os.path.join(path, self.author_name.replace(" ", "_"))
         data = {}
         for i in tqdm(paper_ids):
-            paper = Arxiv(i)
-            paper.load()
-            paper.get_meta()
-            refs = paper.get_refs(
-            extractor=self.extractor,
-            text_splitter=self.text_splitter,)
-            paper.chunker()
-            paper.save_chunks(include_metadata=True, path=path_author)

         self.author_name = author_name
         self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)
+    def get_results_google(self, number_of_results = 25):
+        result_dict = get_google_scrape(str(self.author_name)+" research papers arxiv.org",num=number_of_results)
         paper_links = []
         for i in result_dict['organic_results']:
             if "arxiv.org" in i['link']:
         path_author = os.path.join(path, self.author_name.replace(" ", "_"))
         data = {}
         for i in tqdm(paper_ids):
+            try:
+                paper = Arxiv(i)
+                paper.load(path_author)
+                paper.get_meta()
+                refs = paper.get_refs(
+                extractor=self.extractor,
+                text_splitter=self.text_splitter,)
+                paper.chunker()
+                paper.save_chunks(include_metadata=True, path=path_author)
+            except Exception as e:
+                print(f"Error processing paper {i}: {e}")