Spaces:
Runtime error
Runtime error
import os | |
import json | |
import logging | |
from typing import Optional | |
import re | |
import requests | |
from requests.adapters import HTTPAdapter, Retry | |
import arxiv | |
import PyPDF2 | |
import requests | |
from tqdm.auto import tqdm | |
from decouple import config | |
import uuid | |
""" | |
Usage: get_paper_id("8-bit matrix multiplication for transformers at scale") -> "2208.07339"
""" | |
# Matches an arXiv abstract URL and captures the new-style paper ID
# (e.g. "2208.07339"). The dots in the hostname are escaped so that only a
# literal "arxiv.org" matches, not any character in that position.
paper_id_re = re.compile(r'https://arxiv\.org/abs/(\d+\.\d+)')
def retry_request_session(retries: Optional[int] = 5):
    """Build a ``requests`` session that retries on common transient errors.

    :param retries: Total number of retries allowed per request,
                    defaults to 5
    :type retries: int, optional
    :return: A session whose HTTP and HTTPS adapters apply the retry
             strategy
    :rtype: requests.Session
    """
    # keep the caller's integer and the Retry object under separate names
    retry_strategy = Retry(
        total=retries,
        backoff_factor=0.1,
        status_forcelist=[
            408,  # request timeout
            500,  # internal server error
            502,  # bad gateway
            503,  # service unavailable
            504,  # gateway timeout
        ],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    # mount for both schemes so plain-HTTP URLs are retried as well
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    return session
def get_paper_id(query: str, handle_not_found: bool = True):
    """Get the paper ID from a query.

    Runs a Google search for ``query`` and returns the first arXiv paper
    ID found in an ``https://arxiv.org/abs/...`` link in the response.

    :param query: The query to search with
    :type query: str
    :param handle_not_found: Whether to return None if no paper is found,
                             defaults to True
    :type handle_not_found: bool, optional
    :raises ValueError: If no paper is found and ``handle_not_found`` is
                        False
    :return: The paper ID, or None if nothing was found and
             ``handle_not_found`` is True
    :rtype: str
    """
    # init requests search session
    session = retry_request_session()
    # let requests URL-encode the query: this covers every reserved
    # character (&, #, ?, =, non-ASCII, ...) rather than a hand-picked few
    res = session.get(
        "https://www.google.com/search",
        params={"q": query, "sclient": "gws-wiz-serp"},
    )
    # extract the first paper id present in the result page
    match = paper_id_re.search(res.text)
    if match is not None:
        return match.group(1)
    if handle_not_found:
        # if no paper is found, return None
        return None
    # if no paper is found, raise an error
    raise ValueError(f'No paper found for query: {query}')
class Arxiv:
    """Handle the download, text extraction and metadata of an ArXiv paper."""

    # splits the paper text on a standalone "References" heading line
    refs_re = re.compile(r'\n(References|REFERENCES)\n')
    # class-level default kept for backward compatibility; each instance
    # gets its own list in __init__ and this one is never mutated
    references = []
    llm = None

    def __init__(self, paper_id: str):
        """Object to handle the extraction of an ArXiv paper and its
        relevant information.

        :param paper_id: The ID of the paper to extract
        :type paper_id: str
        """
        self.id = paper_id
        self.url = f"https://export.arxiv.org/pdf/{paper_id}.pdf"
        # per-instance list so instances never share reference state
        self.references = []
        # initialize the requests session
        self.session = requests.Session()

    def load(self, path_author : str ,save: bool = False):
        """Load the paper from the ArXiv API or from a local file
        if it already exists. Stores the paper's text content and
        meta data in self.content and other attributes.

        :param path_author: Currently unused; kept so existing callers
                            keep working
        :type path_author: str
        :param save: Whether to save the paper to a local file,
                     defaults to False
        :type save: bool, optional
        :raises Exception: Re-raises any error hit while downloading or
                           parsing the paper
        """
        cache_path = f'papers/{self.id}.json'
        # reuse the cached JSON if this paper was saved before
        if os.path.exists(cache_path):
            print(f'Loading papers/{self.id}.json from file')
            with open(cache_path, 'r') as fp:
                attributes = json.load(fp)
            for key, value in attributes.items():
                setattr(self, key, value)
            return
        try:
            print(f'Downloading {self.url}')
            res = self.session.get(self.url)
            # fail here on an HTTP error instead of writing an error page
            # to disk and failing later inside the PDF parser
            res.raise_for_status()
            temp_pdf_path = './temp.pdf'
            with open(temp_pdf_path, 'wb') as fp:
                fp.write(res.content)
            # extract text content
            self._convert_pdf_to_text(temp_pdf_path)
            # get meta for PDF
            self._download_meta()
            if save:
                self.save()
        except Exception as e:
            print(f"Error while downloading paper {self.id}: {e}")
            raise

    def get_refs(self, extractor, text_splitter):
        """Get the references for the paper.

        :param extractor: The LLMChain extractor model
        :type extractor: LLMChain
        :param text_splitter: The text splitter to use
        :type text_splitter: TokenTextSplitter
        :return: The references for the paper
        :rtype: list
        """
        # only run the extraction when nothing is stored yet
        if not self.references:
            self._download_refs(extractor, text_splitter)
        return self.references

    def _download_refs(self, extractor, text_splitter):
        """Download the references for the paper. Stores them in
        the self.references attribute.

        :param extractor: The LLMChain extractor model
        :type extractor: LLMChain
        :param text_splitter: The text splitter to use
        :type text_splitter: TokenTextSplitter
        """
        # get references section of paper
        refs = self.refs_re.split(self.content)[-1]
        # we don't need the full thing, just the first page
        pages = text_splitter.split_text(refs)
        if not pages:
            # nothing to extract from
            self.references = []
            return
        # use LLM extractor to extract references
        out = extractor.run(refs=pages[0])
        meta = []
        for line in out.split('\n'):
            if line == '':
                continue
            # expected line format: "title | authors | year"
            fields = line.split(' | ')
            # in case we're missing some fields
            if len(fields) != 3:
                continue
            # look the ID up per kept line so every ID stays paired with
            # the reference it was found for
            paper_id = get_paper_id(line)
            if paper_id is None:
                continue
            meta.append({
                'id': paper_id,
                'title': fields[0],
                'authors': fields[1],
                'year': fields[2]
            })
        logging.debug(f"Extracted {len(meta)} references")
        self.references = meta

    def _convert_pdf_to_text(self, pdf_path: str = "temp.pdf"):
        """Convert the PDF to text and store it in the self.content
        attribute.

        :param pdf_path: Path of the PDF file to read, defaults to
                         "temp.pdf"
        :type pdf_path: str, optional
        """
        with open(pdf_path, 'rb') as f:
            # create a PDF object
            pdf = PyPDF2.PdfReader(f)
            # extract text page by page; fall back to '' so the join
            # below cannot fail on a page that yields no text
            pages = [page.extract_text() or '' for page in pdf.pages]
        self.content = "\n".join(pages)

    def _download_meta(self):
        """Download the meta information for the paper from the
        ArXiv API and store it in the self attributes.

        :raises ValueError: If the ArXiv API returns no result for the ID
        """
        search = arxiv.Search(
            query=f'id:{self.id}',
            max_results=1,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )
        results = list(search.results())
        if not results:
            raise ValueError(f"No paper found for paper '{self.id}'")
        result = results[0]
        self.authors = [author.name for author in result.authors]
        self.categories = result.categories
        self.comment = result.comment
        self.journal_ref = result.journal_ref
        # remove 'v1', 'v2', etc. from the end of the pdf_url
        self.source = re.sub(r'v\d+$', '', result.pdf_url)
        self.primary_category = result.primary_category
        self.published = result.published.strftime('%Y%m%d')
        self.summary = result.summary
        self.title = result.title
        self.updated = result.updated.strftime('%Y%m%d')
        logging.debug(f"Downloaded metadata for paper '{self.id}'")

    def save(self):
        """Save the paper to a local JSON file at papers/<id>.json,
        creating the papers directory if needed.
        """
        os.makedirs('papers', exist_ok=True)
        with open(f'papers/{self.id}.json', 'w') as fp:
            json.dump(self.to_dict(), fp, indent=4)

    def save_chunks(
        self,
        include_metadata: bool = True,
        path: str = "chunks"
    ):
        """Save the paper's chunks to a local JSONL file.

        Requires self.dataset, which is populated by chunker().

        :param include_metadata: Whether to include the paper's
                                 metadata in the chunks, defaults
                                 to True
        :type include_metadata: bool, optional
        :param path: The path to save the file to, defaults to "chunks"
        :type path: str, optional
        """
        os.makedirs(path, exist_ok=True)
        # the metadata is the same for every chunk, so build it once
        meta = self.get_meta() if include_metadata else None
        with open(f'{path}/{self.id}.jsonl', 'w') as fp:
            for chunk in self.dataset:
                if meta is not None:
                    # updates the chunk stored in self.dataset in place
                    chunk.update(meta)
                fp.write(json.dumps(chunk) + '\n')
        logging.debug(f"Saved paper to '{path}/{self.id}.jsonl'")

    def get_meta(self):
        """Returns the meta information for the paper.

        :return: The meta information for the paper
        :rtype: dict
        """
        fields = self.to_dict()
        # drop content field because it's big
        fields.pop('content')
        return fields

    def chunker(self, chunk_size=300):
        """Build self.dataset from the cleaned paper text.

        :param chunk_size: Currently unused; the whole paper is stored
                           as a single chunk
        :type chunk_size: int, optional
        """
        # Single Chunk is made for now
        self.dataset = [{
            'doi': self.id,
            'chunk-id': 1,
            'chunk': self._clean_text(self.content)
        }]

    def _clean_text(self, text):
        """Remove every hyphen that is directly followed by a newline.

        :param text: The text to clean
        :type text: str
        :return: The text with each "-\\n" sequence removed
        :rtype: str
        """
        return re.sub(r'-\n', '', text)

    def to_dict(self):
        """Return the paper's attributes as a JSON-serialisable dict.

        :return: The paper's fields, including content and references
        :rtype: dict
        """
        return {
            'id': self.id,
            'title': self.title,
            'summary': self.summary,
            'source': self.source,
            'authors': self.authors,
            'categories': self.categories,
            'comment': self.comment,
            'journal_ref': self.journal_ref,
            'primary_category': self.primary_category,
            'published': self.published,
            'updated': self.updated,
            'content': self.content,
            'references': self.references
        }

    def __dict__(self):
        # NOTE(review): defining __dict__ as a method replaces the usual
        # instance attribute of that name; kept only so existing callers
        # of obj.__dict__() keep working. New code should call to_dict().
        return self.to_dict()

    def __repr__(self):
        return f"Arxiv(paper_id='{self.id}')"